xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24 
#include "depthfirst_driver.hpp"
#include "interleaves/generic.hpp"

#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
27 
28 namespace arm_conv {
29 namespace depthwise {
30 
31 template <typename OutputStage>
32 class IPlanarStrategy
33 {
34   public:
35   virtual ~IPlanarStrategy() = default;
36   virtual unsigned int get_output_rows(void) const = 0;
37   virtual arm_gemm::VLType get_vl_type(void) const = 0;
38 
39   virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
40   virtual void pack_parameters(
41     const DepthwiseArgs &args, void *buffer,
42     const void *biases, const OutputStage &,
43     const void *weights, size_t ld_weight_col, size_t ld_weight_row
44   ) const = 0;
45 };
46 
47 
/* Primary template, declared but never defined: the kernel signature and the
 * way it is invoked are provided by specialisations on the output stage.
 */
template <
  typename TInput, typename TWeight, typename TOutput,
  typename TAccum, typename OutputStage
>
struct PlanarKernelType;
51 
52 template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
53 struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
54 {
55   using Type = std::function<void(
56     const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
57     unsigned int pad_top, unsigned int valid_input_rows,
58     unsigned int pad_left, unsigned int valid_input_cols,
59     const TWeight *, const TAccum *,
60     TOutput **, const size_t *, const size_t *, unsigned int output_cols,
61     unsigned int start_channels, unsigned int valid_channels,
62     TAccum act_min, TAccum act_max
63   )>;
64 
65   template <typename WorkspaceType>
executearm_conv::depthwise::PlanarKernelType66   static inline void execute(
67     const Type fn,
68     const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
69     unsigned int pad_top, unsigned int valid_input_rows,
70     unsigned int pad_left, unsigned int valid_input_cols,
71     const TWeight *weights, const TAccum *bias,
72     TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols,
73     unsigned int start_channel, unsigned int valid_channels,
74     const Nothing &, const WorkspaceType *ws
75   )
76   {
77     fn(
78       inptr, ld_in_row, ld_in_col, ld_in_vl,
79       pad_top, valid_input_rows,
80       pad_left, valid_input_cols,
81       weights, bias,
82       outptrs, outlds, outvllds, output_cols,
83       start_channel, valid_channels,
84       ws->activation_min, ws->activation_max
85     );
86   }
87 };
88 
89 template <typename TInput, typename TWeight, typename TOutput>
90 struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
91 {
92   using Type = std::function<void(
93     const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
94     unsigned int pad_top, unsigned int valid_input_rows,
95     unsigned int pad_left, unsigned int valid_input_cols,
96     const TWeight *,
97     TOutput **, const size_t *, const size_t *, unsigned int output_cols,
98     unsigned int start_channel, unsigned int valid_channels,
99     const arm_gemm::Requantize32 &
100   )>;
101 
102   template <typename WorkspaceType>
executearm_conv::depthwise::PlanarKernelType103   static inline void execute(
104     const Type fn,
105     const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
106     unsigned int pad_top, unsigned int valid_input_rows,
107     unsigned int pad_left, unsigned int valid_input_cols,
108     const TWeight *weights, const int32_t *,
109     TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
110     unsigned int first_channel, unsigned int valid_channels,
111     const arm_gemm::Requantize32 &qp, const WorkspaceType *
112   )
113   {
114     fn(
115       inptr, ld_in_row, ld_in_col, ld_in_vl,
116       pad_top, valid_input_rows,
117       pad_left, valid_input_cols,
118       weights,
119       outptrs, outlds, outldvls, output_cols,
120       first_channel, valid_channels,
121       qp
122     );
123   }
124 };
125 
126 
127 template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
128           typename TAccum=typename DefaultTAccum<TOutput>::Type,
129           typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
130 class PlanarStrategy : public IPlanarStrategy<OutputStage>
131 {
132   unsigned int m_kernel_rows, m_kernel_cols;
133   unsigned int m_stride_rows, m_stride_cols;
134   unsigned int m_output_rows;
135   arm_gemm::VLType m_vl_type;
136 
137   protected:
get_kernel_packing_point(const unsigned int index,unsigned int & x,unsigned int & y) const138   virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
139   {
140     // Get the kernel point to pack at the given index; return false to
141     // indicate that this index (and all greater indices) is out of range.
142     if (m_kernel_rows * m_kernel_cols <= index)
143       return false;
144 
145     y = index % m_kernel_cols;
146     x = index / m_kernel_cols;
147     return true;
148   }
149 
get_kernel_packing_arguments(void) const150   virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
151   {
152     return interleaves::PackingArguments(
153       m_kernel_rows, m_kernel_cols, sizeof(TWeight),
154       false, sizeof(TAccum),  // Don't pack the bias
155       m_vl_type, sizeof(TAccum), 1,  // Accumulator depth of 1 TODO
156       [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
157       { return this->get_kernel_packing_point(idx, x, y); }
158     );
159   }
160 
161   public:
PlanarStrategy(unsigned int kernel_rows,unsigned int kernel_cols,unsigned int stride_rows,unsigned int stride_cols,unsigned int output_rows,arm_gemm::VLType vl_type)162   PlanarStrategy(
163     unsigned int kernel_rows, unsigned int kernel_cols,
164     unsigned int stride_rows, unsigned int stride_cols,
165     unsigned int output_rows,
166     arm_gemm::VLType vl_type
167   ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
168       m_stride_rows(stride_rows), m_stride_cols(stride_cols),
169       m_output_rows(output_rows), m_vl_type(vl_type)
170   {
171   }
172 
get_output_rows(void) const173   unsigned int get_output_rows(void) const override { return m_output_rows; }
get_vl_type(void) const174   arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
175 
get_storage_size(const DepthwiseArgs & args) const176   size_t get_storage_size(const DepthwiseArgs &args) const override
177   {
178     return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
179   }
180 
pack_parameters(const DepthwiseArgs & args,void * buffer,const void * biases,const OutputStage &,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const181   void pack_parameters(
182     const DepthwiseArgs &args, void *buffer,
183     const void *biases, const OutputStage &,
184     const void *weights, size_t ld_weight_col, size_t ld_weight_row
185   ) const override
186   {
187     interleaves::pack_parameters_generic(
188       this->get_kernel_packing_arguments(), args,
189       buffer, biases, weights, ld_weight_col, ld_weight_row
190     );
191   }
192 
193   using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
194   virtual KernelType get_kernel(void) const = 0;
195 };
196 
197 
198 namespace {
199 
200 template <typename T>
201 struct OutputRowPtrsElement
202 {
203   struct Workspace
204   {
205     T **output_row_ptrs;
206     size_t *output_ld_cols;
207     size_t *output_ld_vls;  // Stride between vectors of channels
208     T *output_padding_buffer;
209   };
210 
211   template <typename OutputStage>
get_element_sizearm_conv::depthwise::__anon21c175b90211::OutputRowPtrsElement212   static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
213   {
214     // We need one pointer and stride for each row of output, and an additional
215     // blob of memory into which padded stores can go.
216     return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
217            get_vector_length<char>(args.strategy->get_vl_type());
218   }
219 
220   template <typename WorkspaceType, typename OutputStage>
initialisearm_conv::depthwise::__anon21c175b90211::OutputRowPtrsElement221   static void *initialise(WorkspaceType *ws, void *buffer,
222                           const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
223   {
224     const auto n_rows = args.strategy->get_output_rows();
225     ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
226     ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
227     ws->output_ld_vls = ws->output_ld_cols + n_rows;
228     ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
229     return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
230   }
231 };
232 
233 }  // namespace {anonymous}
234 
235 
236 template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
237           typename TAccum=typename DefaultTAccum<TOutput>::Type,
238           typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
239 class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
240 {
241   using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
242   using StrategyType = IPlanarStrategy<OutputStage>;
243   using WorkspaceManager = Workspace<
244     OutputRowPtrsElement<TOutput>,
245     ActivationsElement<TAccum, OutputStage>
246   >;
247   using WorkspaceType = typename WorkspaceManager::WorkspaceType;
248 
249   std::unique_ptr<StrategyType> m_strat;
250   const TAccum *m_bias;
251   OutputStage m_os;
252 
253   public:
DepthwisePlanar(StrategyType * const strat,const DepthwiseArgs & args,const OutputStage & os={})254   DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
255   : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
256   {
257   }
258 
259   DepthwisePlanar(DepthwisePlanar &) = delete;
260   DepthwisePlanar &operator=(DepthwisePlanar &) = delete;
261 
get_storage_size(void) const262   size_t get_storage_size(void) const override
263   {
264     return m_strat->get_storage_size(this->m_args);
265   }
266 
pack_parameters(void * buffer,const void * biases,const void * weights,size_t ld_weight_col,size_t ld_weight_row)267   void pack_parameters(
268     void *buffer, const void *biases,
269     const void *weights, size_t ld_weight_col, size_t ld_weight_row
270   ) override
271   {
272     m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
273     this->m_bias = reinterpret_cast<const TAccum *>(biases);
274     depthwise_depthfirst::stash_bias(this->m_os, biases);
275   }
276 
get_working_size(unsigned int n_threads,unsigned int) const277   size_t get_working_size(unsigned int n_threads, unsigned int) const override
278   {
279     return this->get_working_size_per_thread() * n_threads;
280   }
281 
282   protected:
283   /* Compute the amount of working space required for a single thread. */
get_working_size_per_thread(void) const284   virtual size_t get_working_size_per_thread(void) const
285   {
286     return WorkspaceManager::get_sizeof_workspace(
287       WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
288   }
289 
290   /* Initialise the working space for a thread. */
initialise_working_space(void * buffer) const291   virtual void initialise_working_space(void *buffer) const
292   {
293     WorkspaceManager::initialise(
294       buffer,
295       WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
296     );
297   }
298 
299   /* Execute the kernel for a given chunk of work. */
execute_kernel(const TInput * inptr,size_t ld_in_row,size_t ld_in_col,size_t ld_in_vl,unsigned int pad_top,unsigned int valid_input_rows,unsigned int pad_left,unsigned int valid_input_cols,const TWeight * weights,const TAccum * bias,TOutput * outptr,size_t ld_out_row,size_t ld_out_col,size_t ld_out_vl,unsigned int valid_output_rows,unsigned int valid_output_cols,unsigned int first_channel,unsigned int valid_channels,WorkspaceType * ws) const300   virtual void execute_kernel(
301     const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
302     unsigned int pad_top, unsigned int valid_input_rows,
303     unsigned int pad_left, unsigned int valid_input_cols,
304     const TWeight *weights, const TAccum *bias,
305     TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
306     unsigned int valid_output_rows, unsigned int valid_output_cols,
307     unsigned int first_channel, unsigned int valid_channels,
308     WorkspaceType *ws
309   ) const
310   {
311     // Initialise the output pointers
312     for (auto i = 0u; i < m_strat->get_output_rows(); i++)
313     {
314       // Point at the output tensor for all valid rows; otherwise point at the
315       // padding buffer.
316       ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
317       ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
318       ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
319       outptr += ld_out_row;
320     }
321 
322     // Execute the kernel
323     PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
324       reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
325       inptr, ld_in_row, ld_in_col, ld_in_vl,
326       pad_top, valid_input_rows, pad_left, valid_input_cols,
327       weights, bias,
328       ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
329       valid_output_cols, first_channel, valid_channels,
330       this->m_os, ws
331     );
332   }
333 
execute_internal(unsigned int batches,unsigned int input_height,unsigned int input_width,unsigned int n_input_channels,const PaddingValues & padding,const void * input,size_t ld_input_col,size_t ld_input_row,size_t ld_input_batch,const void * parameters,unsigned int output_height,unsigned int output_width,void * output,size_t ld_output_col,size_t ld_output_row,size_t ld_output_batch,void * working_space,unsigned int thread_id,unsigned int n_threads) const334   void execute_internal(
335     unsigned int batches,
336     unsigned int input_height,
337     unsigned int input_width,
338     unsigned int n_input_channels,
339     const PaddingValues &padding,
340     const void *input,
341     size_t ld_input_col,
342     size_t ld_input_row,
343     size_t ld_input_batch,
344     const void *parameters,
345     unsigned int output_height,
346     unsigned int output_width,
347     void *output,
348     size_t ld_output_col,
349     size_t ld_output_row,
350     size_t ld_output_batch,
351     void *working_space,
352     unsigned int thread_id,
353     unsigned int n_threads
354   ) const override
355   {
356     // Get and initialise the working space for this thread.
357     void *thread_working_space =
358       static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
359     this->initialise_working_space(thread_working_space);
360     auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
361 
362     const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
363     const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
364 
365     // Get typed pointers
366     auto input_batch = reinterpret_cast<const TInput *>(input);
367     auto output_batch = reinterpret_cast<TOutput *>(output);
368     auto weights = reinterpret_cast<const TWeight *>(parameters);
369 
370     // Iterate over batches
371     for (; batches; batches--)
372     {
373       // NOTE: Other loop orderings are possible and it would be worth
374       // investigating them.
375 
376       // Within a batch, stripe threads across rows.
377       for (auto start_output_i = thread_id * m_strat->get_output_rows();
378            start_output_i < output_height;
379            start_output_i += n_threads * m_strat->get_output_rows())
380       {
381         // Determine what (if any padding) is required on the top/bottom of
382         // this row of the convolution.
383         const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
384         const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
385         const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
386         const unsigned int valid_input_rows = input_i > input_height ? 0 : input_height - input_i;
387         const unsigned int valid_output_rows = output_height - start_output_i;
388 
389         auto inptr_row = input_batch + input_i*ld_input_row;
390         auto outptr_row = output_batch + start_output_i * ld_output_row;
391 
392         // Execute the kernel
393         this->execute_kernel(
394           inptr_row, ld_input_row, ld_input_col, vl,
395           input_pad_top, valid_input_rows, padding.left, input_width,
396           weights, this->m_bias,
397           outptr_row, ld_output_row, ld_output_col, vl,
398           valid_output_rows, output_width,
399           0 /* first channel */, n_output_channels,
400           ws
401         );
402       }
403 
404       // Update the input and output pointers to account for batch
405       input_batch += ld_input_batch;
406       output_batch += ld_output_batch;
407     }
408   }
409 };
410 
411 }  // namespace depthwise
412 }  // namespace arm_conv
413