/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "src/core/NEON/kernels/assembly/winograd.hpp"

#include "src/core/NEON/kernels/arm_conv/addressing.hpp"

#include <algorithm>
#include <cstring>
#include <functional>
#include <limits>

namespace arm_conv {
namespace winograd {
namespace output_transform {

/* Driver class for the Winograd output transforms.
 *
 * This provides a base implementation which handles iteration over the output
 * tensor; subclasses are responsible for managing working space and executing
 * the transform on individual tiles.
 */
template <typename TIn, typename TOut=TIn>
class TransformBase : public ITransform
{
  const std::string m_name;
  const unsigned int m_output_rows, m_output_cols;
  const unsigned int m_kernel_rows, m_kernel_cols;

  protected:
  virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
  {
    return 0;
  }

  virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
  {
    // Nothing to do
  }

  virtual void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_matrix,
    const TIn *bias,
    TOut *outptr, size_t ld_out_row, size_t ld_out_col,
    TOut activation_min, TOut activation_max,
    unsigned int valid_rows, unsigned int valid_cols,
    void *working_space
  ) const = 0;

  void execute_internal(
    const ConvolutionArgs &args,
    const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
    const TIn *bias,
    TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
    void *working_space, unsigned int thread_id, unsigned int n_threads
  ) const
  {
    // Get the working space for this thread, and initialise it.
    working_space = reinterpret_cast<char *>(working_space) +
                    this->get_working_space_per_thread(args) * thread_id;
    this->initialise_thread_working_space(args, working_space);

    // Get the activation values
    auto activation_min = static_cast<TOut>(-std::numeric_limits<float>::infinity());
    auto activation_max = static_cast<TOut>(+std::numeric_limits<float>::infinity());
    switch (args.activation.type)
    {
      case arm_gemm::Activation::Type::BoundedReLU:
        activation_max = static_cast<TOut>(args.activation.param1);
        // Fall through
      case arm_gemm::Activation::Type::ReLU:
        activation_min = static_cast<TOut>(0);
        break;
      default:
        break;
    }
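    // For example, a BoundedReLU with param1 == 6.f yields the clamp range
    // [0, 6] (ReLU6), while a plain ReLU sets only the lower bound and leaves
    // activation_max at +infinity.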

    // Determine the number of tiles in a row; we use this to compute the
    // correct offset into the input data.
    const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols();
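    // This is a ceiling division: for example, 112 output columns covered by
    // 4-column tiles gives (112 + 4 - 1) / 4 = 28 tiles per row.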

    // Execute over all batches
    for (unsigned int batch = 0; batch < args.n_batches; batch++)
    {
      auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row;
      auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows();
      inptr += ld_in_batch;
      outptr += ld_out_batch;

      // Stripe rows of tiles over threads.
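      // For example, with n_threads == 2 thread 0 processes tile rows
      // 0, 2, 4, ... and thread 1 processes tile rows 1, 3, 5, ..., each tile
      // row covering get_output_rows() rows of the output tensor.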
      for (auto out_i = thread_id * this->get_output_rows();
           out_i < args.output_shape.rows;
           out_i += n_threads * this->get_output_rows())
      {
        auto inptr_tile = inptr_row;
        auto outptr_tile = outptr_row;
        inptr_row += n_threads * n_tile_cols * ld_in_row;
        outptr_row += n_threads * this->get_output_rows() * ld_out_row;

        // Iterate over all columns
        for (auto out_j = 0u; out_j < args.output_shape.cols;
             out_j += this->get_output_cols())
        {
          // Execute the tile
          this->execute_tile(
            args.n_output_channels,
            inptr_tile, ld_in_matrix,
            bias,
            outptr_tile, ld_out_row, ld_out_col,
            activation_min, activation_max,
            args.output_shape.rows - out_i,  // Number of valid rows remaining
            args.output_shape.cols - out_j,  // Number of valid columns remaining
            working_space
          );

          // Advance the pointers
          inptr_tile += ld_in_row;
          outptr_tile += this->get_output_cols() * ld_out_col;
        }
      }
    }
  }

  public:
  TransformBase(const std::string &name,
                unsigned int output_rows, unsigned int output_cols,
                unsigned int kernel_rows, unsigned int kernel_cols)
  : m_name(name),
    m_output_rows(output_rows), m_output_cols(output_cols),
    m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols)
  {
  }

  const std::string &get_name(void) const override { return m_name; }

  unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; }
  unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; }
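  // The input tile is larger than the output tile by (kernel size - 1) in each
  // dimension; for example, a transform producing 4x4 output tiles for a 3x3
  // kernel consumes 6x6 input tiles (the F(4x4, 3x3) Winograd configuration).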

  unsigned int get_output_rows(void) const override final { return m_output_rows; }
  unsigned int get_output_cols(void) const override final { return m_output_cols; }

  unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; }
  unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; }

  size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
  {
    return n_threads * this->get_working_space_per_thread(args);
  }

  void execute(
    const ConvolutionArgs &args,
    const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
    const void *bias,
    void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
    void *working_space, unsigned int thread_id, unsigned int n_threads
  ) const override
  {
    execute_internal(
      args,
      reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_matrix, ld_in_row,
      reinterpret_cast<const TIn *>(bias),
      reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_row, ld_out_col,
      working_space, thread_id, n_threads
    );
  }
};

template <typename TIn, typename TOut=TIn>
class TransformUnpadded : public TransformBase<TIn, TOut>
{
  using Kernel = std::function<void(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_matrix,
    const TIn *bias,
    TOut *outptr, size_t ld_out_row, size_t ld_out_col,
    TOut activation_min, TOut activation_max
  )>;
  const Kernel m_kernel;

  protected:
  size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
  {
    // We create a buffer the size of the output tile
    const auto n_output_points = this->get_output_rows() * this->get_output_cols();
    return sizeof(TOut) * n_output_points * args.n_output_channels;
  }
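  // For example, a 4x4 output tile with 64 float output channels requires
  // 4 * 4 * 64 * sizeof(float) = 4096 bytes of working space per thread.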

  void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_matrix,
    const TIn *bias,
    TOut *outptr, size_t ld_out_row, size_t ld_out_col,
    TOut activation_min, TOut activation_max,
    unsigned int valid_rows, unsigned int valid_cols,
    void *working_space
  ) const override final
  {
    // Get copies of the output tensor parameters
    auto kernel_outptr = outptr;
    auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col;

    // If the tile extends beyond the bottom or right edge of the output
    // tensor, then we execute the kernel into the working-space buffer and
    // afterwards copy only the valid portion into the output tensor.
    if (valid_rows < this->get_output_rows() ||
        valid_cols < this->get_output_cols())
    {
      // Override the kernel output parameters
      kernel_outptr = reinterpret_cast<TOut *>(working_space);
      kernel_ld_out_col = n_channels;
      kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols();
    }

    // Execute the kernel
    m_kernel(
      n_channels,
      inptr, ld_in_matrix,
      bias,
      kernel_outptr, kernel_ld_out_row, kernel_ld_out_col,
      activation_min, activation_max
    );

    // If necessary, copy from the working space into the destination tensor.
    if (valid_rows < this->get_output_rows() ||
        valid_cols < this->get_output_cols())
    {
      const auto last_row = std::min(valid_rows, this->get_output_rows());
      const auto last_col = std::min(valid_cols, this->get_output_cols());

      for (auto i = 0u; i < last_row; i++)
      {
        auto patch_tile = kernel_outptr;
        auto out_tile = outptr;
        kernel_outptr += kernel_ld_out_row;
        outptr += ld_out_row;

        for (auto j = 0u; j < last_col; j++)
        {
          memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels);
          patch_tile += kernel_ld_out_col;
          out_tile += ld_out_col;
        }
      }
    }
  }

  public:
  TransformUnpadded(const std::string &name,
                    unsigned int output_rows, unsigned int output_cols,
                    unsigned int kernel_rows, unsigned int kernel_cols,
                    const Kernel kernel)
  : TransformBase<TIn, TOut>(name, output_rows, output_cols, kernel_rows, kernel_cols),
    m_kernel(kernel)
  {
  }

  /* Utility method to get a transposed variant of a kernel; the transposed
   * version simply calls the original kernel with the output row and column
   * strides swapped.
   */
  static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
  {
    return [kernel] (
      const unsigned int n_channels,
      const TIn *const inptr, const size_t ld_in_matrix,
      const TIn *const bias,
      TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col,
      const TOut activation_min, const TOut activation_max
    ) {
      kernel(n_channels, inptr, ld_in_matrix, bias,
             outptr, ld_out_col, ld_out_row,
             activation_min, activation_max);
    };
  }
};
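
/* Illustrative sketch (not part of the library): a concrete output transform
 * is typically built by pairing TransformUnpadded with an architecture-specific
 * kernel of the matching signature. The kernel name and tile sizes below are
 * hypothetical and only show the intended wiring.
 *
 *   // Hypothetical kernel producing 4x4 output tiles for a 3x3 convolution.
 *   void example_output_transform_fp32_4x4_3x3(
 *     unsigned int n_channels,
 *     const float *inptr, size_t ld_in_matrix,
 *     const float *bias,
 *     float *outptr, size_t ld_out_row, size_t ld_out_col,
 *     float activation_min, float activation_max
 *   );
 *
 *   // F(4x4, 3x3): 4x4 output tile, 3x3 kernel, hence 6x6 input tiles.
 *   const TransformUnpadded<float> transform(
 *     "example_output_transform_fp32_4x4_3x3",
 *     4, 4,  // output tile rows, cols
 *     3, 3,  // kernel rows, cols
 *     example_output_transform_fp32_4x4_3x3
 *   );
 *
 *   // The same kernel can also be reused with its output row and column
 *   // strides swapped, e.g. to derive a transposed tile shape:
 *   const auto transposed_kernel =
 *     TransformUnpadded<float>::get_transposed_kernel(
 *       example_output_transform_fp32_4x4_3x3);
 */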

}  // namespace output_transform
}  // namespace winograd
}  // namespace arm_conv