1 /* 2 * Copyright (c) 2022 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #include "depthfirst_driver.hpp" 26 #include "interleaves/generic.hpp" 27 28 namespace arm_conv { 29 namespace depthwise { 30 31 template <typename OutputStage> 32 class IPlanarStrategy 33 { 34 public: 35 virtual ~IPlanarStrategy() = default; 36 virtual unsigned int get_output_rows(void) const = 0; 37 virtual arm_gemm::VLType get_vl_type(void) const = 0; 38 39 virtual size_t get_storage_size(const DepthwiseArgs &) const = 0; 40 virtual void pack_parameters( 41 const DepthwiseArgs &args, void *buffer, 42 const void *biases, const OutputStage &, 43 const void *weights, size_t ld_weight_col, size_t ld_weight_row 44 ) const = 0; 45 }; 46 47 48 template <typename TInput, typename TWeight, typename TOutput, typename TAccum, 49 typename OutputStage> 50 struct PlanarKernelType; 51 52 template <typename TInput, typename TWeight, typename TOutput, typename TAccum> 53 struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing> 54 { 55 using Type = std::function<void( 56 const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, 57 unsigned int pad_top, unsigned int valid_input_rows, 58 unsigned int pad_left, unsigned int valid_input_cols, 59 const TWeight *, const TAccum *, 60 TOutput **, const size_t *, const size_t *, unsigned int output_cols, 61 unsigned int start_channels, unsigned int valid_channels, 62 TAccum act_min, TAccum act_max 63 )>; 64 65 template <typename WorkspaceType> executearm_conv::depthwise::PlanarKernelType66 static inline void execute( 67 const Type fn, 68 const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, 69 unsigned int pad_top, unsigned int valid_input_rows, 70 unsigned int pad_left, unsigned int valid_input_cols, 71 const TWeight *weights, const TAccum *bias, 72 TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols, 73 unsigned int start_channel, unsigned int valid_channels, 74 const Nothing &, const WorkspaceType *ws 75 ) 76 { 77 fn( 78 inptr, ld_in_row, ld_in_col, ld_in_vl, 79 pad_top, valid_input_rows, 80 pad_left, valid_input_cols, 81 weights, bias, 82 outptrs, outlds, outvllds, output_cols, 83 start_channel, valid_channels, 84 ws->activation_min, ws->activation_max 85 ); 86 } 87 }; 88 89 template <typename TInput, typename TWeight, typename TOutput> 90 struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32> 91 { 92 using Type = std::function<void( 93 const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, 94 unsigned int pad_top, unsigned int valid_input_rows, 95 unsigned int pad_left, unsigned int valid_input_cols, 96 const TWeight *, 97 TOutput **, const size_t *, const size_t *, unsigned int output_cols, 98 unsigned int start_channel, unsigned int valid_channels, 99 const arm_gemm::Requantize32 & 100 )>; 101 102 template <typename WorkspaceType> executearm_conv::depthwise::PlanarKernelType103 static inline void execute( 104 const Type fn, 105 const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, 106 unsigned int pad_top, unsigned int valid_input_rows, 107 unsigned int pad_left, unsigned int valid_input_cols, 108 const TWeight *weights, const int32_t *, 109 TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols, 110 unsigned int first_channel, unsigned int valid_channels, 111 const arm_gemm::Requantize32 &qp, const WorkspaceType * 112 ) 113 { 114 fn( 115 inptr, ld_in_row, ld_in_col, ld_in_vl, 116 pad_top, valid_input_rows, 117 pad_left, valid_input_cols, 118 weights, 119 outptrs, outlds, outldvls, output_cols, 120 first_channel, valid_channels, 121 qp 122 ); 123 } 124 }; 125 126 127 template <typename TInput, typename TWeight=TInput, typename TOutput=TInput, 128 typename TAccum=typename DefaultTAccum<TOutput>::Type, 129 typename OutputStage=typename DefaultOutputStage<TOutput>::Type> 130 class PlanarStrategy : public IPlanarStrategy<OutputStage> 131 { 132 unsigned int m_kernel_rows, m_kernel_cols; 133 unsigned int m_stride_rows, m_stride_cols; 134 unsigned int m_output_rows; 135 arm_gemm::VLType m_vl_type; 136 137 protected: get_kernel_packing_point(const unsigned int index,unsigned int & x,unsigned int & y) const138 virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const 139 { 140 // Get the kernel point to pack at the given index; return false to 141 // indicate that this index (and all greater indices) is out of range. 142 if (m_kernel_rows * m_kernel_cols <= index) 143 return false; 144 145 y = index % m_kernel_cols; 146 x = index / m_kernel_cols; 147 return true; 148 } 149 get_kernel_packing_arguments(void) const150 virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const 151 { 152 return interleaves::PackingArguments( 153 m_kernel_rows, m_kernel_cols, sizeof(TWeight), 154 false, sizeof(TAccum), // Don't pack the bias 155 m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 TODO 156 [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool 157 { return this->get_kernel_packing_point(idx, x, y); } 158 ); 159 } 160 161 public: PlanarStrategy(unsigned int kernel_rows,unsigned int kernel_cols,unsigned int stride_rows,unsigned int stride_cols,unsigned int output_rows,arm_gemm::VLType vl_type)162 PlanarStrategy( 163 unsigned int kernel_rows, unsigned int kernel_cols, 164 unsigned int stride_rows, unsigned int stride_cols, 165 unsigned int output_rows, 166 arm_gemm::VLType vl_type 167 ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols), 168 m_stride_rows(stride_rows), m_stride_cols(stride_cols), 169 m_output_rows(output_rows), m_vl_type(vl_type) 170 { 171 } 172 get_output_rows(void) const173 unsigned int get_output_rows(void) const override { return m_output_rows; } get_vl_type(void) const174 arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; } 175 get_storage_size(const DepthwiseArgs & args) const176 size_t get_storage_size(const DepthwiseArgs &args) const override 177 { 178 return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args); 179 } 180 pack_parameters(const DepthwiseArgs & args,void * buffer,const void * biases,const OutputStage &,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const181 void pack_parameters( 182 const DepthwiseArgs &args, void *buffer, 183 const void *biases, const OutputStage &, 184 const void *weights, size_t ld_weight_col, size_t ld_weight_row 185 ) const override 186 { 187 interleaves::pack_parameters_generic( 188 this->get_kernel_packing_arguments(), args, 189 buffer, biases, weights, ld_weight_col, ld_weight_row 190 ); 191 } 192 193 using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type; 194 virtual KernelType get_kernel(void) const = 0; 195 }; 196 197 198 namespace { 199 200 template <typename T> 201 struct OutputRowPtrsElement 202 { 203 struct Workspace 204 { 205 T **output_row_ptrs; 206 size_t *output_ld_cols; 207 size_t *output_ld_vls; // Stride between vectors of channels 208 T *output_padding_buffer; 209 }; 210 211 template <typename OutputStage> get_element_sizearm_conv::depthwise::__anon21c175b90211::OutputRowPtrsElement212 static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args) 213 { 214 // We need one pointer and stride for each row of output, and an additional 215 // blob of memory into which padded stores can go. 216 return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) + 217 get_vector_length<char>(args.strategy->get_vl_type()); 218 } 219 220 template <typename WorkspaceType, typename OutputStage> initialisearm_conv::depthwise::__anon21c175b90211::OutputRowPtrsElement221 static void *initialise(WorkspaceType *ws, void *buffer, 222 const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args) 223 { 224 const auto n_rows = args.strategy->get_output_rows(); 225 ws->output_row_ptrs = reinterpret_cast<T **>(buffer); 226 ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows); 227 ws->output_ld_vls = ws->output_ld_cols + n_rows; 228 ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows); 229 return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type()); 230 } 231 }; 232 233 } // namespace {anonymous} 234 235 236 template <typename TInput, typename TWeight=TInput, typename TOutput=TInput, 237 typename TAccum=typename DefaultTAccum<TOutput>::Type, 238 typename OutputStage=typename DefaultOutputStage<TOutput>::Type> 239 class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput> 240 { 241 using Parent = DepthwiseCommon<TInput, TWeight, TOutput>; 242 using StrategyType = IPlanarStrategy<OutputStage>; 243 using WorkspaceManager = Workspace< 244 OutputRowPtrsElement<TOutput>, 245 ActivationsElement<TAccum, OutputStage> 246 >; 247 using WorkspaceType = typename WorkspaceManager::WorkspaceType; 248 249 std::unique_ptr<StrategyType> m_strat; 250 const TAccum *m_bias; 251 OutputStage m_os; 252 253 public: DepthwisePlanar(StrategyType * const strat,const DepthwiseArgs & args,const OutputStage & os={})254 DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {}) 255 : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os) 256 { 257 } 258 259 DepthwisePlanar(DepthwisePlanar &) = delete; 260 DepthwisePlanar &operator=(DepthwisePlanar &) = delete; 261 get_storage_size(void) const262 size_t get_storage_size(void) const override 263 { 264 return m_strat->get_storage_size(this->m_args); 265 } 266 pack_parameters(void * buffer,const void * biases,const void * weights,size_t ld_weight_col,size_t ld_weight_row)267 void pack_parameters( 268 void *buffer, const void *biases, 269 const void *weights, size_t ld_weight_col, size_t ld_weight_row 270 ) override 271 { 272 m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row); 273 this->m_bias = reinterpret_cast<const TAccum *>(biases); 274 depthwise_depthfirst::stash_bias(this->m_os, biases); 275 } 276 get_working_size(unsigned int n_threads,unsigned int) const277 size_t get_working_size(unsigned int n_threads, unsigned int) const override 278 { 279 return this->get_working_size_per_thread() * n_threads; 280 } 281 282 protected: 283 /* Compute the amount of working space required for a single thread. */ get_working_size_per_thread(void) const284 virtual size_t get_working_size_per_thread(void) const 285 { 286 return WorkspaceManager::get_sizeof_workspace( 287 WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)); 288 } 289 290 /* Initialise the working space for a thread. */ initialise_working_space(void * buffer) const291 virtual void initialise_working_space(void *buffer) const 292 { 293 WorkspaceManager::initialise( 294 buffer, 295 WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os) 296 ); 297 } 298 299 /* Execute the kernel for a given chunk of work. */ execute_kernel(const TInput * inptr,size_t ld_in_row,size_t ld_in_col,size_t ld_in_vl,unsigned int pad_top,unsigned int valid_input_rows,unsigned int pad_left,unsigned int valid_input_cols,const TWeight * weights,const TAccum * bias,TOutput * outptr,size_t ld_out_row,size_t ld_out_col,size_t ld_out_vl,unsigned int valid_output_rows,unsigned int valid_output_cols,unsigned int first_channel,unsigned int valid_channels,WorkspaceType * ws) const300 virtual void execute_kernel( 301 const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, 302 unsigned int pad_top, unsigned int valid_input_rows, 303 unsigned int pad_left, unsigned int valid_input_cols, 304 const TWeight *weights, const TAccum *bias, 305 TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl, 306 unsigned int valid_output_rows, unsigned int valid_output_cols, 307 unsigned int first_channel, unsigned int valid_channels, 308 WorkspaceType *ws 309 ) const 310 { 311 // Initialise the output pointers 312 for (auto i = 0u; i < m_strat->get_output_rows(); i++) 313 { 314 // Point at the output tensor for all valid rows; otherwise point at the 315 // padding buffer. 316 ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer; 317 ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0; 318 ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0; 319 outptr += ld_out_row; 320 } 321 322 // Execute the kernel 323 PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>( 324 reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(), 325 inptr, ld_in_row, ld_in_col, ld_in_vl, 326 pad_top, valid_input_rows, pad_left, valid_input_cols, 327 weights, bias, 328 ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls, 329 valid_output_cols, first_channel, valid_channels, 330 this->m_os, ws 331 ); 332 } 333 execute_internal(unsigned int batches,unsigned int input_height,unsigned int input_width,unsigned int n_input_channels,const PaddingValues & padding,const void * input,size_t ld_input_col,size_t ld_input_row,size_t ld_input_batch,const void * parameters,unsigned int output_height,unsigned int output_width,void * output,size_t ld_output_col,size_t ld_output_row,size_t ld_output_batch,void * working_space,unsigned int thread_id,unsigned int n_threads) const334 void execute_internal( 335 unsigned int batches, 336 unsigned int input_height, 337 unsigned int input_width, 338 unsigned int n_input_channels, 339 const PaddingValues &padding, 340 const void *input, 341 size_t ld_input_col, 342 size_t ld_input_row, 343 size_t ld_input_batch, 344 const void *parameters, 345 unsigned int output_height, 346 unsigned int output_width, 347 void *output, 348 size_t ld_output_col, 349 size_t ld_output_row, 350 size_t ld_output_batch, 351 void *working_space, 352 unsigned int thread_id, 353 unsigned int n_threads 354 ) const override 355 { 356 // Get and initialise the working space for this thread. 357 void *thread_working_space = 358 static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(); 359 this->initialise_working_space(thread_working_space); 360 auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space); 361 362 const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier; 363 const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type()); 364 365 // Get typed pointers 366 auto input_batch = reinterpret_cast<const TInput *>(input); 367 auto output_batch = reinterpret_cast<TOutput *>(output); 368 auto weights = reinterpret_cast<const TWeight *>(parameters); 369 370 // Iterate over batches 371 for (; batches; batches--) 372 { 373 // NOTE: Other loop orderings are possible and it would be worth 374 // investigating them. 375 376 // Within a batch, stripe threads across rows. 377 for (auto start_output_i = thread_id * m_strat->get_output_rows(); 378 start_output_i < output_height; 379 start_output_i += n_threads * m_strat->get_output_rows()) 380 { 381 // Determine what (if any padding) is required on the top/bottom of 382 // this row of the convolution. 383 const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top; 384 const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0; 385 const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i; 386 const unsigned int valid_input_rows = input_i > input_height ? 0 : input_height - input_i; 387 const unsigned int valid_output_rows = output_height - start_output_i; 388 389 auto inptr_row = input_batch + input_i*ld_input_row; 390 auto outptr_row = output_batch + start_output_i * ld_output_row; 391 392 // Execute the kernel 393 this->execute_kernel( 394 inptr_row, ld_input_row, ld_input_col, vl, 395 input_pad_top, valid_input_rows, padding.left, input_width, 396 weights, this->m_bias, 397 outptr_row, ld_output_row, ld_output_col, vl, 398 valid_output_rows, output_width, 399 0 /* first channel */, n_output_channels, 400 ws 401 ); 402 } 403 404 // Update the input and output pointers to account for batch 405 input_batch += ld_input_batch; 406 output_batch += ld_output_batch; 407 } 408 } 409 }; 410 411 } // namespace depthwise 412 } // namespace arm_conv 413