xref: /aosp_15_r20/external/ruy/ruy/pack.h (revision bb86c7ed5fb1b98a7eac808e443a46cc8b90dfc0)
1 /* Copyright 2019 Google LLC. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // # What is "packing"?
17 //
18 // Before feeding data to the gemm kernels (the parts of Ruy that do lots
19 // of multiply-add operations), Ruy first performs a data transformation (which
20 // we call "packing") on the input matrices. This transformation has two main
21 // goals:
22 // - rearrange data into blocks that are a convenient size/layout for the gemm
23 // kernels to consume. This helps make the memory access pattern of the gemm
24 // kernel simpler and more contiguous, and puts the data in a layout most
25 // convenient for specific arithmetic instructions in the gemm kernel.
26 // - compute row/column sums needed for handling quantization with non-symmetric
27 // zero points.
28 //
29 // # Simplified algorithmic analysis of packing
30 //
31 // Packing is a relatively simple transformation which does a small constant
32 // amount of work on each element of an input matrix, and hence for an NxM
33 // matrix performs O(N*M) work. If N and M are of the same order, then this is
34 // O(N^2) work.
35 //
36 // A NxKxM matrix multiplication requires N*K*M multiply-accumulate operations.
37 // Note that if N, K, and M are all the same order, then the number of
38 // multiply-accumulate operations is O(N^3).
39 //
40 // Thus, the O(N^2) cost of packing is small compared to the O(N^3) work, in the
41 // case of all dimensions being roughly the same order.
42 //
43 // # Packing cost can be significant
44 //
45 // When matrix * matrix multiplications begin to look more like matrix * vector
46 // multiplications, packing cost can become significant. We sometimes call these
47 // cases "gemv-like".
48 //
49 // Continuing the algorithmic analysis above, if we consider a case where an
50 // NxKxM matrix multiplication has either N = O(1) or M = O(1), then the
51 // situation is different. In this case, the multiply-accumulate work is only
52 // quadratic, so the quadratic cost of packing can become significant.
53 //
54 // Another way to say this is that the cost of packing an input matrix (either
55 // the LHS or RHS) is amortized across the non-depth dimension of the opposite
56 // input matrix. Thus, when the LHS has very few rows or the RHS has very few
57 // columns, the cost of packing the opposite input matrix can become
58 // significant.
59 //
60 // As a rough rule of thumb, the cost of packing starts to become significant
61 // when either N or M is below 32 (and other dimensions are hundreds), with very
62 // significant packing costs at 8 or below. This varies by data type, Path, and
63 // tuning, so these numbers are only rough guides.
64 //
65 // One practical use case that is affected by this is inference of
66 // fully connected neural network layers with a low batch size. The weight
67 // matrix (which is a constant for inference) is the one affected by significant
68 // packing cost.
69 //
70 // Ruy has an optional feature, accessed by Matrix::set_cache_policy(), to
71 // cache the packed forms of constant matrices.
72 //
73 // # Implementation notes
74 //
75 // Ruy's packing routines always operate on a range of columns and can be
76 // applied to either the LHS or RHS. This is possible because Ruy internally
77 // implements a TrMul, so the accumulation along depth is done along columns of
78 // both the LHS and RHS (whereas for a normal Mul the accumulation along depth
79 // for the LHS is along rows). As another example, we are always computing
80 // column sums for quantization (and never row sums, since the LHS is
81 // transposed).
82 
83 #ifndef RUY_RUY_PACK_H_
84 #define RUY_RUY_PACK_H_
85 
86 #include "ruy/mat.h"
87 #include "ruy/pack_common.h"
88 #include "ruy/path.h"
89 #include "ruy/platform.h"
90 #include "ruy/trace.h"
91 
92 // IWYU pragma: begin_exports
93 #if RUY_PLATFORM_NEON
94 #include "ruy/pack_arm.h"
95 #elif RUY_PLATFORM_X86
96 #include "ruy/pack_x86.h"
97 #endif
98 // IWYU pragma: end_exports
99 
100 namespace ruy {
101 
102 // General implementation of the PackImpl template, overridden by template
103 // specializations for specific SIMD code paths. This general implementation
104 // is used with Path::kStandardCpp and its internal test-only variants.
105 template <Path ThePath, typename FixedKernelLayout, typename Scalar,
106           typename PackedScalar, typename SumsType, Order SrcOrder>
107 struct PackImpl {
RunPackImpl108   static void Run(Tuning, const Mat<Scalar>& src_matrix,
109                   PMat<PackedScalar>* packed_matrix, int start_col,
110                   int end_col) {
111     profiler::ScopeLabel label("Pack (generic)");
112     RUY_DCHECK_EQ(SrcOrder, src_matrix.layout.order);
113     RUY_DCHECK_EQ((end_col - start_col) % FixedKernelLayout::kCols, 0);
114     SumsType* sums = packed_matrix->sums;
115     for (int col = start_col; col < end_col; col++) {
116       SumsType accum = 0;
117       for (int row = 0; row < packed_matrix->layout.rows; row++) {
118         PackedScalar packed_val;
119         if (col < src_matrix.layout.cols && row < src_matrix.layout.rows) {
120           packed_val = Pack<PackedScalar>(Element(src_matrix, row, col));
121         } else {
122           packed_val = packed_matrix->zero_point;
123         }
124         accum += packed_val;
125         *ElementPtr(packed_matrix, row, col) = packed_val;
126       }
127       if (sums) {
128         sums[col] = accum;
129       }
130     }
131   }
132 };
133 
134 // Main entry point for packing.
135 template <Path ThePath, typename FixedKernelLayout, typename Scalar,
136           typename PackedScalar>
RunPack(Tuning tuning,const EMat & src_matrix,PEMat * packed_matrix,int start_col,int end_col)137 void RunPack(Tuning tuning, const EMat& src_matrix, PEMat* packed_matrix,
138              int start_col, int end_col) {
139   RUY_TRACE_SCOPE;
140   using SumsType = typename PMat<PackedScalar>::SumsType;
141   Mat<Scalar> src = UneraseType<Scalar>(src_matrix);
142   PMat<PackedScalar> packed = UneraseType<PackedScalar>(*packed_matrix);
143   RUY_TRACE_INFO(RUN_PACK);
144   if (src.layout.order == Order::kColMajor) {
145     PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType,
146              Order::kColMajor>::Run(tuning, src, &packed, start_col, end_col);
147   } else {
148     PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType,
149              Order::kRowMajor>::Run(tuning, src, &packed, start_col, end_col);
150   }
151 }
152 
153 }  // namespace ruy
154 
155 #endif  // RUY_RUY_PACK_H_
156