1 //===- MatmulOptimizer.h -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef POLLY_MATMULOPTIMIZER_H
10 #define POLLY_MATMULOPTIMIZER_H
11 
12 #include "isl/isl-noexceptions.h"
13 
14 namespace llvm {
15 class TargetTransformInfo;
16 }
17 
18 namespace polly {
19 class Dependences;
20 
21 /// Apply the BLIS matmul optimization pattern if possible.
22 ///
23 /// Make the loops containing the matrix multiplication be the innermost
24 /// loops and apply the BLIS matmul optimization pattern. BLIS implements
25 /// gemm as three nested loops around a macro-kernel, plus two packing
26 /// routines. The macro-kernel is implemented in terms of two additional
27 /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
28 /// (i.e., outer product) update.
29 ///
30 /// For a detailed description please see [1].
31 ///
32 /// The order of the loops defines the data reused in the BLIS implementation
33 /// of gemm ([1]). In particular, elements of the matrix B, the second
34 /// operand of matrix multiplication, are reused between iterations of the
35 /// innermost loop. To keep the reused data in cache, only elements of matrix
36 /// A, the first operand of matrix multiplication, should be evicted during
37 /// an iteration of the innermost loop. To provide such a cache replacement
38 /// policy, elements of the matrix A can, in particular, be loaded first and,
39 /// consequently, be least-recently-used.
40 ///
41 /// In our case matrices are stored in row-major order instead of
42 /// column-major order used in the BLIS implementation ([1]). It affects only
43 /// on the form of the BLIS micro kernel and the computation of its
44 /// parameters. In particular, reused elements of the matrix B are
45 /// successively multiplied by specific elements of the matrix A.
46 ///
47 /// Refs.:
48 /// [1] - Analytical Modeling is Enough for High Performance BLIS
49 /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
50 /// Technical Report, 2014
51 /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
52 ///
53 /// @see ScheduleTreeOptimizer::createMicroKernel
54 /// @see ScheduleTreeOptimizer::createMacroKernel
55 /// @see getMicroKernelParams
56 /// @see getMacroKernelParams
57 ///
58 /// TODO: Implement the packing transformation.
59 ///
60 /// @param Node The node that contains a band to be optimized. The node
61 ///             is required to successfully pass
62 ///             ScheduleTreeOptimizer::isMatrMultPattern.
63 /// @param TTI  Target Transform Info.
64 /// @param D    The dependencies.
65 ///
66 /// @returns    The transformed schedule or nullptr if the optimization
67 ///             cannot be applied.
68 isl::schedule_node
69 tryOptimizeMatMulPattern(isl::schedule_node Node,
70                          const llvm::TargetTransformInfo *TTI,
71                          const Dependences *D);
72 
73 } // namespace polly
74 #endif // POLLY_MATMULOPTIMIZER_H
75