1 //===- MatmulOptimizer.h -------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef POLLY_MATMULOPTIMIZER_H 10 #define POLLY_MATMULOPTIMIZER_H 11 12 #include "isl/isl-noexceptions.h" 13 14 namespace llvm { 15 class TargetTransformInfo; 16 } 17 18 namespace polly { 19 class Dependences; 20 21 /// Apply the BLIS matmul optimization pattern if possible. 22 /// 23 /// Make the loops containing the matrix multiplication be the innermost 24 /// loops and apply the BLIS matmul optimization pattern. BLIS implements 25 /// gemm as three nested loops around a macro-kernel, plus two packing 26 /// routines. The macro-kernel is implemented in terms of two additional 27 /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1 28 /// (i.e., outer product) update. 29 /// 30 /// For a detailed description please see [1]. 31 /// 32 /// The order of the loops defines the data reused in the BLIS implementation 33 /// of gemm ([1]). In particular, elements of the matrix B, the second 34 /// operand of matrix multiplication, are reused between iterations of the 35 /// innermost loop. To keep the reused data in cache, only elements of matrix 36 /// A, the first operand of matrix multiplication, should be evicted during 37 /// an iteration of the innermost loop. To provide such a cache replacement 38 /// policy, elements of the matrix A can, in particular, be loaded first and, 39 /// consequently, be least-recently-used. 40 /// 41 /// In our case matrices are stored in row-major order instead of 42 /// column-major order used in the BLIS implementation ([1]). It affects only 43 /// on the form of the BLIS micro kernel and the computation of its 44 /// parameters. In particular, reused elements of the matrix B are 45 /// successively multiplied by specific elements of the matrix A. 46 /// 47 /// Refs.: 48 /// [1] - Analytical Modeling is Enough for High Performance BLIS 49 /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti 50 /// Technical Report, 2014 51 /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf 52 /// 53 /// @see ScheduleTreeOptimizer::createMicroKernel 54 /// @see ScheduleTreeOptimizer::createMacroKernel 55 /// @see getMicroKernelParams 56 /// @see getMacroKernelParams 57 /// 58 /// TODO: Implement the packing transformation. 59 /// 60 /// @param Node The node that contains a band to be optimized. The node 61 /// is required to successfully pass 62 /// ScheduleTreeOptimizer::isMatrMultPattern. 63 /// @param TTI Target Transform Info. 64 /// @param D The dependencies. 65 /// 66 /// @returns The transformed schedule or nullptr if the optimization 67 /// cannot be applied. 68 isl::schedule_node 69 tryOptimizeMatMulPattern(isl::schedule_node Node, 70 const llvm::TargetTransformInfo *TTI, 71 const Dependences *D); 72 73 } // namespace polly 74 #endif // POLLY_MATMULOPTIMIZER_H 75