/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMISSION_UTILS_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMISSION_UTILS_H_

#include <array>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Value.h"
#include "tensorflow/compiler/xla/mlir_hlo/include/mlir-hlo/Dialect/lhlo/IR/lhlo_ops.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {
namespace gpu {

// Returns true if `dot` is a matrix multiplication, i.e. a dot instruction as
// it appears before the GemmRewriter pass has rewritten it.
//
// This function should never return "true" on instructions after the
// GemmRewriter pass has finished.
bool IsMatrixMultiplication(const HloInstruction& dot);

inline constexpr int64_t WarpSize() { return 32; }

// Need at least 1024 threads/block for reasonable tree reduction
// performance (assuming all data fits).
inline constexpr int64_t MinThreadsXRowReduction() { return 1024; }

// When doing a batched row reduction, how big the batch dimension can be while
// the reduction remains race free.
inline constexpr int64_t BatchedReductionRaceFreeBound() { return 8; }

// Returns true if `hlo` will be implemented as a call to a cuSolver routine.
//
// This returns true if `hlo` is a CustomCall HLO with a call target equal to
// one of the kCusolver... constants, but returns *false* for HLOs with,
// say, a kCholesky opcode.
bool IsCustomCallToCusolver(const HloInstruction& hlo);

// Cholesky decomposition. Takes a (batched) matrix as input, and returns a
// tuple of (result, workspace, info), where result is the result of the
// Cholesky decomposition, workspace is scratch space for cuSolver, and info
// is a success/failure code per batch element.
extern const char* const kCusolverCholeskyCallTarget;

// Returns true if either the dimensions being reduced or the dimensions being
// kept are contiguous in the input of the reduce instruction.
bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce);

// MLIR variant.
bool IsReductionFromOrToContiguousDimensions(mlir::Operation* op);

// Returns whether unnested_hlo is an input fusion whose root is either a slice
// or a tuple of slices. If verify_no_strides is true, returns false unless all
// ROOT slices have no strides.
bool IsInputFusibleSlices(mlir::Operation* unnested_hlo,
                          bool verify_no_strides);

struct ReductionDimensions {
  // Indicates whether the reduction is a row reduction or a column reduction.
  bool is_row_reduction;

  // Contains the size of the three contiguous components for
  // the reduction [depth, height, width] (major-to-minor ordering).
  //
  // For row reduction, we do: [D, H, W] -> [D, H].
  // For column reduction, we do: [D, H, W] -> [D, W].
  std::array<int64_t, 3> dimensions;
};
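//
// Illustrative sketch (not part of the original header): reading the struct.
// `reduce` stands for an HloInstruction that has already passed
// IsReductionFromOrToContiguousDimensions.
//
//   ReductionDimensions rd = GetReductionKindAndContiguousComponents(reduce);
//   int64_t reduced_size = rd.is_row_reduction ? rd.dimensions[2]   // W
//                                              : rd.dimensions[1];  // H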

// Given the input shape and dimensions to reduce for a reduction, returns
// ReductionDimensions.
//
// Prerequisite: the reduction instruction passes the check
// IsReductionFromOrToContiguousDimensions, which guarantees either the
// dimensions to reduce or the dimensions to keep are consecutive.
ReductionDimensions GetReductionKindAndContiguousComponents(
    const HloInstruction& reduce);
ReductionDimensions GetReductionKindAndContiguousComponents(
    mlir::Operation* reduce);

// Get tiling per thread for the given reduction in dimensions [D, H, W].
std::array<int64_t, 3> GetReductionTiling(
    const ReductionDimensions& reduction_dimensions,
    se::CudaComputeCapability cuda_compute_capability);

// Emits a call to "vprintf" with the given format and arguments.
llvm::Value* EmitPrintf(absl::string_view fmt,
                        absl::Span<llvm::Value* const> arguments,
                        llvm::IRBuilder<>* builder);
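//
// Illustrative sketch (not part of the original header): emitting a debug
// print from device code. `b` stands for an llvm::IRBuilder<>* positioned
// inside the kernel and `value` for an f32 llvm::Value*; vprintf's "%f"
// expects a double, hence the extension.
//
//   EmitPrintf("value = %f\n", {b->CreateFPExt(value, b->getDoubleTy())}, b);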

// Emits code to shuffle data between threads of a warp. This has the same
// semantics as the PTX "shfl.sync.down" instruction but works for values that
// aren't 32 bits in size. The last operand of the emitted "shfl" is
// `WarpSize() - 1`.
//
// This function emits a "full-warp" shuffle, which all threads of a warp
// participate in.  *Do not use this function from a divergent context:* You
// can't correctly do so on both Volta and earlier GPUs.
//
// https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync
llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset,
                                     llvm::IRBuilder<>* builder);
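//
// Illustrative sketch (not part of the original header): a full-warp tree
// reduction built on this helper. `partial` stands for an llvm::Value* holding
// this thread's partial result and `b` for an llvm::IRBuilder<>*; the combine
// step depends on the reduction being emitted.
//
//   for (int distance = WarpSize() / 2; distance >= 1; distance /= 2) {
//     llvm::Value* shuffled =
//         EmitFullWarpShuffleDown(partial, b->getInt32(distance), b);
//     partial = /* emit the reducer applied to `partial` and `shuffled` */;
//   }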

// Emits code that determines whether the current thread is thread 0 within
// block 0 of the kernel.
llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b);

// Returns whether the output `inst` of a fusion containing reductions is
// consistent with `first_reduce`.
bool IsFusedReductionOutputConsistent(const HloInstruction* inst,
                                      const HloInstruction* first_reduce);
inline bool AreFusedReductionOutputsConsistent(
    absl::Span<const HloInstruction* const> output_instructions,
    const HloInstruction* first_reduce) {
  return absl::c_all_of(output_instructions, [=](const HloInstruction* inst) {
    return IsFusedReductionOutputConsistent(inst, first_reduce);
  });
}

inline std::string MlirToString(mlir::Operation* op) {
  std::string s;
  {
    llvm::raw_string_ostream os(s);
    op->print(os);
  }
  return s;
}

inline std::string MlirToString(const mlir::Location& loc) {
  std::string s;
  {
    llvm::raw_string_ostream os(s);
    loc.print(os);
  }
  return s;
}

int PartitionLmhloOperandsAndOutputs(mlir::Operation* op);
std::vector<mlir::Value> GetHloOperands(mlir::Operation* op);
std::vector<mlir::Value> GetHloOutputs(mlir::Operation* op);

bool WritesMlirBuffer(mlir::Operation* op, mlir::Value operand);

template <typename T>
std::vector<T> ToStdVector(const llvm::SmallVectorImpl<T>& v) {
  return std::vector<T>(v.begin(), v.end());
}

StatusOr<BufferAllocation::Slice> GetAllocationSlice(
    mlir::Value v, absl::Span<const BufferAllocation> allocations,
    std::string* constant_name = nullptr);

bool CanEmitFusedDynamicUpdateSliceInPlaceForGpu(
    mlir::lmhlo::FusionOp fusion,
    absl::Span<const BufferAllocation> allocations);

Shape GetShape(mlir::Value value);

// Returns whether the given reduction can be safely generated without atomics:
// that is, at most one block writes to any given output element.
bool ReductionIsRaceFree(const ReductionDimensions& reduction_dimensions,
                         const std::array<int64_t, 3>& reduction_tiling);
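//
// Illustrative sketch (not part of the original header): deciding whether a
// reduction needs atomics. `reduce` stands for an HloInstruction that has
// already passed IsReductionFromOrToContiguousDimensions and `cc` for an
// se::CudaComputeCapability.
//
//   ReductionDimensions rd = GetReductionKindAndContiguousComponents(reduce);
//   std::array<int64_t, 3> tiling = GetReductionTiling(rd, cc);
//   bool needs_atomics = !ReductionIsRaceFree(rd, tiling);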

// Description of how to emit a given transposition.
//
// For a group of input parameters that are a 0-2-1 transpose of the outputs
// of a fusion kernel, stores the input parameters that are safe for the
// shared-memory transpose implementation, along with the dimension
// permutation.
//
// When a tile-based shared-memory transpose is used to implement an input
// with a 0-2-1 transpose, we preload a tile of the input elements
// [z, y..y+31, x..x+31] to compute the output tile elements of the same
// indices. Preloading the input tile this way is only safe when the
// computation of the output tile elements does not need any input element
// outside the preloaded tile. We inspect all the transitive users of the
// input parameter up to the fusion root instruction to see if we can find any
// instruction that can make preloading the input tile unsafe.
struct TransposeDimsAndParams {
  // Permutation of the dimensions relative to output.
  Vector3 dims;

  // Indices of parameters which are permuted.
  std::vector<int64_t> params;

  std::string ToString() const {
    return absl::StrFormat("{dims={%s}, params={%s}}",
                           absl::StrJoin(dims, ", "),
                           absl::StrJoin(params, ", "));
  }
};

// Attempts to match a 0-2-1 transpose on the given fusion and returns a
// description of the transposition.
//
// Precondition: the input is a fused computation whose root is a kCopy.
std::optional<TransposeDimsAndParams> Match021Transpose(
    const HloComputation* fused_computation);
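//
// Illustrative sketch (not part of the original header): querying the match
// for a fusion instruction `fusion` whose fused computation satisfies the
// precondition above.
//
//   if (std::optional<TransposeDimsAndParams> desc =
//           Match021Transpose(fusion->fused_instructions_computation())) {
//     VLOG(3) << "0-2-1 transpose matched: " << desc->ToString();
//   }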

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMISSION_UTILS_H_