/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_

#include <optional>

#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
#include "tensorflow/compiler/xla/service/gpu/cublas_cudnn.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/stream_executor/lazy_op_runner.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/dnn.h"

namespace xla {
namespace gpu {

// Structure to describe static properties of a GPU convolution.
struct GpuConvConfig {
  // Fields related to cuDNN's fused convolution are in the FusionConfig &
  // FusionParams structures.  The result is thus defined as:
  //   activation(conv_result_scale * conv(x, w) +
  //       side_input_scale * side_input + broadcast(bias))
  //
  // The most common fused conv is conv forward + relu/identity, for example.
  //
  // bias_buf is a single-dimensional array, with the length equal to the
  // number of output features.  It'll be broadcast to the output shape in
  // order to be added to the final results.
  //
  // side_input_buf, if valid, must have the same shape as the output buffer.
  struct FusionConfig {
    se::dnn::ActivationMode mode;
    double side_input_scale;
  };

  PrimitiveType input_type;
  PrimitiveType output_type;
  CudnnConvKind kind;
  se::dnn::AlgorithmDesc algorithm;
  double conv_result_scale;

  se::dnn::BatchDescriptor input_descriptor;
  se::dnn::FilterDescriptor filter_descriptor;
  se::dnn::BatchDescriptor output_descriptor;
  se::dnn::ConvolutionDescriptor conv_desc;

  Shape input_shape;
  Shape filter_shape;
  Shape output_shape;
  std::optional<FusionConfig> fusion;
};
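
// Example (editorial sketch, not part of the original header): for a fused
// forward-activation convolution, the fields above describe a computation of
// roughly this shape, where `x`, `w`, `bias`, and `side_input` are device
// buffers supplied at run time:
//
//   result = activation(conv_result_scale * conv(x, w) +
//                       fusion->side_input_scale * side_input +
//                       broadcast(bias));
//
// An unfused convolution leaves `fusion` unset and computes only
// conv_result_scale * conv(x, w).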

// Implementation struct exposed for debugging and log analysis.
struct GpuConvParams {
  const GpuConvConfig* config;  // Not owned
  struct FusionParams {
    se::DeviceMemoryBase bias_buf;
    se::DeviceMemoryBase side_input_buf;  // nullable
  };

  se::DeviceMemoryBase input_buf;
  se::DeviceMemoryBase filter_buf;
  se::DeviceMemoryBase output_buf;

  std::optional<FusionParams> fusion;
};

// The XLA convolution plumbing is all dynamically-typed w.r.t. whether a
// convolution is fused (and has extra arguments) or unfused, which doesn't
// naturally play well with the typed APIs provided by StreamExecutor; rather
// than rewriting everything here, just propagate the dynamic typing to one more
// place by having either a FusedConvRunner or a ConvRunner.
class MaybeFusedConvRunner {
 public:
  MaybeFusedConvRunner() = default;

  explicit MaybeFusedConvRunner(
      std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::FusedConvOp>> runner)
      : repr_(std::move(runner)) {}

  explicit MaybeFusedConvRunner(
      std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::ConvOp>> runner)
      : repr_(std::move(runner)) {}

  explicit MaybeFusedConvRunner(const GpuConvConfig& config)
      : MaybeFusedConvRunner(
            config.kind == CudnnConvKind::kForwardActivation
                ? MaybeFusedConvRunner(
                      std::make_unique<
                          se::dnn::LazyOpRunner<se::dnn::FusedConvOp>>(
                          config.algorithm))
                : MaybeFusedConvRunner(
                      std::make_unique<se::dnn::LazyOpRunner<se::dnn::ConvOp>>(
                          config.algorithm))) {}

  se::dnn::AlgorithmDesc ToAlgorithmDesc() const {
    return std::visit(ToAlgorithmDescVisitor{}, repr_);
  }

  se::dnn::LazyOpRunner<se::dnn::ConvOp>* AsConvRunner() {
    CHECK(std::holds_alternative<
          std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::ConvOp>>>(repr_));
    return std::get<std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::ConvOp>>>(
               repr_)
        .get();
  }

  se::dnn::LazyOpRunner<se::dnn::FusedConvOp>* AsFusedConvRunner() {
    CHECK(std::holds_alternative<
          std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::FusedConvOp>>>(repr_));
    return std::get<
               std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::FusedConvOp>>>(
               repr_)
        .get();
  }

 private:
  struct ToAlgorithmDescVisitor {
    template <typename RunnerPtr>
    se::dnn::AlgorithmDesc operator()(const RunnerPtr& runner) {
      return runner->ToAlgorithmDesc();
    }

    se::dnn::AlgorithmDesc operator()(const std::monostate&) {
      CHECK(false) << "Internal error: uninitialized runner in ToAlgorithmDesc";
    }
  };

  using Repr =
      std::variant<std::monostate,  // To allow default construction.
                   std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::FusedConvOp>>,
                   std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::ConvOp>>>;
  Repr repr_;
};
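
// Example usage (editorial sketch, not part of the original header): build the
// appropriately-typed lazy runner from a config, then pick the typed accessor
// based on the convolution kind.  `config` is assumed to come from
// GetGpuConvConfig below.
//
//   MaybeFusedConvRunner runner(config);
//   if (config.kind == CudnnConvKind::kForwardActivation) {
//     se::dnn::LazyOpRunner<se::dnn::FusedConvOp>* fused =
//         runner.AsFusedConvRunner();
//     // ... use the fused-convolution path.
//   } else {
//     se::dnn::LazyOpRunner<se::dnn::ConvOp>* unfused = runner.AsConvRunner();
//     // ... use the plain convolution path.
//   }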

struct RunConvOptions {
  // Nullable output-parameter pointer for profiling results.
  se::dnn::ProfileResult* profile_result = nullptr;

  // Use this runner cache (and its configured algorithm), instead of the one
  // from the instruction.
  MaybeFusedConvRunner* runner_cache;
};

// This file contains low-level routines for running cudnn convolutions.

// Calls into cudnn to run the specified convolution.
//
// We provide one overload which takes a scratch buffer, and another which takes
// an allocator which is responsible for allocating the scratch space.  In
// theory the second one shouldn't be necessary -- users of this function could
// just ask cudnn how much scratch space it needs for a particular convolution.
// But in practice, StreamExecutor does not expose such an API, and in the name
// of parsimony, perhaps it's better not to add it.  Instead, the first time you
// call a convolution, you should call the version that takes a scratch
// allocator and take note of how much memory is used.  The next time you call
// the same conv, you can provide an explicitly preallocated scratch buffer of
// that size, if you like.
Status RunGpuConv(const GpuConvConfig& conv_config,
                  absl::Span<const se::DeviceMemoryBase> operand_buffers,
                  se::DeviceMemoryBase result_buffer,
                  se::DeviceMemoryBase scratch_memory, se::Stream* stream,
                  RunConvOptions = {});
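
// Example usage (editorial sketch, not part of the original header): running a
// convolution whose config was obtained from GetGpuConvConfig below.  The
// operand buffers, result buffer, scratch buffer, and stream are assumed to be
// supplied by the surrounding thunk or autotuner.
//
//   TF_RETURN_IF_ERROR(RunGpuConv(conv_config, operand_buffers, result_buffer,
//                                 scratch_memory, stream));
//
// To profile or pin a particular algorithm, pass a RunConvOptions with
// `profile_result` and (optionally) `runner_cache` filled in.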

// Struct to describe the properties of a convolution without being tied to a
// specific IR.  Will be used to help build convolution thunks from either XLA
// HLO or the LHLO GPU dialect in MLIR.
struct GpuConvDescriptor {
  CudnnConvKind kind;
  CudnnConvBackendConfig backend_config;
  Shape operand0_shape;
  Shape operand1_shape;
  Shape result_shape;
  size_t scratch_size;
  Window window;
  ConvolutionDimensionNumbers dnums;
  int64_t feature_group_count;
};

// Returns the convolution configuration given an XLA HLO instruction.
StatusOr<GpuConvConfig> GetGpuConvConfig(
    const HloCustomCallInstruction* cudnn_call);

// Returns the convolution configuration given a convolution descriptor `desc`
// and a string representation of the convolution instruction `inst_as_string`
// (for error reporting).
StatusOr<GpuConvConfig> GetGpuConvConfig(const GpuConvDescriptor& desc,
                                         absl::string_view inst_as_string);
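
// Example usage (editorial sketch, not part of the original header): building
// a config when emitting a thunk for a cuDNN convolution custom call.
// `custom_call` is assumed to be an HloCustomCallInstruction whose target is
// one of the cuDNN convolution custom-call targets.
//
//   TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(custom_call));
//   MaybeFusedConvRunner runner(config);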

// Implementation details exposed for debugging and log analysis.
StatusOr<GpuConvParams> GetGpuConvParams(
    const GpuConvConfig& conv_config,
    absl::Span<const se::DeviceMemoryBase> operand_buffers,
    se::DeviceMemoryBase result_buffer);
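
// Example usage (editorial sketch, not part of the original header): resolving
// the per-call device buffers, e.g. for logging.  `config`, `operand_buffers`,
// and `result_buffer` are assumed to come from the caller.
//
//   TF_ASSIGN_OR_RETURN(GpuConvParams params,
//                       GetGpuConvParams(config, operand_buffers,
//                                        result_buffer));
//   VLOG(2) << "conv input buffer: " << params.input_buf.opaque();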

// Returns the cudnn batch descriptor for the bias of the fused convolution
// described by `config`.
se::dnn::BatchDescriptor GetBiasDescriptor(const GpuConvConfig& config);

// Returns the data type to use for the bias of a convolution with the given
// input type.  Int8 convolutions use a float bias; all other input types use
// the input type itself.
inline se::dnn::DataType BiasTypeForInputType(se::dnn::DataType input_type) {
  switch (input_type) {
    default:
      return input_type;
    case se::dnn::DataType::kInt8:
      return se::dnn::DataType::kFloat;
  }
}

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_