/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_

#include <optional>

#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
#include "tensorflow/compiler/xla/service/gpu/cublas_cudnn.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/stream_executor/lazy_op_runner.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/dnn.h"

namespace xla {
namespace gpu {
// Structure to describe static properties of a GPU convolution.
struct GpuConvConfig {
  // Fields related to cuDNN's fused convolution are in the FusionConfig &
  // FusionParams structures. The result is thus defined as:
  //   activation(conv_result_scale * conv(x, w) +
  //       side_input_scale * side_input + broadcast(bias))
  //
  // The most common fused conv is conv forward + relu/identity, for example.
  //
  // bias_buf is a one-dimensional array whose length equals the number of
  // output features. It is broadcast to the output shape so that it can be
  // added to the final result.
  //
  // side_input_buf, if valid, must have the same shape as the output buffer.
  struct FusionConfig {
    se::dnn::ActivationMode mode;
    double side_input_scale;
  };

  PrimitiveType input_type;
  PrimitiveType output_type;
  CudnnConvKind kind;
  se::dnn::AlgorithmDesc algorithm;
  double conv_result_scale;

  se::dnn::BatchDescriptor input_descriptor;
  se::dnn::FilterDescriptor filter_descriptor;
  se::dnn::BatchDescriptor output_descriptor;
  se::dnn::ConvolutionDescriptor conv_desc;

  Shape input_shape;
  Shape filter_shape;
  Shape output_shape;
  std::optional<FusionConfig> fusion;
};
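
// Example (editor's sketch; the field values are illustrative only): a fused
// forward convolution computing relu(conv(x, w) + broadcast(bias)) with no
// side input could be described by a config along these lines:
//
//   GpuConvConfig config;
//   config.kind = CudnnConvKind::kForwardActivation;
//   config.conv_result_scale = 1.0;
//   config.fusion = GpuConvConfig::FusionConfig{
//       se::dnn::ActivationMode::kRelu, /*side_input_scale=*/0.0};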

// Implementation struct exposed for debugging and log analysis.
struct GpuConvParams {
  const GpuConvConfig* config;  // Not owned
  struct FusionParams {
    se::DeviceMemoryBase bias_buf;
    se::DeviceMemoryBase side_input_buf;  // nullable
  };

  se::DeviceMemoryBase input_buf;
  se::DeviceMemoryBase filter_buf;
  se::DeviceMemoryBase output_buf;

  std::optional<FusionParams> fusion;
};

// The XLA convolution plumbing is all dynamically-typed w.r.t. whether a
// convolution is fused (and has extra arguments) or unfused, which doesn't
// naturally play well with the typed APIs provided by StreamExecutor; rather
// than rewriting everything here, just propagate the dynamic typing to one
// more place by having either a FusedConvRunner or a ConvRunner.
class MaybeFusedConvRunner {
 public:
  MaybeFusedConvRunner() = default;

  explicit MaybeFusedConvRunner(
      std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::FusedConvOp>> runner)
      : repr_(std::move(runner)) {}

  explicit MaybeFusedConvRunner(
      std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::ConvOp>> runner)
      : repr_(std::move(runner)) {}

  explicit MaybeFusedConvRunner(const GpuConvConfig& config)
      : MaybeFusedConvRunner(
            config.kind == CudnnConvKind::kForwardActivation
                ? MaybeFusedConvRunner(
                      std::make_unique<
                          se::dnn::LazyOpRunner<se::dnn::FusedConvOp>>(
                          config.algorithm))
                : MaybeFusedConvRunner(
                      std::make_unique<se::dnn::LazyOpRunner<se::dnn::ConvOp>>(
                          config.algorithm))) {}

  se::dnn::AlgorithmDesc ToAlgorithmDesc() const {
    return std::visit(ToAlgorithmDescVisitor{}, repr_);
  }

  se::dnn::LazyOpRunner<se::dnn::ConvOp>* AsConvRunner() {
    CHECK(std::holds_alternative<
          std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::ConvOp>>>(repr_));
    return std::get<std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::ConvOp>>>(
               repr_)
        .get();
  }

  se::dnn::LazyOpRunner<se::dnn::FusedConvOp>* AsFusedConvRunner() {
    CHECK(std::holds_alternative<
          std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::FusedConvOp>>>(repr_));
    return std::get<
               std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::FusedConvOp>>>(
               repr_)
        .get();
  }

 private:
  struct ToAlgorithmDescVisitor {
    template <typename RunnerPtr>
    se::dnn::AlgorithmDesc operator()(const RunnerPtr& runner) {
      return runner->ToAlgorithmDesc();
    }

    se::dnn::AlgorithmDesc operator()(const std::monostate&) {
      CHECK(false) << "Internal error: uninitialized runner in ToAlgorithmDesc";
    }
  };

  using Repr =
      std::variant<std::monostate,  // To allow GpuConvConfig default ctor
                   std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::FusedConvOp>>,
                   std::unique_ptr<se::dnn::LazyOpRunner<se::dnn::ConvOp>>>;
  Repr repr_;
};
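
// Usage sketch (editor's illustration, not part of the original header),
// assuming a GpuConvConfig `config` obtained from GetGpuConvConfig below:
// construct the runner from the config, then dispatch on the convolution kind
// to recover the statically-typed LazyOpRunner.
//
//   MaybeFusedConvRunner runner(config);
//   if (config.kind == CudnnConvKind::kForwardActivation) {
//     se::dnn::LazyOpRunner<se::dnn::FusedConvOp>* fused =
//         runner.AsFusedConvRunner();
//     // ... hand `fused` to the fused-convolution launch path ...
//   } else {
//     se::dnn::LazyOpRunner<se::dnn::ConvOp>* unfused = runner.AsConvRunner();
//     // ... hand `unfused` to the plain convolution launch path ...
//   }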

struct RunConvOptions {
  // Nullable output-parameter pointer for profiling results.
  se::dnn::ProfileResult* profile_result = nullptr;

  // Use this runner cache (and its configured algorithm), instead of the one
  // from the instruction.
  MaybeFusedConvRunner* runner_cache;
};

// This file contains low-level routines for running cudnn convolutions.

// Calls into cudnn to run the specified convolution.
//
// We provide one overload that takes a scratch buffer and another that takes
// an allocator responsible for allocating the scratch space. In theory the
// second one shouldn't be necessary -- users of this function could just ask
// cudnn how much scratch space it needs for a particular convolution. But in
// practice, StreamExecutor does not expose such an API, and in the name of
// parsimony, perhaps it's better not to add it. Instead, the first time you
// call a convolution, you should call the version that takes a scratch
// allocator and take note of how much memory is used. The next time you call
// the same conv, you can provide an explicitly preallocated scratch buffer of
// that size, if you like.
Status RunGpuConv(const GpuConvConfig& conv_config,
                  absl::Span<const se::DeviceMemoryBase> operand_buffers,
                  se::DeviceMemoryBase result_buffer,
                  se::DeviceMemoryBase scratch_memory, se::Stream* stream,
                  RunConvOptions = {});
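
// Usage sketch (editor's illustration, not part of the original header). The
// identifiers `cudnn_call`, `operand_buffers`, `result_buffer`,
// `scratch_memory`, and `stream` are assumed to be set up by the caller:
//
//   TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(cudnn_call));
//   TF_RETURN_IF_ERROR(RunGpuConv(config, operand_buffers, result_buffer,
//                                 scratch_memory, stream));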

// Struct to describe properties of a convolution without being tied to a
// specific IR. Will be used to help build Convolution thunks from either XLA
// HLO or the LHLO GPU dialect in MLIR.
struct GpuConvDescriptor {
  CudnnConvKind kind;
  CudnnConvBackendConfig backend_config;
  Shape operand0_shape;
  Shape operand1_shape;
  Shape result_shape;
  size_t scratch_size;
  Window window;
  ConvolutionDimensionNumbers dnums;
  int64_t feature_group_count;
};

// Returns the convolution configuration given an XLA HLO instruction.
StatusOr<GpuConvConfig> GetGpuConvConfig(
    const HloCustomCallInstruction* cudnn_call);

// Returns the convolution configuration given a convolution descriptor `desc`
// and a string representation of the convolution instruction `inst_as_string`
// (for error reporting).
StatusOr<GpuConvConfig> GetGpuConvConfig(const GpuConvDescriptor& desc,
                                         absl::string_view inst_as_string);

// Implementation details exposed for debugging and log analysis.
StatusOr<GpuConvParams> GetGpuConvParams(
    const GpuConvConfig& conv_config,
    absl::Span<const se::DeviceMemoryBase> operand_buffers,
    se::DeviceMemoryBase result_buffer);

se::dnn::BatchDescriptor GetBiasDescriptor(const GpuConvConfig& config);

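// Returns the element type of the bias to use for a convolution with the
// given input element type: int8 inputs use a float bias; every other input
// type uses a bias of the same type as the input.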
inline se::dnn::DataType BiasTypeForInputType(se::dnn::DataType input_type) {
  switch (input_type) {
    default:
      return input_type;
    case se::dnn::DataType::kInt8:
      return se::dnn::DataType::kFloat;
  }
}

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_