1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
17 #define TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
18
19 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
20
21 #include <unordered_map>
22
23 #include "absl/strings/str_cat.h"
24 #include "absl/types/span.h"
25 #include "tensorflow/compiler/xla/stream_executor/lazy_op_runner.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/lib/core/status.h"
28 #include "tensorflow/core/lib/strings/str_util.h"
29 #include "tensorflow/core/lib/strings/strcat.h"
30 #include "tensorflow/core/lib/strings/stringprintf.h"
31 #include "tensorflow/core/platform/logging.h"
32 #include "tensorflow/core/platform/stream_executor.h"
33 #include "tensorflow/stream_executor/dnn.h"
34
35 namespace stream_executor {
36 class RedzoneAllocator;
37 } // namespace stream_executor
38
39 namespace tensorflow {
40
41 class NodeDef;
42 class AutotuneResult;
43
44 template <typename T>
AsDeviceMemory(const T * gpu_memory)45 se::DeviceMemory<T> AsDeviceMemory(const T* gpu_memory) {
46 se::DeviceMemoryBase wrapped(const_cast<T*>(gpu_memory));
47 se::DeviceMemory<T> typed(wrapped);
48 return typed;
49 }
50
51 // Return whether the redzone check is disabled.
52 //
53 // Controlled by the TF_DISABLE_RZ_CHECK environment variable.
54 bool RedzoneCheckDisabled();
55
56 // Return an allocated buffer with redzones the size of `buffer`. Does
57 // *not* copy the contents of the `buffer` into the newly allocated buffer:
58 // assumes that buffer is a pure out-parameter.
59 //
60 // Returns `buffer` if RedzoneCheckDisabled() is true.
61 //
62 // On error, return `buffer`, and log an error message (once).
63 se::DeviceMemoryBase WrapRedzoneBestEffort(se::RedzoneAllocator* rz_allocator,
64 se::DeviceMemoryBase buffer);
65
66 // Check the passed allocator for redzone violations.
67 // If violations have occurred, mark the corresponding autotune result
68 // as a failure.
69 void CheckRedzones(const se::RedzoneAllocator& rz_allocator,
70 AutotuneResult* autotune_result);
71
72 template <typename T>
AsDeviceMemory(const T * cuda_memory,uint64 size)73 inline se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
74 se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
75 se::DeviceMemory<T> typed(wrapped);
76 return typed;
77 }
78
79 // Returns whether cuBLASLt is enabled.
80 //
81 // Controlled by the TF_USE_CUBLASLT environment variable.
82 bool EnableCublasLtGemm();
83
namespace internal {

// Hash functor that delegates to Parameters::hash(), allowing autotune
// parameter types to serve as keys of std::unordered_map.
template <typename Parameters>
struct AutotuneMapHasher {
  std::size_t operator()(const Parameters& params) const {
    return params.hash();
  }
};

}  // namespace internal
94
95 // A helper class that looks up the best autotuned config from parameters.
96 // Due to the noisy nature of autotune, especially with multiple devices, it
97 // only accepts a config if its margin exceeds a threshold.
98 // For the same shape configs, if a new best config matches the previous best,
99 // they get promoted; otherwise, the winner gets demoted. This process stops
100 // when the winner's score exceeds the threshold.
101 // In a bad case when two configs are very close to each other and flips
102 // back and forth randomly, the expected number of experiments before autotune
103 // settles is O(threshold ^ 2). So we recommend that number of warmup runs
104 // for any benchmarks.
template <typename Parameters, typename Config,
          typename Hasher = internal::AutotuneMapHasher<Parameters>>
class AutotuneMap {
 public:
  // Returns true and fills `*config` iff a winner has been accepted for
  // `params`: either its score reached min_score_threshold_, or it has
  // already been autotuned more than max_autotune_count_ times. Returns
  // false (entry absent or still contested), telling the caller to keep
  // autotuning.
  bool Find(const Parameters& params, Config* config) const {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    if (iter == params_config_map_.end() ||
        (iter->second.score < min_score_threshold_ &&
         iter->second.count <= max_autotune_count_)) {
      return false;
    }
    *config = iter->second.config;
    return true;
  }

  // Records one autotune outcome for `params`. A config equal to the stored
  // winner promotes it (score + 1); a different config demotes the winner
  // (score - 1) and erases the entry when the score drops to zero, so a
  // consistently better config can take over. Once the total number of
  // Insert calls reaches max_autotune_global_count_, the current state is
  // force-accepted to terminate the experiment.
  void Insert(const Parameters& params, const Config& config) {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    int new_score = 0;
    if (iter == params_config_map_.end()) {
      // Create a new entry if params is new.
      VLOG(1) << GetActionSummary("creates", params, config);
      params_config_map_.insert(
          std::make_pair(params, ValueType{config, 1, 1}));
      new_score = 1;
    } else if (iter->second.score < min_score_threshold_ &&
               iter->second.count <= max_autotune_count_) {
      DCHECK_GT(iter->second.score, 0);
      if (iter->second.config != config) {
        // If it is different from the current winner, demotes the winner.
        VLOG(1) << GetActionSummary("demotes", params, config);
        new_score = --iter->second.score;
        ++iter->second.count;
        if (new_score <= 0) {
          VLOG(1) << GetActionSummary("erases", params, config);
          params_config_map_.erase(iter);
        }
      } else {
        // If it is the same as the current winner, promotes the winner.
        VLOG(1) << GetActionSummary("promotes", params, config);
        new_score = ++iter->second.score;
        ++iter->second.count;
      }
    }
    if (new_score >= min_score_threshold_) {
      VLOG(1) << GetActionSummary("accepts", params, config);
    } else if (autotune_global_count_ >= max_autotune_global_count_) {
      // The autotuning exceeded the max iteration threshold: accept the
      // stored winner if one exists; otherwise accept the config passed to
      // this call. Either way the score is bumped straight to the
      // acceptance threshold so Find() succeeds from now on.
      auto winner = params_config_map_.find(params);
      if (winner == params_config_map_.end()) {
        VLOG(1) << GetActionSummary("creates", params, config);
        for (int i = 0; i < min_score_threshold_; ++i) {
          VLOG(1) << GetActionSummary("promotes", params, config);
        }
        params_config_map_.insert(
            std::make_pair(params, ValueType{config, min_score_threshold_, 1}));
      } else {
        int promotes_times = min_score_threshold_ - winner->second.score;
        for (int i = 0; i < promotes_times; ++i) {
          VLOG(1) << GetActionSummary("promotes", params, config);
        }
        winner->second.score = min_score_threshold_;
      }
      VLOG(1) << GetActionSummary("accepts", params, config);
    }
    autotune_global_count_++;
  }

  // Returns a snapshot mapping each parameter set to its currently stored
  // config, regardless of whether that config has been accepted yet.
  std::unordered_map<Parameters, Config, Hasher> GetMap() const {
    mutex_lock lock(mu_);
    std::unordered_map<Parameters, Config, Hasher> map;
    for (const auto& entry : params_config_map_) {
      map.insert(std::make_pair(entry.first, entry.second.config));
    }
    return map;
  }

  // Only for testing
  void ClearMap() {
    mutex_lock lock(mu_);
    params_config_map_.clear();
  }

 private:
  // Underlying data structure of values in the map.
  struct ValueType {
    Config config;  // Current winning config for this parameter set.
    int32 score;    // Confidence; accepted once >= min_score_threshold_.
    int32 count;    // Number of autotune results recorded for this entry.
  };
  // Private: instances are created only through AutotuneSingleton (friend
  // below). `name` is used solely for log messages. Thresholds are derived
  // from two environment variables:
  //   TF_AUTOTUNE_THRESHOLD            - min score to accept (clamped >= 1).
  //   TF_AUTOTUNE_MIN_WARMUP_ITERATIONS - lower bound on per-entry autotune
  //                                       runs (default 10).
  AutotuneMap(const std::string& name) : name_(name) {
    min_score_threshold_ = 1;
    int min_warmup_iterations = 10;
    const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
    if (threshold_str != nullptr) {
      VLOG(1) << "TF_AUTOTUNE_THRESHOLD = " << threshold_str;
      strings::safe_strto32(threshold_str, &min_score_threshold_);
    }
    const char* min_warmup_iteration_str =
        getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS");
    if (min_warmup_iteration_str != nullptr) {
      strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations);
    }
    min_score_threshold_ = std::max(min_score_threshold_, 1);
    // Per the class comment, a contested pair of configs needs
    // O(threshold^2) trials to settle; budget 5x that (or the warmup floor).
    max_autotune_count_ = std::max(
        5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
    max_autotune_global_count_ = 2 * max_autotune_count_;
    autotune_global_count_ = 0;
  }

  template <class Group, class Params, class Cfg, class Hash>
  friend class AutotuneSingleton;

  // Formats a one-line log message describing an autotune state transition.
  std::string GetActionSummary(StringPiece action, const Parameters& params,
                               const Config& config) {
    return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
                           string(action).c_str(), params.ToString().c_str(),
                           config.ToString().c_str());
  }

  mutable mutex mu_;

  std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
      TF_GUARDED_BY(mu_);
  std::string name_;
  int32 min_score_threshold_;
  int32 max_autotune_count_;
  int32 max_autotune_global_count_;
  int32 autotune_global_count_;

  TF_DISALLOW_COPY_AND_ASSIGN(AutotuneMap);
};
239
240 // A Singleton helper that manages the global autotune results by groups.
241 // The caller specified arbitrary Group type that can distinguish between
242 // different autotune results, even if their Parameters and Configs are the
243 // same.
244 template <class Group, typename Parameters, typename Config,
245 typename Hasher = internal::AutotuneMapHasher<Parameters>>
246 class AutotuneSingleton {
247 public:
248 typedef AutotuneMap<Parameters, Config, Hasher> AutotuneType;
GetInstance()249 static AutotuneType* GetInstance() {
250 static AutotuneType* instance = new AutotuneType(Group::name());
251 return instance;
252 }
253 };
254
255 // Logs convolution results to customized back-storage.
256 void LogConvAutotuneResults(se::dnn::ConvolutionKind kind,
257 se::dnn::DataType element_type,
258 se::DeviceMemoryBase input_buffer,
259 se::DeviceMemoryBase filter_buffer,
260 se::DeviceMemoryBase output_buffer,
261 const se::dnn::BatchDescriptor& input_desc,
262 const se::dnn::FilterDescriptor& filter_desc,
263 const se::dnn::BatchDescriptor& output_desc,
264 const se::dnn::ConvolutionDescriptor& conv_desc,
265 se::StreamExecutor* stream_exec,
266 absl::Span<const AutotuneResult> results);
267
268 // Logs fused convolution results to customized back-storage.
269 void LogFusedConvForwardAutotuneResults(
270 se::dnn::DataType element_type, se::DeviceMemoryBase input_buffer,
271 se::DeviceMemoryBase filter_buffer, se::DeviceMemoryBase output_buffer,
272 se::DeviceMemoryBase bias_buffer, se::DeviceMemoryBase side_input_buffer,
273 const se::dnn::BatchDescriptor& input_desc,
274 const se::dnn::FilterDescriptor& filter_desc,
275 const se::dnn::BatchDescriptor& output_desc,
276 const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale,
277 double side_value_scale, se::dnn::ActivationMode activation_mode,
278 se::StreamExecutor* stream_exec, absl::Span<const AutotuneResult> results);
279
280 // Autotuning map entry for cuDNN-frontend-capable APIs.
281 //
282 // The longer-term intent is to remove the AlgorithmConfig variant and make this
283 // contain only the two LazyOpRunners, but for the time being ROCm is stuck on
284 // the legacy API and requires an AlgorithmConfig.
template <typename Op>
class AutotuneEntry {
 public:
  // Default-constructs in the legacy-variant state holding an empty
  // AlgorithmConfig.
  AutotuneEntry() : is_algorithm_config_(true) {}

  // Initialize with legacy-API AlgorithmConfig; used for the ROCm backend only.
  explicit AutotuneEntry(se::dnn::AlgorithmConfig config)
      : is_algorithm_config_(true), algorithm_config_(std::move(config)) {}

  // Initialize with a primary lazy runner plus an optional (nullable)
  // fallback runner that requires no scratch space.
  AutotuneEntry(std::shared_ptr<se::dnn::LazyOpRunner<Op>> primary,
                std::shared_ptr<se::dnn::LazyOpRunner<Op>> no_scratch_fallback)
      : is_algorithm_config_(false),
        op_runners_{std::move(primary), std::move(no_scratch_fallback)} {}

  // Initialize from config data, without pre-cached runners, such as when
  // loading AoT autotuning maps.
  AutotuneEntry(se::dnn::AlgorithmDesc primary,
                absl::optional<se::dnn::AlgorithmDesc> no_scratch_fallback)
      : AutotuneEntry(std::make_shared<se::dnn::LazyOpRunner<Op>>(primary),
                      no_scratch_fallback
                          ? std::make_shared<se::dnn::LazyOpRunner<Op>>(
                                *no_scratch_fallback)
                          : nullptr) {}

  // Initialize with pre-cached OpRunners, such as during autotuning.
  // `no_cache_fallback` may be null; on success the runners are wrapped in
  // LazyOpRunners and stored in the returned entry.
  static StatusOr<AutotuneEntry> FromOpRunners(
      std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>> primary,
      std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>>
          no_cache_fallback) {
    TF_ASSIGN_OR_RETURN(
        auto primary_cache,
        se::dnn::LazyOpRunner<Op>::FromOpRunner(std::move(primary)));

    if (no_cache_fallback) {
      TF_ASSIGN_OR_RETURN(auto fallback_cache,
                          se::dnn::LazyOpRunner<Op>::FromOpRunner(
                              std::move(no_cache_fallback)));
      return AutotuneEntry(std::move(primary_cache), std::move(fallback_cache));

    } else {
      return AutotuneEntry(std::move(primary_cache), nullptr);
    }
  }

  // Pair of runners making up the cuDNN-frontend variant of an entry.
  struct OpRunners {
    OpRunners() = default;

    OpRunners(std::shared_ptr<se::dnn::LazyOpRunner<Op>> primary_,
              std::shared_ptr<se::dnn::LazyOpRunner<Op>> no_scratch_fallback_)
        : primary(std::move(primary_)),
          no_scratch_fallback(std::move(no_scratch_fallback_)) {}

    // Null iff this 'OpRunners' is default-constructed as part of the
    // fake-variant in AutotuneEntry; users outside gpu_utils.h itself should
    // never see primary = nullptr.
    std::shared_ptr<se::dnn::LazyOpRunner<Op>> primary;
    std::shared_ptr<se::dnn::LazyOpRunner<Op>> no_scratch_fallback;  // Nullable

    // Equal iff the primaries compare equal and the fallbacks are either
    // both absent or both present and equal.
    bool operator==(const OpRunners& other) const {
      return *primary == *other.primary &&
             ((!no_scratch_fallback && !other.no_scratch_fallback) ||
              (no_scratch_fallback && other.no_scratch_fallback &&
               *no_scratch_fallback == *other.no_scratch_fallback));
    }
  };

  // True when this entry holds a legacy AlgorithmConfig rather than runners.
  bool is_algorithm_config() const { return is_algorithm_config_; }

  // Requires is_algorithm_config(); see the fake-variant note below.
  const se::dnn::AlgorithmConfig& GetAlgorithmConfig() const {
    DCHECK(is_algorithm_config_);
    return algorithm_config_;
  }

  // Requires !is_algorithm_config(); see the fake-variant note below.
  const OpRunners& GetOpRunners() const {
    DCHECK(!is_algorithm_config_);
    return op_runners_;
  }

  // AutotuneMap needs to test equality to keep track of the number of times an
  // algorithm has won autotuning; for this purpose, we can use ToString to
  // determine whether runners are equal.
  bool operator==(const AutotuneEntry<Op>& other) const {
    if (is_algorithm_config_) {
      return other.is_algorithm_config_ &&
             algorithm_config_ == other.algorithm_config_;
    }

    return !other.is_algorithm_config_ && op_runners_ == other.op_runners_;
  }

  bool operator!=(const AutotuneEntry<Op>& other) const {
    return !(*this == other);
  }

  // Human-readable form of whichever variant is active; used for logging and
  // by AutotuneMap's equality-by-string bookkeeping.
  std::string ToString() const {
    if (is_algorithm_config_) {
      return algorithm_config_.ToString();
    }
    return absl::StrCat("{", op_runners_.primary->ToString(), ", ",
                        (op_runners_.no_scratch_fallback
                             ? op_runners_.no_scratch_fallback->ToString()
                             : "(op_runners have no fallback)"),
                        "}");
  }

 private:
  // NVCC is broken, so we can't use absl::variant here. Just fake it with a
  // bool and both fields.
  bool is_algorithm_config_;
  se::dnn::AlgorithmConfig algorithm_config_;
  OpRunners op_runners_;
};
397
398 namespace internal {
399 StatusOr<std::tuple<int, int>> BestCudnnConvAlgorithmIndices(
400 absl::Span<const AutotuneResult> results);
401 } // namespace internal
402
// Returns the best algorithms for the config: one is the fastest overall, the
// other is the fastest among those requiring zero scratch space. Unsuccessful
// autotuning results are allowed and ignored.
406 StatusOr<se::dnn::AlgorithmConfig> BestCudnnConvAlgorithm(
407 absl::Span<const AutotuneResult> results);
408
409 // Explicitly-instantiated with ConvOp and FusedConvOp.
410 //
411 // The definition can't be in the header because including .pb.h files in
412 // headers is forbidden.
413 template <typename Op>
414 StatusOr<AutotuneEntry<Op>> BestCudnnConvAlgorithm(
415 absl::Span<const AutotuneResult> results,
416 std::vector<
417 std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>>>
418 runners);
419
420 // Get the Dnn workspace limit from the environment variable, which is in MB.
421 // Return the workspace memory limit in bytes. If no value is set, return the
422 // default value.
423 int64_t GetDnnWorkspaceLimit(const string& envvar_in_mb,
424 int64_t default_value_in_bytes);
425
426 } // namespace tensorflow
427
428 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
429
430 #endif // TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
431