xref: /aosp_15_r20/external/tensorflow/tensorflow/core/kernels/gpu_utils.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
17 #define TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
18 
19 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
20 
21 #include <unordered_map>
22 
23 #include "absl/strings/str_cat.h"
24 #include "absl/types/span.h"
25 #include "tensorflow/compiler/xla/stream_executor/lazy_op_runner.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/lib/core/status.h"
28 #include "tensorflow/core/lib/strings/str_util.h"
29 #include "tensorflow/core/lib/strings/strcat.h"
30 #include "tensorflow/core/lib/strings/stringprintf.h"
31 #include "tensorflow/core/platform/logging.h"
32 #include "tensorflow/core/platform/stream_executor.h"
33 #include "tensorflow/stream_executor/dnn.h"
34 
35 namespace stream_executor {
36 class RedzoneAllocator;
37 }  // namespace stream_executor
38 
39 namespace tensorflow {
40 
41 class NodeDef;
42 class AutotuneResult;
43 
44 template <typename T>
AsDeviceMemory(const T * gpu_memory)45 se::DeviceMemory<T> AsDeviceMemory(const T* gpu_memory) {
46   se::DeviceMemoryBase wrapped(const_cast<T*>(gpu_memory));
47   se::DeviceMemory<T> typed(wrapped);
48   return typed;
49 }
50 
51 // Return whether the redzone check is disabled.
52 //
53 // Controlled by the TF_DISABLE_RZ_CHECK environment variable.
54 bool RedzoneCheckDisabled();
55 
56 // Return an allocated buffer with redzones the size of `buffer`. Does
57 // *not* copy the contents of the `buffer` into the newly allocated buffer:
58 // assumes that buffer is a pure out-parameter.
59 //
60 // Returns `buffer` if RedzoneCheckDisabled() is true.
61 //
62 // On error, return `buffer`, and log an error message (once).
63 se::DeviceMemoryBase WrapRedzoneBestEffort(se::RedzoneAllocator* rz_allocator,
64                                            se::DeviceMemoryBase buffer);
65 
66 // Check the passed allocator for redzone violations.
67 // If violations have occurred, mark the corresponding autotune result
68 // as a failure.
69 void CheckRedzones(const se::RedzoneAllocator& rz_allocator,
70                    AutotuneResult* autotune_result);
71 
72 template <typename T>
AsDeviceMemory(const T * cuda_memory,uint64 size)73 inline se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
74   se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
75   se::DeviceMemory<T> typed(wrapped);
76   return typed;
77 }
78 
79 // Returns whether cuBLASLt is enabled.
80 //
81 // Controlled by the TF_USE_CUBLASLT environment variable.
82 bool EnableCublasLtGemm();
83 
namespace internal {

// Hash functor that defers to the Parameters type's own hash() method, so a
// Parameters class can be used directly as an std::unordered_map key.
template <typename Parameters>
struct AutotuneMapHasher {
  std::size_t operator()(const Parameters& params) const {
    return params.hash();
  }
};

}  // namespace internal
94 
// A helper class that looks up the best autotuned config from parameters.
// Due to the noisy nature of autotune, especially with multiple devices, it
// only accepts a config if its margin exceeds a threshold.
// For the same shape configs, if a new best config matches the previous best,
// they get promoted; otherwise, the winner gets demoted. This process stops
// when the winner's score exceeds the threshold.
// In a bad case when two configs are very close to each other and flip
// back and forth randomly, the expected number of experiments before autotune
// settles is O(threshold ^ 2). So we recommend that number of warmup runs
// for any benchmarks.
//
// Thread-safe: all public methods take mu_.
template <typename Parameters, typename Config,
          typename Hasher = internal::AutotuneMapHasher<Parameters>>
class AutotuneMap {
 public:
  // Looks up the winning config for `params`. Returns true and fills
  // `*config` only if an entry exists AND either its score has reached the
  // acceptance threshold or its per-entry autotune budget is exhausted;
  // otherwise returns false, signalling the caller should keep autotuning.
  bool Find(const Parameters& params, Config* config) const {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    if (iter == params_config_map_.end() ||
        (iter->second.score < min_score_threshold_ &&
         iter->second.count <= max_autotune_count_)) {
      return false;
    }
    *config = iter->second.config;
    return true;
  }

  // Records `config` as the latest autotune result for `params`, applying
  // the promote/demote scoring scheme described in the class comment.
  void Insert(const Parameters& params, const Config& config) {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    int new_score = 0;
    if (iter == params_config_map_.end()) {
      // Create a new entry if params is new.
      VLOG(1) << GetActionSummary("creates", params, config);
      params_config_map_.insert(
          std::make_pair(params, ValueType{config, 1, 1}));
      new_score = 1;
    } else if (iter->second.score < min_score_threshold_ &&
               iter->second.count <= max_autotune_count_) {
      DCHECK_GT(iter->second.score, 0);
      if (iter->second.config != config) {
        // If it is different from the current winner, demotes the winner.
        VLOG(1) << GetActionSummary("demotes", params, config);
        new_score = --iter->second.score;
        ++iter->second.count;
        if (new_score <= 0) {
          // The previous winner lost all its score; forget it entirely so
          // the next Insert starts a fresh entry.
          VLOG(1) << GetActionSummary("erases", params, config);
          params_config_map_.erase(iter);
        }
      } else {
        // If it is the same as the current winner, promotes the winner.
        VLOG(1) << GetActionSummary("promotes", params, config);
        new_score = ++iter->second.score;
        ++iter->second.count;
      }
    }
    if (new_score >= min_score_threshold_) {
      VLOG(1) << GetActionSummary("accepts", params, config);
    } else if (autotune_global_count_ >= max_autotune_global_count_) {
      // The autotuning exceeds the max iteration threshold: accept the
      // winner if it exists in the map; otherwise accept the current config.
      // In both cases the entry's score is forced up to the threshold so
      // future Find() calls succeed.
      auto winner = params_config_map_.find(params);
      if (winner == params_config_map_.end()) {
        VLOG(1) << GetActionSummary("creates", params, config);
        for (int i = 0; i < min_score_threshold_; ++i) {
          VLOG(1) << GetActionSummary("promotes", params, config);
        }
        params_config_map_.insert(
            std::make_pair(params, ValueType{config, min_score_threshold_, 1}));
      } else {
        int promotes_times = min_score_threshold_ - winner->second.score;
        for (int i = 0; i < promotes_times; ++i) {
          VLOG(1) << GetActionSummary("promotes", params, config);
        }
        winner->second.score = min_score_threshold_;
      }
      VLOG(1) << GetActionSummary("accepts", params, config);
    }
    autotune_global_count_++;
  }

  // Returns a snapshot mapping each Parameters to its current winning
  // config; scores and counts are dropped.
  std::unordered_map<Parameters, Config, Hasher> GetMap() const {
    mutex_lock lock(mu_);
    std::unordered_map<Parameters, Config, Hasher> map;
    for (const auto& entry : params_config_map_) {
      map.insert(std::make_pair(entry.first, entry.second.config));
    }
    return map;
  }

  // Only for testing
  void ClearMap() {
    mutex_lock lock(mu_);
    params_config_map_.clear();
  }

 private:
  // Underlying data structure of values in the map.
  struct ValueType {
    Config config;
    int32 score;  // confidence in config; accepted once >= threshold
    int32 count;  // number of autotune runs observed for these params
  };
  // Constructed only via the AutotuneSingleton friend below. Reads the
  // acceptance threshold and warmup budget from environment variables.
  AutotuneMap(const std::string& name) : name_(name) {
    min_score_threshold_ = 1;
    int min_warmup_iterations = 10;
    const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
    if (threshold_str != nullptr) {
      VLOG(1) << "TF_AUTOTUNE_THRESHOLD = " << threshold_str;
      strings::safe_strto32(threshold_str, &min_score_threshold_);
    }
    const char* min_warmup_iteration_str =
        getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS");
    if (min_warmup_iteration_str != nullptr) {
      strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations);
    }
    // Clamp to at least 1 so a bogus env value cannot disable acceptance.
    min_score_threshold_ = std::max(min_score_threshold_, 1);
    max_autotune_count_ = std::max(
        5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
    max_autotune_global_count_ = 2 * max_autotune_count_;
    autotune_global_count_ = 0;
  }

  template <class Group, class Params, class Cfg, class Hash>
  friend class AutotuneSingleton;

  // Formats a one-line VLOG message describing an Insert action.
  std::string GetActionSummary(StringPiece action, const Parameters& params,
                               const Config& config) {
    return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
                           string(action).c_str(), params.ToString().c_str(),
                           config.ToString().c_str());
  }

  mutable mutex mu_;

  std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
      TF_GUARDED_BY(mu_);
  std::string name_;
  int32 min_score_threshold_;
  int32 max_autotune_count_;
  int32 max_autotune_global_count_;
  int32 autotune_global_count_;

  TF_DISALLOW_COPY_AND_ASSIGN(AutotuneMap);
};
239 
240 // A Singleton helper that manages the global autotune results by groups.
241 // The caller specified arbitrary Group type that can distinguish between
242 // different autotune results, even if their Parameters and Configs are the
243 // same.
244 template <class Group, typename Parameters, typename Config,
245           typename Hasher = internal::AutotuneMapHasher<Parameters>>
246 class AutotuneSingleton {
247  public:
248   typedef AutotuneMap<Parameters, Config, Hasher> AutotuneType;
GetInstance()249   static AutotuneType* GetInstance() {
250     static AutotuneType* instance = new AutotuneType(Group::name());
251     return instance;
252   }
253 };
254 
255 // Logs convolution results to customized back-storage.
256 void LogConvAutotuneResults(se::dnn::ConvolutionKind kind,
257                             se::dnn::DataType element_type,
258                             se::DeviceMemoryBase input_buffer,
259                             se::DeviceMemoryBase filter_buffer,
260                             se::DeviceMemoryBase output_buffer,
261                             const se::dnn::BatchDescriptor& input_desc,
262                             const se::dnn::FilterDescriptor& filter_desc,
263                             const se::dnn::BatchDescriptor& output_desc,
264                             const se::dnn::ConvolutionDescriptor& conv_desc,
265                             se::StreamExecutor* stream_exec,
266                             absl::Span<const AutotuneResult> results);
267 
268 // Logs fused convolution results to customized back-storage.
269 void LogFusedConvForwardAutotuneResults(
270     se::dnn::DataType element_type, se::DeviceMemoryBase input_buffer,
271     se::DeviceMemoryBase filter_buffer, se::DeviceMemoryBase output_buffer,
272     se::DeviceMemoryBase bias_buffer, se::DeviceMemoryBase side_input_buffer,
273     const se::dnn::BatchDescriptor& input_desc,
274     const se::dnn::FilterDescriptor& filter_desc,
275     const se::dnn::BatchDescriptor& output_desc,
276     const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale,
277     double side_value_scale, se::dnn::ActivationMode activation_mode,
278     se::StreamExecutor* stream_exec, absl::Span<const AutotuneResult> results);
279 
// Autotuning map entry for cuDNN-frontend-capable APIs.
//
// The longer-term intent is to remove the AlgorithmConfig variant and make this
// contain only the two LazyOpRunners, but for the time being ROCm is stuck on
// the legacy API and requires an AlgorithmConfig.
template <typename Op>
class AutotuneEntry {
 public:
  // Default-constructed entries behave as an (empty) legacy AlgorithmConfig.
  AutotuneEntry() : is_algorithm_config_(true) {}

  // Initialize with legacy-API AlgorithmConfig; used for the ROCm backend only.
  explicit AutotuneEntry(se::dnn::AlgorithmConfig config)
      : is_algorithm_config_(true), algorithm_config_(std::move(config)) {}

  // Initialize with a primary runner and an optional (nullable) fallback
  // runner that requires no scratch space.
  AutotuneEntry(std::shared_ptr<se::dnn::LazyOpRunner<Op>> primary,
                std::shared_ptr<se::dnn::LazyOpRunner<Op>> no_scratch_fallback)
      : is_algorithm_config_(false),
        op_runners_{std::move(primary), std::move(no_scratch_fallback)} {}

  // Initialize from config data, without pre-cached runners, such as when
  // loading AoT autotuning maps.
  AutotuneEntry(se::dnn::AlgorithmDesc primary,
                absl::optional<se::dnn::AlgorithmDesc> no_scratch_fallback)
      : AutotuneEntry(std::make_shared<se::dnn::LazyOpRunner<Op>>(primary),
                      no_scratch_fallback
                          ? std::make_shared<se::dnn::LazyOpRunner<Op>>(
                                *no_scratch_fallback)
                          : nullptr) {}

  // Initialize with pre-cached OpRunners, such as during autotuning.
  // Takes ownership of both runners; `no_cache_fallback` may be null.
  static StatusOr<AutotuneEntry> FromOpRunners(
      std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>> primary,
      std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>>
          no_cache_fallback) {
    TF_ASSIGN_OR_RETURN(
        auto primary_cache,
        se::dnn::LazyOpRunner<Op>::FromOpRunner(std::move(primary)));

    if (no_cache_fallback) {
      TF_ASSIGN_OR_RETURN(auto fallback_cache,
                          se::dnn::LazyOpRunner<Op>::FromOpRunner(
                              std::move(no_cache_fallback)));
      return AutotuneEntry(std::move(primary_cache), std::move(fallback_cache));

    } else {
      return AutotuneEntry(std::move(primary_cache), nullptr);
    }
  }

  // Pair of runners held by the non-legacy variant of this entry.
  struct OpRunners {
    OpRunners() = default;

    OpRunners(std::shared_ptr<se::dnn::LazyOpRunner<Op>> primary_,
              std::shared_ptr<se::dnn::LazyOpRunner<Op>> no_scratch_fallback_)
        : primary(std::move(primary_)),
          no_scratch_fallback(std::move(no_scratch_fallback_)) {}

    // Null iff this 'OpRunners' is default-constructed as part of the
    // fake-variant in AutotuneEntry; users outside gpu_utils.h itself should
    // never see primary = nullptr.
    std::shared_ptr<se::dnn::LazyOpRunner<Op>> primary;
    std::shared_ptr<se::dnn::LazyOpRunner<Op>> no_scratch_fallback;  // Nullable

    // Deep equality: compares the pointed-to runners, treating two null
    // fallbacks as equal.
    bool operator==(const OpRunners& other) const {
      return *primary == *other.primary &&
             ((!no_scratch_fallback && !other.no_scratch_fallback) ||
              (no_scratch_fallback && other.no_scratch_fallback &&
               *no_scratch_fallback == *other.no_scratch_fallback));
    }
  };

  // True when this entry holds a legacy AlgorithmConfig rather than runners.
  bool is_algorithm_config() const { return is_algorithm_config_; }

  // Requires is_algorithm_config(); DCHECK-enforced.
  const se::dnn::AlgorithmConfig& GetAlgorithmConfig() const {
    DCHECK(is_algorithm_config_);
    return algorithm_config_;
  }

  // Requires !is_algorithm_config(); DCHECK-enforced.
  const OpRunners& GetOpRunners() const {
    DCHECK(!is_algorithm_config_);
    return op_runners_;
  }

  // AutotuneMap needs to test equality to keep track of the number of times an
  // algorithm has won autotuning; for this purpose, we can use ToString to
  // determine whether runners are equal.
  bool operator==(const AutotuneEntry<Op>& other) const {
    if (is_algorithm_config_) {
      return other.is_algorithm_config_ &&
             algorithm_config_ == other.algorithm_config_;
    }

    return !other.is_algorithm_config_ && op_runners_ == other.op_runners_;
  }

  bool operator!=(const AutotuneEntry<Op>& other) const {
    return !(*this == other);
  }

  // Human-readable representation of whichever variant is active; used for
  // logging and (via AutotuneMap) equality-related diagnostics.
  std::string ToString() const {
    if (is_algorithm_config_) {
      return algorithm_config_.ToString();
    }
    return absl::StrCat("{", op_runners_.primary->ToString(), ", ",
                        (op_runners_.no_scratch_fallback
                             ? op_runners_.no_scratch_fallback->ToString()
                             : "(op_runners have no fallback)"),
                        "}");
  }

 private:
  // NVCC is broken, so we can't use absl::variant here.  Just fake it with a
  // bool and both fields.
  bool is_algorithm_config_;
  se::dnn::AlgorithmConfig algorithm_config_;
  OpRunners op_runners_;
};
397 
398 namespace internal {
399 StatusOr<std::tuple<int, int>> BestCudnnConvAlgorithmIndices(
400     absl::Span<const AutotuneResult> results);
401 }  // namespace internal
402 
// Returns the best algorithms for the config: one is the fastest, the other
// is the fastest with 0 scratch space. Unsuccessful autotuning results are
// allowed and ignored.
406 StatusOr<se::dnn::AlgorithmConfig> BestCudnnConvAlgorithm(
407     absl::Span<const AutotuneResult> results);
408 
409 // Explicitly-instantiated with ConvOp and FusedConvOp.
410 //
411 // The definition can't be in the header because including .pb.h files in
412 // headers is forbidden.
413 template <typename Op>
414 StatusOr<AutotuneEntry<Op>> BestCudnnConvAlgorithm(
415     absl::Span<const AutotuneResult> results,
416     std::vector<
417         std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>>>
418         runners);
419 
420 // Get the Dnn workspace limit from the environment variable, which is in MB.
421 // Return the workspace memory limit in bytes. If no value is set, return the
422 // default value.
423 int64_t GetDnnWorkspaceLimit(const string& envvar_in_mb,
424                              int64_t default_value_in_bytes);
425 
426 }  // namespace tensorflow
427 
428 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
429 
430 #endif  // TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
431