/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAUNCH_DIMENSIONS_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAUNCH_DIMENSIONS_H_

#include <map>
#include <memory>
#include <ostream>
#include <string>

#include "absl/strings/str_cat.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h"
#include "tensorflow/compiler/xla/shape.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {
namespace gpu {

// Encapsulates the launch dimensions of a kernel, e.g., the block count and the
// number of threads per block.
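//
// A minimal usage sketch (illustrative, not part of the original header):
//
//   LaunchDimensions dims(/*block_x_count=*/16,
//                         /*thread_x_count_per_block=*/256);
//   dims.launch_bound();  // 16 * 256 = 4096 threads across the whole grid.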
class LaunchDimensions {
 public:
  struct Dim3D {
    int64_t x, y, z;
  };

  // The default constructor creates launch dimensions that indicate
  // single-threaded execution.
  LaunchDimensions()
      : block_counts_({1, 1, 1}), thread_counts_per_block_({1, 1, 1}) {}

  LaunchDimensions(int64_t block_x_count, int64_t thread_x_count_per_block)
      : block_counts_({block_x_count, 1, 1}),
        thread_counts_per_block_({thread_x_count_per_block, 1, 1}) {}

  LaunchDimensions(const Dim3D& block_counts,
                   const Dim3D& thread_counts_per_block)
      : block_counts_(block_counts),
        thread_counts_per_block_(thread_counts_per_block) {}

  Dim3D block_counts() const { return block_counts_; }

  Dim3D thread_counts_per_block() const { return thread_counts_per_block_; }

  // Returns the total number of threads in a block.
  int64_t total_nb_threads() const {
    return thread_counts_per_block_.x * thread_counts_per_block_.y *
           thread_counts_per_block_.z;
  }

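  // Returns the total number of threads in the whole launch, i.e. the product
  // of all block counts and all per-block thread counts.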
  int64_t launch_bound() const {
    return block_counts_.x * thread_counts_per_block_.x * block_counts_.y *
           thread_counts_per_block_.y * block_counts_.z *
           thread_counts_per_block_.z;
  }

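  // Returns a human-readable description of the launch dimensions, e.g.
  // "blocks: {16, 1, 1}, threads/block: {256, 1, 1}".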
  std::string ToString() const {
    return absl::StrCat("blocks: {", block_counts_.x, ", ", block_counts_.y,
                        ", ", block_counts_.z, "}, threads/block: {",
                        thread_counts_per_block_.x, ", ",
                        thread_counts_per_block_.y, ", ",
                        thread_counts_per_block_.z, "}");
  }

 private:
  Dim3D block_counts_;
  Dim3D thread_counts_per_block_;
};

std::ostream& operator<<(std::ostream& out,
                         const LaunchDimensions& launch_dims);

struct LaunchDimensionsConfig {
  // The kernel implementation will be unrolled if `unroll_factor` is
  // greater than one.
  int unroll_factor = 1;
  // A wave is a group of blocks that execute at the same time on the
  // GPU. If there are more blocks than the number that can run
  // concurrently, there are multiple waves of blocks running
  // sequentially. If `few_waves` is true, each thread will loop over
  // a block of unroll_factor elements. Otherwise each thread will
  // handle only unroll_factor elements.
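  // For example (illustrative numbers, not from the original): on a GPU that
  // can keep 1,000 blocks resident at once, a launch of 10,000 blocks runs
  // as roughly 10 sequential waves.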
  bool few_waves = false;
  // If `row_vectorized` is true, then the block size will be equal to
  // `hlo.shape().dimensions().back() / unroll_factor`.
  // Currently few_waves and row_vectorized do not work together.
  bool row_vectorized = false;
  // If `logical_order` is true, then adjacent threads will write to
  // logically adjacent indices in the output buffer.
  bool logical_order = false;

  std::string ToString() {
    return absl::StrCat(
        "unroll_factor=", unroll_factor, ", few_waves=", few_waves,
        ", row_vectorized=", row_vectorized, ", logical_order=", logical_order);
  }
};

// Returns -1 if the shape doesn't allow the row vectorization code path.
// If supported, returns the number of threads to use in that case.
int64_t ThreadsPerBlockRowVectorized(const Shape& shape,
                                     GpuDeviceInfo gpu_device_info,
                                     LaunchDimensionsConfig dim_config);

// Calculates the launch dimensions for a kernel that produces an output of
// the given `shape`.
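//
// A usage sketch (illustrative; `device_info` and `shape` are assumed to be
// populated by the caller):
//
//   LaunchDimensionsConfig config;
//   config.unroll_factor = 4;
//   TF_ASSIGN_OR_RETURN(
//       LaunchDimensions dims,
//       CalculateLaunchDimensions(shape, device_info, config));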
StatusOr<LaunchDimensions> CalculateLaunchDimensions(
    const Shape& shape, GpuDeviceInfo gpu_device_info,
    LaunchDimensionsConfig dim_config = {});

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAUNCH_DIMENSIONS_H_