xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/gpu/gl/kernels/mean.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/kernels/mean.h"

#include <algorithm>
#include <any>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/status/status.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

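// Returns true when the subgroup-based reduction can be used: Vulkan 1.1 or
// newer (where subgroup operations are part of core), a reported subgroup
// size of at least 32, and support for subgroup arithmetic.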
bool UseSubgroupBasedImpl(const GpuInfo& gpu_info) {
  return gpu_info.IsApiVulkan() &&
         (gpu_info.vulkan_info.api_version_major > 1 ||
          gpu_info.vulkan_info.api_version_minor >= 1) &&
         gpu_info.vulkan_info.subgroup_size >= 32 &&
         gpu_info.vulkan_info.supports_subgroup_arithmetic;
}

// An implementation of Mean for desktop GPUs and some phones with recent
// Vulkan drivers. It is more parallel than the trivial Mean operation, but
// still limited to using a single work group.
void GenerateSubgroupBasedMean(const NodeShader::GenerationContext& ctx,
                               GeneratedCode* generated_code) {
  int height = ctx.input_shapes[0][1];
  int width = ctx.input_shapes[0][2];
  int depth = ctx.input_shapes[0][3];
  std::vector<Variable> parameters = {
      {"input_data_0_h", height},
      {"input_data_0_w", width},
      {"output_data_0_h", 1},
      {"output_data_0_w", 1},
  };

  std::string source = R"(
  // Round columns and rows per invocation up, to ensure that we read the
  // entire input.
  const uint columns_per_invocation =
      ($input_data_0_w$ + (gl_WorkGroupSize.x - 1))/gl_WorkGroupSize.x;
  const uint rows_per_invocation =
      ($input_data_0_h$ + (gl_WorkGroupSize.y - 1))/gl_WorkGroupSize.y;
  const uint first_row = gl_GlobalInvocationID.y*rows_per_invocation;
  const uint first_col = gl_GlobalInvocationID.x*columns_per_invocation;
  const uint last_row_exclusive =
      min(first_row+rows_per_invocation, $input_data_0_h$);
  const uint last_column_exclusive =
      min(first_col+columns_per_invocation, $input_data_0_w$);
  vec4 value = vec4(0);
  for (uint h = first_row; h < last_row_exclusive; ++h) {
    for (uint w = first_col; w < last_column_exclusive; ++w) {
      value += $input_data_0[w, h, gid.z]$;
    }
  }
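  // First reduction stage: sum across this subgroup, then one elected
  // invocation per subgroup publishes its partial sum to shared memory for
  // the final pass below.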
  highp vec4 subgroup_sum = subgroupAdd(value);
  if(subgroupElect()) {
    subgroup_sums[gl_SubgroupID] = subgroup_sum;
  }

  memoryBarrierShared();
  barrier();
  // Do the final reduction in the first subgroup.
  if(gl_SubgroupID == 0) {
    highp vec4 subtotal = vec4(0);
    if (gl_SubgroupInvocationID < gl_NumSubgroups) {
      subtotal = subgroup_sums[gl_SubgroupInvocationID];
    }
    highp vec4 grand_total = subgroupAdd(subtotal);
    if(subgroupElect()) {
      highp vec4 result = grand_total / $input_data_0_w$ / $input_data_0_h$;
      $output_data_0[0, 0, gid.z] = result$;
    }
  }
  )";

  const uint32_t subgroup_size = ctx.gpu_info->vulkan_info.subgroup_size;
  const uint32_t max_wg_size_x = ctx.gpu_info->GetMaxWorkGroupSizeForX();
  const uint32_t max_wg_size_y = ctx.gpu_info->GetMaxWorkGroupSizeForY();
  // Due to the design of the shader, at most subgroup_size subgroups can be
  // launched. This may limit the maximal workgroup size.
  const uint32_t max_wg_size =
      std::min(static_cast<uint32_t>(ctx.gpu_info->GetMaxWorkGroupTotalSize()),
               subgroup_size * subgroup_size);
  const uint32_t max_number_of_subgroups = max_wg_size / subgroup_size;
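  // For example, with subgroup_size = 32 and a driver limit of 1024 total
  // invocations (typical values, used here only for illustration), max_wg_size
  // is min(1024, 32 * 32) = 1024 and max_number_of_subgroups is 32.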
  uint32_t wg_size_x = 0;
  uint32_t wg_size_y = 0;
  if (width * height <= max_wg_size && width <= max_wg_size_x &&
      height <= max_wg_size_y) {
    wg_size_x = width;
    wg_size_y = height;
  } else {
    // Choose an approximately square workgroup, capped by the driver limits
    // and the input size.
    wg_size_x = std::min({static_cast<uint32_t>(std::sqrt(max_wg_size)),
                          max_wg_size_x, static_cast<uint32_t>(width)});
    wg_size_y = std::min({max_wg_size / wg_size_x, max_wg_size_y,
                          static_cast<uint32_t>(height)});
  }
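  // For example (hypothetical values, assuming the per-dimension driver limits
  // are at least 32): a 320x200 input with max_wg_size = 1024 gives
  // wg_size_x = min(sqrt(1024), 320) = 32 and wg_size_y = min(1024 / 32, 200) = 32.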

  std::vector<Variable> shared_variables = {
      {"subgroup_sums", std::vector<float4>(max_number_of_subgroups)},
  };

  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/{std::move(shared_variables)},
      // Make sure we get one dispatch of size wg_size_x*wg_size_y*1 per layer.
      /*workload=*/
      uint3(wg_size_x, wg_size_y, uint32_t(DivideRoundUp(depth, 4))),
      /*workgroup=*/uint3(wg_size_x, wg_size_y, 1u),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::ONLY_DEFINITIONS,
  };
}

void GenerateTrivialMean(const NodeShader::GenerationContext& ctx,
                         GeneratedCode* generated_code) {
  std::vector<Variable> parameters = {
      {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
      {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}};

  // Shaders may be compiled with a mediump precision hint, which allows the
  // GLSL compiler to shrink the float type from 32 to 16 bits. If "sum" and
  // "size" were 16-bit floats, their range would not be enough to produce
  // accurate results, so they are forced to 32 bits with the highp qualifier.
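  // As a rough illustration (numbers are an example only): a 16-bit float has
  // about 10 mantissa bits, so once the running sum grows past a few thousand,
  // adding values near 1.0 loses most of their significant bits.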
  std::string source = R"(
    highp vec4 sum = vec4(0.0);
    highp float size = float($input_data_0_w$ * $input_data_0_h$);
    for (int w = 0; w < $input_data_0_w$; w++) {
      for (int h = 0; h < $input_data_0_h$; h++) {
        sum += $input_data_0[w, h, gid.z]$;
      }
    }
    value_0 = sum / size;
  )";
  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/{},
      /*workload=*/uint3(),
      /*workgroup=*/uint3(1, 1, 4),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::AUTO,
  };
}

// Tiled implementation.

constexpr uint3 kTileSize = {8, 8, 1};

inline bool UseTiledImpl(const NodeShader::GenerationContext& ctx) {
  const int h = ctx.input_shapes[0][1];
  const int w = ctx.input_shapes[0][2];
  const int c = ctx.input_shapes[0][3];
  return h % kTileSize.y == 0 && w % kTileSize.x == 0 && c % 4 == 0 &&
         (h / kTileSize.y) * (w / kTileSize.x) * c * sizeof(float) <=
             32768;  // required min value for GL_MAX_COMPUTE_SHARED_MEMORY_SIZE
}
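// For example (values chosen purely for illustration), a 128x128x16 input
// needs (128 / 8) * (128 / 8) * 16 * 4 = 16384 bytes of shared memory for the
// per-tile sums, which fits within the guaranteed 32 KiB.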

void GenerateTiledMean(const NodeShader::GenerationContext& ctx,
                       GeneratedCode* generated_code) {
  const int h = ctx.input_shapes[0][1];
  const int w = ctx.input_shapes[0][2];
  const int s = DivideRoundUp(ctx.input_shapes[0][3], 4);

  std::vector<Variable> parameters = {
      {"input_data_0_h", h},
      {"input_data_0_w", w},
      {"tile_size_h", kTileSize.y},
      {"tile_size_w", kTileSize.x},
  };

  std::vector<Variable> shared_variables = {
      {"tile_sum",
       std::vector<float4>((w / kTileSize.x) * (h / kTileSize.y) * s)}};

  std::string source = R"(
  ivec2 tile_size = ivec2($tile_size_w$, $tile_size_h$);
  ivec2 num_tiles = ivec2($input_data_0_w$, $input_data_0_h$) / tile_size;

  highp vec4 partial_sum = vec4(0.0);
  for (int x = gid.x * tile_size.x; x < (gid.x + 1) * tile_size.x; ++x) {
    for (int y = gid.y * tile_size.y; y < (gid.y + 1) * tile_size.y; ++y) {
      partial_sum += $input_data_0[x, y, gid.z]$;
    }
  }
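  // Store this invocation's tile sum; tile sums for one depth slice are laid
  // out contiguously as [slice][tile_row][tile_col].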
  $tile_sum$[num_tiles.x * num_tiles.y * gid.z + num_tiles.x * gid.y + gid.x] = partial_sum;

  memoryBarrierShared(); barrier();

  if (gid.x == 0 && gid.y == 0) {
    highp vec4 sum = vec4(0.0);
    for (int i = 0; i < num_tiles.x * num_tiles.y; ++i) {
      sum += $tile_sum$[num_tiles.x * num_tiles.y * gid.z + i];
    }
    highp vec4 mean = sum / float($input_data_0_w$ * $input_data_0_h$);
    $output_data_0[0, 0, gid.z] = mean$;
  }
)";
  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/std::move(shared_variables),
      /*workload=*/uint3(kTileSize.x, kTileSize.y, static_cast<uint32_t>(s)),
      /*workgroup=*/kTileSize,
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::ONLY_DEFINITIONS,
  };
}

class Mean : public NodeShader {
 public:
  absl::Status GenerateCode(const GenerationContext& ctx,
                            GeneratedCode* generated_code) const final {
    const auto& attr = std::any_cast<const MeanAttributes&>(ctx.op_attr);
    if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported only for height and width.");
    }

    if (!(ctx.input_shapes.size() == 1 && ctx.output_shapes.size() == 1 &&
          ctx.output_shapes[0][1] == 1 && ctx.output_shapes[0][2] == 1 &&
          ctx.output_shapes[0][3] == ctx.input_shapes[0][3])) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported for one input and one 1x1 output with "
          "the same channel count.");
    }

    if (UseSubgroupBasedImpl(*ctx.gpu_info)) {
      GenerateSubgroupBasedMean(ctx, generated_code);
    } else if (UseTiledImpl(ctx)) {
      GenerateTiledMean(ctx, generated_code);
    } else {
      GenerateTrivialMean(ctx, generated_code);
    }
    return absl::OkStatus();
  }
};

}  // namespace

std::unique_ptr<NodeShader> NewMeanNodeShader() {
  return std::make_unique<Mean>();
}
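
// A minimal usage sketch (hypothetical, for illustration only; the actual call
// sites live in the OpenGL delegate's shader compiler):
//
//   std::unique_ptr<NodeShader> shader = NewMeanNodeShader();
//   GeneratedCode code;
//   RETURN_IF_ERROR(shader->GenerateCode(ctx, &code));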

}  // namespace gl
}  // namespace gpu
}  // namespace tflite