/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/kernels/mean.h"

#include <algorithm>
#include <any>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/status/status.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

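// The subgroup-based implementation requires Vulkan 1.1 or newer with
// subgroup arithmetic support and a subgroup size of at least 32.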
bool UseSubgroupBasedImpl(const GpuInfo& gpu_info) {
  return gpu_info.IsApiVulkan() &&
         (gpu_info.vulkan_info.api_version_major > 1 ||
          gpu_info.vulkan_info.api_version_minor >= 1) &&
         gpu_info.vulkan_info.subgroup_size >= 32 &&
         gpu_info.vulkan_info.supports_subgroup_arithmetic;
}

// An implementation of Mean for desktop GPUs and some phones with recent
// Vulkan drivers. It is more parallel than the trivial Mean operation, but
// still limited to using a single work group.
void GenerateSubgroupBasedMean(const NodeShader::GenerationContext& ctx,
                               GeneratedCode* generated_code) {
  int height = ctx.input_shapes[0][1];
  int width = ctx.input_shapes[0][2];
  int depth = ctx.input_shapes[0][3];
  std::vector<Variable> parameters = {
      {"input_data_0_h", height},
      {"input_data_0_w", width},
      {"output_data_0_h", 1},
      {"output_data_0_w", 1},
  };

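  // Each invocation sums a rectangular block of the input. The per-invocation
  // sums are reduced within each subgroup, and the per-subgroup sums are then
  // combined through shared memory by the first subgroup.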
  std::string source = R"(
  // Round columns and rows per invocation up, to ensure that we read the
  // entire input.
  const uint columns_per_invocation =
      ($input_data_0_w$ + (gl_WorkGroupSize.x - 1))/gl_WorkGroupSize.x;
  const uint rows_per_invocation =
      ($input_data_0_h$ + (gl_WorkGroupSize.y - 1))/gl_WorkGroupSize.y;
  const uint first_row = gl_GlobalInvocationID.y*rows_per_invocation;
  const uint first_col = gl_GlobalInvocationID.x*columns_per_invocation;
  const uint last_row_exclusive =
      min(first_row+rows_per_invocation, $input_data_0_h$);
  const uint last_column_exclusive =
      min(first_col+columns_per_invocation, $input_data_0_w$);
  vec4 value = vec4(0);
  for (uint h = first_row; h < last_row_exclusive; ++h) {
    for (uint w = first_col; w < last_column_exclusive; ++w) {
      value += $input_data_0[w, h, gid.z]$;
    }
  }
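  // Reduce the per-invocation sums within each subgroup; the elected
  // invocation stores its subgroup's total in shared memory.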
  highp vec4 subgroup_sum = subgroupAdd(value);
  if(subgroupElect()) {
    subgroup_sums[gl_SubgroupID] = subgroup_sum;
  }

  memoryBarrierShared();
  barrier();
  // Do the final reduction in the first subgroup.
  if(gl_SubgroupID == 0) {
    highp vec4 subtotal = vec4(0);
    if (gl_SubgroupInvocationID < gl_NumSubgroups) {
      subtotal = subgroup_sums[gl_SubgroupInvocationID];
    }
    highp vec4 grand_total = subgroupAdd(subtotal);
    if(subgroupElect()) {
      highp vec4 result = grand_total / $input_data_0_w$ / $input_data_0_h$;
      $output_data_0[0, 0, gid.z] = result$;
    }
  }
)";

  const uint32_t subgroup_size = ctx.gpu_info->vulkan_info.subgroup_size;
  const uint32_t max_wg_size_x = ctx.gpu_info->GetMaxWorkGroupSizeForX();
  const uint32_t max_wg_size_y = ctx.gpu_info->GetMaxWorkGroupSizeForY();
  // Due to the design of the shader, at most subgroup_size subgroups can be
  // launched. This may limit the maximal workgroup size.
  const uint32_t max_wg_size =
      std::min(static_cast<uint32_t>(ctx.gpu_info->GetMaxWorkGroupTotalSize()),
               subgroup_size * subgroup_size);
  const uint32_t max_number_of_subgroups = max_wg_size / subgroup_size;
  uint32_t wg_size_x = 0;
  uint32_t wg_size_y = 0;
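  // If the whole input fits into a single workgroup, use one invocation per
  // pixel.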
  if (width * height <= max_wg_size && width <= max_wg_size_x &&
      height <= max_wg_size_y) {
    wg_size_x = width;
    wg_size_y = height;
  } else {
    // Approximately square workgroup. Also make sure to limit by driver limit
    // and input size.
    wg_size_x = std::min({static_cast<uint32_t>(std::sqrt(max_wg_size)),
                          max_wg_size_x, static_cast<uint32_t>(width)});
    wg_size_y = std::min({max_wg_size / wg_size_x, max_wg_size_y,
                          static_cast<uint32_t>(height)});
  }

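  // One shared-memory slot per subgroup for its partial sum.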
  std::vector<Variable> shared_variables = {
      {"subgroup_sums", std::vector<float4>(max_number_of_subgroups)},
  };

  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/{std::move(shared_variables)},
      // Make sure we get one dispatch of size wg_size_x*wg_size_y*1 per layer.
      /*workload=*/
      uint3(wg_size_x, wg_size_y, uint32_t(DivideRoundUp(depth, 4))),
      /*workgroup=*/uint3(wg_size_x, wg_size_y, 1u),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::ONLY_DEFINITIONS,
  };
}

void GenerateTrivialMean(const NodeShader::GenerationContext& ctx,
                         GeneratedCode* generated_code) {
  std::vector<Variable> parameters = {
      {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
      {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}};

  // Shaders may be compiled with the precision hint mediump, which allows the
  // GLSL compiler to drop the float type from 32 to 16 bits. If the "sum" and
  // "size" variables were 16-bit floats, their value range would be too small
  // to produce accurate results. That is why their precision is forced to
  // 32 bits with the highp qualifier.
  std::string source = R"(
    highp vec4 sum = vec4(0.0);
    highp float size = float($input_data_0_w$ * $input_data_0_h$);
    for (int w = 0; w < $input_data_0_w$; w++) {
      for (int h = 0; h < $input_data_0_h$; h++) {
        sum += $input_data_0[w, h, gid.z]$;
      }
    }
    value_0 = sum / size;
  )";
  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/{},
      /*workload=*/uint3(),
      /*workgroup=*/uint3(1, 1, 4),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::AUTO,
  };
}

// Tiled implementation.

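// Each workgroup is a kTileSize.x * kTileSize.y grid of invocations; every
// invocation accumulates one tile of the same size for its depth slice.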
constexpr uint3 kTileSize = {8, 8, 1};

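// The tiled path requires spatial dimensions divisible by the tile size, a
// channel count divisible by 4, and per-tile partial sums that fit into the
// minimum shared memory size guaranteed by the spec.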
inline bool UseTiledImpl(const NodeShader::GenerationContext& ctx) {
  const int h = ctx.input_shapes[0][1];
  const int w = ctx.input_shapes[0][2];
  const int c = ctx.input_shapes[0][3];
  return h % kTileSize.y == 0 && w % kTileSize.x == 0 && c % 4 == 0 &&
         (h / kTileSize.y) * (w / kTileSize.x) * c * sizeof(float) <=
             32768;  // required min value for GL_MAX_COMPUTE_SHARED_MEMORY_SIZE
}

void GenerateTiledMean(const NodeShader::GenerationContext& ctx,
                       GeneratedCode* generated_code) {
  const int h = ctx.input_shapes[0][1];
  const int w = ctx.input_shapes[0][2];
  const int s = DivideRoundUp(ctx.input_shapes[0][3], 4);

  std::vector<Variable> parameters = {
      {"input_data_0_h", h},
      {"input_data_0_w", w},
      {"tile_size_h", kTileSize.y},
      {"tile_size_w", kTileSize.x},
  };

  std::vector<Variable> shared_variables = {
      {"tile_sum",
       std::vector<float4>((w / kTileSize.x) * (h / kTileSize.y) * s)}};

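  // Phase 1: each invocation writes the sum of its tile to shared memory.
  // Phase 2: invocation (0, 0) adds up the tile sums and writes the mean.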
  std::string source = R"(
  ivec2 tile_size = ivec2($tile_size_w$, $tile_size_h$);
  ivec2 num_tiles = ivec2($input_data_0_w$, $input_data_0_h$) / tile_size;

  highp vec4 partial_sum = vec4(0.0);
  for (int x = gid.x * tile_size.x; x < (gid.x + 1) * tile_size.x; ++x) {
    for (int y = gid.y * tile_size.y; y < (gid.y + 1) * tile_size.y; ++y) {
      partial_sum += $input_data_0[x, y, gid.z]$;
    }
  }
  $tile_sum$[num_tiles.x * num_tiles.y * gid.z + num_tiles.x * gid.y + gid.x] = partial_sum;

  memoryBarrierShared(); barrier();

  if (gid.x == 0 && gid.y == 0) {
    highp vec4 sum = vec4(0.0);
    for (int i = 0; i < num_tiles.x * num_tiles.y; ++i) {
      sum += $tile_sum$[num_tiles.x * num_tiles.y * gid.z + i];
    }
    highp vec4 mean = sum / float($input_data_0_w$ * $input_data_0_h$);
    $output_data_0[0, 0, gid.z] = mean$;
  }
)";
  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/std::move(shared_variables),
      /*workload=*/uint3(kTileSize.x, kTileSize.y, static_cast<uint32_t>(s)),
      /*workgroup=*/kTileSize,
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::ONLY_DEFINITIONS,
  };
}

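// Mean-over-height-and-width node shader. Picks the subgroup-based, tiled, or
// trivial implementation depending on GPU capabilities and input shape.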
class Mean : public NodeShader {
 public:
  absl::Status GenerateCode(const GenerationContext& ctx,
                            GeneratedCode* generated_code) const final {
    const auto& attr = std::any_cast<const MeanAttributes&>(ctx.op_attr);
    if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported only for height and width.");
    }

    if (!(ctx.input_shapes.size() == 1 && ctx.output_shapes.size() == 1 &&
          ctx.output_shapes[0][1] == 1 && ctx.output_shapes[0][2] == 1 &&
          ctx.output_shapes[0][3] == ctx.input_shapes[0][3])) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported for one input and one 1x1 output "
          "with the same channel count.");
    }

    if (UseSubgroupBasedImpl(*ctx.gpu_info)) {
      GenerateSubgroupBasedMean(ctx, generated_code);
    } else if (UseTiledImpl(ctx)) {
      GenerateTiledMean(ctx, generated_code);
    } else {
      GenerateTrivialMean(ctx, generated_code);
    }
    return absl::OkStatus();
  }
};

}  // namespace

std::unique_ptr<NodeShader> NewMeanNodeShader() {
  return std::make_unique<Mean>();
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite