/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.h"

#include <any>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/convert.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"
#include "tensorflow/lite/delegates/gpu/gl/variable.h"
#include "tensorflow/lite/delegates/gpu/gl/workgroups/ideal_workgroup_picker.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

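// Generates a GLSL compute shader for 2D depthwise convolution. Each shader
// invocation accumulates one output vec4 (four consecutive output channels)
// over the kernel window.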
class DepthwiseConvolution : public NodeShader {
 public:
  absl::Status GenerateCode(const GenerationContext& ctx,
                            GeneratedCode* generated_code) const final {
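    // Weights and bias must be compile-time constants; only the input tensor
    // may be supplied at runtime.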
    if (ctx.input_shapes.size() != 1) {
      return absl::UnimplementedError(
          "DepthWise Convolution does not support more than 1 runtime tensor");
    }
    const auto& attr =
        std::any_cast<const DepthwiseConvolution2DAttributes&>(ctx.op_attr);
    auto weights = attr.weights.shape;
    const int offsets_count = weights.h * weights.w;
    const bool offsets_count_too_large = offsets_count > kMaxConstArraySize;
    std::vector<Variable> parameters;
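    // GLSL constant arrays have a limited size, so kernel tap offsets are
    // precomputed on the host only when they fit in kMaxConstArraySize;
    // otherwise the shader derives each offset from the kernel indices.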
    if (offsets_count_too_large) {
      parameters = {
          {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
          {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
          {"padding_w", attr.padding.prepended.w},
          {"padding_h", attr.padding.prepended.h},
          {"dilation_w", attr.dilations.w},
          {"dilation_h", attr.dilations.h},
          {"kernel_w", weights.w},
          {"kernel_h", weights.h},
          {"src_depth", DivideRoundUp(weights.i, 4)},
          {"channel_multiplier", weights.o},
          {"stride", int2(attr.strides.w, attr.strides.h)},
      };
    } else {
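      // Fold dilation and prepended padding into one precomputed spatial
      // offset per kernel tap and pass them as a constant array.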
      std::vector<int2> offsets;
      for (int h = 0; h < weights.h; ++h) {
        for (int w = 0; w < weights.w; ++w) {
          offsets.emplace_back(w * attr.dilations.w - attr.padding.prepended.w,
                               h * attr.dilations.h - attr.padding.prepended.h);
        }
      }
      parameters = {
          {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
          {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
          {"offsets_count", offsets_count},
          {"offsets", offsets},
          {"src_depth", DivideRoundUp(weights.i, 4)},
          {"channel_multiplier", weights.o},
          {"stride", int2(attr.strides.w, attr.strides.h)},
      };
    }
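    // With any non-zero padding, sampled coordinates can fall outside the
    // input, so the generated shader needs an explicit bounds check.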
    bool non_empty_padding =
        attr.padding.appended.h != 0 || attr.padding.appended.w != 0 ||
        attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0;

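    // Repack the weights into the PIOHW4 layout so the shader can index them
    // linearly by (output slice, kernel tap).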
    std::vector<std::pair<std::string, Object>> objects = {
        {"weights", MakeReadonlyObject(ConvertToPIOHW4(attr.weights))}};

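    // Emit the accumulation loop header. The two variants differ only in how
    // the per-tap offset is obtained: computed in-shader from (kx, ky), or
    // read from the precomputed $offsets$ array.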
    std::string source;
    if (offsets_count_too_large) {
      source = R"(
      int offsets_count = $kernel_w$ * $kernel_h$;
      int src_layer_offset = (gid.z % $channel_multiplier$) * 4;
      int i = 0;
      for (int ky = 0; ky < $kernel_h$; ky++) {
        for (int kx = 0; kx < $kernel_w$; kx++, i++) {
          ivec2 coord = gid.xy * $stride$ + ivec2(kx * $dilation_w$ - $padding_w$, ky * $dilation_h$ - $padding_h$);)";
    } else {
      source = R"(
      int offsets_count = $offsets_count$;
      int src_layer_offset = (gid.z % $channel_multiplier$) * 4;
      for (int i = 0; i < offsets_count; ++i) {
        ivec2 coord = gid.xy * $stride$ + $offsets[i]$;)";
    }
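    // Guard against out-of-bounds reads; taps that land in the padding
    // region are skipped.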
    if (non_empty_padding) {
      source += R"(
      if (coord.x < 0 || coord.y < 0 ||
          coord.x >= $input_data_0_w$ || coord.y >= $input_data_0_h$) {
        continue;
      })";
    }
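    // Loop body: fetch the input texel, then replicate its channels per the
    // channel multiplier so each of the four output channels in this slice
    // reads its corresponding source channel before accumulation.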
    source += R"(
        int src_layer = gid.z / $channel_multiplier$;
        vec4 input_ = $input_data_0[coord.x, coord.y, src_layer]$;
        vec4 input_shifted = vec4(
            input_[(src_layer_offset + 0) / $channel_multiplier$],
            input_[(src_layer_offset + 1) / $channel_multiplier$],
            input_[(src_layer_offset + 2) / $channel_multiplier$],
            input_[(src_layer_offset + 3) / $channel_multiplier$]
        );
        value_0 += input_shifted * $weights[gid.z * offsets_count + i]$;
      }
)";
    if (offsets_count_too_large) {
      source += R"(
    }
)";
    }
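    // Fuse the bias addition into the shader when bias data is present.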
    if (!attr.bias.data.empty()) {
      source += "value_0 += $bias[gid.z]$;\n";
      objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)});
    }
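    // An empty workload lets the runtime derive it from the output shape;
    // GetIdealWorkgroupIfPossible substitutes a GPU-specific workgroup size
    // when one is known for this convolution configuration.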
    *generated_code = {
        /*parameters=*/std::move(parameters),
        /*objects=*/std::move(objects),
        /*shared_variables=*/{},
        /*workload=*/uint3(),
        /*workgroup=*/
        GetIdealWorkgroupIfPossible(
            *ctx.gpu_info, OperationType::DEPTHWISE_CONVOLUTION,
            HW(attr.weights.shape.h, attr.weights.shape.w), attr.strides,
            OHWI(attr.weights.shape.o, ctx.input_shapes[0][1],
                 ctx.input_shapes[0][2], ctx.input_shapes[0][3])),
        /*source_code=*/std::move(source),
        /*input=*/IOStructure::ONLY_DEFINITIONS,
        /*output=*/IOStructure::AUTO,
    };
    return absl::OkStatus();
  }
};

}  // namespace

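// Creates a NodeShader that emits depthwise convolution GLSL.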
std::unique_ptr<NodeShader> NewDepthwiseConvolutionNodeShader() {
  return std::make_unique<DepthwiseConvolution>();
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite