/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_

#include <stdint.h>

#include <memory>
#include <string>
#include <utility>
#include <vector>

26 #include "absl/memory/memory.h"
27 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
28 #include "tensorflow/lite/delegates/gpu/common/operations.h"
29 #include "tensorflow/lite/delegates/gpu/common/shape.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
33 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
34 #include "tensorflow/lite/delegates/gpu/common/types.h"
35 #include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {

template <DataType T, typename S>
void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int padded_src_channels = AlignByN(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int padded_dst_channels = AlignByN(dst_channels, 4);

  // Change the traversal order of the weight matrix in the following way:
  // The matrix is segmented into blocks of 4x4. If any dimension of the
  // matrix is not divisible by 4, it is padded with zeros. Each block is
  // stored contiguously. The 16 elements within a block are ordered as the 4
  // elements of the first column, the 4 elements of the second, etc. Blocks
  // are then traversed columns first, rows last. As an example, an 8x8 matrix
  // would be traversed as below.
  //
  // |  0  4  8 12 32 36 40 44 |
  // |  1  5  9 13 33 37 41 45 |
  // |  2  6 10 14 34 38 42 46 |
  // |  3  7 11 15 35 39 43 47 |
  // | 16 20 24 28 48 52 56 60 |
  // | 17 21 25 29 49 53 57 61 |
  // | 18 22 26 30 50 54 58 62 |
  // | 19 23 27 31 51 55 59 63 |
  //
  // The benefit of doing this is that reading 16 contiguous elements gives a
  // 4x4 block of the matrix, where the first 4 elements are the first column
  // of the block, the second 4 elements are the second column of the block,
  // etc. Subsequent blocks contain elements of the same 4 columns.

  for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) {
    for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
      for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) {
        for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
          int y = 4 * block_y + y_in_block;
          int x = 4 * block_x + x_in_block;
          // Consider destination as an array with extents
          // [padded_src_channels/4][padded_dst_channels/4][4][4].
          int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 +
                          x_in_block * 4 + y_in_block;
          if (x < src_channels && y < dst_channels) {
            dst[dst_index] = weights.data[src_channels * y + x];
          } else {
            dst[dst_index] = 0.0f;
          }
        }
      }
    }
  }
}
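
// Worked example (illustrative only): for a weight matrix with
// dst_channels = 2 and src_channels = 8, both extents are padded to 4, giving
// two 4x4 blocks. The element at row y = 1, column x = 5 falls in block
// (block_x = 1, block_y = 0) at offset (x_in_block = 1, y_in_block = 1), so
//   dst_index = 1 * padded_dst_channels * 4 + 0 * 16 + 1 * 4 + 1
//             = 1 * 4 * 4 + 5 = 21,
// while it is read from weights.data[src_channels * y + x] = weights.data[13].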

template <DataType T, typename S>
void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int dst_depth = DivideRoundUp(dst_channels, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < src_channels && dst_ch < dst_channels) {
            dst[counter++] = weights.data[dst_ch * src_channels + src_ch];
          } else {
            dst[counter++] = 0.0f;
          }
        }
      }
    }
  }
}
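
// Worked example (illustrative only): this layout is equivalent to a
// [dst_depth][src_depth][4 (i)][4 (j)] array, i.e.
//   counter = ((d * src_depth + s) * 4 + i) * 4 + j.
// With src_channels = 8 and dst_channels = 2 (src_depth = 2, dst_depth = 1),
// the weight for src_ch = 5, dst_ch = 1 (d = 0, s = 1, i = 1, j = 1) is
// written at counter = ((0 * 2 + 1) * 4 + 1) * 4 + 1 = 21 and read from
// weights.data[1 * 8 + 5] = weights.data[13].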

class FullyConnected : public GPUOperation {
 public:
  FullyConnected() = default;
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);
  }
  int3 GetGridSize() const override;

  // Move only
  FullyConnected(FullyConnected&& kernel);
  FullyConnected& operator=(FullyConnected&& kernel);
  FullyConnected(const FullyConnected&) = delete;
  FullyConnected& operator=(const FullyConnected&) = delete;

 private:
  FullyConnected(const OperationDef& definition, const GpuInfo& gpu_info);
  friend FullyConnected CreateFullyConnected(
      const GpuInfo& gpu_info, const OperationDef& definition,
      const FullyConnectedAttributes& attr);
  friend FullyConnected CreateFullyConnected(
      const GpuInfo& gpu_info, const OperationDef& definition,
      const FullyConnectedInt8Attributes& attr);

  void UploadQuantizedWeights(
      const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
      float zero_point);
  template <DataType T>
  void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                     bool weights_are_buffer);

  std::string GetFullyConnectedKernelCode(const OperationDef& op_def,
                                          const GpuInfo& gpu_info,
                                          bool weights_are_buffer,
                                          bool quantized);
};

template <DataType T>
void FullyConnected::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                                   bool weights_are_buffer) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  const int elements_count = src_depth * dst_depth * 4;
  const bool f32_weights = definition_.precision == CalculationsPrecision::F32;

  const int float4_size = f32_weights ? 16 : 8;

  if (weights_are_buffer) {
    BufferDescriptor desc;
    desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
    desc.element_size = 16;
    desc.size = float4_size * elements_count;
    desc.data.resize(desc.size);

    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    }

    args_.AddObject("weights",
                    std::make_unique<BufferDescriptor>(std::move(desc)));
  } else {
    std::vector<uint8_t> data(float4_size * elements_count);
    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    }

    TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
        f32_weights ? DataType::FLOAT32 : DataType::FLOAT16,
        TensorStorageType::TEXTURE_2D, src_depth * 4, dst_depth, data.data());
    args_.AddObject("weights", std::make_unique<TensorDescriptor>(desc));
  }
}
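
// Sizing sketch (the layer dimensions below are illustrative only): for an FC
// layer with weights.shape.i = 1280 and weights.shape.o = 1001 in F16
// precision,
//   src_depth = DivideRoundUp(1280, 4) = 320,
//   dst_depth = DivideRoundUp(1001, 4) = 251,
//   elements_count = 320 * 251 * 4 = 321280 float4s,
// so the uploaded weights occupy float4_size * elements_count =
// 8 * 321280 = 2570240 bytes (~2.5 MB); with F32 it would be twice that.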

FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedAttributes& attr);

FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedInt8Attributes& attr);
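
// Usage sketch (variable names are hypothetical; a caller is assumed to have
// a GpuInfo, an OperationDef with the desired precision, and populated
// attributes):
//
//   FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);
//   auto op = std::make_unique<FullyConnected>(std::move(fc));
//
// The operation is move-only, so it is transferred into the owning graph
// rather than copied.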

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_