/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_

#include <stdint.h>

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {

template <DataType T, typename S>
void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int padded_src_channels = AlignByN(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int padded_dst_channels = AlignByN(dst_channels, 4);

  // Change the traversal order of the weight matrix in the following way:
  // The matrix is segmented into blocks of 4x4. If any dimension of the matrix
  // is not divisible by 4, it is padded with zeros. Each block is stored
  // contiguously. The 16 elements within a block are ordered as 4 elements of
  // the first column, 4 elements of the second, etc. Blocks are then traversed
  // down each column of blocks first, then across block columns. As an
  // example, an 8x8 matrix would be traversed as below.
  //
  //  |  0  4  8 12 32 36 40 44 |
  //  |  1  5  9 13 33 37 41 45 |
  //  |  2  6 10 14 34 38 42 46 |
  //  |  3  7 11 15 35 39 43 47 |
  //  | 16 20 24 28 48 52 56 60 |
  //  | 17 21 25 29 49 53 57 61 |
  //  | 18 22 26 30 50 54 58 62 |
  //  | 19 23 27 31 51 55 59 63 |
  //
  // The benefit of doing this is that reading 16 contiguous elements gives a
  // 4x4 block of the matrix, where the first 4 elements are the first row of
  // the block, the second 4 elements are the second row of the block, etc.
  // Subsequent blocks contain elements of the same 4 columns.
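  //
  // As a worked example with the 8x8 matrix above: the element at row y = 2,
  // column x = 6 falls in block (block_x = 1, block_y = 0) at offset
  // (x_in_block = 2, y_in_block = 2), so the loop below stores it at
  // dst index 1 * 8 * 4 + 0 * 16 + 2 * 4 + 2 = 42, matching the diagram.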

  for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) {
    for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
      for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) {
        for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
          int y = 4 * block_y + y_in_block;
          int x = 4 * block_x + x_in_block;
          // Consider destination as an array with extents
          // [padded_src_channels/4][padded_dst_channels/4][4][4]
          int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 +
                          x_in_block * 4 + y_in_block;
          if (x < src_channels && y < dst_channels) {
            dst[dst_index] = weights.data[src_channels * y + x];
          } else {
            dst[dst_index] = 0.0f;
          }
        }
      }
    }
  }
}

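// Rearranges fully connected weights into 4x4 groups with zero padding. The
// destination is laid out as [dst_depth][src_depth][4 src channels][4 dst
// channels], where src_depth and dst_depth are the channel counts divided by
// 4 and rounded up.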
template <DataType T, typename S>
void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int dst_depth = DivideRoundUp(dst_channels, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < src_channels && dst_ch < dst_channels) {
            dst[counter++] = weights.data[dst_ch * src_channels + src_ch];
          } else {
            dst[counter++] = 0.0f;
          }
        }
      }
    }
  }
}

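// Fully connected layer implemented as a GPUOperation. The weights are packed
// into 4x4 groups and stored either in a buffer (IOO4I4 order) or in a
// constant 2D texture (OIO4I4 order); see UploadWeights below. Int8 weights
// are handled separately via UploadQuantizedWeights.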
class FullyConnected : public GPUOperation {
 public:
  FullyConnected() = default;
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);
  }
  int3 GetGridSize() const override;

  // Move only
  FullyConnected(FullyConnected&& kernel);
  FullyConnected& operator=(FullyConnected&& kernel);
  FullyConnected(const FullyConnected&) = delete;
  FullyConnected& operator=(const FullyConnected&) = delete;

 private:
  FullyConnected(const OperationDef& definition, const GpuInfo& gpu_info);
  friend FullyConnected CreateFullyConnected(
      const GpuInfo& gpu_info, const OperationDef& definition,
      const FullyConnectedAttributes& attr);
  friend FullyConnected CreateFullyConnected(
      const GpuInfo& gpu_info, const OperationDef& definition,
      const FullyConnectedInt8Attributes& attr);

  void UploadQuantizedWeights(
      const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
      float zero_point);
  template <DataType T>
  void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                     bool weights_are_buffer);

  std::string GetFullyConnectedKernelCode(const OperationDef& op_def,
                                          const GpuInfo& gpu_info,
                                          bool weights_are_buffer,
                                          bool quantized);
};

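// Converts the weights to FLOAT32 or FLOAT16 (matching the calculation
// precision), rearranges them, and registers the result as the "weights"
// argument: as a buffer in IOO4I4 order, or as a constant 2D texture in
// OIO4I4 order.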
template <DataType T>
void FullyConnected::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                                   bool weights_are_buffer) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  const int elements_count = src_depth * dst_depth * 4;
  const bool f32_weights = definition_.precision == CalculationsPrecision::F32;

  const int float4_size = f32_weights ? 16 : 8;

  if (weights_are_buffer) {
    BufferDescriptor desc;
    desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
    desc.element_size = 16;
    desc.size = float4_size * elements_count;
    desc.data.resize(desc.size);

    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    }

    args_.AddObject("weights",
                    std::make_unique<BufferDescriptor>(std::move(desc)));
  } else {
    std::vector<uint8_t> data(float4_size * elements_count);
    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    }

    TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
        f32_weights ? DataType::FLOAT32 : DataType::FLOAT16,
        TensorStorageType::TEXTURE_2D, src_depth * 4, dst_depth, data.data());
    args_.AddObject("weights", std::make_unique<TensorDescriptor>(desc));
  }
}

FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedAttributes& attr);

FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedInt8Attributes& attr);
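
// A minimal usage sketch (illustrative only): `gpu_info`, `op_def`,
// `dst_channels`, and `src_channels` are assumed to come from the surrounding
// delegate setup, and FullyConnectedAttributes (weights + bias) is declared
// in common/operations.h.
//
//   FullyConnectedAttributes attr;
//   attr.weights.shape = OHWI(dst_channels, 1, 1, src_channels);
//   attr.weights.data.resize(attr.weights.shape.DimensionsProduct());
//   attr.bias.shape = Linear(dst_channels);
//   attr.bias.data.resize(dst_channels);
//   FullyConnected op = CreateFullyConnected(gpu_info, op_def, attr);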

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_