/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SVDF_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SVDF_H_

#include <stdint.h>

#include <algorithm>
#include <limits>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"

// SVDF op that compresses a fully connected op via low-rank matrix
// factorization. See https://research.google.com/pubs/archive/43813.pdf for
// details.
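//
// Tensor shapes used by the reference kernels below (inferred from how each
// buffer is indexed in this file):
//   input:           [batch_size, input_size]
//   weights_feature: [num_filters, input_size]
//   weights_time:    [num_filters, memory_size]
//   bias (optional): [num_units]
//   state:           [batch_size, num_filters, memory_size]
//   output:          [batch_size, num_units], with num_units = num_filters / rank.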

namespace tflite {
namespace reference_ops {

static inline void ApplyTimeWeightsBiasAndActivation(
    int batch_size, int memory_size, int num_filters, int num_units, int rank,
    const float* const __restrict__ weights_time_data,
    const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
    float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
    float* const __restrict__ output_ptr) {
  // Compute matmul(state, weights_time).
  for (int b = 0; b < batch_size; ++b) {
    float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
    float* scratch_ptr_batch = scratch_ptr + b * num_filters;
    tensor_utils::BatchVectorBatchVectorDotProduct(
        weights_time_data, state_ptr_batch, memory_size, num_filters,
        scratch_ptr_batch);
  }

  // Reduction sum.
  tensor_utils::ReductionSumVector(scratch_ptr, output_ptr,
                                   batch_size * num_units, rank);
  // Add bias if provided.
  if (bias_ptr) {
    tensor_utils::VectorBatchVectorAdd(bias_ptr, num_units, batch_size,
                                       output_ptr);
  }

  // Apply activation.
  tensor_utils::ApplyActivationToVector(output_ptr, batch_size * num_units,
                                        activation, output_ptr);
}

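// Integer reference SVDF kernel (int8 input/output, int16 state). The
// (scale_1_a, scale_1_b) multiplier requantizes the feature matmul result
// into the int16 state, and (scale_2_a, scale_2_b) requantizes the final
// accumulator into the int8 output around output_zp; input_zp is subtracted
// from each input value before the feature matmul.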
inline void EvalIntegerSVDF(
    const TfLiteSVDFParams* params, const RuntimeShape& input_shape,
    const int8_t* input_data, const RuntimeShape& weights_feature_shape,
    const int8_t* weights_feature_data, const RuntimeShape& weights_time_shape,
    const int16_t* weights_time_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, int16_t* state_data,
    const RuntimeShape& output_shape, int8_t* output_data,
    int32_t* scratch_data, int32_t* output_temp_data, int32_t scale_1_a,
    int scale_1_b, int32_t scale_2_a, int scale_2_b, int32_t input_zp,
    int32_t output_zp) {
  const int n_rank = params->rank;
  const int n_batch = input_shape.Dims(0);
  const int n_input = input_shape.Dims(1);
  const int n_filter = weights_feature_shape.Dims(0);
  const int n_unit = n_filter / n_rank;
  const int n_memory = weights_time_shape.Dims(1);

  // Left shift the activation_state.
  // std::copy is fine for overlapping ranges if the output is outside of the
  // input range. (This is not true for copy_n.)
  std::copy(state_data + 1, state_data + n_batch * n_memory * n_filter,
            state_data);

  // Feature matmul.
  // Note: no need to clear the latest activation; this matmul is not
  // accumulative.
  {
    const int32_t output_max = std::numeric_limits<int16_t>::max();
    const int32_t output_min = std::numeric_limits<int16_t>::min();
    int16_t* result_in_batch = state_data + (n_memory - 1);
    for (int b = 0; b < n_batch; b++) {
      const int8_t* matrix_data = weights_feature_data;
      for (int r = 0; r < n_filter; r++) {
        int32_t dot_prod = 0;
        const int8_t* vector_in_batch = input_data + b * n_input;
        for (int c = 0; c < n_input; c++) {
          dot_prod += *matrix_data++ * (*vector_in_batch++ - input_zp);
        }
        dot_prod =
            MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
        dot_prod = std::min(std::max(output_min, dot_prod), output_max);
        // This assumes the state is symmetrically quantized. Otherwise the
        // last entry of the state should be initialized to its zero point and
        // the dot_prod accumulated into it.
        // Equivalent to the following:
        //     result_in_batch = zero point, which happens to be zero.
        //     result_in_batch += dot_prod.
        *result_in_batch = dot_prod;
        result_in_batch += n_memory;
      }
    }
  }

  // Time.
  {
    for (int b = 0; b < n_batch; ++b) {
      const int16_t* state_data_batch = state_data + b * n_memory * n_filter;
      int32_t* scratch_data_batch = scratch_data + b * n_filter;
      tensor_utils::BatchVectorBatchVectorDotProduct(
          weights_time_data, state_data_batch, n_memory, n_filter,
          scratch_data_batch);
    }
  }

  // Reduce, add bias, rescale, activation.
  {
    // Reduce.
    tensor_utils::ReductionSumVector(scratch_data, output_temp_data,
                                     n_batch * n_unit, n_rank);
    // Add bias.
    if (bias_data) {
      tensor_utils::VectorBatchVectorAdd(bias_data, n_unit, n_batch,
                                         output_temp_data);
    }
    // Rescale.
    const int32_t output_max = std::numeric_limits<int8_t>::max();
    const int32_t output_min = std::numeric_limits<int8_t>::min();
    for (int i = 0; i < n_batch * n_unit; ++i) {
      int32_t x1 = output_temp_data[i];
      int32_t x2 = MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b);
      int32_t x3 = x2 + output_zp;
      int32_t x4 = std::min(std::max(output_min, x3), output_max);
      output_data[i] = static_cast<int8_t>(x4);
    }
  }
}

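// Float reference SVDF kernel: shifts the activation state left by one step,
// runs the feature matmul into scratch, stores the result in the newest
// (last) slot of each state row, then applies the time weights, bias, and
// activation via ApplyTimeWeightsBiasAndActivation.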
inline void EvalFloatSVDF(
    const TfLiteSVDFParams* params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& weights_feature_shape,
    const float* weights_feature_data, const RuntimeShape& weights_time_shape,
    const float* weights_time_data, const RuntimeShape& bias_shape,
    const float* bias_data, float* scratch_data, float* state_data,
    const RuntimeShape& output_shape, float* output_data) {
  const int rank = params->rank;
  const int batch_size = input_shape.Dims(0);
  const int input_size = input_shape.Dims(1);
  const int num_filters = weights_feature_shape.Dims(0);
  const int num_units = num_filters / rank;
  const int memory_size = weights_time_shape.Dims(1);

  // Left shift the activation_state.
  // std::copy is fine for overlapping ranges if the output is outside of the
  // input range. (This is not true for copy_n.)
  std::copy(state_data + 1, state_data + batch_size * memory_size * num_filters,
            state_data);

  // Clear scratch (the matmul is accumulative).
  std::fill_n(scratch_data, batch_size * num_filters, 0.0f);

  // Compute conv1d(inputs, weights_feature).
  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
      weights_feature_data, num_filters, input_size, input_data, batch_size,
      scratch_data);

  // Copy the latest activation from scratch into activation_state:
  // the last, i.e. (memory_size - 1)th, entry for each batch and filter.
  for (int i = 0; i < batch_size * num_filters; ++i) {
    state_data[i * memory_size + memory_size - 1] = scratch_data[i];
  }

  ApplyTimeWeightsBiasAndActivation(
      batch_size, memory_size, num_filters, num_units, rank, weights_time_data,
      bias_data, params->activation, state_data, scratch_data, output_data);
}

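// Hybrid reference SVDF kernel: the feature weights are int8 while the input,
// state, and output remain float. Unless the input batch is all zeros, the
// input is quantized per batch row (optionally asymmetrically, per
// params->asymmetric_quantize_inputs), its scaling factors are folded
// together with weights_feature_scale, and the feature matmul runs in the
// quantized domain; the remainder matches the float path.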
inline void EvalHybridSVDF(
    const TfLiteSVDFParams* params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& weights_feature_shape,
    const int8_t* weights_feature_data, const float weights_feature_scale,
    const RuntimeShape& weights_time_shape, const float* weights_time_data,
    const RuntimeShape& bias_shape, const float* bias_data, float* scratch,
    float* scaling_factors, int8_t* quantized_input, float* state,
    const RuntimeShape& output_shape, float* output_data, int32_t* zero_points,
    int32_t* row_sums, bool* compute_row_sums) {
  const int rank = params->rank;
  const int batch_size = input_shape.Dims(0);
  const int input_size = input_shape.Dims(1);
  const int num_filters = weights_feature_shape.Dims(0);
  const int num_units = num_filters / rank;
  const int memory_size = weights_time_shape.Dims(1);

  // Left shift the activation_state.
  // std::copy is fine for overlapping ranges if the output is outside of the
  // input range. (This is not true for copy_n.)
  std::copy(state + 1, state + batch_size * memory_size * num_filters, state);

  // Clear scratch (the matmul is accumulative).
  std::fill_n(scratch, batch_size * num_filters, 0.0f);

  if (!tensor_utils::IsZeroVector(input_data, batch_size * input_size)) {
    // Quantize input from float to int8_t.
    tensor_utils::BatchQuantizeFloats(
        input_data, batch_size, input_size, quantized_input, scaling_factors,
        zero_points, params->asymmetric_quantize_inputs);
    for (int b = 0; b < batch_size; ++b) {
      scaling_factors[b] *= weights_feature_scale;
    }

    // Compute conv1d(inputs, weights_feature).
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        weights_feature_data, num_filters, input_size, quantized_input,
        scaling_factors, batch_size, scratch,
        /*per_channel_scale=*/nullptr, zero_points,
        reinterpret_cast<int32_t*>(scratch), row_sums, compute_row_sums,
        /*context=*/nullptr);
  }
  // Copy the latest activation from scratch into activation_state:
  // the last, i.e. (memory_size - 1)th, entry for each batch and filter.
  for (int i = 0; i < batch_size * num_filters; ++i) {
    state[i * memory_size + memory_size - 1] = scratch[i];
  }

  // TODO(b/174275776): can optimize hybrid case ~5% by unrolling loop in
  // applying time weights so that the inner loop multiplies eight elements at
  // a time.
  ApplyTimeWeightsBiasAndActivation(
      batch_size, memory_size, num_filters, num_units, rank, weights_time_data,
      bias_data, params->activation, state, scratch, output_data);
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SVDF_H_