/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SVDF_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SVDF_H_

#include <stdint.h>

#include <algorithm>
#include <limits>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"

// SVDF op that compresses a fully connected op via low-rank matrix
// factorization. See https://research.google.com/pubs/archive/43813.pdf for
// details.
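//
// The feature weight matrix has num_filters = num_units * rank rows. For each
// batch and filter, the state tensor stores the last memory_size feature
// activations (newest last), i.e. its flat layout is
// [batch, num_filters, memory_size].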

namespace tflite {
namespace reference_ops {

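// Shared tail of the float and hybrid paths: computes the dot product of each
// filter's state with its time weights, sums each group of `rank` filter
// outputs into one of `num_units` outputs per batch, adds the optional bias,
// and applies the fused activation in place on `output_ptr`.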
static inline void ApplyTimeWeightsBiasAndActivation(
    int batch_size, int memory_size, int num_filters, int num_units, int rank,
    const float* const __restrict__ weights_time_data,
    const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
    float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
    float* const __restrict__ output_ptr) {
  // Compute matmul(state, weights_time).
  for (int b = 0; b < batch_size; ++b) {
    float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
    float* scratch_ptr_batch = scratch_ptr + b * num_filters;
    tensor_utils::BatchVectorBatchVectorDotProduct(
        weights_time_data, state_ptr_batch, memory_size, num_filters,
        scratch_ptr_batch);
  }

  // Reduction sum.
  tensor_utils::ReductionSumVector(scratch_ptr, output_ptr,
                                   batch_size * num_units, rank);
  // Add bias if provided.
  if (bias_ptr) {
    tensor_utils::VectorBatchVectorAdd(bias_ptr, num_units, batch_size,
                                       output_ptr);
  }

  // Apply activation.
  tensor_utils::ApplyActivationToVector(output_ptr, batch_size * num_units,
                                        activation, output_ptr);
}

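// Full-integer SVDF: int8_t input/output with int16_t state and time weights.
// scale_1_a/scale_1_b form the quantized multiplier that rescales the feature
// matmul result into the int16_t state; scale_2_a/scale_2_b together with
// output_zp rescale the reduced accumulator into the int8_t output.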
inline void EvalIntegerSVDF(
    const TfLiteSVDFParams* params, const RuntimeShape& input_shape,
    const int8_t* input_data, const RuntimeShape& weights_feature_shape,
    const int8_t* weights_feature_data, const RuntimeShape& weights_time_shape,
    const int16_t* weights_time_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, int16_t* state_data,
    const RuntimeShape& output_shape, int8_t* output_data,
    int32_t* scratch_data, int32_t* output_temp_data, int32_t scale_1_a,
    int scale_1_b, int32_t scale_2_a, int scale_2_b, int32_t input_zp,
    int32_t output_zp) {
  const int n_rank = params->rank;
  const int n_batch = input_shape.Dims(0);
  const int n_input = input_shape.Dims(1);
  const int n_filter = weights_feature_shape.Dims(0);
  const int n_unit = n_filter / n_rank;
  const int n_memory = weights_time_shape.Dims(1);

  // Left shift the activation_state.
  // std::copy is fine for overlapping ranges as long as the start of the
  // output range lies outside the input range. (This is not true for copy_n.)
  std::copy(state_data + 1, state_data + n_batch * n_memory * n_filter,
            state_data);

  // Feature matmul.
  // Note: no need to clear the latest activation, matmul is not accumulative.
  {
    const int32_t output_max = std::numeric_limits<int16_t>::max();
    const int32_t output_min = std::numeric_limits<int16_t>::min();
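    // result_in_batch points at the newest (last) entry of the current
    // filter's memory; advancing by n_memory steps to the next filter.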
    int16_t* result_in_batch = state_data + (n_memory - 1);
    for (int b = 0; b < n_batch; b++) {
      const int8_t* matrix_data = weights_feature_data;
      for (int r = 0; r < n_filter; r++) {
        int32_t dot_prod = 0;
        const int8_t* vector_in_batch = input_data + b * n_input;
        for (int c = 0; c < n_input; c++) {
          dot_prod += *matrix_data++ * (*vector_in_batch++ - input_zp);
        }
        dot_prod =
            MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
        dot_prod = std::min(std::max(output_min, dot_prod), output_max);
        // This assumes the state is symmetrically quantized. Otherwise the
        // newest state entry would have to be initialized to its zero point
        // and then accumulate dot_prod.
        // Equivalent to the following:
        //   *result_in_batch = zero point, which happens to be zero;
        //   *result_in_batch += dot_prod;
        *result_in_batch = dot_prod;
        result_in_batch += n_memory;
      }
    }
  }

  // Time.
  {
    for (int b = 0; b < n_batch; ++b) {
      const int16_t* state_data_batch = state_data + b * n_memory * n_filter;
      int32_t* scratch_data_batch = scratch_data + b * n_filter;
      tensor_utils::BatchVectorBatchVectorDotProduct(
          weights_time_data, state_data_batch, n_memory, n_filter,
          scratch_data_batch);
    }
  }

  // Reduce, add bias, rescale, activation.
  {
    // Reduce.
    tensor_utils::ReductionSumVector(scratch_data, output_temp_data,
                                     n_batch * n_unit, n_rank);
    // Add bias.
    if (bias_data) {
      tensor_utils::VectorBatchVectorAdd(bias_data, n_unit, n_batch,
                                         output_temp_data);
    }
    // Rescale.
    const int32_t output_max = std::numeric_limits<int8_t>::max();
    const int32_t output_min = std::numeric_limits<int8_t>::min();
    for (int i = 0; i < n_batch * n_unit; ++i) {
      int32_t x1 = output_temp_data[i];
      int32_t x2 = MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b);
      int32_t x3 = x2 + output_zp;
      int32_t x4 = std::min(std::max(output_min, x3), output_max);
      output_data[i] = static_cast<int8_t>(x4);
    }
  }
}

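// Float SVDF: shifts the state left by one, runs the feature matmul into
// scratch, writes each result into the newest slot of the corresponding
// filter's state, then applies time weights, bias and activation.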
inline void EvalFloatSVDF(
    const TfLiteSVDFParams* params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& weights_feature_shape,
    const float* weights_feature_data, const RuntimeShape& weights_time_shape,
    const float* weights_time_data, const RuntimeShape& bias_shape,
    const float* bias_data, float* scratch_data, float* state_data,
    const RuntimeShape& output_shape, float* output_data) {
  const int rank = params->rank;
  const int batch_size = input_shape.Dims(0);
  const int input_size = input_shape.Dims(1);
  const int num_filters = weights_feature_shape.Dims(0);
  const int num_units = num_filters / rank;
  const int memory_size = weights_time_shape.Dims(1);

  // Left shift the activation_state.
  // std::copy is fine for overlapping ranges as long as the start of the
  // output range lies outside the input range. (This is not true for copy_n.)
  std::copy(state_data + 1,
            state_data + batch_size * memory_size * num_filters, state_data);

  // Clear scratch (the matmul is accumulative).
  std::fill_n(scratch_data, batch_size * num_filters, 0.0f);

  // Compute conv1d(inputs, weights_feature).
  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
      weights_feature_data, num_filters, input_size, input_data, batch_size,
      scratch_data);

  // Copy the latest activation from scratch into activation_state:
  // the last, i.e. (memory_size - 1)-th, entry for each batch and filter.
  for (int i = 0; i < batch_size * num_filters; ++i) {
    state_data[i * memory_size + memory_size - 1] = scratch_data[i];
  }

  ApplyTimeWeightsBiasAndActivation(
      batch_size, memory_size, num_filters, num_units, rank, weights_time_data,
      bias_data, params->activation, state_data, scratch_data, output_data);
}

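// Hybrid SVDF: float activations with int8_t feature weights. The float input
// is quantized per batch at runtime; each batch's scaling factor is folded
// together with weights_feature_scale so that the feature matmul accumulates
// back into float in `scratch`. zero_points and row_sums carry the correction
// terms used when inputs are asymmetrically quantized.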
inline void EvalHybridSVDF(
    const TfLiteSVDFParams* params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& weights_feature_shape,
    const int8_t* weights_feature_data, const float weights_feature_scale,
    const RuntimeShape& weights_time_shape, const float* weights_time_data,
    const RuntimeShape& bias_shape, const float* bias_data, float* scratch,
    float* scaling_factors, int8_t* quantized_input, float* state,
    const RuntimeShape& output_shape, float* output_data, int32_t* zero_points,
    int32_t* row_sums, bool* compute_row_sums) {
  const int rank = params->rank;
  const int batch_size = input_shape.Dims(0);
  const int input_size = input_shape.Dims(1);
  const int num_filters = weights_feature_shape.Dims(0);
  const int num_units = num_filters / rank;
  const int memory_size = weights_time_shape.Dims(1);

  // Left shift the activation_state.
  // std::copy is fine for overlapping ranges as long as the start of the
  // output range lies outside the input range. (This is not true for copy_n.)
  std::copy(state + 1, state + batch_size * memory_size * num_filters, state);

  // Clear scratch (the matmul is accumulative).
  std::fill_n(scratch, batch_size * num_filters, 0.0f);

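  // If the input is all zeros, the quantize-and-matmul step can be skipped:
  // scratch was cleared above, so zeros are written into the newest state
  // slots below.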
  if (!tensor_utils::IsZeroVector(input_data, batch_size * input_size)) {
    // Quantize input from float to int8_t.
    tensor_utils::BatchQuantizeFloats(
        input_data, batch_size, input_size, quantized_input, scaling_factors,
        zero_points, params->asymmetric_quantize_inputs);
    for (int b = 0; b < batch_size; ++b) {
      scaling_factors[b] *= weights_feature_scale;
    }

    // Compute conv1d(inputs, weights_feature).
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        weights_feature_data, num_filters, input_size, quantized_input,
        scaling_factors, batch_size, scratch,
        /*per_channel_scale=*/nullptr, zero_points,
        reinterpret_cast<int32_t*>(scratch), row_sums, compute_row_sums,
        /*context=*/nullptr);
  }
  // Copy the latest activation from scratch into activation_state:
  // the last, i.e. (memory_size - 1)-th, entry for each batch and filter.
  for (int i = 0; i < batch_size * num_filters; ++i) {
    state[i * memory_size + memory_size - 1] = scratch[i];
  }

  // TODO(b/174275776): can optimize hybrid case ~5% by unrolling loop in
  // applying time weights so that the inner loop multiplies eight elements at
  // a time.
  ApplyTimeWeightsBiasAndActivation(
      batch_size, memory_size, num_filters, num_units, rank, weights_time_data,
      bias_data, params->activation, state, scratch, output_data);
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SVDF_H_