/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {

namespace tensor_utils {

// Multiplies a matrix by a scalar and reduces the result on each row to a
// scalar.
// Parameters:
// - matrix: matrix of size n_row * n_col
// - scalar: the scalar that is multiplied to each element in the matrix
// - n_row: the row count of the matrix
// - n_col: the column count of the matrix
// - output: the 32bit output
// Note: Saturation is not needed because the accumulated int8 * int8 products
// cannot overflow: (2^31 - 1) / (2^14) = 131072, which is bigger than n_row,
// and a non-zero initial output value is assumed not to be exceptionally
// large.
void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
                                    int32_t n_row, int32_t n_col,
                                    int32_t* output);
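
// The reference sketch below is illustrative only and is not part of the
// TensorFlow Lite API (the optimized kernels live in the corresponding .cc
// implementations). It documents the intended semantics of
// MatrixScalarMultiplyAccumulate: scale every matrix element by the scalar,
// sum each row, and accumulate the row sums into output. The "Example" name
// is hypothetical.
inline void ExampleMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                                  int32_t scalar, int32_t n_row,
                                                  int32_t n_col,
                                                  int32_t* output) {
  for (int32_t row = 0; row < n_row; ++row) {
    int32_t row_sum = 0;
    for (int32_t col = 0; col < n_col; ++col) {
      row_sum += matrix[row * n_col + col];
    }
    // Accumulate into the caller-provided output rather than overwriting it.
    output[row] += row_sum * scalar;
  }
}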

// Adds a vector to each batch in the batch vector.
template <typename T>
void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch,
                          T* batch_vector) {
  for (int b = 0; b < n_batch; b++) {
    for (int i = 0; i < v_size; ++i) {
      batch_vector[i] += vector[i];
    }
    batch_vector += v_size;
  }
}

// Cwise product of two vectors.
template <typename T>
inline void VectorVectorCwiseProduct(const T* vector1, const T* vector2,
                                     int v_size, T* result) {
  for (int v = 0; v < v_size; v++) {
    *result++ = *vector1++ * *vector2++;
  }
}

// Cwise product of a vector and a batch-vector.
template <typename T>
inline void VectorBatchVectorCwiseProduct(const T* vector, int v_size,
                                          const T* batch_vector, int n_batch,
                                          T* result) {
  for (int b = 0; b < n_batch; b++) {
    VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
    // Update the pointers.
    result += v_size;
    batch_vector += v_size;
  }
}

// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// assumption here is that the result array is initialized to valid values.
template <typename T>
inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
                                               const T* __restrict__ vector2,
                                               int v_size,
                                               T* __restrict__ result) {
  for (int v = 0; v < v_size; v++) {
    *result++ += *vector1++ * *vector2++;
  }
}

// Cwise product and accumulate of a vector and a batch-vector. Since it's a
// MAC operation, the assumption here is that the result array is initialized
// to valid values.
template <typename T>
inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size,
                                                    const T* batch_vector,
                                                    int n_batch, T* result) {
  for (int b = 0; b < n_batch; b++) {
    VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
    // Update the pointers.
    result += v_size;
    batch_vector += v_size;
  }
}

// Batch vector initialization with another vector.
template <typename T>
void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch,
                             T* batch_vector) {
  for (int b = 0; b < n_batch; b++) {
    std::copy_n(vector, v_size, batch_vector + b * v_size);
  }
}

// Checks if all entries of vector are zero for float.
bool IsZeroVector(const float* vector, int v_size);

// Checks if all entries of vector are zero for int8.
bool IsZeroVector(const int8_t* vector, int v_size);

// Quantizes a buffer of floating point values using a symmetric quantization
// (i.e. linear quantization without an offset) to 8-bit signed integers.
// It also outputs the range (min, max) of the floating point buffer, and the
// scaling factor used to quantize the values.
void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float* min_value,
                             float* max_value, float* scaling_factor);

// Quantizes a buffer of floating point values using a symmetric quantization
// (i.e. linear quantization without an offset) to 8-bit signed integers.
// It uses the range (min, max) provided to the function to calculate the
// appropriate scaling factor to quantize the values.
void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float min_value,
                             float max_value, float* scaling_factor);
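
// Illustrative sketch of symmetric quantization (not the library
// implementation; the "Example" name and the exact rounding/clamping choices
// here are assumptions). The scaling factor maps the largest absolute value
// onto the int8 range, and each value is rounded and clamped.
inline void ExampleSymmetricQuantize(const float* values, int size,
                                     int8_t* quantized_values,
                                     float* scaling_factor) {
  float max_abs = 0.0f;
  for (int i = 0; i < size; ++i) {
    max_abs = std::max(max_abs, std::fabs(values[i]));
  }
  if (max_abs == 0.0f) {
    *scaling_factor = 1.0f;
    std::fill_n(quantized_values, size, 0);
    return;
  }
  *scaling_factor = max_abs / 127.0f;
  const float inverse_scale = 127.0f / max_abs;
  for (int i = 0; i < size; ++i) {
    const int32_t q =
        static_cast<int32_t>(std::round(values[i] * inverse_scale));
    quantized_values[i] =
        static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-127, q)));
  }
}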

void AsymmetricQuantizeFloats(const float* values, const int size,
                              int8_t* quantized_values, float* scaling_factor,
                              int32_t* offset);

// Helper function to quantize floats.
// float_data_ptr     input float vectors
// n_batch            number of input vectors
// n_data             size of a single input vector
// quantized_data_ptr (out) vector with quantized data
// scaling_factors    (out) scaling factors (one per vector)
// zero_points        (out) zero points (one per vector)
// do_asymmetric      controls if the quantization should be asymmetric.
inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch,
                                int n_data, int8_t* quantized_data_ptr,
                                float* scaling_factors, int32_t* zero_points,
                                bool do_asymmetric) {
  for (int b = 0; b < n_batch; ++b) {
    const int offset = b * n_data;
    if (do_asymmetric) {
      tensor_utils::AsymmetricQuantizeFloats(
          float_data_ptr + offset, n_data, quantized_data_ptr + offset,
          &scaling_factors[b], &zero_points[b]);
    } else {
      float unused_min, unused_max;
      tensor_utils::SymmetricQuantizeFloats(
          float_data_ptr + offset, n_data, quantized_data_ptr + offset,
          &unused_min, &unused_max, &scaling_factors[b]);
    }
  }
}

// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
// dimension composed by input vectors independent from each other). The result
// of the multiplication is accumulated to the passed result buffer.
// More specifically, for a matrix M of shape [n, i] and a batched-vector
// of shape [i, batch] it will first compute the product of shape [n, batch].
// This product will be accumulated to the result buffer.
void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                         int m_cols, const float* vector,
                                         int n_batch, float* result);
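
// Illustrative sketch of the float matrix * batched-vector accumulation
// declared above (not the library implementation; the "Example" name is
// hypothetical). It assumes a row-major [m_rows, m_cols] matrix, a
// [n_batch, m_cols] batched vector, and a [n_batch, m_rows] result buffer
// that is accumulated into rather than overwritten.
inline void ExampleMatrixBatchVectorMultiplyAccumulate(
    const float* matrix, int m_rows, int m_cols, const float* vector,
    int n_batch, float* result) {
  for (int b = 0; b < n_batch; ++b) {
    for (int r = 0; r < m_rows; ++r) {
      float acc = 0.0f;
      for (int c = 0; c < m_cols; ++c) {
        acc += matrix[r * m_cols + c] * vector[b * m_cols + c];
      }
      result[b * m_rows + r] += acc;  // Accumulate, do not overwrite.
    }
  }
}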

// Same as the function above, but the matrix is a sparse tensor with block
// pattern 1x4.
// This function assumes that m_cols is a multiple of the block size (4 in this
// case) so that there's no incomplete block.
void SparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result);

// Same as the function above, but the matrix is stored in block compressed
// sparse row format with block pattern 1x16, which consists of two arrays:
//   1. A matrix array stores non-zero blocks of the matrix in row-major order.
//   2. A ledger array stores nrows groups, one group per row. Each group
//      starts with an integer representing the number of non-zero blocks for
//      the corresponding row, followed by the column indexes of the first
//      element of each non-zero block.
// This function assumes that
//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
//   2. m_cols < 254 * 16 so that a block index can be represented by a uint8.
void SparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result);

// Same as the function above, but for values quantized using symmetric
// quantization (e.g. by calling SymmetricQuantizeFloats).
// The passed scaling factors are a buffer of the quantization scaling factors
// that will be used to dequantize the products into the final result buffer.
// These scaling factors are the product of the matrix scaling factor and the
// vector's scaling factor, one per batch (i.e. this allows quantizing each
// batch in the batch-vector matrix independently).
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    float* __restrict__ result);

// Same as the function above, except that the vector values are quantized
// with asymmetric quantization per-batch and the matrix is quantized per row.
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    float* __restrict__ result, const float* __restrict__ per_channel_scale,
    const int32_t* __restrict__ input_offset);

// Same as the function above, but the matrix is a sparse tensor with block
// pattern 1x16.
// This function assumes that m_cols is a multiple of the block size (16 in
// this case) so that there's no incomplete block. It also assumes that all
// input, output, and filter offsets are zero.
void SparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    const int32_t output_shift, const int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result);

// Same as the function above, but the matrix is stored in block compressed
// sparse row format with block pattern 1x16, which consists of two arrays:
//   1. A matrix array stores non-zero blocks of the matrix in row-major order.
//   2. A ledger array stores nrows groups, one group per row. Each group
//      starts with an integer representing the number of non-zero blocks for
//      the corresponding row, followed by the column index of the first
//      element of each non-zero block.
// This function assumes that
//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
//   2. m_cols < 254 * 16 so that a block index can be represented by a uint8.
void SparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    const int m_rows, const int m_cols, const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    float* __restrict__ result);

// Same as the above 8, 8, 8 integer matmul, except that it takes the zero
// point into account and does not accumulate (the output is overwritten).
// TODO(b/148688698): remove this function by folding zero point calculation in
// prepare() function.
void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
                               const int8_t* input_to_gate_weights,
                               int32_t input_to_gate_effective_scale_a,
                               int32_t input_to_gate_effective_scale_b,
                               int32_t n_batch, int32_t n_input, int32_t n_cell,
                               int8_t* gate_output, int8_t gate_output_zp);

// Same as above, but with 16 bit and 8 bit inputs and an 8 bit output.
// Used in projection when hidden is 16bit.
void MatrixBatchVectorMultiply(const int16_t* hidden,
                               const int8_t* hidden_to_output_weights,
                               int32_t proj_effective_scale_a,
                               int32_t proj_effective_scale_b,
                               const int32_t* gate_bias, int32_t n_batch,
                               int32_t n_hidden, int32_t n_output,
                               int32_t output_zp, int8_t* proj_output);

// Apply Layer Normalization (https://arxiv.org/abs/1607.06450) to a quantized
// vector.
// Parameters:
// - input: batch vector of size n_batch * n_input; 16 bit.
// - layer_norm_weights: the quantized layer normalization weights.
// - bias: the bias for the layer normalization.
// - layer_norm_scale_a: multiplier for scale factor.
// - layer_norm_scale_b: shift for scale factor.
// - variance_limit: the guard to make sure the inverse does not overflow.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 16 bit output
void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
                    const int32_t* bias, int32_t layer_norm_scale_a,
                    int32_t layer_norm_scale_b, int32_t variance_limit,
                    int n_batch, int n_input, int16_t* output);

// Same as above but the internal calculation is done in float.
void ApplyLayerNormFloat(const int16_t* input,
                         const int16_t* layer_norm_weights,
                         int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
                         const int32_t* bias, int n_batch, int n_input,
                         int16_t* output);

// Apply Sigmoid to a quantized vector.
// Parameters:
// - input: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 16 bit output
// The input is in Q3.12 format and the output is in Q0.15 format.
void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
                  int16_t* output);

// Same as above but the internal calculation is float.
void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                       int16_t* output);

// Apply Tanh to a quantized vector.
// Parameters:
// - integer_bits: the integer bits of the input.
//   Currently supports 0, 1, 2, 3, 4, 5, 6.
// - input: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 16 bit output
// The input is in Qm.(15-m) format (m = integer_bits) and the output is in
// Q0.15 format.
void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
               int32_t n_input, int16_t* output);

// Apply Tanh to a quantized vector. The internal calculation is in float.
// - Input has 2^(integer_bits) as scale.
// - Output has Q0.15 as scale.
void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                    int32_t integer_bits, int16_t* output);

// Element-wise multiplication of two quantized vectors.
// Parameters:
// - input_1: batch vector of size n_batch * n_input; 16 bit.
// - input_2: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - shift: the shift needed to produce the output.
// - output: the 16 bit output of size n_batch * n_input.
// Output does not need to be initialized.
void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int shift, int16_t* output);
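
// Illustrative sketch of the element-wise multiply declared above (not the
// library implementation; the "Example" name is hypothetical and the plain
// truncating shift is an assumption -- the real kernel may use a rounding
// shift). Each int16 product is widened to int32, shifted right by 'shift',
// and saturated back to int16.
inline void ExampleCwiseMulInt16(const int16_t* input_1,
                                 const int16_t* input_2, int n_batch,
                                 int n_input, int shift, int16_t* output) {
  for (int i = 0; i < n_batch * n_input; ++i) {
    int32_t value =
        static_cast<int32_t>(input_1[i]) * static_cast<int32_t>(input_2[i]);
    value = value >> shift;
    value = std::min<int32_t>(32767, std::max<int32_t>(-32768, value));
    output[i] = static_cast<int16_t>(value);
  }
}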

// Element-wise multiplication of two quantized vectors.
// Parameters:
// - input_1: batch vector of size n_batch * n_input; 16 bit.
// - input_2: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - shift: the shift needed to produce the output.
// - output: the 8 bit output of size n_batch * n_input.
// Output does not need to be initialized.
void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int shift, int8_t* output);

// Element-wise multiplication of two quantized vectors with rescaling.
// Parameters:
// - input_1: batch vector of size n_batch * n_input; 16 bit.
// - input_2: batch vector of size n_batch * n_input; 16 bit.
// - multiplier: the multiplier part of scale.
// - shift: the shift part of scale.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 8 bit output of size n_batch * n_input.
// - output_zp: the zero point of output.
// Output does not need to be initialized.
// The multiplier ("m") and shift ("s") encode the output scale as
// scale = m * 2^(s - 31).
void CwiseMul(const int16_t* input_1, const int16_t* input_2,
              int32_t multiplier, int32_t shift, int32_t n_batch,
              int32_t n_input, int32_t output_zp, int8_t* output);

// Element-wise saturating addition of two quantized vectors without rescaling.
// Parameters:
// - input_1: batch vector of size n_batch * n_input; 16 bit.
// - input_2: batch vector of size n_batch * n_input; 16 bit.
// - n_batch: the number of batches.
// - n_input: the size for input and output.
// - output: the 16 bit output of size n_batch * n_input.
// Output does not need to be initialized.
void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int16_t* output);

// Element-wise in-place clipping of a vector. Overloaded for float, int16_t,
// int8_t. Parameters:
// - vector: vector of size v_size.
// - v_size: the size of the vector.
// - clipping_value: the value used for clipping.
void CwiseClipping(float* vector, const int v_size, const float clipping_value);
void CwiseClipping(int16_t* vector, const int v_size,
                   const int16_t clipping_value);
void CwiseClipping(int8_t* vector, const int v_size,
                   const int8_t clipping_value);
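
// Illustrative sketch of the in-place clipping declared above (not the
// library implementation; the "Example" name is hypothetical). Every element
// is clamped to [-clipping_value, clipping_value].
inline void ExampleCwiseClipping(float* vector, int v_size,
                                 float clipping_value) {
  for (int i = 0; i < v_size; ++i) {
    vector[i] = std::min(clipping_value, std::max(-clipping_value, vector[i]));
  }
}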

// Dot product of two vectors.
float VectorVectorDotProduct(const float* vector1, const float* vector2,
                             int v_size);

// Dot product of two batch vectors of size n_batch * v_size:
// vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
//            x_2_1, x_2_2, ..., x_2_vsize,
//            ...
//            x_nbatch_1, ..., x_nbatch_vsize]
// vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
//            y_2_1, y_2_2, ..., y_2_vsize,
//            ...
//            y_nbatch_1, ..., y_nbatch_vsize]
// Then result will be a vector of n_batch size starting from 'result':
// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
//  x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
//  ...
//  x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize]
template <typename T>
inline void BatchVectorBatchVectorDotProduct(const T* vector1, const T* vector2,
                                             int v_size, int n_batch,
                                             T* result) {
  for (int b = 0; b < n_batch; b++) {
    result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
    vector1 += v_size;
    vector2 += v_size;
  }
}

// Same as above but input is 16bit and output is 32bit.
void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result);

// Same as above, but inputs are 16bit integer and output is 16bit integer.
void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
                                             int shift, int16_t* result);

// Compute "1.0f - elements of vector" (used in CIFG).
void Sub1Vector(const float* vector, int v_size, float* result);

// Compute "1.0f - elements of vector" (used in CIFG) for int16 input.
// "vector" has range [0, 32767] because it is the output of sigmoid function.
void Sub1Vector(const int16_t* vector, int v_size, int16_t* result);

// Multiply all elements of vector with a scalar.
void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                          float* result);

// Reduce-sum on a float input vector:
// input_vector: float pointer to input vector.
// output_vector: float pointer to output vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
void ReductionSumVector(const float* input_vector, float* output_vector,
                        int output_size, int reduction_size);
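
// Illustrative sketch of the reduce-sum declared above (not the library
// implementation; the "Example" name is hypothetical). Each output element is
// the sum of 'reduction_size' consecutive input elements, so the input must
// hold output_size * reduction_size values.
inline void ExampleReductionSumVector(const float* input_vector,
                                      float* output_vector, int output_size,
                                      int reduction_size) {
  for (int o = 0; o < output_size; ++o) {
    float sum = 0.0f;
    for (int r = 0; r < reduction_size; ++r) {
      sum += input_vector[o * reduction_size + r];
    }
    output_vector[o] = sum;
  }
}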

// Same as above but input/output is 32 bit integer.
void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size);

// Same as above but input is 8 bit integer.
void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size);

// Layer norm for each batch.
void MeanStddevNormalization(const float* input_vector, float* output_vector,
                             int v_size, int n_batch);
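
// Illustrative sketch of the per-batch mean/stddev normalization declared
// above (not the library implementation; the "Example" name and the epsilon
// guard against zero variance are assumptions). Each row of v_size elements
// is normalized to zero mean and unit variance.
inline void ExampleMeanStddevNormalization(const float* input_vector,
                                           float* output_vector, int v_size,
                                           int n_batch) {
  constexpr float kEpsilon = 1e-8f;  // Hypothetical guard value.
  for (int b = 0; b < n_batch; ++b) {
    const float* in = input_vector + b * v_size;
    float* out = output_vector + b * v_size;
    float sum = 0.0f;
    for (int i = 0; i < v_size; ++i) sum += in[i];
    const float mean = sum / v_size;
    float sum_sq_diff = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      const float diff = in[i] - mean;
      sum_sq_diff += diff * diff;
    }
    const float stddev = std::sqrt(sum_sq_diff / v_size + kEpsilon);
    for (int i = 0; i < v_size; ++i) {
      out[i] = (in[i] - mean) / stddev;
    }
  }
}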

// Saturating add with rescaling of both inputs.
void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                          const int8_t* recurrent, int8_t recurrent_zp,
                          int32_t input_effective_scale_a,
                          int32_t input_effective_scale_b,
                          int32_t recurrent_effective_scale_a,
                          int32_t recurrent_effective_scale_b, int32_t n_batch,
                          int32_t n_cell, int16_t* output);

}  // namespace tensor_utils

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_PORTABLE_TENSOR_UTILS_H_