#pragma once

// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/native/quantized/AffineQuantizerBase.h>

#include <c10/util/irange.h>
#include <c10/util/qint32.h>
#include <c10/util/qint8.h>
#include <c10/util/quint8.h>

#include <array>
#include <cmath>

// This file defines Vectorized<> for the quantized types.
//
//
// Currently, we simply use these classes as efficient converters between
// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
// where doing the arithmetic in full-precision is acceptable (e.g.
// elementwise operators).
//
//
// Conversions are as follows:
//  Vectorized<qint8> -> 4x Vectorized<float>
//  Vectorized<quint8> -> 4x Vectorized<float>
//  Vectorized<qint32> -> 1x Vectorized<float>
//
// The size of the returned float vector is specified by the special
// constexpr function float_num_vecs. The type of the value returned
// from dequantize (and expected as an argument to quantize) is
// specified by float_vec_return_type.
//
// When writing kernels with these vectors, it is expected that floating-
// point operations will be carried out in a loop over
// Vectorized<T>::float_num_vecs iterations.
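//
// A minimal sketch of that pattern (illustrative only, not part of this
// header; `src`, `dst`, and the scale/zero-point values are hypothetical):
//
//   using qvec = Vectorized<c10::quint8>;
//   qvec q = qvec::loadu(src);
//   auto floats = q.dequantize(scale_v, zp_v, scale_zp_premul_v);
//   for (const auto i : c10::irange(qvec::float_num_vecs())) {
//     floats[i] = floats[i] * floats[i];  // arbitrary float-side math
//   }
//   qvec::quantize(floats, scale, zero_point, inverse_scale).store(dst);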

namespace at {
namespace vec {
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_AVX512)

#ifdef _MSC_VER
__declspec(align(64)) struct Vectorizedqi {
 protected:
  __m512i vals;
#else
struct Vectorizedqi {
 protected:
  __m512i vals __attribute__((aligned(64)));
#endif

 public:
  Vectorizedqi() {}
  Vectorizedqi(__m512i v) : vals(v) {}
  operator __m512i() const {
    return vals;
  }
};

template <typename T>
__m512i pack_saturate_and_clamp(
    __m512i first,
    __m512i second,
    T min_val,
    T max_val);

template <>
inline __m512i pack_saturate_and_clamp<int32_t>(
    __m512i first [[maybe_unused]],
    __m512i second [[maybe_unused]],
    int32_t min_val [[maybe_unused]],
    int32_t max_val [[maybe_unused]]) {
  // This function is for linkage only, will not be used
  AT_ERROR("pack_saturate_and_clamp<int32_t> is not supported");
  return __m512i{};
}

template <>
inline __m512i pack_saturate_and_clamp<int8_t>(
    __m512i first,
    __m512i second,
    int8_t min_val,
    int8_t max_val) {
  __m512i packed_and_sat = _mm512_packs_epi16(first, second);
  return _mm512_max_epi8(
      _mm512_set1_epi8(min_val),
      _mm512_min_epi8(packed_and_sat, _mm512_set1_epi8(max_val)));
}

template <>
inline __m512i pack_saturate_and_clamp<uint8_t>(
    __m512i first,
    __m512i second,
    uint8_t min_val,
    uint8_t max_val) {
  __m512i packed_and_sat = _mm512_packus_epi16(first, second);
  return _mm512_max_epu8(
      _mm512_set1_epi8(min_val),
      _mm512_min_epu8(packed_and_sat, _mm512_set1_epi8(max_val)));
}
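
// Note on the pack intrinsics above: _mm512_packs_epi16 (signed) and
// _mm512_packus_epi16 (unsigned) saturate 16-bit lanes down to 8 bits
// independently within each 128-bit lane, so callers re-permute the result
// whenever linear element order matters.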

template <typename T>
typename std::enable_if_t<std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>, at::vec::Vectorized<float>>
inline convert_int8_to_float(at::vec::Vectorized<T> src) {
  // Note: this function only converts as many input elements as fit into one
  // at::vec::Vectorized<float>, i.e. only the first 16 bytes (16 x 8 bits)
  // of src are handled.
  __m128i input_128 = _mm512_castsi512_si128(src);
  // Convert from 16*uint8/int8 to 16*int32
  __m512i input_512_extended;
  if constexpr (std::is_same_v<T, uint8_t>)
    input_512_extended = _mm512_cvtepu8_epi32(input_128);
  else
    input_512_extended = _mm512_cvtepi8_epi32(input_128);
  // Convert from 16*int32 to 16*float32
  return _mm512_cvtepi32_ps(input_512_extended);
}

template <typename T>
typename std::enable_if_t<std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>, at::vec::Vectorized<T>>
inline convert_float_to_int8(at::vec::Vectorized<float> src) {
  // Convert from float32 to int32 with truncation
  __m512i x_values_int32 = _mm512_cvttps_epi32(src);

  // Convert from int32 to int16 using signed saturation
  __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);

  constexpr auto min_val = std::numeric_limits<T>::min();
  constexpr auto max_val = std::numeric_limits<T>::max();

  // Convert from int16 to uint8/int8 using saturation (unsigned for uint8,
  // signed for int8)
  __m512i xyzw_clamped_v = pack_saturate_and_clamp<T>(
      xy_packed_v, xy_packed_v, min_val, max_val);
  __m512i permute_mask_v =
      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02,
                       0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);
  return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
}
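
// A hedged usage sketch of the two helpers above (`buf` and `out` are
// hypothetical buffers of at least 16 elements; only one Vectorized<float>'s
// worth of lanes round-trips):
//
//   auto v = at::vec::Vectorized<uint8_t>::loadu(buf, 16);
//   at::vec::Vectorized<float> f = convert_int8_to_float<uint8_t>(v);
//   convert_float_to_int8<uint8_t>(f + at::vec::Vectorized<float>(1.0f))
//       .store(out, 16);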

template <typename T>
__FORCE_INLINE void QuantizeAvx512(
    const float* src,
    T* dst,
    int len,
    float inverse_scale,
    int64_t zero_point) {
  constexpr int VLEN = 16;
  constexpr auto min_val = std::numeric_limits<T>::min();
  constexpr auto max_val = std::numeric_limits<T>::max();
  const __m512i min_v = _mm512_set1_epi32(min_val);
  const __m512i max_v = _mm512_set1_epi32(max_val);
  // This is the largest int32 value < int32_max exactly representable in float
  constexpr int32_t int32_float_max_val =
      std::numeric_limits<int32_t>::max() - 127;
  int i = 0;
  __m512 inverse_scale_v = _mm512_set1_ps(inverse_scale);
  // clang-format off
  static const __m512i shuffle_mask_v = _mm512_set_epi8(
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0x0c, 0x08, 0x04, 0x00,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0x0c, 0x08, 0x04, 0x00,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0x0c, 0x08, 0x04, 0x00,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0x0c, 0x08, 0x04, 0x00);
  // clang-format on
  __m512i permute_mask_v =
      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02,
                       0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);
  __m512i permute_mask_l8_v =
      _mm512_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                       0x00, 0x00, 0x00, 0x00, 0x0c, 0x08, 0x04, 0x00);
  int len_aligned = len / (VLEN * 4) * (VLEN * 4);
  for (; i < len_aligned; i += 4 * VLEN) {
    // x
    __m512 x_vals = _mm512_load_ps(src + i);
    __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v);
    // If the floating point value is greater than int32_max,
    // _mm512_cvtps_epi32 converts them to negative values. Clip at
    // int32_float_max_val to avoid this.
    x_transformed_v =
        _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val));
    // y
    __m512 y_vals = _mm512_load_ps(src + i + VLEN);
    __m512 y_transformed_v = _mm512_mul_ps(y_vals, inverse_scale_v);
    y_transformed_v =
        _mm512_min_ps(y_transformed_v, _mm512_set1_ps(int32_float_max_val));
    // z
    __m512 z_vals = _mm512_load_ps(src + i + 2 * VLEN);
    __m512 z_transformed_v = _mm512_mul_ps(z_vals, inverse_scale_v);
    z_transformed_v =
        _mm512_min_ps(z_transformed_v, _mm512_set1_ps(int32_float_max_val));
    // w
    __m512 w_vals = _mm512_load_ps(src + i + 3 * VLEN);
    __m512 w_transformed_v = _mm512_mul_ps(w_vals, inverse_scale_v);
    w_transformed_v =
        _mm512_min_ps(w_transformed_v, _mm512_set1_ps(int32_float_max_val));

    __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v);
    __m512i y_rounded_v = _mm512_cvtps_epi32(y_transformed_v);
    __m512i z_rounded_v = _mm512_cvtps_epi32(z_transformed_v);
    __m512i w_rounded_v = _mm512_cvtps_epi32(w_transformed_v);

    // add zero point
    x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point));
    y_rounded_v = _mm512_add_epi32(y_rounded_v, _mm512_set1_epi32(zero_point));
    z_rounded_v = _mm512_add_epi32(z_rounded_v, _mm512_set1_epi32(zero_point));
    w_rounded_v = _mm512_add_epi32(w_rounded_v, _mm512_set1_epi32(zero_point));

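    // _mm512_packs_epi32 packs 32-bit lanes down to 16 bits within each
    // 128-bit lane, so the packed elements come out interleaved across lanes;
    // the permute below restores linear element order.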
    __m512i xy_packed_v = _mm512_packs_epi32(x_rounded_v, y_rounded_v);
    __m512i zw_packed_v = _mm512_packs_epi32(z_rounded_v, w_rounded_v);
    __m512i xyzw_clamped_v =
        pack_saturate_and_clamp<T>(xy_packed_v, zw_packed_v, min_val, max_val);

    xyzw_clamped_v =
        _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
    _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + i), xyzw_clamped_v);
  }

  // Additional single-vector (16-lane) loop to take advantage of AVX512 when
  // len is smaller; based on fbgemm::QuantizeAvx2 (https://github.com/pytorch/FBGEMM)
  for (; i < len / VLEN * VLEN; i += VLEN) {
    __m512 x_vals = _mm512_load_ps(src + i);
    __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v);
    x_transformed_v =
        _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val));
    __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v);
    x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point));
    __m512i x_clipped_v =
        _mm512_max_epi32(min_v, _mm512_min_epi32(max_v, x_rounded_v));

    x_clipped_v = _mm512_shuffle_epi8(x_clipped_v, shuffle_mask_v);
    x_clipped_v = _mm512_permutexvar_epi32(permute_mask_l8_v, x_clipped_v);
    _mm_storeu_si128(
        reinterpret_cast<__m128i*>(dst + i),
        _mm512_castsi512_si128(x_clipped_v));
  }

  for (; i < len; ++i) {
    float transformed = src[i] * inverse_scale;

    // Not exactly the same behavior as the vectorized code.
    // The vectorized code above always rounds to even in halfway cases
    // (https://software.intel.com/en-us/node/523819), but std::nearbyint
    // does the same only when the current rounding mode is FE_TONEAREST.
    // However, in practice, this should not be a problem because most cases
    // use the default rounding mode FE_TONEAREST.
    // Note that we cannot implement the same behavior as the vectorized code
    // using std::round because it does rounding away from zero in halfway
    // cases.
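    // For example, under FE_TONEAREST, std::nearbyint(0.5f) == 0.0f and
    // std::nearbyint(1.5f) == 2.0f, whereas std::round(0.5f) == 1.0f.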
    transformed = zero_point + std::nearbyint(transformed);
    float clipped =
        std::min(std::max(transformed, float(min_val)), float(max_val));
    dst[i] = clipped;
  }
}

template<>
struct Vectorized<c10::qint32> : public Vectorizedqi {
  using size_type = int;
  static constexpr size_type size() {
    return 16;
  }

  static constexpr int float_num_vecs() {
    return 1;
  }

  static constexpr int int_num_vecs() {
    return 1;
  }

  using float_vec_return_type = std::array<Vectorized<float>, 1>;
  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 1>;
  using value_type = c10::qint32::underlying;

 public:
  using Vectorizedqi::Vectorizedqi;
  Vectorized() {}

  Vectorized(__m512i vals_) { vals = vals_;}

  // Broadcast constructor
  Vectorized(const c10::qint32& val) {
    value_type uw = val.val_;
    vals = _mm512_set1_epi32(uw);
  }

  void store(void* ptr, int count = size()) const {
    if (count != size()) {
      memcpy(ptr, &vals, count * sizeof(value_type));
    } else {
      _mm512_storeu_si512((__m512i*)ptr, vals);
    }
  }

  static Vectorized<c10::qint32> loadu(const void* ptr) {
    return Vectorized<c10::qint32>(ptr);
  }

  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions, while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return loadu(tmp_values);
  }

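  // Note: the fmadd form below computes scale * x + scale_zp_premul, which
  // equals scale * (x - zero_point) when the caller precomputes
  // scale_zp_premul = -zero_point * scale, as kernels using this API are
  // expected to do.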
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point,
      Vectorized<float> scale_zp_premul) const {
    __m512 float_vals = _mm512_cvtepi32_ps(vals);
    return {vec::fmadd(scale, Vectorized<float>(float_vals), scale_zp_premul)};
  }

  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point) const {
    __m512 float_vals = _mm512_cvtepi32_ps(vals);
    return {(Vectorized<float>(float_vals) - zero_point) * scale};
  }

  static Vectorized<c10::qint32> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale [[maybe_unused]]) {
    Vectorized<c10::qint32> retval;
    auto rhs_data = (__m512)rhs[0];
    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
        scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 16);
    return retval;
  }

  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
    return _mm512_max_epi32(vals, b.vals);
  }

  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
    return _mm512_min_epi32(vals, b.vals);
  }

  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
    return maximum(zero_point);
  }

  Vectorized<c10::qint32> relu6(
      Vectorized<c10::qint32> zero_point,
      Vectorized<c10::qint32> q_six) {
    return _mm512_min_epi32(
        _mm512_max_epi32(vals, zero_point.vals), q_six.vals);
  }

  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
    return {_mm512_sub_epi32(vals, b)};
  }

  static Vectorized<c10::qint32> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    __m512 multiplier_v = _mm512_set1_ps(multiplier);
    __m512i zero_point_v = _mm512_set1_epi32(zero_point);

    __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier_v);
    __m512i rounded = _mm512_cvtps_epi32(scaled);
    return _mm512_add_epi32(rounded, zero_point_v);
  }

 private:
  // Load from memory constructor
  Vectorized(const void* ptr) {
    vals = _mm512_loadu_si512((const __m512i*)ptr);
  }
};

template <>
Vectorized<c10::qint32> inline maximum(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
  return a.maximum(b);
}

template <>
Vectorized<c10::qint32> inline operator*(
    const Vectorized<c10::qint32>& a,
    const Vectorized<c10::qint32>& b) {
  return _mm512_mullo_epi32(a, b);
}

template <>
Vectorized<c10::qint32> inline operator+(
    const Vectorized<c10::qint32>& a,
    const Vectorized<c10::qint32>& b) {
  return _mm512_add_epi32(a, b);
}
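
// Note: the operators above act on the raw int32 representations; they do
// not rescale the result, so any requantization is the caller's
// responsibility.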

/*
 * Convert values from int32 back to int8/uint8
 */
template <typename T>
__m512i RequantizeAvx512(
    const std::array<Vectorized<c10::qint32>, 4>& inp,
    __m512 multiplier,
    __m512i zp) {
  static_assert(
      std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
      "Only int8_t/uint8_t are supported");
  constexpr auto min_val = std::numeric_limits<T>::min();
  constexpr auto max_val = std::numeric_limits<T>::max();
  __m512i permute_mask_v =
      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02,
                       0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);
  __m512 x_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier);
  __m512 y_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[1]), multiplier);
  __m512 z_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[2]), multiplier);
  __m512 w_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[3]), multiplier);

  __m512i x_rounded_v = _mm512_cvtps_epi32(x_scaled_v);
  __m512i y_rounded_v = _mm512_cvtps_epi32(y_scaled_v);
  __m512i z_rounded_v = _mm512_cvtps_epi32(z_scaled_v);
  __m512i w_rounded_v = _mm512_cvtps_epi32(w_scaled_v);

  /* Add zero point */
  __m512i x_v = _mm512_add_epi32(x_rounded_v, zp);
  __m512i y_v = _mm512_add_epi32(y_rounded_v, zp);
  __m512i z_v = _mm512_add_epi32(z_rounded_v, zp);
  __m512i w_v = _mm512_add_epi32(w_rounded_v, zp);

  /* Pack to int16_t and saturate */
  __m512i xy_packed_v = _mm512_packs_epi32(x_v, y_v);
  __m512i zw_packed_v = _mm512_packs_epi32(z_v, w_v);

  __m512i xyzw_clamped_v =
      pack_saturate_and_clamp<T>(xy_packed_v, zw_packed_v, min_val, max_val);

  /*
   * xyzw_clamped_v has results in the following layout so we need to
   * permute: x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 x8-11 y8-11 z8-11 w8-11 x12-15 y12-15 z12-15 w12-15
   */
  xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
  return xyzw_clamped_v;
}

template<>
struct Vectorized<c10::qint8> : public Vectorizedqi {
  static constexpr int size() {
    return 64;
  }

  static constexpr int float_num_vecs() {
    return 4;
  }

  static constexpr int int_num_vecs() {
    return 4;
  }

  using float_vec_return_type = std::array<Vectorized<float>, 4>;
  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
  using value_type = typename c10::qint8::underlying;

 public:
  using Vectorizedqi::Vectorizedqi;

  Vectorized() {}
  Vectorized(__m512i vals_) { vals = vals_;}

  // Broadcast constructor
  Vectorized(const c10::qint8& val) {
    value_type uw = val.val_;
    vals = _mm512_set1_epi8(uw);
  }

  // This is needed because the compiler emits awful code for the default
  // constructor for moving the enum
  Vectorized(const Vectorized<c10::qint8>& other) : Vectorizedqi(other.vals) { }

  // This is added to avoid error: definition of implicit copy assignment operator
  // for 'Vectorized<c10::qint8>' is deprecated because it has a user-declared
  // copy constructor [-Werror,-Wdeprecated-copy]
  Vectorized& operator=(const Vectorized<c10::qint8>&) = default;

  void store(void* ptr, int count = size()) const {
    if (count != size()) {
      memcpy(ptr, &vals, count * sizeof(value_type));
    } else {
      _mm512_storeu_si512((__m512i*)ptr, vals);
    }
  }

  static Vectorized<c10::qint8> loadu(const void* ptr) {
    return Vectorized<c10::qint8>(ptr);
  }

  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions, while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return loadu(tmp_values);
  }

 private:
  __m512i cvtepi8_epi32(__m128i epi8_vals) const {
    return _mm512_cvtepi8_epi32(epi8_vals);
  }

 public:
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point,
      Vectorized<float> scale_neg_zp_premul) const {
#if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif

    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val2));
    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val3));

    auto val0 =
        vec::fmadd(scale, Vectorized<float>(float_val0), scale_neg_zp_premul);
    auto val1 =
        vec::fmadd(scale, Vectorized<float>(float_val1), scale_neg_zp_premul);
    auto val2 =
        vec::fmadd(scale, Vectorized<float>(float_val2), scale_neg_zp_premul);
    auto val3 =
        vec::fmadd(scale, Vectorized<float>(float_val3), scale_neg_zp_premul);
    return {val0, val1, val2, val3};
  }

  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point) const {
#if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif

    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val2));
    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val3));

    auto val0 = (Vectorized<float>(float_val0) - zero_point) * scale;
    auto val1 = (Vectorized<float>(float_val1) - zero_point) * scale;
    auto val2 = (Vectorized<float>(float_val2) - zero_point) * scale;
    auto val3 = (Vectorized<float>(float_val3) - zero_point) * scale;
    return {val0, val1, val2, val3};
  }

  static Vectorized<c10::qint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale) {
    auto* rhs_data = (float*)rhs.data();
    int8_t quantized_values[64];
    QuantizeAvx512<value_type>(
        rhs_data, quantized_values, 64, inverse_scale, zero_point);
    return Vectorized<c10::qint8>::loadu(quantized_values);
  }

  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
    return _mm512_max_epi8(vals, b.vals);
  }

  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
    return _mm512_min_epi8(vals, b.vals);
  }

  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
    return maximum(zero_point);
  }

  Vectorized<c10::qint8> relu6(
      Vectorized<c10::qint8> zero_point,
      Vectorized<c10::qint8> q_six) {
    return _mm512_min_epi8(
        _mm512_max_epi8(vals, zero_point.vals), q_six.vals);
  }

  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
#if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif

    __m512i int32_val0 = cvtepi8_epi32(int_val0);
    __m512i int32_val1 = cvtepi8_epi32(int_val1);
    __m512i int32_val2 = cvtepi8_epi32(int_val2);
    __m512i int32_val3 = cvtepi8_epi32(int_val3);

#if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
    __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
    __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
    __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
#else
    __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
    __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
    __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
    __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
#endif

    __m512i int32_b0 = cvtepi8_epi32(int_b0);
    __m512i int32_b1 = cvtepi8_epi32(int_b1);
    __m512i int32_b2 = cvtepi8_epi32(int_b2);
    __m512i int32_b3 = cvtepi8_epi32(int_b3);

    __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0);
    __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1);
    __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2);
    __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3);

    return {Vectorized<c10::qint32>(res_0),
            Vectorized<c10::qint32>(res_1),
            Vectorized<c10::qint32>(res_2),
            Vectorized<c10::qint32>(res_3)};
  }

  static Vectorized<c10::qint8> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    __m512 multiplier_v = _mm512_set1_ps(multiplier);
    __m512i zero_point_v = _mm512_set1_epi32(zero_point);
    return RequantizeAvx512<value_type>(inp, multiplier_v, zero_point_v);
  }

 private:
  // Load from memory constructor
  Vectorized(const void* ptr) {
    vals = _mm512_loadu_si512((const __m512i*)ptr);
  }
};

template <>
Vectorized<c10::qint8> inline maximum(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
  return a.maximum(b);
}

template<>
struct Vectorized<c10::quint8> : public Vectorizedqi {
  static constexpr int size() {
    return 64;
  }

  static constexpr int float_num_vecs() {
    return 4;
  }

  static constexpr int int_num_vecs() {
    return 4;
  }

  using float_vec_return_type = std::array<Vectorized<float>, 4>;
  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
  using value_type = typename c10::quint8::underlying;

 public:
  using Vectorizedqi::Vectorizedqi;
  Vectorized() {}

  Vectorized(__m512i vals_) { vals = vals_;}

  // Broadcast constructor
  Vectorized(const c10::quint8& val) {
    value_type uw = val.val_;
    vals = _mm512_set1_epi8(uw);
  }

  Vectorized(const Vectorized<c10::quint8>& other) : Vectorizedqi(other.vals) { }

  // This is added to avoid error: definition of implicit copy assignment operator
  // for 'Vectorized<c10::quint8>' is deprecated because it has a user-declared
  // copy constructor [-Werror,-Wdeprecated-copy]
  Vectorized& operator=(const Vectorized<c10::quint8>&) = default;

  void store(void* ptr, int count = size()) const {
    if (count != size()) {
      memcpy(ptr, &vals, count * sizeof(value_type));
    } else {
      _mm512_storeu_si512((__m512i*)ptr, vals);
    }
  }

  static Vectorized<c10::quint8> loadu(const void* ptr) {
    return Vectorized<c10::quint8>(ptr);
  }

  static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions, while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return loadu(tmp_values);
  }

 private:
  __m512i cvtepu8_epi32(__m128i epu8_vals) const {
    return _mm512_cvtepu8_epi32(epu8_vals);
  }

 public:
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point,
      Vectorized<float> scale_zp_premul) const {
#if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif

    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val2));
    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val3));

    auto val0 =
        vec::fmadd(scale, Vectorized<float>(float_val0), scale_zp_premul);
    auto val1 =
        vec::fmadd(scale, Vectorized<float>(float_val1), scale_zp_premul);
    auto val2 =
        vec::fmadd(scale, Vectorized<float>(float_val2), scale_zp_premul);
    auto val3 =
        vec::fmadd(scale, Vectorized<float>(float_val3), scale_zp_premul);

    return {val0, val1, val2, val3};
  }

  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point) const {
#if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif

    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val2));
    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val3));

    auto val0 = (Vectorized<float>(float_val0) - zero_point) * scale;
    auto val1 = (Vectorized<float>(float_val1) - zero_point) * scale;
    auto val2 = (Vectorized<float>(float_val2) - zero_point) * scale;
    auto val3 = (Vectorized<float>(float_val3) - zero_point) * scale;

    return {val0, val1, val2, val3};
  }

  static Vectorized<c10::quint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale) {
    auto* rhs_data = (float*)rhs.data();
    uint8_t quantized_values[64];
    QuantizeAvx512<value_type>(
        rhs_data, quantized_values, 64, inverse_scale, zero_point);
    return Vectorized<c10::quint8>::loadu(quantized_values);
  }

  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
    return _mm512_max_epu8(vals, b.vals);
  }

  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
    return _mm512_min_epu8(vals, b.vals);
  }

  Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
    return maximum(zero_point);
  }

  Vectorized<c10::quint8> relu6(
      Vectorized<c10::quint8> zero_point,
      Vectorized<c10::quint8> q_six) {
    return _mm512_min_epu8(
        _mm512_max_epu8(vals, zero_point.vals), q_six.vals);
  }

  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
#if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif

    __m512i int32_val0 = cvtepu8_epi32(int_val0);
    __m512i int32_val1 = cvtepu8_epi32(int_val1);
    __m512i int32_val2 = cvtepu8_epi32(int_val2);
    __m512i int32_val3 = cvtepu8_epi32(int_val3);

#if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
    __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
    __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
    __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
#else
    __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
    __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
    __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
    __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
#endif

    __m512i int32_b0 = cvtepu8_epi32(int_b0);
    __m512i int32_b1 = cvtepu8_epi32(int_b1);
    __m512i int32_b2 = cvtepu8_epi32(int_b2);
    __m512i int32_b3 = cvtepu8_epi32(int_b3);

    __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0);
    __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1);
    __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2);
    __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3);
    return {Vectorized<c10::qint32>(res_0),
            Vectorized<c10::qint32>(res_1),
            Vectorized<c10::qint32>(res_2),
            Vectorized<c10::qint32>(res_3)};
  }

  static Vectorized<c10::quint8> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    __m512 multiplier_v = _mm512_set1_ps(multiplier);
    __m512i zero_point_v = _mm512_set1_epi32(zero_point);
    return RequantizeAvx512<value_type>(inp, multiplier_v, zero_point_v);
  }

 private:
  // Load from memory constructor
  Vectorized(const void* ptr) {
    vals = _mm512_loadu_si512((const __m512i*)ptr);
  }
};

template <>
Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
  return a.maximum(b);
}

#else

// NOTE: These are low-performance implementations that we fall back on.

template <
    typename T,
    typename float_vec_return_type_,
    typename int_vec_return_type_,
    int size_>
struct VectorizedQuantizedConverter {
  static constexpr int size() {
    return size_;
  }

  static constexpr int float_num_vecs() {
    return size() / 16;
  }

  static constexpr int int_num_vecs() {
    return size() / 16;
  }

  using float_vec_return_type = float_vec_return_type_;
  using int_vec_return_type = int_vec_return_type_;

  using value_type = typename T::underlying;
  std::array<value_type, size_> vals;

  VectorizedQuantizedConverter(T val) {
    for (const auto i : c10::irange(size())) {
      vals[i] = val.val_;
    }
  }

  VectorizedQuantizedConverter(const void* ptr) {
    memcpy(vals.data(), ptr, sizeof(value_type) * size());
  }

  void store(void* ptr, int count = size()) const {
    memcpy(ptr, vals.data(), count * sizeof(value_type));
  }

  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point,
      Vectorized<float> scale_zp_premul [[maybe_unused]]) const {
    float_vec_return_type rv;
    for (const auto i : c10::irange(float_num_vecs())) {
      float tmp_vals[16];
      for (const auto j : c10::irange(16)) {
        tmp_vals[j] = at::native::dequantize_val<T>(
            scale[j], zero_point[j], T(vals[16 * i + j]));
      }
      rv[i] = Vectorized<float>(tmp_vals[0],
          tmp_vals[1],
          tmp_vals[2],
          tmp_vals[3],
          tmp_vals[4],
          tmp_vals[5],
          tmp_vals[6],
          tmp_vals[7],
          tmp_vals[8],
          tmp_vals[9],
          tmp_vals[10],
          tmp_vals[11],
          tmp_vals[12],
          tmp_vals[13],
          tmp_vals[14],
          tmp_vals[15]);
    }
    return rv;
  }

  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point) const {
    Vectorized<float> scale_zp_premul;
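    // scale_zp_premul is ignored by the scalar dequantize above (it is
    // [[maybe_unused]]), so a default-constructed vector is safe to pass.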
    return dequantize(scale, zero_point, scale_zp_premul);
  }

 protected:
  VectorizedQuantizedConverter() {}
};

template <>
struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
                                     c10::qint32,
                                     std::array<Vectorized<float>, 1>,
                                     std::array<Vectorized<c10::qint32>, 1>,
                                     16> {
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            16>() {}
  Vectorized(c10::qint32 val)
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            16>(val) {}
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            16>(ptr) {}

  static Vectorized<c10::qint32> loadu(const void* ptr) {
    return Vectorized<c10::qint32>(ptr);
  }

  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions, while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return loadu(tmp_values);
  }

  static Vectorized<c10::qint32> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale [[maybe_unused]]) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * 16> float_vals;

    for (const auto i : c10::irange(float_num_vecs())) {
      rhs[i].store(&float_vals[i * 16], 16);
    }

    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::qint32*)qvals.data(),
        16 * float_num_vecs());

    return Vectorized<c10::qint32>::loadu(qvals.data());
  }

  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
    return maximum(zero_point);
  }

  Vectorized<c10::qint32> relu6(
      Vectorized<c10::qint32> zero_point,
      Vectorized<c10::qint32> q_six) {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
    int_vec_return_type retval;
    for (const auto i : c10::irange(size())) {
      retval[0].vals[i] = vals[i] - b.vals[i];
    }
    return retval;
  }

  static Vectorized<c10::qint32> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] =
          std::nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
          zero_point;
    }
    return retval;
  }
};

template <>
Vectorized<c10::qint32> inline maximum(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
  return a.maximum(b);
}

template <>
Vectorized<c10::qint32> inline operator*(
    const Vectorized<c10::qint32>& a,
    const Vectorized<c10::qint32>& b) {
  Vectorized<c10::qint32> retval;
  for (const auto i : c10::irange(std::decay_t<decltype(a)>::size())) {
    retval.vals[i] = a.vals[i] * b.vals[i];
  }
  return retval;
}

template <>
Vectorized<c10::qint32> inline operator+(
    const Vectorized<c10::qint32>& a,
    const Vectorized<c10::qint32>& b) {
  Vectorized<c10::qint32> retval;
  for (const auto i : c10::irange(std::decay_t<decltype(a)>::size())) {
    retval.vals[i] = a.vals[i] + b.vals[i];
  }
  return retval;
}

template <>
struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
                                    c10::qint8,
                                    std::array<Vectorized<float>, 4>,
                                    std::array<Vectorized<c10::qint32>, 4>,
                                    64> {
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>() {}
  Vectorized(c10::qint8 val)
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>(val) {}
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>(ptr) {}

  static Vectorized<c10::qint8> loadu(const void* ptr) {
    return Vectorized<c10::qint8>(ptr);
  }

  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions, while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return loadu(tmp_values);
  }

  static Vectorized<c10::qint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale [[maybe_unused]]) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * 16> float_vals;

    for (const auto i : c10::irange(float_num_vecs())) {
      rhs[i].store(&float_vals[i * 16], 16);
    }

    at::native::quantize_vec<c10::qint8>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::qint8*)qvals.data(),
        16 * float_num_vecs());

    return Vectorized<c10::qint8>::loadu(qvals.data());
  }

  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
    return maximum(zero_point);
  }

  Vectorized<c10::qint8> relu6(
      Vectorized<c10::qint8> zero_point,
      Vectorized<c10::qint8> q_six) {
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
    int_vec_return_type retval;
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        retval[i].vals[j] =
            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
      }
    }
    return retval;
  }

  static Vectorized<c10::qint8> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    constexpr auto min_val = std::numeric_limits<value_type>::min();
    constexpr auto max_val = std::numeric_limits<value_type>::max();
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        int32_t rounded =
            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
            zero_point;
        retval.vals[i * elem_per_int_vec + j] =
            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
      }
    }
    return retval;
  }
};

template <>
Vectorized<c10::qint8> inline maximum(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
  return a.maximum(b);
}

template <>
struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
                                     c10::quint8,
                                     std::array<Vectorized<float>, 4>,
                                     std::array<Vectorized<c10::qint32>, 4>,
                                     64> {
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::quint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>() {}
  Vectorized(c10::quint8 val)
      : VectorizedQuantizedConverter<
            c10::quint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>(val) {}
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::quint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>(ptr) {}

  static Vectorized<c10::quint8> loadu(const void* ptr) {
    return Vectorized<c10::quint8>(ptr);
  }

  static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions, while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return loadu(tmp_values);
  }

  static Vectorized<c10::quint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale [[maybe_unused]]) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * 16> float_vals;

    for (const auto i : c10::irange(float_num_vecs())) {
      rhs[i].store(&float_vals[i * 16], 16);
    }

    at::native::quantize_vec<c10::quint8>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::quint8*)qvals.data(),
        16 * float_num_vecs());

    return Vectorized<c10::quint8>::loadu(qvals.data());
  }

  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
    Vectorized<c10::quint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
    Vectorized<c10::quint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
    return maximum(zero_point);
  }

  Vectorized<c10::quint8> relu6(
      Vectorized<c10::quint8> zero_point,
      Vectorized<c10::quint8> q_six) {
    Vectorized<c10::quint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
    int_vec_return_type retval;
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        retval[i].vals[j] =
            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
      }
    }
    return retval;
  }

  static Vectorized<c10::quint8> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    constexpr auto min_val = std::numeric_limits<value_type>::min();
    constexpr auto max_val = std::numeric_limits<value_type>::max();
    Vectorized<c10::quint8> retval;
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        int32_t rounded =
            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
            zero_point;
        retval.vals[i * elem_per_int_vec + j] =
            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
      }
    }
    return retval;
  }
};

template <>
Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
  return a.maximum(b);
}

#endif // defined(CPU_CAPABILITY_AVX512)

} // namespace CPU_CAPABILITY
} // namespace vec
} // namespace at