#pragma once

// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/native/quantized/AffineQuantizerBase.h>

#include <c10/util/irange.h>
#include <c10/util/qint32.h>
#include <c10/util/qint8.h>
#include <c10/util/quint8.h>

#include <array>
#include <cmath>

// This file defines Vectorized<> for the quantized types.
//
//
// Currently, we simply use these classes as efficient converters between
// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
// where doing the arithmetic in full-precision is acceptable (e.g.
// elementwise operators).
//
//
// Conversions are as follows:
//  Vectorized<qint8> -> 4x Vectorized<float>
//  Vectorized<quint8> -> 4x Vectorized<float>
//  Vectorized<qint32> -> 1x Vectorized<float>
//
// The size of the returned float vector is specified by the special
// constexpr function float_num_vecs. The type of the value returned
// from dequantize (and expected as an argument to quantize) is
// specified by float_vec_return_type.
//
// When writing kernels with these vectors, it is expected that floating-
// point operations will be carried out in a loop over Vectorized<T>::float_num_vecs
// iterations.

namespace at {
namespace vec {
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_AVX512)
#ifdef _MSC_VER
// Common base for all quantized Vectorized<> specializations below: a single
// 64-byte-aligned 512-bit integer register. MSVC and GCC/Clang spell the
// alignment attribute differently, hence the two struct heads.
__declspec(align(64)) struct Vectorizedqi {
 protected:
  __m512i vals;
#else
struct Vectorizedqi {
 protected:
  __m512i vals __attribute__((aligned(64)));
#endif

 public:
  Vectorizedqi() {}
  Vectorizedqi(__m512i v) : vals(v) {}
  // Implicit conversion so instances can be passed directly to intrinsics.
  operator __m512i() const {
    return vals;
  }
};
64 
65 
// Packs two vectors of 16-bit lanes down to 8-bit lanes with saturation,
// then clamps every lane to [min_val, max_val]. Specialized below for
// int8_t and uint8_t; the int32_t specialization exists for linkage only.
template <typename T>
__m512i pack_saturate_and_clamp(
    __m512i first,
    __m512i second,
    T min_val,
    T max_val);
72 
template <>
inline __m512i pack_saturate_and_clamp<int32_t>(
    __m512i first [[maybe_unused]],
    __m512i second [[maybe_unused]],
    int32_t min_val [[maybe_unused]],
    int32_t max_val [[maybe_unused]]) {
  // This function is for linkage only, will not be used
  AT_ERROR("pack_saturate_and_clamp<int32_t> is not supported");
  return __m512i{};
}
83 
84 template <>
85 inline __m512i pack_saturate_and_clamp<int8_t>(
86     __m512i first,
87     __m512i second,
88     int8_t min_val,
89     int8_t max_val) {
90   __m512i packed_and_sat = _mm512_packs_epi16(first, second);
91   return _mm512_max_epi8(
92       _mm512_set1_epi8(min_val),
93       _mm512_min_epi8(packed_and_sat, _mm512_set1_epi8(max_val)));
94 }
95 
96 template <>
97 inline __m512i pack_saturate_and_clamp<uint8_t>(
98     __m512i first,
99     __m512i second,
100     uint8_t min_val,
101     uint8_t max_val) {
102   __m512i packed_and_sat = _mm512_packus_epi16(first, second);
103   return _mm512_max_epu8(
104       _mm512_set1_epi8(min_val),
105       _mm512_min_epu8(packed_and_sat, _mm512_set1_epi8(max_val)));
106 }
107 
108 template <typename T>
109 typename std::enable_if_t<std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>, at::vec::Vectorized<float>>
convert_int8_to_float(at::vec::Vectorized<T> src)110 inline convert_int8_to_float(at::vec::Vectorized<T> src) {
111   // Note: this function only convert inputs number of elements equal to at::vec::Vectorized<float>.size()
112   // Only handle first 16*8 bits
113   __m128i input_128 = _mm512_castsi512_si128(src);
114   // Convert from 16*uint8/int8 to 16*int32
115   __m512i input_512_extended;
116   if constexpr (std::is_same_v<T, uint8_t>)
117     input_512_extended = _mm512_cvtepu8_epi32(input_128);
118   else
119     input_512_extended = _mm512_cvtepi8_epi32(input_128);
120   // Convert from 16*int32 to 16*float32
121   return _mm512_cvtepi32_ps(input_512_extended);
122 }
123 
124 template <typename T>
125 typename std::enable_if_t<std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>, at::vec::Vectorized<T>>
convert_float_to_int8(at::vec::Vectorized<float> src)126 inline convert_float_to_int8(at::vec::Vectorized<float> src) {
127   // Convert from float32 to int32 with truncation
128   __m512i x_values_int32 = _mm512_cvttps_epi32(src);
129 
130   // Convert from int32 to int16 using signed saturation
131   __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);
132 
133   constexpr auto min_val = std::numeric_limits<T>::min();
134   constexpr auto max_val = std::numeric_limits<T>::max();
135 
136   // Convert from int16 to uint8/int8 using unsigned saturation
137   __m512i xyzw_clamped_v = pack_saturate_and_clamp<T>(
138       xy_packed_v, xy_packed_v, min_val, max_val);
139   __m512i permute_mask_v =
140       _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02,
141                       0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);
142   return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
143 }
144 
// Quantizes `len` float32 values from `src` into `dst` (T = int8_t/uint8_t):
//   dst[i] = clamp(round(src[i] * inverse_scale) + zero_point,
//                  numeric_limits<T>::min(), numeric_limits<T>::max())
// Processes 64 elements per iteration in the main loop, 16 per iteration in
// the second loop, then a scalar tail. The vector paths round half-to-even
// (cvtps_epi32 under the default rounding mode); see the note on the scalar
// tail below about the slight behavioral difference.
// NOTE(review): the vector loops use aligned loads (_mm512_load_ps), so `src`
// is presumably 64-byte aligned — confirm at call sites.
template <typename T>
__FORCE_INLINE void QuantizeAvx512(
    const float* src,
    T* dst,
    int len,
    float inverse_scale,
    int64_t zero_point) {
  constexpr int VLEN = 16;
  constexpr auto min_val = std::numeric_limits<T>::min();
  constexpr auto max_val = std::numeric_limits<T>::max();
  const __m512i min_v = _mm512_set1_epi32(min_val);
  const __m512i max_v = _mm512_set1_epi32(max_val);
  // This is the largest int32 value < int32_max exactly representable in float
  constexpr int32_t int32_float_max_val =
      std::numeric_limits<int32_t>::max() - 127;
  int i = 0;
  __m512 inverse_scale_v = _mm512_set1_ps(inverse_scale);
  // Byte shuffle that moves byte 0 of each 32-bit lane to the low 4 bytes of
  // each 128-bit sub-lane (0xff lanes are zeroed). Used by the 16-wide loop.
  // clang-format off
  static const __m512i shuffle_mask_v = _mm512_set_epi8(
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0x0c, 0x08, 0x04, 0x00,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0x0c, 0x08, 0x04, 0x00,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0x0c, 0x08, 0x04, 0x00,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff,
      0x0c, 0x08, 0x04, 0x00);
  // clang-format on
  // Undoes the 128-bit sub-lane interleaving introduced by the pack
  // instructions in the 64-wide loop.
  __m512i permute_mask_v =
      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02,
                       0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);
  // Gathers the four valid dwords (one per 128-bit sub-lane after the byte
  // shuffle) into the low 128 bits. Used by the 16-wide loop.
  __m512i permute_mask_l8_v =
      _mm512_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                       0x00, 0x00, 0x00, 0x00, 0x0c, 0x08, 0x04, 0x00);
  int len_aligned = len / (VLEN * 4) * (VLEN * 4);
  for (; i < len_aligned; i += 4 * VLEN) {
    // x
    __m512 x_vals = _mm512_load_ps(src + i);
    __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v);
    // If the floating point value is greater than int32_max,
    // _mm512_cvtps_epi32 converts them to -ve. Clip at int32_float_max_val
    // to avoid this.
    x_transformed_v =
        _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val));
    // y
    __m512 y_vals = _mm512_load_ps(src + i + VLEN);
    __m512 y_transformed_v = _mm512_mul_ps(y_vals, inverse_scale_v);
    y_transformed_v =
        _mm512_min_ps(y_transformed_v, _mm512_set1_ps(int32_float_max_val));
    // z
    __m512 z_vals = _mm512_load_ps(src + i + 2 * VLEN);
    __m512 z_transformed_v = _mm512_mul_ps(z_vals, inverse_scale_v);
    z_transformed_v =
        _mm512_min_ps(z_transformed_v, _mm512_set1_ps(int32_float_max_val));
    // w
    __m512 w_vals = _mm512_load_ps(src + i + 3 * VLEN);
    __m512 w_transformed_v = _mm512_mul_ps(w_vals, inverse_scale_v);
    w_transformed_v =
        _mm512_min_ps(w_transformed_v, _mm512_set1_ps(int32_float_max_val));

    __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v);
    __m512i y_rounded_v = _mm512_cvtps_epi32(y_transformed_v);
    __m512i z_rounded_v = _mm512_cvtps_epi32(z_transformed_v);
    __m512i w_rounded_v = _mm512_cvtps_epi32(w_transformed_v);

    // add zero point
    x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point));
    y_rounded_v = _mm512_add_epi32(y_rounded_v, _mm512_set1_epi32(zero_point));
    z_rounded_v = _mm512_add_epi32(z_rounded_v, _mm512_set1_epi32(zero_point));
    w_rounded_v = _mm512_add_epi32(w_rounded_v, _mm512_set1_epi32(zero_point));

    // Saturating narrow int32 -> int16 -> T, then restore element order.
    __m512i xy_packed_v = _mm512_packs_epi32(x_rounded_v, y_rounded_v);
    __m512i zw_packed_v = _mm512_packs_epi32(z_rounded_v, w_rounded_v);
    __m512i xyzw_clamped_v =
        pack_saturate_and_clamp<T>(xy_packed_v, zw_packed_v, min_val, max_val);

    xyzw_clamped_v =
        _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
    _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + i), xyzw_clamped_v);
  }

  // Additional 8-lane AVX512 version to take advantage when len is smaller
  // based on fbgemm::QuantizeAvx2 (https://github.com/pytorch/FBGEMM)
  for (; i < len / VLEN * VLEN; i += VLEN) {
    __m512 x_vals = _mm512_load_ps(src + i);
    __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v);
    x_transformed_v =
        _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val));
    __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v);
    x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point));
    // Clamp in the int32 domain (no pack here), then compact the low byte of
    // each lane into the low 128 bits for a 16-byte store.
    __m512i x_clipped_v =
        _mm512_max_epi32(min_v, _mm512_min_epi32(max_v, x_rounded_v));

    x_clipped_v = _mm512_shuffle_epi8(x_clipped_v, shuffle_mask_v);
    x_clipped_v = _mm512_permutexvar_epi32(permute_mask_l8_v, x_clipped_v);
    _mm_storeu_si128(
        reinterpret_cast<__m128i*>(dst + i),
        _mm512_castsi512_si128(x_clipped_v));
  }

  // Scalar tail for the remaining (< 16) elements.
  for (; i < len; ++i) {
    float transformed = src[i] * inverse_scale;

    // Not exactly the same behavior as the vectorized code.
    // The vectorized code above always rounds to even in halfway cases
    // (https://software.intel.com/en-us/node/523819), but std::nearbyint
    // does the same only when the current rounding mode is FE_TONEAREST.
    // However, in practice, this should not be a problem because most cases
    // use the default rounding mode FE_TONEAREST.
    // Note that we cannot implement the same behavior as the vectorized code
    // using std::round because it does rounding away from zero in halfway
    // cases.
    transformed = zero_point + std::nearbyint(transformed);
    float clipped =
        std::min(std::max(transformed, float(min_val)), float(max_val));
    dst[i] = clipped;
  }
}
271 
// 16 x 32-bit quantized integers in one 512-bit register. Dequantizes to a
// single Vectorized<float> (float_num_vecs() == 1).
template<>
struct Vectorized<c10::qint32> : public Vectorizedqi {
    using size_type = int;
    static constexpr size_type size() {
        return 16;
    }

    static constexpr int float_num_vecs() {
        return 1;
    }

    static constexpr int int_num_vecs() {
        return 1;
    }

    using float_vec_return_type = std::array<Vectorized<float>, 1>;
    using int_vec_return_type = std::array<Vectorized<c10::qint32>, 1>;
    using value_type = c10::qint32::underlying;

 public:
    using Vectorizedqi::Vectorizedqi;
    Vectorized() {}

    Vectorized(__m512i vals_) { vals = vals_;}

    // Broadcast constructor
    Vectorized(const c10::qint32& val) {
        value_type uw = val.val_;
        vals = _mm512_set1_epi32(uw);
    }

    // Stores `count` elements to `ptr`; the partial-count path copies byte
    // by byte via memcpy instead of a masked store.
    void store(void* ptr, int count = size()) const {
      if (count != size()) {
        memcpy(ptr, &vals, count * sizeof(value_type));
      } else {
        _mm512_storeu_si512((__m512i*)ptr, vals);
      }
    }

    static Vectorized<c10::qint32> loadu(const void* ptr) {
        return Vectorized<c10::qint32>(ptr);
    }

    // Partial load: zero-fills the lanes beyond `count`.
    static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
        __at_align__ value_type tmp_values[size()];
        // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
        // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
        // instructions while a loop would be compiled to one instruction.
        for (const auto i : c10::irange(size())) {
          tmp_values[i] = 0;
        }
        std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
        return loadu(tmp_values);
    }

    // Dequantize using a precomputed term: result = scale * float(vals) +
    // scale_zp_premul (presumably scale * -zero_point; confirm at callers).
    float_vec_return_type dequantize(
        Vectorized<float> scale,
        Vectorized<float> zero_point,
        Vectorized<float> scale_zp_premul) const {
      __m512 float_vals = _mm512_cvtepi32_ps(vals);
      return {vec::fmadd(scale, Vectorized<float>(float_vals), scale_zp_premul)};
    }

    // Dequantize: result = (float(vals) - zero_point) * scale.
    float_vec_return_type dequantize(
        Vectorized<float> scale,
        Vectorized<float> zero_point) const {
      __m512 float_vals = _mm512_cvtepi32_ps(vals);
      return {(Vectorized<float>(float_vals) - zero_point) * scale};
    }

    // Quantize via the scalar reference implementation (not vectorized).
    static Vectorized<c10::qint32> quantize(
        const float_vec_return_type& rhs,
        float scale,
        int32_t zero_point,
        float inverse_scale [[maybe_unused]]) {
      Vectorized<c10::qint32> retval;
      auto rhs_data = (__m512)rhs[0];
      at::native::quantize_vec<c10::qint32, /*precision=*/32>(
          scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 16);
      return retval;
    }

    Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
      return _mm512_max_epi32(vals, b.vals);
    }

    Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
      return _mm512_min_epi32(vals, b.vals);
    }

    Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
        return maximum(zero_point);
    }

    // clamp(vals, zero_point, q_six) — ReLU6 in the quantized domain.
    Vectorized<c10::qint32> relu6(
        Vectorized<c10::qint32> zero_point,
        Vectorized<c10::qint32> q_six) {
      return _mm512_min_epi32(
          _mm512_max_epi32(vals, zero_point.vals), q_six.vals);
    }

    // Element-wise subtraction; lanes are already int32 so no widening is
    // actually needed.
    int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
      return {_mm512_sub_epi32(vals, b)};
    }

    // round(float(inp) * multiplier) + zero_point, per lane.
    static Vectorized<c10::qint32> requantize_from_int(
        const int_vec_return_type& inp,
        float multiplier,
        int32_t zero_point) {
      __m512 multiplier_v = _mm512_set1_ps(multiplier);
      __m512i zero_point_v = _mm512_set1_epi32(zero_point);

      __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier_v);
      __m512i rounded = _mm512_cvtps_epi32(scaled);
      return _mm512_add_epi32(rounded, zero_point_v);
    }

 private:
    // Load from memory constructor
    Vectorized(const void* ptr) {
      vals = _mm512_loadu_si512((const __m512i*)ptr);
    }
};
395 
396 template <>
397 Vectorized<c10::qint32> inline maximum(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
398   return a.maximum(b);
399 }
400 
401 template <>
402 Vectorized<c10::qint32> inline operator*(
403     const Vectorized<c10::qint32>& a,
404     const Vectorized<c10::qint32>& b) {
405   return _mm512_mullo_epi32(a, b);
406 }
407 
408 template <>
409 Vectorized<c10::qint32> inline operator+(
410     const Vectorized<c10::qint32>& a,
411     const Vectorized<c10::qint32>& b) {
412   return _mm512_add_epi32(a, b);
413 }
414 
415 /*
416  * Convert values from int32 back to int8/uint8
417  */
418 template <typename T>
419 __m512i RequantizeAvx512(
420     const std::array<Vectorized<c10::qint32>, 4>& inp,
421     __m512 multiplier,
422     __m512i zp) {
423   static_assert(
424       std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>,
425       "Only int8_t/uint8_t are supported");
426   constexpr auto min_val = std::numeric_limits<T>::min();
427   constexpr auto max_val = std::numeric_limits<T>::max();
428   __m512i permute_mask_v =
429       _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02,
430                        0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);
431   __m512 x_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier);
432   __m512 y_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[1]), multiplier);
433   __m512 z_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[2]), multiplier);
434   __m512 w_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[3]), multiplier);
435 
436   __m512i x_rounded_v = _mm512_cvtps_epi32(x_scaled_v);
437   __m512i y_rounded_v = _mm512_cvtps_epi32(y_scaled_v);
438   __m512i z_rounded_v = _mm512_cvtps_epi32(z_scaled_v);
439   __m512i w_rounded_v = _mm512_cvtps_epi32(w_scaled_v);
440 
441   /* Add zero point */
442   __m512i x_v = _mm512_add_epi32(x_rounded_v, zp);
443   __m512i y_v = _mm512_add_epi32(y_rounded_v, zp);
444   __m512i z_v = _mm512_add_epi32(z_rounded_v, zp);
445   __m512i w_v = _mm512_add_epi32(w_rounded_v, zp);
446 
447   /* Pack to int16_t and saturate */
448   __m512i xy_packed_v = _mm512_packs_epi32(x_v, y_v);
449   __m512i zw_packed_v = _mm512_packs_epi32(z_v, w_v);
450 
451   __m512i xyzw_clamped_v =
452       pack_saturate_and_clamp<T>(xy_packed_v, zw_packed_v, min_val, max_val);
453 
454   /*
455    * xyzw_clamped_v has results in the following layout so we need to
456    * permute: x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 x8-11 y8-11 z8-11 w8-11 x12-15 y12-15 z12-15 w12-15
457    */
458   xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
459   return xyzw_clamped_v;
460 }
461 
// 64 x signed 8-bit quantized integers in one 512-bit register. Dequantizes
// to 4 x Vectorized<float> (float_num_vecs() == 4), 16 elements per vector.
template<>
struct Vectorized<c10::qint8> : public Vectorizedqi {
    static constexpr int size() {
        return 64;
    }

    static constexpr int float_num_vecs() {
        return 4;
    }

    static constexpr int int_num_vecs() {
        return 4;
    }

    using float_vec_return_type = std::array<Vectorized<float>, 4>;
    using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
    using value_type = typename c10::qint8::underlying;

 public:
    using Vectorizedqi::Vectorizedqi;

    Vectorized() {}
    Vectorized(__m512i vals_) { vals = vals_;}

    // Broadcast constructor
    Vectorized(const c10::qint8& val) {
        value_type uw = val.val_;
        vals = _mm512_set1_epi8(uw);
    }

    // This is needed because the compiler emits awful code for the default
    // constructor for moving the enum
    Vectorized(const Vectorized<c10::qint8>& other) : Vectorizedqi(other.vals) { }

    // This is added to avoid error: definition of implicit copy assignment operator
    // for 'Vectorized<c10::qint8>' is deprecated because it has a user-declared
    // copy constructor [-Werror,-Wdeprecated-copy]
    Vectorized& operator=(const Vectorized<c10::qint8>&) = default;

    // Stores `count` elements to `ptr`; the partial-count path copies via
    // memcpy instead of a masked store.
    void store(void* ptr, int count = size()) const {
        if (count != size()) {
            memcpy(ptr, &vals, count * sizeof(value_type));
        } else {
            _mm512_storeu_si512((__m512i*)ptr, vals);
        }
    }

    static Vectorized<c10::qint8> loadu(const void* ptr) {
        return Vectorized<c10::qint8>(ptr);
    }

    // Partial load: zero-fills the lanes beyond `count`.
    static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
        __at_align__ value_type tmp_values[size()];
        // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
        // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
        // instructions while a loop would be compiled to one instruction.
        for (const auto i : c10::irange(size())) {
          tmp_values[i] = 0;
        }
        std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
        return loadu(tmp_values);
    }

 private:
    // Sign-extend 16 int8 values to 16 int32 lanes.
    __m512i cvtepi8_epi32(__m128i epi8_vals) const {
        return _mm512_cvtepi8_epi32(epi8_vals);
    }

 public:
  // Dequantize using a precomputed term: per group of 16,
  // result = scale * float(group) + scale_neg_zp_premul.
  // MSVC (non-clang) lacks GCC's vector subscript extension, hence the
  // m512i_u64 member access in the first branch; both branches extract the
  // four 128-bit quarters of `vals`.
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point,
      Vectorized<float> scale_neg_zp_premul) const {
    #if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
    #else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
    #endif

    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val2));
    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val3));

    auto val0 =
        vec::fmadd(scale, Vectorized<float>(float_val0), scale_neg_zp_premul);
    auto val1 =
        vec::fmadd(scale, Vectorized<float>(float_val1), scale_neg_zp_premul);
    auto val2 =
        vec::fmadd(scale, Vectorized<float>(float_val2), scale_neg_zp_premul);
    auto val3 =
        vec::fmadd(scale, Vectorized<float>(float_val3), scale_neg_zp_premul);
    return {val0, val1, val2, val3};
  }

  // Dequantize: per group of 16, result = (float(group) - zero_point) * scale.
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point) const {
    #if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
    #else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
    #endif

    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val2));
    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val3));

    auto val0 = (Vectorized<float>(float_val0) - zero_point) * scale;
    auto val1 = (Vectorized<float>(float_val1) - zero_point) * scale;
    auto val2 = (Vectorized<float>(float_val2) - zero_point) * scale;
    auto val3 = (Vectorized<float>(float_val3) - zero_point) * scale;
    return {val0, val1, val2, val3};
  }

  // Quantize 64 floats (4 x 16) via the AVX-512 helper above.
  static Vectorized<c10::qint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale) {
    auto* rhs_data = (float*)rhs.data();
    int8_t quantized_values[64];
    QuantizeAvx512<value_type>(
        rhs_data, quantized_values, 64, inverse_scale, zero_point);
    return Vectorized<c10::qint8>::loadu(quantized_values);
  }

  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
      return _mm512_max_epi8(vals, b.vals);
    }

  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
      return _mm512_min_epi8(vals, b.vals);
    }

    Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
        return maximum(zero_point);
    }

    // clamp(vals, zero_point, q_six) — ReLU6 in the quantized domain.
    Vectorized<c10::qint8> relu6(
        Vectorized<c10::qint8> zero_point,
        Vectorized<c10::qint8> q_six) {
      return _mm512_min_epi8(
          _mm512_max_epi8(vals, zero_point.vals), q_six.vals);
    }

    // Widen both operands to int32 and subtract, producing 4 int32 vectors.
    int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
      #if defined(_MSC_VER) && !defined(__clang__)
      __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
      __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
      __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
      __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
      #else
      __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
      __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
      __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
      __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
      #endif

      __m512i int32_val0 = cvtepi8_epi32(int_val0);
      __m512i int32_val1 = cvtepi8_epi32(int_val1);
      __m512i int32_val2 = cvtepi8_epi32(int_val2);
      __m512i int32_val3 = cvtepi8_epi32(int_val3);

      #if defined(_MSC_VER) && !defined(__clang__)
      __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
      __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
      __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
      __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
      #else
      __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
      __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
      __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
      __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
      #endif

      __m512i int32_b0 = cvtepi8_epi32(int_b0);
      __m512i int32_b1 = cvtepi8_epi32(int_b1);
      __m512i int32_b2 = cvtepi8_epi32(int_b2);
      __m512i int32_b3 = cvtepi8_epi32(int_b3);

      __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0);
      __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1);
      __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2);
      __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3);

      return {Vectorized<c10::qint32>(res_0),
              Vectorized<c10::qint32>(res_1),
              Vectorized<c10::qint32>(res_2),
              Vectorized<c10::qint32>(res_3)};
    }

    // round(float(inp) * multiplier) + zero_point, saturated back to int8.
    static Vectorized<c10::qint8> requantize_from_int(
        const int_vec_return_type& inp,
        float multiplier,
        int32_t zero_point) {
      __m512 multiplier_v = _mm512_set1_ps(multiplier);
      __m512i zero_point_v = _mm512_set1_epi32(zero_point);
      return RequantizeAvx512<value_type>(inp, multiplier_v, zero_point_v);
    }

 private:
    // Load from memory constructor
    Vectorized(const void* ptr) {
        vals = _mm512_loadu_si512((const __m512i*)ptr);
    }
};
682 
683 template <>
684 Vectorized<c10::qint8> inline maximum(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
685   return a.maximum(b);
686 }
687 
688 template<>
689 struct Vectorized<c10::quint8> : public Vectorizedqi {
690     static constexpr int size() {
691         return 64;
692     }
693 
694     static constexpr int float_num_vecs() {
695         return 4;
696     }
697 
698     static constexpr int int_num_vecs() {
699         return 4;
700     }
701 
702     using float_vec_return_type = std::array<Vectorized<float>, 4>;
703     using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
704     using value_type = typename c10::quint8::underlying;
705 
706  public:
707     using Vectorizedqi::Vectorizedqi;
708     Vectorized() {}
709 
710     Vectorized(__m512i vals_) { vals = vals_;}
711 
712     // Broadcast constructor
713     Vectorized(const c10::quint8& val) {
714         value_type uw = val.val_;
715         vals = _mm512_set1_epi8(uw);
716     }
717 
718     Vectorized(const Vectorized<c10::quint8>& other) : Vectorizedqi(other.vals) { }
719 
720     // This is added to avoid error: definition of implicit copy assignment operator
721     // for 'Vectorized<c10::quint8>' is deprecated because it has a user-declared
722     // copy constructor [-Werror,-Wdeprecated-copy]
723     Vectorized& operator=(const Vectorized<c10::quint8>&) = default;
724 
725     void store(void* ptr, int count = size()) const {
726         if (count != size()) {
727             memcpy(ptr, &vals, count * sizeof(value_type));
728         } else {
729             _mm512_storeu_si512((__m512i*)ptr, vals);
730         }
731     }
732 
733     static Vectorized<c10::quint8> loadu(const void* ptr) {
734         return Vectorized<c10::quint8>(ptr);
735     }
736 
737     static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
738         __at_align__ value_type tmp_values[size()];
739         // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
740         // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
741         // instructions while a loop would be compiled to one instruction.
742         for (const auto i : c10::irange(size())) {
743           tmp_values[i] = 0;
744         }
745         std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
746         return loadu(tmp_values);
747     }
748 
749  private:
750     __m512i cvtepu8_epi32(__m128i epu8_vals) const {
751         return _mm512_cvtepu8_epi32(epu8_vals);
752     }
753 
754  public:
  // Dequantize all 64 quint8 lanes into 4 Vectorized<float> via an FMA:
  //   result = scale * float(q) + scale_zp_premul
  // For this to equal (q - zero_point) * scale, scale_zp_premul must be the
  // caller-precomputed -scale * zero_point (cf. the two-argument overload
  // below); the zero_point parameter itself is not read on this fast path.
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point,
      Vectorized<float> scale_zp_premul) const {
    // Split the 512-bit register into four 128-bit groups of 16 uint8.
    // MSVC (without clang) cannot subscript __m512i, so its named union
    // members are used there instead.
    #if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
    #else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
    #endif

    // Zero-extend each group to int32, then convert to float.
    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val2));
    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val3));

    auto val0 =
        vec::fmadd(scale, Vectorized<float>(float_val0), scale_zp_premul);
    auto val1 =
        vec::fmadd(scale, Vectorized<float>(float_val1), scale_zp_premul);
    auto val2 =
        vec::fmadd(scale, Vectorized<float>(float_val2), scale_zp_premul);
    auto val3 =
        vec::fmadd(scale, Vectorized<float>(float_val3), scale_zp_premul);

    return {val0, val1, val2, val3};
  }
787 
  // Dequantize all 64 quint8 lanes into 4 Vectorized<float>:
  //   result = (float(q) - zero_point) * scale
  // Slightly slower than the premultiplied overload above (subtract then
  // multiply instead of one fused multiply-add).
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point) const {
    // Split the 512-bit register into four 128-bit groups of 16 uint8.
    // MSVC (without clang) cannot subscript __m512i, so its named union
    // members are used there instead.
    #if defined(_MSC_VER) && !defined(__clang__)
    __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
    __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
    __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
    __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
    #else
    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
    #endif

    // Zero-extend each group to int32, then convert to float.
    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val2));
    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val3));

    auto val0 = (Vectorized<float>(float_val0) - zero_point) * scale;
    auto val1 = (Vectorized<float>(float_val1) - zero_point) * scale;
    auto val2 = (Vectorized<float>(float_val2) - zero_point) * scale;
    auto val3 = (Vectorized<float>(float_val3) - zero_point) * scale;

    return {val0, val1, val2, val3};
  }
815 
816   static Vectorized<c10::quint8> quantize(
817       const float_vec_return_type& rhs,
818       float scale,
819       int32_t zero_point,
820       float inverse_scale) {
821     auto* rhs_data = (float*)rhs.data();
822     uint8_t quantized_values[64];
823     QuantizeAvx512<value_type>(
824         rhs_data, quantized_values, 64, inverse_scale, zero_point);
825     return Vectorized<c10::quint8>::loadu(quantized_values);
826   }
827 
  // Lane-wise maximum over 64 unsigned 8-bit lanes.
  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
      return _mm512_max_epu8(vals, b.vals);
    }
831 
  // Lane-wise minimum over 64 unsigned 8-bit lanes.
  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
      return _mm512_min_epu8(vals, b.vals);
    }
835 
    // ReLU in the quantized domain: clamp from below at the zero point.
    Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
        return maximum(zero_point);
    }
839 
    // ReLU6 in the quantized domain: clamp each lane between the quantized
    // zero point and the quantized representation of six.
    Vectorized<c10::quint8> relu6(
        Vectorized<c10::quint8> zero_point,
        Vectorized<c10::quint8> q_six) {
      return _mm512_min_epu8(
          _mm512_max_epu8(vals, zero_point.vals), q_six.vals);
    }
846 
    // Subtract b lane-wise, widening the uint8 operands to int32 first so the
    // (possibly negative) difference is exactly representable. Returns four
    // Vectorized<c10::qint32>, one per group of 16 lanes.
    int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
      // Split this vector into four 128-bit groups of 16 uint8. MSVC
      // (without clang) cannot subscript __m512i, so its named union
      // members are used there instead.
      #if defined(_MSC_VER) && !defined(__clang__)
      __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
      __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
      __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
      __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
      #else
      __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
      __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
      __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
      __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
      #endif

      // Zero-extend our lanes to int32.
      __m512i int32_val0 = cvtepu8_epi32(int_val0);
      __m512i int32_val1 = cvtepu8_epi32(int_val1);
      __m512i int32_val2 = cvtepu8_epi32(int_val2);
      __m512i int32_val3 = cvtepu8_epi32(int_val3);

      // Split b the same way.
      #if defined(_MSC_VER) && !defined(__clang__)
      __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
      __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
      __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
      __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
      #else
      __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
      __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
      __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
      __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
      #endif

      // Zero-extend b's lanes to int32.
      __m512i int32_b0 = cvtepu8_epi32(int_b0);
      __m512i int32_b1 = cvtepu8_epi32(int_b1);
      __m512i int32_b2 = cvtepu8_epi32(int_b2);
      __m512i int32_b3 = cvtepu8_epi32(int_b3);

      // 32-bit subtraction cannot overflow for operands in [0, 255].
      __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0);
      __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1);
      __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2);
      __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3);
      return {Vectorized<c10::qint32>(res_0),
              Vectorized<c10::qint32>(res_1),
              Vectorized<c10::qint32>(res_2),
              Vectorized<c10::qint32>(res_3)};
    }
891 
    // Requantize int32 intermediates (e.g. from widening_subtract) back to
    // quint8 by broadcasting the multiplier and zero point and delegating to
    // the AVX512 requantization kernel.
    static Vectorized<c10::quint8> requantize_from_int(
        const int_vec_return_type& inp,
        float multiplier,
        int32_t zero_point) {
      __m512 multiplier_v = _mm512_set1_ps(multiplier);
      __m512i zero_point_v = _mm512_set1_epi32(zero_point);
      return RequantizeAvx512<value_type>(inp, multiplier_v, zero_point_v);
    }
900 
901  private:
902 
    // Load-from-memory constructor: reads 64 bytes via an unaligned load.
    // Kept private; external callers use the public loadu() factories.
    Vectorized(const void* ptr) {
        vals = _mm512_loadu_si512((const __m512i*)ptr);
    }
907 };
908 
// Free-function lane-wise maximum for quint8, dispatching to the member op.
template <>
Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
  return a.maximum(b);
}
913 
914 #else
915 
916 // NOTE: These are low-performance implementations that we fall back on.
917 
918 template <
919     typename T,
920     typename float_vec_return_type_,
921     typename int_vec_return_type_,
922     int size_>
923 struct VectorizedQuantizedConverter {
924   static constexpr int size() {
925     return size_;
926   }
927 
928   static constexpr int float_num_vecs() {
929     return size() / 8;
930   }
931 
932   static constexpr int int_num_vecs() {
933     return size() / 8;
934   }
935 
936   using float_vec_return_type = float_vec_return_type_;
937   using int_vec_return_type = int_vec_return_type_;
938 
939   using value_type = typename T::underlying;
940   std::array<value_type, size_> vals;
941 
942   VectorizedQuantizedConverter(T val) {
943     for (const auto i : c10::irange(size())) {
944       vals[i] = val.val_;
945     }
946   }
947 
948   VectorizedQuantizedConverter(const void* ptr) {
949     memcpy(vals.data(), ptr, sizeof(value_type) * size());
950   }
951 
952   void store(void* ptr, int count = size()) const {
953     memcpy(ptr, vals.data(), count * sizeof(value_type));
954   }
955 
956   float_vec_return_type dequantize(
957       Vectorized<float> scale,
958       Vectorized<float> zero_point,
959       Vectorized<float> scale_zp_premul [[maybe_unused]]) const {
960     float_vec_return_type rv;
961     for (const auto i : c10::irange(float_num_vecs())) {
962       float tmp_vals[16];
963       for (const auto j : c10::irange(16)) {
964         tmp_vals[j] = at::native::dequantize_val<T>(
965             scale[j], zero_point[j], T(vals[16 * i + j]));
966       }
967       rv[i] = Vectorized<float>(tmp_vals[0],
968           tmp_vals[1],
969           tmp_vals[2],
970           tmp_vals[3],
971           tmp_vals[4],
972           tmp_vals[5],
973           tmp_vals[6],
974           tmp_vals[7],
975           tmp_vals[8],
976           tmp_vals[9],
977           tmp_vals[10],
978           tmp_vals[11],
979           tmp_vals[12],
980           tmp_vals[13],
981           tmp_vals[14],
982           tmp_vals[15]);
983     }
984     return rv;
985   }
986 
987   float_vec_return_type dequantize(
988       Vectorized<float> scale,
989       Vectorized<float> zero_point) const {
990     Vectorized<float> scale_zp_premul;
991     return dequantize(scale, zero_point, scale_zp_premul);
992   }
993 
994  protected:
995   VectorizedQuantizedConverter() {}
996 };
997 
// Scalar (non-AVX512) fallback implementation of Vectorized<c10::qint32>:
// 16 qint32 values backed by a plain std::array.
template <>
struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
                                 c10::qint32,
                                 std::array<Vectorized<float>, 1>,
                                 std::array<Vectorized<c10::qint32>, 1>,
                                 16> {
  // Default constructor: lanes left uninitialized (mirrors SIMD behavior).
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            16>() {}
  // Broadcast constructor: replicates a single qint32 into all lanes.
  Vectorized(c10::qint32 val)
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            16>(val) {}
  // Load constructor: copies size() elements from memory.
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::qint32,
            std::array<Vectorized<float>, 1>,
            std::array<Vectorized<c10::qint32>, 1>,
            16>(ptr) {}

  // Load a full vector from (possibly unaligned) memory.
  static Vectorized<c10::qint32> loadu(const void* ptr) {
    return Vectorized<c10::qint32>(ptr);
  }

  // Load `count` elements; remaining lanes are zero-filled.
  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
    // instructions while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return loadu(tmp_values);
  }

  // Quantize float vectors back to qint32 via the scalar reference kernel.
  // inverse_scale is unused: quantize_vec consumes scale directly.
  static Vectorized<c10::qint32> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale [[maybe_unused]]) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * 16> float_vals;

    // Flatten the float vectors into one contiguous buffer.
    for (const auto i : c10::irange(float_num_vecs())) {
      rhs[i].store(&float_vals[i * 16], 16);
    }

    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::qint32*)qvals.data(),
        16 * float_num_vecs());

    return Vectorized<c10::qint32>::loadu(qvals.data());
  }

  // Lane-wise maximum.
  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // Lane-wise minimum.
  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // ReLU in the quantized domain: clamp from below at the zero point.
  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const  {
    return maximum(zero_point);
  }


  // ReLU6: clamp each lane between the quantized zero point and quantized six.
  Vectorized<c10::qint32> relu6(
      Vectorized<c10::qint32> zero_point,
      Vectorized<c10::qint32> q_six) {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  // Lane-wise subtraction; qint32 is already 32-bit, so no widening occurs
  // and the single result vector is written at index 0.
  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
    int_vec_return_type retval;
    for (const auto i : c10::irange(size())) {
      retval[0].vals[i] = vals[i] - b.vals[i];
    }
    return retval;
  }

  // Requantize int32 intermediates: round(x * multiplier) + zero_point.
  static Vectorized<c10::qint32> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    Vectorized<c10::qint32> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] =
          std::nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
          zero_point;
    }
    return retval;
  }
};
1114 
// Free-function lane-wise maximum for qint32, dispatching to the member op.
template <>
Vectorized<c10::qint32> inline maximum(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
  return a.maximum(b);
}
1119 
1120 template <>
1121 Vectorized<c10::qint32> inline operator*(
1122     const Vectorized<c10::qint32>& a,
1123     const Vectorized<c10::qint32>& b) {
1124   Vectorized<c10::qint32> retval;
1125   for (const auto i : c10::irange(std::decay_t<decltype(a)>::size())) {
1126     retval.vals[i] = a.vals[i] * b.vals[i];
1127   }
1128   return retval;
1129 }
1130 
1131 template <>
1132 Vectorized<c10::qint32> inline operator+(
1133     const Vectorized<c10::qint32>& a,
1134     const Vectorized<c10::qint32>& b) {
1135   Vectorized<c10::qint32> retval;
1136   for (const auto i : c10::irange(std::decay_t<decltype(a)>::size())) {
1137     retval.vals[i] = a.vals[i] + b.vals[i];
1138   }
1139   return retval;
1140 }
1141 
// Scalar (non-AVX512) fallback implementation of Vectorized<c10::qint8>:
// 64 qint8 values backed by a plain std::array.
template <>
struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
                                c10::qint8,
                                std::array<Vectorized<float>, 4>,
                                std::array<Vectorized<c10::qint32>, 4>,
                                64> {
  // Default constructor: lanes left uninitialized (mirrors SIMD behavior).
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>() {}
  // Broadcast constructor: replicates a single qint8 into all lanes.
  Vectorized(c10::qint8 val)
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>(val) {}
  // Load constructor: copies size() elements from memory.
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::qint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>(ptr) {}

  // Load a full vector from (possibly unaligned) memory.
  static Vectorized<c10::qint8> loadu(const void* ptr) {
    return Vectorized<c10::qint8>(ptr);
  }

  // Load `count` elements; remaining lanes are zero-filled.
  static Vectorized<c10::qint8> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
    // instructions while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return loadu(tmp_values);
  }

  // Quantize float vectors back to qint8 via the scalar reference kernel.
  // inverse_scale is unused: quantize_vec consumes scale directly.
  static Vectorized<c10::qint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale [[maybe_unused]]) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * 16> float_vals;

    // Flatten the float vectors into one contiguous buffer.
    for (const auto i : c10::irange(float_num_vecs())) {
      rhs[i].store(&float_vals[i * 16], 16);
    }

    at::native::quantize_vec<c10::qint8>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::qint8*)qvals.data(),
        16 * float_num_vecs());

    return Vectorized<c10::qint8>::loadu(qvals.data());
  }

  // Lane-wise maximum.
  Vectorized<c10::qint8> maximum(Vectorized<c10::qint8> b) const {
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // Lane-wise minimum.
  Vectorized<c10::qint8> minimum(Vectorized<c10::qint8> b) const {
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // ReLU in the quantized domain: clamp from below at the zero point.
  Vectorized<c10::qint8> relu(Vectorized<c10::qint8> zero_point) const {
    return maximum(zero_point);
  }

  // ReLU6: clamp each lane between the quantized zero point and quantized six.
  Vectorized<c10::qint8> relu6(
      Vectorized<c10::qint8> zero_point,
      Vectorized<c10::qint8> q_six) {
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  // Subtract b lane-wise, widening int8 operands to int32 so the difference
  // is exactly representable; results are grouped 16 lanes per int vector.
  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
    int_vec_return_type retval;
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        retval[i].vals[j] =
            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
      }
    }
    return retval;
  }
  // Requantize int32 intermediates: round(x * multiplier) + zero_point,
  // clamped to the qint8 representable range.
  static Vectorized<c10::qint8> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    constexpr auto min_val = std::numeric_limits<value_type>::min();
    constexpr auto max_val = std::numeric_limits<value_type>::max();
    Vectorized<c10::qint8> retval;
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        int32_t rounded =
            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
            zero_point;
        retval.vals[i * elem_per_int_vec + j] =
            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
      }
    }
    return retval;
  }
};
1268 
// Free-function lane-wise maximum for qint8, dispatching to the member op.
template <>
Vectorized<c10::qint8> inline maximum(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
  return a.maximum(b);
}
1273 
// Scalar (non-AVX512) fallback implementation of Vectorized<c10::quint8>:
// 64 quint8 values backed by a plain std::array.
template <>
struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
                                 c10::quint8,
                                 std::array<Vectorized<float>, 4>,
                                 std::array<Vectorized<c10::qint32>, 4>,
                                 64> {
  // Default constructor: lanes left uninitialized (mirrors SIMD behavior).
  Vectorized()
      : VectorizedQuantizedConverter<
            c10::quint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>() {}
  // Broadcast constructor: replicates a single quint8 into all lanes.
  Vectorized(c10::quint8 val)
      : VectorizedQuantizedConverter<
            c10::quint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>(val) {}
  // Load constructor: copies size() elements from memory.
  Vectorized(const void* ptr)
      : VectorizedQuantizedConverter<
            c10::quint8,
            std::array<Vectorized<float>, 4>,
            std::array<Vectorized<c10::qint32>, 4>,
            64>(ptr) {}

  // Load a full vector from (possibly unaligned) memory.
  static Vectorized<c10::quint8> loadu(const void* ptr) {
    return Vectorized<c10::quint8>(ptr);
  }

  // Load `count` elements; remaining lanes are zero-filled.
  static Vectorized<c10::quint8> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
    // instructions while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
    return loadu(tmp_values);
  }

  // Quantize float vectors back to quint8 via the scalar reference kernel.
  // inverse_scale is unused: quantize_vec consumes scale directly.
  static Vectorized<c10::quint8> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale [[maybe_unused]]) {
    std::array<value_type, size()> qvals;
    std::array<float, float_num_vecs() * 16> float_vals;

    // Flatten the float vectors into one contiguous buffer.
    for (const auto i : c10::irange(float_num_vecs())) {
      rhs[i].store(&float_vals[i * 16], 16);
    }

    at::native::quantize_vec<c10::quint8>(
        scale,
        zero_point,
        float_vals.data(),
        (c10::quint8*)qvals.data(),
        16 * float_num_vecs());

    return Vectorized<c10::quint8>::loadu(qvals.data());
  }

  // Lane-wise maximum.
  Vectorized<c10::quint8> maximum(Vectorized<c10::quint8> b) const {
    Vectorized<c10::quint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::max<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // Lane-wise minimum.
  Vectorized<c10::quint8> minimum(Vectorized<c10::quint8> b) const {
    Vectorized<c10::quint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(vals[i], b.vals[i]);
    }
    return retval;
  }

  // ReLU in the quantized domain: clamp from below at the zero point.
  Vectorized<c10::quint8> relu(Vectorized<c10::quint8> zero_point) const {
    return maximum(zero_point);
  }


  // ReLU6: clamp each lane between the quantized zero point and quantized six.
  Vectorized<c10::quint8> relu6(
      Vectorized<c10::quint8> zero_point,
      Vectorized<c10::quint8> q_six) {
    Vectorized<c10::quint8> retval;
    for (const auto i : c10::irange(size())) {
      retval.vals[i] = std::min<value_type>(
          std::max<value_type>(vals[i], zero_point.vals[i]), q_six.vals[i]);
    }
    return retval;
  }

  // Subtract b lane-wise, widening uint8 operands to int32 so the (possibly
  // negative) difference is representable; grouped 16 lanes per int vector.
  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
    int_vec_return_type retval;
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        retval[i].vals[j] =
            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
      }
    }
    return retval;
  }
  // Requantize int32 intermediates: round(x * multiplier) + zero_point,
  // clamped to the quint8 representable range.
  static Vectorized<c10::quint8> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    constexpr int elem_per_int_vec = size() / int_num_vecs();
    constexpr auto min_val = std::numeric_limits<value_type>::min();
    constexpr auto max_val = std::numeric_limits<value_type>::max();
    Vectorized<c10::quint8> retval;
    for (const auto i : c10::irange(int_num_vecs())) {
      for (const auto j : c10::irange(elem_per_int_vec)) {
        int32_t rounded =
            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
            zero_point;
        retval.vals[i * elem_per_int_vec + j] =
            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
      }
    }
    return retval;
  }
};
1401 
// Free-function lane-wise maximum for quint8, dispatching to the member op.
template <>
Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
  return a.maximum(b);
}
1406 
1407 #endif // defined(CPU_CAPABILITY_AVX512) && !defined(MSVC)
1408 
1409 }}}
1410