1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef BERBERIS_INTRINSICS_RISCV64_TO_ALL_VECTOR_INTRINSICS_H_
18 #define BERBERIS_INTRINSICS_RISCV64_TO_ALL_VECTOR_INTRINSICS_H_
19 
20 #include <algorithm>
21 #include <climits>  // CHAR_BIT
22 #include <cstdint>
23 #include <limits>
24 #include <tuple>
25 #include <type_traits>
26 
27 #include "berberis/base/bit_util.h"
28 #include "berberis/base/dependent_false.h"
29 #include "berberis/intrinsics/intrinsics.h"        // PreferredIntrinsicsImplementation
30 #if defined(__aarch64__)
31 #include "berberis/intrinsics/common/intrinsics_float.h"
32 #include "berberis/intrinsics/vector_intrinsics.h"
33 #else
34 #include "berberis/intrinsics/intrinsics_float.h"  // Float32/Float64
35 #endif
36 #include "berberis/intrinsics/simd_register.h"
37 #include "berberis/intrinsics/type_traits.h"
38 
39 namespace berberis::intrinsics {
40 
// Controls how tail elements (indices >= vl) of a vector destination are handled,
// mirroring the RISC-V "vta" setting: kUndisturbed keeps the previous destination
// bits, kAgnostic allows overwriting them (the VectorMasking helpers below fill
// agnostic tails with all-ones).
enum class TailProcessing {
  kUndisturbed = 0,
  kAgnostic = 1,
};
45 
// Controls how masked-off (inactive) elements are handled, mirroring the RISC-V "vma"
// setting: kUndisturbed keeps the previous values, kAgnostic allows overwriting them
// (the VectorMasking helpers below fill agnostic inactive elements with all-ones).
enum class InactiveProcessing {
  kUndisturbed = 0,
  kAgnostic = 1,
};
50 
// Tag type used in place of InactiveProcessing when an operation is unmasked:
// there are no inactive elements, so no mask handling should be generated.
enum class NoInactiveProcessing {
  kNoInactiveProcessing = 0,
};
54 
55 template <typename ElementType>
FullMaskForRegister(NoInactiveProcessing)56 [[nodiscard]] inline std::tuple<NoInactiveProcessing> FullMaskForRegister(NoInactiveProcessing) {
57   return {NoInactiveProcessing{}};
58 }
59 
60 template <typename ElementType>
61 [[nodiscard]] inline std::tuple<
62     std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
FullMaskForRegister(SIMD128Register)63 FullMaskForRegister(SIMD128Register) {
64   if constexpr (sizeof(ElementType) == sizeof(uint8_t)) {
65     return {{0xffff}};
66   } else if constexpr (sizeof(ElementType) == sizeof(uint16_t)) {
67     return {{0xff}};
68   } else if constexpr (sizeof(ElementType) == sizeof(uint32_t)) {
69     return {{0xf}};
70   } else if constexpr (sizeof(ElementType) == sizeof(uint64_t)) {
71     return {{0x3}};
72   } else {
73     static_assert(kDependentTypeFalse<ElementType>, "Unsupported vector element type");
74   }
75 }
76 
77 template <typename ElementType>
MaskForRegisterInSequence(NoInactiveProcessing,size_t)78 [[nodiscard]] inline std::tuple<NoInactiveProcessing> MaskForRegisterInSequence(
79     NoInactiveProcessing,
80     size_t) {
81   return {NoInactiveProcessing{}};
82 }
83 
// Extracts the slice of the mask register that applies to the register_in_sequence'th
// SIMD128Register of a register group: 16 mask bits for byte elements, 8 or fewer bits
// (zero-padded into a RawInt8) for wider elements.
template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
MaskForRegisterInSequence(SIMD128Register mask, size_t register_in_sequence) {
  if constexpr (sizeof(ElementType) == sizeof(uint8_t)) {
    // 16 one-byte elements per register: take a whole 16-bit slice of the mask.
    return {mask.Get<RawInt16>(register_in_sequence)};
  } else if constexpr (sizeof(ElementType) == sizeof(uint16_t)) {
    // 8 two-byte elements per register: take an 8-bit slice of the mask.
    return {mask.Get<RawInt8>(register_in_sequence)};
  } else if constexpr (sizeof(ElementType) == sizeof(uint32_t)) {
    // 4 elements per register: shift the low mask word down and keep 4 bits.
    return {RawInt8{TruncateTo<UInt8>(mask.Get<UInt32>(0) >> UInt64(register_in_sequence * 4)) &
                    UInt8{0b1111}}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint64_t)) {
    // 2 elements per register: shift the low mask word down and keep 2 bits.
    return {RawInt8{TruncateTo<UInt8>(mask.Get<UInt32>(0) >> UInt64(register_in_sequence * 2)) &
                    UInt8{0b11}}};
  } else {
    static_assert(kDependentTypeFalse<ElementType>, "Unsupported vector element type");
  }
}
102 
103 // Naïve implementation for tests.  Also used on not-x86 platforms.
MakeBitmaskFromVlForTests(size_t vl)104 [[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVlForTests(size_t vl) {
105   if (vl == 128) {
106     return {SIMD128Register(__int128(0))};
107   } else {
108     return {SIMD128Register((~__int128(0)) << vl)};
109   }
110 }
111 
112 #ifndef __x86_64__
MakeBitmaskFromVl(size_t vl)113 [[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVl(size_t vl) {
114   return {MakeBitmaskFromVlForTests(vl)};
115 }
116 #endif
117 
118 template <typename ElementType>
MakeBitmaskFromVl(size_t vl)119 [[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVl(size_t vl) {
120   return MakeBitmaskFromVl(vl * sizeof(ElementType) * CHAR_BIT);
121 }
122 
123 // Naïve implementation for tests.  Also used on not-x86 platforms.
124 template <typename ElementType>
BitMaskToSimdMaskForTests(size_t mask)125 [[nodiscard]] inline std::tuple<SIMD128Register> BitMaskToSimdMaskForTests(size_t mask) {
126   constexpr ElementType kZeroValue = ElementType{0};
127   constexpr ElementType kFillValue = ~ElementType{0};
128   SIMD128Register result;
129   for (size_t index = 0; index < sizeof(SIMD128Register) / sizeof(ElementType); ++index) {
130     size_t bit = 1 << index;
131     if (mask & bit) {
132       result.Set(kFillValue, index);
133     } else {
134       result.Set(kZeroValue, index);
135     }
136   }
137   return {result};
138 }
139 
140 #if !defined(__x86_64__) && !defined(__aarch64__)
141 template <typename ElementType>
BitMaskToSimdMask(size_t mask)142 [[nodiscard]] inline std::tuple<SIMD128Register> BitMaskToSimdMask(size_t mask) {
143   return {BitMaskToSimdMaskForTests<ElementType>(mask)};
144 }
145 #endif
146 
// Naïve implementation for tests.  Also used on not-x86 platforms.
// Inverse of BitMaskToSimdMask: compresses a SIMD mask into one bit per element.  Any
// nonzero element sets its bit (see the != ElementType{} comparison below).
template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
SimdMaskToBitMaskForTests(SIMD128Register simd_mask) {
  // 16 byte-sized elements need 16 result bits; all wider element types fit in 8.
  using ResultType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
  ResultType mask{0};
  constexpr ResultType kElementsCount{
      static_cast<uint8_t>(sizeof(SIMD128Register) / sizeof(ElementType))};
  for (ResultType index{0}; index < kElementsCount; index += ResultType{1}) {
    if (simd_mask.Get<ElementType>(index) != ElementType{}) {
      mask |= ResultType{1} << ResultType{index};
    }
  }
  // UInt16/UInt8 converts implicitly into the RawInt16/RawInt8 tuple element.
  return mask;
}
163 
164 #ifndef __SSSE3__
165 template <typename ElementType>
166 [[nodiscard]] inline std::tuple<
167     std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
SimdMaskToBitMask(SIMD128Register simd_mask)168 SimdMaskToBitMask(SIMD128Register simd_mask) {
169   return SimdMaskToBitMaskForTests<ElementType>(simd_mask);
170 }
171 #endif
172 
173 #if !defined(__aarch64__)
174 template <auto kElement>
VectorMaskedElementToForTests(SIMD128Register simd_mask,SIMD128Register result)175 [[nodiscard]] inline std::tuple<SIMD128Register> VectorMaskedElementToForTests(
176     SIMD128Register simd_mask,
177     SIMD128Register result) {
178   using ElementType = decltype(kElement);
179   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
180   for (size_t index = 0; index < kElementsCount; ++index) {
181     if (!simd_mask.Get<ElementType>(index)) {
182       result.Set(kElement, index);
183     }
184   }
185   return result;
186 }
187 
188 #ifndef __x86_64__
// Without a platform-specific implementation the generic version is authoritative.
// NOTE(review): VectorMaskedElementToForTests takes a non-type template parameter
// (template <auto kElement>) that cannot be deduced from the call below, and this
// wrapper's ElementType parameter is unused — confirm this overload actually
// instantiates on the platforms (!__x86_64__ && !__aarch64__) where it is compiled.
template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMaskedElementTo(SIMD128Register simd_mask,
                                                                       SIMD128Register result) {
  return VectorMaskedElementToForTests(simd_mask, result);
}
194 #endif
195 
196 #endif
197 
// For instructions that operate on carry bits, expands single bit from mask register
//     into vector argument
template <typename ElementType, TailProcessing vta, auto vma>
std::tuple<SIMD128Register> GetMaskVectorArgument(SIMD128Register mask, size_t index) {
  // 16 byte-sized elements need 16 mask bits; all wider element types fit in 8.
  using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
  // Pick the mask slice that covers register `index` of the group.
  auto register_mask = std::get<0>(MaskForRegisterInSequence<ElementType>(mask, index));
  // NOTE(review): this calls the ...ForTests expansion unconditionally rather than the
  // platform-selected BitMaskToSimdMask — confirm that is intentional on x86/arm64.
  return BitMaskToSimdMaskForTests<ElementType>(Int64{MaskType{register_mask}});
}
206 
207 template <typename ElementType>
VectorElement(SIMD128Register src,int index)208 [[nodiscard]] inline ElementType VectorElement(SIMD128Register src, int index) {
209   return src.Get<ElementType>(index);
210 }
211 
// Scalar-operand overload: a scalar behaves as a vector with the same value in every
// position, so the element index is ignored.
template <typename ElementType>
[[nodiscard]] inline ElementType VectorElement(ElementType src, int) {
  return src;
}
216 
217 template <typename ElementType>
VMovTopHalfToBottom(SIMD128Register src)218 [[nodiscard]] inline std::tuple<SIMD128Register> VMovTopHalfToBottom(SIMD128Register src) {
219   return {SIMD128Register{src.Get<uint64_t>(1)}};
220 }
221 
222 template <typename ElementType>
VMergeBottomHalfToTop(SIMD128Register bottom,SIMD128Register top)223 [[nodiscard]] inline std::tuple<SIMD128Register> VMergeBottomHalfToTop(SIMD128Register bottom,
224                                                                        SIMD128Register top) {
225   SIMD128Register result{bottom};
226   result.Set<uint64_t>(top.Get<uint64_t>(0), 1);
227   return result;
228 }
229 
230 // Naïve implementation for tests.  Also used on not-x86 platforms.
231 template <auto kDefaultElement>
VectorBroadcastForTests()232 [[nodiscard]] inline std::tuple<SIMD128Register> VectorBroadcastForTests() {
233   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof kDefaultElement;
234   SIMD128Register dest;
235   for (size_t index = 0; index < kElementsCount; ++index) {
236     dest.Set(kDefaultElement, index);
237   }
238   return dest;
239 }
240 
241 #ifndef __x86_64__
242 template <auto kDefaultElement>
VectorBroadcast()243 [[nodiscard]] inline std::tuple<SIMD128Register> VectorBroadcast() {
244   return VectorBroadcastForTests<kDefaultElement>();
245 }
246 #endif
247 
// Applies the [vstart, vl) window to `result` for an operation whose "background"
// value (outside the window) is the compile-time constant kDefaultElement: elements
// before vstart and at or after vl are replaced with kDefaultElement.  All-zeroes and
// all-ones backgrounds are special-cased so only bitmask operations are needed.
template <auto kDefaultElement, TailProcessing vta, NoInactiveProcessing = NoInactiveProcessing{}>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register result,
                                                               int vstart,
                                                               int vl) {
  constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof kDefaultElement);
  // Clamp vstart and vl into [0, kElementsCount]; callers may pass values that were
  // computed for a whole register group.
  if (vstart < 0) {
    vstart = 0;
  }
  if (vl < 0) {
    vl = 0;
  }
  if (vl > kElementsCount) {
    vl = kElementsCount;
  }
  if constexpr (kDefaultElement == decltype(kDefaultElement){}) {
    // All-zeroes background: clear every bit outside the [vstart, vl) window.
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result &= ~tail_bitmask;
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: vstart <= vl here because RISC-V instructions don't alter the result if vstart >= vl.
      // But when vstart is so big that it's larger than kElementsCount and vl is also larger than
      // kElementsCount we hit that corner case and return zero if that happens.
      result = SIMD128Register{};
    } else {
      // Note: vstart < vl here because RISC-V instructions don't alter the result if vstart >= vl.
      CHECK_LT(vstart, vl);
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result &= start_bitmask;
      result &= ~tail_bitmask;
    }
  } else if constexpr (kDefaultElement == ~decltype(kDefaultElement){}) {
    // All-ones background: set every bit outside the [vstart, vl) window.
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result |= tail_bitmask;
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: vstart <= vl here because RISC-V instructions don't alter the result if vstart >= vl.
      // But when vstart is so big that it's larger than kElementsCount and vl is also larger than
      // kElementsCount we hit that corner case and return zero if that happens.
      result = ~SIMD128Register{};
    } else {
      // Note: vstart < vl here because RISC-V instructions don't alter the result if vstart >= vl.
      CHECK_LT(vstart, vl);
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result |= ~start_bitmask;
      result |= tail_bitmask;
    }
  } else {
    // Arbitrary background: broadcast the constant and merge it outside the window.
    const std::tuple<SIMD128Register>& dest = VectorBroadcast<kDefaultElement>();
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result &= ~tail_bitmask;
        result |= (std::get<0>(dest) & tail_bitmask);
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: vstart <= vl here because RISC-V instructions don't alter the result if vstart >= vl.
      // But when vstart is so big that it's larger than kElementsCount and vl is also larger than
      // kElementsCount we hit that corner case and return dest if that happens.
      result = std::get<0>(dest);
    } else {
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result &= start_bitmask;
      result &= ~tail_bitmask;
      result |= (std::get<0>(dest) & (~start_bitmask | tail_bitmask));
    }
  }
  return result;
}
323 
// Background-value masking with an explicit element mask: with InactiveProcessing,
// inactive elements (mask bit clear) are first replaced by kDefaultElement, then the
// [vstart, vl) window handling of the three-argument overload above is applied.
template <auto kDefaultElement,
          TailProcessing vta,
          auto vma = NoInactiveProcessing{},
          typename MaskType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register result,
                                                               int vstart,
                                                               int vl,
                                                               MaskType mask) {
  // vma and MaskType must agree: either no mask handling at all, or InactiveProcessing
  // together with a real bitmask type.
  static_assert((std::is_same_v<decltype(vma), NoInactiveProcessing> &&
                 std::is_same_v<MaskType, NoInactiveProcessing>) ||
                (std::is_same_v<decltype(vma), InactiveProcessing> &&
                 (std::is_same_v<MaskType, RawInt8> || std::is_same_v<MaskType, RawInt16>)));
  if constexpr (std::is_same_v<decltype(vma), InactiveProcessing>) {
    // Expand the per-element mask bits into full-element SIMD lanes.
    const auto [simd_mask] = BitMaskToSimdMask<decltype(kDefaultElement)>(
        static_cast<typename MaskType::BaseType>(mask));
    if constexpr (kDefaultElement == ~decltype(kDefaultElement){}) {
      // All-ones background: just set every bit of the inactive elements.
      result |= ~simd_mask;
    } else {
      // Clear the inactive elements...
      result &= simd_mask;
      if constexpr (kDefaultElement != decltype(kDefaultElement){}) {
        // ...and, for a nonzero background, fill them with the broadcast constant.
        const std::tuple<SIMD128Register>& dest = VectorBroadcast<kDefaultElement>();
        result |= std::get<0>(dest) & ~simd_mask;
      }
    }
  }
  return VectorMasking<kDefaultElement, vta>(result, vstart, vl);
}
351 
// Merges `result` into `dest` under the tail policy: elements in [vstart, vl) come
// from result, the prefix before vstart keeps dest, and tail elements (>= vl) keep
// dest (kUndisturbed) or are set to all-ones (kAgnostic).
template <typename ElementType, TailProcessing vta, NoInactiveProcessing = NoInactiveProcessing{}>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(
    SIMD128Register dest,
    SIMD128Register result,
    int vstart,
    int vl,
    NoInactiveProcessing /*mask*/ = NoInactiveProcessing{}) {
  constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof(ElementType));
  // Clamp vstart and vl into [0, kElementsCount]; callers may pass values that were
  // computed for a whole register group.
  if (vstart < 0) {
    vstart = 0;
  }
  if (vl < 0) {
    vl = 0;
  }
  if (vl > kElementsCount) {
    vl = kElementsCount;
  }
  if (vstart == 0) [[likely]] {
    if (vl == kElementsCount) [[likely]] {
      // Whole register is active: nothing to merge.
      return result;
    }
    const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
    if constexpr (vta == TailProcessing::kAgnostic) {
      // Tail-agnostic: fill tail bits with all-ones.
      dest = result | tail_bitmask;
    } else {
      // Tail-undisturbed: keep dest's tail bits.
      dest = (dest & tail_bitmask) | (result & ~tail_bitmask);
    }
  } else if (vstart < vl) [[likely]] {
    // Note: vstart <= vl here because RISC-V instructions don't alter the result if vstart >= vl.
    // But when vstart is so big that it's larger than kElementsCount and vl is also larger than
    // kElementsCount we hit that corner case and return dest if that happens.
    const auto [start_bitmask] = MakeBitmaskFromVl<ElementType>(vstart);
    const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
    if constexpr (vta == TailProcessing::kAgnostic) {
      dest = (dest & ~start_bitmask) | (result & start_bitmask) | tail_bitmask;
    } else {
      dest = (dest & (~start_bitmask | tail_bitmask)) | (result & start_bitmask & ~tail_bitmask);
    }
  } else if constexpr (vta == TailProcessing::kAgnostic) {
    if (vstart == vl) {
      // Corners case where vstart == vl may happen because of vslideup:
      //   https://github.com/riscv/riscv-v-spec/issues/263
      const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
      dest |= tail_bitmask;
    }
  }
  return {dest};
}
400 
401 template <typename ElementType,
402           TailProcessing vta,
403           auto vma = NoInactiveProcessing{},
404           typename MaskType = NoInactiveProcessing>
405 [[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(
406     SIMD128Register dest,
407     SIMD128Register result,
408     SIMD128Register result_mask,
409     int vstart,
410     int vl,
411     MaskType mask = NoInactiveProcessing{}) {
412   static_assert((std::is_same_v<decltype(vma), NoInactiveProcessing> &&
413                  std::is_same_v<MaskType, NoInactiveProcessing>) ||
414                 (std::is_same_v<decltype(vma), InactiveProcessing> &&
415                  (std::is_same_v<MaskType, RawInt8> || std::is_same_v<MaskType, RawInt16>)));
416   if constexpr (std::is_same_v<decltype(vma), InactiveProcessing>) {
417     const auto [simd_mask] =
418         BitMaskToSimdMask<ElementType>(static_cast<typename MaskType::BaseType>(mask));
419     if (vma == InactiveProcessing::kAgnostic) {
420       result |= ~simd_mask;
421     } else {
422       result = (result & simd_mask) | (result_mask & ~simd_mask);
423     }
424   }
425   return VectorMasking<ElementType, vta>(dest, result, vstart, vl);
426 }
427 
428 template <typename ElementType, TailProcessing vta, InactiveProcessing vma, typename MaskType>
VectorMasking(SIMD128Register dest,SIMD128Register result,int vstart,int vl,MaskType mask)429 [[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register dest,
430                                                                SIMD128Register result,
431                                                                int vstart,
432                                                                int vl,
433                                                                MaskType mask) {
434   return VectorMasking<ElementType, vta, vma>(dest,
435                                               result,
436                                               /*result_mask=*/dest,
437                                               vstart,
438                                               vl,
439                                               mask);
440 }
441 
// True when every ParameterType is either a whole SIMD128Register (vector operand) or
// a single ElementType (scalar operand) — the only argument forms the element-wise
// helpers below accept.
template <typename ElementType, typename... ParameterType>
inline constexpr bool kIsAllowedArgumentForVector =
    ((std::is_same_v<ParameterType, SIMD128Register> ||
      std::is_same_v<ParameterType, ElementType>) &&
     ...);
447 
448 // TODO(b/260725458): Pass lambda as template argument after C++20 would become available.
449 template <typename ElementType, typename Lambda, typename... ParameterType>
VectorProcessing(Lambda lambda,ParameterType...parameters)450 inline std::tuple<SIMD128Register> VectorProcessing(Lambda lambda, ParameterType... parameters) {
451   static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
452   SIMD128Register result;
453   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
454   for (size_t index = 0; index < kElementsCount; ++index) {
455     result.Set(lambda(VectorElement<ElementType>(parameters, index)...), index);
456   }
457   return result;
458 }
459 
460 // TODO(b/260725458): Pass lambda as template argument after C++20 would become available.
461 template <typename ElementType, typename Lambda, typename ResultType, typename... ParameterType>
VectorProcessingReduce(Lambda lambda,ResultType init,ParameterType...parameters)462 inline std::tuple<ResultType> VectorProcessingReduce(Lambda lambda,
463                                                      ResultType init,
464                                                      ParameterType... parameters) {
465   static_assert(std::is_same_v<ResultType, ElementType> ||
466                 std::is_same_v<ResultType, WideType<ElementType>>);
467   static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
468   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
469   for (size_t index = 0; index < kElementsCount; ++index) {
470     if constexpr (std::is_same_v<ResultType, WideType<ElementType>>) {
471       init = lambda(init, Widen(VectorElement<ElementType>(parameters, index)...));
472     } else {
473       init = lambda(init, VectorElement<ElementType>(parameters, index)...);
474     }
475   }
476   return init;
477 }
478 
479 // SEW = 2*SEW op SEW
480 // TODO(b/260725458): Pass lambda as template argument after C++20 would become available.
481 template <typename ElementType, typename Lambda, typename ParameterType1, typename ParameterType2>
VectorArithmeticNarrowwv(Lambda lambda,ParameterType1 src1,ParameterType2 src2)482 inline std::tuple<SIMD128Register> VectorArithmeticNarrowwv(Lambda lambda,
483                                                             ParameterType1 src1,
484                                                             ParameterType2 src2) {
485   static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2>);
486   SIMD128Register result;
487   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
488   for (size_t index = 0; index < kElementsCount; ++index) {
489     result.Set(Narrow(lambda(VectorElement<WideType<ElementType>>(src1, index),
490                              Widen(VectorElement<ElementType>(src2, index)))),
491                index);
492   }
493   return result;
494 }
495 
496 // 2*SEW = SEW op SEW
497 // TODO(b/260725458): Pass lambda as template argument after C++20 would become available.
498 template <typename ElementType, typename Lambda, typename... ParameterType>
VectorArithmeticWidenvv(Lambda lambda,ParameterType...parameters)499 inline std::tuple<SIMD128Register> VectorArithmeticWidenvv(Lambda lambda,
500                                                            ParameterType... parameters) {
501   static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
502   SIMD128Register result;
503   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
504   for (size_t index = 0; index < kElementsCount; ++index) {
505     result.Set(lambda(Widen(VectorElement<ElementType>(parameters, index))...), index);
506   }
507   return result;
508 }
509 
510 // 2*SEW = SEW op SEW op 2*SEW
511 // TODO(b/260725458): Pass lambda as template argument after C++20 would become available.
512 template <typename ElementType,
513           typename Lambda,
514           typename ParameterType1,
515           typename ParameterType2,
516           typename ParameterType3>
VectorArithmeticWidenvvw(Lambda lambda,ParameterType1 src1,ParameterType2 src2,ParameterType3 src3)517 inline std::tuple<SIMD128Register> VectorArithmeticWidenvvw(Lambda lambda,
518                                                             ParameterType1 src1,
519                                                             ParameterType2 src2,
520                                                             ParameterType3 src3) {
521   static_assert(
522       kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2, ParameterType3>);
523   SIMD128Register result;
524   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
525   for (size_t index = 0; index < kElementsCount; ++index) {
526     result.Set(lambda(Widen(VectorElement<ElementType>(src1, index)),
527                       Widen(VectorElement<ElementType>(src2, index)),
528                       VectorElement<WideType<ElementType>>(src3, index)),
529                index);
530   }
531   return result;
532 }
533 
534 // SEW = 2*SEW op SEW
535 // TODO(b/260725458): Pass lambda as template argument after C++20 would become available.
536 template <typename ElementType, typename Lambda, typename ParameterType1, typename ParameterType2>
VectorArithmeticWidenwv(Lambda lambda,ParameterType1 src1,ParameterType2 src2)537 inline std::tuple<SIMD128Register> VectorArithmeticWidenwv(Lambda lambda,
538                                                            ParameterType1 src1,
539                                                            ParameterType2 src2) {
540   static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2>);
541   SIMD128Register result;
542   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
543   for (size_t index = 0; index < kElementsCount; ++index) {
544     result.Set(lambda(VectorElement<WideType<ElementType>>(src1, index),
545                       Widen(VectorElement<ElementType>(src2, index))),
546                index);
547   }
548   return result;
549 }
550 
551 template <typename ElementType>
VectorExtend(SIMD128Register src)552 SIMD128Register VectorExtend(SIMD128Register src) {
553   SIMD128Register result;
554   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
555   for (size_t index = 0; index < kElementsCount; ++index) {
556     result.Set(Widen(VectorElement<ElementType>(src, index)), index);
557   }
558   return result;
559 }
560 
561 template <typename ElementType,
562           enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Vextf2(SIMD128Register src)563 inline std::tuple<SIMD128Register> Vextf2(SIMD128Register src) {
564   using SourceElementType = NarrowType<ElementType>;
565   return {VectorExtend<SourceElementType>(src)};
566 }
567 
568 template <typename ElementType,
569           enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Vextf4(SIMD128Register src)570 inline std::tuple<SIMD128Register> Vextf4(SIMD128Register src) {
571   using WideSourceElementType = NarrowType<ElementType>;
572   using SourceElementType = NarrowType<WideSourceElementType>;
573   return {VectorExtend<WideSourceElementType>(VectorExtend<SourceElementType>(src))};
574 }
575 
576 template <typename ElementType,
577           enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Vextf8(SIMD128Register src)578 inline std::tuple<SIMD128Register> Vextf8(SIMD128Register src) {
579   using WideWideSourceElementType = NarrowType<ElementType>;
580   return {
581       VectorExtend<WideWideSourceElementType>(std::get<0>(Vextf4<WideWideSourceElementType>(src)))};
582 }
583 
584 template <typename ElementType,
585           enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
VidvForTests(size_t index)586 inline std::tuple<SIMD128Register> VidvForTests(size_t index) {
587   SIMD128Register result;
588   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
589   ElementType element = {static_cast<typename ElementType::BaseType>(index * kElementsCount)};
590   for (size_t index = 0; index < kElementsCount; ++index) {
591     result.Set(element, index);
592     element += ElementType{1};
593   }
594   return result;
595 }
596 
597 // Handles "slide up" for a single destination register. Effectively copies the last offset elements
598 // in [kElementsCount - offset, kElementsCount) of src1 followed by the first [0, kElementsCount -
599 // offset) elements of src2 into the result.
600 //
601 // This leaves result looking like
602 //
603 //     result = {
604 //         src1[kElementsCount-offset+0],
605 //         src1[kElementsCount-offset+1],
606 //         ...,
607 //         src1[kElementsCount-offset+(offset-1),
608 //         src2[0],
609 //         src2[1],
610 //         ...,
611 //         src2[kElementsCount-offset-1]
612 //     };
613 template <typename ElementType>
VectorSlideUp(size_t offset,SIMD128Register src1,SIMD128Register src2)614 inline std::tuple<SIMD128Register> VectorSlideUp(size_t offset,
615                                                  SIMD128Register src1,
616                                                  SIMD128Register src2) {
617   SIMD128Register result;
618   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
619   CHECK_LT(offset, kElementsCount);
620   for (size_t index = 0; index < offset; ++index) {
621     result.Set(VectorElement<ElementType>(src1, kElementsCount - offset + index), index);
622   }
623   for (size_t index = offset; index < kElementsCount; ++index) {
624     result.Set(VectorElement<ElementType>(src2, index - offset), index);
625   }
626   return result;
627 }
628 
629 // Handles "slide down" for a single destination register. Effectively copies the elements in
630 // [offset, kElementsCount) of src1 followed by the [0, kElementsCount - offset) elements of src2
631 // into the result.
632 //
633 // This leaves result looking like
634 //
635 //     result = {
636 //         [0] = src1[offset+0],
637 //         [1] = src1[offset+1],
638 //         ...,
639 //         [kElementsCount-offset-1] = src1[kElementsCount-1],
640 //         [kElementsCount-offset] = src2[0],
641 //         [kElementsCount-offset+1] = src2[1],
642 //         ...,
643 //         [kElementsCount-offset+(offset-1)] = src2[kElementsCount-offset-1]
644 //     };
645 template <typename ElementType>
VectorSlideDown(size_t offset,SIMD128Register src1,SIMD128Register src2)646 inline std::tuple<SIMD128Register> VectorSlideDown(size_t offset,
647                                                    SIMD128Register src1,
648                                                    SIMD128Register src2) {
649   SIMD128Register result;
650   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
651   CHECK_LT(offset, kElementsCount);
652   for (size_t index = 0; index < kElementsCount - offset; ++index) {
653     result.Set(VectorElement<ElementType>(src1, offset + index), index);
654   }
655   for (size_t index = kElementsCount - offset; index < kElementsCount; ++index) {
656     result.Set(VectorElement<ElementType>(src2, index - (kElementsCount - offset)), index);
657   }
658   return result;
659 }
660 
661 template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Vcpopm(SIMD128Register simd_src)662 inline std::tuple<SIMD128Register> Vcpopm(SIMD128Register simd_src) {
663   UInt128 src = simd_src.Get<UInt128>();
664   return Popcount(src);
665 }
666 
667 template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Vfirstm(SIMD128Register simd_src)668 inline std::tuple<SIMD128Register> Vfirstm(SIMD128Register simd_src) {
669   UInt128 src = simd_src.Get<UInt128>();
670   if (src == Int128{0}) {
671     return ~UInt128{0};
672   }
673   return CountRZero(src);
674 }
675 
#ifndef __x86_64__
// vid.v fallback: forwards to the generic per-element implementation used by
// tests.  Excluded on x86-64 — presumably that build supplies an
// assembler-optimized version elsewhere; confirm against the x86 intrinsics.
template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vidv(size_t index) {
  return VidvForTests<ElementType>(index);
}
#endif
683 
684 template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Vmsifm(SIMD128Register simd_src)685 inline std::tuple<SIMD128Register> Vmsifm(SIMD128Register simd_src) {
686   Int128 src = simd_src.Get<Int128>();
687   return {(src - Int128{1}) ^ src};
688 }
689 
690 template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Vmsbfm(SIMD128Register simd_src)691 inline std::tuple<SIMD128Register> Vmsbfm(SIMD128Register simd_src) {
692   Int128 src = simd_src.Get<Int128>();
693   if (src == Int128{0}) {
694     return {~Int128{0}};
695   }
696   return {std::get<0>(Vmsifm(simd_src)).Get<Int128>() >> Int128{1}};
697 }
698 
699 template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Vmsofm(SIMD128Register simd_src)700 inline std::tuple<SIMD128Register> Vmsofm(SIMD128Register simd_src) {
701   return {std::get<0>(Vmsbfm(simd_src)) ^ std::get<0>(Vmsifm(simd_src))};
702 }
703 
704 template <typename ElementType,
705           enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Viotam(SIMD128Register simd_src,size_t counter)706 inline std::tuple<SIMD128Register, size_t> Viotam(SIMD128Register simd_src, size_t counter) {
707   constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
708   __uint128_t src = simd_src.Get<__uint128_t>();
709   SIMD128Register result;
710   for (size_t index = 0; index < kElementsCount; ++index) {
711     typename Wrapping<typename ElementType::BaseType>::UnsignedType value{
712         static_cast<typename ElementType::BaseType>(counter)};
713     result.Set(value, index);
714     counter += static_cast<size_t>(src & 1);
715     src >>= 1;
716   }
717   return {result, counter};
718 }
719 
720 template <typename TargetElementType,
721           typename SourceElementType,
722           enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
Vfcvtv(int8_t rm,int8_t frm,SIMD128Register src)723 inline std::tuple<SIMD128Register> Vfcvtv(int8_t rm, int8_t frm, SIMD128Register src) {
724   SIMD128Register result;
725   size_t kElementsCount = std::min(sizeof(SIMD128Register) / sizeof(TargetElementType),
726                                    sizeof(SIMD128Register) / sizeof(SourceElementType));
727   for (size_t index = 0; index < kElementsCount; ++index) {
728     if constexpr (!std::is_same_v<TargetElementType, Float16> &&
729                   !std::is_same_v<TargetElementType, Float32> &&
730                   !std::is_same_v<TargetElementType, Float64>) {
731       result.Set(
732           std::get<0>(FCvtFloatToInteger<typename TargetElementType::BaseType, SourceElementType>(
733               rm, frm, src.Get<SourceElementType>(index))),
734           index);
735     } else if constexpr (!std::is_same_v<SourceElementType, Float16> &&
736                          !std::is_same_v<SourceElementType, Float32> &&
737                          !std::is_same_v<SourceElementType, Float64>) {
738       result.Set(
739           std::get<0>(FCvtIntegerToFloat<TargetElementType, typename SourceElementType::BaseType>(
740               rm, frm, src.Get<typename SourceElementType::BaseType>(index))),
741           index);
742     } else {
743       result.Set(std::get<0>(FCvtFloatToFloat<TargetElementType, SourceElementType>(
744                      rm, frm, src.Get<SourceElementType>(index))),
745                  index);
746     }
747   }
748   return result;
749 }
750 
751 // With wide intrinsics multiplication we may do sign-extension or zero-extension, but some
752 // intrinsics need mix: Signed * Unsigned. We narrow down value and then extend it again.
753 // Compiler is smart enough to eliminate dead code.
template <typename ElementType>
std::tuple<ElementType> WideMultiplySignedUnsigned(ElementType arg1, ElementType arg2) {
  // Sign-extend the narrowed arg1, zero-extend the narrowed arg2, multiply.
  auto sign_extended = BitCastToUnsigned(Widen(BitCastToSigned(Narrow(arg1))));
  auto zero_extended = Widen(BitCastToUnsigned(Narrow(arg2)));
  return sign_extended * zero_extended;
}
759 
// Helper that strips the parentheses around a pack so one macro argument can
// carry a comma-separated parameter/capture/argument list.
#define DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS(...) __VA_ARGS__
// Defines an element-wise vector intrinsic `Name`.  `arithmetic` is the lambda
// body computing one element from the `args` pack (each arg is an
// ElementType), `parameters` the parenthesized intrinsic parameter list,
// `capture` the parenthesized lambda capture list, and `arguments` the
// parenthesized values forwarded to VectorProcessing.
#define DEFINE_ARITHMETIC_INTRINSIC(Name, arithmetic, parameters, capture, arguments)             \
  template <typename ElementType,                                                                 \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>       \
  inline std::tuple<SIMD128Register> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorProcessing<ElementType>(                                                         \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) {                       \
          static_assert((std::is_same_v<decltype(args), ElementType> && ...));                    \
          arithmetic;                                                                             \
        },                                                                                        \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments);                                     \
  }
772 
// Thin wrappers around DEFINE_ARITHMETIC_INTRINSIC.  The suffix encodes the
// generated intrinsic's operand kinds: `v` SIMD vector, `x` scalar of
// ElementType, `f` floating-point scalar; `1CSR` variants additionally thread
// an int8_t CSR value through to the element lambda as `csr`.
#define DEFINE_1OP_ARITHMETIC_INTRINSIC_V(name, ...)                 \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##v, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src), (), (src))

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(name, ...)                 \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vv, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

// 3OP forms take a third register (accumulator/destination merge input).
#define DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(name, ...)                                             \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vv, return ({ __VA_ARGS__; });                             \
                              ,                                                                   \
                              (SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
                              (),                                                                 \
                              (src1, src2, src3))

#define DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(name, ...)                            \
  DEFINE_ARITHMETIC_INTRINSIC(                                                        \
      V##name##vv, return ({ __VA_ARGS__; });                                         \
      ,                                                                               \
      (int8_t csr, SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (csr),                                                                          \
      (src1, src2, src3))

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(name, ...)                 \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vx, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(                        \
      V##name##vx, return ({ __VA_ARGS__; });         \
      , (SIMD128Register src1, ElementType src2, SIMD128Register src3), (), (src1, src2, src3))

#define DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(name, ...)                        \
  DEFINE_ARITHMETIC_INTRINSIC(                                                    \
      V##name##vf, return ({ __VA_ARGS__; });                                     \
      ,                                                                           \
      (int8_t csr, SIMD128Register src1, ElementType src2, SIMD128Register src3), \
      (csr),                                                                      \
      (src1, src2, src3))

#define DEFINE_1OP_ARITHMETIC_INTRINSIC_X(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##x, return ({ __VA_ARGS__; });, (ElementType src), (), (src))

#define DEFINE_1OP_1CSR_ARITHMETIC_INTRINSIC_V(name, ...)            \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##v, return ({ __VA_ARGS__; }); \
                              , (int8_t csr, SIMD128Register src), (csr), (src))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(                             \
      V##name##vf, return ({ __VA_ARGS__; });              \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(                             \
      V##name##vv, return ({ __VA_ARGS__; });              \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(                             \
      V##name##vx, return ({ __VA_ARGS__; });              \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))
834 
// Reductions: VectorProcessingReduce folds every element of `src` into one
// ResultType value starting from `init`; the lambda receives ResultType
// arguments rather than ElementType ones.
#define DEFINE_ARITHMETIC_REDUCE_INTRINSIC(Name, arithmetic, parameters, capture, arguments) \
  template <typename ElementType,                                                            \
            typename ResultType = ElementType,                                               \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>  \
  inline std::tuple<ResultType> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorProcessingReduce<ElementType>(                                              \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) {                  \
          static_assert((std::is_same_v<decltype(args), ResultType> && ...));                \
          arithmetic;                                                                        \
        },                                                                                   \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments);                                \
  }

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(name, ...)                        \
  DEFINE_ARITHMETIC_REDUCE_INTRINSIC(V##name##vs, return ({ __VA_ARGS__; }); \
                                     , (ResultType init, SIMD128Register src), (), (init, src))

// Note: unlike the other wrappers this one bakes the `Vfred` prefix into the
// generated name (it is only used for floating-point reductions below).
#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(name, ...) \
  DEFINE_ARITHMETIC_REDUCE_INTRINSIC(                      \
      Vfred##name##vs, return ({ __VA_ARGS__; });          \
      , (int8_t csr, ResultType init, SIMD128Register src), (csr), (init, src))
856 
// Widening/narrowing wrappers.  `Pattern` selects the VectorArithmetic helper
// (Widenvv / Widenwv / Widenvvw / Narrowwv); the element lambda operates on
// WideType<ElementType> values.  Suffix convention: `v` narrow vector operand,
// `w` already-wide vector operand, `x`/`f` scalar operands, trailing `W` an
// extra wide accumulator register, `1CSR` a threaded-through CSR value.
#define DEFINE_W_ARITHMETIC_INTRINSIC(Name, Pattern, arithmetic, parameters, capture, arguments)  \
  template <typename ElementType,                                                                 \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>       \
  inline std::tuple<SIMD128Register> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorArithmetic##Pattern<ElementType>(                                                \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) {                       \
          static_assert((std::is_same_v<decltype(args), WideType<ElementType>> && ...));          \
          arithmetic;                                                                             \
        },                                                                                        \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments);                                     \
  }

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vv, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                 \
      Vfw##name##vv, Widenvv, return ({ __VA_ARGS__; });         \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                 \
      Vfw##name##vf, Widenvv, return ({ __VA_ARGS__; });         \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                 \
      Vfw##name##wv, Widenwv, return ({ __VA_ARGS__; });         \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                 \
      Vfw##name##wf, Widenwv, return ({ __VA_ARGS__; });         \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(name, ...)              \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                          \
      Vw##name##vv, Widenvvw, return ({ __VA_ARGS__; });                  \
      ,                                                                   \
      (SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (),                                                                 \
      (src1, src2, src3))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vx, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                             \
      Vw##name##vx, Widenvvw, return ({ __VA_ARGS__; });     \
      , (SIMD128Register src1, ElementType src2, SIMD128Register src3), (), (src1, src2, src3))

#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(name, ...)                     \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                                      \
      Vfw##name##vv, Widenvvw, return ({ __VA_ARGS__; });                             \
      ,                                                                               \
      (int8_t csr, SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (csr),                                                                          \
      (src1, src2, src3))

#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(name, ...)                 \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                                  \
      Vfw##name##vf, Widenvvw, return ({ __VA_ARGS__; });                         \
      ,                                                                           \
      (int8_t csr, SIMD128Register src1, ElementType src2, SIMD128Register src3), \
      (csr),                                                                      \
      (src1, src2, src3))

#define DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                  \
      Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; });          \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(                                  \
      Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; });          \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##wv, Widenwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(name, ...)                       \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##wx, Widenwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

// NOTE: byte-identical re-definitions of DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV
// and DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX previously followed here; such
// identical redefinitions are legal but redundant and have been removed.
959 
// Element copy, bit-reverse-in-bytes, reciprocal-sqrt estimate and FP class.
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(copy, auto [arg] = std::tuple{args...}; arg)
DEFINE_1OP_ARITHMETIC_INTRINSIC_X(copy, auto [arg] = std::tuple{args...}; arg)
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(brev8, std::get<0>((Brev8(args...))))
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(frsqrt7, RSqrtEstimate(args...))
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(
    fclass,
    static_cast<typename TypeTraits<ElementType>::Int>(std::get<0>(FClass(args...))))

DEFINE_1OP_1CSR_ARITHMETIC_INTRINSIC_V(fsqrt,
                                       CanonicalizeNanTuple(FSqrt(FPFlags::DYN, csr, args...)))

// Integer add/sub (rsub swaps operands) and bitwise ops, with reductions.
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(add, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redsum, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(rsub, auto [arg1, arg2] = std::tuple{args...}; (arg2 - arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sub, (args - ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sub, (args - ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(and, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(and, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redand, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(or, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(or, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redor, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(xor, (args ^ ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(xor, (args ^ ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redxor, (args ^ ...))
// Averaging add: csr carries the fixed-point rounding mode (vxrm).
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    aadd,
    ElementType{std::get<0>(Aadd(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    aadd,
    ElementType{std::get<0>(Aadd(csr, static_cast<typename ElementType::BaseType>(args)...))})

// Saturating fixed-point multiply: widen, multiply, round off the low
// (element-width - 1) bits per vxrm, then narrow with saturation.
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(smul, auto [arg1, arg2] = std::tuple{args...}; ElementType{
    Narrow(Saturating{std::get<0>(Roundoff(
        csr,
        static_cast<typename WideType<ElementType>::BaseType>(Widen(arg1) * Widen(arg2)),
        static_cast<typename WideType<ElementType>::BaseType>((sizeof(ElementType) * CHAR_BIT) -
                                                              1)))})})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(smul, auto [arg1, arg2] = std::tuple{args...}; ElementType{
    Narrow(Saturating{std::get<0>(Roundoff(
        csr,
        static_cast<typename WideType<ElementType>::BaseType>(Widen(arg1) * Widen(arg2)),
        static_cast<typename WideType<ElementType>::BaseType>((sizeof(ElementType) * CHAR_BIT) -
                                                              1)))})})

// Scaling shift-right with rounding (vssrl/vssra depending on ElementType signedness).
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    ssr,
    ElementType{std::get<0>(Roundoff(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    ssr,
    ElementType{std::get<0>(Roundoff(csr, static_cast<typename ElementType::BaseType>(args)...))})
// Floating-point add/sub/mul/div, plus the widening forms.  All use FPFlags::DYN
// with the frm CSR value and canonicalize NaN results.
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fadd,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fadd,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    mul,
    CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    mul,
    CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fsub,
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fsub,
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
// Reverse subtract: operand order is swapped before FSub.
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(frsub, auto [arg1, arg2] = std::tuple{args...};
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, arg2, arg1)))
// Ordered (vfredosum) and unordered (vfredusum) floating-point sum reductions.
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(osum,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(usum,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
// Averaging subtract (vasub[u]): csr carries vxrm.
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    asub,
    ElementType{std::get<0>(Asub(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    asub,
    ElementType{std::get<0>(Asub(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmul,
                                        CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmul,
                                        CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fdiv,
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fdiv,
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, args...)))
// Reverse divide: operand order is swapped before FDiv.
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(frdiv, auto [arg1, arg2] = std::tuple{args...};
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, arg2, arg1)))
// SIMD mask either includes results with all bits set to 0 or all bits set to 1.
// This way it may be used with VAnd and VAndN operations to perform masking.
// Such comparison is effectively one instruction of x86-64 (via SSE or AVX) but
// to achieve it we need to multiply bool result by (~IntType{0}) or (~ElementType{0}).
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(feq, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(feq, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Feq(args...))))
// fne is the negation of feq (IEEE 754 defines NE as NOT of EQ; see note below).
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fne, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(!std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fne, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(!std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(flt, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(flt, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fle, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fle, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(args...))))
// Note: for floating point numbers Flt(b, a) and !Fle(a, b) produce different and incompatible
// results. IEEE754-2008 defined NOT (!=) predicate as negation of EQ (==) predicate while GT (>)
// and GE (>=) are not negations of LE (<) or GT (<=) predicated but instead use swap of arguments.
// Note that scalar form includes only three predicates (Feq, Fle, Fgt) while vector form includes
// Vmfgt.vf and Vmfge.vf instructions only for vector+scalar case (vector+vector case is supposed
// to be handled by swapping arguments). More here: https://github.com/riscv/riscv-v-spec/issues/300
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fgt, auto [arg1, arg2] = std::tuple{args...};
                                   using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(arg2, arg1))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fge, auto [arg1, arg2] = std::tuple{args...};
                                   using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(arg2, arg1))))
1108 DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(adc, auto [arg1, arg2, arg3] = std::tuple{args...};
1109                                    (arg2 + arg1 - arg3))
1110 DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(adc, auto [arg1, arg2, arg3] = std::tuple{args...};
1111                                    (arg2 + arg1 - arg3))
1112 DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(sbc, auto [arg1, arg2, arg3] = std::tuple{args...};
1113                                    (arg2 - arg1 + arg3))
1114 DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(sbc, auto [arg1, arg2, arg3] = std::tuple{args...};
1115                                    (arg2 - arg1 + arg3))
1116 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
1117     seq,
1118     (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args == ...))})
1119 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1120     seq,
1121     (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args == ...))})
1122 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
1123     sne,
1124     (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args != ...))})
1125 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1126     sne,
1127     (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args != ...))})
1128 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
1129     slt,
1130     (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args < ...))})
1131 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1132     slt,
1133     (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args < ...))})
1134 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
1135     sle,
1136     (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args <= ...))})
1137 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1138     sle,
1139     (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args <= ...))})
1140 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1141     sgt,
1142     (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args > ...))})
1143 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sl, auto [arg1, arg2] = std::tuple{args...}; (arg1 << arg2))
1144 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sl, auto [arg1, arg2] = std::tuple{args...}; (arg1 << arg2))
1145 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sr, auto [arg1, arg2] = std::tuple{args...}; (arg1 >> arg2))
1146 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sr, auto [arg1, arg2] = std::tuple{args...}; (arg1 >> arg2))
// Integer multiply-accumulate intrinsics (three operands unpacked from the
// parameter pack).  The "macc"/"nmsac" forms multiply arg1 by arg2 and
// add/negate-then-add into arg3; the "madd"/"nmsub" forms instead multiply
// arg2 by arg3 and add arg1 — the two groups differ only in which operand
// serves as the addend.
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg1) + arg3))
// "nmsac": the product is negated before the addition.
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg1) + arg3))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(madd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(madd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   ((arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                   (-(arg2 * arg3) + arg1))
// Floating-point fused multiply-add intrinsics.  All delegate to the scalar
// FMAdd/FMSub/FNMAdd/FNMSub helpers with FPFlags::DYN, so the effective
// rounding mode is read from the csr operand at runtime.
// Operand-order convention (visible in the argument lists below): the
// "*acc"/"*sac" variants pass (arg2, arg1, arg3) — arg3 sits in the helper's
// final slot — while the "*add"/"*sub" variants pass (arg3, arg2, arg1),
// putting arg1 in the final slot instead.
// NOTE(review): fnmacc/fnmadd map to FNMSub while fnmsac/fnmsub map to
// FNMAdd — presumably because the scalar helpers follow the RISC-V scalar
// FNMADD/FNMSUB naming, which is negated relative to the vector mnemonics;
// confirm against the helpers' definitions.
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
1195 
// Min/max and sign-injection intrinsics.  The f-prefixed variants delegate
// to the floating-point helpers (FMin/FMax/FSgnj*); the "red"-infixed names
// are the reduction (VS) forms of the same operations.
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(fredmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(fredmax, std::get<0>(FMax(args...)))
// Sign injection: fsgnj copies the sign, fsgnjn copies the negated sign,
// fsgnjx xors the signs (semantics live in the FSgnj* helpers).
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjn, std::get<0>(FSgnjn(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnjn, std::get<0>(FSgnjn(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjx, std::get<0>(FSgnjx(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnjx, std::get<0>(FSgnjx(args...)))
// Integer min/max and their reductions via std::min / std::max.
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(min, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(min, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redmin, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(max, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(max, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redmax, std::max(args...))
// Multiplication: "mul" keeps the low half of the product; "mulh" widens
// both operands and keeps the top half via NarrowTopHalf.
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mul, auto [arg1, arg2] = std::tuple{args...}; (arg2 * arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mul, auto [arg1, arg2] = std::tuple{args...}; (arg2 * arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mulh, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(Widen(arg2) * Widen(arg1)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mulh, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(Widen(arg2) * Widen(arg1)))
// "mulhsu": arg1 is sign-extended (widened as signed) while arg2 is
// zero-extended (widened as unsigned) before the full-width multiply.
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mulhsu, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(BitCastToUnsigned(Widen(BitCastToSigned(arg1))) *
                                                 Widen(BitCastToUnsigned(arg2))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mulhsu, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(BitCastToUnsigned(Widen(BitCastToSigned(arg1))) *
                                                 Widen(BitCastToUnsigned(arg2))))
// Division and remainder delegate to the scalar Div/Rem helpers operating on
// the raw BaseType values (so divide-by-zero/overflow handling lives there).
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    div,
    ElementType{std::get<0>(Div(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    div,
    ElementType{std::get<0>(Div(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    rem,
    ElementType{std::get<0>(Rem(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    rem,
    ElementType{std::get<0>(Rem(static_cast<typename ElementType::BaseType>(args)...))})
1238 
// Widening arithmetic intrinsics.  VV/VX forms take narrow sources; the
// WV/WX forms presumably take an already-widened first operand (naming
// follows the RISC-V vector "vwadd.wv"-style mnemonics) — confirm against
// the macro definitions above.
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(sub, (args - ...))
// Widening multiply; "mulsu" mixes signed and unsigned operands via the
// WideMultiplySignedUnsigned helper.
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(mul, (args * ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(mulsu, std::get<0>(WideMultiplySignedUnsigned(args...)))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(mul, (args * ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(mulsu, std::get<0>(WideMultiplySignedUnsigned(args...)))
1251 
// Widening multiply-accumulate (VVW/VXW suffix): multiply two narrow
// operands and accumulate into the wide third operand arg3.
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (arg1 * arg2) + arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (arg1 * arg2) + arg3)
// Mixed-signedness variants: note the swapped argument order to
// WideMultiplySignedUnsigned between "maccsu" (arg2, arg1) and
// "maccus" (arg1, arg2).
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(maccsu, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg2, arg1))) +
                                          arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccsu, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg2, arg1))) +
                                          arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccus, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg1, arg2))) +
                                          arg3)
// Widening floating-point fused multiply-add group; FPFlags::DYN takes the
// rounding mode from the csr operand.  As with the non-widening FP group,
// "macc"/"msac" use FMAdd/FMSub while "nmacc"/"nmsac" use FNMSub/FNMAdd.
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    macc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    macc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    msac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    msac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
// Narrowing shifts (WV/WX: wide source, narrow result); arg1 is the value,
// arg2 the shift amount.
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(sr, auto [arg1, arg2] = std::tuple{args...};
                                          (arg1 >> arg2))
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(sr, auto [arg1, arg2] = std::tuple{args...};
                                          (arg1 >> arg2))
// Narrowing "clip": Roundoff runs on the wide BaseType (with rounding mode
// from csr) before the result is narrowed.
DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(
    clip,
    WideType<ElementType>{(std::get<0>(
        Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))})
DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(
    clip,
    WideType<ElementType>{(std::get<0>(
        Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))})
1301 
// Clean up every helper macro defined above so none of them leak out of this
// header into translation units that include it.
#undef DEFINE_ARITHMETIC_INTRINSIC
#undef DEFINE_W_ARITHMETIC_INTRINSIC
#undef DEFINE_ARITHMETIC_REDUCE_INTRINSIC
#undef DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_V
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_X
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX
// These two were missing from the cleanup list even though they are used for
// the "clip" intrinsics above; without the #undef they leaked to includers.
#undef DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW
#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW
#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW
1333 
1334 }  // namespace berberis::intrinsics
1335 
1336 #endif  // BERBERIS_INTRINSICS_RISCV64_TO_ALL_VECTOR_INTRINSICS_H_
1337