/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef BERBERIS_INTRINSICS_RISCV64_TO_ALL_VECTOR_INTRINSICS_H_
#define BERBERIS_INTRINSICS_RISCV64_TO_ALL_VECTOR_INTRINSICS_H_

#include <algorithm>
#include <climits>  // CHAR_BIT
#include <cstdint>
#include <limits>
#include <tuple>
#include <type_traits>

#include "berberis/base/bit_util.h"
#include "berberis/base/dependent_false.h"
#include "berberis/intrinsics/intrinsics.h"  // PreferredIntrinsicsImplementation
#if defined(__aarch64__)
#include "berberis/intrinsics/common/intrinsics_float.h"
#include "berberis/intrinsics/vector_intrinsics.h"
#else
#include "berberis/intrinsics/intrinsics_float.h"  // Float32/Float64
#endif
#include "berberis/intrinsics/simd_register.h"
#include "berberis/intrinsics/type_traits.h"

namespace berberis::intrinsics {

enum class TailProcessing {
  kUndisturbed = 0,
  kAgnostic = 1,
};

enum class InactiveProcessing {
  kUndisturbed = 0,
  kAgnostic = 1,
};

enum class NoInactiveProcessing {
  kNoInactiveProcessing = 0,
};

template <typename ElementType>
[[nodiscard]] inline std::tuple<NoInactiveProcessing> FullMaskForRegister(NoInactiveProcessing) {
  return {NoInactiveProcessing{}};
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
FullMaskForRegister(SIMD128Register) {
  if constexpr (sizeof(ElementType) == sizeof(uint8_t)) {
    return {{0xffff}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint16_t)) {
    return {{0xff}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint32_t)) {
    return {{0xf}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint64_t)) {
    return {{0x3}};
  } else {
    static_assert(kDependentTypeFalse<ElementType>, "Unsupported vector element type");
  }
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<NoInactiveProcessing> MaskForRegisterInSequence(
    NoInactiveProcessing,
    size_t) {
  return {NoInactiveProcessing{}};
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
MaskForRegisterInSequence(SIMD128Register mask, size_t register_in_sequence) {
  if constexpr (sizeof(ElementType) == sizeof(uint8_t)) {
    return {mask.Get<RawInt16>(register_in_sequence)};
  } else if constexpr (sizeof(ElementType) == sizeof(uint16_t)) {
    return {mask.Get<RawInt8>(register_in_sequence)};
  } else if constexpr (sizeof(ElementType) == sizeof(uint32_t)) {
    return {RawInt8{TruncateTo<UInt8>(mask.Get<UInt32>(0) >> UInt64(register_in_sequence * 4)) &
                    UInt8{0b1111}}};
  } else if constexpr (sizeof(ElementType) == sizeof(uint64_t)) {
    return {RawInt8{TruncateTo<UInt8>(mask.Get<UInt32>(0) >> UInt64(register_in_sequence * 2)) &
                    UInt8{0b11}}};
  } else {
    static_assert(kDependentTypeFalse<ElementType>, "Unsupported vector element type");
  }
}
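
// Illustration (not part of the original header): for ElementType = UInt32 each
// SIMD128Register holds 4 elements, so register_in_sequence selects a 4-bit slice of the
// mask, e.g. for mask = 0b1010'0110:
//   MaskForRegisterInSequence<UInt32>(mask, 0) == {RawInt8{0b0110}}
//   MaskForRegisterInSequence<UInt32>(mask, 1) == {RawInt8{0b1010}}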

// Naïve implementation for tests. Also used on non-x86 platforms.
[[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVlForTests(size_t vl) {
  if (vl == 128) {
    return {SIMD128Register(__int128(0))};
  } else {
    return {SIMD128Register((~__int128(0)) << vl)};
  }
}

#ifndef __x86_64__
[[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVl(size_t vl) {
  return {MakeBitmaskFromVlForTests(vl)};
}
#endif

template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> MakeBitmaskFromVl(size_t vl) {
  return MakeBitmaskFromVl(vl * sizeof(ElementType) * CHAR_BIT);
}
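
// Illustration (not part of the original header): bits [vl, 128) of the returned register are
// set, so it selects the tail. E.g. for ElementType = UInt64 and vl = 1 the element count is
// converted to 64 bits and the result has bits [64, 128) set:
//   const auto [tail_bitmask] = MakeBitmaskFromVl<UInt64>(1);
//   result &= ~tail_bitmask;  // Keeps element 0, clears element 1.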

// Naïve implementation for tests. Also used on non-x86 platforms.
template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> BitMaskToSimdMaskForTests(size_t mask) {
  constexpr ElementType kZeroValue = ElementType{0};
  constexpr ElementType kFillValue = ~ElementType{0};
  SIMD128Register result;
  for (size_t index = 0; index < sizeof(SIMD128Register) / sizeof(ElementType); ++index) {
    size_t bit = 1 << index;
    if (mask & bit) {
      result.Set(kFillValue, index);
    } else {
      result.Set(kZeroValue, index);
    }
  }
  return {result};
}
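
// Illustration (not part of the original header): each bit of the scalar mask is expanded
// into a full element of all-ones or all-zeroes. For ElementType = UInt32, mask 0b0101
// produces elements {0xffff'ffff, 0, 0xffff'ffff, 0}.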

#if !defined(__x86_64__) && !defined(__aarch64__)
template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> BitMaskToSimdMask(size_t mask) {
  return {BitMaskToSimdMaskForTests<ElementType>(mask)};
}
#endif

// Naïve implementation for tests. Also used on non-x86 platforms.
template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
SimdMaskToBitMaskForTests(SIMD128Register simd_mask) {
  using ResultType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
  ResultType mask{0};
  constexpr ResultType kElementsCount{
      static_cast<uint8_t>(sizeof(SIMD128Register) / sizeof(ElementType))};
  for (ResultType index{0}; index < kElementsCount; index += ResultType{1}) {
    if (simd_mask.Get<ElementType>(index) != ElementType{}) {
      mask |= ResultType{1} << ResultType{index};
    }
  }
  return mask;
}
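
// Note (added for illustration): SimdMaskToBitMaskForTests is the inverse of
// BitMaskToSimdMaskForTests for fully-expanded masks. Since the loop only compares against
// ElementType{}, any element with at least one set bit contributes a 1 to the result.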

#ifndef __SSSE3__
template <typename ElementType>
[[nodiscard]] inline std::tuple<
    std::conditional_t<sizeof(ElementType) == sizeof(Int8), RawInt16, RawInt8>>
SimdMaskToBitMask(SIMD128Register simd_mask) {
  return SimdMaskToBitMaskForTests<ElementType>(simd_mask);
}
#endif

#if !defined(__aarch64__)
template <auto kElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMaskedElementToForTests(
    SIMD128Register simd_mask,
    SIMD128Register result) {
  using ElementType = decltype(kElement);
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  for (size_t index = 0; index < kElementsCount; ++index) {
    if (!simd_mask.Get<ElementType>(index)) {
      result.Set(kElement, index);
    }
  }
  return result;
}

#ifndef __x86_64__
template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMaskedElementTo(SIMD128Register simd_mask,
                                                                       SIMD128Register result) {
  return VectorMaskedElementToForTests(simd_mask, result);
}
#endif

#endif

// For instructions that operate on carry bits, expands a single bit from the mask register
// into a vector argument.
template <typename ElementType, TailProcessing vta, auto vma>
std::tuple<SIMD128Register> GetMaskVectorArgument(SIMD128Register mask, size_t index) {
  using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
  auto register_mask = std::get<0>(MaskForRegisterInSequence<ElementType>(mask, index));
  return BitMaskToSimdMaskForTests<ElementType>(Int64{MaskType{register_mask}});
}
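
// Usage sketch (assumption, not from the original header): carry-using RISC-V instructions
// such as vadc.vvm consume the mask expanded to full elements:
//   const auto [carry_vector] =
//       GetMaskVectorArgument<UInt32, TailProcessing::kAgnostic, NoInactiveProcessing{}>(mask, 0);
//   // carry_vector holds ~0 in elements whose mask bit is set and 0 elsewhere.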

template <typename ElementType>
[[nodiscard]] inline ElementType VectorElement(SIMD128Register src, int index) {
  return src.Get<ElementType>(index);
}

template <typename ElementType>
[[nodiscard]] inline ElementType VectorElement(ElementType src, int) {
  return src;
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> VMovTopHalfToBottom(SIMD128Register src) {
  return {SIMD128Register{src.Get<uint64_t>(1)}};
}

template <typename ElementType>
[[nodiscard]] inline std::tuple<SIMD128Register> VMergeBottomHalfToTop(SIMD128Register bottom,
                                                                       SIMD128Register top) {
  SIMD128Register result{bottom};
  result.Set<uint64_t>(top.Get<uint64_t>(0), 1);
  return result;
}

// Naïve implementation for tests. Also used on non-x86 platforms.
template <auto kDefaultElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorBroadcastForTests() {
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof kDefaultElement;
  SIMD128Register dest;
  for (size_t index = 0; index < kElementsCount; ++index) {
    dest.Set(kDefaultElement, index);
  }
  return dest;
}

#ifndef __x86_64__
template <auto kDefaultElement>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorBroadcast() {
  return VectorBroadcastForTests<kDefaultElement>();
}
#endif

template <auto kDefaultElement, TailProcessing vta, NoInactiveProcessing = NoInactiveProcessing{}>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register result,
                                                               int vstart,
                                                               int vl) {
  constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof kDefaultElement);
  if (vstart < 0) {
    vstart = 0;
  }
  if (vl < 0) {
    vl = 0;
  }
  if (vl > kElementsCount) {
    vl = kElementsCount;
  }
  if constexpr (kDefaultElement == decltype(kDefaultElement){}) {
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result &= ~tail_bitmask;
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: normally vstart < vl because RISC-V instructions don't alter the result if
      // vstart >= vl. But when vstart is so big that it's larger than kElementsCount and vl is
      // also larger than kElementsCount we hit that corner case and return zero if that happens.
      result = SIMD128Register{};
    } else {
      // Note: vstart < vl here because RISC-V instructions don't alter the result if vstart >= vl.
      CHECK_LT(vstart, vl);
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result &= start_bitmask;
      result &= ~tail_bitmask;
    }
  } else if constexpr (kDefaultElement == ~decltype(kDefaultElement){}) {
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result |= tail_bitmask;
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: normally vstart < vl because RISC-V instructions don't alter the result if
      // vstart >= vl. But when vstart is so big that it's larger than kElementsCount and vl is
      // also larger than kElementsCount we hit that corner case and return all-ones if that
      // happens.
      result = ~SIMD128Register{};
    } else {
      // Note: vstart < vl here because RISC-V instructions don't alter the result if vstart >= vl.
      CHECK_LT(vstart, vl);
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result |= ~start_bitmask;
      result |= tail_bitmask;
    }
  } else {
    const std::tuple<SIMD128Register>& dest = VectorBroadcast<kDefaultElement>();
    if (vstart == 0) [[likely]] {
      if (vl != kElementsCount) [[unlikely]] {
        const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
        result &= ~tail_bitmask;
        result |= (std::get<0>(dest) & tail_bitmask);
      }
    } else if (vstart >= vl) [[unlikely]] {
      // Note: normally vstart < vl because RISC-V instructions don't alter the result if
      // vstart >= vl. But when vstart is so big that it's larger than kElementsCount and vl is
      // also larger than kElementsCount we hit that corner case and return dest if that happens.
      result = std::get<0>(dest);
    } else {
      const auto [start_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vstart);
      const auto [tail_bitmask] = MakeBitmaskFromVl<decltype(kDefaultElement)>(vl);
      result &= start_bitmask;
      result &= ~tail_bitmask;
      result |= (std::get<0>(dest) & (~start_bitmask | tail_bitmask));
    }
  }
  return result;
}
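
// Illustration (not part of the original header): with kDefaultElement = UInt32{0},
// vstart = 1 and vl = 3 only elements [1, 3) of result survive; element 0 (before vstart)
// and element 3 (the tail) are forced to the default value zero:
//   std::get<0>(VectorMasking<UInt32{0}, TailProcessing::kAgnostic>(result, 1, 3))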

template <auto kDefaultElement,
          TailProcessing vta,
          auto vma = NoInactiveProcessing{},
          typename MaskType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register result,
                                                               int vstart,
                                                               int vl,
                                                               MaskType mask) {
  static_assert((std::is_same_v<decltype(vma), NoInactiveProcessing> &&
                 std::is_same_v<MaskType, NoInactiveProcessing>) ||
                (std::is_same_v<decltype(vma), InactiveProcessing> &&
                 (std::is_same_v<MaskType, RawInt8> || std::is_same_v<MaskType, RawInt16>)));
  if constexpr (std::is_same_v<decltype(vma), InactiveProcessing>) {
    const auto [simd_mask] = BitMaskToSimdMask<decltype(kDefaultElement)>(
        static_cast<typename MaskType::BaseType>(mask));
    if constexpr (kDefaultElement == ~decltype(kDefaultElement){}) {
      result |= ~simd_mask;
    } else {
      result &= simd_mask;
      if constexpr (kDefaultElement != decltype(kDefaultElement){}) {
        const std::tuple<SIMD128Register>& dest = VectorBroadcast<kDefaultElement>();
        result |= std::get<0>(dest) & ~simd_mask;
      }
    }
  }
  return VectorMasking<kDefaultElement, vta>(result, vstart, vl);
}

template <typename ElementType, TailProcessing vta, NoInactiveProcessing = NoInactiveProcessing{}>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(
    SIMD128Register dest,
    SIMD128Register result,
    int vstart,
    int vl,
    NoInactiveProcessing /*mask*/ = NoInactiveProcessing{}) {
  constexpr int kElementsCount = static_cast<int>(sizeof(SIMD128Register) / sizeof(ElementType));
  if (vstart < 0) {
    vstart = 0;
  }
  if (vl < 0) {
    vl = 0;
  }
  if (vl > kElementsCount) {
    vl = kElementsCount;
  }
  if (vstart == 0) [[likely]] {
    if (vl == kElementsCount) [[likely]] {
      return result;
    }
    const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
    if constexpr (vta == TailProcessing::kAgnostic) {
      dest = result | tail_bitmask;
    } else {
      dest = (dest & tail_bitmask) | (result & ~tail_bitmask);
    }
  } else if (vstart < vl) [[likely]] {
    // Note: normally vstart < vl because RISC-V instructions don't alter the result if
    // vstart >= vl. But when vstart is so big that it's larger than kElementsCount and vl is
    // also larger than kElementsCount we skip this branch and return dest unchanged below.
    const auto [start_bitmask] = MakeBitmaskFromVl<ElementType>(vstart);
    const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
    if constexpr (vta == TailProcessing::kAgnostic) {
      dest = (dest & ~start_bitmask) | (result & start_bitmask) | tail_bitmask;
    } else {
      dest = (dest & (~start_bitmask | tail_bitmask)) | (result & start_bitmask & ~tail_bitmask);
    }
  } else if constexpr (vta == TailProcessing::kAgnostic) {
    if (vstart == vl) {
      // Corner case where vstart == vl may happen because of vslideup:
      // https://github.com/riscv/riscv-v-spec/issues/263
      const auto [tail_bitmask] = MakeBitmaskFromVl<ElementType>(vl);
      dest |= tail_bitmask;
    }
  }
  return {dest};
}
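
// Illustration (not part of the original header): for ElementType = UInt64 and vl = 1 the
// tail bitmask covers bits [64, 128), so with kAgnostic the upper element of dest becomes
// all-ones while with kUndisturbed it keeps its previous value.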

template <typename ElementType,
          TailProcessing vta,
          auto vma = NoInactiveProcessing{},
          typename MaskType = NoInactiveProcessing>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(
    SIMD128Register dest,
    SIMD128Register result,
    SIMD128Register result_mask,
    int vstart,
    int vl,
    MaskType mask = NoInactiveProcessing{}) {
  static_assert((std::is_same_v<decltype(vma), NoInactiveProcessing> &&
                 std::is_same_v<MaskType, NoInactiveProcessing>) ||
                (std::is_same_v<decltype(vma), InactiveProcessing> &&
                 (std::is_same_v<MaskType, RawInt8> || std::is_same_v<MaskType, RawInt16>)));
  if constexpr (std::is_same_v<decltype(vma), InactiveProcessing>) {
    const auto [simd_mask] =
        BitMaskToSimdMask<ElementType>(static_cast<typename MaskType::BaseType>(mask));
    if (vma == InactiveProcessing::kAgnostic) {
      result |= ~simd_mask;
    } else {
      result = (result & simd_mask) | (result_mask & ~simd_mask);
    }
  }
  return VectorMasking<ElementType, vta>(dest, result, vstart, vl);
}

template <typename ElementType, TailProcessing vta, InactiveProcessing vma, typename MaskType>
[[nodiscard]] inline std::tuple<SIMD128Register> VectorMasking(SIMD128Register dest,
                                                               SIMD128Register result,
                                                               int vstart,
                                                               int vl,
                                                               MaskType mask) {
  return VectorMasking<ElementType, vta, vma>(dest,
                                              result,
                                              /*result_mask=*/dest,
                                              vstart,
                                              vl,
                                              mask);
}

template <typename ElementType, typename... ParameterType>
inline constexpr bool kIsAllowedArgumentForVector =
    ((std::is_same_v<ParameterType, SIMD128Register> ||
      std::is_same_v<ParameterType, ElementType>) &&
     ...);

// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename... ParameterType>
inline std::tuple<SIMD128Register> VectorProcessing(Lambda lambda, ParameterType... parameters) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(VectorElement<ElementType>(parameters, index)...), index);
  }
  return result;
}

// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename ResultType, typename... ParameterType>
inline std::tuple<ResultType> VectorProcessingReduce(Lambda lambda,
                                                     ResultType init,
                                                     ParameterType... parameters) {
  static_assert(std::is_same_v<ResultType, ElementType> ||
                std::is_same_v<ResultType, WideType<ElementType>>);
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  for (size_t index = 0; index < kElementsCount; ++index) {
    if constexpr (std::is_same_v<ResultType, WideType<ElementType>>) {
      init = lambda(init, Widen(VectorElement<ElementType>(parameters, index)...));
    } else {
      init = lambda(init, VectorElement<ElementType>(parameters, index)...);
    }
  }
  return init;
}

// SEW = 2*SEW op SEW
// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename ParameterType1, typename ParameterType2>
inline std::tuple<SIMD128Register> VectorArithmeticNarrowwv(Lambda lambda,
                                                            ParameterType1 src1,
                                                            ParameterType2 src2) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(Narrow(lambda(VectorElement<WideType<ElementType>>(src1, index),
                             Widen(VectorElement<ElementType>(src2, index)))),
               index);
  }
  return result;
}

// 2*SEW = SEW op SEW
// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename... ParameterType>
inline std::tuple<SIMD128Register> VectorArithmeticWidenvv(Lambda lambda,
                                                           ParameterType... parameters) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType...>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(Widen(VectorElement<ElementType>(parameters, index))...), index);
  }
  return result;
}

// 2*SEW = SEW op SEW op 2*SEW
// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType,
          typename Lambda,
          typename ParameterType1,
          typename ParameterType2,
          typename ParameterType3>
inline std::tuple<SIMD128Register> VectorArithmeticWidenvvw(Lambda lambda,
                                                            ParameterType1 src1,
                                                            ParameterType2 src2,
                                                            ParameterType3 src3) {
  static_assert(
      kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2, ParameterType3>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(Widen(VectorElement<ElementType>(src1, index)),
                      Widen(VectorElement<ElementType>(src2, index)),
                      VectorElement<WideType<ElementType>>(src3, index)),
               index);
  }
  return result;
}

// 2*SEW = 2*SEW op SEW
// TODO(b/260725458): Pass lambda as template argument once C++20 becomes available.
template <typename ElementType, typename Lambda, typename ParameterType1, typename ParameterType2>
inline std::tuple<SIMD128Register> VectorArithmeticWidenwv(Lambda lambda,
                                                           ParameterType1 src1,
                                                           ParameterType2 src2) {
  static_assert(kIsAllowedArgumentForVector<ElementType, ParameterType1, ParameterType2>);
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(lambda(VectorElement<WideType<ElementType>>(src1, index),
                      Widen(VectorElement<ElementType>(src2, index))),
               index);
  }
  return result;
}

template <typename ElementType>
SIMD128Register VectorExtend(SIMD128Register src) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType) / 2;
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(Widen(VectorElement<ElementType>(src, index)), index);
  }
  return result;
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vextf2(SIMD128Register src) {
  using SourceElementType = NarrowType<ElementType>;
  return {VectorExtend<SourceElementType>(src)};
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vextf4(SIMD128Register src) {
  using WideSourceElementType = NarrowType<ElementType>;
  using SourceElementType = NarrowType<WideSourceElementType>;
  return {VectorExtend<WideSourceElementType>(VectorExtend<SourceElementType>(src))};
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vextf8(SIMD128Register src) {
  using WideWideSourceElementType = NarrowType<ElementType>;
  return {
      VectorExtend<WideWideSourceElementType>(std::get<0>(Vextf4<WideWideSourceElementType>(src)))};
}

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> VidvForTests(size_t index) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  ElementType element = {static_cast<typename ElementType::BaseType>(index * kElementsCount)};
  for (size_t index = 0; index < kElementsCount; ++index) {
    result.Set(element, index);
    element += ElementType{1};
  }
  return result;
}
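
// Illustration (not part of the original header): VidvForTests implements vid.v for the
// index-th register of a group. For ElementType = UInt32, VidvForTests<UInt32>(1) yields
// elements {4, 5, 6, 7}.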

// Handles "slide up" for a single destination register. Effectively copies the last `offset`
// elements of src1 (indices [kElementsCount - offset, kElementsCount)) followed by the first
// kElementsCount - offset elements of src2 into the result.
//
// This leaves result looking like
//
// result = {
//   src1[kElementsCount-offset+0],
//   src1[kElementsCount-offset+1],
//   ...,
//   src1[kElementsCount-offset+(offset-1)],
//   src2[0],
//   src2[1],
//   ...,
//   src2[kElementsCount-offset-1]
// };
template <typename ElementType>
inline std::tuple<SIMD128Register> VectorSlideUp(size_t offset,
                                                 SIMD128Register src1,
                                                 SIMD128Register src2) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  CHECK_LT(offset, kElementsCount);
  for (size_t index = 0; index < offset; ++index) {
    result.Set(VectorElement<ElementType>(src1, kElementsCount - offset + index), index);
  }
  for (size_t index = offset; index < kElementsCount; ++index) {
    result.Set(VectorElement<ElementType>(src2, index - offset), index);
  }
  return result;
}
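
// Illustration (not part of the original header): for ElementType = UInt32 (4 elements per
// register) and offset = 1:
//   VectorSlideUp<UInt32>(1, src1, src2) yields {src1[3], src2[0], src2[1], src2[2]}.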

// Handles "slide down" for a single destination register. Effectively copies the elements in
// [offset, kElementsCount) of src1 followed by the first `offset` elements of src2 into the
// result.
//
// This leaves result looking like
//
// result = {
//   [0] = src1[offset+0],
//   [1] = src1[offset+1],
//   ...,
//   [kElementsCount-offset-1] = src1[kElementsCount-1],
//   [kElementsCount-offset] = src2[0],
//   [kElementsCount-offset+1] = src2[1],
//   ...,
//   [kElementsCount-offset+(offset-1)] = src2[offset-1]
// };
template <typename ElementType>
inline std::tuple<SIMD128Register> VectorSlideDown(size_t offset,
                                                   SIMD128Register src1,
                                                   SIMD128Register src2) {
  SIMD128Register result;
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  CHECK_LT(offset, kElementsCount);
  for (size_t index = 0; index < kElementsCount - offset; ++index) {
    result.Set(VectorElement<ElementType>(src1, offset + index), index);
  }
  for (size_t index = kElementsCount - offset; index < kElementsCount; ++index) {
    result.Set(VectorElement<ElementType>(src2, index - (kElementsCount - offset)), index);
  }
  return result;
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vcpopm(SIMD128Register simd_src) {
  UInt128 src = simd_src.Get<UInt128>();
  return Popcount(src);
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vfirstm(SIMD128Register simd_src) {
  UInt128 src = simd_src.Get<UInt128>();
  if (src == Int128{0}) {
    return ~UInt128{0};
  }
  return CountRZero(src);
}
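
// Illustration (not part of the original header): Vcpopm implements vcpop.m (population count
// of the mask) and Vfirstm implements vfirst.m: the index of the first set bit, or -1 (all
// bits set) when the mask is empty.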

#ifndef __x86_64__
template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vidv(size_t index) {
  return VidvForTests<ElementType>(index);
}
#endif

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vmsifm(SIMD128Register simd_src) {
  Int128 src = simd_src.Get<Int128>();
  return {(src - Int128{1}) ^ src};
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vmsbfm(SIMD128Register simd_src) {
  Int128 src = simd_src.Get<Int128>();
  if (src == Int128{0}) {
    return {~Int128{0}};
  }
  return {std::get<0>(Vmsifm(simd_src)).Get<Int128>() >> Int128{1}};
}

template <enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vmsofm(SIMD128Register simd_src) {
  return {std::get<0>(Vmsbfm(simd_src)) ^ std::get<0>(Vmsifm(simd_src))};
}
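
// Illustration (not part of the original header) of the mask-bit trio for src = 0b1000
// (first set bit at position 3):
//   Vmsifm: 0b1111  (set-including-first, (src - 1) ^ src)
//   Vmsbfm: 0b0111  (set-before-first)
//   Vmsofm: 0b1000  (set-only-first)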

template <typename ElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register, size_t> Viotam(SIMD128Register simd_src, size_t counter) {
  constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
  __uint128_t src = simd_src.Get<__uint128_t>();
  SIMD128Register result;
  for (size_t index = 0; index < kElementsCount; ++index) {
    typename Wrapping<typename ElementType::BaseType>::UnsignedType value{
        static_cast<typename ElementType::BaseType>(counter)};
    result.Set(value, index);
    counter += static_cast<size_t>(src & 1);
    src >>= 1;
  }
  return {result, counter};
}
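
// Illustration (not part of the original header): Viotam implements viota.m, a prefix
// population count over the mask bits. For mask bits 0b1011 and counter = 0 the elements
// become {0, 1, 2, 2} and the returned counter is 3.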

template <typename TargetElementType,
          typename SourceElementType,
          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
inline std::tuple<SIMD128Register> Vfcvtv(int8_t rm, int8_t frm, SIMD128Register src) {
  SIMD128Register result;
  size_t kElementsCount = std::min(sizeof(SIMD128Register) / sizeof(TargetElementType),
                                   sizeof(SIMD128Register) / sizeof(SourceElementType));
  for (size_t index = 0; index < kElementsCount; ++index) {
    if constexpr (!std::is_same_v<TargetElementType, Float16> &&
                  !std::is_same_v<TargetElementType, Float32> &&
                  !std::is_same_v<TargetElementType, Float64>) {
      result.Set(
          std::get<0>(FCvtFloatToInteger<typename TargetElementType::BaseType, SourceElementType>(
              rm, frm, src.Get<SourceElementType>(index))),
          index);
    } else if constexpr (!std::is_same_v<SourceElementType, Float16> &&
                         !std::is_same_v<SourceElementType, Float32> &&
                         !std::is_same_v<SourceElementType, Float64>) {
      result.Set(
          std::get<0>(FCvtIntegerToFloat<TargetElementType, typename SourceElementType::BaseType>(
              rm, frm, src.Get<typename SourceElementType::BaseType>(index))),
          index);
    } else {
      result.Set(std::get<0>(FCvtFloatToFloat<TargetElementType, SourceElementType>(
                     rm, frm, src.Get<SourceElementType>(index))),
                 index);
    }
  }
  return result;
}
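
// Illustration (not part of the original header): the if-constexpr chain picks the conversion
// kind from the element types, e.g. Vfcvtv<Int32, Float32> does an elementwise float-to-integer
// conversion while Vfcvtv<Float64, Float32> does a float-to-float widening conversion.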

// With widening multiplication intrinsics we may use either sign-extension or zero-extension,
// but some intrinsics need a mix: Signed * Unsigned. We narrow the value down and then extend
// it again. The compiler is smart enough to eliminate the dead code.
template <typename ElementType>
std::tuple<ElementType> WideMultiplySignedUnsigned(ElementType arg1, ElementType arg2) {
  return BitCastToUnsigned(Widen(BitCastToSigned(Narrow(arg1)))) *
         Widen(BitCastToUnsigned(Narrow(arg2)));
}
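
// Illustration (not part of the original header): for ElementType = UInt32, arg1 is treated
// as a sign-extended 16-bit value and arg2 as a zero-extended one, so
// WideMultiplySignedUnsigned(UInt32{0xffff}, UInt32{0xffff}) yields {UInt32{0xffff'0001}},
// i.e. (-1) * 65535 mod 2^32.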

#define DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS(...) __VA_ARGS__
#define DEFINE_ARITHMETIC_INTRINSIC(Name, arithmetic, parameters, capture, arguments) \
  template <typename ElementType, \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> \
  inline std::tuple<SIMD128Register> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorProcessing<ElementType>( \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) { \
          static_assert((std::is_same_v<decltype(args), ElementType> && ...)); \
          arithmetic; \
        }, \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments); \
  }
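
// Expansion sketch (added for illustration): DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
// below produces roughly:
//   template <typename ElementType, enum PreferredIntrinsicsImplementation = ...>
//   inline std::tuple<SIMD128Register> Vaddvv(SIMD128Register src1, SIMD128Register src2) {
//     return VectorProcessing<ElementType>(
//         [](auto... args) { return (args + ...); }, src1, src2);
//   }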

#define DEFINE_1OP_ARITHMETIC_INTRINSIC_V(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##v, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src), (), (src))

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vv, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vv, return ({ __VA_ARGS__; }); \
                              , \
                              (SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
                              (), \
                              (src1, src2, src3))

#define DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vv, return ({ __VA_ARGS__; }); \
      , \
      (int8_t csr, SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (csr), \
      (src1, src2, src3))

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##vx, return ({ __VA_ARGS__; }); \
                              , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vx, return ({ __VA_ARGS__; }); \
      , (SIMD128Register src1, ElementType src2, SIMD128Register src3), (), (src1, src2, src3))

#define DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vf, return ({ __VA_ARGS__; }); \
      , \
      (int8_t csr, SIMD128Register src1, ElementType src2, SIMD128Register src3), \
      (csr), \
      (src1, src2, src3))

#define DEFINE_1OP_ARITHMETIC_INTRINSIC_X(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##x, return ({ __VA_ARGS__; });, (ElementType src), (), (src))

#define DEFINE_1OP_1CSR_ARITHMETIC_INTRINSIC_V(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC(V##name##v, return ({ __VA_ARGS__; }); \
                              , (int8_t csr, SIMD128Register src), (csr), (src))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vf, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_ARITHMETIC_INTRINSIC( \
      V##name##vx, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_ARITHMETIC_REDUCE_INTRINSIC(Name, arithmetic, parameters, capture, arguments) \
  template <typename ElementType, \
            typename ResultType = ElementType, \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> \
  inline std::tuple<ResultType> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorProcessingReduce<ElementType>( \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) { \
          static_assert((std::is_same_v<decltype(args), ResultType> && ...)); \
          arithmetic; \
        }, \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments); \
  }

#define DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(name, ...) \
  DEFINE_ARITHMETIC_REDUCE_INTRINSIC(V##name##vs, return ({ __VA_ARGS__; }); \
                                     , (ResultType init, SIMD128Register src), (), (init, src))

#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(name, ...) \
  DEFINE_ARITHMETIC_REDUCE_INTRINSIC( \
      Vfred##name##vs, return ({ __VA_ARGS__; }); \
      , (int8_t csr, ResultType init, SIMD128Register src), (csr), (init, src))

#define DEFINE_W_ARITHMETIC_INTRINSIC(Name, Pattern, arithmetic, parameters, capture, arguments) \
  template <typename ElementType, \
            enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> \
  inline std::tuple<SIMD128Register> Name(DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS parameters) { \
    return VectorArithmetic##Pattern<ElementType>( \
        [DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS capture](auto... args) { \
          static_assert((std::is_same_v<decltype(args), WideType<ElementType>> && ...)); \
          arithmetic; \
        }, \
        DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments); \
  }

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vv, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##vv, Widenvv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##vf, Widenvv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##wv, Widenwv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##wf, Widenwv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vw##name##vv, Widenvvw, return ({ __VA_ARGS__; }); \
      , \
      (SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (), \
      (src1, src2, src3))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##vx, Widenvv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vw##name##vx, Widenvvw, return ({ __VA_ARGS__; }); \
      , (SIMD128Register src1, ElementType src2, SIMD128Register src3), (), (src1, src2, src3))

#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##vv, Widenvvw, return ({ __VA_ARGS__; }); \
      , \
      (int8_t csr, SIMD128Register src1, SIMD128Register src2, SIMD128Register src3), \
      (csr), \
      (src1, src2, src3))

#define DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vfw##name##vf, Widenvvw, return ({ __VA_ARGS__; }); \
      , \
      (int8_t csr, SIMD128Register src1, ElementType src2, SIMD128Register src3), \
      (csr), \
      (src1, src2, src3))

#define DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vn##name##wv, Narrowwv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, SIMD128Register src2), (csr), (src1, src2))

#define DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC( \
      Vn##name##wx, Narrowwv, return ({ __VA_ARGS__; }); \
      , (int8_t csr, SIMD128Register src1, ElementType src2), (csr), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##wv, Widenwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, SIMD128Register src2), (), (src1, src2))

#define DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(name, ...) \
  DEFINE_W_ARITHMETIC_INTRINSIC(Vw##name##wx, Widenwv, return ({ __VA_ARGS__; }); \
                                , (SIMD128Register src1, ElementType src2), (), (src1, src2))

DEFINE_1OP_ARITHMETIC_INTRINSIC_V(copy, auto [arg] = std::tuple{args...}; arg)
DEFINE_1OP_ARITHMETIC_INTRINSIC_X(copy, auto [arg] = std::tuple{args...}; arg)
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(brev8, std::get<0>((Brev8(args...))))
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(frsqrt7, RSqrtEstimate(args...))
DEFINE_1OP_ARITHMETIC_INTRINSIC_V(
    fclass,
    static_cast<typename TypeTraits<ElementType>::Int>(std::get<0>(FClass(args...))))

DEFINE_1OP_1CSR_ARITHMETIC_INTRINSIC_V(fsqrt,
                                       CanonicalizeNanTuple(FSqrt(FPFlags::DYN, csr, args...)))

DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(add, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redsum, (args + ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(rsub, auto [arg1, arg2] = std::tuple{args...}; (arg2 - arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sub, (args - ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sub, (args - ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(and, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(and, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redand, (args & ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(or, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(or, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redor, (args | ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(xor, (args ^ ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(xor, (args ^ ...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redxor, (args ^ ...))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    aadd,
    ElementType{std::get<0>(Aadd(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    aadd,
    ElementType{std::get<0>(Aadd(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(smul, auto [arg1, arg2] = std::tuple{args...}; ElementType{
    Narrow(Saturating{std::get<0>(Roundoff(
        csr,
        static_cast<typename WideType<ElementType>::BaseType>(Widen(arg1) * Widen(arg2)),
        static_cast<typename WideType<ElementType>::BaseType>((sizeof(ElementType) * CHAR_BIT) -
                                                              1)))})})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(smul, auto [arg1, arg2] = std::tuple{args...}; ElementType{
    Narrow(Saturating{std::get<0>(Roundoff(
        csr,
        static_cast<typename WideType<ElementType>::BaseType>(Widen(arg1) * Widen(arg2)),
        static_cast<typename WideType<ElementType>::BaseType>((sizeof(ElementType) * CHAR_BIT) -
                                                              1)))})})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    ssr,
    ElementType{std::get<0>(Roundoff(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    ssr,
    ElementType{std::get<0>(Roundoff(csr, static_cast<typename ElementType::BaseType>(args)...))})

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fadd,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fadd,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV(
    mul,
    CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF(
    mul,
    CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(
    add,
    CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF(
    sub,
    CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))

DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fsub,
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fsub,
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(frsub, auto [arg1, arg2] = std::tuple{args...};
                                        CanonicalizeNanTuple(FSub(FPFlags::DYN, csr, arg2, arg1)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(osum,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(usum,
                                        CanonicalizeNanTuple(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(
    asub,
    ElementType{std::get<0>(Asub(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX(
    asub,
    ElementType{std::get<0>(Asub(csr, static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmul,
                                        CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmul,
                                        CanonicalizeNanTuple(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(fdiv,
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(fdiv,
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(frdiv, auto [arg1, arg2] = std::tuple{args...};
                                        CanonicalizeNanTuple(FDiv(FPFlags::DYN, csr, arg2, arg1)))
// Each element of a SIMD mask has either all bits set to 0 or all bits set to 1.
// This way it may be used with VAnd and VAndN operations to perform masking.
// Such a comparison is effectively one x86-64 instruction (via SSE or AVX) but
// to achieve it we need to multiply the bool result by (~IntType{0}) or (~ElementType{0}).
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(feq, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(feq, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fne, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(!std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fne, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(!std::get<0>(Feq(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(flt, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(flt, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Flt(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fle, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(args...))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fle, using IntType = typename TypeTraits<ElementType>::Int;
                                   (~IntType{0}) * IntType(std::get<0>(Fle(args...))))
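
// Illustration (not part of the original header): for ElementType = Float32 an equal pair of
// elements yields 0xffff'ffff and an unequal pair yields 0 in the corresponding lane, so the
// output of Vfeqvv can be combined with VAnd/VAndN as described above.
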
// Note: for floating point numbers Flt(b, a) and !Fle(a, b) produce different and incompatible
// results. IEEE754-2008 defines the NOT (!=) predicate as the negation of the EQ (==) predicate
// while GT (>) and GE (>=) are not negations of the LT (<) or LE (<=) predicates but instead use
// a swap of arguments. Note that the scalar form includes only three predicates (Feq, Flt, Fle)
// while the vector form includes Vmfgt.vf and Vmfge.vf instructions only for the vector+scalar
// case (the vector+vector case is supposed to be handled by swapping arguments). More here:
// https://github.com/riscv/riscv-v-spec/issues/300
1102 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fgt, auto [arg1, arg2] = std::tuple{args...};
1103 using IntType = typename TypeTraits<ElementType>::Int;
1104 (~IntType{0}) * IntType(std::get<0>(Flt(arg2, arg1))))
1105 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fge, auto [arg1, arg2] = std::tuple{args...};
1106 using IntType = typename TypeTraits<ElementType>::Int;
1107 (~IntType{0}) * IntType(std::get<0>(Fle(arg2, arg1))))
1108 DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(adc, auto [arg1, arg2, arg3] = std::tuple{args...};
1109 (arg2 + arg1 - arg3))
1110 DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(adc, auto [arg1, arg2, arg3] = std::tuple{args...};
1111 (arg2 + arg1 - arg3))
1112 DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(sbc, auto [arg1, arg2, arg3] = std::tuple{args...};
1113 (arg2 - arg1 + arg3))
1114 DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(sbc, auto [arg1, arg2, arg3] = std::tuple{args...};
1115 (arg2 - arg1 + arg3))
1116 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
1117 seq,
1118 (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args == ...))})
1119 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1120 seq,
1121 (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args == ...))})
1122 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
1123 sne,
1124 (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args != ...))})
1125 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1126 sne,
1127 (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args != ...))})
1128 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
1129 slt,
1130 (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args < ...))})
1131 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1132 slt,
1133 (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args < ...))})
1134 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
1135 sle,
1136 (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args <= ...))})
1137 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1138 sle,
1139 (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args <= ...))})
1140 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
1141 sgt,
1142 (~ElementType{0}) * ElementType{static_cast<typename ElementType::BaseType>((args > ...))})
1143 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sl, auto [arg1, arg2] = std::tuple{args...}; (arg1 << arg2))
1144 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sl, auto [arg1, arg2] = std::tuple{args...}; (arg1 << arg2))
1145 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(sr, auto [arg1, arg2] = std::tuple{args...}; (arg1 >> arg2))
1146 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(sr, auto [arg1, arg2] = std::tuple{args...}; (arg1 >> arg2))
1147 DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
1148 ((arg2 * arg1) + arg3))
1149 DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
1150 ((arg2 * arg1) + arg3))
1151 DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
1152 (-(arg2 * arg1) + arg3))
1153 DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
1154 (-(arg2 * arg1) + arg3))
1155 DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(madd, auto [arg1, arg2, arg3] = std::tuple{args...};
1156 ((arg2 * arg3) + arg1))
1157 DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(madd, auto [arg1, arg2, arg3] = std::tuple{args...};
1158 ((arg2 * arg3) + arg1))
1159 DEFINE_3OP_ARITHMETIC_INTRINSIC_VV(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
1160 (-(arg2 * arg3) + arg1))
1161 DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
1162 (-(arg2 * arg3) + arg1))
1163 DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
1164 std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
1165 DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
1166 std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
1167 DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
1168 std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
1169 DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
1170 std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
1171 DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
1172 std::get<0>(FMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
1173 DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
1174 std::get<0>(FMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
1175 DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
1176 std::get<0>(FMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
1177 DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
1178 std::get<0>(FMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
// Note: vfnmacc/vfnmadd negate both the product and the addend, matching the
// scalar fnmadd semantics (-(a * b) - c), so they map to FNMAdd, while
// vfnmsac/vfnmsub negate only the product (-(a * b) + c) and map to FNMSub.
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmadd, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMAdd(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV(fnmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))
DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF(fnmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                        std::get<0>(FNMSub(FPFlags::DYN, csr, arg3, arg2, arg1)))

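// Minimum/maximum, sign-injection, and min/max reductions. The VS forms
// implement the vfredmin/vfredmax/vredmin/vredmax reductions, which fold the
// active elements of a vector into a single element of the destination.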
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(fredmin, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(fredmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjn, std::get<0>(FSgnjn(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnjn, std::get<0>(FSgnjn(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjx, std::get<0>(FSgnjx(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnjx, std::get<0>(FSgnjx(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(min, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(min, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redmin, std::min(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(max, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(max, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(redmax, std::max(args...))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mul, auto [arg1, arg2] = std::tuple{args...}; (arg2 * arg1))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mul, auto [arg1, arg2] = std::tuple{args...}; (arg2 * arg1))
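// High-half multiplies: widen both operands to 2*SEW, multiply, then
// NarrowTopHalf keeps the upper SEW bits of the product. For mulhsu the first
// argument is treated as signed and the second as unsigned, as in vmulhsu.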
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mulh, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(Widen(arg2) * Widen(arg1)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mulh, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(Widen(arg2) * Widen(arg1)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(mulhsu, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(BitCastToUnsigned(Widen(BitCastToSigned(arg1))) *
                                                 Widen(BitCastToUnsigned(arg2))))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(mulhsu, auto [arg1, arg2] = std::tuple{args...};
                                   NarrowTopHalf(BitCastToUnsigned(Widen(BitCastToSigned(arg1))) *
                                                 Widen(BitCastToUnsigned(arg2))))
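// Division and remainder delegate to the scalar Div/Rem intrinsics on the
// element's BaseType; those helpers are expected to provide the RISC-V
// corner-case results (division by zero and signed overflow), so the vector
// forms inherit them element-wise.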
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    div,
    ElementType{std::get<0>(Div(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    div,
    ElementType{std::get<0>(Div(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(
    rem,
    ElementType{std::get<0>(Rem(static_cast<typename ElementType::BaseType>(args)...))})
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(
    rem,
    ElementType{std::get<0>(Rem(static_cast<typename ElementType::BaseType>(args)...))})

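// Widening arithmetic: the VV/VX forms widen both single-width sources to
// 2*SEW before operating; the WV/WX forms take an already-widened first
// operand, as in vwadd.wv/vwadd.wx.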
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(add, (args + ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX(sub, (args - ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(mul, (args * ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV(mulsu, std::get<0>(WideMultiplySignedUnsigned(args...)))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(mul, (args * ...))
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX(mulsu, std::get<0>(WideMultiplySignedUnsigned(args...)))

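// Widening multiply-accumulate: products of the single-width sources are
// accumulated into a 2*SEW destination (arg3). maccsu multiplies a signed
// operand by an unsigned one; maccus (VX only) swaps which operand is signed.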
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (arg1 * arg2) + arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(macc, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (arg1 * arg2) + arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW(maccsu, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg2, arg1))) +
                                              arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccsu, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg2, arg1))) +
                                              arg3)
DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW(maccus, auto [arg1, arg2, arg3] = std::tuple{args...};
                                          (std::get<0>(WideMultiplySignedUnsigned(arg1, arg2))) +
                                              arg3)
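// Widening floating-point fused multiply-accumulate (vfwmacc and friends):
// single-width sources with a 2*SEW accumulator, using the same
// FMAdd/FMSub/FNMAdd/FNMSub mapping as the single-width forms above.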
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    macc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    macc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    nmacc, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMAdd(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    msac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    msac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW(
    nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW(
    nmsac, auto [arg1, arg2, arg3] = std::tuple{args...};
    std::get<0>(FNMSub(FPFlags::DYN, csr, arg2, arg1, arg3)))
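// Narrowing operations take a 2*SEW first operand and produce single-width
// results. The shifts truncate after shifting; clip additionally rounds via
// Roundoff with the rounding mode taken from csr (vxrm), following the
// vnclip/vnclipu round-and-saturate semantics.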
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV(sr, auto [arg1, arg2] = std::tuple{args...};
                                          (arg1 >> arg2))
DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX(sr, auto [arg1, arg2] = std::tuple{args...};
                                          (arg1 >> arg2))
DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV(
    clip,
    WideType<ElementType>{(std::get<0>(
        Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))})
DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX(
    clip,
    WideType<ElementType>{(std::get<0>(
        Roundoff(csr, static_cast<typename WideType<ElementType>::BaseType>(args)...)))})

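// The DEFINE_* helper macros are implementation details of this header;
// undefine them so they do not leak into translation units that include it.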
#undef DEFINE_ARITHMETIC_INTRINSIC
#undef DEFINE_W_ARITHMETIC_INTRINSIC
#undef DEFINE_ARITHMETIC_REDUCE_INTRINSIC
#undef DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_V
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_X
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_1CSR_NARROW_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_3OP_1CSR_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VF
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_WF
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VVW
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_WX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_2OP_WIDEN_ARITHMETIC_INTRINSIC_VXW
#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VVW
#undef DEFINE_3OP_1CSR_WIDEN_ARITHMETIC_INTRINSIC_VXW

}  // namespace berberis::intrinsics

#endif  // BERBERIS_INTRINSICS_RISCV64_TO_ALL_VECTOR_INTRINSICS_H_