1 // Copyright 2019, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 // * Redistributions of source code must retain the above copyright notice,
8 // this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright notice,
10 // this list of conditions and the following disclaimer in the documentation
11 // and/or other materials provided with the distribution.
12 // * Neither the name of ARM Limited nor the names of its contributors may be
13 // used to endorse or promote products derived from this software without
14 // specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27 #include <sys/mman.h>
28 #include <unistd.h>
29
30 #include <cfloat>
31 #include <cmath>
32 #include <cstdio>
33 #include <cstdlib>
34 #include <cstring>
35 #include <functional>
36
37 #include "test-runner.h"
38 #include "test-utils.h"
39 #include "aarch64/test-utils-aarch64.h"
40
41 #include "aarch64/cpu-aarch64.h"
42 #include "aarch64/disasm-aarch64.h"
43 #include "aarch64/macro-assembler-aarch64.h"
44 #include "aarch64/simulator-aarch64.h"
45 #include "test-assembler-aarch64.h"
46
47 #define TEST_SVE(name) TEST_SVE_INNER("ASM", name)
48
49 namespace vixl {
50 namespace aarch64 {
51
52 // Conveniently initialise P registers with scalar bit patterns. The destination
53 // lane size is ignored. This is optimised for call-site clarity, not generated
54 // code quality.
55 //
56 // Usage:
57 //
58 // Initialise(&masm, p0, 0x1234); // Sets p0 = 0b'0001'0010'0011'0100
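//
// A sketch of the multi-chunk overloads below (each extra value fills the
// next 64-bit chunk of the predicate, and is only meaningful on vectors with
// that many predicate bits):
//
//   Initialise(&masm, p0, 0xff, 0x0123456789abcdef);
//   // Sets predicate bits [63:0] = 0x0123456789abcdef and [127:64] = 0xff;
//   // any remaining bits are zero.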
59 void Initialise(MacroAssembler* masm,
60 const PRegister& pd,
61 uint64_t value3,
62 uint64_t value2,
63 uint64_t value1,
64 uint64_t value0) {
65 // Generate a literal pool, as in the array form.
66 UseScratchRegisterScope temps(masm);
67 Register temp = temps.AcquireX();
68 Label data;
69 Label done;
70
71 masm->Adr(temp, &data);
72 masm->Ldr(pd, SVEMemOperand(temp));
73 masm->B(&done);
74 {
75 ExactAssemblyScope total(masm, kPRegMaxSizeInBytes);
76 masm->bind(&data);
77 masm->dc64(value0);
78 masm->dc64(value1);
79 masm->dc64(value2);
80 masm->dc64(value3);
81 }
82 masm->Bind(&done);
83 }
84 void Initialise(MacroAssembler* masm,
85 const PRegister& pd,
86 uint64_t value2,
87 uint64_t value1,
88 uint64_t value0) {
89 Initialise(masm, pd, 0, value2, value1, value0);
90 }
91 void Initialise(MacroAssembler* masm,
92 const PRegister& pd,
93 uint64_t value1,
94 uint64_t value0) {
95 Initialise(masm, pd, 0, 0, value1, value0);
96 }
97 void Initialise(MacroAssembler* masm, const PRegister& pd, uint64_t value0) {
98 Initialise(masm, pd, 0, 0, 0, value0);
99 }
100
101 // Conveniently initialise P registers by lane. This is optimised for call-site
102 // clarity, not generated code quality.
103 //
104 // Usage:
105 //
106 // int values[] = { 0x0, 0x1, 0x2 };
107 // Initialise(&masm, p0.VnS(), values); // Sets p0 = 0b'0000'0001'0010
108 //
109 // The rightmost (highest-indexed) array element maps to the lowest-numbered
110 // lane. Unspecified lanes are set to 0 (inactive).
111 //
112 // Each element of the `values` array is mapped onto a lane in `pd`. The
113 // architecture only respects the lowest bit, and writes zero to the upper bits,
// but other (encodable) values can be specified if required by the test.
115 template <typename T, size_t N>
116 void Initialise(MacroAssembler* masm,
117 const PRegisterWithLaneSize& pd,
118 const T (&values)[N]) {
119 // Turn the array into 64-bit chunks.
120 uint64_t chunks[4] = {0, 0, 0, 0};
121 VIXL_STATIC_ASSERT(sizeof(chunks) == kPRegMaxSizeInBytes);
122
123 int p_bits_per_lane = pd.GetLaneSizeInBits() / kZRegBitsPerPRegBit;
124 VIXL_ASSERT((64 % p_bits_per_lane) == 0);
125 VIXL_ASSERT((N * p_bits_per_lane) <= kPRegMaxSize);
126
127 uint64_t p_lane_mask = GetUintMask(p_bits_per_lane);
128
129 VIXL_STATIC_ASSERT(N <= kPRegMaxSize);
130 size_t bit = 0;
131 for (int n = static_cast<int>(N - 1); n >= 0; n--) {
132 VIXL_ASSERT(bit < (sizeof(chunks) * kBitsPerByte));
133 uint64_t value = values[n] & p_lane_mask;
134 chunks[bit / 64] |= value << (bit % 64);
135 bit += p_bits_per_lane;
136 }
137
138 Initialise(masm, pd, chunks[3], chunks[2], chunks[1], chunks[0]);
139 }
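
// For reference, a worked example of the packing above: S-sized lanes each
// occupy 32 / kZRegBitsPerPRegBit = 4 predicate bits, so
//
//   int values[] = {0xc, 0x3};
//   Initialise(&masm, p0.VnS(), values);
//
// packs chunks[0] = (0xc << 4) | 0x3 = 0xc3, giving lane 0 the value 0x3 and
// lane 1 the value 0xc (of which only the lowest bit of each lane is
// architecturally significant).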
140
141 // Ensure that basic test infrastructure works.
142 TEST_SVE(sve_test_infrastructure_z) {
143 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
144 START();
145
146 __ Mov(x0, 0x0123456789abcdef);
147
148 // Test basic `Insr` behaviour.
149 __ Insr(z0.VnB(), 1);
150 __ Insr(z0.VnB(), 2);
151 __ Insr(z0.VnB(), x0);
152 __ Insr(z0.VnB(), -42);
153 __ Insr(z0.VnB(), 0);
154
155 // Test array inputs.
156 int z1_inputs[] = {3, 4, 5, -42, 0};
157 InsrHelper(&masm, z1.VnH(), z1_inputs);
158
159 // Test that sign-extension works as intended for various lane sizes.
160 __ Dup(z2.VnD(), 0); // Clear the register first.
161 __ Insr(z2.VnB(), -42); // 0xd6
162 __ Insr(z2.VnB(), 0xfe); // 0xfe
163 __ Insr(z2.VnH(), -42); // 0xffd6
164 __ Insr(z2.VnH(), 0xfedc); // 0xfedc
165 __ Insr(z2.VnS(), -42); // 0xffffffd6
166 __ Insr(z2.VnS(), 0xfedcba98); // 0xfedcba98
167 // Use another register for VnD(), so we can support 128-bit Z registers.
168 __ Insr(z3.VnD(), -42); // 0xffffffffffffffd6
169 __ Insr(z3.VnD(), 0xfedcba9876543210); // 0xfedcba9876543210
170
171 END();
172
173 if (CAN_RUN()) {
174 RUN();
175
176 // Test that array checks work properly on a register initialised
177 // lane-by-lane.
178 int z0_inputs_b[] = {0x01, 0x02, 0xef, 0xd6, 0x00};
179 ASSERT_EQUAL_SVE(z0_inputs_b, z0.VnB());
180
181 // Test that lane-by-lane checks work properly on a register initialised
182 // by array.
183 for (size_t i = 0; i < ArrayLength(z1_inputs); i++) {
184 // The rightmost (highest-indexed) array element maps to the
185 // lowest-numbered lane.
186 int lane = static_cast<int>(ArrayLength(z1_inputs) - i - 1);
187 ASSERT_EQUAL_SVE_LANE(z1_inputs[i], z1.VnH(), lane);
188 }
189
190 uint64_t z2_inputs_d[] = {0x0000d6feffd6fedc, 0xffffffd6fedcba98};
191 ASSERT_EQUAL_SVE(z2_inputs_d, z2.VnD());
192 uint64_t z3_inputs_d[] = {0xffffffffffffffd6, 0xfedcba9876543210};
193 ASSERT_EQUAL_SVE(z3_inputs_d, z3.VnD());
194 }
195 }
196
197 // Ensure that basic test infrastructure works.
198 TEST_SVE(sve_test_infrastructure_p) {
199 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
200 START();
201
202 // Simple cases: move boolean (0 or 1) values.
203
204 int p0_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
205 Initialise(&masm, p0.VnB(), p0_inputs);
206
207 int p1_inputs[] = {1, 0, 1, 1, 0, 1, 1, 1};
208 Initialise(&masm, p1.VnH(), p1_inputs);
209
210 int p2_inputs[] = {1, 1, 0, 1};
211 Initialise(&masm, p2.VnS(), p2_inputs);
212
213 int p3_inputs[] = {0, 1};
214 Initialise(&masm, p3.VnD(), p3_inputs);
215
216 // Advanced cases: move numeric values into the architecturally-ignored bits.
217
218 // B-sized lanes get one bit in a P register, so there are no ignored bits.
219
220 // H-sized lanes get two bits in a P register.
221 int p4_inputs[] = {0x3, 0x2, 0x1, 0x0, 0x1, 0x2, 0x3};
222 Initialise(&masm, p4.VnH(), p4_inputs);
223
224 // S-sized lanes get four bits in a P register.
225 int p5_inputs[] = {0xc, 0x7, 0x9, 0x6, 0xf};
226 Initialise(&masm, p5.VnS(), p5_inputs);
227
228 // D-sized lanes get eight bits in a P register.
229 int p6_inputs[] = {0x81, 0xcc, 0x55};
230 Initialise(&masm, p6.VnD(), p6_inputs);
231
232 // The largest possible P register has 32 bytes.
233 int p7_inputs[] = {0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
234 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
235 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
236 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f};
237 Initialise(&masm, p7.VnD(), p7_inputs);
238
239 END();
240
241 if (CAN_RUN()) {
242 RUN();
243
244 // Test that lane-by-lane checks work properly. The rightmost
245 // (highest-indexed) array element maps to the lowest-numbered lane.
246 for (size_t i = 0; i < ArrayLength(p0_inputs); i++) {
247 int lane = static_cast<int>(ArrayLength(p0_inputs) - i - 1);
248 ASSERT_EQUAL_SVE_LANE(p0_inputs[i], p0.VnB(), lane);
249 }
250 for (size_t i = 0; i < ArrayLength(p1_inputs); i++) {
251 int lane = static_cast<int>(ArrayLength(p1_inputs) - i - 1);
252 ASSERT_EQUAL_SVE_LANE(p1_inputs[i], p1.VnH(), lane);
253 }
254 for (size_t i = 0; i < ArrayLength(p2_inputs); i++) {
255 int lane = static_cast<int>(ArrayLength(p2_inputs) - i - 1);
256 ASSERT_EQUAL_SVE_LANE(p2_inputs[i], p2.VnS(), lane);
257 }
258 for (size_t i = 0; i < ArrayLength(p3_inputs); i++) {
259 int lane = static_cast<int>(ArrayLength(p3_inputs) - i - 1);
260 ASSERT_EQUAL_SVE_LANE(p3_inputs[i], p3.VnD(), lane);
261 }
262
263 // Test that array checks work properly on predicates initialised with a
264 // possibly-different lane size.
265 // 0b...11'10'01'00'01'10'11
266 int p4_expected[] = {0x39, 0x1b};
267 ASSERT_EQUAL_SVE(p4_expected, p4.VnD());
268
269 ASSERT_EQUAL_SVE(p5_inputs, p5.VnS());
270
271 // 0b...10000001'11001100'01010101
272 int p6_expected[] = {2, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1};
273 ASSERT_EQUAL_SVE(p6_expected, p6.VnH());
274
275 // 0b...10011100'10011101'10011110'10011111
276 int p7_expected[] = {1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
277 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1};
278 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
279 }
280 }
281
282 // Test that writes to V registers clear the high bits of the corresponding Z
283 // register.
284 TEST_SVE(sve_v_write_clear) {
285 SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON,
286 CPUFeatures::kFP,
287 CPUFeatures::kSVE);
288 START();
289
290 // The Simulator has two mechanisms for writing V registers:
291 // - Write*Register, calling through to SimRegisterBase::Write.
292 // - LogicVRegister::ClearForWrite followed by one or more lane updates.
293 // Try to cover both variants.
294
295 // Prepare some known inputs.
296 uint8_t data[kQRegSizeInBytes];
297 for (size_t i = 0; i < kQRegSizeInBytes; i++) {
298 data[i] = 42 + i;
299 }
300 __ Mov(x10, reinterpret_cast<uintptr_t>(data));
301 __ Fmov(d30, 42.0);
302
303 // Use Index to label the lane indices, so failures are easy to detect and
304 // diagnose.
305 __ Index(z0.VnB(), 0, 1);
306 __ Index(z1.VnB(), 0, 1);
307 __ Index(z2.VnB(), 0, 1);
308 __ Index(z3.VnB(), 0, 1);
309 __ Index(z4.VnB(), 0, 1);
310
311 __ Index(z10.VnB(), 0, -1);
312 __ Index(z11.VnB(), 0, -1);
313 __ Index(z12.VnB(), 0, -1);
314 __ Index(z13.VnB(), 0, -1);
315 __ Index(z14.VnB(), 0, -1);
316
317 // Instructions using Write*Register (and SimRegisterBase::Write).
318 __ Ldr(b0, MemOperand(x10));
319 __ Fcvt(h1, d30);
320 __ Fmov(s2, 1.5f);
321 __ Fmov(d3, d30);
322 __ Ldr(q4, MemOperand(x10));
323
324 // Instructions using LogicVRegister::ClearForWrite.
325 // These also (incidentally) test that across-lane instructions correctly
326 // ignore the high-order Z register lanes.
327 __ Sminv(b10, v10.V16B());
328 __ Addv(h11, v11.V4H());
329 __ Saddlv(s12, v12.V8H());
330 __ Dup(v13.V8B(), b13, kDRegSizeInBytes);
331 __ Uaddl(v14.V8H(), v14.V8B(), v14.V8B());
332
333 END();
334
335 if (CAN_RUN()) {
336 RUN();
337
338 // Check the Q part first.
339 ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000002a, v0);
340 ASSERT_EQUAL_128(0x0000000000000000, 0x0000000000005140, v1); // 42.0 (f16)
341 ASSERT_EQUAL_128(0x0000000000000000, 0x000000003fc00000, v2); // 1.5 (f32)
342 ASSERT_EQUAL_128(0x0000000000000000, 0x4045000000000000, v3); // 42.0 (f64)
343 ASSERT_EQUAL_128(0x3938373635343332, 0x31302f2e2d2c2b2a, v4);
344 ASSERT_EQUAL_128(0x0000000000000000, 0x00000000000000f1, v10); // -15
345 // 0xf9fa + 0xfbfc + 0xfdfe + 0xff00 -> 0xf2f4
346 ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000f2f4, v11);
347 // 0xfffff1f2 + 0xfffff3f4 + ... + 0xfffffdfe + 0xffffff00 -> 0xffffc6c8
348 ASSERT_EQUAL_128(0x0000000000000000, 0x00000000ffffc6c8, v12);
349 ASSERT_EQUAL_128(0x0000000000000000, 0xf8f8f8f8f8f8f8f8, v13); // [-8] x 8
350 // [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
351 // + [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
352 // -> [0x01f2, 0x01f4, 0x01f6, 0x01f8, 0x01fa, 0x01fc, 0x01fe, 0x0000]
353 ASSERT_EQUAL_128(0x01f201f401f601f8, 0x01fa01fc01fe0000, v14);
354
355 // Check that the upper lanes are all clear.
356 for (int i = kQRegSizeInBytes; i < core.GetSVELaneCount(kBRegSize); i++) {
357 ASSERT_EQUAL_SVE_LANE(0x00, z0.VnB(), i);
358 ASSERT_EQUAL_SVE_LANE(0x00, z1.VnB(), i);
359 ASSERT_EQUAL_SVE_LANE(0x00, z2.VnB(), i);
360 ASSERT_EQUAL_SVE_LANE(0x00, z3.VnB(), i);
361 ASSERT_EQUAL_SVE_LANE(0x00, z4.VnB(), i);
362 ASSERT_EQUAL_SVE_LANE(0x00, z10.VnB(), i);
363 ASSERT_EQUAL_SVE_LANE(0x00, z11.VnB(), i);
364 ASSERT_EQUAL_SVE_LANE(0x00, z12.VnB(), i);
365 ASSERT_EQUAL_SVE_LANE(0x00, z13.VnB(), i);
366 ASSERT_EQUAL_SVE_LANE(0x00, z14.VnB(), i);
367 }
368 }
369 }
370
371 static void MlaMlsHelper(Test* config, unsigned lane_size_in_bits) {
372 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
373 START();
374
375 int zd_inputs[] = {0xbb, 0xcc, 0xdd, 0xee};
376 int za_inputs[] = {-39, 1, -3, 2};
377 int zn_inputs[] = {-5, -20, 9, 8};
378 int zm_inputs[] = {9, -5, 4, 5};
379
380 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
381 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
382 ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
383 ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
384
385 // TODO: Use a simple `Dup` once it accepts arbitrary immediates.
386 InsrHelper(&masm, zd, zd_inputs);
387 InsrHelper(&masm, za, za_inputs);
388 InsrHelper(&masm, zn, zn_inputs);
389 InsrHelper(&masm, zm, zm_inputs);
390
391 int p0_inputs[] = {1, 1, 0, 1};
392 int p1_inputs[] = {1, 0, 1, 1};
393 int p2_inputs[] = {0, 1, 1, 1};
394 int p3_inputs[] = {1, 1, 1, 0};
395
396 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), p0_inputs);
397 Initialise(&masm, p1.WithLaneSize(lane_size_in_bits), p1_inputs);
398 Initialise(&masm, p2.WithLaneSize(lane_size_in_bits), p2_inputs);
399 Initialise(&masm, p3.WithLaneSize(lane_size_in_bits), p3_inputs);
400
401 // The Mla macro automatically selects between mla, mad and movprfx + mla
402 // based on what registers are aliased.
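// A sketch of the expected selection (the exact expansion is up to the
// macro assembler):
//
//   Mla(zda, pg, zda, zn, zm);  // zd aliases za: a single, merging mla.
//   Mla(zd, pg, za, zd, zm);    // zd aliases zn: expressible as mad.
//   Mla(zd, pg, za, zn, zm);    // No aliasing: movprfx into zd, then mla.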
403 ZRegister mla_da_result = z10.WithLaneSize(lane_size_in_bits);
404 ZRegister mla_dn_result = z11.WithLaneSize(lane_size_in_bits);
405 ZRegister mla_dm_result = z12.WithLaneSize(lane_size_in_bits);
406 ZRegister mla_d_result = z13.WithLaneSize(lane_size_in_bits);
407
408 __ Mov(mla_da_result, za);
409 __ Mla(mla_da_result, p0.Merging(), mla_da_result, zn, zm);
410
411 __ Mov(mla_dn_result, zn);
412 __ Mla(mla_dn_result, p1.Merging(), za, mla_dn_result, zm);
413
414 __ Mov(mla_dm_result, zm);
415 __ Mla(mla_dm_result, p2.Merging(), za, zn, mla_dm_result);
416
417 __ Mov(mla_d_result, zd);
418 __ Mla(mla_d_result, p3.Merging(), za, zn, zm);
419
420 // The Mls macro automatically selects between mls, msb and movprfx + mls
421 // based on what registers are aliased.
422 ZRegister mls_da_result = z20.WithLaneSize(lane_size_in_bits);
423 ZRegister mls_dn_result = z21.WithLaneSize(lane_size_in_bits);
424 ZRegister mls_dm_result = z22.WithLaneSize(lane_size_in_bits);
425 ZRegister mls_d_result = z23.WithLaneSize(lane_size_in_bits);
426
427 __ Mov(mls_da_result, za);
428 __ Mls(mls_da_result, p0.Merging(), mls_da_result, zn, zm);
429
430 __ Mov(mls_dn_result, zn);
431 __ Mls(mls_dn_result, p1.Merging(), za, mls_dn_result, zm);
432
433 __ Mov(mls_dm_result, zm);
434 __ Mls(mls_dm_result, p2.Merging(), za, zn, mls_dm_result);
435
436 __ Mov(mls_d_result, zd);
437 __ Mls(mls_d_result, p3.Merging(), za, zn, zm);
438
439 END();
440
441 if (CAN_RUN()) {
442 RUN();
443
444 ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
445 ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits));
446 ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits));
447
448 int mla[] = {-84, 101, 33, 42};
449 int mls[] = {6, -99, -39, -38};
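// These follow from mla[i] = za[i] + (zn[i] * zm[i]) and
// mls[i] = za[i] - (zn[i] * zm[i]); for example, the first elements give
// -39 + (-5 * 9) = -84 and -39 - (-5 * 9) = 6.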
450
451 int mla_da_expected[] = {mla[0], mla[1], za_inputs[2], mla[3]};
452 ASSERT_EQUAL_SVE(mla_da_expected, mla_da_result);
453
454 int mla_dn_expected[] = {mla[0], zn_inputs[1], mla[2], mla[3]};
455 ASSERT_EQUAL_SVE(mla_dn_expected, mla_dn_result);
456
457 int mla_dm_expected[] = {zm_inputs[0], mla[1], mla[2], mla[3]};
458 ASSERT_EQUAL_SVE(mla_dm_expected, mla_dm_result);
459
460 int mla_d_expected[] = {mla[0], mla[1], mla[2], zd_inputs[3]};
461 ASSERT_EQUAL_SVE(mla_d_expected, mla_d_result);
462
463 int mls_da_expected[] = {mls[0], mls[1], za_inputs[2], mls[3]};
464 ASSERT_EQUAL_SVE(mls_da_expected, mls_da_result);
465
466 int mls_dn_expected[] = {mls[0], zn_inputs[1], mls[2], mls[3]};
467 ASSERT_EQUAL_SVE(mls_dn_expected, mls_dn_result);
468
469 int mls_dm_expected[] = {zm_inputs[0], mls[1], mls[2], mls[3]};
470 ASSERT_EQUAL_SVE(mls_dm_expected, mls_dm_result);
471
472 int mls_d_expected[] = {mls[0], mls[1], mls[2], zd_inputs[3]};
473 ASSERT_EQUAL_SVE(mls_d_expected, mls_d_result);
474 }
475 }
476
477 TEST_SVE(sve_mla_mls_b) { MlaMlsHelper(config, kBRegSize); }
478 TEST_SVE(sve_mla_mls_h) { MlaMlsHelper(config, kHRegSize); }
479 TEST_SVE(sve_mla_mls_s) { MlaMlsHelper(config, kSRegSize); }
480 TEST_SVE(sve_mla_mls_d) { MlaMlsHelper(config, kDRegSize); }
481
482 TEST_SVE(sve_bitwise_unpredicate_logical) {
483 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
484 START();
485
486 uint64_t z8_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
487 InsrHelper(&masm, z8.VnD(), z8_inputs);
488 uint64_t z15_inputs[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff};
489 InsrHelper(&masm, z15.VnD(), z15_inputs);
490
491 __ And(z1.VnD(), z8.VnD(), z15.VnD());
492 __ Bic(z2.VnD(), z8.VnD(), z15.VnD());
493 __ Eor(z3.VnD(), z8.VnD(), z15.VnD());
494 __ Orr(z4.VnD(), z8.VnD(), z15.VnD());
495
496 END();
497
498 if (CAN_RUN()) {
499 RUN();
500 uint64_t z1_expected[] = {0xfedcaa8854540000, 0x0000454588aacdef};
501 uint64_t z2_expected[] = {0x0000101022003210, 0x0123002201010000};
502 uint64_t z3_expected[] = {0x01235476ab89fedc, 0xcdef98ba67453210};
503 uint64_t z4_expected[] = {0xfffffefeffddfedc, 0xcdefddffefefffff};
504
505 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
506 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
507 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
508 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
509 }
510 }
511
512 TEST_SVE(sve_last_r) {
513 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
514 START();
515
516 __ Pfalse(p1.VnB());
517 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
518 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
519 Initialise(&masm, p2.VnB(), p2_inputs);
520 Initialise(&masm, p3.VnB(), p3_inputs);
521 __ Ptrue(p4.VnB());
522
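// For reference: Lasta extracts the element after the last active one, and
// Lastb extracts the last active element itself. With an all-false governing
// predicate (p1), Lasta reads element 0 and Lastb reads the last element, so
// the Lastb results below depend on the vector length (checked per-VL in the
// switch statement).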
523 __ Index(z0.VnB(), 0x10, 1);
524 __ Lasta(x1, p1, z0.VnB());
525 __ Lastb(x2, p1, z0.VnB());
526 __ Lasta(x3, p2, z0.VnB());
527 __ Lastb(x4, p2, z0.VnB());
528 __ Lasta(x5, p3, z0.VnB());
529 __ Lastb(x6, p3, z0.VnB());
530 __ Lasta(x7, p4, z0.VnB());
531
532 __ Punpklo(p3.VnH(), p3.VnB());
533 __ Index(z0.VnH(), 0x1110, 1);
534 __ Lasta(x9, p1, z0.VnH());
535 __ Lastb(x10, p3, z0.VnH());
536 __ Lasta(x12, p4, z0.VnH());
537
538 __ Index(z0.VnS(), 0x11111110, 1);
539 __ Lastb(x13, p1, z0.VnS());
540 __ Lasta(x14, p2, z0.VnS());
541 __ Lastb(x18, p4, z0.VnS());
542
543 __ Index(z0.VnD(), 0x1111111111111110, 1);
544 __ Lasta(x19, p1, z0.VnD());
545 __ Lastb(x20, p3, z0.VnD());
546 __ Lasta(x21, p3, z0.VnD());
547 END();
548
549 if (CAN_RUN()) {
550 RUN();
551
552 ASSERT_EQUAL_64(0x0000000000000010, x1);
553 ASSERT_EQUAL_64(0x0000000000000011, x3);
554 ASSERT_EQUAL_64(0x0000000000000010, x4);
555 ASSERT_EQUAL_64(0x0000000000000019, x5);
556 ASSERT_EQUAL_64(0x0000000000000018, x6);
557 ASSERT_EQUAL_64(0x0000000000000010, x7);
558 ASSERT_EQUAL_64(0x0000000000001110, x9);
559 ASSERT_EQUAL_64(0x0000000000001110, x12);
560 ASSERT_EQUAL_64(0x0000000011111111, x14);
561 ASSERT_EQUAL_64(0x1111111111111110, x19);
562
563 int vl = core.GetSVELaneCount(kBRegSize) * 8;
564 switch (vl) {
565 case 128:
566 ASSERT_EQUAL_64(0x000000000000001f, x2);
567 ASSERT_EQUAL_64(0x0000000000001116, x10);
568 ASSERT_EQUAL_64(0x0000000011111113, x13);
569 ASSERT_EQUAL_64(0x0000000011111113, x18);
570 ASSERT_EQUAL_64(0x1111111111111111, x20);
571 ASSERT_EQUAL_64(0x1111111111111110, x21);
572 break;
573 case 384:
574 ASSERT_EQUAL_64(0x000000000000003f, x2);
575 ASSERT_EQUAL_64(0x0000000000001118, x10);
576 ASSERT_EQUAL_64(0x000000001111111b, x13);
577 ASSERT_EQUAL_64(0x000000001111111b, x18);
578 ASSERT_EQUAL_64(0x1111111111111112, x20);
579 ASSERT_EQUAL_64(0x1111111111111113, x21);
580 break;
581 case 2048:
582 ASSERT_EQUAL_64(0x000000000000000f, x2);
583 ASSERT_EQUAL_64(0x0000000000001118, x10);
584 ASSERT_EQUAL_64(0x000000001111114f, x13);
585 ASSERT_EQUAL_64(0x000000001111114f, x18);
586 ASSERT_EQUAL_64(0x1111111111111112, x20);
587 ASSERT_EQUAL_64(0x1111111111111113, x21);
588 break;
589 default:
590 printf("WARNING: Some tests skipped due to unexpected VL.\n");
591 break;
592 }
593 }
594 }
595
596 TEST_SVE(sve_last_v) {
597 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
598 START();
599
600 __ Pfalse(p1.VnB());
601 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
602 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
603 Initialise(&masm, p2.VnB(), p2_inputs);
604 Initialise(&masm, p3.VnB(), p3_inputs);
605 __ Ptrue(p4.VnB());
606
607 __ Index(z0.VnB(), 0x10, 1);
608 __ Lasta(b1, p1, z0.VnB());
609 __ Lastb(b2, p1, z0.VnB());
610 __ Lasta(b3, p2, z0.VnB());
611 __ Lastb(b4, p2, z0.VnB());
612 __ Lasta(b5, p3, z0.VnB());
613 __ Lastb(b6, p3, z0.VnB());
614 __ Lasta(b7, p4, z0.VnB());
615
616 __ Punpklo(p3.VnH(), p3.VnB());
617 __ Index(z0.VnH(), 0x1110, 1);
618 __ Lasta(h9, p1, z0.VnH());
619 __ Lastb(h10, p3, z0.VnH());
620 __ Lasta(h12, p4, z0.VnH());
621
622 __ Index(z0.VnS(), 0x11111110, 1);
623 __ Lastb(s13, p1, z0.VnS());
624 __ Lasta(s14, p2, z0.VnS());
625 __ Lastb(s18, p4, z0.VnS());
626
627 __ Index(z0.VnD(), 0x1111111111111110, 1);
628 __ Lasta(d19, p1, z0.VnD());
629 __ Lastb(d20, p3, z0.VnD());
630 __ Lasta(d21, p3, z0.VnD());
631 END();
632
633 if (CAN_RUN()) {
634 RUN();
635
636 ASSERT_EQUAL_128(0, 0x0000000000000010, q1);
637 ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
638 ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
639 ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
640 ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
641 ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
642 ASSERT_EQUAL_128(0, 0x0000000000001110, q9);
643 ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
644 ASSERT_EQUAL_128(0, 0x0000000011111111, q14);
645 ASSERT_EQUAL_128(0, 0x1111111111111110, q19);
646
647 int vl = core.GetSVELaneCount(kBRegSize) * 8;
648 switch (vl) {
649 case 128:
650 ASSERT_EQUAL_128(0, 0x000000000000001f, q2);
651 ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
652 ASSERT_EQUAL_128(0, 0x0000000011111113, q13);
653 ASSERT_EQUAL_128(0, 0x0000000011111113, q18);
654 ASSERT_EQUAL_128(0, 0x1111111111111111, q20);
655 ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
656 break;
657 case 384:
658 ASSERT_EQUAL_128(0, 0x000000000000003f, q2);
659 ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
660 ASSERT_EQUAL_128(0, 0x000000001111111b, q13);
661 ASSERT_EQUAL_128(0, 0x000000001111111b, q18);
662 ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
663 ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
664 break;
665 case 2048:
666 ASSERT_EQUAL_128(0, 0x000000000000000f, q2);
667 ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
668 ASSERT_EQUAL_128(0, 0x000000001111114f, q13);
669 ASSERT_EQUAL_128(0, 0x000000001111114f, q18);
670 ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
671 ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
672 break;
673 default:
674 printf("WARNING: Some tests skipped due to unexpected VL.\n");
675 break;
676 }
677 }
678 }
679
680 TEST_SVE(sve_clast_r) {
681 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
682 START();
683
684 __ Pfalse(p1.VnB());
685 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
686 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
687 Initialise(&masm, p2.VnB(), p2_inputs);
688 Initialise(&masm, p3.VnB(), p3_inputs);
689 __ Ptrue(p4.VnB());
690
691 __ Index(z0.VnB(), 0x10, 1);
692 __ Mov(x1, -1);
693 __ Mov(x2, -1);
694 __ Clasta(x1, p1, x1, z0.VnB());
695 __ Clastb(x2, p1, x2, z0.VnB());
696 __ Clasta(x3, p2, x3, z0.VnB());
697 __ Clastb(x4, p2, x4, z0.VnB());
698 __ Clasta(x5, p3, x5, z0.VnB());
699 __ Clastb(x6, p3, x6, z0.VnB());
700 __ Clasta(x7, p4, x7, z0.VnB());
701
702 __ Punpklo(p3.VnH(), p3.VnB());
703 __ Index(z0.VnH(), 0x1110, 1);
704 __ Mov(x9, -1);
705 __ Clasta(x9, p1, x9, z0.VnH());
706 __ Clastb(x10, p3, x10, z0.VnH());
707 __ Clasta(x12, p4, x12, z0.VnH());
708
709 __ Index(z0.VnS(), 0x11111110, 1);
710 __ Mov(x13, -1);
711 __ Clasta(x13, p1, x13, z0.VnS());
712 __ Clastb(x14, p2, x14, z0.VnS());
713 __ Clasta(x18, p4, x18, z0.VnS());
714
715 __ Index(z0.VnD(), 0x1111111111111110, 1);
716 __ Mov(x19, -1);
717 __ Clasta(x19, p1, x19, z0.VnD());
718 __ Clastb(x20, p2, x20, z0.VnD());
719 __ Clasta(x21, p4, x21, z0.VnD());
720 END();
721
722 if (CAN_RUN()) {
723 RUN();
724 ASSERT_EQUAL_64(0x00000000000000ff, x1);
725 ASSERT_EQUAL_64(0x00000000000000ff, x2);
726 ASSERT_EQUAL_64(0x0000000000000011, x3);
727 ASSERT_EQUAL_64(0x0000000000000010, x4);
728 ASSERT_EQUAL_64(0x0000000000000019, x5);
729 ASSERT_EQUAL_64(0x0000000000000018, x6);
730 ASSERT_EQUAL_64(0x0000000000000010, x7);
731 ASSERT_EQUAL_64(0x000000000000ffff, x9);
732 ASSERT_EQUAL_64(0x0000000000001110, x12);
733 ASSERT_EQUAL_64(0x00000000ffffffff, x13);
734 ASSERT_EQUAL_64(0x0000000011111110, x14);
735 ASSERT_EQUAL_64(0x0000000011111110, x18);
736 ASSERT_EQUAL_64(0xffffffffffffffff, x19);
737 ASSERT_EQUAL_64(0x1111111111111110, x20);
738 ASSERT_EQUAL_64(0x1111111111111110, x21);
739
740 int vl = core.GetSVELaneCount(kBRegSize) * 8;
741 switch (vl) {
742 case 128:
743 ASSERT_EQUAL_64(0x0000000000001116, x10);
744 break;
745 case 384:
746 ASSERT_EQUAL_64(0x0000000000001118, x10);
747 break;
748 case 2048:
749 ASSERT_EQUAL_64(0x0000000000001118, x10);
750 break;
751 default:
752 printf("WARNING: Some tests skipped due to unexpected VL.\n");
753 break;
754 }
755 }
756 }
757
758 TEST_SVE(sve_clast_v) {
759 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
760 START();
761
762 __ Pfalse(p1.VnB());
763 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
764 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
765 Initialise(&masm, p2.VnB(), p2_inputs);
766 Initialise(&masm, p3.VnB(), p3_inputs);
767 __ Ptrue(p4.VnB());
768
769 __ Index(z0.VnB(), 0x10, 1);
770 __ Dup(z1.VnB(), -1);
771 __ Dup(z2.VnB(), -1);
772 __ Clasta(b1, p1, b1, z0.VnB());
773 __ Clastb(b2, p1, b2, z0.VnB());
774 __ Clasta(b3, p2, b3, z0.VnB());
775 __ Clastb(b4, p2, b4, z0.VnB());
776 __ Clasta(b5, p3, b5, z0.VnB());
777 __ Clastb(b6, p3, b6, z0.VnB());
778 __ Clasta(b7, p4, b7, z0.VnB());
779
780 __ Punpklo(p3.VnH(), p3.VnB());
781 __ Index(z0.VnH(), 0x1110, 1);
782 __ Dup(z9.VnB(), -1);
783 __ Clasta(h9, p1, h9, z0.VnH());
784 __ Clastb(h10, p3, h10, z0.VnH());
785 __ Clasta(h12, p4, h12, z0.VnH());
786
787 __ Index(z0.VnS(), 0x11111110, 1);
788 __ Dup(z13.VnB(), -1);
789 __ Clasta(s13, p1, s13, z0.VnS());
790 __ Clastb(s14, p2, s14, z0.VnS());
791 __ Clasta(s18, p4, s18, z0.VnS());
792
793 __ Index(z0.VnD(), 0x1111111111111110, 1);
794 __ Dup(z19.VnB(), -1);
795 __ Clasta(d19, p1, d19, z0.VnD());
796 __ Clastb(d20, p2, d20, z0.VnD());
797 __ Clasta(d21, p4, d21, z0.VnD());
798 END();
799
800 if (CAN_RUN()) {
801 RUN();
802 ASSERT_EQUAL_128(0, 0x00000000000000ff, q1);
803 ASSERT_EQUAL_128(0, 0x00000000000000ff, q2);
804 ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
805 ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
806 ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
807 ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
808 ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
809 ASSERT_EQUAL_128(0, 0x000000000000ffff, q9);
810 ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
811 ASSERT_EQUAL_128(0, 0x00000000ffffffff, q13);
812 ASSERT_EQUAL_128(0, 0x0000000011111110, q14);
813 ASSERT_EQUAL_128(0, 0x0000000011111110, q18);
814 ASSERT_EQUAL_128(0, 0xffffffffffffffff, q19);
815 ASSERT_EQUAL_128(0, 0x1111111111111110, q20);
816 ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
817
818 int vl = core.GetSVELaneCount(kBRegSize) * 8;
819 switch (vl) {
820 case 128:
821 ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
822 break;
823 case 384:
824 ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
825 break;
826 case 2048:
827 ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
828 break;
829 default:
830 printf("WARNING: Some tests skipped due to unexpected VL.\n");
831 break;
832 }
833 }
834 }
835
836 TEST_SVE(sve_clast_z) {
837 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
838 START();
839
840 __ Pfalse(p1.VnB());
841 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
842 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
843 Initialise(&masm, p2.VnB(), p2_inputs);
844 Initialise(&masm, p3.VnB(), p3_inputs);
845 __ Ptrue(p4.VnB());
846
847 __ Index(z0.VnB(), 0x10, 1);
848 __ Dup(z1.VnB(), 0xff);
849 __ Dup(z2.VnB(), 0xff);
850 __ Clasta(z1.VnB(), p1, z1.VnB(), z0.VnB());
851 __ Clastb(z2.VnB(), p1, z2.VnB(), z0.VnB());
852 __ Clasta(z3.VnB(), p2, z3.VnB(), z0.VnB());
853 __ Clastb(z4.VnB(), p2, z4.VnB(), z0.VnB());
854 __ Clasta(z5.VnB(), p3, z5.VnB(), z0.VnB());
855 __ Clastb(z6.VnB(), p3, z6.VnB(), z0.VnB());
856 __ Clasta(z7.VnB(), p4, z7.VnB(), z0.VnB());
857
858 __ Punpklo(p3.VnH(), p3.VnB());
859 __ Index(z0.VnH(), 0x1110, 1);
860 __ Dup(z9.VnB(), 0xff);
861 __ Clasta(z9.VnH(), p1, z9.VnH(), z0.VnH());
862 __ Clastb(z10.VnH(), p3, z10.VnH(), z0.VnH());
863 __ Clasta(z12.VnH(), p4, z12.VnH(), z0.VnH());
864
865 __ Index(z0.VnS(), 0x11111110, 1);
866 __ Dup(z13.VnB(), 0xff);
867 __ Clasta(z13.VnS(), p1, z13.VnS(), z0.VnS());
868 __ Clastb(z14.VnS(), p2, z14.VnS(), z0.VnS());
869 __ Clasta(z16.VnS(), p4, z16.VnS(), z0.VnS());
870
871 __ Index(z0.VnD(), 0x1111111111111110, 1);
872 __ Dup(z17.VnB(), 0xff);
873 __ Clasta(z17.VnD(), p1, z17.VnD(), z0.VnD());
874 __ Clastb(z18.VnD(), p2, z18.VnD(), z0.VnD());
875 __ Clasta(z20.VnD(), p4, z20.VnD(), z0.VnD());
876 END();
877
878 if (CAN_RUN()) {
879 RUN();
880 uint64_t z1_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
881 uint64_t z2_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
882 uint64_t z3_expected[] = {0x1111111111111111, 0x1111111111111111};
883 uint64_t z4_expected[] = {0x1010101010101010, 0x1010101010101010};
884 uint64_t z5_expected[] = {0x1919191919191919, 0x1919191919191919};
885 uint64_t z6_expected[] = {0x1818181818181818, 0x1818181818181818};
886 uint64_t z7_expected[] = {0x1010101010101010, 0x1010101010101010};
887 uint64_t z9_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
888 uint64_t z12_expected[] = {0x1110111011101110, 0x1110111011101110};
889 uint64_t z13_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
890 uint64_t z14_expected[] = {0x1111111011111110, 0x1111111011111110};
891 uint64_t z16_expected[] = {0x1111111011111110, 0x1111111011111110};
892 uint64_t z17_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
893 uint64_t z18_expected[] = {0x1111111111111110, 0x1111111111111110};
894 uint64_t z20_expected[] = {0x1111111111111110, 0x1111111111111110};
895
896 uint64_t z10_expected_vl128[] = {0x1116111611161116, 0x1116111611161116};
897 uint64_t z10_expected_vl_long[] = {0x1118111811181118, 0x1118111811181118};
898
899 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
900 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
901 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
902 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
903 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
904 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
905 ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
906 ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
907 ASSERT_EQUAL_SVE(z12_expected, z12.VnD());
908 ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
909 ASSERT_EQUAL_SVE(z14_expected, z14.VnD());
910 ASSERT_EQUAL_SVE(z16_expected, z16.VnD());
911 ASSERT_EQUAL_SVE(z17_expected, z17.VnD());
912 ASSERT_EQUAL_SVE(z18_expected, z18.VnD());
913 ASSERT_EQUAL_SVE(z20_expected, z20.VnD());
914
915 int vl = core.GetSVELaneCount(kBRegSize) * 8;
916 switch (vl) {
917 case 128:
918 ASSERT_EQUAL_SVE(z10_expected_vl128, z10.VnD());
919 break;
920 case 384:
921 case 2048:
922 ASSERT_EQUAL_SVE(z10_expected_vl_long, z10.VnD());
923 break;
924 default:
925 printf("WARNING: Some tests skipped due to unexpected VL.\n");
926 break;
927 }
928 }
929 }
930
931 TEST_SVE(sve_compact) {
932 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
933 START();
934
935 __ Ptrue(p0.VnB());
936 __ Pfalse(p1.VnB());
937 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
938 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
939 __ Zip1(p4.VnD(), p0.VnD(), p1.VnD());
940
941 __ Index(z0.VnS(), 0x11111111, 0x11111111);
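// Writing a Q register clears the Z register bits above 128 (as checked by
// sve_v_write_clear), so the Mov below zeroes the lanes past the Q part and
// keeps the expected Compact results independent of the vector length.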
942 __ Mov(q0, q0);
943 __ Compact(z1.VnS(), p0, z0.VnS());
944 __ Compact(z2.VnS(), p2, z0.VnS());
945 __ Compact(z0.VnS(), p3, z0.VnS());
946
947 __ Index(z3.VnD(), 0x1111111111111111, 0x1111111111111111);
948 __ Mov(q3, q3);
949 __ Compact(z4.VnD(), p0, z3.VnD());
950 __ Compact(z5.VnD(), p1, z3.VnD());
951 __ Compact(z6.VnD(), p4, z3.VnD());
952
953 END();
954
955 if (CAN_RUN()) {
956 RUN();
957 uint64_t z1_expected[] = {0x4444444433333333, 0x2222222211111111};
958 uint64_t z2_expected[] = {0x0000000000000000, 0x3333333311111111};
959 uint64_t z0_expected[] = {0x0000000000000000, 0x4444444422222222};
960 uint64_t z4_expected[] = {0x2222222222222222, 0x1111111111111111};
961 uint64_t z5_expected[] = {0x0000000000000000, 0x0000000000000000};
962 uint64_t z6_expected[] = {0x0000000000000000, 0x1111111111111111};
963 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
964 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
965 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
966 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
967 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
968 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
969 }
970 }
971
972 TEST_SVE(sve_splice) {
973 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
974 START();
975
976 __ Ptrue(p0.VnB());
977 __ Pfalse(p1.VnB());
978 int p2b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
979 int p3b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
980 int p4b_inputs[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
981 int p5b_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0};
982 int p6b_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0};
983 Initialise(&masm, p2.VnB(), p2b_inputs);
984 Initialise(&masm, p3.VnB(), p3b_inputs);
985 Initialise(&masm, p4.VnB(), p4b_inputs);
986 Initialise(&masm, p5.VnB(), p5b_inputs);
987 Initialise(&masm, p6.VnB(), p6b_inputs);
988
989 __ Index(z30.VnB(), 1, 1);
990
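// Splice copies the active segment of the first operand (from its first
// active element up to its last active element), then fills the remaining
// lanes with the lowest elements of the second operand. For example, with p2
// (only lane 0 active), the result is -1 from z2 followed by 1, 2, 3, ...
// from z30.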
991 __ Index(z0.VnB(), -1, -1);
992 __ Splice(z0.VnB(), p0, z0.VnB(), z30.VnB());
993 __ Index(z1.VnB(), -1, -1);
994 __ Splice(z1.VnB(), p1, z1.VnB(), z30.VnB());
995 __ Index(z2.VnB(), -1, -1);
996 __ Splice(z2.VnB(), p2, z2.VnB(), z30.VnB());
997 __ Index(z3.VnB(), -1, -1);
998 __ Splice(z3.VnB(), p3, z3.VnB(), z30.VnB());
999 __ Index(z4.VnB(), -1, -1);
1000 __ Splice(z4.VnB(), p4, z4.VnB(), z30.VnB());
1001 __ Index(z5.VnB(), -1, -1);
1002 __ Splice(z5.VnB(), p5, z5.VnB(), z30.VnB());
1003 __ Index(z6.VnB(), -1, -1);
1004 __ Splice(z6.VnB(), p6, z6.VnB(), z30.VnB());
1005
1006 int p2h_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0};
1007 int p3h_inputs[] = {0, 0, 1, 0, 0, 0, 1, 0};
1008 Initialise(&masm, p2.VnH(), p2h_inputs);
1009 Initialise(&masm, p3.VnH(), p3h_inputs);
1010
1011 __ Index(z30.VnH(), 1, 1);
1012 __ Index(z29.VnH(), -1, -1);
1013 __ Splice(z7.VnH(), p2, z29.VnH(), z30.VnH());
1014 __ Splice(z8.VnH(), p3, z29.VnH(), z30.VnH());
1015
1016 int p2s_inputs[] = {0, 0, 1, 0};
1017 int p3s_inputs[] = {1, 0, 1, 0};
1018 Initialise(&masm, p2.VnS(), p2s_inputs);
1019 Initialise(&masm, p3.VnS(), p3s_inputs);
1020
1021 __ Index(z30.VnS(), 1, 1);
1022 __ Index(z29.VnS(), -1, -1);
1023 __ Splice(z9.VnS(), p2, z29.VnS(), z30.VnS());
1024 __ Splice(z10.VnS(), p3, z29.VnS(), z30.VnS());
1025
1026 int p2d_inputs[] = {0, 1};
1027 int p3d_inputs[] = {1, 0};
1028 Initialise(&masm, p2.VnD(), p2d_inputs);
1029 Initialise(&masm, p3.VnD(), p3d_inputs);
1030
1031 __ Index(z30.VnD(), 1, 1);
1032 __ Index(z29.VnD(), -1, -1);
1033 __ Splice(z11.VnD(), p2, z29.VnD(), z30.VnD());
1034 __ Splice(z30.VnD(), p3, z29.VnD(), z30.VnD());
1035
1036 END();
1037
1038 if (CAN_RUN()) {
1039 RUN();
1040 uint64_t z0_expected[] = {0xf0f1f2f3f4f5f6f7, 0xf8f9fafbfcfdfeff};
1041 uint64_t z1_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
1042 uint64_t z2_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201ff};
1043 uint64_t z3_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201fe};
1044 uint64_t z4_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201f0};
1045 uint64_t z5_expected[] = {0x0c0b0a0908070605, 0x04030201f6f7f8f9};
1046 uint64_t z6_expected[] = {0x01f0f1f2f3f4f5f6, 0xf7f8f9fafbfcfdfe};
1047 uint64_t z7_expected[] = {0x0007000600050004, 0x000300020001fffe};
1048 uint64_t z8_expected[] = {0x000300020001fffa, 0xfffbfffcfffdfffe};
1049 uint64_t z9_expected[] = {0x0000000300000002, 0x00000001fffffffe};
1050 uint64_t z10_expected[] = {0x00000001fffffffc, 0xfffffffdfffffffe};
1051 uint64_t z11_expected[] = {0x0000000000000001, 0xffffffffffffffff};
1052 uint64_t z30_expected[] = {0x0000000000000001, 0xfffffffffffffffe};
1053
1054 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1055 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1056 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
1057 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
1058 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
1059 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1060 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
1061 ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
1062 ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
1063 ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
1064 ASSERT_EQUAL_SVE(z10_expected, z10.VnD());
1065 ASSERT_EQUAL_SVE(z11_expected, z11.VnD());
1066 ASSERT_EQUAL_SVE(z30_expected, z30.VnD());
1067 }
1068 }
1069
1070 TEST_SVE(sve_predicate_logical) {
1071 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1072 START();
1073
1074 // 0b...01011010'10110111
1075 int p10_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1}; // Pm
1076 // 0b...11011001'01010010
1077 int p11_inputs[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0}; // Pn
1078 // 0b...01010101'10110010
1079 int p12_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0}; // pg
1080
1081 Initialise(&masm, p10.VnB(), p10_inputs);
1082 Initialise(&masm, p11.VnB(), p11_inputs);
1083 Initialise(&masm, p12.VnB(), p12_inputs);
1084
1085 __ Ands(p0.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1086 __ Mrs(x0, NZCV);
1087 __ Bics(p1.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1088 __ Mrs(x1, NZCV);
1089 __ Eor(p2.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1090 __ Nand(p3.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1091 __ Nor(p4.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1092 __ Orn(p5.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1093 __ Orr(p6.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1094 __ Sel(p7.VnB(), p12, p11.VnB(), p10.VnB());
1095
1096 END();
1097
1098 if (CAN_RUN()) {
1099 RUN();
1100
1101 // 0b...01010000'00010010
1102 int p0_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
1103 // 0b...00000001'00000000
1104 int p1_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0};
1105 // 0b...00000001'10100000
1106 int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
1107 // 0b...00000101'10100000
1108 int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
1109 // 0b...00000100'00000000
1110 int p4_expected[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1111 // 0b...01010101'00010010
1112 int p5_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
1113 // 0b...01010001'10110010
1114 int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
1115 // 0b...01011011'00010111
1116 int p7_expected[] = {0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1};
1117
1118 ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
1119 ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
1120 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
1121 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
1122 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
1123 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
1124 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
1125 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
1126
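// For these operations, N is set if the first active lane of the result is
// true (SVEFirstFlag), Z is set if no active lane is true (SVENoneFlag), and
// C is set if the last active lane is false (SVENotLastFlag).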
1127 ASSERT_EQUAL_32(SVEFirstFlag, w0);
1128 ASSERT_EQUAL_32(SVENotLastFlag, w1);
1129 }
1130 }
1131
1132 TEST_SVE(sve_int_compare_vectors) {
1133 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1134 START();
1135
1136 int z10_inputs[] = {0x00, 0x80, 0xff, 0x7f, 0x00, 0x00, 0x00, 0xff};
1137 int z11_inputs[] = {0x00, 0x00, 0x00, 0x00, 0x80, 0xff, 0x7f, 0xfe};
1138 int p0_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
1139 InsrHelper(&masm, z10.VnB(), z10_inputs);
1140 InsrHelper(&masm, z11.VnB(), z11_inputs);
1141 Initialise(&masm, p0.VnB(), p0_inputs);
1142
1143 __ Cmphs(p6.VnB(), p0.Zeroing(), z10.VnB(), z11.VnB());
1144 __ Mrs(x6, NZCV);
1145
1146 uint64_t z12_inputs[] = {0xffffffffffffffff, 0x8000000000000000};
1147 uint64_t z13_inputs[] = {0x0000000000000000, 0x8000000000000000};
1148 int p1_inputs[] = {1, 1};
1149 InsrHelper(&masm, z12.VnD(), z12_inputs);
1150 InsrHelper(&masm, z13.VnD(), z13_inputs);
1151 Initialise(&masm, p1.VnD(), p1_inputs);
1152
1153 __ Cmphi(p7.VnD(), p1.Zeroing(), z12.VnD(), z13.VnD());
1154 __ Mrs(x7, NZCV);
1155
1156 int z14_inputs[] = {0, 32767, -1, -32767, 0, 0, 0, 32766};
1157 int z15_inputs[] = {0, 0, 0, 0, 32767, -1, -32767, 32767};
1158
1159 int p2_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
1160 InsrHelper(&masm, z14.VnH(), z14_inputs);
1161 InsrHelper(&masm, z15.VnH(), z15_inputs);
1162 Initialise(&masm, p2.VnH(), p2_inputs);
1163
1164 __ Cmpge(p8.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
1165 __ Mrs(x8, NZCV);
1166
1167 __ Cmpeq(p9.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
1168 __ Mrs(x9, NZCV);
1169
1170 int z16_inputs[] = {0, -1, 0, 0};
1171 int z17_inputs[] = {0, 0, 2147483647, -2147483648};
1172 int p3_inputs[] = {1, 1, 1, 1};
1173 InsrHelper(&masm, z16.VnS(), z16_inputs);
1174 InsrHelper(&masm, z17.VnS(), z17_inputs);
1175 Initialise(&masm, p3.VnS(), p3_inputs);
1176
1177 __ Cmpgt(p10.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
1178 __ Mrs(x10, NZCV);
1179
1180 __ Cmpne(p11.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
1181 __ Mrs(x11, NZCV);
1182
1183 // Test the architectural aliases.
1184 __ Cmpls(p12.VnB(), p0.Zeroing(), z11.VnB(), z10.VnB()); // HS
1185 __ Cmplo(p13.VnD(), p1.Zeroing(), z13.VnD(), z12.VnD()); // HI
1186 __ Cmple(p14.VnH(), p2.Zeroing(), z15.VnH(), z14.VnH()); // GE
1187 __ Cmplt(p15.VnS(), p3.Zeroing(), z17.VnS(), z16.VnS()); // GT
1188
1189 END();
1190
1191 if (CAN_RUN()) {
1192 RUN();
1193
1194 int p6_expected[] = {1, 0, 1, 1, 0, 0, 0, 1};
1195 for (size_t i = 0; i < ArrayLength(p6_expected); i++) {
1196 int lane = static_cast<int>(ArrayLength(p6_expected) - i - 1);
1197 ASSERT_EQUAL_SVE_LANE(p6_expected[i], p6.VnB(), lane);
1198 }
1199
1200 int p7_expected[] = {1, 0};
1201 ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
1202
1203 int p8_expected[] = {1, 0, 0, 0, 0, 1, 1, 0};
1204 ASSERT_EQUAL_SVE(p8_expected, p8.VnH());
1205
1206 int p9_expected[] = {1, 0, 0, 0, 0, 0, 0, 0};
1207 ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
1208
1209 int p10_expected[] = {0, 0, 0, 1};
1210 ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
1211
1212 int p11_expected[] = {0, 1, 1, 1};
1213 ASSERT_EQUAL_SVE(p11_expected, p11.VnS());
1214
1215 // Reuse the expected results to verify the architectural aliases.
1216 ASSERT_EQUAL_SVE(p6_expected, p12.VnB());
1217 ASSERT_EQUAL_SVE(p7_expected, p13.VnD());
1218 ASSERT_EQUAL_SVE(p8_expected, p14.VnH());
1219 ASSERT_EQUAL_SVE(p10_expected, p15.VnS());
1220
1221 ASSERT_EQUAL_32(SVEFirstFlag, w6);
1222 ASSERT_EQUAL_32(NoFlag, w7);
1223 ASSERT_EQUAL_32(NoFlag, w8);
1224 ASSERT_EQUAL_32(NoFlag, w9);
1225 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
1226 }
1227 }
1228
1229 TEST_SVE(sve_int_compare_vectors_wide_elements) {
1230 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1231 START();
1232
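// In the wide-element forms, each lane of the first operand is compared with
// the D-sized lane of the second operand occupying the same 64-bit segment
// of the vector. In the first case below, all of the specified B lanes of
// z13 sit in the lowest segment, so each is compared against element 0 of
// z19 (-1).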
1233 int src1_inputs_1[] = {0, 1, -1, -128, 127, 100, -66};
1234 int src2_inputs_1[] = {0, -1};
1235 int mask_inputs_1[] = {1, 1, 1, 1, 1, 0, 1};
1236 InsrHelper(&masm, z13.VnB(), src1_inputs_1);
1237 InsrHelper(&masm, z19.VnD(), src2_inputs_1);
1238 Initialise(&masm, p0.VnB(), mask_inputs_1);
1239
1240 __ Cmpge(p2.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1241 __ Mrs(x2, NZCV);
1242 __ Cmpgt(p3.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1243 __ Mrs(x3, NZCV);
1244
1245 int src1_inputs_2[] = {0, 32767, -1, -32767, 1, 1234, 0, 32766};
1246 int src2_inputs_2[] = {0, -32767};
1247 int mask_inputs_2[] = {1, 0, 1, 1, 1, 1, 1, 1};
1248 InsrHelper(&masm, z13.VnH(), src1_inputs_2);
1249 InsrHelper(&masm, z19.VnD(), src2_inputs_2);
1250 Initialise(&masm, p0.VnH(), mask_inputs_2);
1251
1252 __ Cmple(p4.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
1253 __ Mrs(x4, NZCV);
1254 __ Cmplt(p5.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
1255 __ Mrs(x5, NZCV);
1256
1257 int src1_inputs_3[] = {0, -1, 2147483647, -2147483648};
1258 int src2_inputs_3[] = {0, -2147483648};
1259 int mask_inputs_3[] = {1, 1, 1, 1};
1260 InsrHelper(&masm, z13.VnS(), src1_inputs_3);
1261 InsrHelper(&masm, z19.VnD(), src2_inputs_3);
1262 Initialise(&masm, p0.VnS(), mask_inputs_3);
1263
1264 __ Cmpeq(p6.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1265 __ Mrs(x6, NZCV);
1266 __ Cmpne(p7.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1267 __ Mrs(x7, NZCV);
1268
1269 int src1_inputs_4[] = {0x00, 0x80, 0x7f, 0xff, 0x7f, 0xf0, 0x0f, 0x55};
1270 int src2_inputs_4[] = {0x00, 0x7f};
1271 int mask_inputs_4[] = {1, 1, 1, 1, 0, 1, 1, 1};
1272 InsrHelper(&masm, z13.VnB(), src1_inputs_4);
1273 InsrHelper(&masm, z19.VnD(), src2_inputs_4);
1274 Initialise(&masm, p0.VnB(), mask_inputs_4);
1275
1276 __ Cmplo(p8.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1277 __ Mrs(x8, NZCV);
1278 __ Cmpls(p9.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1279 __ Mrs(x9, NZCV);
1280
1281 int src1_inputs_5[] = {0x0000, 0x8000, 0x7fff, 0xffff};
1282 int src2_inputs_5[] = {0x8000, 0xffff};
1283 int mask_inputs_5[] = {1, 1, 1, 1};
1284 InsrHelper(&masm, z13.VnS(), src1_inputs_5);
1285 InsrHelper(&masm, z19.VnD(), src2_inputs_5);
1286 Initialise(&masm, p0.VnS(), mask_inputs_5);
1287
1288 __ Cmphi(p10.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1289 __ Mrs(x10, NZCV);
1290 __ Cmphs(p11.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1291 __ Mrs(x11, NZCV);
1292
1293 END();
1294
1295 if (CAN_RUN()) {
1296 RUN();
1297 int p2_expected[] = {1, 1, 1, 0, 1, 0, 0};
1298 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
1299
1300 int p3_expected[] = {1, 1, 0, 0, 1, 0, 0};
1301 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
1302
1303 int p4_expected[] = {0x1, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
1304 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
1305
1306 int p5_expected[] = {0x0, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
1307 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
1308
1309 int p6_expected[] = {0x1, 0x0, 0x0, 0x1};
1310 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
1311
1312 int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
1313 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
1314
1315 int p8_expected[] = {1, 0, 0, 0, 0, 0, 1, 1};
1316 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
1317
1318 int p9_expected[] = {1, 0, 1, 0, 0, 0, 1, 1};
1319 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
1320
1321 int p10_expected[] = {0x0, 0x0, 0x0, 0x0};
1322 ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
1323
1324 int p11_expected[] = {0x0, 0x1, 0x0, 0x1};
1325 ASSERT_EQUAL_SVE(p11_expected, p11.VnS());
1326
1327 ASSERT_EQUAL_32(NoFlag, w2);
1328 ASSERT_EQUAL_32(NoFlag, w3);
1329 ASSERT_EQUAL_32(NoFlag, w4);
1330 ASSERT_EQUAL_32(SVENotLastFlag, w5);
1331 ASSERT_EQUAL_32(SVEFirstFlag, w6);
1332 ASSERT_EQUAL_32(SVENotLastFlag, w7);
1333 ASSERT_EQUAL_32(SVEFirstFlag, w8);
1334 ASSERT_EQUAL_32(SVEFirstFlag, w9);
1335 ASSERT_EQUAL_32(SVENotLastFlag | SVENoneFlag, w10);
1336 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w11);
1337 }
1338 }
1339
1340 TEST_SVE(sve_bitwise_imm) {
1341 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1342 START();
1343
1344 // clang-format off
1345 uint64_t z21_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
1346 uint32_t z22_inputs[] = {0xfedcba98, 0x76543210, 0x01234567, 0x89abcdef};
1347 uint16_t z23_inputs[] = {0xfedc, 0xba98, 0x7654, 0x3210,
1348 0x0123, 0x4567, 0x89ab, 0xcdef};
1349 uint8_t z24_inputs[] = {0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
1350 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
1351 // clang-format on
1352
1353 InsrHelper(&masm, z1.VnD(), z21_inputs);
1354 InsrHelper(&masm, z2.VnS(), z22_inputs);
1355 InsrHelper(&masm, z3.VnH(), z23_inputs);
1356 InsrHelper(&masm, z4.VnB(), z24_inputs);
1357
1358 __ And(z1.VnD(), z1.VnD(), 0x0000ffff0000ffff);
1359 __ And(z2.VnS(), z2.VnS(), 0xff0000ff);
1360 __ And(z3.VnH(), z3.VnH(), 0x0ff0);
1361 __ And(z4.VnB(), z4.VnB(), 0x3f);
1362
1363 InsrHelper(&masm, z5.VnD(), z21_inputs);
1364 InsrHelper(&masm, z6.VnS(), z22_inputs);
1365 InsrHelper(&masm, z7.VnH(), z23_inputs);
1366 InsrHelper(&masm, z8.VnB(), z24_inputs);
1367
1368 __ Eor(z5.VnD(), z5.VnD(), 0x0000ffff0000ffff);
1369 __ Eor(z6.VnS(), z6.VnS(), 0xff0000ff);
1370 __ Eor(z7.VnH(), z7.VnH(), 0x0ff0);
1371 __ Eor(z8.VnB(), z8.VnB(), 0x3f);
1372
1373 InsrHelper(&masm, z9.VnD(), z21_inputs);
1374 InsrHelper(&masm, z10.VnS(), z22_inputs);
1375 InsrHelper(&masm, z11.VnH(), z23_inputs);
1376 InsrHelper(&masm, z12.VnB(), z24_inputs);
1377
1378 __ Orr(z9.VnD(), z9.VnD(), 0x0000ffff0000ffff);
1379 __ Orr(z10.VnS(), z10.VnS(), 0xff0000ff);
1380 __ Orr(z11.VnH(), z11.VnH(), 0x0ff0);
1381 __ Orr(z12.VnB(), z12.VnB(), 0x3f);
1382
1383 {
1384 // The `Dup` macro maps onto either `dup` or `dupm`, but has its own test,
1385 // so here we test `dupm` directly.
1386 ExactAssemblyScope guard(&masm, 4 * kInstructionSize);
1387 __ dupm(z13.VnD(), 0x7ffffff800000000);
1388 __ dupm(z14.VnS(), 0x7ffc7ffc);
1389 __ dupm(z15.VnH(), 0x3ffc);
1390 __ dupm(z16.VnB(), 0xc3);
1391 }
1392
1393 END();
1394
1395 if (CAN_RUN()) {
1396 RUN();
1397
1398 // clang-format off
1399 uint64_t z1_expected[] = {0x0000ba9800003210, 0x000045670000cdef};
1400 uint32_t z2_expected[] = {0xfe000098, 0x76000010, 0x01000067, 0x890000ef};
1401 uint16_t z3_expected[] = {0x0ed0, 0x0a90, 0x0650, 0x0210,
1402 0x0120, 0x0560, 0x09a0, 0x0de0};
1403 uint8_t z4_expected[] = {0x3e, 0x1c, 0x3a, 0x18, 0x36, 0x14, 0x32, 0x10,
1404 0x01, 0x23, 0x05, 0x27, 0x09, 0x2b, 0x0d, 0x2f};
1405
1406 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1407 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1408 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1409 ASSERT_EQUAL_SVE(z4_expected, z4.VnB());
1410
1411 uint64_t z5_expected[] = {0xfedc45677654cdef, 0x0123ba9889ab3210};
1412 uint32_t z6_expected[] = {0x01dcba67, 0x895432ef, 0xfe234598, 0x76abcd10};
1413 uint16_t z7_expected[] = {0xf12c, 0xb568, 0x79a4, 0x3de0,
1414 0x0ed3, 0x4a97, 0x865b, 0xc21f};
1415 uint8_t z8_expected[] = {0xc1, 0xe3, 0x85, 0xa7, 0x49, 0x6b, 0x0d, 0x2f,
1416 0x3e, 0x1c, 0x7a, 0x58, 0xb6, 0x94, 0xf2, 0xd0};
1417
1418 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1419 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1420 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1421 ASSERT_EQUAL_SVE(z8_expected, z8.VnB());
1422
1423 uint64_t z9_expected[] = {0xfedcffff7654ffff, 0x0123ffff89abffff};
1424 uint32_t z10_expected[] = {0xffdcbaff, 0xff5432ff, 0xff2345ff, 0xffabcdff};
1425 uint16_t z11_expected[] = {0xfffc, 0xbff8, 0x7ff4, 0x3ff0,
1426 0x0ff3, 0x4ff7, 0x8ffb, 0xcfff};
1427 uint8_t z12_expected[] = {0xff, 0xff, 0xbf, 0xbf, 0x7f, 0x7f, 0x3f, 0x3f,
1428 0x3f, 0x3f, 0x7f, 0x7f, 0xbf, 0xbf, 0xff, 0xff};
1429
1430 ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
1431 ASSERT_EQUAL_SVE(z10_expected, z10.VnS());
1432 ASSERT_EQUAL_SVE(z11_expected, z11.VnH());
1433 ASSERT_EQUAL_SVE(z12_expected, z12.VnB());
1434
1435 uint64_t z13_expected[] = {0x7ffffff800000000, 0x7ffffff800000000};
1436 uint32_t z14_expected[] = {0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc};
1437 uint16_t z15_expected[] = {0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc,
1438 0x3ffc, 0x3ffc, 0x3ffc ,0x3ffc};
1439 ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
1440 ASSERT_EQUAL_SVE(z14_expected, z14.VnS());
1441 ASSERT_EQUAL_SVE(z15_expected, z15.VnH());
1442 // clang-format on
1443 }
1444 }
1445
1446 TEST_SVE(sve_dup_imm) {
1447 // The `Dup` macro can generate `dup`, `dupm`, and it can synthesise
1448 // unencodable immediates.
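// Unencodable values are typically synthesised via a scratch register, e.g.
// by materialising the immediate with Mov and broadcasting it with the
// register form of dup; the exact expansion is an implementation detail.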
1449
1450 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1451 START();
1452
1453 // Encodable with `dup` (shift 0).
1454 __ Dup(z0.VnD(), -1);
1455 __ Dup(z1.VnS(), 0x7f);
1456 __ Dup(z2.VnH(), -0x80);
1457 __ Dup(z3.VnB(), 42);
1458
1459 // Encodable with `dup` (shift 8).
1460 __ Dup(z4.VnD(), -42 * 256);
1461 __ Dup(z5.VnS(), -0x8000);
1462 __ Dup(z6.VnH(), 0x7f00);
1463 // B-sized lanes cannot take a shift of 8.
1464
1465 // Encodable with `dupm` (but not `dup`).
1466 __ Dup(z10.VnD(), 0x3fc);
1467 __ Dup(z11.VnS(), -516097); // 0xfff81fff, as a signed int.
1468 __ Dup(z12.VnH(), 0x0001);
1469 // All values that fit B-sized lanes are encodable with `dup`.
1470
1471 // Cases that require immediate synthesis.
1472 __ Dup(z20.VnD(), 0x1234);
1473 __ Dup(z21.VnD(), -4242);
1474 __ Dup(z22.VnD(), 0xfedcba9876543210);
1475 __ Dup(z23.VnS(), 0x01020304);
1476 __ Dup(z24.VnS(), -0x01020304);
1477 __ Dup(z25.VnH(), 0x3c38);
1478 // All values that fit B-sized lanes are directly encodable.
1479
1480 END();
1481
1482 if (CAN_RUN()) {
1483 RUN();
1484
1485 ASSERT_EQUAL_SVE(0xffffffffffffffff, z0.VnD());
1486 ASSERT_EQUAL_SVE(0x0000007f, z1.VnS());
1487 ASSERT_EQUAL_SVE(0xff80, z2.VnH());
1488 ASSERT_EQUAL_SVE(0x2a, z3.VnB());
1489
1490 ASSERT_EQUAL_SVE(0xffffffffffffd600, z4.VnD());
1491 ASSERT_EQUAL_SVE(0xffff8000, z5.VnS());
1492 ASSERT_EQUAL_SVE(0x7f00, z6.VnH());
1493
1494 ASSERT_EQUAL_SVE(0x00000000000003fc, z10.VnD());
1495 ASSERT_EQUAL_SVE(0xfff81fff, z11.VnS());
1496 ASSERT_EQUAL_SVE(0x0001, z12.VnH());
1497
1498 ASSERT_EQUAL_SVE(0x1234, z20.VnD());
1499 ASSERT_EQUAL_SVE(0xffffffffffffef6e, z21.VnD());
1500 ASSERT_EQUAL_SVE(0xfedcba9876543210, z22.VnD());
1501 ASSERT_EQUAL_SVE(0x01020304, z23.VnS());
1502 ASSERT_EQUAL_SVE(0xfefdfcfc, z24.VnS());
1503 ASSERT_EQUAL_SVE(0x3c38, z25.VnH());
1504 }
1505 }
1506
1507 TEST_SVE(sve_inc_dec_p_scalar) {
1508 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1509 START();
1510
1511 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1512 Initialise(&masm, p0.VnB(), p0_inputs);
1513
1514 int p0_b_count = 9;
1515 int p0_h_count = 5;
1516 int p0_s_count = 3;
1517 int p0_d_count = 2;
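// These counts are derived from p0_inputs: 9 of the 16 B lanes are active.
// For H, S and D lanes only the lowest predicate bit of each lane matters,
// i.e. every 2nd, 4th and 8th value counting from the right, giving 5, 3 and
// 2 active lanes respectively.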
1518
1519 // 64-bit operations preserve their high bits.
1520 __ Mov(x0, 0x123456780000002a);
1521 __ Decp(x0, p0.VnB());
1522
1523 __ Mov(x1, 0x123456780000002a);
1524 __ Incp(x1, p0.VnH());
1525
1526 // Check that saturation does not occur.
1527 __ Mov(x10, 1);
1528 __ Decp(x10, p0.VnS());
1529
1530 __ Mov(x11, UINT64_MAX);
1531 __ Incp(x11, p0.VnD());
1532
1533 __ Mov(x12, INT64_MAX);
1534 __ Incp(x12, p0.VnB());
1535
1536 // With an all-true predicate, these instructions increment or decrement by
1537 // the vector length.
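  // (Illustrative: with the architectural minimum 128-bit vector length, an
  // all-true p15.VnB() has 16 active lanes, so the Decp below subtracts 16 and
  // the Incp on H lanes adds 8.)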
1538 __ Ptrue(p15.VnB());
1539
1540 __ Mov(x20, 0x4000000000000000);
1541 __ Decp(x20, p15.VnB());
1542
1543 __ Mov(x21, 0x4000000000000000);
1544 __ Incp(x21, p15.VnH());
1545
1546 END();
1547 if (CAN_RUN()) {
1548 RUN();
1549
1550 ASSERT_EQUAL_64(0x123456780000002a - p0_b_count, x0);
1551 ASSERT_EQUAL_64(0x123456780000002a + p0_h_count, x1);
1552
1553 ASSERT_EQUAL_64(UINT64_C(1) - p0_s_count, x10);
1554 ASSERT_EQUAL_64(UINT64_MAX + p0_d_count, x11);
1555 ASSERT_EQUAL_64(static_cast<uint64_t>(INT64_MAX) + p0_b_count, x12);
1556
1557 ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1558 ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1559 }
1560 }
1561
1562 TEST_SVE(sve_sqinc_sqdec_p_scalar) {
1563 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1564 START();
1565
1566 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1567 Initialise(&masm, p0.VnB(), p0_inputs);
1568
1569 int p0_b_count = 9;
1570 int p0_h_count = 5;
1571 int p0_s_count = 3;
1572 int p0_d_count = 2;
1573
1574 uint64_t placeholder_high = 0x1234567800000000;
1575
1576 // 64-bit operations preserve their high bits.
1577 __ Mov(x0, placeholder_high + 42);
1578 __ Sqdecp(x0, p0.VnB());
1579
1580 __ Mov(x1, placeholder_high + 42);
1581 __ Sqincp(x1, p0.VnH());
1582
1583 // 32-bit operations sign-extend into their high bits.
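  // For example, with three active S lanes, the Sqdecp below computes 42 - 3
  // in the bottom 32 bits of x2 and sign-extends that 32-bit result over the
  // placeholder_high bits in the top half of the register.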
1584 __ Mov(x2, placeholder_high + 42);
1585 __ Sqdecp(x2, p0.VnS(), w2);
1586
1587 __ Mov(x3, placeholder_high + 42);
1588 __ Sqincp(x3, p0.VnD(), w3);
1589
1590 __ Mov(x4, placeholder_high + 1);
1591 __ Sqdecp(x4, p0.VnS(), w4);
1592
1593 __ Mov(x5, placeholder_high - 1);
1594 __ Sqincp(x5, p0.VnD(), w5);
1595
1596 // Check that saturation behaves correctly.
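  // For example, x10 starts at INT64_MIN + 1 and Sqdecp subtracts the nine
  // active B lanes, so the result is expected to saturate to INT64_MIN rather
  // than wrapping around.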
1597 __ Mov(x10, 0x8000000000000001); // INT64_MIN + 1
1598 __ Sqdecp(x10, p0.VnB());
1599
1600 __ Mov(x11, placeholder_high + 0x80000001); // INT32_MIN + 1
1601 __ Sqdecp(x11, p0.VnH(), w11);
1602
1603 __ Mov(x12, 1);
1604 __ Sqdecp(x12, p0.VnS());
1605
1606 __ Mov(x13, placeholder_high + 1);
1607 __ Sqdecp(x13, p0.VnD(), w13);
1608
1609 __ Mov(x14, 0x7ffffffffffffffe); // INT64_MAX - 1
1610 __ Sqincp(x14, p0.VnB());
1611
1612 __ Mov(x15, placeholder_high + 0x7ffffffe); // INT32_MAX - 1
1613 __ Sqincp(x15, p0.VnH(), w15);
1614
1615 // Don't use x16 and x17 since they are scratch registers by default.
1616
1617 __ Mov(x18, 0xffffffffffffffff);
1618 __ Sqincp(x18, p0.VnS());
1619
1620 __ Mov(x19, placeholder_high + 0xffffffff);
1621 __ Sqincp(x19, p0.VnD(), w19);
1622
1623 __ Mov(x20, placeholder_high + 0xffffffff);
1624 __ Sqdecp(x20, p0.VnB(), w20);
1625
1626 // With an all-true predicate, these instructions increment or decrement by
1627 // the vector length.
1628 __ Ptrue(p15.VnB());
1629
1630 __ Mov(x21, 0);
1631 __ Sqdecp(x21, p15.VnB());
1632
1633 __ Mov(x22, 0);
1634 __ Sqincp(x22, p15.VnH());
1635
1636 __ Mov(x23, placeholder_high);
1637 __ Sqdecp(x23, p15.VnS(), w23);
1638
1639 __ Mov(x24, placeholder_high);
1640 __ Sqincp(x24, p15.VnD(), w24);
1641
1642 END();
1643 if (CAN_RUN()) {
1644 RUN();
1645
1646 // 64-bit operations preserve their high bits.
1647 ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
1648 ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
1649
1650 // 32-bit operations sign-extend into their high bits.
1651 ASSERT_EQUAL_64(42 - p0_s_count, x2);
1652 ASSERT_EQUAL_64(42 + p0_d_count, x3);
1653 ASSERT_EQUAL_64(0xffffffff00000000 | (1 - p0_s_count), x4);
1654 ASSERT_EQUAL_64(p0_d_count - 1, x5);
1655
1656 // Check that saturation behaves correctly.
1657 ASSERT_EQUAL_64(INT64_MIN, x10);
1658 ASSERT_EQUAL_64(INT32_MIN, x11);
1659 ASSERT_EQUAL_64(1 - p0_s_count, x12);
1660 ASSERT_EQUAL_64(1 - p0_d_count, x13);
1661 ASSERT_EQUAL_64(INT64_MAX, x14);
1662 ASSERT_EQUAL_64(INT32_MAX, x15);
1663 ASSERT_EQUAL_64(p0_s_count - 1, x18);
1664 ASSERT_EQUAL_64(p0_d_count - 1, x19);
1665 ASSERT_EQUAL_64(-1 - p0_b_count, x20);
1666
1667 // Check all-true predicates.
1668 ASSERT_EQUAL_64(-core.GetSVELaneCount(kBRegSize), x21);
1669 ASSERT_EQUAL_64(core.GetSVELaneCount(kHRegSize), x22);
1670 ASSERT_EQUAL_64(-core.GetSVELaneCount(kSRegSize), x23);
1671 ASSERT_EQUAL_64(core.GetSVELaneCount(kDRegSize), x24);
1672 }
1673 }
1674
1675 TEST_SVE(sve_uqinc_uqdec_p_scalar) {
1676 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1677 START();
1678
1679 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1680 Initialise(&masm, p0.VnB(), p0_inputs);
1681
1682 int p0_b_count = 9;
1683 int p0_h_count = 5;
1684 int p0_s_count = 3;
1685 int p0_d_count = 2;
1686
1687 uint64_t placeholder_high = 0x1234567800000000;
1688
1689 // 64-bit operations preserve their high bits.
1690 __ Mov(x0, placeholder_high + 42);
1691 __ Uqdecp(x0, p0.VnB());
1692
1693 __ Mov(x1, placeholder_high + 42);
1694 __ Uqincp(x1, p0.VnH());
1695
1696 // 32-bit operations zero-extend into their high bits.
1697 __ Mov(x2, placeholder_high + 42);
1698 __ Uqdecp(x2, p0.VnS(), w2);
1699
1700 __ Mov(x3, placeholder_high + 42);
1701 __ Uqincp(x3, p0.VnD(), w3);
1702
1703 __ Mov(x4, placeholder_high + 0x80000001);
1704 __ Uqdecp(x4, p0.VnS(), w4);
1705
1706 __ Mov(x5, placeholder_high + 0x7fffffff);
1707 __ Uqincp(x5, p0.VnD(), w5);
1708
1709 // Check that saturation behaves correctly.
1710 __ Mov(x10, 1);
1711 __ Uqdecp(x10, p0.VnB(), x10);
1712
1713 __ Mov(x11, placeholder_high + 1);
1714 __ Uqdecp(x11, p0.VnH(), w11);
1715
1716 __ Mov(x12, 0x8000000000000000); // INT64_MAX + 1
1717 __ Uqdecp(x12, p0.VnS(), x12);
1718
1719 __ Mov(x13, placeholder_high + 0x80000000); // INT32_MAX + 1
1720 __ Uqdecp(x13, p0.VnD(), w13);
1721
1722 __ Mov(x14, 0xfffffffffffffffe); // UINT64_MAX - 1
1723 __ Uqincp(x14, p0.VnB(), x14);
1724
1725 __ Mov(x15, placeholder_high + 0xfffffffe); // UINT32_MAX - 1
1726 __ Uqincp(x15, p0.VnH(), w15);
1727
1728 // Don't use x16 and x17 since they are scratch registers by default.
1729
1730 __ Mov(x18, 0x7ffffffffffffffe); // INT64_MAX - 1
1731 __ Uqincp(x18, p0.VnS(), x18);
1732
1733 __ Mov(x19, placeholder_high + 0x7ffffffe); // INT32_MAX - 1
1734 __ Uqincp(x19, p0.VnD(), w19);
1735
1736 // With an all-true predicate, these instructions increment or decrement by
1737 // the vector length.
1738 __ Ptrue(p15.VnB());
1739
1740 __ Mov(x20, 0x4000000000000000);
1741 __ Uqdecp(x20, p15.VnB(), x20);
1742
1743 __ Mov(x21, 0x4000000000000000);
1744 __ Uqincp(x21, p15.VnH(), x21);
1745
1746 __ Mov(x22, placeholder_high + 0x40000000);
1747 __ Uqdecp(x22, p15.VnS(), w22);
1748
1749 __ Mov(x23, placeholder_high + 0x40000000);
1750 __ Uqincp(x23, p15.VnD(), w23);
1751
1752 END();
1753 if (CAN_RUN()) {
1754 RUN();
1755
1756 // 64-bit operations preserve their high bits.
1757 ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
1758 ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
1759
1760 // 32-bit operations zero-extend into their high bits.
1761 ASSERT_EQUAL_64(42 - p0_s_count, x2);
1762 ASSERT_EQUAL_64(42 + p0_d_count, x3);
1763 ASSERT_EQUAL_64(UINT64_C(0x80000001) - p0_s_count, x4);
1764 ASSERT_EQUAL_64(UINT64_C(0x7fffffff) + p0_d_count, x5);
1765
1766 // Check that saturation behaves correctly.
1767 ASSERT_EQUAL_64(0, x10);
1768 ASSERT_EQUAL_64(0, x11);
1769 ASSERT_EQUAL_64(0x8000000000000000 - p0_s_count, x12);
1770 ASSERT_EQUAL_64(UINT64_C(0x80000000) - p0_d_count, x13);
1771 ASSERT_EQUAL_64(UINT64_MAX, x14);
1772 ASSERT_EQUAL_64(UINT32_MAX, x15);
1773 ASSERT_EQUAL_64(0x7ffffffffffffffe + p0_s_count, x18);
1774 ASSERT_EQUAL_64(UINT64_C(0x7ffffffe) + p0_d_count, x19);
1775
1776 // Check all-true predicates.
1777 ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1778 ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1779 ASSERT_EQUAL_64(0x40000000 - core.GetSVELaneCount(kSRegSize), x22);
1780 ASSERT_EQUAL_64(0x40000000 + core.GetSVELaneCount(kDRegSize), x23);
1781 }
1782 }
1783
1784 TEST_SVE(sve_inc_dec_p_vector) {
1785 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1786 START();
1787
1788 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
1789 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1790 Initialise(&masm, p0.VnB(), p0_inputs);
1791
1792 // Check that saturation does not occur.
1793
1794 int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
1795 InsrHelper(&masm, z0.VnD(), z0_inputs);
1796
1797 int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
1798 InsrHelper(&masm, z1.VnD(), z1_inputs);
1799
1800 int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
1801 InsrHelper(&masm, z2.VnS(), z2_inputs);
1802
1803 int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
1804 InsrHelper(&masm, z3.VnH(), z3_inputs);
1805
1806 // The MacroAssembler implements non-destructive operations using movprfx.
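  // For example, a non-destructive `Decp(z10.VnD(), p0, z0.VnD())` is expected
  // to expand to something like:
  //   movprfx z10, z0
  //   decp z10.d, p0
  // since the underlying instruction only has a destructive form.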
1807 __ Decp(z10.VnD(), p0, z0.VnD());
1808 __ Decp(z11.VnD(), p0, z1.VnD());
1809 __ Decp(z12.VnS(), p0, z2.VnS());
1810 __ Decp(z13.VnH(), p0, z3.VnH());
1811
1812 __ Incp(z14.VnD(), p0, z0.VnD());
1813 __ Incp(z15.VnD(), p0, z1.VnD());
1814 __ Incp(z16.VnS(), p0, z2.VnS());
1815 __ Incp(z17.VnH(), p0, z3.VnH());
1816
1817 // Also test destructive forms.
1818 __ Mov(z4, z0);
1819 __ Mov(z5, z1);
1820 __ Mov(z6, z2);
1821 __ Mov(z7, z3);
1822
1823 __ Decp(z0.VnD(), p0);
1824 __ Decp(z1.VnD(), p0);
1825 __ Decp(z2.VnS(), p0);
1826 __ Decp(z3.VnH(), p0);
1827
1828 __ Incp(z4.VnD(), p0);
1829 __ Incp(z5.VnD(), p0);
1830 __ Incp(z6.VnS(), p0);
1831 __ Incp(z7.VnH(), p0);
1832
1833 END();
1834 if (CAN_RUN()) {
1835 RUN();
1836
1837 // z0_inputs[...] - number of active D lanes (2)
1838 int64_t z0_expected[] = {0x1234567800000040, -2, -1, 0x7ffffffffffffffe};
1839 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1840
1841 // z1_inputs[...] - number of active D lanes (2)
1842 int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
1843 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1844
1845 // z2_inputs[...] - number of active S lanes (3)
1846 int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, 0x7ffffffd};
1847 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1848
1849 // z3_inputs[...] - number of active H lanes (5)
1850 int16_t z3_expected[] = {0x1225, -5, -4, -6, 0x7ffb, 0x7ffa};
1851 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1852
1853 // z0_inputs[...] + number of active D lanes (2)
1854 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
1855 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
1856
1857 // z1_inputs[...] + number of active D lanes (2)
1858 uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, 0x8000000000000001};
1859 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1860
1861 // z2_inputs[...] + number of active S lanes (3)
1862 uint32_t z6_expected[] = {0x12340045, 3, 2, 4, 0x80000002, 0x80000003};
1863 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1864
1865 // z3_inputs[...] + number of active H lanes (5)
1866 uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, 0x8004};
1867 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1868
1869 // Check that the non-destructive macros produced the same results.
1870 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
1871 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
1872 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
1873 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
1874 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
1875 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
1876 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
1877 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
1878 }
1879 }
1880
1881 TEST_SVE(sve_inc_dec_ptrue_vector) {
1882 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1883 START();
1884
1885 // With an all-true predicate, these instructions increment or decrement by
1886 // the vector length.
1887 __ Ptrue(p15.VnB());
1888
1889 __ Dup(z0.VnD(), 0);
1890 __ Decp(z0.VnD(), p15);
1891
1892 __ Dup(z1.VnS(), 0);
1893 __ Decp(z1.VnS(), p15);
1894
1895 __ Dup(z2.VnH(), 0);
1896 __ Decp(z2.VnH(), p15);
1897
1898 __ Dup(z3.VnD(), 0);
1899 __ Incp(z3.VnD(), p15);
1900
1901 __ Dup(z4.VnS(), 0);
1902 __ Incp(z4.VnS(), p15);
1903
1904 __ Dup(z5.VnH(), 0);
1905 __ Incp(z5.VnH(), p15);
1906
1907 END();
1908 if (CAN_RUN()) {
1909 RUN();
1910
1911 int d_lane_count = core.GetSVELaneCount(kDRegSize);
1912 int s_lane_count = core.GetSVELaneCount(kSRegSize);
1913 int h_lane_count = core.GetSVELaneCount(kHRegSize);
1914
1915 for (int i = 0; i < d_lane_count; i++) {
1916 ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
1917 ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
1918 }
1919
1920 for (int i = 0; i < s_lane_count; i++) {
1921 ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
1922 ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
1923 }
1924
1925 for (int i = 0; i < h_lane_count; i++) {
1926 ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
1927 ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
1928 }
1929 }
1930 }
1931
1932 TEST_SVE(sve_sqinc_sqdec_p_vector) {
1933 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1934 START();
1935
1936 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
1937 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1938 Initialise(&masm, p0.VnB(), p0_inputs);
1939
1940 // Check that saturation behaves correctly.
1941
1942 int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
1943 InsrHelper(&masm, z0.VnD(), z0_inputs);
1944
1945 int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
1946 InsrHelper(&masm, z1.VnD(), z1_inputs);
1947
1948 int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
1949 InsrHelper(&masm, z2.VnS(), z2_inputs);
1950
1951 int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
1952 InsrHelper(&masm, z3.VnH(), z3_inputs);
1953
1954 // The MacroAssembler implements non-destructive operations using movprfx.
1955 __ Sqdecp(z10.VnD(), p0, z0.VnD());
1956 __ Sqdecp(z11.VnD(), p0, z1.VnD());
1957 __ Sqdecp(z12.VnS(), p0, z2.VnS());
1958 __ Sqdecp(z13.VnH(), p0, z3.VnH());
1959
1960 __ Sqincp(z14.VnD(), p0, z0.VnD());
1961 __ Sqincp(z15.VnD(), p0, z1.VnD());
1962 __ Sqincp(z16.VnS(), p0, z2.VnS());
1963 __ Sqincp(z17.VnH(), p0, z3.VnH());
1964
1965 // Also test destructive forms.
1966 __ Mov(z4, z0);
1967 __ Mov(z5, z1);
1968 __ Mov(z6, z2);
1969 __ Mov(z7, z3);
1970
1971 __ Sqdecp(z0.VnD(), p0);
1972 __ Sqdecp(z1.VnD(), p0);
1973 __ Sqdecp(z2.VnS(), p0);
1974 __ Sqdecp(z3.VnH(), p0);
1975
1976 __ Sqincp(z4.VnD(), p0);
1977 __ Sqincp(z5.VnD(), p0);
1978 __ Sqincp(z6.VnS(), p0);
1979 __ Sqincp(z7.VnH(), p0);
1980
1981 END();
1982 if (CAN_RUN()) {
1983 RUN();
1984
1985 // z0_inputs[...] - number of active D lanes (2)
1986 int64_t z0_expected[] = {0x1234567800000040, -2, -1, INT64_MIN};
1987 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1988
1989 // z1_inputs[...] - number of active D lanes (2)
1990 int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
1991 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1992
1993 // z2_inputs[...] - number of active S lanes (3)
1994 int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, INT32_MIN};
1995 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1996
1997 // z3_inputs[...] - number of active H lanes (5)
1998 int16_t z3_expected[] = {0x1225, -5, -4, -6, INT16_MIN, 0x7ffa};
1999 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
2000
2001 // z0_inputs[...] + number of active D lanes (2)
2002 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2003 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2004
2005 // z1_inputs[...] + number of active D lanes (2)
2006 uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, INT64_MAX};
2007 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2008
2009 // z2_inputs[...] + number of active S lanes (3)
2010 uint32_t z6_expected[] = {0x12340045, 3, 2, 4, INT32_MAX, 0x80000003};
2011 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2012
2013 // z3_inputs[...] + number of active H lanes (5)
2014 uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, INT16_MAX};
2015 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2016
2017 // Check that the non-destructive macros produced the same results.
2018 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2019 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2020 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2021 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2022 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2023 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2024 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2025 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2026 }
2027 }
2028
2029 TEST_SVE(sve_sqinc_sqdec_ptrue_vector) {
2030 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2031 START();
2032
2033 // With an all-true predicate, these instructions increment or decrement by
2034 // the vector length.
2035 __ Ptrue(p15.VnB());
2036
2037 __ Dup(z0.VnD(), 0);
2038 __ Sqdecp(z0.VnD(), p15);
2039
2040 __ Dup(z1.VnS(), 0);
2041 __ Sqdecp(z1.VnS(), p15);
2042
2043 __ Dup(z2.VnH(), 0);
2044 __ Sqdecp(z2.VnH(), p15);
2045
2046 __ Dup(z3.VnD(), 0);
2047 __ Sqincp(z3.VnD(), p15);
2048
2049 __ Dup(z4.VnS(), 0);
2050 __ Sqincp(z4.VnS(), p15);
2051
2052 __ Dup(z5.VnH(), 0);
2053 __ Sqincp(z5.VnH(), p15);
2054
2055 END();
2056 if (CAN_RUN()) {
2057 RUN();
2058
2059 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2060 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2061 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2062
2063 for (int i = 0; i < d_lane_count; i++) {
2064 ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
2065 ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
2066 }
2067
2068 for (int i = 0; i < s_lane_count; i++) {
2069 ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
2070 ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
2071 }
2072
2073 for (int i = 0; i < h_lane_count; i++) {
2074 ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
2075 ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
2076 }
2077 }
2078 }
2079
2080 TEST_SVE(sve_uqinc_uqdec_p_vector) {
2081 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2082 START();
2083
2084 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
2085 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
2086 Initialise(&masm, p0.VnB(), p0_inputs);
2087
2088 // Check that saturation behaves correctly.
2089
2090 uint64_t z0_inputs[] = {0x1234567800000042, 0, 1, 0x8000000000000000};
2091 InsrHelper(&masm, z0.VnD(), z0_inputs);
2092
2093 uint64_t z1_inputs[] = {0x12345678ffffff2a, 0, UINT64_MAX, INT64_MAX};
2094 InsrHelper(&masm, z1.VnD(), z1_inputs);
2095
2096 uint32_t z2_inputs[] = {0x12340042, 0, UINT32_MAX, 1, INT32_MAX, 0x80000000};
2097 InsrHelper(&masm, z2.VnS(), z2_inputs);
2098
2099 uint16_t z3_inputs[] = {0x122a, 0, 1, UINT16_MAX, 0x8000, INT16_MAX};
2100 InsrHelper(&masm, z3.VnH(), z3_inputs);
2101
2102 // The MacroAssembler implements non-destructive operations using movprfx.
2103 __ Uqdecp(z10.VnD(), p0, z0.VnD());
2104 __ Uqdecp(z11.VnD(), p0, z1.VnD());
2105 __ Uqdecp(z12.VnS(), p0, z2.VnS());
2106 __ Uqdecp(z13.VnH(), p0, z3.VnH());
2107
2108 __ Uqincp(z14.VnD(), p0, z0.VnD());
2109 __ Uqincp(z15.VnD(), p0, z1.VnD());
2110 __ Uqincp(z16.VnS(), p0, z2.VnS());
2111 __ Uqincp(z17.VnH(), p0, z3.VnH());
2112
2113 // Also test destructive forms.
2114 __ Mov(z4, z0);
2115 __ Mov(z5, z1);
2116 __ Mov(z6, z2);
2117 __ Mov(z7, z3);
2118
2119 __ Uqdecp(z0.VnD(), p0);
2120 __ Uqdecp(z1.VnD(), p0);
2121 __ Uqdecp(z2.VnS(), p0);
2122 __ Uqdecp(z3.VnH(), p0);
2123
2124 __ Uqincp(z4.VnD(), p0);
2125 __ Uqincp(z5.VnD(), p0);
2126 __ Uqincp(z6.VnS(), p0);
2127 __ Uqincp(z7.VnH(), p0);
2128
2129 END();
2130 if (CAN_RUN()) {
2131 RUN();
2132
2133 // z0_inputs[...] - number of active D lanes (2)
2134 uint64_t z0_expected[] = {0x1234567800000040, 0, 0, 0x7ffffffffffffffe};
2135 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
2136
2137 // z1_inputs[...] - number of active D lanes (2)
2138 uint64_t z1_expected[] = {0x12345678ffffff28,
2139 0,
2140 0xfffffffffffffffd,
2141 0x7ffffffffffffffd};
2142 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
2143
2144 // z2_inputs[...] - number of active S lanes (3)
2145 uint32_t z2_expected[] =
2146 {0x1234003f, 0, 0xfffffffc, 0, 0x7ffffffc, 0x7ffffffd};
2147 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
2148
2149 // z3_inputs[...] - number of active H lanes (5)
2150 uint16_t z3_expected[] = {0x1225, 0, 0, 0xfffa, 0x7ffb, 0x7ffa};
2151 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
2152
2153 // z0_inputs[...] + number of active D lanes (2)
2154 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2155 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2156
2157 // z1_inputs[...] + number of active D lanes (2)
2158 uint64_t z5_expected[] = {0x12345678ffffff2c,
2159 2,
2160 UINT64_MAX,
2161 0x8000000000000001};
2162 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2163
2164 // z2_inputs[...] + number of active S lanes (3)
2165 uint32_t z6_expected[] =
2166 {0x12340045, 3, UINT32_MAX, 4, 0x80000002, 0x80000003};
2167 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2168
2169 // z3_inputs[...] + number of active H lanes (5)
2170 uint16_t z7_expected[] = {0x122f, 5, 6, UINT16_MAX, 0x8005, 0x8004};
2171 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2172
2173 // Check that the non-destructive macros produced the same results.
2174 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2175 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2176 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2177 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2178 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2179 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2180 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2181 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2182 }
2183 }
2184
2185 TEST_SVE(sve_uqinc_uqdec_ptrue_vector) {
2186 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2187 START();
2188
2189 // With an all-true predicate, these instructions increment or decrement by
2190 // the vector length.
2191 __ Ptrue(p15.VnB());
2192
2193 __ Mov(x0, 0x1234567800000000);
2194 __ Mov(x1, 0x12340000);
2195 __ Mov(x2, 0x1200);
2196
2197 __ Dup(z0.VnD(), x0);
2198 __ Uqdecp(z0.VnD(), p15);
2199
2200 __ Dup(z1.VnS(), x1);
2201 __ Uqdecp(z1.VnS(), p15);
2202
2203 __ Dup(z2.VnH(), x2);
2204 __ Uqdecp(z2.VnH(), p15);
2205
2206 __ Dup(z3.VnD(), x0);
2207 __ Uqincp(z3.VnD(), p15);
2208
2209 __ Dup(z4.VnS(), x1);
2210 __ Uqincp(z4.VnS(), p15);
2211
2212 __ Dup(z5.VnH(), x2);
2213 __ Uqincp(z5.VnH(), p15);
2214
2215 END();
2216 if (CAN_RUN()) {
2217 RUN();
2218
2219 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2220 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2221 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2222
2223 for (int i = 0; i < d_lane_count; i++) {
2224 ASSERT_EQUAL_SVE_LANE(0x1234567800000000 - d_lane_count, z0.VnD(), i);
2225 ASSERT_EQUAL_SVE_LANE(0x1234567800000000 + d_lane_count, z3.VnD(), i);
2226 }
2227
2228 for (int i = 0; i < s_lane_count; i++) {
2229 ASSERT_EQUAL_SVE_LANE(0x12340000 - s_lane_count, z1.VnS(), i);
2230 ASSERT_EQUAL_SVE_LANE(0x12340000 + s_lane_count, z4.VnS(), i);
2231 }
2232
2233 for (int i = 0; i < h_lane_count; i++) {
2234 ASSERT_EQUAL_SVE_LANE(0x1200 - h_lane_count, z2.VnH(), i);
2235 ASSERT_EQUAL_SVE_LANE(0x1200 + h_lane_count, z5.VnH(), i);
2236 }
2237 }
2238 }
2239
2240 TEST_SVE(sve_index) {
2241 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2242 START();
2243
2244 // Simple cases.
2245 __ Index(z0.VnB(), 0, 1);
2246 __ Index(z1.VnH(), 1, 1);
2247 __ Index(z2.VnS(), 2, 1);
2248 __ Index(z3.VnD(), 3, 1);
2249
2250 // Synthesised immediates.
2251 __ Index(z4.VnB(), 42, -1);
2252 __ Index(z5.VnH(), -1, 42);
2253 __ Index(z6.VnS(), 42, 42);
2254
2255 // Register arguments.
2256 __ Mov(x0, 42);
2257 __ Mov(x1, -3);
2258 __ Index(z10.VnD(), x0, x1);
2259 __ Index(z11.VnB(), w0, w1);
2260 // The register size should correspond to the lane size, but VIXL allows any
2261 // register at least as big as the lane size.
2262 __ Index(z12.VnB(), x0, x1);
2263 __ Index(z13.VnH(), w0, x1);
2264 __ Index(z14.VnS(), x0, w1);
2265
2266 // Integer overflow.
2267 __ Index(z20.VnB(), UINT8_MAX - 2, 2);
2268 __ Index(z21.VnH(), 7, -3);
2269 __ Index(z22.VnS(), INT32_MAX - 2, 1);
2270 __ Index(z23.VnD(), INT64_MIN + 6, -7);
2271
2272 END();
2273
2274 if (CAN_RUN()) {
2275 RUN();
2276
2277 int b_lane_count = core.GetSVELaneCount(kBRegSize);
2278 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2279 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2280 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2281
2282 uint64_t b_mask = GetUintMask(kBRegSize);
2283 uint64_t h_mask = GetUintMask(kHRegSize);
2284 uint64_t s_mask = GetUintMask(kSRegSize);
2285 uint64_t d_mask = GetUintMask(kDRegSize);
2286
2287 // Simple cases.
2288 for (int i = 0; i < b_lane_count; i++) {
2289 ASSERT_EQUAL_SVE_LANE((0 + i) & b_mask, z0.VnB(), i);
2290 }
2291 for (int i = 0; i < h_lane_count; i++) {
2292 ASSERT_EQUAL_SVE_LANE((1 + i) & h_mask, z1.VnH(), i);
2293 }
2294 for (int i = 0; i < s_lane_count; i++) {
2295 ASSERT_EQUAL_SVE_LANE((2 + i) & s_mask, z2.VnS(), i);
2296 }
2297 for (int i = 0; i < d_lane_count; i++) {
2298 ASSERT_EQUAL_SVE_LANE((3 + i) & d_mask, z3.VnD(), i);
2299 }
2300
2301 // Synthesised immediates.
2302 for (int i = 0; i < b_lane_count; i++) {
2303 ASSERT_EQUAL_SVE_LANE((42 - i) & b_mask, z4.VnB(), i);
2304 }
2305 for (int i = 0; i < h_lane_count; i++) {
2306 ASSERT_EQUAL_SVE_LANE((-1 + (42 * i)) & h_mask, z5.VnH(), i);
2307 }
2308 for (int i = 0; i < s_lane_count; i++) {
2309 ASSERT_EQUAL_SVE_LANE((42 + (42 * i)) & s_mask, z6.VnS(), i);
2310 }
2311
2312 // Register arguments.
2313 for (int i = 0; i < d_lane_count; i++) {
2314 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & d_mask, z10.VnD(), i);
2315 }
2316 for (int i = 0; i < b_lane_count; i++) {
2317 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z11.VnB(), i);
2318 }
2319 for (int i = 0; i < b_lane_count; i++) {
2320 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z12.VnB(), i);
2321 }
2322 for (int i = 0; i < h_lane_count; i++) {
2323 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & h_mask, z13.VnH(), i);
2324 }
2325 for (int i = 0; i < s_lane_count; i++) {
2326 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & s_mask, z14.VnS(), i);
2327 }
2328
2329 // Integer overflow.
2330 uint8_t expected_z20[] = {0x05, 0x03, 0x01, 0xff, 0xfd};
2331 ASSERT_EQUAL_SVE(expected_z20, z20.VnB());
2332 uint16_t expected_z21[] = {0xfffb, 0xfffe, 0x0001, 0x0004, 0x0007};
2333 ASSERT_EQUAL_SVE(expected_z21, z21.VnH());
2334 uint32_t expected_z22[] = {0x80000000, 0x7fffffff, 0x7ffffffe, 0x7ffffffd};
2335 ASSERT_EQUAL_SVE(expected_z22, z22.VnS());
2336 uint64_t expected_z23[] = {0x7fffffffffffffff, 0x8000000000000006};
2337 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
2338 }
2339 }
2340
2341 TEST(sve_int_compare_count_and_limit_scalars) {
2342 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2343 START();
2344
2345 __ Mov(w20, 0xfffffffd);
2346 __ Mov(w21, 0xffffffff);
2347
2348 __ Whilele(p0.VnB(), w20, w21);
2349 __ Mrs(x0, NZCV);
2350 __ Whilele(p1.VnH(), w20, w21);
2351 __ Mrs(x1, NZCV);
2352
2353 __ Mov(w20, 0xffffffff);
2354 __ Mov(w21, 0x00000000);
2355
2356 __ Whilelt(p2.VnS(), w20, w21);
2357 __ Mrs(x2, NZCV);
2358 __ Whilelt(p3.VnD(), w20, w21);
2359 __ Mrs(x3, NZCV);
2360
2361 __ Mov(w20, 0xfffffffd);
2362 __ Mov(w21, 0xffffffff);
2363
2364 __ Whilels(p4.VnB(), w20, w21);
2365 __ Mrs(x4, NZCV);
2366 __ Whilels(p5.VnH(), w20, w21);
2367 __ Mrs(x5, NZCV);
2368
2369 __ Mov(w20, 0xffffffff);
2370 __ Mov(w21, 0x00000000);
2371
2372 __ Whilelo(p6.VnS(), w20, w21);
2373 __ Mrs(x6, NZCV);
2374 __ Whilelo(p7.VnD(), w20, w21);
2375 __ Mrs(x7, NZCV);
2376
2377 __ Mov(x20, 0xfffffffffffffffd);
2378 __ Mov(x21, 0xffffffffffffffff);
2379
2380 __ Whilele(p8.VnB(), x20, x21);
2381 __ Mrs(x8, NZCV);
2382 __ Whilele(p9.VnH(), x20, x21);
2383 __ Mrs(x9, NZCV);
2384
2385 __ Mov(x20, 0xffffffffffffffff);
2386 __ Mov(x21, 0x0000000000000000);
2387
2388 __ Whilelt(p10.VnS(), x20, x21);
2389 __ Mrs(x10, NZCV);
2390 __ Whilelt(p11.VnD(), x20, x21);
2391 __ Mrs(x11, NZCV);
2392
2393 __ Mov(x20, 0xfffffffffffffffd);
2394 __ Mov(x21, 0xffffffffffffffff);
2395
2396 __ Whilels(p12.VnB(), x20, x21);
2397 __ Mrs(x12, NZCV);
2398 __ Whilels(p13.VnH(), x20, x21);
2399 __ Mrs(x13, NZCV);
2400
2401 __ Mov(x20, 0xffffffffffffffff);
2402 __ Mov(x21, 0x0000000000000000);
2403
2404 __ Whilelo(p14.VnS(), x20, x21);
2405 __ Mrs(x14, NZCV);
2406 __ Whilelo(p15.VnD(), x20, x21);
2407 __ Mrs(x15, NZCV);
2408
2409 END();
2410
2411 if (CAN_RUN()) {
2412 RUN();
2413
2414 // 0b...00000000'00000111
2415 int p0_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2416 ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
2417
2418 // 0b...00000000'00010101
2419 int p1_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2420 ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
2421
2422 int p2_expected[] = {0x0, 0x0, 0x0, 0x1};
2423 ASSERT_EQUAL_SVE(p2_expected, p2.VnS());
2424
2425 int p3_expected[] = {0x00, 0x01};
2426 ASSERT_EQUAL_SVE(p3_expected, p3.VnD());
2427
2428 // 0b...11111111'11111111
2429 int p4_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2430 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
2431
2432 // 0b...01010101'01010101
2433 int p5_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2434 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2435
2436 int p6_expected[] = {0x0, 0x0, 0x0, 0x0};
2437 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2438
2439 int p7_expected[] = {0x00, 0x00};
2440 ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
2441
2442 // 0b...00000000'00000111
2443 int p8_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2444 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
2445
2446 // 0b...00000000'00010101
2447 int p9_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2448 ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
2449
2450 int p10_expected[] = {0x0, 0x0, 0x0, 0x1};
2451 ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
2452
2453 int p11_expected[] = {0x00, 0x01};
2454 ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2455
2456 // 0b...11111111'11111111
2457 int p12_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2458 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
2459
2460 // 0b...01010101'01010101
2461 int p13_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2462 ASSERT_EQUAL_SVE(p13_expected, p13.VnH());
2463
2464 int p14_expected[] = {0x0, 0x0, 0x0, 0x0};
2465 ASSERT_EQUAL_SVE(p14_expected, p14.VnS());
2466
2467 int p15_expected[] = {0x00, 0x00};
2468 ASSERT_EQUAL_SVE(p15_expected, p15.VnD());
2469
2470 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w0);
2471 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w1);
2472 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w2);
2473 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w3);
2474 ASSERT_EQUAL_32(SVEFirstFlag, w4);
2475 ASSERT_EQUAL_32(SVEFirstFlag, w5);
2476 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w6);
2477 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w7);
2478 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w8);
2479 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w9);
2480 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
2481 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w11);
2482 ASSERT_EQUAL_32(SVEFirstFlag, w12);
2483 ASSERT_EQUAL_32(SVEFirstFlag, w13);
2484 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w14);
2485 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w15);
2486 }
2487 }
2488
2489 TEST(sve_int_compare_count_and_limit_scalars_regression_test) {
2490 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2491 START();
2492
2493 __ Mov(w0, 0x7ffffffd);
2494 __ Mov(w1, 0x7fffffff);
2495 __ Whilele(p0.VnB(), w0, w1);
2496
2497 END();
2498
2499 if (CAN_RUN()) {
2500 RUN();
2501
2502 int p0_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2503 ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
2504 }
2505 }
2506
2507 TEST(sve_int_compare_vectors_signed_imm) {
2508 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2509 START();
2510
2511 int z13_inputs[] = {0, 1, -1, -15, 126, -127, -126, -15};
2512 int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 1, 1};
2513 InsrHelper(&masm, z13.VnB(), z13_inputs);
2514 Initialise(&masm, p0.VnB(), mask_inputs1);
2515
2516 __ Cmpeq(p2.VnB(), p0.Zeroing(), z13.VnB(), -15);
2517 __ Mrs(x2, NZCV);
2518 __ Cmpeq(p3.VnB(), p0.Zeroing(), z13.VnB(), -127);
2519
2520 int z14_inputs[] = {0, 1, -1, -32767, -32766, 32767, 32766, 0};
2521 int mask_inputs2[] = {1, 1, 1, 0, 1, 1, 1, 1};
2522 InsrHelper(&masm, z14.VnH(), z14_inputs);
2523 Initialise(&masm, p0.VnH(), mask_inputs2);
2524
2525 __ Cmpge(p4.VnH(), p0.Zeroing(), z14.VnH(), -1);
2526 __ Mrs(x4, NZCV);
2527 __ Cmpge(p5.VnH(), p0.Zeroing(), z14.VnH(), -32767);
2528
2529 int z15_inputs[] = {0, 1, -1, INT_MIN};
2530 int mask_inputs3[] = {0, 1, 1, 1};
2531 InsrHelper(&masm, z15.VnS(), z15_inputs);
2532 Initialise(&masm, p0.VnS(), mask_inputs3);
2533
2534 __ Cmpgt(p6.VnS(), p0.Zeroing(), z15.VnS(), 0);
2535 __ Mrs(x6, NZCV);
2536 __ Cmpgt(p7.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2537
2538 __ Cmplt(p8.VnS(), p0.Zeroing(), z15.VnS(), 0);
2539 __ Mrs(x8, NZCV);
2540 __ Cmplt(p9.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2541
2542 int64_t z16_inputs[] = {0, -1};
2543 int mask_inputs4[] = {1, 1};
2544 InsrHelper(&masm, z16.VnD(), z16_inputs);
2545 Initialise(&masm, p0.VnD(), mask_inputs4);
2546
2547 __ Cmple(p10.VnD(), p0.Zeroing(), z16.VnD(), -1);
2548 __ Mrs(x10, NZCV);
2549 __ Cmple(p11.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MIN);
2550
2551 __ Cmpne(p12.VnD(), p0.Zeroing(), z16.VnD(), -1);
2552 __ Mrs(x12, NZCV);
2553 __ Cmpne(p13.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MAX);
2554
2555 END();
2556
2557 if (CAN_RUN()) {
2558 RUN();
2559
2560 int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1};
2561 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2562
2563 int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 0};
2564 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2565
2566 int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1, 0x1};
2567 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2568
2569 int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1};
2570 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2571
2572 int p6_expected[] = {0x0, 0x1, 0x0, 0x0};
2573 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2574
2575 int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
2576 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2577
2578 int p8_expected[] = {0x0, 0x0, 0x1, 0x1};
2579 ASSERT_EQUAL_SVE(p8_expected, p8.VnS());
2580
2581 int p9_expected[] = {0x0, 0x0, 0x0, 0x1};
2582 ASSERT_EQUAL_SVE(p9_expected, p9.VnS());
2583
2584 int p10_expected[] = {0x00, 0x01};
2585 ASSERT_EQUAL_SVE(p10_expected, p10.VnD());
2586
2587 int p11_expected[] = {0x00, 0x00};
2588 ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2589
2590 int p12_expected[] = {0x01, 0x00};
2591 ASSERT_EQUAL_SVE(p12_expected, p12.VnD());
2592
2593 int p13_expected[] = {0x01, 0x01};
2594 ASSERT_EQUAL_SVE(p13_expected, p13.VnD());
2595
2596 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w2);
2597 ASSERT_EQUAL_32(SVEFirstFlag, w4);
2598 ASSERT_EQUAL_32(NoFlag, w6);
2599 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2600 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w10);
2601 ASSERT_EQUAL_32(NoFlag, w12);
2602 }
2603 }
2604
2605 TEST(sve_int_compare_vectors_unsigned_imm) {
2606 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2607 START();
2608
2609 uint32_t src1_inputs[] = {0xf7, 0x0f, 0x8f, 0x1f, 0x83, 0x12, 0x00, 0xf1};
2610 int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 0, 1};
2611 InsrHelper(&masm, z13.VnB(), src1_inputs);
2612 Initialise(&masm, p0.VnB(), mask_inputs1);
2613
2614 __ Cmphi(p2.VnB(), p0.Zeroing(), z13.VnB(), 0x0f);
2615 __ Mrs(x2, NZCV);
2616 __ Cmphi(p3.VnB(), p0.Zeroing(), z13.VnB(), 0xf0);
2617
2618 uint32_t src2_inputs[] = {0xffff, 0x8000, 0x1fff, 0x0000, 0x1234};
2619 int mask_inputs2[] = {1, 1, 1, 1, 0};
2620 InsrHelper(&masm, z13.VnH(), src2_inputs);
2621 Initialise(&masm, p0.VnH(), mask_inputs2);
2622
2623 __ Cmphs(p4.VnH(), p0.Zeroing(), z13.VnH(), 0x1f);
2624 __ Mrs(x4, NZCV);
2625 __ Cmphs(p5.VnH(), p0.Zeroing(), z13.VnH(), 0x1fff);
2626
2627 uint32_t src3_inputs[] = {0xffffffff, 0xfedcba98, 0x0000ffff, 0x00000000};
2628 int mask_inputs3[] = {1, 1, 1, 1};
2629 InsrHelper(&masm, z13.VnS(), src3_inputs);
2630 Initialise(&masm, p0.VnS(), mask_inputs3);
2631
2632 __ Cmplo(p6.VnS(), p0.Zeroing(), z13.VnS(), 0x3f);
2633 __ Mrs(x6, NZCV);
2634 __ Cmplo(p7.VnS(), p0.Zeroing(), z13.VnS(), 0x3f3f3f3f);
2635
2636 uint64_t src4_inputs[] = {0xffffffffffffffff, 0x0000000000000000};
2637 int mask_inputs4[] = {1, 1};
2638 InsrHelper(&masm, z13.VnD(), src4_inputs);
2639 Initialise(&masm, p0.VnD(), mask_inputs4);
2640
2641 __ Cmpls(p8.VnD(), p0.Zeroing(), z13.VnD(), 0x2f);
2642 __ Mrs(x8, NZCV);
2643 __ Cmpls(p9.VnD(), p0.Zeroing(), z13.VnD(), 0x800000000000000);
2644
2645 END();
2646
2647 if (CAN_RUN()) {
2648 RUN();
2649
2650 int p2_expected[] = {1, 0, 1, 0, 1, 1, 0, 1};
2651 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2652
2653 int p3_expected[] = {1, 0, 0, 0, 0, 0, 0, 1};
2654 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2655
2656 int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2657 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2658
2659 int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2660 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2661
2662 int p6_expected[] = {0x0, 0x0, 0x0, 0x1};
2663 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2664
2665 int p7_expected[] = {0x0, 0x0, 0x1, 0x1};
2666 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2667
2668 int p8_expected[] = {0x00, 0x01};
2669 ASSERT_EQUAL_SVE(p8_expected, p8.VnD());
2670
2671 int p9_expected[] = {0x00, 0x01};
2672 ASSERT_EQUAL_SVE(p9_expected, p9.VnD());
2673
2674 ASSERT_EQUAL_32(SVEFirstFlag, w2);
2675 ASSERT_EQUAL_32(NoFlag, w4);
2676 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w6);
2677 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2678 }
2679 }
2680
2681 TEST(sve_int_compare_conditionally_terminate_scalars) {
2682 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2683 START();
2684
2685 __ Mov(x0, 0xfedcba9887654321);
2686 __ Mov(x1, 0x1000100010001000);
2687
2688 // Initialise Z and C. These are preserved by cterm*, and the V flag is set to
2689 // !C if the condition does not hold.
2690 __ Mov(x10, NoFlag);
2691 __ Msr(NZCV, x10);
2692
2693 __ Ctermeq(w0, w0);
2694 __ Mrs(x2, NZCV);
2695 __ Ctermeq(x0, x1);
2696 __ Mrs(x3, NZCV);
2697 __ Ctermne(x0, x0);
2698 __ Mrs(x4, NZCV);
2699 __ Ctermne(w0, w1);
2700 __ Mrs(x5, NZCV);
2701
2702 // As above, but with all flags initially set.
2703 __ Mov(x10, NZCVFlag);
2704 __ Msr(NZCV, x10);
2705
2706 __ Ctermeq(w0, w0);
2707 __ Mrs(x6, NZCV);
2708 __ Ctermeq(x0, x1);
2709 __ Mrs(x7, NZCV);
2710 __ Ctermne(x0, x0);
2711 __ Mrs(x8, NZCV);
2712 __ Ctermne(w0, w1);
2713 __ Mrs(x9, NZCV);
2714
2715 END();
2716
2717 if (CAN_RUN()) {
2718 RUN();
2719
2720 ASSERT_EQUAL_32(SVEFirstFlag, w2);
2721 ASSERT_EQUAL_32(VFlag, w3);
2722 ASSERT_EQUAL_32(VFlag, w4);
2723 ASSERT_EQUAL_32(SVEFirstFlag, w5);
2724
2725 ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w6);
2726 ASSERT_EQUAL_32(ZCFlag, w7);
2727 ASSERT_EQUAL_32(ZCFlag, w8);
2728 ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w9);
2729 }
2730 }
2731
2732 // Work out what the architectural `PredTest` pseudocode should produce for the
2733 // given result and governing predicate.
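// The architectural mapping is: N (SVEFirstFlag) is set if the first active
// lane of the result is true, Z (SVENoneFlag) if no active lane is true, and
// C (SVENotLastFlag) if the last active lane is false; V is always cleared.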
2734 template <typename Tg, typename Td, int N>
2735 static StatusFlags GetPredTestFlags(const Td (&pd)[N],
2736 const Tg (&pg)[N],
2737 int vl) {
2738 int first = -1;
2739 int last = -1;
2740 bool any_active = false;
2741
2742 // Only consider potentially-active lanes.
2743 int start = (N > vl) ? (N - vl) : 0;
2744 for (int i = start; i < N; i++) {
2745 if ((pg[i] & 1) == 1) {
2746 // Look for the first and last active lanes.
2747     // Note that the 'first' lane is the one with the highest array index,
2748     // because these arrays list the highest-numbered lane first.
2748 if (last < 0) last = i;
2749 first = i;
2750 // Look for any active lanes that are also active in pd.
2751 if ((pd[i] & 1) == 1) any_active = true;
2752 }
2753 }
2754
2755 uint32_t flags = 0;
2756 if ((first >= 0) && ((pd[first] & 1) == 1)) flags |= SVEFirstFlag;
2757 if (!any_active) flags |= SVENoneFlag;
2758 if ((last < 0) || ((pd[last] & 1) == 0)) flags |= SVENotLastFlag;
2759 return static_cast<StatusFlags>(flags);
2760 }
2761
2762 typedef void (MacroAssembler::*PfirstPnextFn)(const PRegisterWithLaneSize& pd,
2763 const PRegister& pg,
2764 const PRegisterWithLaneSize& pn);
2765 template <typename Tg, typename Tn, typename Td>
2766 static void PfirstPnextHelper(Test* config,
2767 PfirstPnextFn macro,
2768 unsigned lane_size_in_bits,
2769 const Tg& pg_inputs,
2770 const Tn& pn_inputs,
2771 const Td& pd_expected) {
2772 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2773 START();
2774
2775 PRegister pg = p15;
2776 PRegister pn = p14;
2777 Initialise(&masm, pg.WithLaneSize(lane_size_in_bits), pg_inputs);
2778 Initialise(&masm, pn.WithLaneSize(lane_size_in_bits), pn_inputs);
2779
2780 // Initialise NZCV to an impossible value, to check that we actually write it.
2781 __ Mov(x10, NZCVFlag);
2782
2783 // If pd.Is(pn), the MacroAssembler simply passes the arguments directly to
2784 // the Assembler.
2785 __ Msr(NZCV, x10);
2786 __ Mov(p0, pn);
2787 (masm.*macro)(p0.WithLaneSize(lane_size_in_bits),
2788 pg,
2789 p0.WithLaneSize(lane_size_in_bits));
2790 __ Mrs(x0, NZCV);
2791
2792 // The MacroAssembler supports non-destructive use.
2793 __ Msr(NZCV, x10);
2794 (masm.*macro)(p1.WithLaneSize(lane_size_in_bits),
2795 pg,
2796 pn.WithLaneSize(lane_size_in_bits));
2797 __ Mrs(x1, NZCV);
2798
2799 // If pd.Aliases(pg) the macro requires a scratch register.
2800 {
2801 UseScratchRegisterScope temps(&masm);
2802 temps.Include(p13);
2803 __ Msr(NZCV, x10);
2804 __ Mov(p2, p15);
2805 (masm.*macro)(p2.WithLaneSize(lane_size_in_bits),
2806 p2,
2807 pn.WithLaneSize(lane_size_in_bits));
2808 __ Mrs(x2, NZCV);
2809 }
2810
2811 END();
2812
2813 if (CAN_RUN()) {
2814 RUN();
2815
2816 // Check that the inputs weren't modified.
2817 ASSERT_EQUAL_SVE(pn_inputs, pn.WithLaneSize(lane_size_in_bits));
2818 ASSERT_EQUAL_SVE(pg_inputs, pg.WithLaneSize(lane_size_in_bits));
2819
2820 // Check the primary operation.
2821 ASSERT_EQUAL_SVE(pd_expected, p0.WithLaneSize(lane_size_in_bits));
2822 ASSERT_EQUAL_SVE(pd_expected, p1.WithLaneSize(lane_size_in_bits));
2823 ASSERT_EQUAL_SVE(pd_expected, p2.WithLaneSize(lane_size_in_bits));
2824
2825 // Check that the flags were properly set.
2826 StatusFlags nzcv_expected =
2827 GetPredTestFlags(pd_expected,
2828 pg_inputs,
2829 core.GetSVELaneCount(kBRegSize));
2830 ASSERT_EQUAL_64(nzcv_expected, x0);
2831 ASSERT_EQUAL_64(nzcv_expected, x1);
2832 ASSERT_EQUAL_64(nzcv_expected, x2);
2833 }
2834 }
2835
2836 template <typename Tg, typename Tn, typename Td>
2837 static void PfirstHelper(Test* config,
2838 const Tg& pg_inputs,
2839 const Tn& pn_inputs,
2840 const Td& pd_expected) {
2841 PfirstPnextHelper(config,
2842 &MacroAssembler::Pfirst,
2843 kBRegSize, // pfirst only accepts B-sized lanes.
2844 pg_inputs,
2845 pn_inputs,
2846 pd_expected);
2847 }
2848
2849 template <typename Tg, typename Tn, typename Td>
2850 static void PnextHelper(Test* config,
2851 unsigned lane_size_in_bits,
2852 const Tg& pg_inputs,
2853 const Tn& pn_inputs,
2854 const Td& pd_expected) {
2855 PfirstPnextHelper(config,
2856 &MacroAssembler::Pnext,
2857 lane_size_in_bits,
2858 pg_inputs,
2859 pn_inputs,
2860 pd_expected);
2861 }
2862
2863 TEST_SVE(sve_pfirst) {
2864 // Provide more lanes than kPRegMinSize (to check propagation if we have a
2865 // large VL), but few enough to make the test easy to read.
2866 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2867 int in1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2868 int in2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2869 int in3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2870 int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2871 VIXL_ASSERT(ArrayLength(in0) > kPRegMinSize);
2872
2873 // Pfirst finds the first active lane in pg, and activates the corresponding
2874 // lane in pn (if it isn't already active).
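  // For example, if the only active lane of pg is lane 2 and the only active
  // lane of pn is lane 0, the result has lanes 0 and 2 active.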
2875
2876 // The first active lane in in1 is here. |
2877 // v
2878 int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
2879 int exp12[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0};
2880 int exp13[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2881 int exp14[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
2882 PfirstHelper(config, in1, in0, exp10);
2883 PfirstHelper(config, in1, in2, exp12);
2884 PfirstHelper(config, in1, in3, exp13);
2885 PfirstHelper(config, in1, in4, exp14);
2886
2887 // The first active lane in in2 is here. |
2888 // v
2889 int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
2890 int exp21[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0};
2891 int exp23[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2892 int exp24[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
2893 PfirstHelper(config, in2, in0, exp20);
2894 PfirstHelper(config, in2, in1, exp21);
2895 PfirstHelper(config, in2, in3, exp23);
2896 PfirstHelper(config, in2, in4, exp24);
2897
2898 // The first active lane in in3 is here. |
2899 // v
2900 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
2901 int exp31[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
2902 int exp32[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
2903 int exp34[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
2904 PfirstHelper(config, in3, in0, exp30);
2905 PfirstHelper(config, in3, in1, exp31);
2906 PfirstHelper(config, in3, in2, exp32);
2907 PfirstHelper(config, in3, in4, exp34);
2908
2909 // | The first active lane in in4 is here.
2910 // v
2911 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2912 int exp41[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2913 int exp42[] = {1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2914 int exp43[] = {1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2915 PfirstHelper(config, in4, in0, exp40);
2916 PfirstHelper(config, in4, in1, exp41);
2917 PfirstHelper(config, in4, in2, exp42);
2918 PfirstHelper(config, in4, in3, exp43);
2919
2920 // If pg is all inactive, the input is passed through unchanged.
2921 PfirstHelper(config, in0, in0, in0);
2922 PfirstHelper(config, in0, in1, in1);
2923 PfirstHelper(config, in0, in2, in2);
2924 PfirstHelper(config, in0, in3, in3);
2925
2926 // If the values of pg and pn match, the value is passed through unchanged.
2927 PfirstHelper(config, in0, in0, in0);
2928 PfirstHelper(config, in1, in1, in1);
2929 PfirstHelper(config, in2, in2, in2);
2930 PfirstHelper(config, in3, in3, in3);
2931 }
2932
2933 TEST_SVE(sve_pfirst_alias) {
2934 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2935 START();
2936
2937 // Check that the Simulator behaves correctly when all arguments are aliased.
2938 int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
2939 int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
2940 int in_s[] = {0, 1, 1, 0};
2941 int in_d[] = {1, 1};
2942
2943 Initialise(&masm, p0.VnB(), in_b);
2944 Initialise(&masm, p1.VnH(), in_h);
2945 Initialise(&masm, p2.VnS(), in_s);
2946 Initialise(&masm, p3.VnD(), in_d);
2947
2948 // Initialise NZCV to an impossible value, to check that we actually write it.
2949 __ Mov(x10, NZCVFlag);
2950
2951 __ Msr(NZCV, x10);
2952 __ Pfirst(p0.VnB(), p0, p0.VnB());
2953 __ Mrs(x0, NZCV);
2954
2955 __ Msr(NZCV, x10);
2956 __ Pfirst(p1.VnB(), p1, p1.VnB());
2957 __ Mrs(x1, NZCV);
2958
2959 __ Msr(NZCV, x10);
2960 __ Pfirst(p2.VnB(), p2, p2.VnB());
2961 __ Mrs(x2, NZCV);
2962
2963 __ Msr(NZCV, x10);
2964 __ Pfirst(p3.VnB(), p3, p3.VnB());
2965 __ Mrs(x3, NZCV);
2966
2967 END();
2968
2969 if (CAN_RUN()) {
2970 RUN();
2971
2972 // The first lane from pg is already active in pdn, so the P register should
2973 // be unchanged.
2974 ASSERT_EQUAL_SVE(in_b, p0.VnB());
2975 ASSERT_EQUAL_SVE(in_h, p1.VnH());
2976 ASSERT_EQUAL_SVE(in_s, p2.VnS());
2977 ASSERT_EQUAL_SVE(in_d, p3.VnD());
2978
2979 ASSERT_EQUAL_64(SVEFirstFlag, x0);
2980 ASSERT_EQUAL_64(SVEFirstFlag, x1);
2981 ASSERT_EQUAL_64(SVEFirstFlag, x2);
2982 ASSERT_EQUAL_64(SVEFirstFlag, x3);
2983 }
2984 }
2985
2986 TEST_SVE(sve_pnext_b) {
2987 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
2988 // (to check propagation if we have a large VL), but few enough to make the
2989 // test easy to read.
2990 // For now, we just use kPRegMinSize so that the test works anywhere.
2991 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2992 int in1[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2993 int in2[] = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2994 int in3[] = {0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1};
2995 int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2996
2997 // Pnext activates the next element that is true in pg, after the last-active
2998 // element in pn. If all pn elements are false (as in in0), it starts looking
2999 // at element 0.
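  // For example, if the last active lane of pn is lane 5 and pg has lanes 3, 7
  // and 9 active, the result has only lane 7 active; all other result lanes
  // are cleared.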
3000
3001 // There are no active lanes in in0, so the result is simply the first active
3002 // lane from pg.
3003 int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3004 int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
3005 int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
3006 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
3007 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3008
3009 // The last active lane in in1 is here. |
3010 // v
3011 int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3012 int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3013 int exp21[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3014 int exp31[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3015 int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3016
3017 // | The last active lane in in2 is here.
3018 // v
3019 int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3020 int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3021 int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3022 int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3023 int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3024
3025 // | The last active lane in in3 is here.
3026 // v
3027 int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3028 int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3029 int exp23[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3030 int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3031 int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3032
3033 // | The last active lane in in4 is here.
3034 // v
3035 int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3036 int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3037 int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3038 int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3039 int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3040
3041 PnextHelper(config, kBRegSize, in0, in0, exp00);
3042 PnextHelper(config, kBRegSize, in1, in0, exp10);
3043 PnextHelper(config, kBRegSize, in2, in0, exp20);
3044 PnextHelper(config, kBRegSize, in3, in0, exp30);
3045 PnextHelper(config, kBRegSize, in4, in0, exp40);
3046
3047 PnextHelper(config, kBRegSize, in0, in1, exp01);
3048 PnextHelper(config, kBRegSize, in1, in1, exp11);
3049 PnextHelper(config, kBRegSize, in2, in1, exp21);
3050 PnextHelper(config, kBRegSize, in3, in1, exp31);
3051 PnextHelper(config, kBRegSize, in4, in1, exp41);
3052
3053 PnextHelper(config, kBRegSize, in0, in2, exp02);
3054 PnextHelper(config, kBRegSize, in1, in2, exp12);
3055 PnextHelper(config, kBRegSize, in2, in2, exp22);
3056 PnextHelper(config, kBRegSize, in3, in2, exp32);
3057 PnextHelper(config, kBRegSize, in4, in2, exp42);
3058
3059 PnextHelper(config, kBRegSize, in0, in3, exp03);
3060 PnextHelper(config, kBRegSize, in1, in3, exp13);
3061 PnextHelper(config, kBRegSize, in2, in3, exp23);
3062 PnextHelper(config, kBRegSize, in3, in3, exp33);
3063 PnextHelper(config, kBRegSize, in4, in3, exp43);
3064
3065 PnextHelper(config, kBRegSize, in0, in4, exp04);
3066 PnextHelper(config, kBRegSize, in1, in4, exp14);
3067 PnextHelper(config, kBRegSize, in2, in4, exp24);
3068 PnextHelper(config, kBRegSize, in3, in4, exp34);
3069 PnextHelper(config, kBRegSize, in4, in4, exp44);
3070 }
3071
3072 TEST_SVE(sve_pnext_h) {
3073 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3074 // (to check propagation if we have a large VL), but few enough to make the
3075 // test easy to read.
3076 // For now, we just use kPRegMinSize so that the test works anywhere.
3077 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0};
3078 int in1[] = {0, 0, 0, 1, 0, 2, 1, 0};
3079 int in2[] = {0, 1, 2, 0, 2, 0, 2, 0};
3080 int in3[] = {0, 0, 0, 3, 0, 0, 0, 3};
3081 int in4[] = {3, 0, 0, 0, 0, 0, 0, 0};
3082
3083 // Pnext activates the next element that is true in pg, after the last-active
3084 // element in pn. If all pn elements are false (as in in0), it starts looking
3085 // at element 0.
3086 //
3087 // As for other SVE instructions, elements are only considered to be active if
3088 // the _first_ bit in each field is one. Other bits are ignored.
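  // For example, the value 2 (0b10) in in1 below has a zero in its lowest bit,
  // so that H lane is treated as inactive.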
3089
3090 // There are no active lanes in in0, so the result is simply the first active
3091 // lane from pg.
3092 int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0};
3093 int exp10[] = {0, 0, 0, 0, 0, 0, 1, 0};
3094 int exp20[] = {0, 1, 0, 0, 0, 0, 0, 0};
3095 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 1};
3096 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0};
3097
3098 // | The last active lane in in1 is here.
3099 // v
3100 int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0};
3101 int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0};
3102 int exp21[] = {0, 1, 0, 0, 0, 0, 0, 0};
3103 int exp31[] = {0, 0, 0, 0, 0, 0, 0, 0};
3104 int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0};
3105
3106 // | The last active lane in in2 is here.
3107 // v
3108 int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0};
3109 int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0};
3110 int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0};
3111 int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0};
3112 int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0};
3113
3114 // | The last active lane in in3 is here.
3115 // v
3116 int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0};
3117 int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0};
3118 int exp23[] = {0, 1, 0, 0, 0, 0, 0, 0};
3119 int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0};
3120 int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0};
3121
3122 // | The last active lane in in4 is here.
3123 // v
3124 int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0};
3125 int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0};
3126 int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0};
3127 int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0};
3128 int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0};
3129
3130 PnextHelper(config, kHRegSize, in0, in0, exp00);
3131 PnextHelper(config, kHRegSize, in1, in0, exp10);
3132 PnextHelper(config, kHRegSize, in2, in0, exp20);
3133 PnextHelper(config, kHRegSize, in3, in0, exp30);
3134 PnextHelper(config, kHRegSize, in4, in0, exp40);
3135
3136 PnextHelper(config, kHRegSize, in0, in1, exp01);
3137 PnextHelper(config, kHRegSize, in1, in1, exp11);
3138 PnextHelper(config, kHRegSize, in2, in1, exp21);
3139 PnextHelper(config, kHRegSize, in3, in1, exp31);
3140 PnextHelper(config, kHRegSize, in4, in1, exp41);
3141
3142 PnextHelper(config, kHRegSize, in0, in2, exp02);
3143 PnextHelper(config, kHRegSize, in1, in2, exp12);
3144 PnextHelper(config, kHRegSize, in2, in2, exp22);
3145 PnextHelper(config, kHRegSize, in3, in2, exp32);
3146 PnextHelper(config, kHRegSize, in4, in2, exp42);
3147
3148 PnextHelper(config, kHRegSize, in0, in3, exp03);
3149 PnextHelper(config, kHRegSize, in1, in3, exp13);
3150 PnextHelper(config, kHRegSize, in2, in3, exp23);
3151 PnextHelper(config, kHRegSize, in3, in3, exp33);
3152 PnextHelper(config, kHRegSize, in4, in3, exp43);
3153
3154 PnextHelper(config, kHRegSize, in0, in4, exp04);
3155 PnextHelper(config, kHRegSize, in1, in4, exp14);
3156 PnextHelper(config, kHRegSize, in2, in4, exp24);
3157 PnextHelper(config, kHRegSize, in3, in4, exp34);
3158 PnextHelper(config, kHRegSize, in4, in4, exp44);
3159 }
3160
3161 TEST_SVE(sve_pnext_s) {
3162 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3163 // (to check propagation if we have a large VL), but few enough to make the
3164 // test easy to read.
3165 // For now, we just use kPRegMinSize so that the test works anywhere.
3166 int in0[] = {0xe, 0xc, 0x8, 0x0};
3167 int in1[] = {0x0, 0x2, 0x0, 0x1};
3168 int in2[] = {0x0, 0x1, 0xf, 0x0};
3169 int in3[] = {0xf, 0x0, 0x0, 0x0};
3170
3171 // Pnext activates the next element that is true in pg, after the last-active
3172 // element in pn. If all pn elements are false (as in in0), it starts looking
3173 // at element 0.
3174 //
3175 // As for other SVE instructions, elements are only considered to be active if
3176 // the _first_ bit in each field is one. Other bits are ignored.
3177
3178 // There are no active lanes in in0, so the result is simply the first active
3179 // lane from pg.
3180 int exp00[] = {0, 0, 0, 0};
3181 int exp10[] = {0, 0, 0, 1};
3182 int exp20[] = {0, 0, 1, 0};
3183 int exp30[] = {1, 0, 0, 0};
3184
3185 // | The last active lane in in1 is here.
3186 // v
3187 int exp01[] = {0, 0, 0, 0};
3188 int exp11[] = {0, 0, 0, 0};
3189 int exp21[] = {0, 0, 1, 0};
3190 int exp31[] = {1, 0, 0, 0};
3191
3192 // | The last active lane in in2 is here.
3193 // v
3194 int exp02[] = {0, 0, 0, 0};
3195 int exp12[] = {0, 0, 0, 0};
3196 int exp22[] = {0, 0, 0, 0};
3197 int exp32[] = {1, 0, 0, 0};
3198
3199 // | The last active lane in in3 is here.
3200 // v
3201 int exp03[] = {0, 0, 0, 0};
3202 int exp13[] = {0, 0, 0, 0};
3203 int exp23[] = {0, 0, 0, 0};
3204 int exp33[] = {0, 0, 0, 0};
3205
3206 PnextHelper(config, kSRegSize, in0, in0, exp00);
3207 PnextHelper(config, kSRegSize, in1, in0, exp10);
3208 PnextHelper(config, kSRegSize, in2, in0, exp20);
3209 PnextHelper(config, kSRegSize, in3, in0, exp30);
3210
3211 PnextHelper(config, kSRegSize, in0, in1, exp01);
3212 PnextHelper(config, kSRegSize, in1, in1, exp11);
3213 PnextHelper(config, kSRegSize, in2, in1, exp21);
3214 PnextHelper(config, kSRegSize, in3, in1, exp31);
3215
3216 PnextHelper(config, kSRegSize, in0, in2, exp02);
3217 PnextHelper(config, kSRegSize, in1, in2, exp12);
3218 PnextHelper(config, kSRegSize, in2, in2, exp22);
3219 PnextHelper(config, kSRegSize, in3, in2, exp32);
3220
3221 PnextHelper(config, kSRegSize, in0, in3, exp03);
3222 PnextHelper(config, kSRegSize, in1, in3, exp13);
3223 PnextHelper(config, kSRegSize, in2, in3, exp23);
3224 PnextHelper(config, kSRegSize, in3, in3, exp33);
3225 }
3226
3227 TEST_SVE(sve_pnext_d) {
3228 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3229 // (to check propagation if we have a large VL), but few enough to make the
3230 // test easy to read.
3231 // For now, we just use kPRegMinSize so that the test works anywhere.
3232 int in0[] = {0xfe, 0xf0};
3233 int in1[] = {0x00, 0x55};
3234 int in2[] = {0x33, 0xff};
3235
3236 // Pnext activates the next element that is true in pg, after the last-active
3237 // element in pn. If all pn elements are false (as in in0), it starts looking
3238 // at element 0.
3239 //
3240 // As for other SVE instructions, elements are only considered to be active if
3241 // the _first_ bit in each field is one. Other bits are ignored.
3242
3243 // There are no active lanes in in0, so the result is simply the first active
3244 // lane from pg.
3245 int exp00[] = {0, 0};
3246 int exp10[] = {0, 1};
3247 int exp20[] = {0, 1};
3248
3249 // | The last active lane in in1 is here.
3250 // v
3251 int exp01[] = {0, 0};
3252 int exp11[] = {0, 0};
3253 int exp21[] = {1, 0};
3254
3255 // | The last active lane in in2 is here.
3256 // v
3257 int exp02[] = {0, 0};
3258 int exp12[] = {0, 0};
3259 int exp22[] = {0, 0};
3260
3261 PnextHelper(config, kDRegSize, in0, in0, exp00);
3262 PnextHelper(config, kDRegSize, in1, in0, exp10);
3263 PnextHelper(config, kDRegSize, in2, in0, exp20);
3264
3265 PnextHelper(config, kDRegSize, in0, in1, exp01);
3266 PnextHelper(config, kDRegSize, in1, in1, exp11);
3267 PnextHelper(config, kDRegSize, in2, in1, exp21);
3268
3269 PnextHelper(config, kDRegSize, in0, in2, exp02);
3270 PnextHelper(config, kDRegSize, in1, in2, exp12);
3271 PnextHelper(config, kDRegSize, in2, in2, exp22);
3272 }
3273
3274 TEST_SVE(sve_pnext_alias) {
3275 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3276 START();
3277
3278 // Check that the Simulator behaves correctly when all arguments are aliased.
3279 int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
3280 int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
3281 int in_s[] = {0, 1, 1, 0};
3282 int in_d[] = {1, 1};
3283
3284 Initialise(&masm, p0.VnB(), in_b);
3285 Initialise(&masm, p1.VnH(), in_h);
3286 Initialise(&masm, p2.VnS(), in_s);
3287 Initialise(&masm, p3.VnD(), in_d);
3288
3289 // Initialise NZCV to an impossible value, to check that we actually write it.
3290 __ Mov(x10, NZCVFlag);
3291
3292 __ Msr(NZCV, x10);
3293 __ Pnext(p0.VnB(), p0, p0.VnB());
3294 __ Mrs(x0, NZCV);
3295
3296 __ Msr(NZCV, x10);
3297 __ Pnext(p1.VnB(), p1, p1.VnB());
3298 __ Mrs(x1, NZCV);
3299
3300 __ Msr(NZCV, x10);
3301 __ Pnext(p2.VnB(), p2, p2.VnB());
3302 __ Mrs(x2, NZCV);
3303
3304 __ Msr(NZCV, x10);
3305 __ Pnext(p3.VnB(), p3, p3.VnB());
3306 __ Mrs(x3, NZCV);
3307
3308 END();
3309
3310 if (CAN_RUN()) {
3311 RUN();
3312
3313 // Since pg.Is(pdn), there can be no active lanes in pg above the last
3314 // active lane in pdn, so the result should always be zero.
3315 ASSERT_EQUAL_SVE(0, p0.VnB());
3316 ASSERT_EQUAL_SVE(0, p1.VnH());
3317 ASSERT_EQUAL_SVE(0, p2.VnS());
3318 ASSERT_EQUAL_SVE(0, p3.VnD());
3319
3320 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x0);
3321 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x1);
3322 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x2);
3323 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x3);
3324 }
3325 }
3326
3327 static void PtrueHelper(Test* config,
3328 unsigned lane_size_in_bits,
3329 FlagsUpdate s = LeaveFlags) {
3330 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3331 START();
3332
3333 PRegisterWithLaneSize p[kNumberOfPRegisters];
3334 for (unsigned i = 0; i < kNumberOfPRegisters; i++) {
3335 p[i] = PRegister(i).WithLaneSize(lane_size_in_bits);
3336 }
3337
3338 // Initialise NZCV to an impossible value, to check that we actually write it.
3339 StatusFlags nzcv_unmodified = NZCVFlag;
3340 __ Mov(x20, nzcv_unmodified);
3341
3342 // We don't have enough registers to conveniently test every pattern, so take
3343 // samples from each group.
3344 __ Msr(NZCV, x20);
3345 __ Ptrue(p[0], SVE_POW2, s);
3346 __ Mrs(x0, NZCV);
3347
3348 __ Msr(NZCV, x20);
3349 __ Ptrue(p[1], SVE_VL1, s);
3350 __ Mrs(x1, NZCV);
3351
3352 __ Msr(NZCV, x20);
3353 __ Ptrue(p[2], SVE_VL2, s);
3354 __ Mrs(x2, NZCV);
3355
3356 __ Msr(NZCV, x20);
3357 __ Ptrue(p[3], SVE_VL5, s);
3358 __ Mrs(x3, NZCV);
3359
3360 __ Msr(NZCV, x20);
3361 __ Ptrue(p[4], SVE_VL6, s);
3362 __ Mrs(x4, NZCV);
3363
3364 __ Msr(NZCV, x20);
3365 __ Ptrue(p[5], SVE_VL8, s);
3366 __ Mrs(x5, NZCV);
3367
3368 __ Msr(NZCV, x20);
3369 __ Ptrue(p[6], SVE_VL16, s);
3370 __ Mrs(x6, NZCV);
3371
3372 __ Msr(NZCV, x20);
3373 __ Ptrue(p[7], SVE_VL64, s);
3374 __ Mrs(x7, NZCV);
3375
3376 __ Msr(NZCV, x20);
3377 __ Ptrue(p[8], SVE_VL256, s);
3378 __ Mrs(x8, NZCV);
3379
3380 {
3381 // We have to use the Assembler directly for pattern values that are not
3382 // defined by SVEPredicateConstraint, so call `ptrue` or `ptrues` here.
3383 typedef void (
3384 MacroAssembler::*AssemblePtrueFn)(const PRegisterWithLaneSize& pd,
3385 int pattern);
3386 AssemblePtrueFn assemble = &MacroAssembler::ptrue;
3387 if (s == SetFlags) {
3388 assemble = &MacroAssembler::ptrues;
3389 }
3390
3391 ExactAssemblyScope guard(&masm, 12 * kInstructionSize);
3392 __ msr(NZCV, x20);
3393 (masm.*assemble)(p[9], 0xe);
3394 __ mrs(x9, NZCV);
3395
3396 __ msr(NZCV, x20);
3397 (masm.*assemble)(p[10], 0x16);
3398 __ mrs(x10, NZCV);
3399
3400 __ msr(NZCV, x20);
3401 (masm.*assemble)(p[11], 0x1a);
3402 __ mrs(x11, NZCV);
3403
3404 __ msr(NZCV, x20);
3405 (masm.*assemble)(p[12], 0x1c);
3406 __ mrs(x12, NZCV);
3407 }
3408
3409 __ Msr(NZCV, x20);
3410 __ Ptrue(p[13], SVE_MUL4, s);
3411 __ Mrs(x13, NZCV);
3412
3413 __ Msr(NZCV, x20);
3414 __ Ptrue(p[14], SVE_MUL3, s);
3415 __ Mrs(x14, NZCV);
3416
3417 __ Msr(NZCV, x20);
3418 __ Ptrue(p[15], SVE_ALL, s);
3419 __ Mrs(x15, NZCV);
3420
3421 END();
3422
3423 if (CAN_RUN()) {
3424 RUN();
3425
3426 int all = core.GetSVELaneCount(lane_size_in_bits);
3427 int pow2 = 1 << HighestSetBitPosition(all);
3428 int mul4 = all - (all % 4);
3429 int mul3 = all - (all % 3);
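// As a concrete illustration (not an extra check): with 20 S lanes (a 640-bit
// VL), pow2 = 16, mul4 = 20 and mul3 = 18.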
3430
3431 // Check P register results.
3432 for (int i = 0; i < all; i++) {
3433 ASSERT_EQUAL_SVE_LANE(i < pow2, p[0], i);
3434 ASSERT_EQUAL_SVE_LANE((all >= 1) && (i < 1), p[1], i);
3435 ASSERT_EQUAL_SVE_LANE((all >= 2) && (i < 2), p[2], i);
3436 ASSERT_EQUAL_SVE_LANE((all >= 5) && (i < 5), p[3], i);
3437 ASSERT_EQUAL_SVE_LANE((all >= 6) && (i < 6), p[4], i);
3438 ASSERT_EQUAL_SVE_LANE((all >= 8) && (i < 8), p[5], i);
3439 ASSERT_EQUAL_SVE_LANE((all >= 16) && (i < 16), p[6], i);
3440 ASSERT_EQUAL_SVE_LANE((all >= 64) && (i < 64), p[7], i);
3441 ASSERT_EQUAL_SVE_LANE((all >= 256) && (i < 256), p[8], i);
3442 ASSERT_EQUAL_SVE_LANE(false, p[9], i);
3443 ASSERT_EQUAL_SVE_LANE(false, p[10], i);
3444 ASSERT_EQUAL_SVE_LANE(false, p[11], i);
3445 ASSERT_EQUAL_SVE_LANE(false, p[12], i);
3446 ASSERT_EQUAL_SVE_LANE(i < mul4, p[13], i);
3447 ASSERT_EQUAL_SVE_LANE(i < mul3, p[14], i);
3448 ASSERT_EQUAL_SVE_LANE(true, p[15], i);
3449 }
3450
3451 // Check NZCV results.
3452 if (s == LeaveFlags) {
3453 // No flags should have been updated.
3454 for (int i = 0; i <= 15; i++) {
3455 ASSERT_EQUAL_64(nzcv_unmodified, XRegister(i));
3456 }
3457 } else {
3458 StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
3459 StatusFlags nonzero = SVEFirstFlag;
3460
3461 // POW2
3462 ASSERT_EQUAL_64(nonzero, x0);
3463 // VL*
3464 ASSERT_EQUAL_64((all >= 1) ? nonzero : zero, x1);
3465 ASSERT_EQUAL_64((all >= 2) ? nonzero : zero, x2);
3466 ASSERT_EQUAL_64((all >= 5) ? nonzero : zero, x3);
3467 ASSERT_EQUAL_64((all >= 6) ? nonzero : zero, x4);
3468 ASSERT_EQUAL_64((all >= 8) ? nonzero : zero, x5);
3469 ASSERT_EQUAL_64((all >= 16) ? nonzero : zero, x6);
3470 ASSERT_EQUAL_64((all >= 64) ? nonzero : zero, x7);
3471 ASSERT_EQUAL_64((all >= 256) ? nonzero : zero, x8);
3472 // #uimm5
3473 ASSERT_EQUAL_64(zero, x9);
3474 ASSERT_EQUAL_64(zero, x10);
3475 ASSERT_EQUAL_64(zero, x11);
3476 ASSERT_EQUAL_64(zero, x12);
3477 // MUL*
3478 ASSERT_EQUAL_64((all >= 4) ? nonzero : zero, x13);
3479 ASSERT_EQUAL_64((all >= 3) ? nonzero : zero, x14);
3480 // ALL
3481 ASSERT_EQUAL_64(nonzero, x15);
3482 }
3483 }
3484 }
3485
3486 TEST_SVE(sve_ptrue_b) { PtrueHelper(config, kBRegSize, LeaveFlags); }
3487 TEST_SVE(sve_ptrue_h) { PtrueHelper(config, kHRegSize, LeaveFlags); }
3488 TEST_SVE(sve_ptrue_s) { PtrueHelper(config, kSRegSize, LeaveFlags); }
3489 TEST_SVE(sve_ptrue_d) { PtrueHelper(config, kDRegSize, LeaveFlags); }
3490
3491 TEST_SVE(sve_ptrues_b) { PtrueHelper(config, kBRegSize, SetFlags); }
3492 TEST_SVE(sve_ptrues_h) { PtrueHelper(config, kHRegSize, SetFlags); }
3493 TEST_SVE(sve_ptrues_s) { PtrueHelper(config, kSRegSize, SetFlags); }
3494 TEST_SVE(sve_ptrues_d) { PtrueHelper(config, kDRegSize, SetFlags); }
3495
3496 TEST_SVE(sve_pfalse) {
3497 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3498 START();
3499
3500 // Initialise non-zero inputs.
3501 __ Ptrue(p0.VnB());
3502 __ Ptrue(p1.VnH());
3503 __ Ptrue(p2.VnS());
3504 __ Ptrue(p3.VnD());
3505
3506 // The instruction only supports B-sized lanes, but the lane size has no
3507 // logical effect, so the MacroAssembler accepts anything.
3508 __ Pfalse(p0.VnB());
3509 __ Pfalse(p1.VnH());
3510 __ Pfalse(p2.VnS());
3511 __ Pfalse(p3.VnD());
3512
3513 END();
3514
3515 if (CAN_RUN()) {
3516 RUN();
3517
3518 ASSERT_EQUAL_SVE(0, p0.VnB());
3519 ASSERT_EQUAL_SVE(0, p1.VnB());
3520 ASSERT_EQUAL_SVE(0, p2.VnB());
3521 ASSERT_EQUAL_SVE(0, p3.VnB());
3522 }
3523 }
3524
3525 TEST_SVE(sve_ptest) {
3526 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3527 START();
3528
3529 // Initialise NZCV to a known (impossible) value.
3530 StatusFlags nzcv_unmodified = NZCVFlag;
3531 __ Mov(x0, nzcv_unmodified);
3532 __ Msr(NZCV, x0);
3533
3534 // Construct some test inputs.
3535 int in2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0};
3536 int in3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0};
3537 int in4[] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0};
3538 __ Pfalse(p0.VnB());
3539 __ Ptrue(p1.VnB());
3540 Initialise(&masm, p2.VnB(), in2);
3541 Initialise(&masm, p3.VnB(), in3);
3542 Initialise(&masm, p4.VnB(), in4);
3543
3544 // All-inactive pg.
3545 __ Ptest(p0, p0.VnB());
3546 __ Mrs(x0, NZCV);
3547 __ Ptest(p0, p1.VnB());
3548 __ Mrs(x1, NZCV);
3549 __ Ptest(p0, p2.VnB());
3550 __ Mrs(x2, NZCV);
3551 __ Ptest(p0, p3.VnB());
3552 __ Mrs(x3, NZCV);
3553 __ Ptest(p0, p4.VnB());
3554 __ Mrs(x4, NZCV);
3555
3556 // All-active pg.
3557 __ Ptest(p1, p0.VnB());
3558 __ Mrs(x5, NZCV);
3559 __ Ptest(p1, p1.VnB());
3560 __ Mrs(x6, NZCV);
3561 __ Ptest(p1, p2.VnB());
3562 __ Mrs(x7, NZCV);
3563 __ Ptest(p1, p3.VnB());
3564 __ Mrs(x8, NZCV);
3565 __ Ptest(p1, p4.VnB());
3566 __ Mrs(x9, NZCV);
3567
3568 // Combinations of other inputs.
3569 __ Ptest(p2, p2.VnB());
3570 __ Mrs(x20, NZCV);
3571 __ Ptest(p2, p3.VnB());
3572 __ Mrs(x21, NZCV);
3573 __ Ptest(p2, p4.VnB());
3574 __ Mrs(x22, NZCV);
3575 __ Ptest(p3, p2.VnB());
3576 __ Mrs(x23, NZCV);
3577 __ Ptest(p3, p3.VnB());
3578 __ Mrs(x24, NZCV);
3579 __ Ptest(p3, p4.VnB());
3580 __ Mrs(x25, NZCV);
3581 __ Ptest(p4, p2.VnB());
3582 __ Mrs(x26, NZCV);
3583 __ Ptest(p4, p3.VnB());
3584 __ Mrs(x27, NZCV);
3585 __ Ptest(p4, p4.VnB());
3586 __ Mrs(x28, NZCV);
3587
3588 END();
3589
3590 if (CAN_RUN()) {
3591 RUN();
3592
3593 StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
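// In these expectations, SVEFirstFlag, SVENoneFlag and SVENotLastFlag are
// assumed to alias the N, Z and C condition flags respectively, matching the
// PTEST flag-setting behaviour.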
3594
3595 // If pg is all inactive, the value of pn is irrelevant.
3596 ASSERT_EQUAL_64(zero, x0);
3597 ASSERT_EQUAL_64(zero, x1);
3598 ASSERT_EQUAL_64(zero, x2);
3599 ASSERT_EQUAL_64(zero, x3);
3600 ASSERT_EQUAL_64(zero, x4);
3601
3602 // All-active pg.
3603 ASSERT_EQUAL_64(zero, x5); // All-inactive pn.
3604 ASSERT_EQUAL_64(SVEFirstFlag, x6); // All-active pn.
3605 // Other pn inputs are non-zero, but the first and last lanes are inactive.
3606 ASSERT_EQUAL_64(SVENotLastFlag, x7);
3607 ASSERT_EQUAL_64(SVENotLastFlag, x8);
3608 ASSERT_EQUAL_64(SVENotLastFlag, x9);
3609
3610 // Other inputs.
3611 ASSERT_EQUAL_64(SVEFirstFlag, x20); // pg: in2, pn: in2
3612 ASSERT_EQUAL_64(NoFlag, x21); // pg: in2, pn: in3
3613 ASSERT_EQUAL_64(zero, x22); // pg: in2, pn: in4
3614 ASSERT_EQUAL_64(static_cast<StatusFlags>(SVEFirstFlag | SVENotLastFlag),
3615 x23); // pg: in3, pn: in2
3616 ASSERT_EQUAL_64(SVEFirstFlag, x24); // pg: in3, pn: in3
3617 ASSERT_EQUAL_64(zero, x25); // pg: in3, pn: in4
3618 ASSERT_EQUAL_64(zero, x26); // pg: in4, pn: in2
3619 ASSERT_EQUAL_64(zero, x27); // pg: in4, pn: in3
3620 ASSERT_EQUAL_64(SVEFirstFlag, x28); // pg: in4, pn: in4
3621 }
3622 }
3623
3624 TEST_SVE(sve_cntp) {
3625 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3626 START();
3627
3628 // There are {7, 5, 2, 1} active {B, H, S, D} lanes.
3629 int p0_inputs[] = {0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
3630 Initialise(&masm, p0.VnB(), p0_inputs);
3631
3632 // With an all-true predicate, these instructions measure the vector length.
3633 __ Ptrue(p10.VnB());
3634 __ Ptrue(p11.VnH());
3635 __ Ptrue(p12.VnS());
3636 __ Ptrue(p13.VnD());
3637
3638 // `ptrue p10.b` provides an all-active pg.
3639 __ Cntp(x10, p10, p10.VnB());
3640 __ Cntp(x11, p10, p11.VnH());
3641 __ Cntp(x12, p10, p12.VnS());
3642 __ Cntp(x13, p10, p13.VnD());
3643
3644 // Check that the predicate mask is applied properly.
3645 __ Cntp(x14, p10, p10.VnB());
3646 __ Cntp(x15, p11, p10.VnB());
3647 __ Cntp(x16, p12, p10.VnB());
3648 __ Cntp(x17, p13, p10.VnB());
3649
3650 // Check other patterns (including some ignored bits).
3651 __ Cntp(x0, p10, p0.VnB());
3652 __ Cntp(x1, p10, p0.VnH());
3653 __ Cntp(x2, p10, p0.VnS());
3654 __ Cntp(x3, p10, p0.VnD());
3655 __ Cntp(x4, p0, p10.VnB());
3656 __ Cntp(x5, p0, p10.VnH());
3657 __ Cntp(x6, p0, p10.VnS());
3658 __ Cntp(x7, p0, p10.VnD());
3659
3660 END();
3661
3662 if (CAN_RUN()) {
3663 RUN();
3664
3665 int vl_b = core.GetSVELaneCount(kBRegSize);
3666 int vl_h = core.GetSVELaneCount(kHRegSize);
3667 int vl_s = core.GetSVELaneCount(kSRegSize);
3668 int vl_d = core.GetSVELaneCount(kDRegSize);
3669
3670 // Check all-active predicates in various combinations.
3671 ASSERT_EQUAL_64(vl_b, x10);
3672 ASSERT_EQUAL_64(vl_h, x11);
3673 ASSERT_EQUAL_64(vl_s, x12);
3674 ASSERT_EQUAL_64(vl_d, x13);
3675
3676 ASSERT_EQUAL_64(vl_b, x14);
3677 ASSERT_EQUAL_64(vl_h, x15);
3678 ASSERT_EQUAL_64(vl_s, x16);
3679 ASSERT_EQUAL_64(vl_d, x17);
3680
3681 // Check that irrelevant bits are properly ignored.
3682 ASSERT_EQUAL_64(7, x0);
3683 ASSERT_EQUAL_64(5, x1);
3684 ASSERT_EQUAL_64(2, x2);
3685 ASSERT_EQUAL_64(1, x3);
3686
3687 ASSERT_EQUAL_64(7, x4);
3688 ASSERT_EQUAL_64(5, x5);
3689 ASSERT_EQUAL_64(2, x6);
3690 ASSERT_EQUAL_64(1, x7);
3691 }
3692 }
3693
3694 typedef void (MacroAssembler::*CntFn)(const Register& dst,
3695 int pattern,
3696 int multiplier);
3697
3698 template <typename T>
3699 void GenerateCntSequence(MacroAssembler* masm,
3700 CntFn cnt,
3701 T acc_value,
3702 int multiplier) {
3703 // Initialise accumulators.
3704 masm->Mov(x0, acc_value);
3705 masm->Mov(x1, acc_value);
3706 masm->Mov(x2, acc_value);
3707 masm->Mov(x3, acc_value);
3708 masm->Mov(x4, acc_value);
3709 masm->Mov(x5, acc_value);
3710 masm->Mov(x6, acc_value);
3711 masm->Mov(x7, acc_value);
3712 masm->Mov(x8, acc_value);
3713 masm->Mov(x9, acc_value);
3714 masm->Mov(x10, acc_value);
3715 masm->Mov(x11, acc_value);
3716 masm->Mov(x12, acc_value);
3717 masm->Mov(x13, acc_value);
3718 masm->Mov(x14, acc_value);
3719 masm->Mov(x15, acc_value);
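// x16 and x17 are skipped here; the MacroAssembler reserves them as scratch
// registers (ip0 and ip1).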
3720 masm->Mov(x18, acc_value);
3721 masm->Mov(x19, acc_value);
3722 masm->Mov(x20, acc_value);
3723 masm->Mov(x21, acc_value);
3724
3725 (masm->*cnt)(Register(0, sizeof(T) * kBitsPerByte), SVE_POW2, multiplier);
3726 (masm->*cnt)(Register(1, sizeof(T) * kBitsPerByte), SVE_VL1, multiplier);
3727 (masm->*cnt)(Register(2, sizeof(T) * kBitsPerByte), SVE_VL2, multiplier);
3728 (masm->*cnt)(Register(3, sizeof(T) * kBitsPerByte), SVE_VL3, multiplier);
3729 (masm->*cnt)(Register(4, sizeof(T) * kBitsPerByte), SVE_VL4, multiplier);
3730 (masm->*cnt)(Register(5, sizeof(T) * kBitsPerByte), SVE_VL5, multiplier);
3731 (masm->*cnt)(Register(6, sizeof(T) * kBitsPerByte), SVE_VL6, multiplier);
3732 (masm->*cnt)(Register(7, sizeof(T) * kBitsPerByte), SVE_VL7, multiplier);
3733 (masm->*cnt)(Register(8, sizeof(T) * kBitsPerByte), SVE_VL8, multiplier);
3734 (masm->*cnt)(Register(9, sizeof(T) * kBitsPerByte), SVE_VL16, multiplier);
3735 (masm->*cnt)(Register(10, sizeof(T) * kBitsPerByte), SVE_VL32, multiplier);
3736 (masm->*cnt)(Register(11, sizeof(T) * kBitsPerByte), SVE_VL64, multiplier);
3737 (masm->*cnt)(Register(12, sizeof(T) * kBitsPerByte), SVE_VL128, multiplier);
3738 (masm->*cnt)(Register(13, sizeof(T) * kBitsPerByte), SVE_VL256, multiplier);
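// The next three patterns (16, 23 and 28) are unallocated encodings. They are
// expected to count zero elements, leaving the accumulators unchanged (see the
// x14, x15 and x18 checks in the helpers below).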
3739 (masm->*cnt)(Register(14, sizeof(T) * kBitsPerByte), 16, multiplier);
3740 (masm->*cnt)(Register(15, sizeof(T) * kBitsPerByte), 23, multiplier);
3741 (masm->*cnt)(Register(18, sizeof(T) * kBitsPerByte), 28, multiplier);
3742 (masm->*cnt)(Register(19, sizeof(T) * kBitsPerByte), SVE_MUL4, multiplier);
3743 (masm->*cnt)(Register(20, sizeof(T) * kBitsPerByte), SVE_MUL3, multiplier);
3744 (masm->*cnt)(Register(21, sizeof(T) * kBitsPerByte), SVE_ALL, multiplier);
3745 }
3746
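// FixedVL returns `fixed` when the current vector provides at least `fixed`
// lanes of the relevant size, and 0 otherwise, mirroring the architectural
// VL<n> patterns. For example, FixedVL(64, all) is 0 unless the vector has at
// least 64 lanes.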
3747 int FixedVL(int fixed, int length) {
3748 VIXL_ASSERT(((fixed >= 1) && (fixed <= 8)) || (fixed == 16) ||
3749 (fixed == 32) || (fixed == 64) || (fixed == 128) ||
3750 (fixed == 256));
3751 return (length >= fixed) ? fixed : 0;
3752 }
3753
3754 static void CntHelper(Test* config,
3755 CntFn cnt,
3756 int multiplier,
3757 int lane_size_in_bits,
3758 int64_t acc_value = 0,
3759 bool is_increment = true) {
3760 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3761 START();
3762 GenerateCntSequence(&masm, cnt, acc_value, multiplier);
3763 END();
3764
3765 if (CAN_RUN()) {
3766 RUN();
3767
3768 int all = core.GetSVELaneCount(lane_size_in_bits);
3769 int pow2 = 1 << HighestSetBitPosition(all);
3770 int mul4 = all - (all % 4);
3771 int mul3 = all - (all % 3);
3772
3773 multiplier = is_increment ? multiplier : -multiplier;
3774
3775 ASSERT_EQUAL_64(acc_value + (multiplier * pow2), x0);
3776 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(1, all)), x1);
3777 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(2, all)), x2);
3778 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(3, all)), x3);
3779 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(4, all)), x4);
3780 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(5, all)), x5);
3781 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(6, all)), x6);
3782 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(7, all)), x7);
3783 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(8, all)), x8);
3784 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(16, all)), x9);
3785 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(32, all)), x10);
3786 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(64, all)), x11);
3787 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(128, all)), x12);
3788 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(256, all)), x13);
3789 ASSERT_EQUAL_64(acc_value, x14);
3790 ASSERT_EQUAL_64(acc_value, x15);
3791 ASSERT_EQUAL_64(acc_value, x18);
3792 ASSERT_EQUAL_64(acc_value + (multiplier * mul4), x19);
3793 ASSERT_EQUAL_64(acc_value + (multiplier * mul3), x20);
3794 ASSERT_EQUAL_64(acc_value + (multiplier * all), x21);
3795 }
3796 }
3797
3798 static void IncHelper(Test* config,
3799 CntFn cnt,
3800 int multiplier,
3801 int lane_size_in_bits,
3802 int64_t acc_value) {
3803 CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
3804 }
3805
3806 static void DecHelper(Test* config,
3807 CntFn cnt,
3808 int multiplier,
3809 int lane_size_in_bits,
3810 int64_t acc_value) {
3811 CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
3812 }
3813
3814 TEST_SVE(sve_cntb) {
3815 CntHelper(config, &MacroAssembler::Cntb, 1, kBRegSize);
3816 CntHelper(config, &MacroAssembler::Cntb, 2, kBRegSize);
3817 CntHelper(config, &MacroAssembler::Cntb, 15, kBRegSize);
3818 CntHelper(config, &MacroAssembler::Cntb, 16, kBRegSize);
3819 }
3820
3821 TEST_SVE(sve_cnth) {
3822 CntHelper(config, &MacroAssembler::Cnth, 1, kHRegSize);
3823 CntHelper(config, &MacroAssembler::Cnth, 2, kHRegSize);
3824 CntHelper(config, &MacroAssembler::Cnth, 15, kHRegSize);
3825 CntHelper(config, &MacroAssembler::Cnth, 16, kHRegSize);
3826 }
3827
3828 TEST_SVE(sve_cntw) {
3829 CntHelper(config, &MacroAssembler::Cntw, 1, kWRegSize);
3830 CntHelper(config, &MacroAssembler::Cntw, 2, kWRegSize);
3831 CntHelper(config, &MacroAssembler::Cntw, 15, kWRegSize);
3832 CntHelper(config, &MacroAssembler::Cntw, 16, kWRegSize);
3833 }
3834
3835 TEST_SVE(sve_cntd) {
3836 CntHelper(config, &MacroAssembler::Cntd, 1, kDRegSize);
3837 CntHelper(config, &MacroAssembler::Cntd, 2, kDRegSize);
3838 CntHelper(config, &MacroAssembler::Cntd, 15, kDRegSize);
3839 CntHelper(config, &MacroAssembler::Cntd, 16, kDRegSize);
3840 }
3841
3842 TEST_SVE(sve_decb) {
3843 DecHelper(config, &MacroAssembler::Decb, 1, kBRegSize, 42);
3844 DecHelper(config, &MacroAssembler::Decb, 2, kBRegSize, -1);
3845 DecHelper(config, &MacroAssembler::Decb, 15, kBRegSize, INT64_MIN);
3846 DecHelper(config, &MacroAssembler::Decb, 16, kBRegSize, -42);
3847 }
3848
3849 TEST_SVE(sve_dech) {
3850 DecHelper(config, &MacroAssembler::Dech, 1, kHRegSize, 42);
3851 DecHelper(config, &MacroAssembler::Dech, 2, kHRegSize, -1);
3852 DecHelper(config, &MacroAssembler::Dech, 15, kHRegSize, INT64_MIN);
3853 DecHelper(config, &MacroAssembler::Dech, 16, kHRegSize, -42);
3854 }
3855
3856 TEST_SVE(sve_decw) {
3857 DecHelper(config, &MacroAssembler::Decw, 1, kWRegSize, 42);
3858 DecHelper(config, &MacroAssembler::Decw, 2, kWRegSize, -1);
3859 DecHelper(config, &MacroAssembler::Decw, 15, kWRegSize, INT64_MIN);
3860 DecHelper(config, &MacroAssembler::Decw, 16, kWRegSize, -42);
3861 }
3862
3863 TEST_SVE(sve_decd) {
3864 DecHelper(config, &MacroAssembler::Decd, 1, kDRegSize, 42);
3865 DecHelper(config, &MacroAssembler::Decd, 2, kDRegSize, -1);
3866 DecHelper(config, &MacroAssembler::Decd, 15, kDRegSize, INT64_MIN);
3867 DecHelper(config, &MacroAssembler::Decd, 16, kDRegSize, -42);
3868 }
3869
3870 TEST_SVE(sve_incb) {
3871 IncHelper(config, &MacroAssembler::Incb, 1, kBRegSize, 42);
3872 IncHelper(config, &MacroAssembler::Incb, 2, kBRegSize, -1);
3873 IncHelper(config, &MacroAssembler::Incb, 15, kBRegSize, INT64_MAX);
3874 IncHelper(config, &MacroAssembler::Incb, 16, kBRegSize, -42);
3875 }
3876
3877 TEST_SVE(sve_inch) {
3878 IncHelper(config, &MacroAssembler::Inch, 1, kHRegSize, 42);
3879 IncHelper(config, &MacroAssembler::Inch, 2, kHRegSize, -1);
3880 IncHelper(config, &MacroAssembler::Inch, 15, kHRegSize, INT64_MAX);
3881 IncHelper(config, &MacroAssembler::Inch, 16, kHRegSize, -42);
3882 }
3883
3884 TEST_SVE(sve_incw) {
3885 IncHelper(config, &MacroAssembler::Incw, 1, kWRegSize, 42);
3886 IncHelper(config, &MacroAssembler::Incw, 2, kWRegSize, -1);
3887 IncHelper(config, &MacroAssembler::Incw, 15, kWRegSize, INT64_MAX);
3888 IncHelper(config, &MacroAssembler::Incw, 16, kWRegSize, -42);
3889 }
3890
3891 TEST_SVE(sve_incd) {
3892 IncHelper(config, &MacroAssembler::Incd, 1, kDRegSize, 42);
3893 IncHelper(config, &MacroAssembler::Incd, 2, kDRegSize, -1);
3894 IncHelper(config, &MacroAssembler::Incd, 15, kDRegSize, INT64_MAX);
3895 IncHelper(config, &MacroAssembler::Incd, 16, kDRegSize, -42);
3896 }
3897
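// Saturating add, used to model the expected results of the sq/uq counting
// instructions. For illustration only: with T = int64_t,
// QAdd(INT64_MAX - 42, 100) saturates to INT64_MAX, and with T = uint32_t,
// QAdd<uint32_t>(1, -100) saturates to 0.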
3898 template <typename T>
3899 static T QAdd(T x, int y) {
3900 VIXL_ASSERT(y > INT_MIN);
3901 T result;
3902 T min = std::numeric_limits<T>::min();
3903 T max = std::numeric_limits<T>::max();
3904 if ((x >= 0) && (y >= 0)) {
3905 // For non-negative x and y, saturate at max.
3906 result = (max - x) < static_cast<T>(y) ? max : x + y;
3907 } else if ((y < 0) && ((x < 0) || (min == 0))) {
3908 // For negative y, when x is negative or T is unsigned, saturate at min.
3909 result = (x - min) < static_cast<T>(-y) ? min : x + y;
3910 } else {
3911 result = x + y;
3912 }
3913 return result;
3914 }
3915
3916 template <typename T>
3917 static void QIncDecHelper(Test* config,
3918 CntFn cnt,
3919 int multiplier,
3920 int lane_size_in_bits,
3921 T acc_value,
3922 bool is_increment) {
3923 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3924 START();
3925 GenerateCntSequence(&masm, cnt, acc_value, multiplier);
3926 END();
3927
3928 if (CAN_RUN()) {
3929 RUN();
3930
3931 int all = core.GetSVELaneCount(lane_size_in_bits);
3932 int pow2 = 1 << HighestSetBitPosition(all);
3933 int mul4 = all - (all % 4);
3934 int mul3 = all - (all % 3);
3935
3936 multiplier = is_increment ? multiplier : -multiplier;
3937
3938 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
3939 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
3940 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
3941 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
3942 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
3943 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
3944 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
3945 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
3946 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
3947 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
3948 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
3949 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
3950 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
3951 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
3952 ASSERT_EQUAL_64(acc_value, x14);
3953 ASSERT_EQUAL_64(acc_value, x15);
3954 ASSERT_EQUAL_64(acc_value, x18);
3955 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
3956 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
3957 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
3958 }
3959 }
3960
3961 template <typename T>
3962 static void QIncHelper(Test* config,
3963 CntFn cnt,
3964 int multiplier,
3965 int lane_size_in_bits,
3966 T acc_value) {
3967 QIncDecHelper<T>(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
3968 }
3969
3970 template <typename T>
3971 static void QDecHelper(Test* config,
3972 CntFn cnt,
3973 int multiplier,
3974 int lane_size_in_bits,
3975 T acc_value) {
3976 QIncDecHelper<T>(config,
3977 cnt,
3978 multiplier,
3979 lane_size_in_bits,
3980 acc_value,
3981 false);
3982 }
3983
3984 TEST_SVE(sve_sqdecb) {
3985 int64_t bigneg = INT64_MIN + 42;
3986 int64_t bigpos = INT64_MAX - 42;
3987 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
3988 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 2, kBRegSize, bigneg);
3989 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
3990 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 16, kBRegSize, bigpos);
3991 }
3992
3993 TEST_SVE(sve_sqdech) {
3994 int64_t bigneg = INT64_MIN + 42;
3995 int64_t bigpos = INT64_MAX - 42;
3996 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
3997 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 2, kHRegSize, bigneg);
3998 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
3999 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 16, kHRegSize, bigpos);
4000 }
4001
4002 TEST_SVE(sve_sqdecw) {
4003 int64_t bigneg = INT64_MIN + 42;
4004 int64_t bigpos = INT64_MAX - 42;
4005 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4006 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 2, kWRegSize, bigneg);
4007 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4008 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 16, kWRegSize, bigpos);
4009 }
4010
4011 TEST_SVE(sve_sqdecd) {
4012 int64_t bigneg = INT64_MIN + 42;
4013 int64_t bigpos = INT64_MAX - 42;
4014 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4015 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 2, kDRegSize, bigneg);
4016 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4017 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 16, kDRegSize, bigpos);
4018 }
4019
4020 TEST_SVE(sve_sqincb) {
4021 int64_t bigneg = INT64_MIN + 42;
4022 int64_t bigpos = INT64_MAX - 42;
4023 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4024 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 2, kBRegSize, bigneg);
4025 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4026 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 16, kBRegSize, bigpos);
4027 }
4028
4029 TEST_SVE(sve_sqinch) {
4030 int64_t bigneg = INT64_MIN + 42;
4031 int64_t bigpos = INT64_MAX - 42;
4032 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4033 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 2, kHRegSize, bigneg);
4034 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4035 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 16, kHRegSize, bigpos);
4036 }
4037
4038 TEST_SVE(sve_sqincw) {
4039 int64_t bigneg = INT64_MIN + 42;
4040 int64_t bigpos = INT64_MAX - 42;
4041 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4042 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 2, kWRegSize, bigneg);
4043 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4044 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 16, kWRegSize, bigpos);
4045 }
4046
4047 TEST_SVE(sve_sqincd) {
4048 int64_t bigneg = INT64_MIN + 42;
4049 int64_t bigpos = INT64_MAX - 42;
4050 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4051 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 2, kDRegSize, bigneg);
4052 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4053 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 16, kDRegSize, bigpos);
4054 }
4055
4056 TEST_SVE(sve_uqdecb) {
4057 uint32_t big32 = UINT32_MAX - 42;
4058 uint64_t big64 = UINT64_MAX - 42;
4059 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4060 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4061 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4062 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big32);
4063 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4064 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4065 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4066 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big64);
4067 }
4068
4069 TEST_SVE(sve_uqdech) {
4070 uint32_t big32 = UINT32_MAX - 42;
4071 uint64_t big64 = UINT64_MAX - 42;
4072 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4073 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4074 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4075 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big32);
4076 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4077 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4078 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4079 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big64);
4080 }
4081
4082 TEST_SVE(sve_uqdecw) {
4083 uint32_t big32 = UINT32_MAX - 42;
4084 uint64_t big64 = UINT64_MAX - 42;
4085 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4086 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4087 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4088 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big32);
4089 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4090 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4091 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4092 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big64);
4093 }
4094
4095 TEST_SVE(sve_uqdecd) {
4096 uint32_t big32 = UINT32_MAX - 42;
4097 uint64_t big64 = UINT64_MAX - 42;
4098 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4099 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4100 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4101 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big32);
4102 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4103 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4104 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4105 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big64);
4106 }
4107
4108 TEST_SVE(sve_uqincb) {
4109 uint32_t big32 = UINT32_MAX - 42;
4110 uint64_t big64 = UINT64_MAX - 42;
4111 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4112 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4113 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4114 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big32);
4115 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4116 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4117 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4118 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big64);
4119 }
4120
4121 TEST_SVE(sve_uqinch) {
4122 uint32_t big32 = UINT32_MAX - 42;
4123 uint64_t big64 = UINT64_MAX - 42;
4124 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4125 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4126 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4127 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big32);
4128 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4129 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4130 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4131 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big64);
4132 }
4133
4134 TEST_SVE(sve_uqincw) {
4135 uint32_t big32 = UINT32_MAX - 42;
4136 uint64_t big64 = UINT64_MAX - 42;
4137 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4138 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4139 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4140 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big32);
4141 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4142 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4143 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4144 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big64);
4145 }
4146
4147 TEST_SVE(sve_uqincd) {
4148 uint32_t big32 = UINT32_MAX - 42;
4149 uint64_t big64 = UINT64_MAX - 42;
4150 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4151 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4152 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4153 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big32);
4154 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4155 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4156 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4157 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big64);
4158 }
4159
4160 typedef void (MacroAssembler::*QIncDecXWFn)(const Register& dst,
4161 const Register& src,
4162 int pattern,
4163 int multiplier);
4164
4165 static void QIncDecXWHelper(Test* config,
4166 QIncDecXWFn cnt,
4167 int multiplier,
4168 int lane_size_in_bits,
4169 int32_t acc_value,
4170 bool is_increment) {
4171 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4172 START();
4173
4174 // Initialise accumulators.
4175 __ Mov(x0, acc_value);
4176 __ Mov(x1, acc_value);
4177 __ Mov(x2, acc_value);
4178 __ Mov(x3, acc_value);
4179 __ Mov(x4, acc_value);
4180 __ Mov(x5, acc_value);
4181 __ Mov(x6, acc_value);
4182 __ Mov(x7, acc_value);
4183 __ Mov(x8, acc_value);
4184 __ Mov(x9, acc_value);
4185 __ Mov(x10, acc_value);
4186 __ Mov(x11, acc_value);
4187 __ Mov(x12, acc_value);
4188 __ Mov(x13, acc_value);
4189 __ Mov(x14, acc_value);
4190 __ Mov(x15, acc_value);
4191 __ Mov(x18, acc_value);
4192 __ Mov(x19, acc_value);
4193 __ Mov(x20, acc_value);
4194 __ Mov(x21, acc_value);
4195
4196 (masm.*cnt)(x0, w0, SVE_POW2, multiplier);
4197 (masm.*cnt)(x1, w1, SVE_VL1, multiplier);
4198 (masm.*cnt)(x2, w2, SVE_VL2, multiplier);
4199 (masm.*cnt)(x3, w3, SVE_VL3, multiplier);
4200 (masm.*cnt)(x4, w4, SVE_VL4, multiplier);
4201 (masm.*cnt)(x5, w5, SVE_VL5, multiplier);
4202 (masm.*cnt)(x6, w6, SVE_VL6, multiplier);
4203 (masm.*cnt)(x7, w7, SVE_VL7, multiplier);
4204 (masm.*cnt)(x8, w8, SVE_VL8, multiplier);
4205 (masm.*cnt)(x9, w9, SVE_VL16, multiplier);
4206 (masm.*cnt)(x10, w10, SVE_VL32, multiplier);
4207 (masm.*cnt)(x11, w11, SVE_VL64, multiplier);
4208 (masm.*cnt)(x12, w12, SVE_VL128, multiplier);
4209 (masm.*cnt)(x13, w13, SVE_VL256, multiplier);
4210 (masm.*cnt)(x14, w14, 16, multiplier);
4211 (masm.*cnt)(x15, w15, 23, multiplier);
4212 (masm.*cnt)(x18, w18, 28, multiplier);
4213 (masm.*cnt)(x19, w19, SVE_MUL4, multiplier);
4214 (masm.*cnt)(x20, w20, SVE_MUL3, multiplier);
4215 (masm.*cnt)(x21, w21, SVE_ALL, multiplier);
4216
4217 END();
4218
4219 if (CAN_RUN()) {
4220 RUN();
4221
4222 int all = core.GetSVELaneCount(lane_size_in_bits);
4223 int pow2 = 1 << HighestSetBitPosition(all);
4224 int mul4 = all - (all % 4);
4225 int mul3 = all - (all % 3);
4226
4227 multiplier = is_increment ? multiplier : -multiplier;
4228
4229 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
4230 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
4231 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
4232 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
4233 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
4234 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
4235 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
4236 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
4237 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
4238 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
4239 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
4240 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
4241 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
4242 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
4243 ASSERT_EQUAL_64(acc_value, x14);
4244 ASSERT_EQUAL_64(acc_value, x15);
4245 ASSERT_EQUAL_64(acc_value, x18);
4246 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
4247 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
4248 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
4249 }
4250 }
4251
4252 static void QIncXWHelper(Test* config,
4253 QIncDecXWFn cnt,
4254 int multiplier,
4255 int lane_size_in_bits,
4256 int32_t acc_value) {
4257 QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
4258 }
4259
4260 static void QDecXWHelper(Test* config,
4261 QIncDecXWFn cnt,
4262 int multiplier,
4263 int lane_size_in_bits,
4264 int32_t acc_value) {
4265 QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
4266 }
4267
4268 TEST_SVE(sve_sqdecb_xw) {
4269 QDecXWHelper(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
4270 QDecXWHelper(config, &MacroAssembler::Sqdecb, 2, kBRegSize, INT32_MIN + 42);
4271 QDecXWHelper(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
4272 QDecXWHelper(config, &MacroAssembler::Sqdecb, 16, kBRegSize, INT32_MAX - 42);
4273 }
4274
4275 TEST_SVE(sve_sqdech_xw) {
4276 QDecXWHelper(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
4277 QDecXWHelper(config, &MacroAssembler::Sqdech, 2, kHRegSize, INT32_MIN + 42);
4278 QDecXWHelper(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
4279 QDecXWHelper(config, &MacroAssembler::Sqdech, 16, kHRegSize, INT32_MAX - 42);
4280 }
4281
4282 TEST_SVE(sve_sqdecw_xw) {
4283 QDecXWHelper(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4284 QDecXWHelper(config, &MacroAssembler::Sqdecw, 2, kWRegSize, INT32_MIN + 42);
4285 QDecXWHelper(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4286 QDecXWHelper(config, &MacroAssembler::Sqdecw, 16, kWRegSize, INT32_MAX - 42);
4287 }
4288
4289 TEST_SVE(sve_sqdecd_xw) {
4290 QDecXWHelper(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4291 QDecXWHelper(config, &MacroAssembler::Sqdecd, 2, kDRegSize, INT32_MIN + 42);
4292 QDecXWHelper(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4293 QDecXWHelper(config, &MacroAssembler::Sqdecd, 16, kDRegSize, INT32_MAX - 42);
4294 }
4295
4296 TEST_SVE(sve_sqincb_xw) {
4297 QIncXWHelper(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4298 QIncXWHelper(config, &MacroAssembler::Sqincb, 2, kBRegSize, INT32_MIN + 42);
4299 QIncXWHelper(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4300 QIncXWHelper(config, &MacroAssembler::Sqincb, 16, kBRegSize, INT32_MAX - 42);
4301 }
4302
4303 TEST_SVE(sve_sqinch_xw) {
4304 QIncXWHelper(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4305 QIncXWHelper(config, &MacroAssembler::Sqinch, 2, kHRegSize, INT32_MIN + 42);
4306 QIncXWHelper(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4307 QIncXWHelper(config, &MacroAssembler::Sqinch, 16, kHRegSize, INT32_MAX - 42);
4308 }
4309
4310 TEST_SVE(sve_sqincw_xw) {
4311 QIncXWHelper(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4312 QIncXWHelper(config, &MacroAssembler::Sqincw, 2, kWRegSize, INT32_MIN + 42);
4313 QIncXWHelper(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4314 QIncXWHelper(config, &MacroAssembler::Sqincw, 16, kWRegSize, INT32_MAX - 42);
4315 }
4316
4317 TEST_SVE(sve_sqincd_xw) {
4318 QIncXWHelper(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4319 QIncXWHelper(config, &MacroAssembler::Sqincd, 2, kDRegSize, INT32_MIN + 42);
4320 QIncXWHelper(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4321 QIncXWHelper(config, &MacroAssembler::Sqincd, 16, kDRegSize, INT32_MAX - 42);
4322 }
4323
4324 typedef void (MacroAssembler::*IncDecZFn)(const ZRegister& dst,
4325 int pattern,
4326 int multiplier);
4327 typedef void (MacroAssembler::*AddSubFn)(const ZRegister& dst,
4328 const ZRegister& src1,
4329 const ZRegister& src2);
4330
4331 static void IncDecZHelper(Test* config,
4332 IncDecZFn fn,
4333 CntFn cnt,
4334 AddSubFn addsub,
4335 int multiplier,
4336 int lane_size_in_bits) {
4337 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4338 START();
4339
4340 uint64_t acc_inputs[] = {0x7766554433221100,
4341 0xffffffffffffffff,
4342 0x0000000000000000,
4343 0xffffffff0000ffff,
4344 0x7fffffffffffffff,
4345 0x8000000000000000,
4346 0x7fffffff7fff7fff,
4347 0x8000000080008000};
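// These accumulator inputs include values at and around the signed and
// unsigned saturation boundaries for each lane size.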
4348
4349 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
4350 for (int j = 0; j < 4; j++) {
4351 InsrHelper(&masm, ZRegister(i, kDRegSize), acc_inputs);
4352 }
4353 }
4354 for (unsigned i = 0; i < 15; i++) {
4355 __ Mov(XRegister(i), 0);
4356 }
4357
4358 (masm.*fn)(z16.WithLaneSize(lane_size_in_bits), SVE_POW2, multiplier);
4359 (masm.*fn)(z17.WithLaneSize(lane_size_in_bits), SVE_VL1, multiplier);
4360 (masm.*fn)(z18.WithLaneSize(lane_size_in_bits), SVE_VL2, multiplier);
4361 (masm.*fn)(z19.WithLaneSize(lane_size_in_bits), SVE_VL3, multiplier);
4362 (masm.*fn)(z20.WithLaneSize(lane_size_in_bits), SVE_VL4, multiplier);
4363 (masm.*fn)(z21.WithLaneSize(lane_size_in_bits), SVE_VL7, multiplier);
4364 (masm.*fn)(z22.WithLaneSize(lane_size_in_bits), SVE_VL8, multiplier);
4365 (masm.*fn)(z23.WithLaneSize(lane_size_in_bits), SVE_VL16, multiplier);
4366 (masm.*fn)(z24.WithLaneSize(lane_size_in_bits), SVE_VL64, multiplier);
4367 (masm.*fn)(z25.WithLaneSize(lane_size_in_bits), SVE_VL256, multiplier);
4368 (masm.*fn)(z26.WithLaneSize(lane_size_in_bits), 16, multiplier);
4369 (masm.*fn)(z27.WithLaneSize(lane_size_in_bits), 28, multiplier);
4370 (masm.*fn)(z28.WithLaneSize(lane_size_in_bits), SVE_MUL3, multiplier);
4371 (masm.*fn)(z29.WithLaneSize(lane_size_in_bits), SVE_MUL4, multiplier);
4372 (masm.*fn)(z30.WithLaneSize(lane_size_in_bits), SVE_ALL, multiplier);
4373
4374 // Perform computation using alternative instructions.
4375 (masm.*cnt)(x0, SVE_POW2, multiplier);
4376 (masm.*cnt)(x1, SVE_VL1, multiplier);
4377 (masm.*cnt)(x2, SVE_VL2, multiplier);
4378 (masm.*cnt)(x3, SVE_VL3, multiplier);
4379 (masm.*cnt)(x4, SVE_VL4, multiplier);
4380 (masm.*cnt)(x5, SVE_VL7, multiplier);
4381 (masm.*cnt)(x6, SVE_VL8, multiplier);
4382 (masm.*cnt)(x7, SVE_VL16, multiplier);
4383 (masm.*cnt)(x8, SVE_VL64, multiplier);
4384 (masm.*cnt)(x9, SVE_VL256, multiplier);
4385 (masm.*cnt)(x10, 16, multiplier);
4386 (masm.*cnt)(x11, 28, multiplier);
4387 (masm.*cnt)(x12, SVE_MUL3, multiplier);
4388 (masm.*cnt)(x13, SVE_MUL4, multiplier);
4389 (masm.*cnt)(x14, SVE_ALL, multiplier);
4390
4391 ZRegister zscratch = z15.WithLaneSize(lane_size_in_bits);
4392 for (unsigned i = 0; i < 15; i++) {
4393 ZRegister zsrcdst = ZRegister(i, lane_size_in_bits);
4394 Register x = Register(i, kXRegSize);
4395 __ Dup(zscratch, x);
4396 (masm.*addsub)(zsrcdst, zsrcdst, zscratch);
4397 }
4398
4399 END();
4400
4401 if (CAN_RUN()) {
4402 RUN();
4403
4404 ASSERT_EQUAL_SVE(z0, z16);
4405 ASSERT_EQUAL_SVE(z1, z17);
4406 ASSERT_EQUAL_SVE(z2, z18);
4407 ASSERT_EQUAL_SVE(z3, z19);
4408 ASSERT_EQUAL_SVE(z4, z20);
4409 ASSERT_EQUAL_SVE(z5, z21);
4410 ASSERT_EQUAL_SVE(z6, z22);
4411 ASSERT_EQUAL_SVE(z7, z23);
4412 ASSERT_EQUAL_SVE(z8, z24);
4413 ASSERT_EQUAL_SVE(z9, z25);
4414 ASSERT_EQUAL_SVE(z10, z26);
4415 ASSERT_EQUAL_SVE(z11, z27);
4416 ASSERT_EQUAL_SVE(z12, z28);
4417 ASSERT_EQUAL_SVE(z13, z29);
4418 ASSERT_EQUAL_SVE(z14, z30);
4419 }
4420 }
4421
4422 TEST_SVE(sve_inc_dec_vec) {
4423 CntFn cnth = &MacroAssembler::Cnth;
4424 CntFn cntw = &MacroAssembler::Cntw;
4425 CntFn cntd = &MacroAssembler::Cntd;
4426 AddSubFn sub = &MacroAssembler::Sub;
4427 AddSubFn add = &MacroAssembler::Add;
4428 for (int mult = 1; mult <= 16; mult += 5) {
4429 IncDecZHelper(config, &MacroAssembler::Dech, cnth, sub, mult, kHRegSize);
4430 IncDecZHelper(config, &MacroAssembler::Decw, cntw, sub, mult, kSRegSize);
4431 IncDecZHelper(config, &MacroAssembler::Decd, cntd, sub, mult, kDRegSize);
4432 IncDecZHelper(config, &MacroAssembler::Inch, cnth, add, mult, kHRegSize);
4433 IncDecZHelper(config, &MacroAssembler::Incw, cntw, add, mult, kSRegSize);
4434 IncDecZHelper(config, &MacroAssembler::Incd, cntd, add, mult, kDRegSize);
4435 }
4436 }
4437
4438 TEST_SVE(sve_unsigned_sat_inc_dec_vec) {
4439 CntFn cnth = &MacroAssembler::Cnth;
4440 CntFn cntw = &MacroAssembler::Cntw;
4441 CntFn cntd = &MacroAssembler::Cntd;
4442 AddSubFn sub = &MacroAssembler::Uqsub;
4443 AddSubFn add = &MacroAssembler::Uqadd;
4444 for (int mult = 1; mult <= 16; mult += 5) {
4445 IncDecZHelper(config, &MacroAssembler::Uqdech, cnth, sub, mult, kHRegSize);
4446 IncDecZHelper(config, &MacroAssembler::Uqdecw, cntw, sub, mult, kSRegSize);
4447 IncDecZHelper(config, &MacroAssembler::Uqdecd, cntd, sub, mult, kDRegSize);
4448 IncDecZHelper(config, &MacroAssembler::Uqinch, cnth, add, mult, kHRegSize);
4449 IncDecZHelper(config, &MacroAssembler::Uqincw, cntw, add, mult, kSRegSize);
4450 IncDecZHelper(config, &MacroAssembler::Uqincd, cntd, add, mult, kDRegSize);
4451 }
4452 }
4453
4454 TEST_SVE(sve_signed_sat_inc_dec_vec) {
4455 CntFn cnth = &MacroAssembler::Cnth;
4456 CntFn cntw = &MacroAssembler::Cntw;
4457 CntFn cntd = &MacroAssembler::Cntd;
4458 AddSubFn sub = &MacroAssembler::Sqsub;
4459 AddSubFn add = &MacroAssembler::Sqadd;
4460 for (int mult = 1; mult <= 16; mult += 5) {
4461 IncDecZHelper(config, &MacroAssembler::Sqdech, cnth, sub, mult, kHRegSize);
4462 IncDecZHelper(config, &MacroAssembler::Sqdecw, cntw, sub, mult, kSRegSize);
4463 IncDecZHelper(config, &MacroAssembler::Sqdecd, cntd, sub, mult, kDRegSize);
4464 IncDecZHelper(config, &MacroAssembler::Sqinch, cnth, add, mult, kHRegSize);
4465 IncDecZHelper(config, &MacroAssembler::Sqincw, cntw, add, mult, kSRegSize);
4466 IncDecZHelper(config, &MacroAssembler::Sqincd, cntd, add, mult, kDRegSize);
4467 }
4468 }
4469
4470 typedef void (MacroAssembler::*ArithPredicatedFn)(const ZRegister& zd,
4471 const PRegisterM& pg,
4472 const ZRegister& zn,
4473 const ZRegister& zm);
4474
4475 template <typename Td, typename Tg, typename Tn>
4476 static void IntBinArithHelper(Test* config,
4477 ArithPredicatedFn macro,
4478 unsigned lane_size_in_bits,
4479 const Tg& pg_inputs,
4480 const Tn& zn_inputs,
4481 const Tn& zm_inputs,
4482 const Td& zd_expected) {
4483 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4484 START();
4485
4486 ZRegister src_a = z31.WithLaneSize(lane_size_in_bits);
4487 ZRegister src_b = z27.WithLaneSize(lane_size_in_bits);
4488 InsrHelper(&masm, src_a, zn_inputs);
4489 InsrHelper(&masm, src_b, zm_inputs);
4490
4491 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
4492
4493 ZRegister zd_1 = z0.WithLaneSize(lane_size_in_bits);
4494 ZRegister zd_2 = z1.WithLaneSize(lane_size_in_bits);
4495 ZRegister zd_3 = z2.WithLaneSize(lane_size_in_bits);
4496
4497 // `instr` zd(dst), zd(src_a), zn(src_b)
4498 __ Mov(zd_1, src_a);
4499 (masm.*macro)(zd_1, p0.Merging(), zd_1, src_b);
4500
4501 // `instr` zd(dst), zm(src_a), zd(src_b)
4502 // If the zd and zm registers are aliased, the instruction macro (`Instr`)
4503 // swaps the order of the operands when the operation is commutative;
4504 // otherwise it falls back to the reversed form, such as subr or divr.
4505 __ Mov(zd_2, src_b);
4506 (masm.*macro)(zd_2, p0.Merging(), src_a, zd_2);
4507
4508 // `instr` zd(dst), zm(src_a), zn(src_b)
4509 // The instruction macro (`Instr`) automatically selects between `instr` and
4510 // movprfx + `instr`, based on whether the zd and zn registers are aliased.
4511 // A generated movprfx instruction is predicated, using the same governing
4512 // predicate register. In order to keep the result constant, initialise the
4513 // destination register first.
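// As an illustration (not asserted by this helper), with distinct registers
// the predicated form is expected to expand to a merging
// `movprfx zd.<T>, pg/m, zn.<T>` followed by the destructive
// `instr zd.<T>, pg/m, zd.<T>, zm.<T>`.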
4514 __ Mov(zd_3, src_a);
4515 (masm.*macro)(zd_3, p0.Merging(), src_a, src_b);
4516
4517 END();
4518
4519 if (CAN_RUN()) {
4520 RUN();
4521 ASSERT_EQUAL_SVE(zd_expected, zd_1);
4522
4523 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
4524 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
4525 if (!core.HasSVELane(zd_1, lane)) break;
4526 if ((pg_inputs[i] & 1) != 0) {
4527 ASSERT_EQUAL_SVE_LANE(zd_expected[i], zd_1, lane);
4528 } else {
4529 ASSERT_EQUAL_SVE_LANE(zn_inputs[i], zd_1, lane);
4530 }
4531 }
4532
4533 ASSERT_EQUAL_SVE(zd_expected, zd_3);
4534 }
4535 }
4536
4537 TEST_SVE(sve_binary_arithmetic_predicated_add) {
4538 // clang-format off
4539 unsigned zn_b[] = {0x00, 0x01, 0x10, 0x81, 0xff, 0x0f, 0x01, 0x7f};
4540
4541 unsigned zm_b[] = {0x00, 0x01, 0x10, 0x00, 0x81, 0x80, 0xff, 0xff};
4542
4543 unsigned zn_h[] = {0x0000, 0x0123, 0x1010, 0x8181, 0xffff, 0x0f0f, 0x0101, 0x7f7f};
4544
4545 unsigned zm_h[] = {0x0000, 0x0123, 0x1010, 0x0000, 0x8181, 0x8080, 0xffff, 0xffff};
4546
4547 unsigned zn_s[] = {0x00000000, 0x01234567, 0x10101010, 0x81818181,
4548 0xffffffff, 0x0f0f0f0f, 0x01010101, 0x7f7f7f7f};
4549
4550 unsigned zm_s[] = {0x00000000, 0x01234567, 0x10101010, 0x00000000,
4551 0x81818181, 0x80808080, 0xffffffff, 0xffffffff};
4552
4553 uint64_t zn_d[] = {0x0000000000000000, 0x0123456789abcdef,
4554 0x1010101010101010, 0x8181818181818181,
4555 0xffffffffffffffff, 0x0f0f0f0f0f0f0f0f,
4556 0x0101010101010101, 0x7f7f7f7fffffffff};
4557
4558 uint64_t zm_d[] = {0x0000000000000000, 0x0123456789abcdef,
4559 0x1010101010101010, 0x0000000000000000,
4560 0x8181818181818181, 0x8080808080808080,
4561 0xffffffffffffffff, 0xffffffffffffffff};
4562
4563 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4564 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4565 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4566 int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4567
4568 unsigned add_exp_b[] = {0x00, 0x02, 0x20, 0x81, 0x80, 0x8f, 0x00, 0x7f};
4569
4570 unsigned add_exp_h[] = {0x0000, 0x0246, 0x1010, 0x8181,
4571 0x8180, 0x8f8f, 0x0101, 0x7f7e};
4572
4573 unsigned add_exp_s[] = {0x00000000, 0x01234567, 0x20202020, 0x81818181,
4574 0x81818180, 0x0f0f0f0f, 0x01010100, 0x7f7f7f7e};
4575
4576 uint64_t add_exp_d[] = {0x0000000000000000, 0x02468acf13579bde,
4577 0x2020202020202020, 0x8181818181818181,
4578 0xffffffffffffffff, 0x8f8f8f8f8f8f8f8f,
4579 0x0101010101010100, 0x7f7f7f7ffffffffe};
4580
4581 ArithPredicatedFn fn = &MacroAssembler::Add;
4582 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, add_exp_b);
4583 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, add_exp_h);
4584 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, add_exp_s);
4585 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, add_exp_d);
4586
4587 unsigned sub_exp_b[] = {0x00, 0x00, 0x00, 0x81, 0x7e, 0x8f, 0x02, 0x7f};
4588
4589 unsigned sub_exp_h[] = {0x0000, 0x0000, 0x1010, 0x8181,
4590 0x7e7e, 0x8e8f, 0x0101, 0x7f80};
4591
4592 unsigned sub_exp_s[] = {0x00000000, 0x01234567, 0x00000000, 0x81818181,
4593 0x7e7e7e7e, 0x0f0f0f0f, 0x01010102, 0x7f7f7f80};
4594
4595 uint64_t sub_exp_d[] = {0x0000000000000000, 0x0000000000000000,
4596 0x0000000000000000, 0x8181818181818181,
4597 0xffffffffffffffff, 0x8e8e8e8e8e8e8e8f,
4598 0x0101010101010102, 0x7f7f7f8000000000};
4599
4600 fn = &MacroAssembler::Sub;
4601 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sub_exp_b);
4602 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sub_exp_h);
4603 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sub_exp_s);
4604 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sub_exp_d);
4605 // clang-format on
4606 }
4607
4608 TEST_SVE(sve_binary_arithmetic_predicated_umin_umax_uabd) {
4609 // clang-format off
4610 unsigned zn_b[] = {0x00, 0xff, 0x0f, 0xff, 0xf0, 0x98, 0x55, 0x67};
4611
4612 unsigned zm_b[] = {0x01, 0x00, 0x0e, 0xfe, 0xfe, 0xab, 0xcd, 0x78};
4613
4614 unsigned zn_h[] = {0x0000, 0xffff, 0x00ff, 0xffff,
4615 0xff00, 0xba98, 0x5555, 0x4567};
4616
4617 unsigned zm_h[] = {0x0001, 0x0000, 0x00ee, 0xfffe,
4618 0xfe00, 0xabab, 0xcdcd, 0x5678};
4619
4620 unsigned zn_s[] = {0x00000000, 0xffffffff, 0x0000ffff, 0xffffffff,
4621 0xffff0000, 0xfedcba98, 0x55555555, 0x01234567};
4622
4623 unsigned zm_s[] = {0x00000001, 0x00000000, 0x0000eeee, 0xfffffffe,
4624 0xfffe0000, 0xabababab, 0xcdcdcdcd, 0x12345678};
4625
4626 uint64_t zn_d[] = {0x0000000000000000, 0xffffffffffffffff,
4627 0x5555555555555555, 0x0000000001234567};
4628
4629 uint64_t zm_d[] = {0x0000000000000001, 0x0000000000000000,
4630 0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4631
4632 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4633 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4634 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4635 int pg_d[] = {1, 0, 1, 1};
4636
4637 unsigned umax_exp_b[] = {0x01, 0xff, 0x0f, 0xff, 0xfe, 0xab, 0xcd, 0x67};
4638
4639 unsigned umax_exp_h[] = {0x0001, 0xffff, 0x00ff, 0xffff,
4640 0xff00, 0xba98, 0x5555, 0x5678};
4641
4642 unsigned umax_exp_s[] = {0x00000001, 0xffffffff, 0x0000ffff, 0xffffffff,
4643 0xffff0000, 0xfedcba98, 0xcdcdcdcd, 0x12345678};
4644
4645 uint64_t umax_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4646 0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4647
4648 ArithPredicatedFn fn = &MacroAssembler::Umax;
4649 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umax_exp_b);
4650 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umax_exp_h);
4651 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umax_exp_s);
4652 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umax_exp_d);
4653
4654 unsigned umin_exp_b[] = {0x00, 0x00, 0x0e, 0xff, 0xf0, 0x98, 0x55, 0x67};
4655
4656 unsigned umin_exp_h[] = {0x0000, 0x0000, 0x00ff, 0xfffe,
4657 0xfe00, 0xabab, 0x5555, 0x4567};
4658
4659 unsigned umin_exp_s[] = {0x00000000, 0xffffffff, 0x0000eeee, 0xfffffffe,
4660 0xfffe0000, 0xfedcba98, 0x55555555, 0x01234567};
4661
4662 uint64_t umin_exp_d[] = {0x0000000000000000, 0xffffffffffffffff,
4663 0x5555555555555555, 0x0000000001234567};
4664 fn = &MacroAssembler::Umin;
4665 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umin_exp_b);
4666 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umin_exp_h);
4667 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umin_exp_s);
4668 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umin_exp_d);
4669
4670 unsigned uabd_exp_b[] = {0x01, 0xff, 0x01, 0xff, 0x0e, 0x13, 0x78, 0x67};
4671
4672 unsigned uabd_exp_h[] = {0x0001, 0xffff, 0x00ff, 0x0001,
4673 0x0100, 0x0eed, 0x5555, 0x1111};
4674
4675 unsigned uabd_exp_s[] = {0x00000001, 0xffffffff, 0x00001111, 0x00000001,
4676 0x00010000, 0xfedcba98, 0x78787878, 0x11111111};
4677
4678 uint64_t uabd_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4679 0x7878787878787878, 0x0000000011111111};
4680
4681 fn = &MacroAssembler::Uabd;
4682 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, uabd_exp_b);
4683 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, uabd_exp_h);
4684 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, uabd_exp_s);
4685 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, uabd_exp_d);
4686 // clang-format on
4687 }
4688
4689 TEST_SVE(sve_binary_arithmetic_predicated_smin_smax_sabd) {
4690 // clang-format off
4691 int zn_b[] = {0, -128, -128, -128, -128, 127, 127, 1};
4692
4693 int zm_b[] = {-1, 0, -1, -127, 127, 126, -1, 0};
4694
4695 int zn_h[] = {0, INT16_MIN, INT16_MIN, INT16_MIN,
4696 INT16_MIN, INT16_MAX, INT16_MAX, 1};
4697
4698 int zm_h[] = {-1, 0, -1, INT16_MIN + 1,
4699 INT16_MAX, INT16_MAX - 1, -1, 0};
4700
4701 int zn_s[] = {0, INT32_MIN, INT32_MIN, INT32_MIN,
4702 INT32_MIN, INT32_MAX, INT32_MAX, 1};
4703
4704 int zm_s[] = {-1, 0, -1, -INT32_MAX,
4705 INT32_MAX, INT32_MAX - 1, -1, 0};
4706
4707 int64_t zn_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4708 INT64_MIN, INT64_MAX, INT64_MAX, 1};
4709
4710 int64_t zm_d[] = {-1, 0, -1, INT64_MIN + 1,
4711 INT64_MAX, INT64_MAX - 1, -1, 0};
4712
4713 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4714 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4715 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4716 int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4717
4718 int smax_exp_b[] = {0, 0, -1, -128, 127, 127, 127, 1};
4719
4720 int smax_exp_h[] = {0, 0, INT16_MIN, INT16_MIN + 1,
4721 INT16_MAX, INT16_MAX, INT16_MAX, 1};
4722
4723 int smax_exp_s[] = {0, INT32_MIN, -1, INT32_MIN + 1,
4724 INT32_MAX, INT32_MAX, INT32_MAX, 1};
4725
4726 int64_t smax_exp_d[] = {0, 0, -1, INT64_MIN + 1,
4727 INT64_MIN, INT64_MAX, INT64_MAX, 1};
4728
4729 ArithPredicatedFn fn = &MacroAssembler::Smax;
4730 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smax_exp_b);
4731 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smax_exp_h);
4732 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smax_exp_s);
4733 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smax_exp_d);
4734
4735 int smin_exp_b[] = {-1, -128, -128, -128, -128, 126, -1, 1};
4736
4737 int smin_exp_h[] = {-1, INT16_MIN, INT16_MIN, INT16_MIN,
4738 INT16_MIN, INT16_MAX - 1, INT16_MAX, 0};
4739
4740 int smin_exp_s[] = {-1, INT32_MIN, INT32_MIN, INT32_MIN,
4741 INT32_MIN, INT32_MAX, -1, 0};
4742
4743 int64_t smin_exp_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4744 INT64_MIN, INT64_MAX - 1, -1, 0};
4745
4746 fn = &MacroAssembler::Smin;
4747 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smin_exp_b);
4748 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smin_exp_h);
4749 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smin_exp_s);
4750 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smin_exp_d);
4751
4752 unsigned sabd_exp_b[] = {1, 128, 127, 128, 255, 1, 128, 1};
4753
4754 unsigned sabd_exp_h[] = {1, 0x8000, 0x8000, 1, 0xffff, 1, 0x7fff, 1};
4755
4756 unsigned sabd_exp_s[] = {1, 0x80000000, 0x7fffffff, 1,
4757 0xffffffff, 0x7fffffff, 0x80000000, 1};
4758
4759 uint64_t sabd_exp_d[] = {0, 0x8000000000000000, 0x7fffffffffffffff, 1,
4760 0x8000000000000000, 1, 0x8000000000000000, 1};
4761
4762 fn = &MacroAssembler::Sabd;
4763 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sabd_exp_b);
4764 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sabd_exp_h);
4765 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sabd_exp_s);
4766 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sabd_exp_d);
4767 // clang-format on
4768 }
4769
4770 TEST_SVE(sve_binary_arithmetic_predicated_mul_umulh) {
4771 // clang-format off
4772 unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4773
4774 unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4775
4776 unsigned zn_h[] = {0x0000, 0x0001, 0x0020, 0x0800,
4777 0x8000, 0xff00, 0x5555, 0xaaaa};
4778
4779 unsigned zm_h[] = {0x007f, 0x00cd, 0x0800, 0xffff,
4780 0x5555, 0xaaaa, 0x0001, 0x1234};
4781
4782 unsigned zn_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4783 0x12345678, 0xffffffff, 0x55555555, 0xaaaaaaaa};
4784
4785 unsigned zm_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4786 0x12345678, 0x22223333, 0x55556666, 0x77778888};
4787
4788 uint64_t zn_d[] = {0x0000000000000000, 0x5555555555555555,
4789 0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa};
4790
4791 uint64_t zm_d[] = {0x0000000000000000, 0x1111111133333333,
4792 0xddddddddeeeeeeee, 0xaaaaaaaaaaaaaaaa};
4793
4794 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4795 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4796 int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4797 int pg_d[] = {1, 1, 0, 1};
4798
4799 unsigned mul_exp_b[] = {0x00, 0xcd, 0x00, 0xf8, 0x80, 0x56, 0x00, 0x50};
4800
4801 unsigned mul_exp_h[] = {0x0000, 0x0001, 0x0000, 0xf800,
4802 0x8000, 0xff00, 0x5555, 0x9e88};
4803
4804 unsigned mul_exp_s[] = {0x00000000, 0x00000001, 0x00200020, 0x00400000,
4805 0x1df4d840, 0xddddcccd, 0x55555555, 0xb05afa50};
4806
4807 uint64_t mul_exp_d[] = {0x0000000000000000, 0xa4fa4fa4eeeeeeef,
4808 0xffffffffffffffff, 0x38e38e38e38e38e4};
4809
4810 ArithPredicatedFn fn = &MacroAssembler::Mul;
4811 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, mul_exp_b);
4812 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, mul_exp_h);
4813 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, mul_exp_s);
4814 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, mul_exp_d);
4815
4816 unsigned umulh_exp_b[] = {0x00, 0x00, 0x10, 0x07, 0x80, 0xa9, 0x00, 0x05};
4817
4818 unsigned umulh_exp_h[] = {0x0000, 0x0001, 0x0001, 0x07ff,
4819 0x2aaa, 0xff00, 0x0000, 0x0c22};
4820
4821 unsigned umulh_exp_s[] = {0x00000000, 0x00000000, 0x00200020, 0x00400080,
4822 0x014b66dc, 0x22223332, 0x55555555, 0x4fa505af};
4823
4824 uint64_t umulh_exp_d[] = {0x0000000000000000, 0x05b05b05bbbbbbbb,
4825 0xffffffffffffffff, 0x71c71c71c71c71c6};
4826
4827 fn = &MacroAssembler::Umulh;
4828 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umulh_exp_b);
4829 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umulh_exp_h);
4830 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umulh_exp_s);
4831 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umulh_exp_d);
4832 // clang-format on
4833 }
4834
4835 TEST_SVE(sve_binary_arithmetic_predicated_smulh) {
4836 // clang-format off
4837 int zn_b[] = {0, 1, -1, INT8_MIN, INT8_MAX, -1, 100, -3};
4838
4839 int zm_b[] = {0, INT8_MIN, INT8_MIN, INT8_MAX, INT8_MAX, -1, 2, 66};
4840
4841 int zn_h[] = {0, 1, -1, INT16_MIN, INT16_MAX, -1, 10000, -3};
4842
4843 int zm_h[] = {0, INT16_MIN, INT16_MIN, INT16_MAX, INT16_MAX, -1, 2, 6666};
4844
4845 int zn_s[] = {0, 1, -1, INT32_MIN, INT32_MAX, -1, 100000000, -3};
4846
4847 int zm_s[] = {0, INT32_MIN, INT32_MIN, INT32_MAX, INT32_MAX, -1, 2, 66666666};
4848
4849 int64_t zn_d[] = {0, -1, INT64_MIN, INT64_MAX};
4850
4851 int64_t zm_d[] = {INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX};
4852
4853 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4854 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4855 int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4856 int pg_d[] = {1, 1, 0, 1};
4857
4858 int exp_b[] = {0, -1, 0, -64, INT8_MAX, 0, 0, -1};
4859
4860 int exp_h[] = {0, 1, 0, -16384, 16383, -1, 0, -1};
4861
4862 int exp_s[] = {0, -1, -1, -1073741824, 1073741823, 0, 100000000, -1};
4863
4864 int64_t exp_d[] = {0, -1, INT64_MIN, 4611686018427387903};
4865
4866 ArithPredicatedFn fn = &MacroAssembler::Smulh;
4867 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, exp_b);
4868 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, exp_h);
4869 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
4870 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
4871 // clang-format on
4872 }
4873
4874 TEST_SVE(sve_binary_arithmetic_predicated_logical) {
4875 // clang-format off
4876 unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4877 unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4878
4879 unsigned zn_h[] = {0x0000, 0x0001, 0x2020, 0x0008,
4880 0x8000, 0xffff, 0x5555, 0xaaaa};
4881 unsigned zm_h[] = {0x7fff, 0xabcd, 0x8000, 0xffff,
4882 0x5555, 0xaaaa, 0x0000, 0x0800};
4883
4884 unsigned zn_s[] = {0x00000001, 0x20200008, 0x8000ffff, 0x5555aaaa};
4885 unsigned zm_s[] = {0x7fffabcd, 0x8000ffff, 0x5555aaaa, 0x00000800};
4886
4887 uint64_t zn_d[] = {0xfedcba9876543210, 0x0123456789abcdef,
4888 0x0001200880ff55aa, 0x0022446688aaccee};
4889 uint64_t zm_d[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff,
4890 0x7fcd80ff55aa0008, 0x1133557799bbddff};
4891
4892 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4893 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4894 int pg_s[] = {1, 1, 1, 0};
4895 int pg_d[] = {1, 1, 0, 1};
4896
4897 unsigned and_exp_b[] = {0x00, 0x01, 0x00, 0x08, 0x80, 0xaa, 0x00, 0x08};
4898
4899 unsigned and_exp_h[] = {0x0000, 0x0001, 0x0000, 0x0008,
4900 0x0000, 0xffff, 0x0000, 0x0800};
4901
4902 unsigned and_exp_s[] = {0x00000001, 0x00000008, 0x0000aaaa, 0x5555aaaa};
4903
4904 uint64_t and_exp_d[] = {0xfedcaa8854540000, 0x0000454588aacdef,
4905 0x0001200880ff55aa, 0x0022446688aaccee};
4906
4907 ArithPredicatedFn fn = &MacroAssembler::And;
4908 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, and_exp_b);
4909 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, and_exp_h);
4910 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, and_exp_s);
4911 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, and_exp_d);
4912
4913 unsigned bic_exp_b[] = {0x00, 0x00, 0x20, 0x00, 0x80, 0x55, 0x55, 0xa2};
4914
4915 unsigned bic_exp_h[] = {0x0000, 0x0001, 0x2020, 0x0000,
4916 0x8000, 0xffff, 0x5555, 0xa2aa};
4917
4918 unsigned bic_exp_s[] = {0x00000000, 0x20200000, 0x80005555, 0x5555aaaa};
4919
4920 uint64_t bic_exp_d[] = {0x0000101022003210, 0x0123002201010000,
4921 0x0001200880ff55aa, 0x0000000000000000};
4922
4923 fn = &MacroAssembler::Bic;
4924 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, bic_exp_b);
4925 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, bic_exp_h);
4926 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, bic_exp_s);
4927 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, bic_exp_d);
4928
4929 unsigned eor_exp_b[] = {0x00, 0xcc, 0xa0, 0xf7, 0x80, 0x55, 0x55, 0xa2};
4930
4931 unsigned eor_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xfff7,
4932 0xd555, 0xffff, 0x5555, 0xa2aa};
4933
4934 unsigned eor_exp_s[] = {0x7fffabcc, 0xa020fff7, 0xd5555555, 0x5555aaaa};
4935
4936 uint64_t eor_exp_d[] = {0x01235476ab89fedc, 0xcdef98ba67453210,
4937 0x0001200880ff55aa, 0x1111111111111111};
4938
4939 fn = &MacroAssembler::Eor;
4940 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, eor_exp_b);
4941 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, eor_exp_h);
4942 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, eor_exp_s);
4943 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, eor_exp_d);
4944
4945 unsigned orr_exp_b[] = {0x00, 0xcd, 0xa0, 0xff, 0x80, 0xff, 0x55, 0xaa};
4946
4947 unsigned orr_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xffff,
4948 0xd555, 0xffff, 0x5555, 0xaaaa};
4949
4950 unsigned orr_exp_s[] = {0x7fffabcd, 0xa020ffff, 0xd555ffff, 0x5555aaaa};
4951
4952 uint64_t orr_exp_d[] = {0xfffffefeffddfedc, 0xcdefddffefefffff,
4953 0x0001200880ff55aa, 0x1133557799bbddff};
4954
4955 fn = &MacroAssembler::Orr;
4956 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, orr_exp_b);
4957 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, orr_exp_h);
4958 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, orr_exp_s);
4959 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, orr_exp_d);
4960 // clang-format on
4961 }
4962
4963 TEST_SVE(sve_binary_arithmetic_predicated_sdiv) {
4964 // clang-format off
4965 int zn_s[] = {0, 1, -1, 2468,
4966 INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX,
4967 -11111111, 87654321, 0, 0};
4968
4969 int zm_s[] = {1, -1, 1, 1234,
4970 -1, INT32_MIN, 1, -1,
4971 22222222, 80000000, -1, 0};
4972
4973 int64_t zn_d[] = {0, 1, -1, 2468,
4974 INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX,
4975 -11111111, 87654321, 0, 0};
4976
4977 int64_t zm_d[] = {1, -1, 1, 1234,
4978 -1, INT64_MIN, 1, -1,
4979 22222222, 80000000, -1, 0};
4980
4981 int pg_s[] = {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0};
4982 int pg_d[] = {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1};
4983
4984 int exp_s[] = {0, 1, -1, 2,
4985 INT32_MIN, 0, INT32_MIN, -INT32_MAX,
4986 0, 1, 0, 0};
4987
4988 int64_t exp_d[] = {0, -1, -1, 2,
4989 INT64_MIN, INT64_MAX, INT64_MIN, -INT64_MAX,
4990 0, 1, 0, 0};
4991
4992 ArithPredicatedFn fn = &MacroAssembler::Sdiv;
4993 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
4994 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
4995 // clang-format on
4996 }
4997
4998 TEST_SVE(sve_binary_arithmetic_predicated_udiv) {
4999 // clang-format off
5000 unsigned zn_s[] = {0x00000000, 0x00000001, 0xffffffff, 0x80000000,
5001 0xffffffff, 0x80000000, 0xffffffff, 0x0000f000};
5002
5003 unsigned zm_s[] = {0x00000001, 0xffffffff, 0x80000000, 0x00000002,
5004 0x00000000, 0x00000001, 0x00008000, 0xf0000000};
5005
5006 uint64_t zn_d[] = {0x0000000000000000, 0x0000000000000001,
5007 0xffffffffffffffff, 0x8000000000000000,
5008 0xffffffffffffffff, 0x8000000000000000,
5009 0xffffffffffffffff, 0xf0000000f0000000};
5010
5011 uint64_t zm_d[] = {0x0000000000000001, 0xffffffff00000000,
5012 0x8000000000000000, 0x0000000000000002,
5013 0x8888888888888888, 0x0000000000000001,
5014 0x0000000080000000, 0x00000000f0000000};
5015
5016 int pg_s[] = {1, 1, 0, 1, 1, 0, 1, 1};
5017 int pg_d[] = {1, 0, 1, 1, 1, 1, 0, 1};
5018
5019 unsigned exp_s[] = {0x00000000, 0x00000000, 0xffffffff, 0x40000000,
5020 0x00000000, 0x80000000, 0x0001ffff, 0x00000000};
5021
5022 uint64_t exp_d[] = {0x0000000000000000, 0x0000000000000001,
5023 0x0000000000000001, 0x4000000000000000,
5024 0x0000000000000001, 0x8000000000000000,
5025 0xffffffffffffffff, 0x0000000100000001};
5026
5027 ArithPredicatedFn fn = &MacroAssembler::Udiv;
5028 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
5029 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
5030 // clang-format on
5031 }
5032
5033 typedef void (MacroAssembler::*ArithFn)(const ZRegister& zd,
5034 const ZRegister& zn,
5035 const ZRegister& zm);
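// A usage sketch for the unpredicated form, mirroring IntArithHelper below:
//   ArithFn fn = &MacroAssembler::Add;
//   (masm.*fn)(z0.VnB(), z1.VnB(), z2.VnB());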
5036
5037 template <typename T>
5038 static void IntArithHelper(Test* config,
5039 ArithFn macro,
5040 unsigned lane_size_in_bits,
5041 const T& zn_inputs,
5042 const T& zm_inputs,
5043 const T& zd_expected) {
5044 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5045 START();
5046
5047 ZRegister zn = z31.WithLaneSize(lane_size_in_bits);
5048 ZRegister zm = z27.WithLaneSize(lane_size_in_bits);
5049 InsrHelper(&masm, zn, zn_inputs);
5050 InsrHelper(&masm, zm, zm_inputs);
5051
5052 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
5053 (masm.*macro)(zd, zn, zm);
5054
5055 END();
5056
5057 if (CAN_RUN()) {
5058 RUN();
5059 ASSERT_EQUAL_SVE(zd_expected, zd);
5060 }
5061 }
5062
5063 TEST_SVE(sve_arithmetic_unpredicated_add_sqadd_uqadd) {
5064 // clang-format off
5065 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xaa, 0x55, 0xff, 0xf0};
5066 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa, 0x5555, 0xffff, 0xf0f0};
5067 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0x10001010, 0xaaaaaaaa, 0xf000f0f0};
5068 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
5069 0x1000000010001010, 0xf0000000f000f0f0};
5070
5071 ArithFn fn = &MacroAssembler::Add;
5072
5073 unsigned add_exp_b[] = {0x02, 0xfe, 0x20, 0x54, 0xaa, 0xfe, 0xe0};
5074 unsigned add_exp_h[] = {0x0302, 0xfefe, 0x2020, 0x5554, 0xaaaa, 0xfffe, 0xe1e0};
5075 unsigned add_exp_s[] = {0x00030302, 0xfffefefe, 0x20002020, 0x55555554, 0xe001e1e0};
5076 uint64_t add_exp_d[] = {0x0000000300030302, 0xfffffffefffefefe,
5077 0x2000000020002020, 0xe0000001e001e1e0};
5078
5079 IntArithHelper(config, fn, kBRegSize, in_b, in_b, add_exp_b);
5080 IntArithHelper(config, fn, kHRegSize, in_h, in_h, add_exp_h);
5081 IntArithHelper(config, fn, kSRegSize, in_s, in_s, add_exp_s);
5082 IntArithHelper(config, fn, kDRegSize, in_d, in_d, add_exp_d);
5083
5084 fn = &MacroAssembler::Sqadd;
5085
5086 unsigned sqadd_exp_b[] = {0x80, 0x7f, 0x20, 0x80, 0x7f, 0xfe, 0xe0};
5087 unsigned sqadd_exp_h[] = {0x8000, 0x7fff, 0x2020, 0x8000, 0x7fff, 0xfffe, 0xe1e0};
5088 unsigned sqadd_exp_s[] = {0x80000000, 0x7fffffff, 0x20002020, 0x80000000, 0xe001e1e0};
5089 uint64_t sqadd_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5090 0x2000000020002020, 0xe0000001e001e1e0};
5091
5092 IntArithHelper(config, fn, kBRegSize, in_b, in_b, sqadd_exp_b);
5093 IntArithHelper(config, fn, kHRegSize, in_h, in_h, sqadd_exp_h);
5094 IntArithHelper(config, fn, kSRegSize, in_s, in_s, sqadd_exp_s);
5095 IntArithHelper(config, fn, kDRegSize, in_d, in_d, sqadd_exp_d);
5096
5097 fn = &MacroAssembler::Uqadd;
5098
5099 unsigned uqadd_exp_b[] = {0xff, 0xfe, 0x20, 0xff, 0xaa, 0xff, 0xff};
5100 unsigned uqadd_exp_h[] = {0xffff, 0xfefe, 0x2020, 0xffff, 0xaaaa, 0xffff, 0xffff};
5101 unsigned uqadd_exp_s[] = {0xffffffff, 0xfffefefe, 0x20002020, 0xffffffff, 0xffffffff};
5102 uint64_t uqadd_exp_d[] = {0xffffffffffffffff, 0xfffffffefffefefe,
5103 0x2000000020002020, 0xffffffffffffffff};
5104
5105 IntArithHelper(config, fn, kBRegSize, in_b, in_b, uqadd_exp_b);
5106 IntArithHelper(config, fn, kHRegSize, in_h, in_h, uqadd_exp_h);
5107 IntArithHelper(config, fn, kSRegSize, in_s, in_s, uqadd_exp_s);
5108 IntArithHelper(config, fn, kDRegSize, in_d, in_d, uqadd_exp_d);
5109 // clang-format on
5110 }
5111
5112 TEST_SVE(sve_arithmetic_unpredicated_sub_sqsub_uqsub) {
5113 // clang-format off
5114
5115 unsigned ins1_b[] = {0x81, 0x7f, 0x7e, 0xaa};
5116 unsigned ins2_b[] = {0x10, 0xf0, 0xf0, 0x55};
5117
5118 unsigned ins1_h[] = {0x8181, 0x7f7f, 0x7e7e, 0xaaaa};
5119 unsigned ins2_h[] = {0x1010, 0xf0f0, 0xf0f0, 0x5555};
5120
5121 unsigned ins1_s[] = {0x80018181, 0x7fff7f7f, 0x7eee7e7e, 0xaaaaaaaa};
5122 unsigned ins2_s[] = {0x10001010, 0xf000f0f0, 0xf000f0f0, 0x55555555};
5123
5124 uint64_t ins1_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
5125 0x7eeeeeee7eee7e7e, 0xaaaaaaaaaaaaaaaa};
5126 uint64_t ins2_d[] = {0x1000000010001010, 0xf0000000f000f0f0,
5127 0xf0000000f000f0f0, 0x5555555555555555};
5128
5129 ArithFn fn = &MacroAssembler::Sub;
5130
5131 unsigned ins1_sub_ins2_exp_b[] = {0x71, 0x8f, 0x8e, 0x55};
5132 unsigned ins1_sub_ins2_exp_h[] = {0x7171, 0x8e8f, 0x8d8e, 0x5555};
5133 unsigned ins1_sub_ins2_exp_s[] = {0x70017171, 0x8ffe8e8f, 0x8eed8d8e, 0x55555555};
5134 uint64_t ins1_sub_ins2_exp_d[] = {0x7000000170017171, 0x8ffffffe8ffe8e8f,
5135 0x8eeeeeed8eed8d8e, 0x5555555555555555};
5136
5137 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sub_ins2_exp_b);
5138 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sub_ins2_exp_h);
5139 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sub_ins2_exp_s);
5140 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sub_ins2_exp_d);
5141
5142 unsigned ins2_sub_ins1_exp_b[] = {0x8f, 0x71, 0x72, 0xab};
5143 unsigned ins2_sub_ins1_exp_h[] = {0x8e8f, 0x7171, 0x7272, 0xaaab};
5144 unsigned ins2_sub_ins1_exp_s[] = {0x8ffe8e8f, 0x70017171, 0x71127272, 0xaaaaaaab};
5145 uint64_t ins2_sub_ins1_exp_d[] = {0x8ffffffe8ffe8e8f, 0x7000000170017171,
5146 0x7111111271127272, 0xaaaaaaaaaaaaaaab};
5147
5148 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sub_ins1_exp_b);
5149 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sub_ins1_exp_h);
5150 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sub_ins1_exp_s);
5151 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sub_ins1_exp_d);
5152
5153 fn = &MacroAssembler::Sqsub;
5154
5155 unsigned ins1_sqsub_ins2_exp_b[] = {0x80, 0x7f, 0x7f, 0x80};
5156 unsigned ins1_sqsub_ins2_exp_h[] = {0x8000, 0x7fff, 0x7fff, 0x8000};
5157 unsigned ins1_sqsub_ins2_exp_s[] = {0x80000000, 0x7fffffff, 0x7fffffff, 0x80000000};
5158 uint64_t ins1_sqsub_ins2_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5159 0x7fffffffffffffff, 0x8000000000000000};
5160
5161 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sqsub_ins2_exp_b);
5162 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sqsub_ins2_exp_h);
5163 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sqsub_ins2_exp_s);
5164 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sqsub_ins2_exp_d);
5165
5166 unsigned ins2_sqsub_ins1_exp_b[] = {0x7f, 0x80, 0x80, 0x7f};
5167 unsigned ins2_sqsub_ins1_exp_h[] = {0x7fff, 0x8000, 0x8000, 0x7fff};
5168 unsigned ins2_sqsub_ins1_exp_s[] = {0x7fffffff, 0x80000000, 0x80000000, 0x7fffffff};
5169 uint64_t ins2_sqsub_ins1_exp_d[] = {0x7fffffffffffffff, 0x8000000000000000,
5170 0x8000000000000000, 0x7fffffffffffffff};
5171
5172 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sqsub_ins1_exp_b);
5173 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sqsub_ins1_exp_h);
5174 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sqsub_ins1_exp_s);
5175 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sqsub_ins1_exp_d);
5176
5177 fn = &MacroAssembler::Uqsub;
5178
5179 unsigned ins1_uqsub_ins2_exp_b[] = {0x71, 0x00, 0x00, 0x55};
5180 unsigned ins1_uqsub_ins2_exp_h[] = {0x7171, 0x0000, 0x0000, 0x5555};
5181 unsigned ins1_uqsub_ins2_exp_s[] = {0x70017171, 0x00000000, 0x00000000, 0x55555555};
5182 uint64_t ins1_uqsub_ins2_exp_d[] = {0x7000000170017171, 0x0000000000000000,
5183 0x0000000000000000, 0x5555555555555555};
5184
5185 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_uqsub_ins2_exp_b);
5186 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_uqsub_ins2_exp_h);
5187 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_uqsub_ins2_exp_s);
5188 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_uqsub_ins2_exp_d);
5189
5190 unsigned ins2_uqsub_ins1_exp_b[] = {0x00, 0x71, 0x72, 0x00};
5191 unsigned ins2_uqsub_ins1_exp_h[] = {0x0000, 0x7171, 0x7272, 0x0000};
5192 unsigned ins2_uqsub_ins1_exp_s[] = {0x00000000, 0x70017171, 0x71127272, 0x00000000};
5193 uint64_t ins2_uqsub_ins1_exp_d[] = {0x0000000000000000, 0x7000000170017171,
5194 0x7111111271127272, 0x0000000000000000};
5195
5196 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_uqsub_ins1_exp_b);
5197 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_uqsub_ins1_exp_h);
5198 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_uqsub_ins1_exp_s);
5199 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_uqsub_ins1_exp_d);
5200 // clang-format on
5201 }
5202
5203 TEST_SVE(sve_rdvl) {
5204 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5205 START();
5206
5207 // Encodable multipliers.
5208 __ Rdvl(x0, 0);
5209 __ Rdvl(x1, 1);
5210 __ Rdvl(x2, 2);
5211 __ Rdvl(x3, 31);
5212 __ Rdvl(x4, -1);
5213 __ Rdvl(x5, -2);
5214 __ Rdvl(x6, -32);
5215
5216 // For unencodable multipliers, the MacroAssembler uses a sequence of
5217 // instructions.
5218 __ Rdvl(x10, 32);
5219 __ Rdvl(x11, -33);
5220 __ Rdvl(x12, 42);
5221 __ Rdvl(x13, -42);
5222
5223 // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5224 // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5225 // occurs in the macro.
5226 __ Rdvl(x14, 0x007fffffffffffff);
5227 __ Rdvl(x15, -0x0080000000000000);
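  // As a worked example, on a 512-bit (64-byte) implementation x1 should hold
  // 64, x12 should hold 64 * 42 = 2688 and x4 should hold -64; the checks
  // below scale by whatever VL the test actually runs with.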
5228
5229 END();
5230
5231 if (CAN_RUN()) {
5232 RUN();
5233
5234 uint64_t vl = config->sve_vl_in_bytes();
5235
5236 ASSERT_EQUAL_64(vl * 0, x0);
5237 ASSERT_EQUAL_64(vl * 1, x1);
5238 ASSERT_EQUAL_64(vl * 2, x2);
5239 ASSERT_EQUAL_64(vl * 31, x3);
5240 ASSERT_EQUAL_64(vl * -1, x4);
5241 ASSERT_EQUAL_64(vl * -2, x5);
5242 ASSERT_EQUAL_64(vl * -32, x6);
5243
5244 ASSERT_EQUAL_64(vl * 32, x10);
5245 ASSERT_EQUAL_64(vl * -33, x11);
5246 ASSERT_EQUAL_64(vl * 42, x12);
5247 ASSERT_EQUAL_64(vl * -42, x13);
5248
5249 ASSERT_EQUAL_64(vl * 0x007fffffffffffff, x14);
5250 ASSERT_EQUAL_64(vl * 0xff80000000000000, x15);
5251 }
5252 }
5253
5254 TEST_SVE(sve_rdpl) {
5255 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5256 START();
5257
5258 // There is no `rdpl` instruction, so the MacroAssembler maps `Rdpl` onto
5259 // Addpl(xd, xzr, ...).
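  // Since PL = VL / 8 (one predicate bit per vector byte), multipliers that
  // are multiples of 8 correspond to whole vector lengths; for example,
  // `Rdpl(x1, 8)` is equivalent to `Rdvl(x1, 1)`.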
5260
5261 // Encodable multipliers (as `addvl`).
5262 __ Rdpl(x0, 0);
5263 __ Rdpl(x1, 8);
5264 __ Rdpl(x2, 248);
5265 __ Rdpl(x3, -8);
5266 __ Rdpl(x4, -256);
5267
5268 // Encodable multipliers (as `movz` + `addpl`).
5269 __ Rdpl(x7, 31);
5270 __ Rdpl(x8, -31);
5271
5272 // For unencodable multipliers, the MacroAssembler uses a sequence of
5273 // instructions.
5274 __ Rdpl(x10, 42);
5275 __ Rdpl(x11, -42);
5276
5277 // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5278 // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5279 // occurs in the macro.
5280 __ Rdpl(x12, 0x007fffffffffffff);
5281 __ Rdpl(x13, -0x0080000000000000);
5282
5283 END();
5284
5285 if (CAN_RUN()) {
5286 RUN();
5287
5288 uint64_t vl = config->sve_vl_in_bytes();
5289 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5290 uint64_t pl = vl / kZRegBitsPerPRegBit;
5291
5292 ASSERT_EQUAL_64(pl * 0, x0);
5293 ASSERT_EQUAL_64(pl * 8, x1);
5294 ASSERT_EQUAL_64(pl * 248, x2);
5295 ASSERT_EQUAL_64(pl * -8, x3);
5296 ASSERT_EQUAL_64(pl * -256, x4);
5297
5298 ASSERT_EQUAL_64(pl * 31, x7);
5299 ASSERT_EQUAL_64(pl * -31, x8);
5300
5301 ASSERT_EQUAL_64(pl * 42, x10);
5302 ASSERT_EQUAL_64(pl * -42, x11);
5303
5304 ASSERT_EQUAL_64(pl * 0x007fffffffffffff, x12);
5305 ASSERT_EQUAL_64(pl * 0xff80000000000000, x13);
5306 }
5307 }
5308
5309 TEST_SVE(sve_addvl) {
5310 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5311 START();
5312
5313 uint64_t base = 0x1234567800000000;
5314 __ Mov(x30, base);
5315
5316 // Encodable multipliers.
5317 __ Addvl(x0, x30, 0);
5318 __ Addvl(x1, x30, 1);
5319 __ Addvl(x2, x30, 31);
5320 __ Addvl(x3, x30, -1);
5321 __ Addvl(x4, x30, -32);
5322
5323 // For unencodable multipliers, the MacroAssembler uses `Rdvl` and `Add`.
5324 __ Addvl(x5, x30, 32);
5325 __ Addvl(x6, x30, -33);
5326
5327 // Test the limits of the multiplier supported by the `Rdvl` macro.
5328 __ Addvl(x7, x30, 0x007fffffffffffff);
5329 __ Addvl(x8, x30, -0x0080000000000000);
5330
5331 // Check that xzr behaves correctly.
5332 __ Addvl(x9, xzr, 8);
5333 __ Addvl(x10, xzr, 42);
5334
5335 // Check that sp behaves correctly with encodable and unencodable multipliers.
5336 __ Addvl(sp, sp, -5);
5337 __ Addvl(sp, sp, -37);
5338 __ Addvl(x11, sp, -2);
5339 __ Addvl(sp, x11, 2);
5340 __ Addvl(x12, sp, -42);
5341
5342 // Restore the value of sp.
5343 __ Addvl(sp, x11, 39);
5344 __ Addvl(sp, sp, 5);
5345
5346 // Adjust x11 and x12 to make the test sp-agnostic.
5347 __ Sub(x11, sp, x11);
5348 __ Sub(x12, sp, x12);
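  // At this point x11 should hold 44 * VL and x12 should hold 84 * VL,
  // regardless of the original sp: x11 was computed 5 + 37 + 2 = 44 vector
  // lengths below the starting sp before being re-based above.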
5349
5350 // Check cases where xd.Is(xn). This stresses scratch register allocation.
5351 __ Mov(x20, x30);
5352 __ Mov(x21, x30);
5353 __ Mov(x22, x30);
5354 __ Addvl(x20, x20, 4);
5355 __ Addvl(x21, x21, 42);
5356 __ Addvl(x22, x22, -0x0080000000000000);
5357
5358 END();
5359
5360 if (CAN_RUN()) {
5361 RUN();
5362
5363 uint64_t vl = config->sve_vl_in_bytes();
5364
5365 ASSERT_EQUAL_64(base + (vl * 0), x0);
5366 ASSERT_EQUAL_64(base + (vl * 1), x1);
5367 ASSERT_EQUAL_64(base + (vl * 31), x2);
5368 ASSERT_EQUAL_64(base + (vl * -1), x3);
5369 ASSERT_EQUAL_64(base + (vl * -32), x4);
5370
5371 ASSERT_EQUAL_64(base + (vl * 32), x5);
5372 ASSERT_EQUAL_64(base + (vl * -33), x6);
5373
5374 ASSERT_EQUAL_64(base + (vl * 0x007fffffffffffff), x7);
5375 ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x8);
5376
5377 ASSERT_EQUAL_64(vl * 8, x9);
5378 ASSERT_EQUAL_64(vl * 42, x10);
5379
5380 ASSERT_EQUAL_64(vl * 44, x11);
5381 ASSERT_EQUAL_64(vl * 84, x12);
5382
5383 ASSERT_EQUAL_64(base + (vl * 4), x20);
5384 ASSERT_EQUAL_64(base + (vl * 42), x21);
5385 ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x22);
5386
5387 ASSERT_EQUAL_64(base, x30);
5388 }
5389 }
5390
5391 TEST_SVE(sve_addpl) {
5392 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5393 START();
5394
5395 uint64_t base = 0x1234567800000000;
5396 __ Mov(x30, base);
5397
5398 // Encodable multipliers.
5399 __ Addpl(x0, x30, 0);
5400 __ Addpl(x1, x30, 1);
5401 __ Addpl(x2, x30, 31);
5402 __ Addpl(x3, x30, -1);
5403 __ Addpl(x4, x30, -32);
5404
5405 // For unencodable multipliers, the MacroAssembler uses `Addvl` if it can, or
5406 // it falls back to `Rdvl` and `Add`.
5407 __ Addpl(x5, x30, 32);
5408 __ Addpl(x6, x30, -33);
5409
5410 // Test the limits of the multiplier supported by the `Rdvl` macro.
5411 __ Addpl(x7, x30, 0x007fffffffffffff);
5412 __ Addpl(x8, x30, -0x0080000000000000);
5413
5414 // Check that xzr behaves correctly.
5415 __ Addpl(x9, xzr, 8);
5416 __ Addpl(x10, xzr, 42);
5417
5418 // Check that sp behaves correctly with encodable and unencodable multipliers.
5419 __ Addpl(sp, sp, -5);
5420 __ Addpl(sp, sp, -37);
5421 __ Addpl(x11, sp, -2);
5422 __ Addpl(sp, x11, 2);
5423 __ Addpl(x12, sp, -42);
5424
5425 // Restore the value of sp.
5426 __ Addpl(sp, x11, 39);
5427 __ Addpl(sp, sp, 5);
5428
5429 // Adjust x11 and x12 to make the test sp-agnostic.
5430 __ Sub(x11, sp, x11);
5431 __ Sub(x12, sp, x12);
5432
5433 // Check cases where xd.Is(xn). This stresses scratch register allocation.
5434 __ Mov(x20, x30);
5435 __ Mov(x21, x30);
5436 __ Mov(x22, x30);
5437 __ Addpl(x20, x20, 4);
5438 __ Addpl(x21, x21, 42);
5439 __ Addpl(x22, x22, -0x0080000000000000);
5440
5441 END();
5442
5443 if (CAN_RUN()) {
5444 RUN();
5445
5446 uint64_t vl = config->sve_vl_in_bytes();
5447 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5448 uint64_t pl = vl / kZRegBitsPerPRegBit;
5449
5450 ASSERT_EQUAL_64(base + (pl * 0), x0);
5451 ASSERT_EQUAL_64(base + (pl * 1), x1);
5452 ASSERT_EQUAL_64(base + (pl * 31), x2);
5453 ASSERT_EQUAL_64(base + (pl * -1), x3);
5454 ASSERT_EQUAL_64(base + (pl * -32), x4);
5455
5456 ASSERT_EQUAL_64(base + (pl * 32), x5);
5457 ASSERT_EQUAL_64(base + (pl * -33), x6);
5458
5459 ASSERT_EQUAL_64(base + (pl * 0x007fffffffffffff), x7);
5460 ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x8);
5461
5462 ASSERT_EQUAL_64(pl * 8, x9);
5463 ASSERT_EQUAL_64(pl * 42, x10);
5464
5465 ASSERT_EQUAL_64(pl * 44, x11);
5466 ASSERT_EQUAL_64(pl * 84, x12);
5467
5468 ASSERT_EQUAL_64(base + (pl * 4), x20);
5469 ASSERT_EQUAL_64(base + (pl * 42), x21);
5470 ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x22);
5471
5472 ASSERT_EQUAL_64(base, x30);
5473 }
5474 }
5475
5476 TEST_SVE(sve_calculate_sve_address) {
5477 #pragma GCC diagnostic push
5478 #pragma GCC diagnostic ignored "-Wshadow"
5479
5480 // Shadow the `MacroAssembler` type so that the test macros work without
5481 // modification.
5482 typedef CalculateSVEAddressMacroAssembler MacroAssembler;
5483
5484 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5485 START(); // NOLINT(clang-diagnostic-local-type-template-args)
5486
5487 uint64_t base = 0x1234567800000000;
5488 __ Mov(x28, base);
5489 __ Mov(x29, 48);
5490 __ Mov(x30, -48);
5491
5492 // Simple scalar (or equivalent) cases.
5493
5494 __ CalculateSVEAddress(x0, SVEMemOperand(x28));
5495 __ CalculateSVEAddress(x1, SVEMemOperand(x28, 0));
5496 __ CalculateSVEAddress(x2, SVEMemOperand(x28, 0, SVE_MUL_VL));
5497 __ CalculateSVEAddress(x3, SVEMemOperand(x28, 0, SVE_MUL_VL), 3);
5498 __ CalculateSVEAddress(x4, SVEMemOperand(x28, xzr));
5499 __ CalculateSVEAddress(x5, SVEMemOperand(x28, xzr, LSL, 42));
5500
5501 // scalar-plus-immediate
5502
5503 // Unscaled immediates, handled with `Add`.
5504 __ CalculateSVEAddress(x6, SVEMemOperand(x28, 42));
5505 __ CalculateSVEAddress(x7, SVEMemOperand(x28, -42));
5506 // Scaled immediates, handled with `Addvl` or `Addpl`.
5507 __ CalculateSVEAddress(x8, SVEMemOperand(x28, 31, SVE_MUL_VL), 0);
5508 __ CalculateSVEAddress(x9, SVEMemOperand(x28, -32, SVE_MUL_VL), 0);
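  // With the final argument of 0 (whole Z-register accesses), these offsets
  // are whole vector lengths, so x8 is expected to be base + (31 * vl) and x9
  // to be base - (32 * vl), as verified below.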
5509 // Out of `addvl` or `addpl` range.
5510 __ CalculateSVEAddress(x10, SVEMemOperand(x28, 42, SVE_MUL_VL), 0);
5511 __ CalculateSVEAddress(x11, SVEMemOperand(x28, -42, SVE_MUL_VL), 0);
5512 // As above, for VL-based accesses smaller than a Z register.
5513 VIXL_STATIC_ASSERT(kZRegBitsPerPRegBitLog2 == 3);
5514 __ CalculateSVEAddress(x12, SVEMemOperand(x28, -32 * 8, SVE_MUL_VL), 3);
5515 __ CalculateSVEAddress(x13, SVEMemOperand(x28, -42 * 8, SVE_MUL_VL), 3);
5516 __ CalculateSVEAddress(x14, SVEMemOperand(x28, -32 * 4, SVE_MUL_VL), 2);
5517 __ CalculateSVEAddress(x15, SVEMemOperand(x28, -42 * 4, SVE_MUL_VL), 2);
5518 __ CalculateSVEAddress(x18, SVEMemOperand(x28, -32 * 2, SVE_MUL_VL), 1);
5519 __ CalculateSVEAddress(x19, SVEMemOperand(x28, -42 * 2, SVE_MUL_VL), 1);
5520
5521 // scalar-plus-scalar
5522
5523 __ CalculateSVEAddress(x20, SVEMemOperand(x28, x29));
5524 __ CalculateSVEAddress(x21, SVEMemOperand(x28, x30));
5525 __ CalculateSVEAddress(x22, SVEMemOperand(x28, x29, LSL, 8));
5526 __ CalculateSVEAddress(x23, SVEMemOperand(x28, x30, LSL, 8));
5527
5528 // In-place updates, to stress scratch register allocation.
5529
5530 __ Mov(x24, 0xabcd000000000000);
5531 __ Mov(x25, 0xabcd101100000000);
5532 __ Mov(x26, 0xabcd202200000000);
5533 __ Mov(x27, 0xabcd303300000000);
5534 __ Mov(x28, 0xabcd404400000000);
5535 __ Mov(x29, 0xabcd505500000000);
5536
5537 __ CalculateSVEAddress(x24, SVEMemOperand(x24));
5538 __ CalculateSVEAddress(x25, SVEMemOperand(x25, 0x42));
5539 __ CalculateSVEAddress(x26, SVEMemOperand(x26, 3, SVE_MUL_VL), 0);
5540 __ CalculateSVEAddress(x27, SVEMemOperand(x27, 0x42, SVE_MUL_VL), 3);
5541 __ CalculateSVEAddress(x28, SVEMemOperand(x28, x30));
5542 __ CalculateSVEAddress(x29, SVEMemOperand(x29, x30, LSL, 4));
5543
5544 END();
5545
5546 if (CAN_RUN()) {
5547 RUN();
5548
5549 uint64_t vl = config->sve_vl_in_bytes();
5550 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5551 uint64_t pl = vl / kZRegBitsPerPRegBit;
5552
5553 // Simple scalar (or equivalent) cases.
5554 ASSERT_EQUAL_64(base, x0);
5555 ASSERT_EQUAL_64(base, x1);
5556 ASSERT_EQUAL_64(base, x2);
5557 ASSERT_EQUAL_64(base, x3);
5558 ASSERT_EQUAL_64(base, x4);
5559 ASSERT_EQUAL_64(base, x5);
5560
5561 // scalar-plus-immediate
5562 ASSERT_EQUAL_64(base + 42, x6);
5563 ASSERT_EQUAL_64(base - 42, x7);
5564 ASSERT_EQUAL_64(base + (31 * vl), x8);
5565 ASSERT_EQUAL_64(base - (32 * vl), x9);
5566 ASSERT_EQUAL_64(base + (42 * vl), x10);
5567 ASSERT_EQUAL_64(base - (42 * vl), x11);
5568 ASSERT_EQUAL_64(base - (32 * vl), x12);
5569 ASSERT_EQUAL_64(base - (42 * vl), x13);
5570 ASSERT_EQUAL_64(base - (32 * vl), x14);
5571 ASSERT_EQUAL_64(base - (42 * vl), x15);
5572 ASSERT_EQUAL_64(base - (32 * vl), x18);
5573 ASSERT_EQUAL_64(base - (42 * vl), x19);
5574
5575 // scalar-plus-scalar
5576 ASSERT_EQUAL_64(base + 48, x20);
5577 ASSERT_EQUAL_64(base - 48, x21);
5578 ASSERT_EQUAL_64(base + (48 << 8), x22);
5579 ASSERT_EQUAL_64(base - (48 << 8), x23);
5580
5581 // In-place updates.
5582 ASSERT_EQUAL_64(0xabcd000000000000, x24);
5583 ASSERT_EQUAL_64(0xabcd101100000000 + 0x42, x25);
5584 ASSERT_EQUAL_64(0xabcd202200000000 + (3 * vl), x26);
5585 ASSERT_EQUAL_64(0xabcd303300000000 + (0x42 * pl), x27);
5586 ASSERT_EQUAL_64(0xabcd404400000000 - 48, x28);
5587 ASSERT_EQUAL_64(0xabcd505500000000 - (48 << 4), x29);
5588 }
5589 #pragma GCC diagnostic pop
5590 }
5591
5592 TEST_SVE(sve_permute_vector_unpredicated) {
5593 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
5594 START();
5595
5596 // Initialise registers with known values first.
5597 __ Dup(z1.VnB(), 0x11);
5598 __ Dup(z2.VnB(), 0x22);
5599 __ Dup(z3.VnB(), 0x33);
5600 __ Dup(z4.VnB(), 0x44);
5601
5602 __ Mov(x0, 0x0123456789abcdef);
5603 __ Fmov(d0, RawbitsToDouble(0x7ffaaaaa22223456));
5604 __ Insr(z1.VnS(), w0);
5605 __ Insr(z2.VnD(), x0);
5606 __ Insr(z3.VnH(), h0);
5607 __ Insr(z4.VnD(), d0);
5608
5609 uint64_t inputs[] = {0xfedcba9876543210,
5610 0x0123456789abcdef,
5611 0x8f8e8d8c8b8a8988,
5612 0x8786858483828180};
5613
5614   // Fill the register with a distinguishable value before inserting the inputs.
5615 __ Dup(z9.VnB(), 0xff);
5616 InsrHelper(&masm, z9.VnD(), inputs);
5617
5618 __ Rev(z5.VnB(), z9.VnB());
5619 __ Rev(z6.VnH(), z9.VnH());
5620 __ Rev(z7.VnS(), z9.VnS());
5621 __ Rev(z8.VnD(), z9.VnD());
5622
5623 int index[7] = {22, 7, 7, 3, 1, 1, 63};
5624   // Broadcast a lane from within the inserted input values.
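  // For reference (assuming a VL large enough to hold all the inputs): byte
  // lane 22 of z9 is 0x23 and H lane 7 is 0x8f8e, since lane 0 holds the
  // least-significant byte of the last array entry. An index beyond the
  // current VL makes `dup` zero the destination, which the checks below
  // allow for.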
5625 __ Dup(z10.VnB(), z9.VnB(), index[0]);
5626 __ Dup(z11.VnH(), z9.VnH(), index[1]);
5627 __ Dup(z12.VnS(), z9.VnS(), index[2]);
5628 __ Dup(z13.VnD(), z9.VnD(), index[3]);
5629 __ Dup(z14.VnQ(), z9.VnQ(), index[4]);
5630 // Test dst == src
5631 __ Mov(z15, z9);
5632 __ Dup(z15.VnS(), z15.VnS(), index[5]);
5633   // Select a lane beyond the inserted input values.
5634 __ Dup(z16.VnB(), z9.VnB(), index[6]);
5635
5636 END();
5637
5638 if (CAN_RUN()) {
5639 RUN();
5640
5641 // Insr
5642 uint64_t z1_expected[] = {0x1111111111111111, 0x1111111189abcdef};
5643 uint64_t z2_expected[] = {0x2222222222222222, 0x0123456789abcdef};
5644 uint64_t z3_expected[] = {0x3333333333333333, 0x3333333333333456};
5645 uint64_t z4_expected[] = {0x4444444444444444, 0x7ffaaaaa22223456};
5646 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
5647 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
5648 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
5649 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
5650
5651 // Rev
5652 int lane_count = core.GetSVELaneCount(kBRegSize);
5653 for (int i = 0; i < lane_count; i++) {
5654 uint64_t expected =
5655 core.zreg_lane(z5.GetCode(), kBRegSize, lane_count - i - 1);
5656 uint64_t input = core.zreg_lane(z9.GetCode(), kBRegSize, i);
5657 ASSERT_EQUAL_64(expected, input);
5658 }
5659
5660 lane_count = core.GetSVELaneCount(kHRegSize);
5661 for (int i = 0; i < lane_count; i++) {
5662 uint64_t expected =
5663 core.zreg_lane(z6.GetCode(), kHRegSize, lane_count - i - 1);
5664 uint64_t input = core.zreg_lane(z9.GetCode(), kHRegSize, i);
5665 ASSERT_EQUAL_64(expected, input);
5666 }
5667
5668 lane_count = core.GetSVELaneCount(kSRegSize);
5669 for (int i = 0; i < lane_count; i++) {
5670 uint64_t expected =
5671 core.zreg_lane(z7.GetCode(), kSRegSize, lane_count - i - 1);
5672 uint64_t input = core.zreg_lane(z9.GetCode(), kSRegSize, i);
5673 ASSERT_EQUAL_64(expected, input);
5674 }
5675
5676 lane_count = core.GetSVELaneCount(kDRegSize);
5677 for (int i = 0; i < lane_count; i++) {
5678 uint64_t expected =
5679 core.zreg_lane(z8.GetCode(), kDRegSize, lane_count - i - 1);
5680 uint64_t input = core.zreg_lane(z9.GetCode(), kDRegSize, i);
5681 ASSERT_EQUAL_64(expected, input);
5682 }
5683
5684 // Dup
5685 unsigned vl = config->sve_vl_in_bits();
5686 lane_count = core.GetSVELaneCount(kBRegSize);
5687 uint64_t expected_z10 = (vl > (index[0] * kBRegSize)) ? 0x23 : 0;
5688 for (int i = 0; i < lane_count; i++) {
5689 ASSERT_EQUAL_SVE_LANE(expected_z10, z10.VnB(), i);
5690 }
5691
5692 lane_count = core.GetSVELaneCount(kHRegSize);
5693 uint64_t expected_z11 = (vl > (index[1] * kHRegSize)) ? 0x8f8e : 0;
5694 for (int i = 0; i < lane_count; i++) {
5695 ASSERT_EQUAL_SVE_LANE(expected_z11, z11.VnH(), i);
5696 }
5697
5698 lane_count = core.GetSVELaneCount(kSRegSize);
5699 uint64_t expected_z12 = (vl > (index[2] * kSRegSize)) ? 0xfedcba98 : 0;
5700 for (int i = 0; i < lane_count; i++) {
5701 ASSERT_EQUAL_SVE_LANE(expected_z12, z12.VnS(), i);
5702 }
5703
5704 lane_count = core.GetSVELaneCount(kDRegSize);
5705 uint64_t expected_z13 =
5706 (vl > (index[3] * kDRegSize)) ? 0xfedcba9876543210 : 0;
5707 for (int i = 0; i < lane_count; i++) {
5708 ASSERT_EQUAL_SVE_LANE(expected_z13, z13.VnD(), i);
5709 }
5710
5711 lane_count = core.GetSVELaneCount(kDRegSize);
5712 uint64_t expected_z14_lo = 0;
5713 uint64_t expected_z14_hi = 0;
5714 if (vl > (index[4] * kQRegSize)) {
5715 expected_z14_lo = 0x0123456789abcdef;
5716 expected_z14_hi = 0xfedcba9876543210;
5717 }
5718 for (int i = 0; i < lane_count; i += 2) {
5719 ASSERT_EQUAL_SVE_LANE(expected_z14_lo, z14.VnD(), i);
5720 ASSERT_EQUAL_SVE_LANE(expected_z14_hi, z14.VnD(), i + 1);
5721 }
5722
5723 lane_count = core.GetSVELaneCount(kSRegSize);
5724 uint64_t expected_z15 = (vl > (index[5] * kSRegSize)) ? 0x87868584 : 0;
5725 for (int i = 0; i < lane_count; i++) {
5726 ASSERT_EQUAL_SVE_LANE(expected_z15, z15.VnS(), i);
5727 }
5728
5729 lane_count = core.GetSVELaneCount(kBRegSize);
5730 uint64_t expected_z16 = (vl > (index[6] * kBRegSize)) ? 0xff : 0;
5731 for (int i = 0; i < lane_count; i++) {
5732 ASSERT_EQUAL_SVE_LANE(expected_z16, z16.VnB(), i);
5733 }
5734 }
5735 }
5736
5737 TEST_SVE(sve_permute_vector_unpredicated_unpack_vector_elements) {
5738 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5739 START();
5740
5741 uint64_t z9_inputs[] = {0xfedcba9876543210,
5742 0x0123456789abcdef,
5743 0x8f8e8d8c8b8a8988,
5744 0x8786858483828180};
5745 InsrHelper(&masm, z9.VnD(), z9_inputs);
5746
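  // Sunpkhi/Sunpklo widen and sign-extend the elements from the high/low half
  // of the source, while Uunpkhi/Uunpklo zero-extend them. For example, the
  // byte 0x88 becomes 0xff88 when sign-extended to a halfword, but 0x0088 when
  // zero-extended.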
5747 __ Sunpkhi(z10.VnH(), z9.VnB());
5748 __ Sunpkhi(z11.VnS(), z9.VnH());
5749 __ Sunpkhi(z12.VnD(), z9.VnS());
5750
5751 __ Sunpklo(z13.VnH(), z9.VnB());
5752 __ Sunpklo(z14.VnS(), z9.VnH());
5753 __ Sunpklo(z15.VnD(), z9.VnS());
5754
5755 __ Uunpkhi(z16.VnH(), z9.VnB());
5756 __ Uunpkhi(z17.VnS(), z9.VnH());
5757 __ Uunpkhi(z18.VnD(), z9.VnS());
5758
5759 __ Uunpklo(z19.VnH(), z9.VnB());
5760 __ Uunpklo(z20.VnS(), z9.VnH());
5761 __ Uunpklo(z21.VnD(), z9.VnS());
5762
5763 // Test unpacking with same source and destination.
5764 __ Mov(z22, z9);
5765 __ Sunpklo(z22.VnH(), z22.VnB());
5766 __ Mov(z23, z9);
5767 __ Uunpklo(z23.VnH(), z23.VnB());
5768
5769 END();
5770
5771 if (CAN_RUN()) {
5772 RUN();
5773
5774     // Sunpkhi
5775 int lane_count = core.GetSVELaneCount(kHRegSize);
5776 for (int i = lane_count - 1; i >= 0; i--) {
5777 uint16_t expected = core.zreg_lane<uint16_t>(z10.GetCode(), i);
5778 uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5779 uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5780 ASSERT_EQUAL_64(expected, input);
5781 }
5782
5783 lane_count = core.GetSVELaneCount(kSRegSize);
5784 for (int i = lane_count - 1; i >= 0; i--) {
5785 uint32_t expected = core.zreg_lane<uint32_t>(z11.GetCode(), i);
5786 uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5787 uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5788 ASSERT_EQUAL_64(expected, input);
5789 }
5790
5791 lane_count = core.GetSVELaneCount(kDRegSize);
5792 for (int i = lane_count - 1; i >= 0; i--) {
5793 uint64_t expected = core.zreg_lane<uint64_t>(z12.GetCode(), i);
5794 uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5795 uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5796 ASSERT_EQUAL_64(expected, input);
5797 }
5798
5799     // Sunpklo
5800 lane_count = core.GetSVELaneCount(kHRegSize);
5801 for (int i = lane_count - 1; i >= 0; i--) {
5802 uint16_t expected = core.zreg_lane<uint16_t>(z13.GetCode(), i);
5803 uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5804 uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5805 ASSERT_EQUAL_64(expected, input);
5806 }
5807
5808 lane_count = core.GetSVELaneCount(kSRegSize);
5809 for (int i = lane_count - 1; i >= 0; i--) {
5810 uint32_t expected = core.zreg_lane<uint32_t>(z14.GetCode(), i);
5811 uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5812 uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5813 ASSERT_EQUAL_64(expected, input);
5814 }
5815
5816 lane_count = core.GetSVELaneCount(kDRegSize);
5817 for (int i = lane_count - 1; i >= 0; i--) {
5818 uint64_t expected = core.zreg_lane<uint64_t>(z15.GetCode(), i);
5819 uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5820 uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5821 ASSERT_EQUAL_64(expected, input);
5822 }
5823
5824     // Uunpkhi
5825 lane_count = core.GetSVELaneCount(kHRegSize);
5826 for (int i = lane_count - 1; i >= 0; i--) {
5827 uint16_t expected = core.zreg_lane<uint16_t>(z16.GetCode(), i);
5828 uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5829 ASSERT_EQUAL_64(expected, input);
5830 }
5831
5832 lane_count = core.GetSVELaneCount(kSRegSize);
5833 for (int i = lane_count - 1; i >= 0; i--) {
5834 uint32_t expected = core.zreg_lane<uint32_t>(z17.GetCode(), i);
5835 uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5836 ASSERT_EQUAL_64(expected, input);
5837 }
5838
5839 lane_count = core.GetSVELaneCount(kDRegSize);
5840 for (int i = lane_count - 1; i >= 0; i--) {
5841 uint64_t expected = core.zreg_lane<uint64_t>(z18.GetCode(), i);
5842 uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5843 ASSERT_EQUAL_64(expected, input);
5844 }
5845
5846     // Uunpklo
5847 lane_count = core.GetSVELaneCount(kHRegSize);
5848 for (int i = lane_count - 1; i >= 0; i--) {
5849 uint16_t expected = core.zreg_lane<uint16_t>(z19.GetCode(), i);
5850 uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5851 ASSERT_EQUAL_64(expected, input);
5852 }
5853
5854 lane_count = core.GetSVELaneCount(kSRegSize);
5855 for (int i = lane_count - 1; i >= 0; i--) {
5856 uint32_t expected = core.zreg_lane<uint32_t>(z20.GetCode(), i);
5857 uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5858 ASSERT_EQUAL_64(expected, input);
5859 }
5860
5861 lane_count = core.GetSVELaneCount(kDRegSize);
5862 for (int i = lane_count - 1; i >= 0; i--) {
5863 uint64_t expected = core.zreg_lane<uint64_t>(z21.GetCode(), i);
5864 uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5865 ASSERT_EQUAL_64(expected, input);
5866 }
5867
5868 ASSERT_EQUAL_SVE(z13, z22);
5869 ASSERT_EQUAL_SVE(z19, z23);
5870 }
5871 }
5872
5873 TEST_SVE(sve_cnot_not) {
5874 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5875 START();
5876
5877 uint64_t in[] = {0x0000000000000000, 0x00000000e1c30000, 0x123456789abcdef0};
5878
5879 // For simplicity, we re-use the same pg for various lane sizes.
5880 // For D lanes: 1, 1, 0
5881 // For S lanes: 1, 1, 1, 0, 0
5882 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
5883 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
5884 Initialise(&masm, p0.VnB(), pg_in);
5885 PRegisterM pg = p0.Merging();
5886
5887 // These are merging operations, so we have to initialise the result register.
5888 // We use a mixture of constructive and destructive operations.
5889
5890 InsrHelper(&masm, z31.VnD(), in);
5891 // Make a copy so we can check that constructive operations preserve zn.
5892 __ Mov(z30, z31);
5893
5894 // For constructive operations, use a different initial result value.
5895 __ Index(z29.VnB(), 0, -1);
5896
5897 __ Mov(z0, z31);
5898 __ Cnot(z0.VnB(), pg, z0.VnB()); // destructive
5899 __ Mov(z1, z29);
5900 __ Cnot(z1.VnH(), pg, z31.VnH());
5901 __ Mov(z2, z31);
5902 __ Cnot(z2.VnS(), pg, z2.VnS()); // destructive
5903 __ Mov(z3, z29);
5904 __ Cnot(z3.VnD(), pg, z31.VnD());
5905
5906 __ Mov(z4, z29);
5907 __ Not(z4.VnB(), pg, z31.VnB());
5908 __ Mov(z5, z31);
5909 __ Not(z5.VnH(), pg, z5.VnH()); // destructive
5910 __ Mov(z6, z29);
5911 __ Not(z6.VnS(), pg, z31.VnS());
5912 __ Mov(z7, z31);
5913 __ Not(z7.VnD(), pg, z7.VnD()); // destructive
5914
5915 END();
5916
5917 if (CAN_RUN()) {
5918 RUN();
5919
5920 // Check that constructive operations preserve their inputs.
5921 ASSERT_EQUAL_SVE(z30, z31);
5922
5923 // clang-format off
5924
5925 // Cnot (B) destructive
5926 uint64_t expected_z0[] =
5927 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
5928 {0x0000000001000101, 0x01000001e1000101, 0x12340078000000f0};
5929 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
5930
5931 // Cnot (H)
5932 uint64_t expected_z1[] =
5933 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
5934 {0xe9eaebecedee0001, 0xf1f2000100000001, 0xf9fafbfc0000ff00};
5935 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
5936
5937 // Cnot (S) destructive
5938 uint64_t expected_z2[] =
5939 // pg: 0 1 1 1 0 0
5940 {0x0000000000000001, 0x0000000100000000, 0x123456789abcdef0};
5941 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
5942
5943 // Cnot (D)
5944 uint64_t expected_z3[] =
5945 // pg: 1 1 0
5946 {0x0000000000000001, 0x0000000000000000, 0xf9fafbfcfdfeff00};
5947 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
5948
5949 // Not (B)
5950 uint64_t expected_z4[] =
5951 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
5952 {0xe9eaebecffeeffff, 0xfff2f3fff53cffff, 0xf9faa9fc65432100};
5953 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
5954
5955 // Not (H) destructive
5956 uint64_t expected_z5[] =
5957 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
5958 {0x000000000000ffff, 0x0000ffff1e3cffff, 0x123456786543def0};
5959 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
5960
5961 // Not (S)
5962 uint64_t expected_z6[] =
5963 // pg: 0 1 1 1 0 0
5964 {0xe9eaebecffffffff, 0xffffffff1e3cffff, 0xf9fafbfcfdfeff00};
5965 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
5966
5967 // Not (D) destructive
5968 uint64_t expected_z7[] =
5969 // pg: 1 1 0
5970 {0xffffffffffffffff, 0xffffffff1e3cffff, 0x123456789abcdef0};
5971 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
5972
5973 // clang-format on
5974 }
5975 }
5976
5977 TEST_SVE(sve_fabs_fneg) {
5978 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5979 START();
5980
5981 // Include FP64, FP32 and FP16 signalling NaNs. Most FP operations quieten
5982 // NaNs, but fabs and fneg do not.
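  // For example, fabs and fneg only touch the sign bit, so the FP16 signalling
  // NaN 0xfc01 maps to 0x7c01 and remains signalling; a quietening operation
  // would have set the top fraction bit instead.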
5983 uint64_t in[] = {0xc04500004228d140, // Recognisable (+/-42) values.
5984 0xfff00000ff80fc01, // Signalling NaNs.
5985 0x123456789abcdef0};
5986
5987 // For simplicity, we re-use the same pg for various lane sizes.
5988 // For D lanes: 1, 1, 0
5989 // For S lanes: 1, 1, 1, 0, 0
5990 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
5991 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
5992 Initialise(&masm, p0.VnB(), pg_in);
5993 PRegisterM pg = p0.Merging();
5994
5995 // These are merging operations, so we have to initialise the result register.
5996 // We use a mixture of constructive and destructive operations.
5997
5998 InsrHelper(&masm, z31.VnD(), in);
5999 // Make a copy so we can check that constructive operations preserve zn.
6000 __ Mov(z30, z31);
6001
6002 // For constructive operations, use a different initial result value.
6003 __ Index(z29.VnB(), 0, -1);
6004
6005 __ Mov(z0, z29);
6006 __ Fabs(z0.VnH(), pg, z31.VnH());
6007 __ Mov(z1, z31);
6008 __ Fabs(z1.VnS(), pg, z1.VnS()); // destructive
6009 __ Mov(z2, z29);
6010 __ Fabs(z2.VnD(), pg, z31.VnD());
6011
6012 __ Mov(z3, z31);
6013 __ Fneg(z3.VnH(), pg, z3.VnH()); // destructive
6014 __ Mov(z4, z29);
6015 __ Fneg(z4.VnS(), pg, z31.VnS());
6016 __ Mov(z5, z31);
6017 __ Fneg(z5.VnD(), pg, z5.VnD()); // destructive
6018
6019 END();
6020
6021 if (CAN_RUN()) {
6022 RUN();
6023
6024 // Check that constructive operations preserve their inputs.
6025 ASSERT_EQUAL_SVE(z30, z31);
6026
6027 // clang-format off
6028
6029 // Fabs (H)
6030 uint64_t expected_z0[] =
6031 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6032 {0xe9eaebecedee5140, 0xf1f200007f807c01, 0xf9fafbfc1abcff00};
6033 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6034
6035 // Fabs (S) destructive
6036 uint64_t expected_z1[] =
6037 // pg: 0 1 1 1 0 0
6038 {0xc04500004228d140, 0x7ff000007f80fc01, 0x123456789abcdef0};
6039 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6040
6041 // Fabs (D)
6042 uint64_t expected_z2[] =
6043 // pg: 1 1 0
6044 {0x404500004228d140, 0x7ff00000ff80fc01, 0xf9fafbfcfdfeff00};
6045 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6046
6047 // Fneg (H) destructive
6048 uint64_t expected_z3[] =
6049 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6050 {0xc045000042285140, 0xfff080007f807c01, 0x123456781abcdef0};
6051 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6052
6053 // Fneg (S)
6054 uint64_t expected_z4[] =
6055 // pg: 0 1 1 1 0 0
6056 {0xe9eaebecc228d140, 0x7ff000007f80fc01, 0xf9fafbfcfdfeff00};
6057 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6058
6059 // Fneg (D) destructive
6060 uint64_t expected_z5[] =
6061 // pg: 1 1 0
6062 {0x404500004228d140, 0x7ff00000ff80fc01, 0x123456789abcdef0};
6063 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6064
6065 // clang-format on
6066 }
6067 }
6068
6069 TEST_SVE(sve_cls_clz_cnt) {
6070 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6071 START();
6072
6073 uint64_t in[] = {0x0000000000000000, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6074
6075 // For simplicity, we re-use the same pg for various lane sizes.
6076 // For D lanes: 1, 1, 0
6077 // For S lanes: 1, 1, 1, 0, 0
6078 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6079 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6080 Initialise(&masm, p0.VnB(), pg_in);
6081 PRegisterM pg = p0.Merging();
6082
6083 // These are merging operations, so we have to initialise the result register.
6084 // We use a mixture of constructive and destructive operations.
6085
6086 InsrHelper(&masm, z31.VnD(), in);
6087 // Make a copy so we can check that constructive operations preserve zn.
6088 __ Mov(z30, z31);
6089
6090 // For constructive operations, use a different initial result value.
6091 __ Index(z29.VnB(), 0, -1);
6092
6093 __ Mov(z0, z29);
6094 __ Cls(z0.VnB(), pg, z31.VnB());
6095 __ Mov(z1, z31);
6096 __ Clz(z1.VnH(), pg, z1.VnH()); // destructive
6097 __ Mov(z2, z29);
6098 __ Cnt(z2.VnS(), pg, z31.VnS());
6099 __ Mov(z3, z31);
6100 __ Cnt(z3.VnD(), pg, z3.VnD()); // destructive
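  // As a reminder: Cls counts the bits after the sign bit that match it
  // (cls(0xf0) is 3 for a B lane), Clz counts leading zero bits (a zero H
  // lane gives 16), and Cnt is a per-lane popcount (the S lane 0xfefcf8f0
  // holds 22 set bits). The expected values below follow from these
  // definitions.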
6101
6102 END();
6103
6104 if (CAN_RUN()) {
6105 RUN();
6106 // Check that non-destructive operations preserve their inputs.
6107 ASSERT_EQUAL_SVE(z30, z31);
6108
6109 // clang-format off
6110
6111 // cls (B)
6112 uint8_t expected_z0[] =
6113 // pg: 0 0 0 0 1 0 1 1
6114 // pg: 1 0 0 1 0 1 1 1
6115 // pg: 0 0 1 0 1 1 1 0
6116 {0xe9, 0xea, 0xeb, 0xec, 7, 0xee, 7, 7,
6117 6, 0xf2, 0xf3, 3, 0xf5, 1, 0, 3,
6118 0xf9, 0xfa, 0, 0xfc, 0, 0, 1, 0x00};
6119 ASSERT_EQUAL_SVE(expected_z0, z0.VnB());
6120
6121 // clz (H) destructive
6122 uint16_t expected_z1[] =
6123 // pg: 0 0 0 1
6124 // pg: 0 1 1 1
6125 // pg: 0 0 1 0
6126 {0x0000, 0x0000, 0x0000, 16,
6127 0xfefc, 0, 0, 0,
6128 0x1234, 0x5678, 0, 0xdef0};
6129 ASSERT_EQUAL_SVE(expected_z1, z1.VnH());
6130
6131 // cnt (S)
6132 uint32_t expected_z2[] =
6133 // pg: 0 1
6134 // pg: 1 1
6135 // pg: 0 0
6136 {0xe9eaebec, 0,
6137 22, 16,
6138 0xf9fafbfc, 0xfdfeff00};
6139 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
6140
6141 // cnt (D) destructive
6142 uint64_t expected_z3[] =
6143 // pg: 1 1 0
6144 { 0, 38, 0x123456789abcdef0};
6145 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6146
6147 // clang-format on
6148 }
6149 }
6150
6151 TEST_SVE(sve_sxt) {
6152 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6153 START();
6154
6155 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6156
6157 // For simplicity, we re-use the same pg for various lane sizes.
6158 // For D lanes: 1, 1, 0
6159 // For S lanes: 1, 1, 1, 0, 0
6160 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6161 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6162 Initialise(&masm, p0.VnB(), pg_in);
6163 PRegisterM pg = p0.Merging();
6164
6165 // These are merging operations, so we have to initialise the result register.
6166 // We use a mixture of constructive and destructive operations.
6167
6168 InsrHelper(&masm, z31.VnD(), in);
6169 // Make a copy so we can check that constructive operations preserve zn.
6170 __ Mov(z30, z31);
6171
6172 // For constructive operations, use a different initial result value.
6173 __ Index(z29.VnB(), 0, -1);
6174
6175 __ Mov(z0, z31);
6176 __ Sxtb(z0.VnH(), pg, z0.VnH()); // destructive
6177 __ Mov(z1, z29);
6178 __ Sxtb(z1.VnS(), pg, z31.VnS());
6179 __ Mov(z2, z31);
6180 __ Sxtb(z2.VnD(), pg, z2.VnD()); // destructive
6181 __ Mov(z3, z29);
6182 __ Sxth(z3.VnS(), pg, z31.VnS());
6183 __ Mov(z4, z31);
6184 __ Sxth(z4.VnD(), pg, z4.VnD()); // destructive
6185 __ Mov(z5, z29);
6186 __ Sxtw(z5.VnD(), pg, z31.VnD());
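  // Each Sxt<size> replaces an active lane with its low byte/half/word,
  // sign-extended: for example Sxtb turns the S lane 0x05f607f8 into
  // 0xfffffff8 and 0xe1c3870f into 0x0000000f, as checked below.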
6187
6188 END();
6189
6190 if (CAN_RUN()) {
6191 RUN();
6192 // Check that constructive operations preserve their inputs.
6193 ASSERT_EQUAL_SVE(z30, z31);
6194
6195 // clang-format off
6196
6197 // Sxtb (H) destructive
6198 uint64_t expected_z0[] =
6199 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6200 {0x01f203f405f6fff8, 0xfefcfff0ffc3000f, 0x12345678ffbcdef0};
6201 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6202
6203 // Sxtb (S)
6204 uint64_t expected_z1[] =
6205 // pg: 0 1 1 1 0 0
6206 {0xe9eaebecfffffff8, 0xfffffff00000000f, 0xf9fafbfcfdfeff00};
6207 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6208
6209 // Sxtb (D) destructive
6210 uint64_t expected_z2[] =
6211 // pg: 1 1 0
6212 {0xfffffffffffffff8, 0x000000000000000f, 0x123456789abcdef0};
6213 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6214
6215 // Sxth (S)
6216 uint64_t expected_z3[] =
6217 // pg: 0 1 1 1 0 0
6218 {0xe9eaebec000007f8, 0xfffff8f0ffff870f, 0xf9fafbfcfdfeff00};
6219 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6220
6221 // Sxth (D) destructive
6222 uint64_t expected_z4[] =
6223 // pg: 1 1 0
6224 {0x00000000000007f8, 0xffffffffffff870f, 0x123456789abcdef0};
6225 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6226
6227 // Sxtw (D)
6228 uint64_t expected_z5[] =
6229 // pg: 1 1 0
6230 {0x0000000005f607f8, 0xffffffffe1c3870f, 0xf9fafbfcfdfeff00};
6231 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6232
6233 // clang-format on
6234 }
6235 }
6236
6237 TEST_SVE(sve_uxt) {
6238 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6239 START();
6240
6241 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6242
6243 // For simplicity, we re-use the same pg for various lane sizes.
6244 // For D lanes: 1, 1, 0
6245 // For S lanes: 1, 1, 1, 0, 0
6246 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6247 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6248 Initialise(&masm, p0.VnB(), pg_in);
6249 PRegisterM pg = p0.Merging();
6250
6251 // These are merging operations, so we have to initialise the result register.
6252 // We use a mixture of constructive and destructive operations.
6253
6254 InsrHelper(&masm, z31.VnD(), in);
6255 // Make a copy so we can check that constructive operations preserve zn.
6256 __ Mov(z30, z31);
6257
6258 // For constructive operations, use a different initial result value.
6259 __ Index(z29.VnB(), 0, -1);
6260
6261 __ Mov(z0, z29);
6262 __ Uxtb(z0.VnH(), pg, z31.VnH());
6263 __ Mov(z1, z31);
6264 __ Uxtb(z1.VnS(), pg, z1.VnS()); // destructive
6265 __ Mov(z2, z29);
6266 __ Uxtb(z2.VnD(), pg, z31.VnD());
6267 __ Mov(z3, z31);
6268 __ Uxth(z3.VnS(), pg, z3.VnS()); // destructive
6269 __ Mov(z4, z29);
6270 __ Uxth(z4.VnD(), pg, z31.VnD());
6271 __ Mov(z5, z31);
6272 __ Uxtw(z5.VnD(), pg, z5.VnD()); // destructive
6273
6274 END();
6275
6276 if (CAN_RUN()) {
6277 RUN();
6278 // clang-format off
6279
6280 // Uxtb (H)
6281 uint64_t expected_z0[] =
6282 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6283 {0xe9eaebecedee00f8, 0xf1f200f000c3000f, 0xf9fafbfc00bcff00};
6284 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6285
6286 // Uxtb (S) destructive
6287 uint64_t expected_z1[] =
6288 // pg: 0 1 1 1 0 0
6289 {0x01f203f4000000f8, 0x000000f00000000f, 0x123456789abcdef0};
6290 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6291
6292 // Uxtb (D)
6293 uint64_t expected_z2[] =
6294 // pg: 1 1 0
6295 {0x00000000000000f8, 0x000000000000000f, 0xf9fafbfcfdfeff00};
6296 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6297
6298 // Uxth (S) destructive
6299 uint64_t expected_z3[] =
6300 // pg: 0 1 1 1 0 0
6301 {0x01f203f4000007f8, 0x0000f8f00000870f, 0x123456789abcdef0};
6302 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6303
6304 // Uxth (D)
6305 uint64_t expected_z4[] =
6306 // pg: 1 1 0
6307 {0x00000000000007f8, 0x000000000000870f, 0xf9fafbfcfdfeff00};
6308 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6309
6310 // Uxtw (D) destructive
6311 uint64_t expected_z5[] =
6312 // pg: 1 1 0
6313 {0x0000000005f607f8, 0x00000000e1c3870f, 0x123456789abcdef0};
6314 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6315
6316 // clang-format on
6317 }
6318 }
6319
6320 TEST_SVE(sve_abs_neg) {
6321 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6322 START();
6323
6324 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6325
6326 // For simplicity, we re-use the same pg for various lane sizes.
6327 // For D lanes: 1, 1, 0
6328 // For S lanes: 1, 1, 1, 0, 0
6329 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6330 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6331 Initialise(&masm, p0.VnB(), pg_in);
6332 PRegisterM pg = p0.Merging();
6333
6334 InsrHelper(&masm, z31.VnD(), in);
6335
6336 // These are merging operations, so we have to initialise the result register.
6337 // We use a mixture of constructive and destructive operations.
6338
6339 InsrHelper(&masm, z31.VnD(), in);
6340 // Make a copy so we can check that constructive operations preserve zn.
6341 __ Mov(z30, z31);
6342
6343 // For constructive operations, use a different initial result value.
6344 __ Index(z29.VnB(), 0, -1);
6345
6346 __ Mov(z0, z31);
6347 __ Abs(z0.VnD(), pg, z0.VnD()); // destructive
6348 __ Mov(z1, z29);
6349 __ Abs(z1.VnB(), pg, z31.VnB());
6350
6351 __ Mov(z2, z31);
6352 __ Neg(z2.VnH(), pg, z2.VnH()); // destructive
6353 __ Mov(z3, z29);
6354 __ Neg(z3.VnS(), pg, z31.VnS());
6355
6356 // The unpredicated form of `Neg` is implemented using `subr`.
6357 __ Mov(z4, z31);
6358 __ Neg(z4.VnB(), z4.VnB()); // destructive
6359 __ Mov(z5, z29);
6360 __ Neg(z5.VnD(), z31.VnD());
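  // (SVE's integer NEG is predicated-only, so the unpredicated macro form is
  // presumably expanded as a reverse subtract from zero, i.e. zd = 0 - zn,
  // using `subr` with a zero immediate.)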
6361
6362 END();
6363
6364 if (CAN_RUN()) {
6365 RUN();
6366
6367 ASSERT_EQUAL_SVE(z30, z31);
6368
6369 // clang-format off
6370
6371 // Abs (D) destructive
6372 uint64_t expected_z0[] =
6373 // pg: 1 1 0
6374 {0x01f203f405f607f8, 0x0103070f1e3c78f1, 0x123456789abcdef0};
6375 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6376
6377 // Abs (B)
6378 uint64_t expected_z1[] =
6379 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
6380 {0xe9eaebec05ee0708, 0x02f2f310f53d790f, 0xf9fa56fc66442200};
6381 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6382
6383 // Neg (H) destructive
6384 uint64_t expected_z2[] =
6385 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6386 {0x01f203f405f6f808, 0xfefc07101e3d78f1, 0x123456786544def0};
6387 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6388
6389 // Neg (S)
6390 uint64_t expected_z3[] =
6391 // pg: 0 1 1 1 0 0
6392 {0xe9eaebecfa09f808, 0x010307101e3c78f1, 0xf9fafbfcfdfeff00};
6393 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6394
6395 // Neg (B) destructive, unpredicated
6396 uint64_t expected_z4[] =
6397 {0xff0efd0cfb0af908, 0x020408101f3d79f1, 0xeeccaa8866442210};
6398 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6399
6400 // Neg (D) unpredicated
6401 uint64_t expected_z5[] =
6402 {0xfe0dfc0bfa09f808, 0x0103070f1e3c78f1, 0xedcba98765432110};
6403 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6404
6405 // clang-format on
6406 }
6407 }
6408
6409 TEST_SVE(sve_cpy) {
6410 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
6411 START();
6412
6413 // For simplicity, we re-use the same pg for various lane sizes.
6414 // For D lanes: 0, 1, 1
6415 // For S lanes: 0, 1, 1, 0, 1
6416 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6417 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6418
6419 PRegisterM pg = p7.Merging();
6420 Initialise(&masm, pg.VnB(), pg_in);
6421
6422 // These are merging operations, so we have to initialise the result registers
6423 // for each operation.
6424 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6425 __ Index(ZRegister(i, kBRegSize), 0, -1);
6426 }
6427
6428 // Recognisable values to copy.
6429 __ Mov(x0, 0xdeadbeefdeadbe42);
6430 __ Mov(x1, 0xdeadbeefdead8421);
6431 __ Mov(x2, 0xdeadbeef80042001);
6432 __ Mov(x3, 0x8000000420000001);
6433
6434 // Use NEON moves, to avoid testing SVE `cpy` against itself.
6435 __ Dup(v28.V2D(), x0);
6436 __ Dup(v29.V2D(), x1);
6437 __ Dup(v30.V2D(), x2);
6438 __ Dup(v31.V2D(), x3);
6439
6440 // Register forms (CPY_z_p_r)
6441 __ Cpy(z0.VnB(), pg, w0);
6442 __ Cpy(z1.VnH(), pg, x1); // X registers are accepted for small lanes.
6443 __ Cpy(z2.VnS(), pg, w2);
6444 __ Cpy(z3.VnD(), pg, x3);
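  // The general-purpose source is truncated to the lane size, so copying x1
  // (0x...dead8421) into H lanes writes 0x8421 to each active lane, as the
  // expected values below show.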
6445
6446 // VRegister forms (CPY_z_p_v)
6447 __ Cpy(z4.VnB(), pg, b28);
6448 __ Cpy(z5.VnH(), pg, h29);
6449 __ Cpy(z6.VnS(), pg, s30);
6450 __ Cpy(z7.VnD(), pg, d31);
6451
6452 // Check that we can copy the stack pointer.
6453 __ Mov(x10, sp);
6454 __ Mov(sp, 0xabcabcabcabcabca); // Set sp to a known value.
6455 __ Cpy(z16.VnB(), pg, sp);
6456 __ Cpy(z17.VnH(), pg, wsp);
6457 __ Cpy(z18.VnS(), pg, wsp);
6458 __ Cpy(z19.VnD(), pg, sp);
6459 __ Mov(sp, x10); // Restore sp.
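  // The stack pointer is truncated in the same way: with sp holding
  // 0xabcabcabcabcabca, H lanes receive 0xabca and S lanes 0xcabcabca.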
6460
6461 END();
6462
6463 if (CAN_RUN()) {
6464 RUN();
6465 // clang-format off
6466
6467 uint64_t expected_b[] =
6468 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6469 {0xe9eaebec424242f0, 0x42f2f34242f64242, 0xf942fbfcfdfeff42};
6470 ASSERT_EQUAL_SVE(expected_b, z0.VnD());
6471 ASSERT_EQUAL_SVE(expected_b, z4.VnD());
6472
6473 uint64_t expected_h[] =
6474 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6475 {0xe9eaebec8421eff0, 0xf1f28421f5f68421, 0x8421fbfcfdfe8421};
6476 ASSERT_EQUAL_SVE(expected_h, z1.VnD());
6477 ASSERT_EQUAL_SVE(expected_h, z5.VnD());
6478
6479 uint64_t expected_s[] =
6480 // pg: 0 0 1 1 0 1
6481 {0xe9eaebecedeeeff0, 0x8004200180042001, 0xf9fafbfc80042001};
6482 ASSERT_EQUAL_SVE(expected_s, z2.VnD());
6483 ASSERT_EQUAL_SVE(expected_s, z6.VnD());
6484
6485 uint64_t expected_d[] =
6486 // pg: 0 1 1
6487 {0xe9eaebecedeeeff0, 0x8000000420000001, 0x8000000420000001};
6488 ASSERT_EQUAL_SVE(expected_d, z3.VnD());
6489 ASSERT_EQUAL_SVE(expected_d, z7.VnD());
6490
6491
6492 uint64_t expected_b_sp[] =
6493 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6494 {0xe9eaebeccacacaf0, 0xcaf2f3cacaf6caca, 0xf9cafbfcfdfeffca};
6495 ASSERT_EQUAL_SVE(expected_b_sp, z16.VnD());
6496
6497 uint64_t expected_h_sp[] =
6498 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6499 {0xe9eaebecabcaeff0, 0xf1f2abcaf5f6abca, 0xabcafbfcfdfeabca};
6500 ASSERT_EQUAL_SVE(expected_h_sp, z17.VnD());
6501
6502 uint64_t expected_s_sp[] =
6503 // pg: 0 0 1 1 0 1
6504 {0xe9eaebecedeeeff0, 0xcabcabcacabcabca, 0xf9fafbfccabcabca};
6505 ASSERT_EQUAL_SVE(expected_s_sp, z18.VnD());
6506
6507 uint64_t expected_d_sp[] =
6508 // pg: 0 1 1
6509 {0xe9eaebecedeeeff0, 0xabcabcabcabcabca, 0xabcabcabcabcabca};
6510 ASSERT_EQUAL_SVE(expected_d_sp, z19.VnD());
6511
6512 // clang-format on
6513 }
6514 }
6515
6516 TEST_SVE(sve_cpy_imm) {
6517 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6518 START();
6519
6520 // For simplicity, we re-use the same pg for various lane sizes.
6521 // For D lanes: 0, 1, 1
6522 // For S lanes: 0, 1, 1, 0, 1
6523 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6524 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6525
6526 PRegister pg = p7;
6527 Initialise(&masm, pg.VnB(), pg_in);
6528
6529 // These are (mostly) merging operations, so we have to initialise the result
6530 // registers for each operation.
6531 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6532 __ Index(ZRegister(i, kBRegSize), 0, -1);
6533 }
6534
6535 // Encodable integer forms (CPY_z_p_i)
6536 __ Cpy(z0.VnB(), pg.Merging(), 0);
6537 __ Cpy(z1.VnB(), pg.Zeroing(), 42);
6538 __ Cpy(z2.VnB(), pg.Merging(), -42);
6539 __ Cpy(z3.VnB(), pg.Zeroing(), 0xff);
6540 __ Cpy(z4.VnH(), pg.Merging(), 127);
6541 __ Cpy(z5.VnS(), pg.Zeroing(), -128);
6542 __ Cpy(z6.VnD(), pg.Merging(), -1);
6543
6544 // Forms encodable using fcpy.
6545 __ Cpy(z7.VnH(), pg.Merging(), Float16ToRawbits(Float16(-31.0)));
6546 __ Cpy(z8.VnS(), pg.Zeroing(), FloatToRawbits(2.0f));
6547 __ Cpy(z9.VnD(), pg.Merging(), DoubleToRawbits(-4.0));
6548
6549 // Other forms use a scratch register.
6550 __ Cpy(z10.VnH(), pg.Merging(), 0xff);
6551 __ Cpy(z11.VnD(), pg.Zeroing(), 0x0123456789abcdef);
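  // The integer form encodes a signed 8-bit immediate, optionally shifted
  // left by eight, so small values (0, +/-42, -1, 127, -128, and 0xff as a
  // byte) are directly encodable. 0xff in H lanes (0x00ff) and the 64-bit
  // constant are not, hence the scratch register mentioned above.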
6552
6553 END();
6554
6555 if (CAN_RUN()) {
6556 RUN();
6557 // clang-format off
6558
6559 uint64_t expected_z0[] =
6560 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6561 {0xe9eaebec000000f0, 0x00f2f30000f60000, 0xf900fbfcfdfeff00};
6562 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6563
6564 uint64_t expected_z1[] =
6565 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6566 {0x000000002a2a2a00, 0x2a00002a2a002a2a, 0x002a00000000002a};
6567 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6568
6569 uint64_t expected_z2[] =
6570 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6571 {0xe9eaebecd6d6d6f0, 0xd6f2f3d6d6f6d6d6, 0xf9d6fbfcfdfeffd6};
6572 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6573
6574 uint64_t expected_z3[] =
6575 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6576 {0x00000000ffffff00, 0xff0000ffff00ffff, 0x00ff0000000000ff};
6577 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6578
6579 uint64_t expected_z4[] =
6580 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6581 {0xe9eaebec007feff0, 0xf1f2007ff5f6007f, 0x007ffbfcfdfe007f};
6582 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6583
6584 uint64_t expected_z5[] =
6585 // pg: 0 0 1 1 0 1
6586 {0x0000000000000000, 0xffffff80ffffff80, 0x00000000ffffff80};
6587 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6588
6589 uint64_t expected_z6[] =
6590 // pg: 0 1 1
6591 {0xe9eaebecedeeeff0, 0xffffffffffffffff, 0xffffffffffffffff};
6592 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6593
6594 uint64_t expected_z7[] =
6595 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6596 {0xe9eaebeccfc0eff0, 0xf1f2cfc0f5f6cfc0, 0xcfc0fbfcfdfecfc0};
6597 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6598
6599 uint64_t expected_z8[] =
6600 // pg: 0 0 1 1 0 1
6601 {0x0000000000000000, 0x4000000040000000, 0x0000000040000000};
6602 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6603
6604 uint64_t expected_z9[] =
6605 // pg: 0 1 1
6606 {0xe9eaebecedeeeff0, 0xc010000000000000, 0xc010000000000000};
6607 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6608
6609 uint64_t expected_z10[] =
6610 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6611 {0xe9eaebec00ffeff0, 0xf1f200fff5f600ff, 0x00fffbfcfdfe00ff};
6612 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6613
6614 uint64_t expected_z11[] =
6615 // pg: 0 1 1
6616 {0x0000000000000000, 0x0123456789abcdef, 0x0123456789abcdef};
6617 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6618
6619 // clang-format on
6620 }
6621 }
6622
6623 TEST_SVE(sve_fcpy_imm) {
6624 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6625 START();
6626
6627 // For simplicity, we re-use the same pg for various lane sizes.
6628 // For D lanes: 0, 1, 1
6629 // For S lanes: 0, 1, 1, 0, 1
6630 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6631 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6632
6633 PRegister pg = p7;
6634 Initialise(&masm, pg.VnB(), pg_in);
6635
6636 // These are (mostly) merging operations, so we have to initialise the result
6637 // registers for each operation.
6638 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6639 __ Index(ZRegister(i, kBRegSize), 0, -1);
6640 }
6641
6642 // Encodable floating-point forms (FCPY_z_p_i)
6643 __ Fcpy(z1.VnH(), pg.Merging(), Float16(1.0));
6644 __ Fcpy(z2.VnH(), pg.Merging(), -2.0f);
6645 __ Fcpy(z3.VnH(), pg.Merging(), 3.0);
6646 __ Fcpy(z4.VnS(), pg.Merging(), Float16(-4.0));
6647 __ Fcpy(z5.VnS(), pg.Merging(), 5.0f);
6648 __ Fcpy(z6.VnS(), pg.Merging(), 6.0);
6649 __ Fcpy(z7.VnD(), pg.Merging(), Float16(7.0));
6650 __ Fcpy(z8.VnD(), pg.Merging(), 8.0f);
6651 __ Fmov(z9.VnD(), pg.Merging(), -9.0);
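  // FCPY's immediate is the usual AArch64 FP8 encoding (sign, 3-bit exponent,
  // 4-bit fraction), which covers magnitudes of roughly 0.125 to 31.0 in
  // coarse steps; 0.0, 42.0, NaNs and infinities fall outside that set, which
  // is why the cases below are unencodable.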
6652
6653 // Unencodable immediates.
6654 __ Fcpy(z10.VnS(), pg.Merging(), 0.0);
6655 __ Fcpy(z11.VnH(), pg.Merging(), Float16(42.0));
6656 __ Fcpy(z12.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
6657 __ Fcpy(z13.VnH(), pg.Merging(), kFP64NegativeInfinity);
6658
6659 // Fmov alias.
6660 __ Fmov(z14.VnS(), pg.Merging(), 0.0);
6661 __ Fmov(z15.VnH(), pg.Merging(), Float16(42.0));
6662 __ Fmov(z16.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
6663 __ Fmov(z17.VnH(), pg.Merging(), kFP64NegativeInfinity);
6664 END();
6665
6666 if (CAN_RUN()) {
6667 RUN();
6668 // clang-format off
6669
6670 // 1.0 as FP16: 0x3c00
6671 uint64_t expected_z1[] =
6672 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6673 {0xe9eaebec3c00eff0, 0xf1f23c00f5f63c00, 0x3c00fbfcfdfe3c00};
6674 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6675
6676 // -2.0 as FP16: 0xc000
6677 uint64_t expected_z2[] =
6678 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6679 {0xe9eaebecc000eff0, 0xf1f2c000f5f6c000, 0xc000fbfcfdfec000};
6680 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6681
6682 // 3.0 as FP16: 0x4200
6683 uint64_t expected_z3[] =
6684 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6685 {0xe9eaebec4200eff0, 0xf1f24200f5f64200, 0x4200fbfcfdfe4200};
6686 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6687
6688 // -4.0 as FP32: 0xc0800000
6689 uint64_t expected_z4[] =
6690 // pg: 0 0 1 1 0 1
6691 {0xe9eaebecedeeeff0, 0xc0800000c0800000, 0xf9fafbfcc0800000};
6692 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6693
6694 // 5.0 as FP32: 0x40a00000
6695 uint64_t expected_z5[] =
6696 // pg: 0 0 1 1 0 1
6697 {0xe9eaebecedeeeff0, 0x40a0000040a00000, 0xf9fafbfc40a00000};
6698 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6699
6700 // 6.0 as FP32: 0x40c00000
6701 uint64_t expected_z6[] =
6702 // pg: 0 0 1 1 0 1
6703 {0xe9eaebecedeeeff0, 0x40c0000040c00000, 0xf9fafbfc40c00000};
6704 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6705
6706 // 7.0 as FP64: 0x401c000000000000
6707 uint64_t expected_z7[] =
6708 // pg: 0 1 1
6709 {0xe9eaebecedeeeff0, 0x401c000000000000, 0x401c000000000000};
6710 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6711
6712 // 8.0 as FP64: 0x4020000000000000
6713 uint64_t expected_z8[] =
6714 // pg: 0 1 1
6715 {0xe9eaebecedeeeff0, 0x4020000000000000, 0x4020000000000000};
6716 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6717
6718 // -9.0 as FP64: 0xc022000000000000
6719 uint64_t expected_z9[] =
6720 // pg: 0 1 1
6721 {0xe9eaebecedeeeff0, 0xc022000000000000, 0xc022000000000000};
6722 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6723
6724 // 0.0 as FP32: 0x00000000
6725 uint64_t expected_z10[] =
6726 // pg: 0 0 1 1 0 1
6727 {0xe9eaebecedeeeff0, 0x0000000000000000, 0xf9fafbfc00000000};
6728 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6729
6730 // 42.0 as FP16: 0x5140
6731 uint64_t expected_z11[] =
6732 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6733 {0xe9eaebec5140eff0, 0xf1f25140f5f65140, 0x5140fbfcfdfe5140};
6734 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6735
6736 // Signalling NaN (with payload): 0x7ff0000012340000
6737 uint64_t expected_z12[] =
6738 // pg: 0 1 1
6739 {0xe9eaebecedeeeff0, 0x7ff0000012340000, 0x7ff0000012340000};
6740 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
6741
6742 // -infinity as FP16: 0xfc00
6743 uint64_t expected_z13[] =
6744 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6745 {0xe9eaebecfc00eff0, 0xf1f2fc00f5f6fc00, 0xfc00fbfcfdfefc00};
6746 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
6747
6748 ASSERT_EQUAL_SVE(z10.VnD(), z14.VnD());
6749 ASSERT_EQUAL_SVE(z11.VnD(), z15.VnD());
6750 ASSERT_EQUAL_SVE(z12.VnD(), z16.VnD());
6751 ASSERT_EQUAL_SVE(z13.VnD(), z17.VnD());
6752 // clang-format on
6753 }
6754 }
6755
6756 TEST_SVE(sve_permute_vector_unpredicated_table_lookup) {
6757 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6758 START();
6759
6760 uint64_t table_inputs[] = {0xffeeddccbbaa9988, 0x7766554433221100};
6761
6762 int index_b[] = {255, 255, 11, 10, 15, 14, 13, 12, 1, 0, 4, 3, 7, 6, 5, 4};
6763
6764 int index_h[] = {5, 6, 7, 8, 2, 3, 6, 4};
6765
6766 int index_s[] = {1, 3, 2, 31, -1};
6767
6768 int index_d[] = {31, 1};
6769
6770   // Initialise the register with a value that doesn't exist in the table.
6771 __ Dup(z9.VnB(), 0x1f);
6772 InsrHelper(&masm, z9.VnD(), table_inputs);
6773
6774 ZRegister ind_b = z0.WithLaneSize(kBRegSize);
6775 ZRegister ind_h = z1.WithLaneSize(kHRegSize);
6776 ZRegister ind_s = z2.WithLaneSize(kSRegSize);
6777 ZRegister ind_d = z3.WithLaneSize(kDRegSize);
6778
6779 InsrHelper(&masm, ind_b, index_b);
6780 InsrHelper(&masm, ind_h, index_h);
6781 InsrHelper(&masm, ind_s, index_s);
6782 InsrHelper(&masm, ind_d, index_d);
6783
6784 __ Tbl(z26.VnB(), z9.VnB(), ind_b);
6785
6786 __ Tbl(z27.VnH(), z9.VnH(), ind_h);
6787
6788 __ Tbl(z28.VnS(), z9.VnS(), ind_s);
6789
6790 __ Tbl(z29.VnD(), z9.VnD(), ind_d);
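  // Tbl returns zero for any index greater than or equal to the number of
  // lanes; in-range indices that fall beyond the inserted table data read
  // back the 0x1f filler, which is why the checks below take the VL into
  // account.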
6791
6792 END();
6793
6794 if (CAN_RUN()) {
6795 RUN();
6796
6797 // clang-format off
6798 unsigned z26_expected[] = {0x1f, 0x1f, 0xbb, 0xaa, 0xff, 0xee, 0xdd, 0xcc,
6799 0x11, 0x00, 0x44, 0x33, 0x77, 0x66, 0x55, 0x44};
6800
6801 unsigned z27_expected[] = {0xbbaa, 0xddcc, 0xffee, 0x1f1f,
6802 0x5544, 0x7766, 0xddcc, 0x9988};
6803
6804 unsigned z28_expected[] =
6805 {0x77665544, 0xffeeddcc, 0xbbaa9988, 0x1f1f1f1f, 0x1f1f1f1f};
6806
6807 uint64_t z29_expected[] = {0x1f1f1f1f1f1f1f1f, 0xffeeddccbbaa9988};
6808 // clang-format on
6809
6810 unsigned vl = config->sve_vl_in_bits();
6811 for (size_t i = 0; i < ArrayLength(index_b); i++) {
6812 int lane = static_cast<int>(ArrayLength(index_b) - i - 1);
6813 if (!core.HasSVELane(z26.VnB(), lane)) break;
6814 uint64_t expected = (vl > (index_b[i] * kBRegSize)) ? z26_expected[i] : 0;
6815 ASSERT_EQUAL_SVE_LANE(expected, z26.VnB(), lane);
6816 }
6817
6818 for (size_t i = 0; i < ArrayLength(index_h); i++) {
6819 int lane = static_cast<int>(ArrayLength(index_h) - i - 1);
6820 if (!core.HasSVELane(z27.VnH(), lane)) break;
6821 uint64_t expected = (vl > (index_h[i] * kHRegSize)) ? z27_expected[i] : 0;
6822 ASSERT_EQUAL_SVE_LANE(expected, z27.VnH(), lane);
6823 }
6824
6825 for (size_t i = 0; i < ArrayLength(index_s); i++) {
6826 int lane = static_cast<int>(ArrayLength(index_s) - i - 1);
6827 if (!core.HasSVELane(z28.VnS(), lane)) break;
6828 uint64_t expected = (vl > (index_s[i] * kSRegSize)) ? z28_expected[i] : 0;
6829 ASSERT_EQUAL_SVE_LANE(expected, z28.VnS(), lane);
6830 }
6831
6832 for (size_t i = 0; i < ArrayLength(index_d); i++) {
6833 int lane = static_cast<int>(ArrayLength(index_d) - i - 1);
6834 if (!core.HasSVELane(z29.VnD(), lane)) break;
6835 uint64_t expected = (vl > (index_d[i] * kDRegSize)) ? z29_expected[i] : 0;
6836 ASSERT_EQUAL_SVE_LANE(expected, z29.VnD(), lane);
6837 }
6838 }
6839 }
6840
6841 TEST_SVE(ldr_str_z_bi) {
6842 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6843 START();
6844
6845 int vl = config->sve_vl_in_bytes();
6846
6847 // The immediate can address [-256, 255] times the VL, so allocate enough
6848 // space to exceed that in both directions.
6849 int data_size = vl * 1024;
6850
6851 uint8_t* data = new uint8_t[data_size];
6852 memset(data, 0, data_size);
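  // With the base in the middle of the buffer there is 512 * VL of headroom
  // on each side, which comfortably covers the largest offsets used below
  // (+/-256 VL encodable, +/-314 VL via the fallback) plus the access itself.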
6853
6854 // Set the base half-way through the buffer so we can use negative indices.
6855 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6856
6857 __ Index(z1.VnB(), 1, 3);
6858 __ Index(z2.VnB(), 2, 5);
6859 __ Index(z3.VnB(), 3, 7);
6860 __ Index(z4.VnB(), 4, 11);
6861 __ Index(z5.VnB(), 5, 13);
6862 __ Index(z6.VnB(), 6, 2);
6863 __ Index(z7.VnB(), 7, 3);
6864 __ Index(z8.VnB(), 8, 5);
6865 __ Index(z9.VnB(), 9, 7);
6866
6867 // Encodable cases.
6868 __ Str(z1, SVEMemOperand(x0));
6869 __ Str(z2, SVEMemOperand(x0, 2, SVE_MUL_VL));
6870 __ Str(z3, SVEMemOperand(x0, -3, SVE_MUL_VL));
6871 __ Str(z4, SVEMemOperand(x0, 255, SVE_MUL_VL));
6872 __ Str(z5, SVEMemOperand(x0, -256, SVE_MUL_VL));
6873
6874 // Cases that fall back on `CalculateSVEAddress`.
6875 __ Str(z6, SVEMemOperand(x0, 6 * vl));
6876 __ Str(z7, SVEMemOperand(x0, -7 * vl));
6877 __ Str(z8, SVEMemOperand(x0, 314, SVE_MUL_VL));
6878 __ Str(z9, SVEMemOperand(x0, -314, SVE_MUL_VL));
6879
6880 // Corresponding loads.
6881 __ Ldr(z11, SVEMemOperand(x0, xzr)); // Test xzr operand.
6882 __ Ldr(z12, SVEMemOperand(x0, 2, SVE_MUL_VL));
6883 __ Ldr(z13, SVEMemOperand(x0, -3, SVE_MUL_VL));
6884 __ Ldr(z14, SVEMemOperand(x0, 255, SVE_MUL_VL));
6885 __ Ldr(z15, SVEMemOperand(x0, -256, SVE_MUL_VL));
6886
6887 __ Ldr(z16, SVEMemOperand(x0, 6 * vl));
6888 __ Ldr(z17, SVEMemOperand(x0, -7 * vl));
6889 __ Ldr(z18, SVEMemOperand(x0, 314, SVE_MUL_VL));
6890 __ Ldr(z19, SVEMemOperand(x0, -314, SVE_MUL_VL));
6891
6892 END();
6893
6894 if (CAN_RUN()) {
6895 RUN();
6896
6897 uint8_t* expected = new uint8_t[data_size];
6898 memset(expected, 0, data_size);
6899 uint8_t* middle = &expected[data_size / 2];
6900
6901 for (int i = 0; i < vl; i++) {
6902 middle[i] = (1 + (3 * i)) & 0xff; // z1
6903 middle[(2 * vl) + i] = (2 + (5 * i)) & 0xff; // z2
6904 middle[(-3 * vl) + i] = (3 + (7 * i)) & 0xff; // z3
6905 middle[(255 * vl) + i] = (4 + (11 * i)) & 0xff; // z4
6906 middle[(-256 * vl) + i] = (5 + (13 * i)) & 0xff; // z5
6907 middle[(6 * vl) + i] = (6 + (2 * i)) & 0xff; // z6
6908 middle[(-7 * vl) + i] = (7 + (3 * i)) & 0xff; // z7
6909 middle[(314 * vl) + i] = (8 + (5 * i)) & 0xff; // z8
6910 middle[(-314 * vl) + i] = (9 + (7 * i)) & 0xff; // z9
6911 }
6912
6913 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
6914
6915 ASSERT_EQUAL_SVE(z1, z11);
6916 ASSERT_EQUAL_SVE(z2, z12);
6917 ASSERT_EQUAL_SVE(z3, z13);
6918 ASSERT_EQUAL_SVE(z4, z14);
6919 ASSERT_EQUAL_SVE(z5, z15);
6920 ASSERT_EQUAL_SVE(z6, z16);
6921 ASSERT_EQUAL_SVE(z7, z17);
6922 ASSERT_EQUAL_SVE(z8, z18);
6923 ASSERT_EQUAL_SVE(z9, z19);
6924
6925 delete[] expected;
6926 }
6927 delete[] data;
6928 }
6929
6930 TEST_SVE(ldr_str_p_bi) {
6931 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6932 START();
6933
6934 int vl = config->sve_vl_in_bytes();
6935 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
6936 int pl = vl / kZRegBitsPerPRegBit;
6937
6938 // The immediate can address [-256, 255] times the PL, so allocate enough
6939 // space to exceed that in both directions.
6940 int data_size = pl * 1024;
6941
6942 uint8_t* data = new uint8_t[data_size];
6943 memset(data, 0, data_size);
6944
6945 // Set the base half-way through the buffer so we can use negative indices.
6946 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6947
6948 uint64_t pattern[4] = {0x1010101011101111,
6949 0x0010111011000101,
6950 0x1001101110010110,
6951 0x1010110101100011};
6952 for (int i = 8; i <= 15; i++) {
6953 // Initialise p8-p15 with a conveniently-recognisable, non-zero pattern.
6954 Initialise(&masm,
6955 PRegister(i),
6956 pattern[3] * i,
6957 pattern[2] * i,
6958 pattern[1] * i,
6959 pattern[0] * i);
6960 }
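  // Scaling the pattern by the register number keeps each register distinct;
  // since the largest pattern byte is 0x11 and 0x11 * 15 == 0xff, the
  // per-byte products computed in the checks below never carry between bytes.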
6961
6962 // Encodable cases.
6963 __ Str(p8, SVEMemOperand(x0));
6964 __ Str(p9, SVEMemOperand(x0, 2, SVE_MUL_VL));
6965 __ Str(p10, SVEMemOperand(x0, -3, SVE_MUL_VL));
6966 __ Str(p11, SVEMemOperand(x0, 255, SVE_MUL_VL));
6967
6968 // Cases that fall back on `CalculateSVEAddress`.
6969 __ Str(p12, SVEMemOperand(x0, 6 * pl));
6970 __ Str(p13, SVEMemOperand(x0, -7 * pl));
6971 __ Str(p14, SVEMemOperand(x0, 314, SVE_MUL_VL));
6972 __ Str(p15, SVEMemOperand(x0, -314, SVE_MUL_VL));
6973
6974 // Corresponding loads.
6975 __ Ldr(p0, SVEMemOperand(x0));
6976 __ Ldr(p1, SVEMemOperand(x0, 2, SVE_MUL_VL));
6977 __ Ldr(p2, SVEMemOperand(x0, -3, SVE_MUL_VL));
6978 __ Ldr(p3, SVEMemOperand(x0, 255, SVE_MUL_VL));
6979
6980 __ Ldr(p4, SVEMemOperand(x0, 6 * pl));
6981 __ Ldr(p5, SVEMemOperand(x0, -7 * pl));
6982 __ Ldr(p6, SVEMemOperand(x0, 314, SVE_MUL_VL));
6983 __ Ldr(p7, SVEMemOperand(x0, -314, SVE_MUL_VL));
6984
6985 END();
6986
6987 if (CAN_RUN()) {
6988 RUN();
6989
6990 uint8_t* expected = new uint8_t[data_size];
6991 memset(expected, 0, data_size);
6992 uint8_t* middle = &expected[data_size / 2];
6993
6994 for (int i = 0; i < pl; i++) {
6995 int bit_index = (i % sizeof(pattern[0])) * kBitsPerByte;
6996 size_t index = i / sizeof(pattern[0]);
6997 VIXL_ASSERT(index < ArrayLength(pattern));
6998 uint64_t byte = (pattern[index] >> bit_index) & 0xff;
6999 // Each byte of `pattern` can be multiplied by 15 without carry.
7000 VIXL_ASSERT((byte * 15) <= 0xff);
7001
7002 middle[i] = byte * 8; // p8
7003 middle[(2 * pl) + i] = byte * 9; // p9
7004 middle[(-3 * pl) + i] = byte * 10; // p10
7005 middle[(255 * pl) + i] = byte * 11; // p11
7006 middle[(6 * pl) + i] = byte * 12; // p12
7007 middle[(-7 * pl) + i] = byte * 13; // p13
7008 middle[(314 * pl) + i] = byte * 14; // p14
7009 middle[(-314 * pl) + i] = byte * 15; // p15
7010 }
7011
7012 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7013
7014 ASSERT_EQUAL_SVE(p0, p8);
7015 ASSERT_EQUAL_SVE(p1, p9);
7016 ASSERT_EQUAL_SVE(p2, p10);
7017 ASSERT_EQUAL_SVE(p3, p11);
7018 ASSERT_EQUAL_SVE(p4, p12);
7019 ASSERT_EQUAL_SVE(p5, p13);
7020 ASSERT_EQUAL_SVE(p6, p14);
7021 ASSERT_EQUAL_SVE(p7, p15);
7022
7023 delete[] expected;
7024 }
7025 delete[] data;
7026 }
7027
7028 template <typename T>
7029 static void MemoryWrite(uint8_t* base, int64_t offset, int64_t index, T data) {
7030 memcpy(base + offset + (index * sizeof(data)), &data, sizeof(data));
7031 }
7032
7033 TEST_SVE(sve_ld1_st1_contiguous) {
7034 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7035 START();
7036
7037 int vl = config->sve_vl_in_bytes();
7038
7039 // The immediate can address [-8, 7] times the VL, so allocate enough space to
7040 // exceed that in both directions.
7041 int data_size = vl * 128;
7042
7043 uint8_t* data = new uint8_t[data_size];
7044 memset(data, 0, data_size);
7045
7046 // Set the base half-way through the buffer so we can use negative indices.
7047 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7048
7049 // Encodable scalar-plus-immediate cases.
7050 __ Index(z1.VnB(), 1, -3);
7051 __ Ptrue(p1.VnB());
7052 __ St1b(z1.VnB(), p1, SVEMemOperand(x0));
7053
7054 __ Index(z2.VnH(), -2, 5);
7055 __ Ptrue(p2.VnH(), SVE_MUL3);
7056 __ St1b(z2.VnH(), p2, SVEMemOperand(x0, 7, SVE_MUL_VL));
7057
7058 __ Index(z3.VnS(), 3, -7);
7059 __ Ptrue(p3.VnS(), SVE_POW2);
7060 __ St1h(z3.VnS(), p3, SVEMemOperand(x0, -8, SVE_MUL_VL));
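  // For these scalar-plus-immediate forms, the SVE_MUL_VL immediate is scaled
  // by the number of bytes the operation accesses per vector: St1b with H
  // lanes stores VL/2 bytes, so an immediate of 7 is a byte offset of
  // 7 * (VL / 2), matching the reference calculation further down.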
7061
7062 // Encodable scalar-plus-scalar cases.
7063 __ Index(z4.VnD(), -4, 11);
7064 __ Ptrue(p4.VnD(), SVE_VL3);
7065 __ Addvl(x1, x0, 8); // Try not to overlap with VL-dependent cases.
7066 __ Mov(x2, 17);
7067 __ St1b(z4.VnD(), p4, SVEMemOperand(x1, x2));
7068
7069 __ Index(z5.VnD(), 6, -2);
7070 __ Ptrue(p5.VnD(), SVE_VL16);
7071 __ Addvl(x3, x0, 10); // Try not to overlap with VL-dependent cases.
7072 __ Mov(x4, 6);
7073 __ St1d(z5.VnD(), p5, SVEMemOperand(x3, x4, LSL, 3));
7074
7075 // Unencodable cases fall back on `CalculateSVEAddress`.
7076 __ Index(z6.VnS(), -7, 3);
7077 // Setting SVE_ALL on B lanes checks that the Simulator ignores irrelevant
7078 // predicate bits when handling larger lanes.
7079 __ Ptrue(p6.VnB(), SVE_ALL);
7080 __ St1w(z6.VnS(), p6, SVEMemOperand(x0, 42, SVE_MUL_VL));
7081
7082 __ Index(z7.VnD(), 32, -11);
7083 __ Ptrue(p7.VnD(), SVE_MUL4);
7084 __ St1w(z7.VnD(), p7, SVEMemOperand(x0, 22, SVE_MUL_VL));
7085
7086 // Corresponding loads.
7087 __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0));
7088 __ Ld1b(z9.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7089 __ Ld1h(z10.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7090 __ Ld1b(z11.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7091 __ Ld1d(z12.VnD(), p5.Zeroing(), SVEMemOperand(x3, x4, LSL, 3));
7092 __ Ld1w(z13.VnS(), p6.Zeroing(), SVEMemOperand(x0, 42, SVE_MUL_VL));
7093
7094 __ Ld1sb(z14.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7095 __ Ld1sh(z15.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7096 __ Ld1sb(z16.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7097 __ Ld1sw(z17.VnD(), p7.Zeroing(), SVEMemOperand(x0, 22, SVE_MUL_VL));
7098
7099 // We can test ld1 by comparing the value loaded with the value stored. In
7100 // most cases, there are two complications:
7101 // - Loads have zeroing predication, so we have to clear the inactive
7102 // elements on our reference.
7103 // - We have to replicate any sign- or zero-extension.
7104
7105 // Ld1b(z8.VnB(), ...)
7106 __ Dup(z18.VnB(), 0);
7107 __ Mov(z18.VnB(), p1.Merging(), z1.VnB());
7108
7109 // Ld1b(z9.VnH(), ...)
7110 __ Dup(z19.VnH(), 0);
7111 __ Uxtb(z19.VnH(), p2.Merging(), z2.VnH());
7112
7113 // Ld1h(z10.VnS(), ...)
7114 __ Dup(z20.VnS(), 0);
7115 __ Uxth(z20.VnS(), p3.Merging(), z3.VnS());
7116
7117 // Ld1b(z11.VnD(), ...)
7118 __ Dup(z21.VnD(), 0);
7119 __ Uxtb(z21.VnD(), p4.Merging(), z4.VnD());
7120
7121 // Ld1d(z12.VnD(), ...)
7122 __ Dup(z22.VnD(), 0);
7123 __ Mov(z22.VnD(), p5.Merging(), z5.VnD());
7124
7125 // Ld1w(z13.VnS(), ...)
7126 __ Dup(z23.VnS(), 0);
7127 __ Mov(z23.VnS(), p6.Merging(), z6.VnS());
7128
7129 // Ld1sb(z14.VnH(), ...)
7130 __ Dup(z24.VnH(), 0);
7131 __ Sxtb(z24.VnH(), p2.Merging(), z2.VnH());
7132
7133 // Ld1sh(z15.VnS(), ...)
7134 __ Dup(z25.VnS(), 0);
7135 __ Sxth(z25.VnS(), p3.Merging(), z3.VnS());
7136
7137 // Ld1sb(z16.VnD(), ...)
7138 __ Dup(z26.VnD(), 0);
7139 __ Sxtb(z26.VnD(), p4.Merging(), z4.VnD());
7140
7141 // Ld1sw(z17.VnD(), ...)
7142 __ Dup(z27.VnD(), 0);
7143 __ Sxtw(z27.VnD(), p7.Merging(), z7.VnD());
7144
7145 END();
7146
7147 if (CAN_RUN()) {
7148 RUN();
7149
7150 uint8_t* expected = new uint8_t[data_size];
7151 memset(expected, 0, data_size);
7152 uint8_t* middle = &expected[data_size / 2];
7153
7154 int vl_b = vl / kBRegSizeInBytes;
7155 int vl_h = vl / kHRegSizeInBytes;
7156 int vl_s = vl / kSRegSizeInBytes;
7157 int vl_d = vl / kDRegSizeInBytes;
7158
7159 // Encodable cases.
7160
7161 // st1b { z1.b }, SVE_ALL
7162 for (int i = 0; i < vl_b; i++) {
7163 MemoryWrite(middle, 0, i, static_cast<uint8_t>(1 - (3 * i)));
7164 }
7165
7166 // st1b { z2.h }, SVE_MUL3
7167 int vl_h_mul3 = vl_h - (vl_h % 3);
7168 for (int i = 0; i < vl_h_mul3; i++) {
7169 int64_t offset = 7 * static_cast<int>(vl / (kHRegSize / kBRegSize));
7170 MemoryWrite(middle, offset, i, static_cast<uint8_t>(-2 + (5 * i)));
7171 }
7172
7173 // st1h { z3.s }, SVE_POW2
7174 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7175 for (int i = 0; i < vl_s_pow2; i++) {
7176 int64_t offset = -8 * static_cast<int>(vl / (kSRegSize / kHRegSize));
7177 MemoryWrite(middle, offset, i, static_cast<uint16_t>(3 - (7 * i)));
7178 }
7179
7180 // st1b { z4.d }, SVE_VL3
7181 if (vl_d >= 3) {
7182 for (int i = 0; i < 3; i++) {
7183 MemoryWrite(middle,
7184 (8 * vl) + 17,
7185 i,
7186 static_cast<uint8_t>(-4 + (11 * i)));
7187 }
7188 }
7189
7190 // st1d { z5.d }, SVE_VL16
7191 if (vl_d >= 16) {
7192 for (int i = 0; i < 16; i++) {
7193 MemoryWrite(middle,
7194 (10 * vl) + (6 * kDRegSizeInBytes),
7195 i,
7196 static_cast<uint64_t>(6 - (2 * i)));
7197 }
7198 }
7199
7200 // Unencodable cases.
7201
7202 // st1w { z6.s }, SVE_ALL
7203 for (int i = 0; i < vl_s; i++) {
7204 MemoryWrite(middle, 42 * vl, i, static_cast<uint32_t>(-7 + (3 * i)));
7205 }
7206
7207 // st1w { z7.d }, SVE_MUL4
7208 int vl_d_mul4 = vl_d - (vl_d % 4);
7209 for (int i = 0; i < vl_d_mul4; i++) {
7210 int64_t offset = 22 * static_cast<int>(vl / (kDRegSize / kWRegSize));
7211 MemoryWrite(middle, offset, i, static_cast<uint32_t>(32 + (-11 * i)));
7212 }
7213
7214 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7215
7216 // Check that we loaded back the expected values.
7217
7218 ASSERT_EQUAL_SVE(z18, z8);
7219 ASSERT_EQUAL_SVE(z19, z9);
7220 ASSERT_EQUAL_SVE(z20, z10);
7221 ASSERT_EQUAL_SVE(z21, z11);
7222 ASSERT_EQUAL_SVE(z22, z12);
7223 ASSERT_EQUAL_SVE(z23, z13);
7224 ASSERT_EQUAL_SVE(z24, z14);
7225 ASSERT_EQUAL_SVE(z25, z15);
7226 ASSERT_EQUAL_SVE(z26, z16);
7227 ASSERT_EQUAL_SVE(z27, z17);
7228
7229 delete[] expected;
7230 }
7231 delete[] data;
7232 }
7233
7234 TEST_SVE(sve_ld2_st2_scalar_plus_imm) {
7235 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7236 START();
7237
7238 int vl = config->sve_vl_in_bytes();
7239
7240 // The immediate can address [-16, 14] times the VL, so allocate enough space
7241 // to exceed that in both directions.
7242 int data_size = vl * 128;
7243
7244 uint8_t* data = new uint8_t[data_size];
7245 memset(data, 0, data_size);
7246
7247   // Set the base half-way through the buffer so we can use negative indices.
7248 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7249
7250 __ Index(z14.VnB(), 1, -3);
7251 __ Index(z15.VnB(), 2, -3);
7252 __ Ptrue(p0.VnB());
7253 __ St2b(z14.VnB(), z15.VnB(), p0, SVEMemOperand(x0));
7254
7255 __ Index(z16.VnH(), -2, 5);
7256 __ Index(z17.VnH(), -3, 5);
7257 __ Ptrue(p1.VnH(), SVE_MUL3);
7258 __ St2h(z16.VnH(), z17.VnH(), p1, SVEMemOperand(x0, 8, SVE_MUL_VL));
7259
7260 // Wrap around from z31 to z0.
7261 __ Index(z31.VnS(), 3, -7);
7262 __ Index(z0.VnS(), 4, -7);
7263 __ Ptrue(p2.VnS(), SVE_POW2);
7264 __ St2w(z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL));
7265
7266 __ Index(z18.VnD(), -7, 3);
7267 __ Index(z19.VnD(), -8, 3);
7268 // Sparse predication, including some irrelevant bits (0xe). To make the
7269 // results easy to check, activate each lane <n> where n is a multiple of 5.
7270 Initialise(&masm,
7271 p3,
7272 0xeee10000000001ee,
7273 0xeeeeeee100000000,
7274 0x01eeeeeeeee10000,
7275 0x000001eeeeeeeee1);
7276 __ St2d(z18.VnD(), z19.VnD(), p3, SVEMemOperand(x0, 14, SVE_MUL_VL));
7277
7278 // We can test ld2 by comparing the values loaded with the values stored.
7279 // There are two complications:
7280 // - Loads have zeroing predication, so we have to clear the inactive
7281 // elements on our reference.
7282 // - We want to test both loads and stores that span { z31, z0 }, so we have
7283 // to move some values around.
7284 //
7285 // Registers z4-z11 will hold as-stored values (with inactive elements
7286 // cleared). Registers z20-z27 will hold the values that were loaded.
7287
7288 // Ld2b(z14.VnB(), z15.VnB(), ...)
7289 __ Dup(z4.VnB(), 0);
7290 __ Dup(z5.VnB(), 0);
7291 __ Mov(z4.VnB(), p0.Merging(), z14.VnB());
7292 __ Mov(z5.VnB(), p0.Merging(), z15.VnB());
7293
7294 // Ld2h(z16.VnH(), z17.VnH(), ...)
7295 __ Dup(z6.VnH(), 0);
7296 __ Dup(z7.VnH(), 0);
7297 __ Mov(z6.VnH(), p1.Merging(), z16.VnH());
7298 __ Mov(z7.VnH(), p1.Merging(), z17.VnH());
7299
7300 // Ld2w(z31.VnS(), z0.VnS(), ...)
7301 __ Dup(z8.VnS(), 0);
7302 __ Dup(z9.VnS(), 0);
7303 __ Mov(z8.VnS(), p2.Merging(), z31.VnS());
7304 __ Mov(z9.VnS(), p2.Merging(), z0.VnS());
7305
7306 // Ld2d(z18.VnD(), z19.VnD(), ...)
7307 __ Dup(z10.VnD(), 0);
7308 __ Dup(z11.VnD(), 0);
7309 __ Mov(z10.VnD(), p3.Merging(), z18.VnD());
7310 __ Mov(z11.VnD(), p3.Merging(), z19.VnD());
7311
7312 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7313 __ Ld2b(z31.VnB(), z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7314 __ Mov(z20, z31);
7315 __ Mov(z21, z0);
7316
7317 __ Ld2h(z22.VnH(), z23.VnH(), p1.Zeroing(), SVEMemOperand(x0, 8, SVE_MUL_VL));
7318 __ Ld2w(z24.VnS(),
7319 z25.VnS(),
7320 p2.Zeroing(),
7321 SVEMemOperand(x0, -12, SVE_MUL_VL));
7322 __ Ld2d(z26.VnD(),
7323 z27.VnD(),
7324 p3.Zeroing(),
7325 SVEMemOperand(x0, 14, SVE_MUL_VL));
7326
7327 END();
7328
7329 if (CAN_RUN()) {
7330 RUN();
7331
7332 uint8_t* expected = new uint8_t[data_size];
7333 memset(expected, 0, data_size);
7334 uint8_t* middle = &expected[data_size / 2];
7335
7336 int vl_b = vl / kBRegSizeInBytes;
7337 int vl_h = vl / kHRegSizeInBytes;
7338 int vl_s = vl / kSRegSizeInBytes;
7339 int vl_d = vl / kDRegSizeInBytes;
7340
7341 int reg_count = 2;
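    // st2 interleaves its two registers element-wise: element i of the first
    // register is stored at element slot (2 * i) and of the second at
    // (2 * i) + 1, which is what the (i * reg_count) indexing below mirrors.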
7342
7343 // st2b { z14.b, z15.b }, SVE_ALL
7344 for (int i = 0; i < vl_b; i++) {
7345 uint8_t lane0 = 1 - (3 * i);
7346 uint8_t lane1 = 2 - (3 * i);
7347 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7348 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7349 }
7350
7351 // st2h { z16.h, z17.h }, SVE_MUL3
7352 int vl_h_mul3 = vl_h - (vl_h % 3);
7353 for (int i = 0; i < vl_h_mul3; i++) {
7354 int64_t offset = 8 * vl;
7355 uint16_t lane0 = -2 + (5 * i);
7356 uint16_t lane1 = -3 + (5 * i);
7357 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7358 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7359 }
7360
7361 // st2w { z31.s, z0.s }, SVE_POW2
7362 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7363 for (int i = 0; i < vl_s_pow2; i++) {
7364 int64_t offset = -12 * vl;
7365 uint32_t lane0 = 3 - (7 * i);
7366 uint32_t lane1 = 4 - (7 * i);
7367 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7368 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7369 }
7370
7371 // st2d { z18.d, z19.d }, ((i % 5) == 0)
7372 for (int i = 0; i < vl_d; i++) {
7373 if ((i % 5) == 0) {
7374 int64_t offset = 14 * vl;
7375 uint64_t lane0 = -7 + (3 * i);
7376 uint64_t lane1 = -8 + (3 * i);
7377 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7378 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7379 }
7380 }
7381
7382 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7383
7384 // Check that we loaded back the expected values.
7385
7386 // st2b/ld2b
7387 ASSERT_EQUAL_SVE(z4, z20);
7388 ASSERT_EQUAL_SVE(z5, z21);
7389
7390 // st2h/ld2h
7391 ASSERT_EQUAL_SVE(z6, z22);
7392 ASSERT_EQUAL_SVE(z7, z23);
7393
7394 // st2w/ld2w
7395 ASSERT_EQUAL_SVE(z8, z24);
7396 ASSERT_EQUAL_SVE(z9, z25);
7397
7398 // st2d/ld2d
7399 ASSERT_EQUAL_SVE(z10, z26);
7400 ASSERT_EQUAL_SVE(z11, z27);
7401
7402 delete[] expected;
7403 }
7404 delete[] data;
7405 }
7406
7407 TEST_SVE(sve_ld2_st2_scalar_plus_scalar) {
7408 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7409 START();
7410
7411 int vl = config->sve_vl_in_bytes();
7412
7413 // Allocate plenty of space to enable indexing in both directions.
7414 int data_size = vl * 128;
7415
7416 uint8_t* data = new uint8_t[data_size];
7417 memset(data, 0, data_size);
7418
7419   // Set the base half-way through the buffer so we can use negative indices.
7420 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7421
7422 __ Index(z10.VnB(), -4, 11);
7423 __ Index(z11.VnB(), -5, 11);
7424 __ Ptrue(p7.VnB(), SVE_MUL4);
7425 __ Mov(x1, 0);
7426 __ St2b(z10.VnB(), z11.VnB(), p7, SVEMemOperand(x0, x1));
7427
7428 __ Index(z12.VnH(), 6, -2);
7429 __ Index(z13.VnH(), 7, -2);
7430 __ Ptrue(p6.VnH(), SVE_VL16);
7431 __ Rdvl(x2, 3); // Make offsets VL-dependent so we can avoid overlap.
7432 __ St2h(z12.VnH(), z13.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
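  // Rdvl(x2, 3) yields 3 * VL bytes and the LSL #1 in the memory operand
  // scales it by the H element size, so this store lands 6 * VL bytes from
  // the base, which is the (3 << kHRegSizeInBytesLog2) * vl term in the
  // reference calculation below.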
7433
7434 __ Index(z14.VnS(), -7, 3);
7435 __ Index(z15.VnS(), -8, 3);
7436 // Sparse predication, including some irrelevant bits (0xe). To make the
7437 // results easy to check, activate each lane <n> where n is a multiple of 5.
7438 Initialise(&masm,
7439 p5,
7440 0xeee1000010000100,
7441 0x001eeee100001000,
7442 0x0100001eeee10000,
7443 0x10000100001eeee1);
7444 __ Rdvl(x3, -3);
7445 __ St2w(z14.VnS(), z15.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
7446
7447 // Wrap around from z31 to z0.
7448 __ Index(z31.VnD(), 32, -11);
7449 __ Index(z0.VnD(), 33, -11);
7450 __ Ptrue(p4.VnD(), SVE_MUL3);
7451 __ Rdvl(x4, 1);
7452 __ St2d(z31.VnD(), z0.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
7453
7454 // We can test ld2 by comparing the values loaded with the values stored.
7455 // There are two complications:
7456 // - Loads have zeroing predication, so we have to clear the inactive
7457 // elements on our reference.
7458 // - We want to test both loads and stores that span { z31, z0 }, so we have
7459 // to move some values around.
7460 //
7461 // Registers z4-z11 will hold as-stored values (with inactive elements
7462 // cleared). Registers z20-z27 will hold the values that were loaded.
7463
7464 // Ld2b(z20.VnB(), z21.VnB(), ...)
7465 __ Dup(z4.VnB(), 0);
7466 __ Dup(z5.VnB(), 0);
7467 __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7468 __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7469
7470 // Ld2h(z22.VnH(), z23.VnH(), ...)
7471 __ Dup(z6.VnH(), 0);
7472 __ Dup(z7.VnH(), 0);
7473 __ Mov(z6.VnH(), p6.Merging(), z12.VnH());
7474 __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7475
7476 // Ld2w(z24.VnS(), z25.VnS(), ...)
7477 __ Dup(z8.VnS(), 0);
7478 __ Dup(z9.VnS(), 0);
7479 __ Mov(z8.VnS(), p5.Merging(), z14.VnS());
7480 __ Mov(z9.VnS(), p5.Merging(), z15.VnS());
7481
7482 // Ld2d(z31.VnD(), z0.VnD(), ...)
7483 __ Dup(z10.VnD(), 0);
7484 __ Dup(z11.VnD(), 0);
7485 __ Mov(z10.VnD(), p4.Merging(), z31.VnD());
7486 __ Mov(z11.VnD(), p4.Merging(), z0.VnD());
7487
7488 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7489 __ Ld2b(z31.VnB(), z0.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1));
7490 __ Mov(z20, z31);
7491 __ Mov(z21, z0);
7492
7493 __ Ld2h(z22.VnH(), z23.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
7494 __ Ld2w(z24.VnS(), z25.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2));
7495 __ Ld2d(z26.VnD(), z27.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3));
7496
7497 END();
7498
7499 if (CAN_RUN()) {
7500 RUN();
7501
7502 uint8_t* expected = new uint8_t[data_size];
7503 memset(expected, 0, data_size);
7504 uint8_t* middle = &expected[data_size / 2];
7505
7506 int vl_b = vl / kBRegSizeInBytes;
7507 int vl_h = vl / kHRegSizeInBytes;
7508 int vl_s = vl / kSRegSizeInBytes;
7509 int vl_d = vl / kDRegSizeInBytes;
7510
7511 int reg_count = 2;
7512
7513 // st2b { z10.b, z11.b }, SVE_MUL4
7514 int vl_b_mul4 = vl_b - (vl_b % 4);
7515 for (int i = 0; i < vl_b_mul4; i++) {
7516 uint8_t lane0 = -4 + (11 * i);
7517 uint8_t lane1 = -5 + (11 * i);
7518 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7519 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7520 }
7521
7522 // st2h { z12.h, z13.h }, SVE_VL16
7523 if (vl_h >= 16) {
7524 for (int i = 0; i < 16; i++) {
7525 int64_t offset = (3 << kHRegSizeInBytesLog2) * vl;
7526 uint16_t lane0 = 6 - (2 * i);
7527 uint16_t lane1 = 7 - (2 * i);
7528 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7529 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7530 }
7531 }
7532
7533 // st2w { z14.s, z15.s }, ((i % 5) == 0)
7534 for (int i = 0; i < vl_s; i++) {
7535 if ((i % 5) == 0) {
7536 int64_t offset = -(3 << kSRegSizeInBytesLog2) * vl;
7537 uint32_t lane0 = -7 + (3 * i);
7538 uint32_t lane1 = -8 + (3 * i);
7539 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7540 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7541 }
7542 }
7543
7544     // st2d { z31.d, z0.d }, SVE_MUL3
7545 int vl_d_mul3 = vl_d - (vl_d % 3);
7546 for (int i = 0; i < vl_d_mul3; i++) {
7547 int64_t offset = (1 << kDRegSizeInBytesLog2) * vl;
7548 uint64_t lane0 = 32 - (11 * i);
7549 uint64_t lane1 = 33 - (11 * i);
7550 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7551 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7552 }
7553
7554 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7555
7556 // Check that we loaded back the expected values.
7557
7558 // st2b/ld2b
7559 ASSERT_EQUAL_SVE(z4, z20);
7560 ASSERT_EQUAL_SVE(z5, z21);
7561
7562 // st2h/ld2h
7563 ASSERT_EQUAL_SVE(z6, z22);
7564 ASSERT_EQUAL_SVE(z7, z23);
7565
7566 // st2w/ld2w
7567 ASSERT_EQUAL_SVE(z8, z24);
7568 ASSERT_EQUAL_SVE(z9, z25);
7569
7570 // st2d/ld2d
7571 ASSERT_EQUAL_SVE(z10, z26);
7572 ASSERT_EQUAL_SVE(z11, z27);
7573
7574 delete[] expected;
7575 }
7576 delete[] data;
7577 }
7578
7579 TEST_SVE(sve_ld3_st3_scalar_plus_imm) {
7580 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7581 START();
7582
7583 int vl = config->sve_vl_in_bytes();
7584
7585 // The immediate can address [-24, 21] times the VL, so allocate enough space
7586 // to exceed that in both directions.
7587 int data_size = vl * 128;
7588
7589 uint8_t* data = new uint8_t[data_size];
7590 memset(data, 0, data_size);
7591
7592   // Set the base half-way through the buffer so we can use negative indices.
7593 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
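  // For the three-register forms the immediate must be a multiple of three
  // vectors in [-24, 21]; the offsets used here (9, -12 and 15) are all
  // encodable, so these stores should use the immediate form directly.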
7594
7595 // We can test ld3 by comparing the values loaded with the values stored.
7596 // There are two complications:
7597 // - Loads have zeroing predication, so we have to clear the inactive
7598 // elements on our reference.
7599 // - We want to test both loads and stores that span { z31, z0 }, so we have
7600 // to move some values around.
7601 //
7602 // Registers z4-z15 will hold as-stored values (with inactive elements
7603 // cleared). Registers z16-z27 will hold the values that were loaded.
7604
7605 __ Index(z10.VnB(), 1, -3);
7606 __ Index(z11.VnB(), 2, -3);
7607 __ Index(z12.VnB(), 3, -3);
7608 __ Ptrue(p0.VnB());
7609 __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p0, SVEMemOperand(x0));
7610 // Save the stored values for ld3 tests.
7611 __ Dup(z4.VnB(), 0);
7612 __ Dup(z5.VnB(), 0);
7613 __ Dup(z6.VnB(), 0);
7614 __ Mov(z4.VnB(), p0.Merging(), z10.VnB());
7615 __ Mov(z5.VnB(), p0.Merging(), z11.VnB());
7616 __ Mov(z6.VnB(), p0.Merging(), z12.VnB());
7617
7618 // Wrap around from z31 to z0.
7619 __ Index(z31.VnH(), -2, 5);
7620 __ Index(z0.VnH(), -3, 5);
7621 __ Index(z1.VnH(), -4, 5);
7622 __ Ptrue(p1.VnH(), SVE_MUL3);
7623 __ St3h(z31.VnH(), z0.VnH(), z1.VnH(), p1, SVEMemOperand(x0, 9, SVE_MUL_VL));
7624 // Save the stored values for ld3 tests.
7625 __ Dup(z7.VnH(), 0);
7626 __ Dup(z8.VnH(), 0);
7627 __ Dup(z9.VnH(), 0);
7628 __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
7629 __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
7630 __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
7631
7632 __ Index(z30.VnS(), 3, -7);
7633 __ Index(z31.VnS(), 4, -7);
7634 __ Index(z0.VnS(), 5, -7);
7635 __ Ptrue(p2.VnS(), SVE_POW2);
7636 __ St3w(z30.VnS(),
7637 z31.VnS(),
7638 z0.VnS(),
7639 p2,
7640 SVEMemOperand(x0, -12, SVE_MUL_VL));
7641 // Save the stored values for ld3 tests.
7642 __ Dup(z10.VnS(), 0);
7643 __ Dup(z11.VnS(), 0);
7644 __ Dup(z12.VnS(), 0);
7645 __ Mov(z10.VnS(), p2.Merging(), z30.VnS());
7646 __ Mov(z11.VnS(), p2.Merging(), z31.VnS());
7647 __ Mov(z12.VnS(), p2.Merging(), z0.VnS());
7648
7649 __ Index(z0.VnD(), -7, 3);
7650 __ Index(z1.VnD(), -8, 3);
7651 __ Index(z2.VnD(), -9, 3);
7652 // Sparse predication, including some irrelevant bits (0xee). To make the
7653 // results easy to check, activate each lane <n> where n is a multiple of 5.
7654 Initialise(&masm,
7655 p3,
7656 0xeee10000000001ee,
7657 0xeeeeeee100000000,
7658 0x01eeeeeeeee10000,
7659 0x000001eeeeeeeee1);
7660 __ St3d(z0.VnD(), z1.VnD(), z2.VnD(), p3, SVEMemOperand(x0, 15, SVE_MUL_VL));
7661 // Save the stored values for ld3 tests.
7662 __ Dup(z13.VnD(), 0);
7663 __ Dup(z14.VnD(), 0);
7664 __ Dup(z15.VnD(), 0);
7665 __ Mov(z13.VnD(), p3.Merging(), z0.VnD());
7666 __ Mov(z14.VnD(), p3.Merging(), z1.VnD());
7667 __ Mov(z15.VnD(), p3.Merging(), z2.VnD());
7668
7669 // Corresponding loads.
7670 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7671 __ Ld3b(z31.VnB(), z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7672 __ Mov(z16, z31);
7673 __ Mov(z17, z0);
7674 __ Mov(z18, z1);
7675 __ Ld3h(z30.VnH(),
7676 z31.VnH(),
7677 z0.VnH(),
7678 p1.Zeroing(),
7679 SVEMemOperand(x0, 9, SVE_MUL_VL));
7680 __ Mov(z19, z30);
7681 __ Mov(z20, z31);
7682 __ Mov(z21, z0);
7683 __ Ld3w(z22.VnS(),
7684 z23.VnS(),
7685 z24.VnS(),
7686 p2.Zeroing(),
7687 SVEMemOperand(x0, -12, SVE_MUL_VL));
7688 __ Ld3d(z25.VnD(),
7689 z26.VnD(),
7690 z27.VnD(),
7691 p3.Zeroing(),
7692 SVEMemOperand(x0, 15, SVE_MUL_VL));
7693
7694 END();
7695
7696 if (CAN_RUN()) {
7697 RUN();
7698
7699 uint8_t* expected = new uint8_t[data_size];
7700 memset(expected, 0, data_size);
7701 uint8_t* middle = &expected[data_size / 2];
7702
7703 int vl_b = vl / kBRegSizeInBytes;
7704 int vl_h = vl / kHRegSizeInBytes;
7705 int vl_s = vl / kSRegSizeInBytes;
7706 int vl_d = vl / kDRegSizeInBytes;
7707
7708 int reg_count = 3;
7709
7710 // st3b { z10.b, z11.b, z12.b }, SVE_ALL
7711 for (int i = 0; i < vl_b; i++) {
7712 uint8_t lane0 = 1 - (3 * i);
7713 uint8_t lane1 = 2 - (3 * i);
7714 uint8_t lane2 = 3 - (3 * i);
7715 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7716 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7717 MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
7718 }
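// Illustration (assuming MemoryWrite(base, offset, index, value) stores
// `value` at base + offset + index * sizeof(value), as these loops rely on):
// st3b interleaves the three source registers element by element, so the
// first nine bytes at `middle` are z10[0], z11[0], z12[0], z10[1], ..., i.e.
// 0x01, 0x02, 0x03, 0xfe, 0xff, 0x00, 0xfb, 0xfc, 0xfd.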
7719
7720 // st3h { z31.h, z0.h, z1.h }, SVE_MUL3
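// SVE_MUL3 constrains the predicate to the largest multiple of three lanes
// that fits in the vector, which is what vl_h_mul3 computes just below.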
7721 int vl_h_mul3 = vl_h - (vl_h % 3);
7722 for (int i = 0; i < vl_h_mul3; i++) {
7723 int64_t offset = 9 * vl;
7724 uint16_t lane0 = -2 + (5 * i);
7725 uint16_t lane1 = -3 + (5 * i);
7726 uint16_t lane2 = -4 + (5 * i);
7727 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7728 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7729 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7730 }
7731
7732 // st3w { z30.s, z31.s, z0.s }, SVE_POW2
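// SVE_POW2 selects the largest power of two not exceeding the lane count;
// `1 << HighestSetBitPosition(vl_s)` computes the same value.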
7733 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7734 for (int i = 0; i < vl_s_pow2; i++) {
7735 int64_t offset = -12 * vl;
7736 uint32_t lane0 = 3 - (7 * i);
7737 uint32_t lane1 = 4 - (7 * i);
7738 uint32_t lane2 = 5 - (7 * i);
7739 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7740 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7741 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7742 }
7743
7744 // st3d { z0.d, z1.d, z2.d }, ((i % 5) == 0)
7745 for (int i = 0; i < vl_d; i++) {
7746 if ((i % 5) == 0) {
7747 int64_t offset = 15 * vl;
7748 uint64_t lane0 = -7 + (3 * i);
7749 uint64_t lane1 = -8 + (3 * i);
7750 uint64_t lane2 = -9 + (3 * i);
7751 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7752 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7753 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7754 }
7755 }
7756
7757 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7758
7759 // Check that we loaded back the expected values.
7760
7761 // st3b/ld3b
7762 ASSERT_EQUAL_SVE(z4, z16);
7763 ASSERT_EQUAL_SVE(z5, z17);
7764 ASSERT_EQUAL_SVE(z6, z18);
7765
7766 // st3h/ld3h
7767 ASSERT_EQUAL_SVE(z7, z19);
7768 ASSERT_EQUAL_SVE(z8, z20);
7769 ASSERT_EQUAL_SVE(z9, z21);
7770
7771 // st3w/ld3w
7772 ASSERT_EQUAL_SVE(z10, z22);
7773 ASSERT_EQUAL_SVE(z11, z23);
7774 ASSERT_EQUAL_SVE(z12, z24);
7775
7776 // st3d/ld3d
7777 ASSERT_EQUAL_SVE(z13, z25);
7778 ASSERT_EQUAL_SVE(z14, z26);
7779 ASSERT_EQUAL_SVE(z15, z27);
7780
7781 delete[] expected;
7782 }
7783 delete[] data;
7784 }
7785
7786 TEST_SVE(sve_ld3_st3_scalar_plus_scalar) {
7787 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7788 START();
7789
7790 int vl = config->sve_vl_in_bytes();
7791
7792 // Allocate plenty of space to enable indexing in both directions.
7793 int data_size = vl * 128;
7794
7795 uint8_t* data = new uint8_t[data_size];
7796 memset(data, 0, data_size);
7797
7798 // Set the base half-way through the buffer so we can use negative indices.
7799 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7800
7801 // We can test ld3 by comparing the values loaded with the values stored.
7802 // There are two complications:
7803 // - Loads have zeroing predication, so we have to clear the inactive
7804 // elements on our reference.
7805 // - We want to test both loads and stores that span { z31, z0 }, so we have
7806 // to move some values around.
7807 //
7808 // Registers z4-z15 will hold as-stored values (with inactive elements
7809 // cleared). Registers z16-z27 will hold the values that were loaded.
7810
7811 __ Index(z10.VnB(), -4, 11);
7812 __ Index(z11.VnB(), -5, 11);
7813 __ Index(z12.VnB(), -6, 11);
7814 __ Ptrue(p7.VnB(), SVE_MUL4);
7815 __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
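// Rdvl(x1, -1) sets x1 to -VL in bytes, so with LSL #0 the store starts one
// vector length below x0.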
7816 __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p7, SVEMemOperand(x0, x1, LSL, 0));
7817 // Save the stored values for ld3 tests.
7818 __ Dup(z4.VnB(), 0);
7819 __ Dup(z5.VnB(), 0);
7820 __ Dup(z6.VnB(), 0);
7821 __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7822 __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7823 __ Mov(z6.VnB(), p7.Merging(), z12.VnB());
7824
7825 __ Index(z13.VnH(), 6, -2);
7826 __ Index(z14.VnH(), 7, -2);
7827 __ Index(z15.VnH(), 8, -2);
7828 __ Ptrue(p6.VnH(), SVE_VL16);
7829 __ Rdvl(x2, 5); // (5 * vl) << 1 = 10 * vl
7830 __ St3h(z13.VnH(), z14.VnH(), z15.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
7831 // Save the stored values for ld3 tests.
7832 __ Dup(z7.VnH(), 0);
7833 __ Dup(z8.VnH(), 0);
7834 __ Dup(z9.VnH(), 0);
7835 __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7836 __ Mov(z8.VnH(), p6.Merging(), z14.VnH());
7837 __ Mov(z9.VnH(), p6.Merging(), z15.VnH());
7838
7839 // Wrap around from z31 to z0.
7840 __ Index(z30.VnS(), -7, 3);
7841 __ Index(z31.VnS(), -8, 3);
7842 __ Index(z0.VnS(), -9, 3);
7843 // Sparse predication, including some irrelevant bits (0xe). To make the
7844 // results easy to check, activate each lane <n> where n is a multiple of 5.
7845 Initialise(&masm,
7846 p5,
7847 0xeee1000010000100,
7848 0x001eeee100001000,
7849 0x0100001eeee10000,
7850 0x10000100001eeee1);
7851 __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
7852 __ St3w(z30.VnS(), z31.VnS(), z0.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
7853 // Save the stored values for ld3 tests.
7854 __ Dup(z10.VnS(), 0);
7855 __ Dup(z11.VnS(), 0);
7856 __ Dup(z12.VnS(), 0);
7857 __ Mov(z10.VnS(), p5.Merging(), z30.VnS());
7858 __ Mov(z11.VnS(), p5.Merging(), z31.VnS());
7859 __ Mov(z12.VnS(), p5.Merging(), z0.VnS());
7860
7861 __ Index(z31.VnD(), 32, -11);
7862 __ Index(z0.VnD(), 33, -11);
7863 __ Index(z1.VnD(), 34, -11);
7864 __ Ptrue(p4.VnD(), SVE_MUL3);
7865 __ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
7866 __ St3d(z31.VnD(), z0.VnD(), z1.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
7867 // Save the stored values for ld3 tests.
7868 __ Dup(z13.VnD(), 0);
7869 __ Dup(z14.VnD(), 0);
7870 __ Dup(z15.VnD(), 0);
7871 __ Mov(z13.VnD(), p4.Merging(), z31.VnD());
7872 __ Mov(z14.VnD(), p4.Merging(), z0.VnD());
7873 __ Mov(z15.VnD(), p4.Merging(), z1.VnD());
7874
7875 // Corresponding loads.
7876 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7877 __ Ld3b(z31.VnB(),
7878 z0.VnB(),
7879 z1.VnB(),
7880 p7.Zeroing(),
7881 SVEMemOperand(x0, x1, LSL, 0));
7882 __ Mov(z16, z31);
7883 __ Mov(z17, z0);
7884 __ Mov(z18, z1);
7885 __ Ld3h(z30.VnH(),
7886 z31.VnH(),
7887 z0.VnH(),
7888 p6.Zeroing(),
7889 SVEMemOperand(x0, x2, LSL, 1));
7890 __ Mov(z19, z30);
7891 __ Mov(z20, z31);
7892 __ Mov(z21, z0);
7893 __ Ld3w(z22.VnS(),
7894 z23.VnS(),
7895 z24.VnS(),
7896 p5.Zeroing(),
7897 SVEMemOperand(x0, x3, LSL, 2));
7898 __ Ld3d(z25.VnD(),
7899 z26.VnD(),
7900 z27.VnD(),
7901 p4.Zeroing(),
7902 SVEMemOperand(x0, x4, LSL, 3));
7903
7904 END();
7905
7906 if (CAN_RUN()) {
7907 RUN();
7908
7909 uint8_t* expected = new uint8_t[data_size];
7910 memset(expected, 0, data_size);
7911 uint8_t* middle = &expected[data_size / 2];
7912
7913 int vl_b = vl / kBRegSizeInBytes;
7914 int vl_h = vl / kHRegSizeInBytes;
7915 int vl_s = vl / kSRegSizeInBytes;
7916 int vl_d = vl / kDRegSizeInBytes;
7917
7918 int reg_count = 3;
7919
7920 // st3b { z10.b, z11.b, z12.b }, SVE_MUL4
7921 int vl_b_mul4 = vl_b - (vl_b % 4);
7922 for (int i = 0; i < vl_b_mul4; i++) {
7923 int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
7924 uint8_t lane0 = -4 + (11 * i);
7925 uint8_t lane1 = -5 + (11 * i);
7926 uint8_t lane2 = -6 + (11 * i);
7927 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7928 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7929 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7930 }
7931
7932 // st3h { z13.h, z14.h, z15.h }, SVE_VL16
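// A fixed VLnn pattern sets exactly that many lanes when they fit, and no
// lanes at all otherwise, hence the guard on vl_h below.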
7933 if (vl_h >= 16) {
7934 for (int i = 0; i < 16; i++) {
7935 int64_t offset = (5 << kHRegSizeInBytesLog2) * vl;
7936 uint16_t lane0 = 6 - (2 * i);
7937 uint16_t lane1 = 7 - (2 * i);
7938 uint16_t lane2 = 8 - (2 * i);
7939 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7940 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7941 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7942 }
7943 }
7944
7945 // st3w { z30.s, z31.s, z0.s }, ((i % 5) == 0)
7946 for (int i = 0; i < vl_s; i++) {
7947 if ((i % 5) == 0) {
7948 int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
7949 uint32_t lane0 = -7 + (3 * i);
7950 uint32_t lane1 = -8 + (3 * i);
7951 uint32_t lane2 = -9 + (3 * i);
7952 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7953 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7954 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7955 }
7956 }
7957
7958 // st3d { z31.d, z0.d, z1.d }, SVE_MUL3
7959 int vl_d_mul3 = vl_d - (vl_d % 3);
7960 for (int i = 0; i < vl_d_mul3; i++) {
7961 int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
7962 uint64_t lane0 = 32 - (11 * i);
7963 uint64_t lane1 = 33 - (11 * i);
7964 uint64_t lane2 = 34 - (11 * i);
7965 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7966 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7967 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7968 }
7969
7970 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7971
7972 // Check that we loaded back the expected values.
7973
7974 // st3b/ld3b
7975 ASSERT_EQUAL_SVE(z4, z16);
7976 ASSERT_EQUAL_SVE(z5, z17);
7977 ASSERT_EQUAL_SVE(z6, z18);
7978
7979 // st3h/ld3h
7980 ASSERT_EQUAL_SVE(z7, z19);
7981 ASSERT_EQUAL_SVE(z8, z20);
7982 ASSERT_EQUAL_SVE(z9, z21);
7983
7984 // st3w/ld3w
7985 ASSERT_EQUAL_SVE(z10, z22);
7986 ASSERT_EQUAL_SVE(z11, z23);
7987 ASSERT_EQUAL_SVE(z12, z24);
7988
7989 // st3d/ld3d
7990 ASSERT_EQUAL_SVE(z13, z25);
7991 ASSERT_EQUAL_SVE(z14, z26);
7992 ASSERT_EQUAL_SVE(z15, z27);
7993
7994 delete[] expected;
7995 }
7996 delete[] data;
7997 }
7998
7999 TEST_SVE(sve_ld4_st4_scalar_plus_imm) {
8000 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8001 START();
8002
8003 int vl = config->sve_vl_in_bytes();
8004
8005 // The immediate can address [-32, 28] times the VL, so allocate enough space
8006 // to exceed that in both directions.
8007 int data_size = vl * 128;
8008
8009 uint8_t* data = new uint8_t[data_size];
8010 memset(data, 0, data_size);
8011
8012 // Set the base half-way through the buffer so we can use negative indices.
8013 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8014
8015 // We can test ld4 by comparing the values loaded with the values stored.
8016 // There are two complications:
8017 // - Loads have zeroing predication, so we have to clear the inactive
8018 // elements on our reference.
8019 // - We want to test both loads and stores that span { z31, z0 }, so we have
8020 // to move some values around.
8021 //
8022 // Registers z3-z18 will hold as-stored values (with inactive elements
8023 // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8024 // loaded.
8025
8026 __ Index(z10.VnB(), 1, -7);
8027 __ Index(z11.VnB(), 2, -7);
8028 __ Index(z12.VnB(), 3, -7);
8029 __ Index(z13.VnB(), 4, -7);
8030 __ Ptrue(p0.VnB());
8031 __ St4b(z10.VnB(), z11.VnB(), z12.VnB(), z13.VnB(), p0, SVEMemOperand(x0));
8032 // Save the stored values for ld4 tests.
8033 __ Dup(z3.VnB(), 0);
8034 __ Dup(z4.VnB(), 0);
8035 __ Dup(z5.VnB(), 0);
8036 __ Dup(z6.VnB(), 0);
8037 __ Mov(z3.VnB(), p0.Merging(), z10.VnB());
8038 __ Mov(z4.VnB(), p0.Merging(), z11.VnB());
8039 __ Mov(z5.VnB(), p0.Merging(), z12.VnB());
8040 __ Mov(z6.VnB(), p0.Merging(), z13.VnB());
8041
8042 // Wrap around from z31 to z0.
8043 __ Index(z31.VnH(), -2, 5);
8044 __ Index(z0.VnH(), -3, 5);
8045 __ Index(z1.VnH(), -4, 5);
8046 __ Index(z2.VnH(), -5, 5);
8047 __ Ptrue(p1.VnH(), SVE_MUL3);
8048 __ St4h(z31.VnH(),
8049 z0.VnH(),
8050 z1.VnH(),
8051 z2.VnH(),
8052 p1,
8053 SVEMemOperand(x0, 4, SVE_MUL_VL));
8054 // Save the stored values for ld4 tests.
8055 __ Dup(z7.VnH(), 0);
8056 __ Dup(z8.VnH(), 0);
8057 __ Dup(z9.VnH(), 0);
8058 __ Dup(z10.VnH(), 0);
8059 __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
8060 __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
8061 __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
8062 __ Mov(z10.VnH(), p1.Merging(), z2.VnH());
8063
8064 // Wrap around from z31 to z0.
8065 __ Index(z29.VnS(), 2, -7);
8066 __ Index(z30.VnS(), 3, -7);
8067 __ Index(z31.VnS(), 4, -7);
8068 __ Index(z0.VnS(), 5, -7);
8069 __ Ptrue(p2.VnS(), SVE_POW2);
8070 __ St4w(z29.VnS(),
8071 z30.VnS(),
8072 z31.VnS(),
8073 z0.VnS(),
8074 p2,
8075 SVEMemOperand(x0, -12, SVE_MUL_VL));
8076 // Save the stored values for ld4 tests.
8077 __ Dup(z11.VnS(), 0);
8078 __ Dup(z12.VnS(), 0);
8079 __ Dup(z13.VnS(), 0);
8080 __ Dup(z14.VnS(), 0);
8081 __ Mov(z11.VnS(), p2.Merging(), z29.VnS());
8082 __ Mov(z12.VnS(), p2.Merging(), z30.VnS());
8083 __ Mov(z13.VnS(), p2.Merging(), z31.VnS());
8084 __ Mov(z14.VnS(), p2.Merging(), z0.VnS());
8085
8086 __ Index(z20.VnD(), -7, 8);
8087 __ Index(z21.VnD(), -8, 8);
8088 __ Index(z22.VnD(), -9, 8);
8089 __ Index(z23.VnD(), -10, 8);
8090 // Sparse predication, including some irrelevant bits (0xee). To make the
8091 // results easy to check, activate each lane <n> where n is a multiple of 5.
8092 Initialise(&masm,
8093 p3,
8094 0xeee10000000001ee,
8095 0xeeeeeee100000000,
8096 0x01eeeeeeeee10000,
8097 0x000001eeeeeeeee1);
8098 __ St4d(z20.VnD(),
8099 z21.VnD(),
8100 z22.VnD(),
8101 z23.VnD(),
8102 p3,
8103 SVEMemOperand(x0, 16, SVE_MUL_VL));
8104 // Save the stored values for ld4 tests.
8105 __ Dup(z15.VnD(), 0);
8106 __ Dup(z16.VnD(), 0);
8107 __ Dup(z17.VnD(), 0);
8108 __ Dup(z18.VnD(), 0);
8109 __ Mov(z15.VnD(), p3.Merging(), z20.VnD());
8110 __ Mov(z16.VnD(), p3.Merging(), z21.VnD());
8111 __ Mov(z17.VnD(), p3.Merging(), z22.VnD());
8112 __ Mov(z18.VnD(), p3.Merging(), z23.VnD());
8113
8114 // Corresponding loads.
8115 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8116 __ Ld4b(z31.VnB(),
8117 z0.VnB(),
8118 z1.VnB(),
8119 z2.VnB(),
8120 p0.Zeroing(),
8121 SVEMemOperand(x0));
8122 __ Mov(z19, z31);
8123 __ Mov(z20, z0);
8124 __ Mov(z21, z1);
8125 __ Mov(z22, z2);
8126 __ Ld4h(z23.VnH(),
8127 z24.VnH(),
8128 z25.VnH(),
8129 z26.VnH(),
8130 p1.Zeroing(),
8131 SVEMemOperand(x0, 4, SVE_MUL_VL));
8132 __ Ld4w(z27.VnS(),
8133 z28.VnS(),
8134 z29.VnS(),
8135 z30.VnS(),
8136 p2.Zeroing(),
8137 SVEMemOperand(x0, -12, SVE_MUL_VL));
8138 // Wrap around from z31 to z0.
8139 __ Ld4d(z31.VnD(),
8140 z0.VnD(),
8141 z1.VnD(),
8142 z2.VnD(),
8143 p3.Zeroing(),
8144 SVEMemOperand(x0, 16, SVE_MUL_VL));
8145
8146 END();
8147
8148 if (CAN_RUN()) {
8149 RUN();
8150
8151 uint8_t* expected = new uint8_t[data_size];
8152 memset(expected, 0, data_size);
8153 uint8_t* middle = &expected[data_size / 2];
8154
8155 int vl_b = vl / kBRegSizeInBytes;
8156 int vl_h = vl / kHRegSizeInBytes;
8157 int vl_s = vl / kSRegSizeInBytes;
8158 int vl_d = vl / kDRegSizeInBytes;
8159
8160 int reg_count = 4;
8161
8162 // st4b { z10.b, z11.b, z12.b, z13.b }, SVE_ALL
8163 for (int i = 0; i < vl_b; i++) {
8164 uint8_t lane0 = 1 - (7 * i);
8165 uint8_t lane1 = 2 - (7 * i);
8166 uint8_t lane2 = 3 - (7 * i);
8167 uint8_t lane3 = 4 - (7 * i);
8168 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
8169 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
8170 MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
8171 MemoryWrite(middle, 0, (i * reg_count) + 3, lane3);
8172 }
8173
8174 // st4h { z31.h, z0.h, z1.h, z2.h }, SVE_MUL3
8175 int vl_h_mul3 = vl_h - (vl_h % 3);
8176 for (int i = 0; i < vl_h_mul3; i++) {
8177 int64_t offset = 4 * vl;
8178 uint16_t lane0 = -2 + (5 * i);
8179 uint16_t lane1 = -3 + (5 * i);
8180 uint16_t lane2 = -4 + (5 * i);
8181 uint16_t lane3 = -5 + (5 * i);
8182 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8183 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8184 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8185 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8186 }
8187
8188 // st4w { z29.s, z30.s, z31.s, z0.s }, SVE_POW2
8189 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
8190 for (int i = 0; i < vl_s_pow2; i++) {
8191 int64_t offset = -12 * vl;
8192 uint32_t lane0 = 2 - (7 * i);
8193 uint32_t lane1 = 3 - (7 * i);
8194 uint32_t lane2 = 4 - (7 * i);
8195 uint32_t lane3 = 5 - (7 * i);
8196 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8197 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8198 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8199 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8200 }
8201
8202 // st4d { z20.d, z21.d, z22.d, z23.d }, ((i % 5) == 0)
8203 for (int i = 0; i < vl_d; i++) {
8204 if ((i % 5) == 0) {
8205 int64_t offset = 16 * vl;
8206 uint64_t lane0 = -7 + (8 * i);
8207 uint64_t lane1 = -8 + (8 * i);
8208 uint64_t lane2 = -9 + (8 * i);
8209 uint64_t lane3 = -10 + (8 * i);
8210 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8211 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8212 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8213 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8214 }
8215 }
8216
8217 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8218
8219 // Check that we loaded back the expected values.
8220
8221 // st4b/ld4b
8222 ASSERT_EQUAL_SVE(z3, z19);
8223 ASSERT_EQUAL_SVE(z4, z20);
8224 ASSERT_EQUAL_SVE(z5, z21);
8225 ASSERT_EQUAL_SVE(z6, z22);
8226
8227 // st4h/ld4h
8228 ASSERT_EQUAL_SVE(z7, z23);
8229 ASSERT_EQUAL_SVE(z8, z24);
8230 ASSERT_EQUAL_SVE(z9, z25);
8231 ASSERT_EQUAL_SVE(z10, z26);
8232
8233 // st4w/ld4w
8234 ASSERT_EQUAL_SVE(z11, z27);
8235 ASSERT_EQUAL_SVE(z12, z28);
8236 ASSERT_EQUAL_SVE(z13, z29);
8237 ASSERT_EQUAL_SVE(z14, z30);
8238
8239 // st4d/ld4d
8240 ASSERT_EQUAL_SVE(z15, z31);
8241 ASSERT_EQUAL_SVE(z16, z0);
8242 ASSERT_EQUAL_SVE(z17, z1);
8243 ASSERT_EQUAL_SVE(z18, z2);
8244
8245 delete[] expected;
8246 }
8247 delete[] data;
8248 }
8249
8250 TEST_SVE(sve_ld4_st4_scalar_plus_scalar) {
8251 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8252 START();
8253
8254 int vl = config->sve_vl_in_bytes();
8255
8256 // Allocate plenty of space to enable indexing in both directions.
8257 int data_size = vl * 128;
8258
8259 uint8_t* data = new uint8_t[data_size];
8260 memset(data, 0, data_size);
8261
8262 // Set the base half-way through the buffer so we can use negative indices.
8263 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8264
8265 // We can test ld4 by comparing the values loaded with the values stored.
8266 // There are two complications:
8267 // - Loads have zeroing predication, so we have to clear the inactive
8268 // elements on our reference.
8269 // - We want to test both loads and stores that span { z31, z0 }, so we have
8270 // to move some values around.
8271 //
8272 // Registers z3-z18 will hold as-stored values (with inactive elements
8273 // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8274 // loaded.
8275
8276 __ Index(z19.VnB(), -4, 11);
8277 __ Index(z20.VnB(), -5, 11);
8278 __ Index(z21.VnB(), -6, 11);
8279 __ Index(z22.VnB(), -7, 11);
8280 __ Ptrue(p7.VnB(), SVE_MUL4);
8281 __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
8282 __ St4b(z19.VnB(),
8283 z20.VnB(),
8284 z21.VnB(),
8285 z22.VnB(),
8286 p7,
8287 SVEMemOperand(x0, x1, LSL, 0));
8288 // Save the stored values for ld4 tests.
8289 __ Dup(z3.VnB(), 0);
8290 __ Dup(z4.VnB(), 0);
8291 __ Dup(z5.VnB(), 0);
8292 __ Dup(z6.VnB(), 0);
8293 __ Mov(z3.VnB(), p7.Merging(), z19.VnB());
8294 __ Mov(z4.VnB(), p7.Merging(), z20.VnB());
8295 __ Mov(z5.VnB(), p7.Merging(), z21.VnB());
8296 __ Mov(z6.VnB(), p7.Merging(), z22.VnB());
8297
8298 __ Index(z23.VnH(), 6, -2);
8299 __ Index(z24.VnH(), 7, -2);
8300 __ Index(z25.VnH(), 8, -2);
8301 __ Index(z26.VnH(), 9, -2);
8302 __ Ptrue(p6.VnH(), SVE_VL16);
8303 __ Rdvl(x2, 7); // (7 * vl) << 1 = 14 * vl
8304 __ St4h(z23.VnH(),
8305 z24.VnH(),
8306 z25.VnH(),
8307 z26.VnH(),
8308 p6,
8309 SVEMemOperand(x0, x2, LSL, 1));
8310 // Save the stored values for ld4 tests.
8311 __ Dup(z7.VnH(), 0);
8312 __ Dup(z8.VnH(), 0);
8313 __ Dup(z9.VnH(), 0);
8314 __ Dup(z10.VnH(), 0);
8315 __ Mov(z7.VnH(), p6.Merging(), z23.VnH());
8316 __ Mov(z8.VnH(), p6.Merging(), z24.VnH());
8317 __ Mov(z9.VnH(), p6.Merging(), z25.VnH());
8318 __ Mov(z10.VnH(), p6.Merging(), z26.VnH());
8319
8320 // Wrap around from z31 to z0.
8321 __ Index(z29.VnS(), -6, 7);
8322 __ Index(z30.VnS(), -7, 7);
8323 __ Index(z31.VnS(), -8, 7);
8324 __ Index(z0.VnS(), -9, 7);
8325 // Sparse predication, including some irrelevant bits (0xe). To make the
8326 // results easy to check, activate each lane <n> where n is a multiple of 5.
8327 Initialise(&masm,
8328 p5,
8329 0xeee1000010000100,
8330 0x001eeee100001000,
8331 0x0100001eeee10000,
8332 0x10000100001eeee1);
8333 __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
8334 __ St4w(z29.VnS(),
8335 z30.VnS(),
8336 z31.VnS(),
8337 z0.VnS(),
8338 p5,
8339 SVEMemOperand(x0, x3, LSL, 2));
8340 // Save the stored values for ld4 tests.
8341 __ Dup(z11.VnS(), 0);
8342 __ Dup(z12.VnS(), 0);
8343 __ Dup(z13.VnS(), 0);
8344 __ Dup(z14.VnS(), 0);
8345 __ Mov(z11.VnS(), p5.Merging(), z29.VnS());
8346 __ Mov(z12.VnS(), p5.Merging(), z30.VnS());
8347 __ Mov(z13.VnS(), p5.Merging(), z31.VnS());
8348 __ Mov(z14.VnS(), p5.Merging(), z0.VnS());
8349
8350 __ Index(z31.VnD(), 32, -11);
8351 __ Index(z0.VnD(), 33, -11);
8352 __ Index(z1.VnD(), 34, -11);
8353 __ Index(z2.VnD(), 35, -11);
8354 __ Ptrue(p4.VnD(), SVE_MUL3);
8355 __ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
8356 __ St4d(z31.VnD(),
8357 z0.VnD(),
8358 z1.VnD(),
8359 z2.VnD(),
8360 p4,
8361 SVEMemOperand(x0, x4, LSL, 3));
8362 // Save the stored values for ld4 tests.
8363 __ Dup(z15.VnD(), 0);
8364 __ Dup(z16.VnD(), 0);
8365 __ Dup(z17.VnD(), 0);
8366 __ Dup(z18.VnD(), 0);
8367 __ Mov(z15.VnD(), p4.Merging(), z31.VnD());
8368 __ Mov(z16.VnD(), p4.Merging(), z0.VnD());
8369 __ Mov(z17.VnD(), p4.Merging(), z1.VnD());
8370 __ Mov(z18.VnD(), p4.Merging(), z2.VnD());
8371
8372 // Corresponding loads.
8373 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8374 __ Ld4b(z31.VnB(),
8375 z0.VnB(),
8376 z1.VnB(),
8377 z2.VnB(),
8378 p7.Zeroing(),
8379 SVEMemOperand(x0, x1, LSL, 0));
8380 __ Mov(z19, z31);
8381 __ Mov(z20, z0);
8382 __ Mov(z21, z1);
8383 __ Mov(z22, z2);
8384 __ Ld4h(z23.VnH(),
8385 z24.VnH(),
8386 z25.VnH(),
8387 z26.VnH(),
8388 p6.Zeroing(),
8389 SVEMemOperand(x0, x2, LSL, 1));
8390 __ Ld4w(z27.VnS(),
8391 z28.VnS(),
8392 z29.VnS(),
8393 z30.VnS(),
8394 p5.Zeroing(),
8395 SVEMemOperand(x0, x3, LSL, 2));
8396 // Wrap around from z31 to z0.
8397 __ Ld4d(z31.VnD(),
8398 z0.VnD(),
8399 z1.VnD(),
8400 z2.VnD(),
8401 p4.Zeroing(),
8402 SVEMemOperand(x0, x4, LSL, 3));
8403
8404 END();
8405
8406 if (CAN_RUN()) {
8407 RUN();
8408
8409 uint8_t* expected = new uint8_t[data_size];
8410 memset(expected, 0, data_size);
8411 uint8_t* middle = &expected[data_size / 2];
8412
8413 int vl_b = vl / kBRegSizeInBytes;
8414 int vl_h = vl / kHRegSizeInBytes;
8415 int vl_s = vl / kSRegSizeInBytes;
8416 int vl_d = vl / kDRegSizeInBytes;
8417
8418 int reg_count = 4;
8419
8420 // st4b { z19.b, z20.b, z21.b, z22.b }, SVE_MUL4
8421 int vl_b_mul4 = vl_b - (vl_b % 4);
8422 for (int i = 0; i < vl_b_mul4; i++) {
8423 int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
8424 uint8_t lane0 = -4 + (11 * i);
8425 uint8_t lane1 = -5 + (11 * i);
8426 uint8_t lane2 = -6 + (11 * i);
8427 uint8_t lane3 = -7 + (11 * i);
8428 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8429 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8430 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8431 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8432 }
8433
8434 // st4h { z23.h, z24.h, z25.h, z26.h }, SVE_VL16
8435 if (vl_h >= 16) {
8436 for (int i = 0; i < 16; i++) {
8437 int64_t offset = (7 << kHRegSizeInBytesLog2) * vl;
8438 uint16_t lane0 = 6 - (2 * i);
8439 uint16_t lane1 = 7 - (2 * i);
8440 uint16_t lane2 = 8 - (2 * i);
8441 uint16_t lane3 = 9 - (2 * i);
8442 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8443 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8444 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8445 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8446 }
8447 }
8448
8449 // st4w { z29.s, z30.s, z31.s, z0.s }, ((i % 5) == 0)
8450 for (int i = 0; i < vl_s; i++) {
8451 if ((i % 5) == 0) {
8452 int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
8453 uint32_t lane0 = -6 + (7 * i);
8454 uint32_t lane1 = -7 + (7 * i);
8455 uint32_t lane2 = -8 + (7 * i);
8456 uint32_t lane3 = -9 + (7 * i);
8457 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8458 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8459 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8460 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8461 }
8462 }
8463
8464 // st4d { z31.d, z0.d, z1.d, z2.d }, SVE_MUL3
8465 int vl_d_mul3 = vl_d - (vl_d % 3);
8466 for (int i = 0; i < vl_d_mul3; i++) {
8467 int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
8468 uint64_t lane0 = 32 - (11 * i);
8469 uint64_t lane1 = 33 - (11 * i);
8470 uint64_t lane2 = 34 - (11 * i);
8471 uint64_t lane3 = 35 - (11 * i);
8472 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8473 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8474 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8475 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8476 }
8477
8478 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8479
8480 // Check that we loaded back the expected values.
8481
8482 // st4b/ld4b
8483 ASSERT_EQUAL_SVE(z3, z19);
8484 ASSERT_EQUAL_SVE(z4, z20);
8485 ASSERT_EQUAL_SVE(z5, z21);
8486 ASSERT_EQUAL_SVE(z6, z22);
8487
8488 // st4h/ld4h
8489 ASSERT_EQUAL_SVE(z7, z23);
8490 ASSERT_EQUAL_SVE(z8, z24);
8491 ASSERT_EQUAL_SVE(z9, z25);
8492 ASSERT_EQUAL_SVE(z10, z26);
8493
8494 // st4w/ld4w
8495 ASSERT_EQUAL_SVE(z11, z27);
8496 ASSERT_EQUAL_SVE(z12, z28);
8497 ASSERT_EQUAL_SVE(z13, z29);
8498 ASSERT_EQUAL_SVE(z14, z30);
8499
8500 // st4d/ld4d
8501 ASSERT_EQUAL_SVE(z15, z31);
8502 ASSERT_EQUAL_SVE(z16, z0);
8503 ASSERT_EQUAL_SVE(z17, z1);
8504 ASSERT_EQUAL_SVE(z18, z2);
8505
8506 delete[] expected;
8507 }
8508 delete[] data;
8509 }
8510
8511 TEST_SVE(sve_ld234_st234_scalar_plus_scalar_sp) {
8512 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8513 START();
8514
8515 // Check that the simulator correctly interprets rn == 31 as sp.
8516 // The indexing logic is the same regardless so we just check one load and
8517 // store of each type.
8518
8519 // There are no pre- or post-indexing modes, so reserve space first.
8520 __ ClaimVL(2 + 3 + 4);
8521
8522 __ Index(z0.VnB(), 42, 2);
8523 __ Index(z1.VnB(), 43, 2);
8524 __ Ptrue(p0.VnB(), SVE_VL7);
8525 __ Rdvl(x0, 0);
8526 __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, x0));
8527
8528 __ Index(z4.VnH(), 42, 3);
8529 __ Index(z5.VnH(), 43, 3);
8530 __ Index(z6.VnH(), 44, 3);
8531 __ Ptrue(p1.VnH(), SVE_POW2);
8532 __ Rdvl(x1, 2);
8533 __ Lsr(x1, x1, 1);
8534 __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, x1, LSL, 1));
8535
8536 __ Index(z8.VnS(), 42, 4);
8537 __ Index(z9.VnS(), 43, 4);
8538 __ Index(z10.VnS(), 44, 4);
8539 __ Index(z11.VnS(), 45, 4);
8540 __ Ptrue(p2.VnS());
8541 __ Rdvl(x2, 2 + 3);
8542 __ Lsr(x2, x2, 2);
8543 __ St4w(z8.VnS(),
8544 z9.VnS(),
8545 z10.VnS(),
8546 z11.VnS(),
8547 p2,
8548 SVEMemOperand(sp, x2, LSL, 2));
8549
8550 // Corresponding loads.
8551 // We have to explicitly zero inactive lanes in the reference values because
8552 // loads have zeroing predication.
8553 __ Dup(z12.VnB(), 0);
8554 __ Dup(z13.VnB(), 0);
8555 __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8556 __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8557 __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, x0));
8558
8559 __ Dup(z16.VnH(), 0);
8560 __ Dup(z17.VnH(), 0);
8561 __ Dup(z18.VnH(), 0);
8562 __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8563 __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8564 __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8565 __ Ld3h(z4.VnH(),
8566 z5.VnH(),
8567 z6.VnH(),
8568 p1.Zeroing(),
8569 SVEMemOperand(sp, x1, LSL, 1));
8570
8571 __ Dup(z20.VnS(), 0);
8572 __ Dup(z21.VnS(), 0);
8573 __ Dup(z22.VnS(), 0);
8574 __ Dup(z23.VnS(), 0);
8575 __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8576 __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8577 __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8578 __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8579 __ Ld4w(z8.VnS(),
8580 z9.VnS(),
8581 z10.VnS(),
8582 z11.VnS(),
8583 p2.Zeroing(),
8584 SVEMemOperand(sp, x2, LSL, 2));
8585
8586 __ DropVL(2 + 3 + 4);
8587
8588 END();
8589
8590 if (CAN_RUN()) {
8591 RUN();
8592
8593 // The most likely failure mode is that the simulator reads sp as xzr and
8594 // crashes on execution. We already test the address calculations separately
8595 // and sp doesn't change this, so just test that we load the values we
8596 // stored.
8597
8598 // st2b/ld2b
8599 ASSERT_EQUAL_SVE(z0, z12);
8600 ASSERT_EQUAL_SVE(z1, z13);
8601
8602 // st3h/ld3h
8603 ASSERT_EQUAL_SVE(z4, z16);
8604 ASSERT_EQUAL_SVE(z5, z17);
8605 ASSERT_EQUAL_SVE(z6, z18);
8606
8607 // st4w/ld4w
8608 ASSERT_EQUAL_SVE(z8, z20);
8609 ASSERT_EQUAL_SVE(z9, z21);
8610 ASSERT_EQUAL_SVE(z10, z22);
8611 ASSERT_EQUAL_SVE(z11, z23);
8612 }
8613 }
8614
8615 TEST_SVE(sve_ld234_st234_scalar_plus_imm_sp) {
8616 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8617 START();
8618
8619 // Check that the simulator correctly interprets rn == 31 as sp.
8620 // The indexing logic is the same regardless so we just check one load and
8621 // store of each type.
8622
8623 // There are no pre- or post-indexing modes, so reserve space first.
8624 // Note that the stores fill in an order that allows each immediate to be a
8625 // multiple of the number of registers.
8626 __ ClaimVL(4 + 2 + 3);
8627
8628 __ Index(z0.VnB(), 42, 2);
8629 __ Index(z1.VnB(), 43, 2);
8630 __ Ptrue(p0.VnB(), SVE_POW2);
8631 __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, 4, SVE_MUL_VL));
8632
8633 __ Index(z4.VnH(), 42, 3);
8634 __ Index(z5.VnH(), 43, 3);
8635 __ Index(z6.VnH(), 44, 3);
8636 __ Ptrue(p1.VnH(), SVE_VL7);
8637 __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, 6, SVE_MUL_VL));
8638
8639 __ Index(z8.VnS(), 42, 4);
8640 __ Index(z9.VnS(), 43, 4);
8641 __ Index(z10.VnS(), 44, 4);
8642 __ Index(z11.VnS(), 45, 4);
8643 __ Ptrue(p2.VnS());
8644 __ St4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2, SVEMemOperand(sp));
8645
8646 // Corresponding loads.
8647 // We have to explicitly zero inactive lanes in the reference values because
8648 // loads have zeroing predication.
8649 __ Dup(z12.VnB(), 0);
8650 __ Dup(z13.VnB(), 0);
8651 __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8652 __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8653 __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, 4, SVE_MUL_VL));
8654
8655 __ Dup(z16.VnH(), 0);
8656 __ Dup(z17.VnH(), 0);
8657 __ Dup(z18.VnH(), 0);
8658 __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8659 __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8660 __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8661 __ Ld3h(z4.VnH(),
8662 z5.VnH(),
8663 z6.VnH(),
8664 p1.Zeroing(),
8665 SVEMemOperand(sp, 6, SVE_MUL_VL));
8666
8667 __ Dup(z20.VnS(), 0);
8668 __ Dup(z21.VnS(), 0);
8669 __ Dup(z22.VnS(), 0);
8670 __ Dup(z23.VnS(), 0);
8671 __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8672 __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8673 __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8674 __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8675 __ Ld4w(z8.VnS(),
8676 z9.VnS(),
8677 z10.VnS(),
8678 z11.VnS(),
8679 p2.Zeroing(),
8680 SVEMemOperand(sp));
8681
8682 __ DropVL(4 + 2 + 3);
8683
8684 END();
8685
8686 if (CAN_RUN()) {
8687 RUN();
8688
8689 // The most likely failure mode is that the simulator reads sp as xzr and
8690 // crashes on execution. We already test the address calculations separately
8691 // and sp doesn't change this, so just test that we load the values we
8692 // stored.
8693 // TODO: Check the loaded values against the references in z12-z23, as in
// the scalar-plus-scalar test above.
8694 }
8695 }
8696
8697 // Fill the input buffer with arbitrary data. Also assign a random offset from
8698 // the buffer's base address to each lane and, if the optional arguments are
8699 // provided, record the corresponding absolute addresses and their maximum.
8700 static void BufferFillingHelper(uint64_t data_ptr,
8701 size_t buffer_size,
8702 unsigned lane_size_in_bytes,
8703 int lane_count,
8704 uint64_t* offsets,
8705 uint64_t* addresses = nullptr,
8706 uint64_t* max_address = nullptr) {
8707 // Use a fixed seed for nrand48() so that test runs are reproducible.
8708 unsigned short seed[3] = {1, 2, 3}; // NOLINT(runtime/int)
8709
8710 // Fill a buffer with arbitrary data.
8711 for (size_t i = 0; i < buffer_size; i++) {
8712 uint8_t byte = nrand48(seed) & 0xff;
8713 memcpy(reinterpret_cast<void*>(data_ptr + i), &byte, 1);
8714 }
8715
8716 if (max_address != nullptr) {
8717 *max_address = 0;
8718 }
8719
8720 // Vectors of random addresses and offsets into the buffer.
8721 for (int i = 0; i < lane_count; i++) {
8722 uint64_t rnd = nrand48(seed);
8723 // Limit the range to the set of completely-accessible elements in memory.
8724 offsets[i] = rnd % (buffer_size - lane_size_in_bytes);
8725 if ((addresses != nullptr) && (max_address != nullptr)) {
8726 addresses[i] = data_ptr + offsets[i];
8727 *max_address = std::max(*max_address, addresses[i]);
8728 }
8729 }
8730 }
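// Illustrative call (the lane count of four and the array sizes here are
// hypothetical; real tests derive them from the vector length, with `data` and
// `data_size` set up by the calling test):
//
//   uint64_t offsets[4];
//   uint64_t addresses[4];
//   uint64_t max_address = 0;
//   BufferFillingHelper(data, data_size, kSRegSizeInBytes, 4,
//                       offsets, addresses, &max_address);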
8731
8732 static void ScalarLoadHelper(MacroAssembler* masm,
8733 Register dst,
8734 Register addr,
8735 int msize_in_bits,
8736 bool is_signed) {
8737 if (is_signed) {
8738 switch (msize_in_bits) {
8739 case kBRegSize:
8740 masm->Ldrsb(dst, MemOperand(addr));
8741 break;
8742 case kHRegSize:
8743 masm->Ldrsh(dst, MemOperand(addr));
8744 break;
8745 case kWRegSize:
8746 masm->Ldrsw(dst, MemOperand(addr));
8747 break;
8748 default:
8749 VIXL_UNIMPLEMENTED();
8750 break;
8751 }
8752 } else {
8753 switch (msize_in_bits) {
8754 case kBRegSize:
8755 masm->Ldrb(dst, MemOperand(addr));
8756 break;
8757 case kHRegSize:
8758 masm->Ldrh(dst, MemOperand(addr));
8759 break;
8760 case kWRegSize:
8761 masm->Ldr(dst.W(), MemOperand(addr));
8762 break;
8763 case kXRegSize:
8764 masm->Ldr(dst, MemOperand(addr));
8765 break;
8766 default:
8767 VIXL_UNIMPLEMENTED();
8768 break;
8769 }
8770 }
8771 }
8772
8773 // Generate a reference result using scalar loads.
8774 // For now this helper doesn't save and restore the caller's registers.
8775 // It clobbers z30, x28, x29 and p7.
8776 template <size_t N>
8777 static void ScalarLoadHelper(MacroAssembler* masm,
8778 int vl,
8779 const uint64_t (&addresses)[N],
8780 const ZRegister& zt_ref,
8781 const PRegisterZ& pg,
8782 unsigned esize_in_bits,
8783 unsigned msize_in_bits,
8784 bool is_signed) {
8785 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8786 ZRegister lane_numbers = z30.WithLaneSize(esize_in_bits);
8787 masm->Index(lane_numbers, 0, 1);
8788 masm->Dup(zt_ref, 0);
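// lane_numbers holds 0, 1, 2, ..., so the Cmpeq below leaves (at most) lane i
// set in p7, i.e. the intersection of pg with lane i; Cpy then merges the
// scalar result into just that lane of the reference vector.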
8789 for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
8790 masm->Mov(x29, addresses[N - i - 1]);
8791 Register rt(28, std::min(std::max(esize_in_bits, kSRegSize), kDRegSize));
8792 ScalarLoadHelper(masm, rt, x29, msize_in_bits, is_signed);
8793
8794 // Emulate predication.
8795 masm->Cmpeq(p7.WithLaneSize(esize_in_bits), pg, lane_numbers, i);
8796 masm->Cpy(zt_ref, p7.Merging(), rt);
8797 }
8798 }
8799
8800 typedef void (MacroAssembler::*Ld1Macro)(const ZRegister& zt,
8801 const PRegisterZ& pg,
8802 const SVEMemOperand& addr);
8803
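// Illustrative use of the pointer-to-member type (mirroring the calls below):
//
//   Ld1Macro op = &MacroAssembler::Ldff1b;
//   (masm.*op)(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));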
8804 template <typename T>
8805 static void Ldff1Helper(Test* config,
8806 uintptr_t data,
8807 unsigned msize_in_bits,
8808 unsigned esize_in_bits,
8809 CPURegister::RegisterType base_type,
8810 Ld1Macro ldff1,
8811 Ld1Macro ld1,
8812 T mod,
8813 bool scale = false) {
8814 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8815 START();
8816
8817 int vl = config->sve_vl_in_bytes();
8818 size_t page_size = sysconf(_SC_PAGE_SIZE);
8819 VIXL_ASSERT(page_size > static_cast<size_t>(vl));
8820
8821 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8822 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
8823 unsigned msize_in_bytes_log2 = std::log2(msize_in_bytes);
8824 VIXL_ASSERT(msize_in_bits <= esize_in_bits);
8825
8826 PRegister all = p7;
8827 __ Ptrue(all.VnB());
8828
8829 size_t offset_modifier = 0;
8830
8831 // The highest address at which a load stopped. Every FF load should fault at
8832 // `data + page_size`, so this value should not exceed that value. However,
8833 // the architecture allows fault-tolerant loads to fault arbitrarily, so the
8834 // real value may be lower.
8835 //
8836 // This is used to check that the `mprotect` performed by the calling test really
8837 // does make the second page inaccessible, and that the FFR from each load reflects that.
8838 Register limit = x22;
8839 __ Mov(limit, 0);
8840
8841 // If the FFR grows unexpectedly, we increment this register by the
8842 // difference. FFR should never grow, except when explicitly set.
8843 Register ffr_grow_count = x23;
8844 __ Mov(ffr_grow_count, 0);
8845
8846 // Set the offset so that the load is guaranteed to start in the
8847 // accessible page, but end in the inaccessible one.
8848 VIXL_ASSERT((page_size % msize_in_bytes) == 0);
8849 VIXL_ASSERT((vl % msize_in_bytes) == 0);
8850 size_t elements_per_page = page_size / msize_in_bytes;
8851 size_t elements_per_access = vl / esize_in_bytes;
8852 size_t min_offset = (elements_per_page - elements_per_access) + 1;
8853 size_t max_offset = elements_per_page - 1;
8854 size_t offset =
8855 min_offset + (offset_modifier % (max_offset - min_offset + 1));
8856 offset_modifier++;
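// For example, with a 4096-byte page, a 16-byte VL and byte elements,
// elements_per_page is 4096 and elements_per_access is 16, so `offset` lies
// in [4081, 4095] and the access is guaranteed to cross into the second page.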
8857
8858 __ Setffr();
8859 __ Mov(x20, data);
8860 __ Mov(x21, offset);
8861
8862 if (base_type == CPURegister::kRegister) {
8863 // Scalar-plus-scalar mode.
8864 VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8865 VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8866 (static_cast<int>(mod) == NO_SHIFT));
8867 (masm.*ldff1)(z0.WithLaneSize(esize_in_bits),
8868 all.Zeroing(),
8869 SVEMemOperand(x20, x21, mod, msize_in_bytes_log2));
8870 } else {
8871 VIXL_ASSERT(base_type == CPURegister::kZRegister);
8872 int offs_size;
8873 bool offs_is_unsigned;
8874 if (std::is_same<T, vixl::aarch64::Extend>::value) {
8875 // Scalar-plus-vector mode with a 32-bit offset (packed or unpacked), either
8876 // unscaled or scaled.
8877 VIXL_ASSERT((static_cast<int>(mod) == SXTW) ||
8878 (static_cast<int>(mod) == UXTW));
8879 if (scale == true) {
8880 // Gather first-fault byte loads don't support a scaled offset.
8881 VIXL_ASSERT(msize_in_bits != kBRegSize);
8882 }
8883 offs_is_unsigned = (static_cast<int>(mod) == UXTW) ? true : false;
8884 offs_size = kSRegSize;
8885
8886 } else {
8887 // Scalar-plus-vector mode with 64-bit unscaled or scaled offset.
8888 VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8889 VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8890 (static_cast<int>(mod) == NO_SHIFT));
8891 offs_is_unsigned = false;
8892 offs_size = kDRegSize;
8893 }
8894
8895 // Generate the pattern "base address + (index << shift)".
8896 // For unscaled-offset operations, step the per-lane offsets by `msize_in_bytes`
8897 // so that consecutive lanes access consecutive elements; otherwise, step the
8898 // indices by 1 and let the scaling shift provide the element size.
8899 int shift = (scale == true) ? msize_in_bytes_log2 : 0;
8900 int index_offset = msize_in_bytes >> shift;
8901 VIXL_ASSERT(index_offset > 0);
8902 uint64_t index = 0;
8903 uint64_t base_address = 0;
8904
8905 if (offs_is_unsigned == true) {
8906 // Base address.
8907 base_address = data;
8908 // Maximum unsigned positive index.
8909 index = page_size >> shift;
8910
8911 } else {
8912 // Base address.
8913 base_address = data + (2 * page_size);
8914 // Maximum unsigned positive index.
8915 uint64_t uint_e_max =
8916 (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
8917 index = uint_e_max - (page_size >> shift) + 1;
8918 }
8919
8920 __ Mov(x19, base_address);
8921 if ((offs_size == kSRegSize) && (esize_in_bits == kDRegSize)) {
8922 // In this case, the index values are sign- or zero-extended from 32 to
8923 // 64 bits, so assign an arbitrary value to the top 32 bits to check that
8924 // only the low 32 bits are used as the index.
8925 index |= 0x1234567800000000;
8926 }
8927
8928 index -= index_offset * (elements_per_access - 1);
8929 __ Index(z17.WithLaneSize(esize_in_bits), index, index_offset);
8930
8931 // Scalar plus vector mode.
8932 (masm.*
8933 ldff1)(z0.WithLaneSize(esize_in_bits),
8934 all.Zeroing(),
8935 SVEMemOperand(x19, z17.WithLaneSize(esize_in_bits), mod, shift));
8936 }
8937
8938 __ Rdffrs(p0.VnB(), all.Zeroing());
8939
8940 // Execute another Ldff1 with no offset, so that every element could be
8941 // read. It should respect FFR, and load no more than we loaded the
8942 // first time.
8943 (masm.*
8944 ldff1)(z16.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x20));
8945 __ Rdffrs(p1.VnB(), all.Zeroing());
8946 __ Cntp(x0, all, p1.VnB());
8947 __ Uqdecp(x0, p0.VnB());
8948 __ Add(ffr_grow_count, ffr_grow_count, x0);
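// Uqdecp saturates at zero, so x0 = max(count(p1) - count(p0), 0) and
// ffr_grow_count only increases if the second load's FFR covers more lanes
// than the first.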
8949
8950 // Use the FFR to predicate the normal load. If it wasn't properly set,
8951 // the normal load will abort.
8952 (masm.*ld1)(z16.WithLaneSize(esize_in_bits),
8953 p0.Zeroing(),
8954 SVEMemOperand(x20, x21, LSL, msize_in_bytes_log2));
8955
8956 // Work out the address after the one that was just accessed.
8957 __ Incp(x21, p0.WithLaneSize(esize_in_bits));
8958 __ Add(x0, x20, Operand(x21, LSL, msize_in_bytes_log2));
8959 __ Cmp(limit, x0);
8960 __ Csel(limit, limit, x0, hs);
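// That is, limit = max(limit, x0) (unsigned), recording the highest
// end-of-access address.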
8961
8962 // Clear lanes inactive in FFR. These have an undefined result.
8963 __ Not(p0.VnB(), all.Zeroing(), p0.VnB());
8964 __ Mov(z0.WithLaneSize(esize_in_bits), p0.Merging(), 0);
8965
8966 END();
8967
8968 if (CAN_RUN()) {
8969 RUN();
8970
8971 uintptr_t expected_limit = data + page_size;
8972 uintptr_t measured_limit = core.xreg(limit.GetCode());
8973 VIXL_CHECK(measured_limit <= expected_limit);
8974 if (measured_limit < expected_limit) {
8975 // We can't fail the test for this case, but a warning is helpful for
8976 // manually-run tests.
8977 printf(
8978 "WARNING: All fault-tolerant loads detected faults before the\n"
8979 "expected limit. This is architecturally possible, but improbable,\n"
8980 "and could be a symptom of another problem.\n");
8981 }
8982
8983 ASSERT_EQUAL_64(0, ffr_grow_count);
8984
8985 ASSERT_EQUAL_SVE(z0.WithLaneSize(esize_in_bits),
8986 z16.WithLaneSize(esize_in_bits));
8987 }
8988 }
8989
8990 TEST_SVE(sve_ldff1_scalar_plus_scalar) {
8991 size_t page_size = sysconf(_SC_PAGE_SIZE);
8992 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
8993
8994 // Allocate two pages, then mprotect the second one to make it inaccessible.
8995 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
8996 page_size * 2,
8997 PROT_READ | PROT_WRITE,
8998 MAP_PRIVATE | MAP_ANONYMOUS,
8999 -1,
9000 0));
9001 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9002
9003 // Fill the accessible page with arbitrary data.
9004 for (size_t i = 0; i < page_size; i++) {
9005 // Reverse bits so we get a mixture of positive and negative values.
9006 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9007 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9008 }
9009
9010 auto ldff1_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
9011 config,
9012 data,
9013 std::placeholders::_1,
9014 std::placeholders::_2,
9015 CPURegister::kRegister,
9016 std::placeholders::_3,
9017 std::placeholders::_4,
9018 NO_SHIFT,
9019 false);
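// The bound helper's remaining arguments are, in order: message size, element
// size, the first-fault load macro and the matching normal load macro
// (placeholders _1 to _4); the base type, shift mode and scaling are fixed.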
9020
9021 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9022 Ld1Macro ld1b = &MacroAssembler::Ld1b;
9023 ldff1_unscaled_offset_helper(kBRegSize, kBRegSize, ldff1b, ld1b);
9024 ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1b, ld1b);
9025 ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1b, ld1b);
9026 ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1b, ld1b);
9027
9028 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9029 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9030 ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1sb, ld1sb);
9031 ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1sb, ld1sb);
9032 ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1sb, ld1sb);
9033
9034 auto ldff1_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
9035 config,
9036 data,
9037 std::placeholders::_1,
9038 std::placeholders::_2,
9039 CPURegister::kRegister,
9040 std::placeholders::_3,
9041 std::placeholders::_4,
9042 LSL,
9043 true);
9044
9045 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9046 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9047 ldff1_scaled_offset_helper(kHRegSize, kHRegSize, ldff1h, ld1h);
9048 ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1h, ld1h);
9049 ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1h, ld1h);
9050
9051 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9052 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9053 ldff1_scaled_offset_helper(kSRegSize, kSRegSize, ldff1w, ld1w);
9054 ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1w, ld1w);
9055
9056 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9057 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9058 ldff1_scaled_offset_helper(kDRegSize, kDRegSize, ldff1d, ld1d);
9059
9060 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9061 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9062 ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1sh, ld1sh);
9063 ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1sh, ld1sh);
9064
9065 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9066 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9067 ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1sw, ld1sw);
9068
9069 munmap(reinterpret_cast<void*>(data), page_size * 2);
9070 }
9071
9072 static void sve_ldff1_scalar_plus_vector_32_scaled_offset(Test* config,
9073 uintptr_t data) {
9074 auto ldff1_32_scaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
9075 config,
9076 data,
9077 std::placeholders::_1,
9078 kSRegSize,
9079 CPURegister::kZRegister,
9080 std::placeholders::_2,
9081 std::placeholders::_3,
9082 std::placeholders::_4,
9083 true);
9084 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9085 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9086 ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9087 ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
9088
9089 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9090 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9091 ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9092 ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
9093
9094 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9095 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9096 ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9097 ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
9098 }
9099
9100 static void sve_ldff1_scalar_plus_vector_32_unscaled_offset(Test* config,
9101 uintptr_t data) {
9102 auto ldff1_32_unscaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
9103 config,
9104 data,
9105 std::placeholders::_1,
9106 kSRegSize,
9107 CPURegister::kZRegister,
9108 std::placeholders::_2,
9109 std::placeholders::_3,
9110 std::placeholders::_4,
9111 false);
9112
9113 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9114 Ld1Macro ld1b = &MacroAssembler::Ld1b;
9115 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9116 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
9117
9118 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9119 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9120 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9121 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
9122
9123 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9124 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9125 ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9126 ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
9127
9128 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9129 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9130 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9131 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
9132
9133 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9134 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9135 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9136 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
9137 }
9138
9139 static void sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(
9140 Test* config, uintptr_t data) {
9141 auto ldff1_32_unpacked_scaled_offset_helper =
9142 std::bind(&Ldff1Helper<Extend>,
9143 config,
9144 data,
9145 std::placeholders::_1,
9146 kDRegSize,
9147 CPURegister::kZRegister,
9148 std::placeholders::_2,
9149 std::placeholders::_3,
9150 std::placeholders::_4,
9151 true);
9152
9153 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9154 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9155 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9156 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
9157
9158 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9159 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9160 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9161 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
9162
9163 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9164 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9165 ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9166 ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
9167
9168 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9169 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9170 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9171 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
9172
9173 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9174 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9175 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9176 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
9177 }
9178
9179 static void sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(
9180 Test* config, uintptr_t data) {
9181 auto ldff1_32_unpacked_unscaled_offset_helper =
9182 std::bind(&Ldff1Helper<Extend>,
9183 config,
9184 data,
9185 std::placeholders::_1,
9186 kDRegSize,
9187 CPURegister::kZRegister,
9188 std::placeholders::_2,
9189 std::placeholders::_3,
9190 std::placeholders::_4,
9191 false);
9192
9193 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9194 Ld1Macro ld1b = &MacroAssembler::Ld1b;
9195 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9196 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
9197
9198 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9199 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9200 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9201 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
9202
9203 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9204 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9205 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9206 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
9207
9208 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9209 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9210 ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9211 ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
9212
9213 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9214 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9215 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9216 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
9217
9218 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9219 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9220 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9221 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
9222
9223 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9224 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9225 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9226 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
9227 }
9228
9229 static void sve_ldff1_scalar_plus_vector_64_scaled_offset(Test* config,
9230 uintptr_t data) {
9231 auto ldff1_64_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
9232 config,
9233 data,
9234 std::placeholders::_1,
9235 kDRegSize,
9236 CPURegister::kZRegister,
9237 std::placeholders::_2,
9238 std::placeholders::_3,
9239 LSL,
9240 true);
9241
9242 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9243 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9244 ldff1_64_scaled_offset_helper(kHRegSize, ldff1h, ld1h);
9245
9246 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9247 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9248 ldff1_64_scaled_offset_helper(kSRegSize, ldff1w, ld1w);
9249
9250 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9251 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9252 ldff1_64_scaled_offset_helper(kDRegSize, ldff1d, ld1d);
9253
9254 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9255 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9256 ldff1_64_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9257
9258 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9259 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9260 ldff1_64_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9261 }
9262
9263 static void sve_ldff1_scalar_plus_vector_64_unscaled_offset(Test* config,
9264 uintptr_t data) {
9265 auto ldff1_64_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
9266 config,
9267 data,
9268 std::placeholders::_1,
9269 kDRegSize,
9270 CPURegister::kZRegister,
9271 std::placeholders::_2,
9272 std::placeholders::_3,
9273 NO_SHIFT,
9274 false);
9275
9276 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9277 Ld1Macro ld1b = &MacroAssembler::Ld1b;
9278 ldff1_64_unscaled_offset_helper(kBRegSize, ldff1b, ld1b);
9279
9280 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9281 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9282 ldff1_64_unscaled_offset_helper(kHRegSize, ldff1h, ld1h);
9283
9284 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9285 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9286 ldff1_64_unscaled_offset_helper(kSRegSize, ldff1w, ld1w);
9287
9288 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9289 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9290 ldff1_64_unscaled_offset_helper(kDRegSize, ldff1d, ld1d);
9291
9292 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9293 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9294 ldff1_64_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb);
9295
9296 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9297 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9298 ldff1_64_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9299
9300 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9301 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9302 ldff1_64_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9303 }
9304
9305 TEST_SVE(sve_ldff1_scalar_plus_vector) {
9306 size_t page_size = sysconf(_SC_PAGE_SIZE);
9307 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9308
9309 // Allocate two pages, then mprotect the second one to make it inaccessible.
9310 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9311 page_size * 2,
9312 PROT_READ | PROT_WRITE,
9313 MAP_PRIVATE | MAP_ANONYMOUS,
9314 -1,
9315 0));
9316 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9317
9318 // Fill the accessible page with arbitrary data.
9319 for (size_t i = 0; i < page_size; i++) {
9320 // Reverse bits so we get a mixture of positive and negative values.
9321 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9322 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9323 }
9324
9325 sve_ldff1_scalar_plus_vector_32_scaled_offset(config, data);
9326 sve_ldff1_scalar_plus_vector_32_unscaled_offset(config, data);
9327 sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(config, data);
9328 sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(config, data);
9329 sve_ldff1_scalar_plus_vector_64_scaled_offset(config, data);
9330 sve_ldff1_scalar_plus_vector_64_unscaled_offset(config, data);
9331
9332 munmap(reinterpret_cast<void*>(data), page_size * 2);
9333 }
9334
TEST_SVE(sve_ldnf1) {
9336 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
9337 CPUFeatures::kNEON,
9338 CPUFeatures::kFP);
9339 START();
9340
9341 size_t page_size = sysconf(_SC_PAGE_SIZE);
9342 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9343
9344 // Allocate two pages, fill them with data, then mprotect the second one to
9345 // make it inaccessible.
9346 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9347 page_size * 2,
9348 PROT_READ | PROT_WRITE,
9349 MAP_PRIVATE | MAP_ANONYMOUS,
9350 -1,
9351 0));
9352
9353 // Fill the pages with arbitrary data.
9354 for (size_t i = 0; i < page_size; i++) {
9355 // Reverse bits so we get a mixture of positive and negative values.
9356 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9357 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9358 }
9359
9360 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9361
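  // Set all FFR bits, activate every lane in p0, and prepare a zero vector
  // (z10) that is used below to clear lanes that the non-faulting loads leave
  // undefined.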
9362 __ Setffr();
9363 __ Ptrue(p0.VnB());
9364 __ Dup(z10.VnB(), 0);
9365
  // Point x0 at the last eight unprotected bytes of the first page.
9367 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kBRegSizeInBytes) / 2);
9368
  // Perform a non-faulting load of a vector of bytes from x0. At most eight
  // bytes can be loaded; the remaining lanes lie in the protected page.
9371 __ Ldnf1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
9372 __ Rdffr(p1.VnB());
9373 __ Setffr();
9374
9375 // Create references using the FFR value in p1 to zero the undefined lanes.
9376 __ Sel(z0.VnB(), p1, z0.VnB(), z10.VnB());
9377 __ Ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0));
9378
9379 // Repeat for larger elements and different addresses, giving different FFR
9380 // results.
9381 __ Add(x1, x0, 1);
9382 __ Ldnf1h(z1.VnH(), p0.Zeroing(), SVEMemOperand(x1));
9383 __ Rdffr(p1.VnB());
9384 __ Setffr();
9385 __ Sel(z1.VnH(), p1, z1.VnH(), z10.VnH());
9386 __ Ld1h(z21.VnH(), p1.Zeroing(), SVEMemOperand(x1));
9387
9388 __ Add(x1, x0, 2);
9389 __ Ldnf1w(z2.VnS(), p0.Zeroing(), SVEMemOperand(x1));
9390 __ Rdffr(p1.VnB());
9391 __ Setffr();
9392 __ Sel(z2.VnS(), p1, z2.VnS(), z10.VnS());
9393 __ Ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x1));
9394
9395 __ Sub(x1, x0, 1);
9396 __ Ldnf1d(z3.VnD(), p0.Zeroing(), SVEMemOperand(x1));
9397 __ Rdffr(p1.VnB());
9398 __ Setffr();
9399 __ Sel(z3.VnD(), p1, z3.VnD(), z10.VnD());
9400 __ Ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x1));
9401
9402 // Load from previous VL-sized area of memory. All of this should be in the
9403 // accessible page.
9404 __ Ldnf1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9405 __ Rdffr(p1.VnB());
9406 __ Setffr();
9407 __ Sel(z4.VnB(), p1, z4.VnB(), z10.VnB());
9408 __ Ld1b(z24.VnB(), p1.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9409
9410 // Repeat partial load for larger element size.
9411 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kSRegSizeInBytes) / 2);
9412 __ Ldnf1b(z5.VnS(), p0.Zeroing(), SVEMemOperand(x0));
9413 __ Rdffr(p1.VnB());
9414 __ Setffr();
9415 __ Sel(z5.VnS(), p1, z5.VnS(), z10.VnS());
9416 __ Ld1b(z25.VnS(), p1.Zeroing(), SVEMemOperand(x0));
9417
9418 // Repeat for sign extension.
9419 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kHRegSizeInBytes) / 2);
9420 __ Ldnf1sb(z6.VnH(), p0.Zeroing(), SVEMemOperand(x0));
9421 __ Rdffr(p1.VnB());
9422 __ Setffr();
9423 __ Sel(z6.VnH(), p1, z6.VnH(), z10.VnH());
9424 __ Ld1sb(z26.VnH(), p1.Zeroing(), SVEMemOperand(x0));
9425
9426 END();
9427
9428 if (CAN_RUN()) {
9429 RUN();
9430 ASSERT_EQUAL_SVE(z20, z0);
9431 ASSERT_EQUAL_SVE(z21, z1);
9432 ASSERT_EQUAL_SVE(z22, z2);
9433 ASSERT_EQUAL_SVE(z23, z3);
9434 ASSERT_EQUAL_SVE(z24, z4);
9435 ASSERT_EQUAL_SVE(z25, z5);
9436 ASSERT_EQUAL_SVE(z26, z6);
9437 }
9438
9439 munmap(reinterpret_cast<void*>(data), page_size * 2);
9440 }
9441
// This test focuses on checking that the addressing-mode modifiers are
// propagated and simulated correctly.
TEST_SVE(sve_ldff1_regression_test) {
9444 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9445 START();
9446
9447 size_t page_size = sysconf(_SC_PAGE_SIZE);
9448 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9449
9450 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9451 page_size * 2,
9452 PROT_READ | PROT_WRITE,
9453 MAP_PRIVATE | MAP_ANONYMOUS,
9454 -1,
9455 0));
9456 uintptr_t middle = data + page_size;
9457 // Fill the accessible page with arbitrary data.
9458 for (size_t i = 0; i < page_size; i++) {
9459 // Reverse bits so we get a mixture of positive and negative values.
9460 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9461 memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
    // Make each byte slightly different from its mirror image, and copy the
    // bytes in the reverse direction, which is convenient for verifying loads
    // at negative indices.
9465 byte += 1;
9466 memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9467 }
9468
9469 PRegister all = p6;
9470 __ Ptrue(all.VnB());
9471
9472 __ Mov(x0, middle);
9473 __ Index(z31.VnS(), 0, 3);
9474 __ Neg(z30.VnS(), z31.VnS());
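  // z31 holds ascending offsets (0, 3, 6, ...) and z30 their negation, so both
  // positive and negative indices are exercised.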
9475
9476 __ Setffr();
9477
9478 // Scalar plus vector 32 unscaled offset
9479 __ Ldff1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9480 __ Ldff1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9481 __ Ldff1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9482 __ Ldff1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9483 __ Ldff1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9484
9485 // Scalar plus vector 32 scaled offset
9486 __ Ldff1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9487 __ Ldff1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9488 __ Ldff1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9489
9490 __ Index(z31.VnD(), 0, 3);
9491 __ Neg(z30.VnD(), z31.VnD());
9492
  // Ensure that only the low 32 bits are used for addressing when testing with
  // positive index values, and that the indices are treated as unsigned in the
  // `uxtw` form.
9495 __ Mov(x3, 0x8000000080000000);
9496 __ Dup(z28.VnD(), x3);
9497 __ Sub(x2, x0, 0x80000000);
9498 __ Add(z29.VnD(), z31.VnD(), z28.VnD());
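  // Since `uxtw` uses only the low 32 bits of each lane,
  // x2 + uxtw(z29) == (x0 - 0x80000000) + (z31 + 0x80000000) == x0 + z31.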
9499
9500 // Scalar plus vector 32 unpacked unscaled offset
9501 __ Ldff1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9502 __ Ldff1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9503 __ Ldff1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9504 __ Ldff1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9505 __ Ldff1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9506 __ Ldff1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9507
9508 // Scalar plus vector 32 unpacked scaled offset
9509 __ Ldff1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9510 __ Ldff1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9511 __ Ldff1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9512 __ Ldff1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9513 __ Ldff1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9514
9515 __ Sub(x0, x0, x3);
  // Note that `0x8000000080000000` has been added to the positive indices, so
  // the wrong address will be accessed if the offsets are treated as signed.
9518
9519 // Scalar plus vector 64 unscaled offset
9520 __ Ldff1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9521 __ Ldff1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9522 __ Ldff1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9523 __ Ldff1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9524 __ Ldff1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9525
9526 // Scalar plus vector 64 scaled offset
9527 __ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
9528 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9529 __ Ldff1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9530 __ Ldff1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9531
9532 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
9533 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9534 __ Ldff1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9535 __ Ldff1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9536
9537 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
9538 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9539 __ Ldff1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9540
9541 __ Rdffr(p1.VnB());
9542 __ Cntp(x10, all, p1.VnB());
9543
9544 END();
9545
9546 if (CAN_RUN()) {
9547 RUN();
9548
9549 int64_t loaded_data_in_bytes = core.xreg(x10.GetCode());
9550 // Only check 128 bits in this test.
9551 if (loaded_data_in_bytes < kQRegSizeInBytes) {
      // Report a warning if the first-fault loads faulted before all of the
      // expected loads completed.
9554 printf(
9555 "WARNING: Fault-tolerant loads detected faults before the "
9556 "expected loads completed.\n");
9557 return;
9558 }
9559
9560 // Scalar plus vector 32 unscaled offset
9561 uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9562 uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9563 uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9564 uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9565 uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9566
9567 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9568 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9569 ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9570 ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9571 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9572
9573 // Scalar plus vector 32 scaled offset
9574 uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9575 uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9576 uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9577
9578 ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9579 ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9580 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9581
9582 // Scalar plus vector 32 unpacked unscaled offset
9583 uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9584 uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9585 uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9586 uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9587 uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9588 uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9589
9590 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9591 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9592 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9593 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9594 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9595 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9596
9597 // Scalar plus vector 32 unpacked scaled offset
9598 uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9599 uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9600 uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9601 uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9602 uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9603
9604 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9605 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9606 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9607 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9608 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9609
9610 // Scalar plus vector 64 unscaled offset
9611 uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9612 uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9613 uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9614 uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9615 uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9616
9617 ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9618 ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9619 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9620 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9621 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9622
9623 uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9624 uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9625 uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9626 uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9627 uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9628
9629 // Scalar plus vector 64 scaled offset
9630 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9631 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9632 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9633 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9634 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9635 }
9636 }
9637
// This test focuses on checking that the addressing-mode modifiers are
// propagated and simulated correctly.
TEST_SVE(sve_ld1_regression_test) {
9640 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9641 START();
9642
9643 size_t page_size = sysconf(_SC_PAGE_SIZE);
9644 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9645
9646 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9647 page_size * 2,
9648 PROT_READ | PROT_WRITE,
9649 MAP_PRIVATE | MAP_ANONYMOUS,
9650 -1,
9651 0));
9652 uintptr_t middle = data + page_size;
9653 // Fill the accessible page with arbitrary data.
9654 for (size_t i = 0; i < page_size; i++) {
9655 // Reverse bits so we get a mixture of positive and negative values.
9656 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9657 memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
    // Make each byte slightly different from its mirror image, and copy the
    // bytes in the reverse direction, which is convenient for verifying loads
    // at negative indices.
9661 byte += 1;
9662 memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9663 }
9664
9665 PRegister all = p6;
9666 __ Ptrue(all.VnB());
9667
9668 __ Mov(x0, middle);
9669 __ Index(z31.VnS(), 0, 3);
9670 __ Neg(z30.VnS(), z31.VnS());
9671
9672 // Scalar plus vector 32 unscaled offset
9673 __ Ld1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9674 __ Ld1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9675 __ Ld1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9676 __ Ld1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9677 __ Ld1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9678
9679 // Scalar plus vector 32 scaled offset
9680 __ Ld1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9681 __ Ld1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9682 __ Ld1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9683
9684 __ Index(z31.VnD(), 0, 3);
9685 __ Neg(z30.VnD(), z31.VnD());
9686
  // Ensure that only the low 32 bits are used for addressing when testing with
  // positive index values, and that the indices are treated as unsigned in the
  // `uxtw` form.
9689 __ Mov(x3, 0x8000000080000000);
9690 __ Dup(z28.VnD(), x3);
9691 __ Sub(x2, x0, 0x80000000);
9692 __ Add(z29.VnD(), z31.VnD(), z28.VnD());
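  // As in the ldff1 regression test above, x2 + uxtw(z29) == x0 + z31.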
9693
9694 // Scalar plus vector 32 unpacked unscaled offset
9695 __ Ld1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9696 __ Ld1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9697 __ Ld1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9698 __ Ld1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9699 __ Ld1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9700 __ Ld1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9701
9702 // Scalar plus vector 32 unpacked scaled offset
9703 __ Ld1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9704 __ Ld1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9705 __ Ld1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9706 __ Ld1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9707 __ Ld1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9708
9709 __ Sub(x0, x0, x3);
  // Note that `0x8000000080000000` has been added to the positive indices, so
  // the wrong address will be accessed if the offsets are treated as signed.
9712
9713 // Scalar plus vector 64 unscaled offset
9714 __ Ld1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9715 __ Ld1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9716 __ Ld1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9717 __ Ld1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9718 __ Ld1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9719
9720 // Scalar plus vector 64 scaled offset
9721 __ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
9722 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9723 __ Ld1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9724 __ Ld1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9725
9726 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
9727 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9728 __ Ld1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9729 __ Ld1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9730
9731 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
9732 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9733 __ Ld1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9734
9735 END();
9736
9737 if (CAN_RUN()) {
9738 RUN();
9739
9740 // Scalar plus vector 32 unscaled offset
9741 uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9742 uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9743 uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9744 uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9745 uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9746
9747 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9748 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9749 ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9750 ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9751 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9752
9753 // Scalar plus vector 32 scaled offset
9754 uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9755 uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9756 uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9757
9758 ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9759 ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9760 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9761
9762 // Scalar plus vector 32 unpacked unscaled offset
9763 uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9764 uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9765 uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9766 uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9767 uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9768 uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9769
9770 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9771 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9772 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9773 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9774 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9775 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9776
9777 // Scalar plus vector 32 unpacked scaled offset
9778 uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9779 uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9780 uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9781 uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9782 uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9783
9784 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9785 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9786 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9787 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9788 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9789
9790 // Scalar plus vector 64 unscaled offset
9791 uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9792 uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9793 uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9794 uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9795 uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9796
9797 ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9798 ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9799 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9800 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9801 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9802
9803 uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9804 uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9805 uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9806 uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9807 uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9808
9809 // Scalar plus vector 64 scaled offset
9810 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9811 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9812 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9813 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9814 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9815 }
9816 }
9817
9818 // Test gather loads by comparing them with the result of a set of equivalent
9819 // scalar loads.
9820 template <typename T>
static void GatherLoadScalarPlusVectorHelper(Test* config,
                                             unsigned msize_in_bits,
                                             unsigned esize_in_bits,
                                             Ld1Macro ld1,
                                             Ld1Macro ldff1,
                                             T mod,
                                             bool is_signed,
                                             bool is_scaled) {
9829 // SVE supports 32- and 64-bit addressing for gather loads.
9830 VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9831 static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9832
9833 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9834 START();
9835
9836 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
9837 int vl = config->sve_vl_in_bytes();
9838
9839 uint64_t addresses[kMaxLaneCount];
9840 uint64_t offsets[kMaxLaneCount];
9841 uint64_t max_address = 0;
9842 uint64_t buffer_size = vl * 64;
9843 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
  // Fill the buffer with arbitrary data and, at the same time, generate random
  // addresses and offsets into the buffer, placing them in the arrays above.
9846 BufferFillingHelper(data,
9847 buffer_size,
9848 msize_in_bytes,
9849 kMaxLaneCount,
9850 offsets,
9851 addresses,
9852 &max_address);
9853
9854 ZRegister zn = z0.WithLaneSize(esize_in_bits);
9855 ZRegister zt_ref = z1.WithLaneSize(esize_in_bits);
9856 ZRegister zt = z2.WithLaneSize(esize_in_bits);
9857 ZRegister zt_ff = z3.WithLaneSize(esize_in_bits);
9858 PRegisterWithLaneSize pg_ff = p1.WithLaneSize(esize_in_bits);
9859 PRegisterWithLaneSize pg_diff = p2.WithLaneSize(esize_in_bits);
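  // pg_ff will receive the FFR after the first-fault load; pg_diff is used to
  // flag lanes where the FFR disagrees with an element-wise comparison against
  // the reference result.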
9860
9861 int shift = 0;
9862 if (is_scaled) {
9863 shift = std::log2(msize_in_bytes);
9864 for (unsigned i = 0; i < kMaxLaneCount; i++) {
      // Ensure that the offsets are multiples of the scale factor of the
      // operation.
9867 offsets[i] = (offsets[i] >> shift) << shift;
9868 addresses[i] = data + offsets[i];
9869 }
9870 }
9871
9872 PRegister all = p6;
9873 __ Ptrue(all.WithLaneSize(esize_in_bits));
9874
9875 PRegisterZ pg = p0.Zeroing();
9876 Initialise(&masm,
9877 pg,
9878 0x9abcdef012345678,
9879 0xabcdef0123456789,
9880 0xf4f3f1f0fefdfcfa,
9881 0xf9f8f6f5f3f2f1ff);
9882
9883 __ Mov(x0, data);
9884
9885 // Generate a reference result for scalar-plus-scalar form using scalar loads.
9886 ScalarLoadHelper(&masm,
9887 vl,
9888 addresses,
9889 zt_ref,
9890 pg,
9891 esize_in_bits,
9892 msize_in_bits,
9893 is_signed);
9894
9895 InsrHelper(&masm, zn, offsets);
9896 if (is_scaled) {
9897 // Scale down the offsets if testing scaled-offset operation.
9898 __ Lsr(zn, zn, shift);
9899 }
9900
9901 (masm.*ld1)(zt, pg, SVEMemOperand(x0, zn, mod, shift));
9902
9903 Register ffr_check_count = x17;
9904 __ Mov(ffr_check_count, 0);
9905
  // Check that the gather load reads the correct data from the scattered
  // addresses. The first-fault behaviour itself is exercised in `Ldff1Helper`.
9908 __ Setffr();
9909 (masm.*ldff1)(zt_ff, pg, SVEMemOperand(x0, zn, mod, shift));
9910
  // Compare the two vector registers and accumulate the number of differing
  // lanes in `ffr_check_count`.
9913 __ Rdffrs(pg_ff.VnB(), all.Zeroing());
9914 __ Cmpeq(pg_diff, all.Zeroing(), zt_ref, zt_ff);
9915 __ Eor(pg_diff.VnB(), all.Zeroing(), pg_diff.VnB(), pg_ff.VnB());
9916 __ Incp(ffr_check_count, pg_diff);
9917
9918 END();
9919
9920 if (CAN_RUN()) {
9921 RUN();
9922
9923 ASSERT_EQUAL_SVE(zt_ref, zt);
9924 ASSERT_EQUAL_64(0, ffr_check_count);
9925 }
9926
9927 free(reinterpret_cast<void*>(data));
9928 }
9929
9930 // Test gather loads by comparing them with the result of a set of equivalent
9931 // scalar loads.
9932 template <typename F>
static void GatherLoadScalarPlusScalarOrImmHelper(Test* config,
                                                  unsigned msize_in_bits,
                                                  unsigned esize_in_bits,
                                                  F sve_ld1,
                                                  bool is_signed) {
9938 // SVE supports 32- and 64-bit addressing for gather loads.
9939 VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9940 static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9941
9942 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9943 START();
9944
9945 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
9946 int vl = config->sve_vl_in_bytes();
9947
9948 uint64_t addresses[kMaxLaneCount];
9949 uint64_t offsets[kMaxLaneCount];
9950 uint64_t max_address = 0;
9951 uint64_t buffer_size = vl * 64;
9952 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
9953 BufferFillingHelper(data,
9954 buffer_size,
9955 msize_in_bytes,
9956 kMaxLaneCount,
9957 offsets,
9958 addresses,
9959 &max_address);
9960
9961 // Maximised offsets, to ensure that the address calculation is modulo-2^64,
9962 // and that the vector addresses are not sign-extended.
9963 uint64_t uint_e_max = (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
9964 uint64_t maxed_offsets[kMaxLaneCount];
9965 uint64_t maxed_offsets_imm = max_address - uint_e_max;
9966 for (unsigned i = 0; i < kMaxLaneCount; i++) {
9967 maxed_offsets[i] = addresses[i] - maxed_offsets_imm;
9968 }
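  // By construction, maxed_offsets[i] + maxed_offsets_imm == addresses[i],
  // with the offsets close to the maximum unsigned element value.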
9969
9970 ZRegister zn = z0.WithLaneSize(esize_in_bits);
9971 ZRegister zt_addresses = z1.WithLaneSize(esize_in_bits);
9972 ZRegister zt_offsets = z2.WithLaneSize(esize_in_bits);
9973 ZRegister zt_maxed = z3.WithLaneSize(esize_in_bits);
9974 ZRegister zt_ref = z4.WithLaneSize(esize_in_bits);
9975
9976 PRegisterZ pg = p0.Zeroing();
9977 Initialise(&masm,
9978 pg,
9979 0x9abcdef012345678,
9980 0xabcdef0123456789,
9981 0xf4f3f1f0fefdfcfa,
9982 0xf9f8f6f5f3f2f0ff);
9983
9984 // Execute each load.
9985
9986 if (esize_in_bits == kDRegSize) {
9987 // Only test `addresses` if we can use 64-bit pointers. InsrHelper will fail
9988 // if any value won't fit in a lane of zn.
9989 InsrHelper(&masm, zn, addresses);
9990 (masm.*sve_ld1)(zt_addresses, pg, SVEMemOperand(zn));
9991 }
9992
9993 InsrHelper(&masm, zn, offsets);
9994 (masm.*sve_ld1)(zt_offsets, pg, SVEMemOperand(zn, data));
9995
9996 InsrHelper(&masm, zn, maxed_offsets);
9997 (masm.*sve_ld1)(zt_maxed, pg, SVEMemOperand(zn, maxed_offsets_imm));
9998
9999 // Generate a reference result using scalar loads.
10000 ScalarLoadHelper(&masm,
10001 vl,
10002 addresses,
10003 zt_ref,
10004 pg,
10005 esize_in_bits,
10006 msize_in_bits,
10007 is_signed);
10008
10009 END();
10010
10011 if (CAN_RUN()) {
10012 RUN();
10013
10014 if (esize_in_bits == kDRegSize) {
10015 ASSERT_EQUAL_SVE(zt_ref, zt_addresses);
10016 }
10017 ASSERT_EQUAL_SVE(zt_ref, zt_offsets);
10018 ASSERT_EQUAL_SVE(zt_ref, zt_maxed);
10019 }
10020
10021 free(reinterpret_cast<void*>(data));
10022 }
10023
TEST_SVE(sve_ld1b_64bit_vector_plus_immediate) {
10025 GatherLoadScalarPlusScalarOrImmHelper(config,
10026 kBRegSize,
10027 kDRegSize,
10028 &MacroAssembler::Ld1b,
10029 false);
10030 }
10031
TEST_SVE(sve_ld1h_64bit_vector_plus_immediate) {
10033 GatherLoadScalarPlusScalarOrImmHelper(config,
10034 kHRegSize,
10035 kDRegSize,
10036 &MacroAssembler::Ld1h,
10037 false);
10038 }
10039
TEST_SVE(sve_ld1w_64bit_vector_plus_immediate) {
10041 GatherLoadScalarPlusScalarOrImmHelper(config,
10042 kSRegSize,
10043 kDRegSize,
10044 &MacroAssembler::Ld1w,
10045 false);
10046 }
10047
TEST_SVE(sve_ld1d_64bit_vector_plus_immediate) {
10049 GatherLoadScalarPlusScalarOrImmHelper(config,
10050 kDRegSize,
10051 kDRegSize,
10052 &MacroAssembler::Ld1d,
10053 false);
10054 }
10055
TEST_SVE(sve_ld1sb_64bit_vector_plus_immediate) {
10057 GatherLoadScalarPlusScalarOrImmHelper(config,
10058 kBRegSize,
10059 kDRegSize,
10060 &MacroAssembler::Ld1sb,
10061 true);
10062 }
10063
TEST_SVE(sve_ld1sh_64bit_vector_plus_immediate) {
10065 GatherLoadScalarPlusScalarOrImmHelper(config,
10066 kHRegSize,
10067 kDRegSize,
10068 &MacroAssembler::Ld1sh,
10069 true);
10070 }
10071
TEST_SVE(sve_ld1sw_64bit_vector_plus_immediate) {
10073 GatherLoadScalarPlusScalarOrImmHelper(config,
10074 kSRegSize,
10075 kDRegSize,
10076 &MacroAssembler::Ld1sw,
10077 true);
10078 }
10079
TEST_SVE(sve_ld1b_32bit_vector_plus_immediate) {
10081 GatherLoadScalarPlusScalarOrImmHelper(config,
10082 kBRegSize,
10083 kSRegSize,
10084 &MacroAssembler::Ld1b,
10085 false);
10086 }
10087
TEST_SVE(sve_ld1h_32bit_vector_plus_immediate) {
10089 GatherLoadScalarPlusScalarOrImmHelper(config,
10090 kHRegSize,
10091 kSRegSize,
10092 &MacroAssembler::Ld1h,
10093 false);
10094 }
10095
TEST_SVE(sve_ld1w_32bit_vector_plus_immediate) {
10097 GatherLoadScalarPlusScalarOrImmHelper(config,
10098 kSRegSize,
10099 kSRegSize,
10100 &MacroAssembler::Ld1w,
10101 false);
10102 }
10103
TEST_SVE(sve_ld1sb_32bit_vector_plus_immediate) {
10105 GatherLoadScalarPlusScalarOrImmHelper(config,
10106 kBRegSize,
10107 kSRegSize,
10108 &MacroAssembler::Ld1sb,
10109 true);
10110 }
10111
TEST_SVE(sve_ld1sh_32bit_vector_plus_immediate) {
10113 GatherLoadScalarPlusScalarOrImmHelper(config,
10114 kHRegSize,
10115 kSRegSize,
10116 &MacroAssembler::Ld1sh,
10117 true);
10118 }
10119
TEST_SVE(sve_ld1_scalar_plus_vector_32_scaled_offset) {
10121 auto ld1_32_scaled_offset_helper =
10122 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10123 config,
10124 std::placeholders::_1,
10125 kSRegSize,
10126 std::placeholders::_2,
10127 std::placeholders::_3,
10128 std::placeholders::_4,
10129 std::placeholders::_5,
10130 true);
10131
10132 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10133 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10134 ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10135 ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10136
10137 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10138 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10139 ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10140 ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10141
10142 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10143 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10144 ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10145 ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10146 }
10147
TEST_SVE(sve_ld1_scalar_plus_vector_32_unscaled_offset) {
10149 auto ld1_32_unscaled_offset_helper =
10150 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10151 config,
10152 std::placeholders::_1,
10153 kSRegSize,
10154 std::placeholders::_2,
10155 std::placeholders::_3,
10156 std::placeholders::_4,
10157 std::placeholders::_5,
10158 false);
10159
10160 Ld1Macro ld1b = &MacroAssembler::Ld1b;
10161 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10162 ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, UXTW, false);
10163 ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, SXTW, false);
10164
10165 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10166 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10167 ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10168 ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10169
10170 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10171 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10172 ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10173 ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10174
10175 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10176 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10177 ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, UXTW, true);
10178 ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, SXTW, true);
10179
10180 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10181 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10182 ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10183 ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10184 }
10185
TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_scaled_offset) {
10187 auto ld1_32_unpacked_scaled_offset_helper =
10188 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10189 config,
10190 std::placeholders::_1,
10191 kDRegSize,
10192 std::placeholders::_2,
10193 std::placeholders::_3,
10194 std::placeholders::_4,
10195 std::placeholders::_5,
10196 true);
10197
10198 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10199 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10200 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10201 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10202
10203 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10204 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10205 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10206 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10207
10208 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10209 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10210 ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10211 ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10212
10213 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10214 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10215 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10216 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10217
10218 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10219 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10220 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10221 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
10222 }
10223
TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_unscaled_offset) {
10225 auto ld1_32_unpacked_unscaled_offset_helper =
10226 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10227 config,
10228 std::placeholders::_1,
10229 kDRegSize,
10230 std::placeholders::_2,
10231 std::placeholders::_3,
10232 std::placeholders::_4,
10233 std::placeholders::_5,
10234 false);
10235
10236 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10237 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10238 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10239 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10240
10241 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10242 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10243 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10244 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10245
10246 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10247 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10248 ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10249 ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10250
10251 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10252 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10253 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10254 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10255
10256 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10257 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10258 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10259 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
10260 }
10261
TEST_SVE(sve_ld1_scalar_plus_vector_64_scaled_offset) {
10263 auto ld1_64_scaled_offset_helper =
10264 std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10265 config,
10266 std::placeholders::_1,
10267 kDRegSize,
10268 std::placeholders::_2,
10269 std::placeholders::_3,
10270 LSL,
10271 std::placeholders::_4,
10272 true);
10273
10274 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10275 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10276 ld1_64_scaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10277
10278 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10279 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10280 ld1_64_scaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10281
10282 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10283 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10284 ld1_64_scaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10285
10286 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10287 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10288 ld1_64_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10289
10290 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10291 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10292 ld1_64_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
10293 }
10294
TEST_SVE(sve_ld1_scalar_plus_vector_64_unscaled_offset) {
10296 auto ld1_64_unscaled_offset_helper =
10297 std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10298 config,
10299 std::placeholders::_1,
10300 kDRegSize,
10301 std::placeholders::_2,
10302 std::placeholders::_3,
10303 NO_SHIFT,
10304 std::placeholders::_4,
10305 false);
10306
10307 Ld1Macro ld1b = &MacroAssembler::Ld1b;
10308 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10309 ld1_64_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, false);
10310
10311 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10312 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10313 ld1_64_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10314
10315 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10316 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10317 ld1_64_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10318
10319 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10320 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10321 ld1_64_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10322
10323 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10324 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10325 ld1_64_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, true);
10326
10327 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10328 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10329 ld1_64_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10330
10331 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10332 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10333 ld1_64_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
10334 }
10335
TEST_SVE(sve_ldnt1) {
10337 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10338 START();
10339
10340 int data_size = kZRegMaxSizeInBytes * 16;
10341 uint8_t* data = new uint8_t[data_size];
10342 for (int i = 0; i < data_size; i++) {
10343 data[i] = i & 0xff;
10344 }
10345
10346 // Set the base half-way through the buffer so we can use negative indices.
10347 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10348 __ Ptrue(p0.VnB());
10349 __ Punpklo(p1.VnH(), p0.VnB());
10350 __ Punpklo(p2.VnH(), p1.VnB());
10351 __ Punpklo(p3.VnH(), p2.VnB());
10352 __ Punpklo(p4.VnH(), p3.VnB());
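  // Each Punpklo roughly halves the number of active predicate bits, giving a
  // different predication pattern for each of the loads below.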
10353
10354 __ Mov(x1, 42);
10355 __ Ld1b(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10356 __ Ldnt1b(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10357
10358 __ Mov(x1, -21);
10359 __ Ld1h(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10360 __ Ldnt1h(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10361
10362 __ Mov(x1, 10);
10363 __ Ld1w(z4.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10364 __ Ldnt1w(z5.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10365
10366 __ Mov(x1, -5);
10367 __ Ld1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10368 __ Ldnt1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10369
10370 __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10371 __ Ldnt1b(z9.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10372
10373 __ Ld1h(z10.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10374 __ Ldnt1h(z11.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10375
10376 __ Ld1w(z12.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10377 __ Ldnt1w(z13.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10378
10379 __ Ld1d(z14.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10380 __ Ldnt1d(z15.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10381 END();
10382
10383 if (CAN_RUN()) {
10384 RUN();
10385 ASSERT_EQUAL_SVE(z0, z1);
10386 ASSERT_EQUAL_SVE(z2, z3);
10387 ASSERT_EQUAL_SVE(z4, z5);
10388 ASSERT_EQUAL_SVE(z6, z7);
10389 ASSERT_EQUAL_SVE(z8, z9);
10390 ASSERT_EQUAL_SVE(z10, z11);
10391 ASSERT_EQUAL_SVE(z12, z13);
10392 ASSERT_EQUAL_SVE(z14, z15);
10393 }
10394 }
10395
TEST_SVE(sve_stnt1) {
10397 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10398 START();
10399
10400 int data_size = kZRegMaxSizeInBytes * 16;
10401 uint8_t* data = new uint8_t[data_size];
10402
10403 // Set the base half-way through the buffer so we can use negative indices.
10404 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10405 __ Ptrue(p0.VnB());
10406 __ Punpklo(p1.VnH(), p0.VnB());
10407 __ Punpklo(p2.VnH(), p1.VnB());
10408 __ Punpklo(p3.VnH(), p2.VnB());
10409 __ Punpklo(p4.VnH(), p3.VnB());
10410 __ Dup(z0.VnB(), 0x55);
10411 __ Index(z1.VnB(), 0, 1);
10412
10413 // Store with all-true and patterned predication, load back, and create a
10414 // reference value for later comparison.
10415 __ Rdvl(x1, 1);
10416 __ Stnt1b(z0.VnB(), p0, SVEMemOperand(x0, x1));
10417 __ Stnt1b(z1.VnB(), p1, SVEMemOperand(x0, 1, SVE_MUL_VL));
10418 __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1));
10419 __ Sel(z3.VnB(), p1, z1.VnB(), z0.VnB());
10420
10421 // Repeated, with wider elements and different offsets.
10422 __ Rdvl(x1, -1);
10423 __ Lsr(x1, x1, 1);
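  // After the `LSL #1` in the addressing mode, (x1 << 1) wraps around to -VL,
  // so these accesses target [x0 - VL].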
10424 __ Stnt1h(z0.VnH(), p0, SVEMemOperand(x0, x1, LSL, 1));
10425 __ Stnt1h(z1.VnH(), p2, SVEMemOperand(x0, -1, SVE_MUL_VL));
10426 __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10427 __ Sel(z5.VnH(), p2, z1.VnH(), z0.VnH());
10428
10429 __ Rdvl(x1, 7);
10430 __ Lsr(x1, x1, 2);
10431 __ Stnt1w(z0.VnS(), p0, SVEMemOperand(x0, x1, LSL, 2));
10432 __ Stnt1w(z1.VnS(), p3, SVEMemOperand(x0, 7, SVE_MUL_VL));
10433 __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10434 __ Sel(z7.VnS(), p3, z1.VnS(), z0.VnS());
10435
10436 __ Rdvl(x1, -8);
10437 __ Lsr(x1, x1, 3);
10438 __ Stnt1d(z0.VnD(), p0, SVEMemOperand(x0, x1, LSL, 3));
10439 __ Stnt1d(z1.VnD(), p4, SVEMemOperand(x0, -8, SVE_MUL_VL));
10440 __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10441 __ Sel(z9.VnD(), p4, z1.VnD(), z0.VnD());
10442 END();
10443
10444 if (CAN_RUN()) {
10445 RUN();
10446 ASSERT_EQUAL_SVE(z2, z3);
10447 ASSERT_EQUAL_SVE(z4, z5);
10448 ASSERT_EQUAL_SVE(z6, z7);
10449 ASSERT_EQUAL_SVE(z8, z9);
10450 }
10451 }
10452
TEST_SVE(sve_ld1rq) {
10454 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10455 START();
10456
10457 int data_size = (kQRegSizeInBytes + 128) * 2;
10458 uint8_t* data = new uint8_t[data_size];
10459 for (int i = 0; i < data_size; i++) {
10460 data[i] = i & 0xff;
10461 }
10462
10463 // Set the base half-way through the buffer so we can use negative indices.
10464 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10465
10466 __ Index(z0.VnB(), 0, 1);
10467 __ Ptrue(p0.VnB());
10468 __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
10469 __ Pfalse(p1.VnB());
10470 __ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
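  // p0 covers only the byte lanes whose index value is below four; zipping it
  // with an all-false predicate leaves p1 with alternate lanes active.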
10471
10472 // Load and broadcast using scalar offsets.
10473 __ Mov(x1, -42);
10474 __ Ld1rqb(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10475
10476 __ Add(x2, x0, 1);
10477 __ Mov(x1, -21);
10478 __ Punpklo(p2.VnH(), p1.VnB());
10479 __ Ld1rqh(z1.VnH(), p2.Zeroing(), SVEMemOperand(x2, x1, LSL, 1));
10480
10481 __ Add(x2, x2, 1);
10482 __ Mov(x1, -10);
10483 __ Punpklo(p3.VnH(), p2.VnB());
10484 __ Ld1rqw(z2.VnS(), p3.Zeroing(), SVEMemOperand(x2, x1, LSL, 2));
10485
10486 __ Add(x2, x2, 1);
10487 __ Mov(x1, 5);
10488 __ Punpklo(p4.VnH(), p3.VnB());
10489 __ Ld1rqd(z3.VnD(), p4.Zeroing(), SVEMemOperand(x2, x1, LSL, 3));
10490
10491 // Check that all segments match by rotating the vector by one segment,
10492 // eoring, and orring across the vector.
10493 __ Mov(z4, z0);
10494 __ Ext(z4.VnB(), z4.VnB(), z4.VnB(), 16);
10495 __ Eor(z4.VnB(), z4.VnB(), z0.VnB());
10496 __ Orv(b4, p0, z4.VnB());
10497 __ Mov(z5, z1);
10498 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
10499 __ Eor(z5.VnB(), z5.VnB(), z1.VnB());
10500 __ Orv(b5, p0, z5.VnB());
10501 __ Orr(z4, z4, z5);
10502 __ Mov(z5, z2);
10503 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
10504 __ Eor(z5.VnB(), z5.VnB(), z2.VnB());
10505 __ Orv(b5, p0, z5.VnB());
10506 __ Orr(z4, z4, z5);
10507 __ Mov(z5, z3);
10508 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
10509 __ Eor(z5.VnB(), z5.VnB(), z3.VnB());
10510 __ Orv(b5, p0, z5.VnB());
10511 __ Orr(z4, z4, z5);
10512
10513 // Load and broadcast the same values, using immediate offsets.
10514 __ Add(x1, x0, 6);
10515 __ Ld1rqb(z5.VnB(), p1.Zeroing(), SVEMemOperand(x1, -48));
10516 __ Add(x1, x0, -9);
10517 __ Ld1rqh(z6.VnH(), p2.Zeroing(), SVEMemOperand(x1, -32));
10518 __ Add(x1, x0, -70);
10519 __ Ld1rqw(z7.VnS(), p3.Zeroing(), SVEMemOperand(x1, 32));
10520 __ Add(x1, x0, 27);
10521 __ Ld1rqd(z8.VnD(), p4.Zeroing(), SVEMemOperand(x1, 16));
10522 END();
10523
10524 if (CAN_RUN()) {
10525 RUN();
10526 uint64_t expected_z0[] = {0x0000000000000000, 0x006c006a00680066};
10527 uint64_t expected_z1[] = {0x000074730000706f, 0x00006c6b00006867};
10528 uint64_t expected_z2[] = {0x0000000075747372, 0x000000006d6c6b6a};
10529 uint64_t expected_z3[] = {0x0000000000000000, 0xc2c1c0bfbebdbcbb};
10530 uint64_t expected_z4[] = {0, 0};
10531 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
10532 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
10533 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
10534 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
10535 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
10536 ASSERT_EQUAL_SVE(z0, z5);
10537 ASSERT_EQUAL_SVE(z1, z6);
10538 ASSERT_EQUAL_SVE(z2, z7);
10539 ASSERT_EQUAL_SVE(z3, z8);
10540 }
10541 }
10542
TEST_SVE(sve_st1_vec_imm) {
10544 SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
10545 START();
10546
10547 // TODO: Use mmap() to request a buffer in the low 4GB, which allows testing
10548 // 32-bit address vectors.
10549 int data_size = kZRegMaxSizeInBytes * 16;
10550 uint8_t* data = new uint8_t[data_size];
10551
10552 // Set the base to 16 bytes from the end of the buffer so we can use negative
10553 // indices.
10554 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size - 16]));
10555 __ Ptrue(p0.VnB());
10556
10557 // Store a vector of index values in reverse order, using
10558 // vector-plus-immediate addressing to begin at byte 15, then storing to
10559 // bytes 14, 13, etc.
10560 __ Index(z1.VnD(), x0, -1);
10561 __ Index(z2.VnD(), 0, 1);
10562
10563 // Iterate in order to store at least 16 bytes. The number of iterations
  // depends on VL, e.g. VL128 iterates eight times, storing bytes 15 and 14
10565 // on the first iteration, 13 and 12 on the next, etc.
10566 uint64_t dlanes = config->sve_vl_in_bytes() / kDRegSizeInBytes;
10567 for (int i = 15; i >= 0; i -= dlanes * kBRegSizeInBytes) {
10568 __ St1b(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10569 __ Incd(z2.VnD());
10570 }
10571
10572 // Reload the stored data, and build a reference for comparison. The reference
10573 // is truncated to a Q register, as only the least-significant 128 bits are
10574 // checked.
10575 __ Ldr(q4, MemOperand(x0));
10576 __ Index(z5.VnB(), 15, -1);
10577 __ Mov(q5, q5);
10578
10579 // Repeat for wider elements.
10580 __ Index(z1.VnD(), x0, -2); // Stepping by -2 for H-sized elements.
10581 __ Index(z2.VnD(), 0, 1);
10582 for (int i = 14; i >= 0; i -= dlanes * kHRegSizeInBytes) {
10583 __ St1h(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10584 __ Incd(z2.VnD());
10585 }
10586 __ Ldr(q6, MemOperand(x0));
10587 __ Index(z7.VnH(), 7, -1);
10588 __ Mov(q7, q7);
10589
10590 __ Index(z1.VnD(), x0, -4); // Stepping by -4 for S-sized elements.
10591 __ Index(z2.VnD(), 0, 1);
10592 for (int i = 12; i >= 0; i -= dlanes * kSRegSizeInBytes) {
10593 __ St1w(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10594 __ Incd(z2.VnD());
10595 }
10596 __ Ldr(q8, MemOperand(x0));
10597 __ Index(z9.VnS(), 3, -1);
10598 __ Mov(q9, q9);
10599
10600 __ Index(z1.VnD(), x0, -8); // Stepping by -8 for D-sized elements.
10601 __ Index(z2.VnD(), 0, 1);
10602 for (int i = 8; i >= 0; i -= dlanes * kDRegSizeInBytes) {
10603 __ St1d(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10604 __ Incd(z2.VnD());
10605 }
10606 __ Ldr(q10, MemOperand(x0));
10607 __ Index(z11.VnD(), 1, -1);
10608 __ Mov(q11, q11);
10609
10610 // Test predication by storing even halfwords to memory (using predication)
10611 // at byte-separated addresses. The result should be the same as storing
10612 // even halfwords contiguously to memory.
10613 __ Pfalse(p1.VnB());
10614 __ Zip1(p1.VnD(), p0.VnD(), p1.VnD());
10615 __ Mov(x0, reinterpret_cast<uintptr_t>(data));
10616 __ Index(z1.VnD(), x0, 1);
10617 __ Index(z2.VnD(), 0x1000, 1);
10618 for (int i = 0; i < 16; i += dlanes) {
10619 __ St1h(z2.VnD(), p1, SVEMemOperand(z1.VnD(), i));
10620 __ Incd(z2.VnD());
10621 }
10622 __ Ldr(q2, MemOperand(x0));
10623 __ Index(z3.VnH(), 0x1000, 2);
10624 __ Mov(q3, q3);
10625
10626 END();
10627
10628 if (CAN_RUN()) {
10629 RUN();
10630
10631 ASSERT_EQUAL_SVE(z3, z2);
10632 ASSERT_EQUAL_SVE(z5, z4);
10633 ASSERT_EQUAL_SVE(z7, z6);
10634 ASSERT_EQUAL_SVE(z9, z8);
10635 ASSERT_EQUAL_SVE(z11, z10);
10636 }
10637 }
10638
10639 template <typename T>
static void sve_st1_scalar_plus_vector_helper(Test* config,
                                              int esize_in_bits,
                                              T mod,
                                              bool is_scaled) {
10644 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10645 START();
10646
10647 int vl = config->sve_vl_in_bytes();
10648 int data_size = vl * 160;
10649 uint8_t* data = new uint8_t[data_size];
10650 memset(data, 0, data_size);
10651 int vl_per_esize = vl / (esize_in_bits / kBitsPerByte);
10652
10653 ZRegister zn_b = z0.WithLaneSize(esize_in_bits);
10654 ZRegister zn_h = z1.WithLaneSize(esize_in_bits);
10655 ZRegister zn_s = z2.WithLaneSize(esize_in_bits);
10656 ZRegister zn_d = z3.WithLaneSize(esize_in_bits);
10657
10658 ZRegister zn_ld_b = z10.WithLaneSize(esize_in_bits);
10659 ZRegister zn_ld_h = z11.WithLaneSize(esize_in_bits);
10660 ZRegister zn_ld_s = z12.WithLaneSize(esize_in_bits);
10661 ZRegister zn_ld_d = z13.WithLaneSize(esize_in_bits);
10662 ZRegister offsets = z31.WithLaneSize(esize_in_bits);
10663
10664 // Set the base half-way through the buffer so we can use negative indices.
10665 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10666 __ Ptrue(p6.WithLaneSize(esize_in_bits));
10667 __ Pfalse(p7.WithLaneSize(esize_in_bits));
10668 __ Zip1(p0.WithLaneSize(esize_in_bits),
10669 p6.WithLaneSize(esize_in_bits),
10670 p7.WithLaneSize(esize_in_bits));
10671 __ Zip1(p1.WithLaneSize(esize_in_bits),
10672 p7.WithLaneSize(esize_in_bits),
10673 p6.WithLaneSize(esize_in_bits));
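  // p0 and p1 activate alternating lanes with opposite phases; they predicate
  // the scatter stores and the corresponding checking loads below.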
10674
10675 // `st1b` doesn't have the scaled-offset forms.
10676 if (is_scaled == false) {
    // Step the index by two so that the stores go to non-contiguous addresses,
    // simulating a scatter memory access.
10678 __ Index(offsets, 1, 2);
10679 __ St1b(offsets, p0, SVEMemOperand(x0, offsets, mod));
10680 __ Ld1b(zn_ld_b, p0.Zeroing(), SVEMemOperand(x0, offsets, mod));
10681 __ Dup(zn_b, 0);
10682 __ Mov(zn_b, p0.Merging(), offsets);
10683 }
10684
  // Store the values to an isolated range that does not overlap the other
  // stores.
10686 int scale = is_scaled ? 1 : 0;
10687 __ Add(x1, x0, vl_per_esize * 4);
10688 __ Index(offsets, 6, 4);
10689 __ St1h(offsets, p0, SVEMemOperand(x1, offsets, mod, scale));
10690 __ Ld1h(zn_ld_h, p0.Zeroing(), SVEMemOperand(x1, offsets, mod, scale));
10691 __ Dup(zn_h, 0);
10692 __ Mov(zn_h, p0.Merging(), offsets);
10693
10694 scale = is_scaled ? 2 : 0;
10695 __ Add(x2, x0, UINT64_MAX + (vl_per_esize * -8) + 1);
10696 __ Index(offsets, 64, 8);
10697 if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10698 (static_cast<int>(mod) == SXTW)) {
10699 // Testing negative offsets.
10700 __ Neg(offsets, p6.Merging(), offsets);
10701 }
10702 __ St1w(offsets, p1, SVEMemOperand(x2, offsets, mod, scale));
10703 __ Ld1w(zn_ld_s, p1.Zeroing(), SVEMemOperand(x2, offsets, mod, scale));
10704 __ Dup(zn_s, 0);
10705 __ Mov(zn_s, p1.Merging(), offsets);
10706
10707 if (esize_in_bits == kDRegSize) {
10708 // Test st1w by comparing the 32-bit value loaded correspondingly with the
10709 // 32-bit value stored.
10710 __ Lsl(zn_s, zn_s, kSRegSize);
10711 __ Lsr(zn_s, zn_s, kSRegSize);
10712 }
10713
10714 // `st1d` doesn't have the S-sized lane forms.
10715 if (esize_in_bits == kDRegSize) {
10716 scale = is_scaled ? 3 : 0;
10717 __ Add(x3, x0, UINT64_MAX + (vl_per_esize * -16) + 1);
10718 __ Index(offsets, 128, 16);
10719 if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10720 (static_cast<int>(mod) == SXTW)) {
10721 __ Neg(offsets, p6.Merging(), offsets);
10722 }
10723 __ St1d(offsets, p1, SVEMemOperand(x3, offsets, mod, scale));
10724 __ Ld1d(zn_ld_d, p1.Zeroing(), SVEMemOperand(x3, offsets, mod, scale));
10725 __ Dup(zn_d, 0);
10726 __ Mov(zn_d, p1.Merging(), offsets);
10727 }
10728
10729 END();
10730
10731 if (CAN_RUN()) {
10732 RUN();
10733
    if (!is_scaled) {
10735 ASSERT_EQUAL_SVE(zn_ld_b, zn_b);
10736 }
10737
10738 ASSERT_EQUAL_SVE(zn_ld_h, zn_h);
10739 ASSERT_EQUAL_SVE(zn_ld_s, zn_s);
10740
10741 if (esize_in_bits == kDRegSize) {
10742 ASSERT_EQUAL_SVE(zn_ld_d, zn_d);
10743 }
10744 }
10745
10746 delete[] data;
10747 }
10748
TEST_SVE(sve_st1_sca_vec_32_unpacked_unscaled) {
10750 sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, false);
10751 sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, false);
10752 }
10753
TEST_SVE(sve_st1_sca_vec_32_unpacked_scaled) {
10755 sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, true);
10756 sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, true);
10757 }
10758
TEST_SVE(sve_st1_sca_vec_32_unscaled) {
10760 sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, false);
10761 sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, false);
10762 }
10763
10764 TEST_SVE(sve_st1_sca_vec_32_scaled) {
10765 sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, true);
10766 sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, true);
10767 }
10768
10769 TEST_SVE(sve_st1_sca_vec_64_scaled) {
10770 sve_st1_scalar_plus_vector_helper(config, kDRegSize, LSL, true);
10771 }
10772
10773 TEST_SVE(sve_st1_sca_vec_64_unscaled) {
10774 sve_st1_scalar_plus_vector_helper(config, kDRegSize, NO_SHIFT, false);
10775 }
10776
10777 typedef void (MacroAssembler::*IntWideImmFn)(const ZRegister& zd,
10778 const ZRegister& zn,
10779 const IntegerOperand imm);
10780
10781 template <typename F, typename Td, typename Tn>
10782 static void IntWideImmHelper(Test* config,
10783 F macro,
10784 unsigned lane_size_in_bits,
10785 const Tn& zn_inputs,
10786 IntegerOperand imm,
10787 const Td& zd_expected) {
10788 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10789 START();
10790
10791 ZRegister zd1 = z0.WithLaneSize(lane_size_in_bits);
10792 InsrHelper(&masm, zd1, zn_inputs);
10793
10794 // Also test with a different zn, to test the movprfx case.
10795 ZRegister zn = z1.WithLaneSize(lane_size_in_bits);
10796 InsrHelper(&masm, zn, zn_inputs);
10797 ZRegister zd2 = z2.WithLaneSize(lane_size_in_bits);
10798 ZRegister zn_copy = z3.WithSameLaneSizeAs(zn);
10799
10800 // Make a copy so we can check that constructive operations preserve zn.
10801 __ Mov(zn_copy, zn);
10802
10803 {
10804 UseScratchRegisterScope temps(&masm);
10805 // The MacroAssembler needs a P scratch register for some of these macros,
10806 // and it doesn't have one by default.
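    // (Presumably the out-of-range immediates are materialised in a scratch Z
    // register and combined using a predicated form, which needs a governing
    // predicate.)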
10807 temps.Include(p3);
10808
10809 (masm.*macro)(zd1, zd1, imm);
10810 (masm.*macro)(zd2, zn, imm);
10811 }
10812
10813 END();
10814
10815 if (CAN_RUN()) {
10816 RUN();
10817
10818 ASSERT_EQUAL_SVE(zd_expected, zd1);
10819
10820     // Check that the constructive form (which requires movprfx) produces the
10821     // same result as the destructive form.
10822 ASSERT_EQUAL_SVE(zd_expected, zd2);
10823
10824 ASSERT_EQUAL_SVE(zn_copy, zn);
10825 }
10826 }
10827
10828 TEST_SVE(sve_int_wide_imm_unpredicated_smax) {
10829 int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10830 int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10831 int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10832 int64_t in_d[] = {1, 10, 10000, 1000000};
10833
10834 IntWideImmFn fn = &MacroAssembler::Smax;
10835
10836 int exp_b_1[] = {0, -1, 127, -1, 126, 1, -1, 55};
10837 int exp_h_1[] = {127, 127, 127, 127, INT16_MAX, 127, 127, 5555};
10838 int exp_s_1[] = {0, -128, 127, -128, INT32_MAX, 1, -1, 555555};
10839 int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10840
10841 IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10842 IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10843 IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10844 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10845
10846 int exp_h_2[] = {0, -128, 127, -255, INT16_MAX, 1, -1, 5555};
10847 int exp_s_2[] = {2048, 2048, 2048, 2048, INT32_MAX, 2048, 2048, 555555};
10848 int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10849
10850 // The immediate is in the range [-128, 127], but the macro is able to
10851 // synthesise unencodable immediates.
10852 // B-sized lanes cannot take an immediate out of the range [-128, 127].
10853 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10854 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10855 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10856 }
10857
10858 TEST_SVE(sve_int_wide_imm_unpredicated_smin) {
10859 int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10860 int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10861 int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10862 int64_t in_d[] = {1, 10, 10000, 1000000};
10863
10864 IntWideImmFn fn = &MacroAssembler::Smin;
10865
10866 int exp_b_1[] = {-1, -128, -1, -127, -1, -1, -1, -1};
10867 int exp_h_1[] = {0, -128, 127, INT16_MIN, 127, 1, -1, 127};
10868 int exp_s_1[] = {-128, -128, -128, INT32_MIN, -128, -128, -128, -128};
10869 int64_t exp_d_1[] = {1, 10, 99, 99};
10870
10871 IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10872 IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10873 IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10874 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10875
10876 int exp_h_2[] = {-255, -255, -255, INT16_MIN, -255, -255, -255, -255};
10877 int exp_s_2[] = {0, -128, 127, INT32_MIN, 2048, 1, -1, 2048};
10878 int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10879
10880 // The immediate is in the range [-128, 127], but the macro is able to
10881 // synthesise unencodable immediates.
10882 // B-sized lanes cannot take an immediate out of the range [-128, 127].
10883 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10884 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10885 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10886 }
10887
10888 TEST_SVE(sve_int_wide_imm_unpredicated_umax) {
10889 int in_b[] = {0, 255, 127, 0x80, 1, 55};
10890 int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10891 int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10892 int64_t in_d[] = {1, 10, 10000, 1000000};
10893
10894 IntWideImmFn fn = &MacroAssembler::Umax;
10895
10896 int exp_b_1[] = {17, 255, 127, 0x80, 17, 55};
10897 int exp_h_1[] = {127, 255, 127, INT16_MAX, 127, 5555};
10898 int exp_s_1[] = {255, 255, 255, INT32_MAX, 255, 555555};
10899 int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10900
10901 IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10902 IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10903 IntWideImmHelper(config, fn, kSRegSize, in_s, 0xff, exp_s_1);
10904 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10905
10906 int exp_h_2[] = {511, 511, 511, INT16_MAX, 511, 5555};
10907 int exp_s_2[] = {2048, 2048, 2048, INT32_MAX, 2048, 555555};
10908 int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10909
10910 // The immediate is in the range [0, 255], but the macro is able to
10911 // synthesise unencodable immediates.
10912 // B-sized lanes cannot take an immediate out of the range [0, 255].
10913 IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10914 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10915 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10916 }
10917
10918 TEST_SVE(sve_int_wide_imm_unpredicated_umin) {
10919 int in_b[] = {0, 255, 127, 0x80, 1, 55};
10920 int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10921 int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10922 int64_t in_d[] = {1, 10, 10000, 1000000};
10923
10924 IntWideImmFn fn = &MacroAssembler::Umin;
10925
10926 int exp_b_1[] = {0, 17, 17, 17, 1, 17};
10927 int exp_h_1[] = {0, 127, 127, 127, 1, 127};
10928 int exp_s_1[] = {0, 255, 127, 255, 1, 255};
10929 int64_t exp_d_1[] = {1, 10, 99, 99};
10930
10931 IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10932 IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10933 IntWideImmHelper(config, fn, kSRegSize, in_s, 255, exp_s_1);
10934 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10935
10936 int exp_h_2[] = {0, 255, 127, 511, 1, 511};
10937 int exp_s_2[] = {0, 255, 127, 2048, 1, 2048};
10938 int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10939
10940 // The immediate is in the range [0, 255], but the macro is able to
10941 // synthesise unencodable immediates.
10942 // B-sized lanes cannot take an immediate out of the range [0, 255].
10943 IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10944 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10945 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10946 }
10947
10948 TEST_SVE(sve_int_wide_imm_unpredicated_mul) {
10949 int in_b[] = {11, -1, 7, -3};
10950 int in_h[] = {111, -1, 17, -123};
10951 int in_s[] = {11111, -1, 117, -12345};
10952 int64_t in_d[] = {0x7fffffff, 0x80000000};
10953
10954 IntWideImmFn fn = &MacroAssembler::Mul;
10955
10956 int exp_b_1[] = {66, -6, 42, -18};
10957 int exp_h_1[] = {-14208, 128, -2176, 15744};
10958 int exp_s_1[] = {11111 * 127, -127, 117 * 127, -12345 * 127};
10959 int64_t exp_d_1[] = {0xfffffffe, 0x100000000};
10960
10961 IntWideImmHelper(config, fn, kBRegSize, in_b, 6, exp_b_1);
10962 IntWideImmHelper(config, fn, kHRegSize, in_h, -128, exp_h_1);
10963 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
10964 IntWideImmHelper(config, fn, kDRegSize, in_d, 2, exp_d_1);
10965
10966 int exp_h_2[] = {-28305, 255, -4335, 31365};
10967 int exp_s_2[] = {22755328, -2048, 239616, -25282560};
10968 int64_t exp_d_2[] = {0x00000063ffffff38, 0x0000006400000000};
10969
10970 // The immediate is in the range [-128, 127], but the macro is able to
10971 // synthesise unencodable immediates.
10972   // B-sized lanes cannot take an immediate out of the range [-128, 127].
10973 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10974 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10975 IntWideImmHelper(config, fn, kDRegSize, in_d, 200, exp_d_2);
10976
10977 // Integer overflow on multiplication.
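  // For example, 11 * 127 = 1397 = 0x575, which truncates to 0x75 in a B lane,
  // and -1 * 127 = -127 = 0x81.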
10978 unsigned exp_b_3[] = {0x75, 0x81, 0x79, 0x83};
10979
10980 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x7f, exp_b_3);
10981 }
10982
10983 TEST_SVE(sve_int_wide_imm_unpredicated_add) {
10984 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
10985 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
10986 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
10987 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
10988
10989 IntWideImmFn fn = &MacroAssembler::Add;
10990
10991 unsigned exp_b_1[] = {0x02, 0x00, 0x91, 0x80};
10992 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
10993 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
10994 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
10995
10996 // Encodable with `add` (shift 0).
10997 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
10998 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
10999 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11000 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11001
11002 unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
11003 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11004 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11005
11006 // Encodable with `add` (shift 8).
11007 // B-sized lanes cannot take a shift of 8.
11008 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11009 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11010 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11011
11012 unsigned exp_s_3[] = {0x80808181, 0x807e7f7f, 0xab29aaaa, 0xf07ff0f0};
11013
11014 // The macro is able to synthesise unencodable immediates.
11015 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
11016
11017 unsigned exp_b_4[] = {0x61, 0x5f, 0xf0, 0xdf};
11018 unsigned exp_h_4[] = {0x6181, 0x5f7f, 0xf010, 0x8aaa};
11019 unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
11020 uint64_t exp_d_4[] = {0x8000000180018180, 0x7fffffff7fff7f7e};
11021
11022 // Negative immediates use `sub`.
11023 IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
11024 IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
11025 IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
11026 IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
11027 }
11028
11029 TEST_SVE(sve_int_wide_imm_unpredicated_sqadd) {
11030 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11031 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11032 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11033 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11034
11035 IntWideImmFn fn = &MacroAssembler::Sqadd;
11036
11037 unsigned exp_b_1[] = {0x02, 0x7f, 0x7f, 0x7f};
11038 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
11039 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
11040 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
11041
11042 // Encodable with `sqadd` (shift 0).
11043 // Note that encodable immediates are unsigned, even for signed saturation.
11044 IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
11045 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11046 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11047 IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
11048
11049 unsigned exp_h_2[] = {0x9181, 0x7fff, 0x2010, 0xbaaa};
11050 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11051 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11052
11053 // Encodable with `sqadd` (shift 8).
11054 // B-sized lanes cannot take a shift of 8.
11055 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11056 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11057 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11058 }
11059
11060 TEST_SVE(sve_int_wide_imm_unpredicated_uqadd) {
11061 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11062 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11063 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11064 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11065
11066 IntWideImmFn fn = &MacroAssembler::Uqadd;
11067
11068 unsigned exp_b_1[] = {0xff, 0xff, 0x91, 0xff};
11069 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
11070 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
11071 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
11072
11073 // Encodable with `uqadd` (shift 0).
11074 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11075 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11076 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11077 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11078
11079 unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
11080 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11081 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11082
11083 // Encodable with `uqadd` (shift 8).
11084 // B-sized lanes cannot take a shift of 8.
11085 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11086 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11087 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11088 }
11089
11090 TEST_SVE(sve_int_wide_imm_unpredicated_sub) {
11091 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11092 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11093 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11094 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11095
11096 IntWideImmFn fn = &MacroAssembler::Sub;
11097
11098 unsigned exp_b_1[] = {0x00, 0xfe, 0x8f, 0x7e};
11099 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11100 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11101 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11102
11103 // Encodable with `sub` (shift 0).
11104 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11105 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11106 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11107 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11108
11109 unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
11110 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11111 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11112
11113 // Encodable with `sub` (shift 8).
11114 // B-sized lanes cannot take a shift of 8.
11115 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11116 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11117 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11118
11119 unsigned exp_s_3[] = {0x7f828181, 0x7f807f7f, 0xaa2baaaa, 0xef81f0f0};
11120
11121 // The macro is able to synthesise unencodable immediates.
11122 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
11123
11124 unsigned exp_b_4[] = {0xa1, 0x9f, 0x30, 0x1f};
11125 unsigned exp_h_4[] = {0xa181, 0x9f7f, 0x3010, 0xcaaa};
11126 unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
11127 uint64_t exp_d_4[] = {0x8000000180018182, 0x7fffffff7fff7f80};
11128
11129 // Negative immediates use `add`.
11130 IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
11131 IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
11132 IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
11133 IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
11134 }
11135
11136 TEST_SVE(sve_int_wide_imm_unpredicated_sqsub) {
11137 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11138 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11139 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11140 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11141
11142 IntWideImmFn fn = &MacroAssembler::Sqsub;
11143
11144 unsigned exp_b_1[] = {0x80, 0xfe, 0x8f, 0x80};
11145 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11146 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11147 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11148
11149 // Encodable with `sqsub` (shift 0).
11150 // Note that encodable immediates are unsigned, even for signed saturation.
11151 IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
11152 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11153 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11154 IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
11155
11156 unsigned exp_h_2[] = {0x8000, 0x6f7f, 0x0010, 0x9aaa};
11157 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11158 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11159
11160 // Encodable with `sqsub` (shift 8).
11161 // B-sized lanes cannot take a shift of 8.
11162 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11163 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11164 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11165 }
11166
11167 TEST_SVE(sve_int_wide_imm_unpredicated_uqsub) {
11168 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11169 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11170 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11171 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11172
11173 IntWideImmFn fn = &MacroAssembler::Uqsub;
11174
11175 unsigned exp_b_1[] = {0x00, 0x00, 0x00, 0x7e};
11176 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11177 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11178 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11179
11180 // Encodable with `uqsub` (shift 0).
11181 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11182 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11183 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11184 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11185
11186 unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
11187 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11188 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11189
11190 // Encodable with `uqsub` (shift 8).
11191 // B-sized lanes cannot take a shift of 8.
11192 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11193 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11194 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11195 }
11196
11197 TEST_SVE(sve_int_wide_imm_unpredicated_subr) {
11198 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11199 START();
11200
11201 // Encodable with `subr` (shift 0).
11202 __ Index(z0.VnD(), 1, 1);
11203 __ Sub(z0.VnD(), 100, z0.VnD());
11204 __ Index(z1.VnS(), 0x7f, 1);
11205 __ Sub(z1.VnS(), 0xf7, z1.VnS());
11206 __ Index(z2.VnH(), 0xaaaa, 0x2222);
11207 __ Sub(z2.VnH(), 0x80, z2.VnH());
11208 __ Index(z3.VnB(), 133, 1);
11209 __ Sub(z3.VnB(), 255, z3.VnB());
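  // For example, z0 = 100 - Index(1, 1), so lane 0 holds 100 - 1 = 99.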
11210
11211 // Encodable with `subr` (shift 8).
11212 __ Index(z4.VnD(), 256, -1);
11213 __ Sub(z4.VnD(), 42 * 256, z4.VnD());
11214 __ Index(z5.VnS(), 0x7878, 1);
11215 __ Sub(z5.VnS(), 0x8000, z5.VnS());
11216 __ Index(z6.VnH(), 0x30f0, -1);
11217 __ Sub(z6.VnH(), 0x7f00, z6.VnH());
11218 // B-sized lanes cannot take a shift of 8.
11219
11220   // The destination is distinct from the source, so the macro uses movprfx.
11221 __ Index(z31.VnD(), 256, 4001);
11222 __ Sub(z7.VnD(), 42 * 256, z31.VnD());
11223
11224   // The immediate is outside the encodable range of `subr`.
11225 __ Index(z30.VnS(), 0x11223344, 1);
11226 __ Sub(z8.VnS(), 0x88776655, z30.VnS());
11227
11228 END();
11229
11230 if (CAN_RUN()) {
11231 RUN();
11232
11233 int expected_z0[] = {87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
11234 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
11235
11236 int expected_z1[] = {0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78};
11237 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
11238
11239 int expected_z2[] = {0xab2c, 0xcd4e, 0xef70, 0x1192, 0x33b4, 0x55d6};
11240 ASSERT_EQUAL_SVE(expected_z2, z2.VnH());
11241
11242 int expected_z3[] = {0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a};
11243 ASSERT_EQUAL_SVE(expected_z3, z3.VnB());
11244
11245 int expected_z4[] = {10502, 10501, 10500, 10499, 10498, 10497, 10496};
11246 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
11247
11248 int expected_z5[] = {0x0783, 0x0784, 0x0785, 0x0786, 0x0787, 0x0788};
11249 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
11250
11251 int expected_z6[] = {0x4e15, 0x4e14, 0x4e13, 0x4e12, 0x4e11, 0x4e10};
11252 ASSERT_EQUAL_SVE(expected_z6, z6.VnH());
11253
11254 int expected_z7[] = {-13510, -9509, -5508, -1507, 2494, 6495, 10496};
11255 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
11256
11257 int expected_z8[] = {0x7755330e, 0x7755330f, 0x77553310, 0x77553311};
11258 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
11259 }
11260 }
11261
11262 TEST_SVE(sve_int_wide_imm_unpredicated_fdup) {
11263 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11264 START();
11265
11266 // Immediates which can be encoded in the instructions.
11267 __ Fdup(z0.VnH(), RawbitsToFloat16(0xc500));
11268 __ Fdup(z1.VnS(), Float16(2.0));
11269 __ Fdup(z2.VnD(), Float16(3.875));
11270 __ Fdup(z3.VnH(), 8.0f);
11271 __ Fdup(z4.VnS(), -4.75f);
11272 __ Fdup(z5.VnD(), 0.5f);
11273 __ Fdup(z6.VnH(), 1.0);
11274 __ Fdup(z7.VnS(), 2.125);
11275 __ Fdup(z8.VnD(), -13.0);
11276
11277 // Immediates which cannot be encoded in the instructions.
11278 __ Fdup(z10.VnH(), Float16(0.0));
11279 __ Fdup(z11.VnH(), kFP16PositiveInfinity);
11280 __ Fdup(z12.VnS(), 255.0f);
11281 __ Fdup(z13.VnS(), kFP32NegativeInfinity);
11282 __ Fdup(z14.VnD(), 12.3456);
11283 __ Fdup(z15.VnD(), kFP64PositiveInfinity);
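  // (For these, the macro presumably falls back to duplicating the raw bit
  // pattern, e.g. via `Dup`, rather than using the encodable `fdup` form.)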
11284
11285 END();
11286
11287 if (CAN_RUN()) {
11288 RUN();
11289
11290 ASSERT_EQUAL_SVE(0xc500, z0.VnH());
11291 ASSERT_EQUAL_SVE(0x40000000, z1.VnS());
11292 ASSERT_EQUAL_SVE(0x400f000000000000, z2.VnD());
11293 ASSERT_EQUAL_SVE(0x4800, z3.VnH());
11294 ASSERT_EQUAL_SVE(FloatToRawbits(-4.75f), z4.VnS());
11295 ASSERT_EQUAL_SVE(DoubleToRawbits(0.5), z5.VnD());
11296 ASSERT_EQUAL_SVE(0x3c00, z6.VnH());
11297 ASSERT_EQUAL_SVE(FloatToRawbits(2.125f), z7.VnS());
11298 ASSERT_EQUAL_SVE(DoubleToRawbits(-13.0), z8.VnD());
11299
11300 ASSERT_EQUAL_SVE(0x0000, z10.VnH());
11301 ASSERT_EQUAL_SVE(Float16ToRawbits(kFP16PositiveInfinity), z11.VnH());
11302 ASSERT_EQUAL_SVE(FloatToRawbits(255.0), z12.VnS());
11303 ASSERT_EQUAL_SVE(FloatToRawbits(kFP32NegativeInfinity), z13.VnS());
11304 ASSERT_EQUAL_SVE(DoubleToRawbits(12.3456), z14.VnD());
11305 ASSERT_EQUAL_SVE(DoubleToRawbits(kFP64PositiveInfinity), z15.VnD());
11306 }
11307 }
11308
11309 TEST_SVE(sve_andv_eorv_orv) {
11310 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11311 START();
11312
11313 uint64_t in[] = {0x8899aabbccddeeff, 0x7777555533331111, 0x123456789abcdef0};
11314 InsrHelper(&masm, z31.VnD(), in);
11315
11316 // For simplicity, we re-use the same pg for various lane sizes.
11317 // For D lanes: 1, 1, 0
11318 // For S lanes: 1, 1, 1, 0, 0
11319 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
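  // Only the lowest predicate bit of each lane-sized group governs that lane,
  // which is how the single byte-granular pattern yields the per-size patterns
  // listed above.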
11320 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
11321 Initialise(&masm, p0.VnB(), pg_in);
11322
11323 // Make a copy so we can check that constructive operations preserve zn.
11324 __ Mov(z0, z31);
11325 __ Andv(b0, p0, z0.VnB()); // destructive
11326 __ Andv(h1, p0, z31.VnH());
11327 __ Mov(z2, z31);
11328 __ Andv(s2, p0, z2.VnS()); // destructive
11329 __ Andv(d3, p0, z31.VnD());
11330
11331 __ Eorv(b4, p0, z31.VnB());
11332 __ Mov(z5, z31);
11333 __ Eorv(h5, p0, z5.VnH()); // destructive
11334 __ Eorv(s6, p0, z31.VnS());
11335 __ Mov(z7, z31);
11336 __ Eorv(d7, p0, z7.VnD()); // destructive
11337
11338 __ Mov(z8, z31);
11339 __ Orv(b8, p0, z8.VnB()); // destructive
11340 __ Orv(h9, p0, z31.VnH());
11341 __ Mov(z10, z31);
11342 __ Orv(s10, p0, z10.VnS()); // destructive
11343 __ Orv(d11, p0, z31.VnD());
11344
11345 END();
11346
11347 if (CAN_RUN()) {
11348 RUN();
11349
11350 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11351 ASSERT_EQUAL_64(0x10, d0);
11352 ASSERT_EQUAL_64(0x1010, d1);
11353 ASSERT_EQUAL_64(0x33331111, d2);
11354 ASSERT_EQUAL_64(0x7777555533331111, d3);
11355 ASSERT_EQUAL_64(0xbf, d4);
11356 ASSERT_EQUAL_64(0xedcb, d5);
11357 ASSERT_EQUAL_64(0x44444444, d6);
11358 ASSERT_EQUAL_64(0x7777555533331111, d7);
11359 ASSERT_EQUAL_64(0xff, d8);
11360 ASSERT_EQUAL_64(0xffff, d9);
11361 ASSERT_EQUAL_64(0x77775555, d10);
11362 ASSERT_EQUAL_64(0x7777555533331111, d11);
11363 } else {
11364 ASSERT_EQUAL_64(0, d0);
11365 ASSERT_EQUAL_64(0x0010, d1);
11366 ASSERT_EQUAL_64(0x00110011, d2);
11367 ASSERT_EQUAL_64(0x0011001100110011, d3);
11368 ASSERT_EQUAL_64(0x62, d4);
11369 ASSERT_EQUAL_64(0x0334, d5);
11370 ASSERT_EQUAL_64(0x8899aabb, d6);
11371 ASSERT_EQUAL_64(0xffeeffeeffeeffee, d7);
11372 ASSERT_EQUAL_64(0xff, d8);
11373 ASSERT_EQUAL_64(0xffff, d9);
11374 ASSERT_EQUAL_64(0xffffffff, d10);
11375 ASSERT_EQUAL_64(0xffffffffffffffff, d11);
11376 }
11377
11378 // Check the upper lanes above the top of the V register are all clear.
11379 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11380 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11381 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11382 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11383 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11384 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11385 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11386 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11387 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11388 ASSERT_EQUAL_SVE_LANE(0, z8.VnD(), i);
11389 ASSERT_EQUAL_SVE_LANE(0, z9.VnD(), i);
11390 ASSERT_EQUAL_SVE_LANE(0, z10.VnD(), i);
11391 ASSERT_EQUAL_SVE_LANE(0, z11.VnD(), i);
11392 }
11393 }
11394 }
11395
11396
11397 TEST_SVE(sve_saddv_uaddv) {
11398 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11399 START();
11400
11401 uint64_t in[] = {0x8899aabbccddeeff, 0x8182838485868788, 0x0807060504030201};
11402 InsrHelper(&masm, z31.VnD(), in);
11403
11404 // For simplicity, we re-use the same pg for various lane sizes.
11405 // For D lanes: 1, 1, 0
11406 // For S lanes: 1, 1, 1, 0, 0
11407 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
11408 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
11409 Initialise(&masm, p0.VnB(), pg_in);
11410
11411 // Make a copy so we can check that constructive operations preserve zn.
11412 __ Mov(z0, z31);
11413 __ Saddv(b0, p0, z0.VnB()); // destructive
11414 __ Saddv(h1, p0, z31.VnH());
11415 __ Mov(z2, z31);
11416 __ Saddv(s2, p0, z2.VnS()); // destructive
11417
11418 __ Uaddv(b4, p0, z31.VnB());
11419 __ Mov(z5, z31);
11420 __ Uaddv(h5, p0, z5.VnH()); // destructive
11421 __ Uaddv(s6, p0, z31.VnS());
11422 __ Mov(z7, z31);
11423 __ Uaddv(d7, p0, z7.VnD()); // destructive
11424
11425 END();
11426
11427 if (CAN_RUN()) {
11428 RUN();
11429
11430 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11431 // Saddv
11432 ASSERT_EQUAL_64(0xfffffffffffffda9, d0);
11433 ASSERT_EQUAL_64(0xfffffffffffe9495, d1);
11434 ASSERT_EQUAL_64(0xffffffff07090b0c, d2);
11435 // Uaddv
11436 ASSERT_EQUAL_64(0x00000000000002a9, d4);
11437 ASSERT_EQUAL_64(0x0000000000019495, d5);
11438 ASSERT_EQUAL_64(0x0000000107090b0c, d6);
11439 ASSERT_EQUAL_64(0x8182838485868788, d7);
11440 } else {
11441 // Saddv
11442 ASSERT_EQUAL_64(0xfffffffffffffd62, d0);
11443 ASSERT_EQUAL_64(0xfffffffffffe8394, d1);
11444 ASSERT_EQUAL_64(0xfffffffed3e6fa0b, d2);
11445 // Uaddv
11446 ASSERT_EQUAL_64(0x0000000000000562, d4);
11447 ASSERT_EQUAL_64(0x0000000000028394, d5);
11448 ASSERT_EQUAL_64(0x00000001d3e6fa0b, d6);
11449 ASSERT_EQUAL_64(0x0a1c2e4052647687, d7);
11450 }
11451
11452 // Check the upper lanes above the top of the V register are all clear.
11453 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11454 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11455 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11456 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11457 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11458 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11459 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11460 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11461 }
11462 }
11463 }
11464
11465
11466 TEST_SVE(sve_sminv_uminv) {
11467 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11468 START();
11469
11470 uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
11471 InsrHelper(&masm, z31.VnD(), in);
11472
11473 // For simplicity, we re-use the same pg for various lane sizes.
11474 // For D lanes: 1, 0, 1
11475 // For S lanes: 1, 1, 0, 0, 1
11476 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
11477 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
11478 Initialise(&masm, p0.VnB(), pg_in);
11479
11480 // Make a copy so we can check that constructive operations preserve zn.
11481 __ Mov(z0, z31);
11482 __ Sminv(b0, p0, z0.VnB()); // destructive
11483 __ Sminv(h1, p0, z31.VnH());
11484 __ Mov(z2, z31);
11485 __ Sminv(s2, p0, z2.VnS()); // destructive
11486 __ Sminv(d3, p0, z31.VnD());
11487
11488 __ Uminv(b4, p0, z31.VnB());
11489 __ Mov(z5, z31);
11490 __ Uminv(h5, p0, z5.VnH()); // destructive
11491 __ Uminv(s6, p0, z31.VnS());
11492 __ Mov(z7, z31);
11493 __ Uminv(d7, p0, z7.VnD()); // destructive
11494
11495 END();
11496
11497 if (CAN_RUN()) {
11498 RUN();
11499
11500 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11501 // Sminv
11502 ASSERT_EQUAL_64(0xaa, d0);
11503 ASSERT_EQUAL_64(0xaabb, d1);
11504 ASSERT_EQUAL_64(0xaabbfc00, d2);
11505 ASSERT_EQUAL_64(0x00112233aabbfc00, d3); // The smaller lane is inactive.
11506 // Uminv
11507 ASSERT_EQUAL_64(0, d4);
11508 ASSERT_EQUAL_64(0x2233, d5);
11509 ASSERT_EQUAL_64(0x112233, d6);
11510 ASSERT_EQUAL_64(0x00112233aabbfc00, d7); // The smaller lane is inactive.
11511 } else {
11512 // Sminv
11513 ASSERT_EQUAL_64(0xaa, d0);
11514 ASSERT_EQUAL_64(0xaaaa, d1);
11515 ASSERT_EQUAL_64(0xaaaaaaaa, d2);
11516 ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d3);
11517 // Uminv
11518 ASSERT_EQUAL_64(0, d4);
11519 ASSERT_EQUAL_64(0x2233, d5);
11520 ASSERT_EQUAL_64(0x112233, d6);
11521 ASSERT_EQUAL_64(0x00112233aabbfc00, d7);
11522 }
11523
11524 // Check the upper lanes above the top of the V register are all clear.
11525 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11526 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11527 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11528 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11529 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11530 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11531 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11532 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11533 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11534 }
11535 }
11536 }
11537
11538 TEST_SVE(sve_smaxv_umaxv) {
11539 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11540 START();
11541
11542 uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
11543 InsrHelper(&masm, z31.VnD(), in);
11544
11545 // For simplicity, we re-use the same pg for various lane sizes.
11546 // For D lanes: 1, 0, 1
11547 // For S lanes: 1, 1, 0, 0, 1
11548 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
11549 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
11550 Initialise(&masm, p0.VnB(), pg_in);
11551
11552 // Make a copy so we can check that constructive operations preserve zn.
11553 __ Mov(z0, z31);
11554 __ Smaxv(b0, p0, z0.VnB()); // destructive
11555 __ Smaxv(h1, p0, z31.VnH());
11556 __ Mov(z2, z31);
11557 __ Smaxv(s2, p0, z2.VnS()); // destructive
11558 __ Smaxv(d3, p0, z31.VnD());
11559
11560 __ Umaxv(b4, p0, z31.VnB());
11561 __ Mov(z5, z31);
11562 __ Umaxv(h5, p0, z5.VnH()); // destructive
11563 __ Umaxv(s6, p0, z31.VnS());
11564 __ Mov(z7, z31);
11565 __ Umaxv(d7, p0, z7.VnD()); // destructive
11566
11567 END();
11568
11569 if (CAN_RUN()) {
11570 RUN();
11571
11572 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11573 // Smaxv
11574 ASSERT_EQUAL_64(0x33, d0);
11575 ASSERT_EQUAL_64(0x44aa, d1);
11576 ASSERT_EQUAL_64(0x112233, d2);
11577 ASSERT_EQUAL_64(0x112233aabbfc00, d3);
11578 // Umaxv
11579 ASSERT_EQUAL_64(0xfe, d4);
11580 ASSERT_EQUAL_64(0xfc00, d5);
11581 ASSERT_EQUAL_64(0xaabbfc00, d6);
11582 ASSERT_EQUAL_64(0x112233aabbfc00, d7);
11583 } else {
11584 // Smaxv
11585 ASSERT_EQUAL_64(0x33, d0);
11586 ASSERT_EQUAL_64(0x44aa, d1);
11587 ASSERT_EQUAL_64(0x112233, d2);
11588 ASSERT_EQUAL_64(0x00112233aabbfc00, d3);
11589 // Umaxv
11590 ASSERT_EQUAL_64(0xfe, d4);
11591 ASSERT_EQUAL_64(0xfc00, d5);
11592 ASSERT_EQUAL_64(0xaabbfc00, d6);
11593 ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d7);
11594 }
11595
11596 // Check the upper lanes above the top of the V register are all clear.
11597 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11598 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11599 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11600 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11601 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11602 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11603 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11604 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11605 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11606 }
11607 }
11608 }
11609
11610 template <typename T, size_t M, size_t N>
11611 static void SdotUdotHelper(Test* config,
11612 unsigned lane_size_in_bits,
11613 const T (&zd_inputs)[M],
11614 const T (&za_inputs)[M],
11615 const T (&zn_inputs)[N],
11616 const T (&zm_inputs)[N],
11617 const T (&zd_expected)[M],
11618 const T (&zdnm_expected)[M],
11619 bool is_signed,
11620 int index = -1) {
11621 VIXL_STATIC_ASSERT(N == (M * 4));
11622 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11623 START();
11624
11625 auto dot_fn = [&](const ZRegister& zd,
11626 const ZRegister& za,
11627 const ZRegister& zn,
11628 const ZRegister& zm,
11629 bool is_signed_fn,
11630 int index_fn) {
11631 if (is_signed_fn) {
11632 if (index_fn < 0) {
11633 __ Sdot(zd, za, zn, zm);
11634 } else {
11635 __ Sdot(zd, za, zn, zm, index_fn);
11636 }
11637 } else {
11638 if (index_fn < 0) {
11639 __ Udot(zd, za, zn, zm);
11640 } else {
11641 __ Udot(zd, za, zn, zm, index_fn);
11642 }
11643 }
11644 };
11645
11646 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
11647 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
11648 ZRegister zn = z2.WithLaneSize(lane_size_in_bits / 4);
11649 ZRegister zm = z3.WithLaneSize(lane_size_in_bits / 4);
11650
11651 InsrHelper(&masm, zd, zd_inputs);
11652 InsrHelper(&masm, za, za_inputs);
11653 InsrHelper(&masm, zn, zn_inputs);
11654 InsrHelper(&masm, zm, zm_inputs);
11655
11656 // The Dot macro handles arbitrarily-aliased registers in the argument list.
11657 ZRegister dm_result = z4.WithLaneSize(lane_size_in_bits);
11658 ZRegister dnm_result = z5.WithLaneSize(lane_size_in_bits);
11659 ZRegister da_result = z6.WithLaneSize(lane_size_in_bits);
11660 ZRegister dn_result = z7.WithLaneSize(lane_size_in_bits);
11661 ZRegister d_result = z8.WithLaneSize(lane_size_in_bits);
11662
11663 __ Mov(da_result, za);
11664 // zda = zda + (zn . zm)
11665 dot_fn(da_result, da_result, zn, zm, is_signed, index);
11666
11667 __ Mov(dn_result, zn.WithSameLaneSizeAs(dn_result));
11668 // zdn = za + (zdn . zm)
11669 dot_fn(dn_result, za, dn_result.WithSameLaneSizeAs(zn), zm, is_signed, index);
11670
11671 __ Mov(dm_result, zm.WithSameLaneSizeAs(dm_result));
11672 // zdm = za + (zn . zdm)
11673 dot_fn(dm_result, za, zn, dm_result.WithSameLaneSizeAs(zm), is_signed, index);
11674
11675 __ Mov(d_result, zd);
11676 // zd = za + (zn . zm)
11677 dot_fn(d_result, za, zn, zm, is_signed, index);
11678
11679 __ Mov(dnm_result, zn.WithSameLaneSizeAs(dnm_result));
11680   // zdnm = za + (zdnm . zdnm)
11681 dot_fn(dnm_result,
11682 za,
11683 dnm_result.WithSameLaneSizeAs(zn),
11684 dnm_result.WithSameLaneSizeAs(zm),
11685 is_signed,
11686 index);
11687
11688 END();
11689
11690 if (CAN_RUN()) {
11691 RUN();
11692
11693 ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
11694 ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits / 4));
11695 ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits / 4));
11696
11697 ASSERT_EQUAL_SVE(zd_expected, da_result);
11698 ASSERT_EQUAL_SVE(zd_expected, dn_result);
11699 ASSERT_EQUAL_SVE(zd_expected, dm_result);
11700 ASSERT_EQUAL_SVE(zd_expected, d_result);
11701
11702 ASSERT_EQUAL_SVE(zdnm_expected, dnm_result);
11703 }
11704 }
11705
11706 TEST_SVE(sve_sdot) {
11707 int64_t zd_inputs[] = {0x33, 0xee, 0xff};
11708 int64_t za_inputs[] = {INT32_MAX, -3, 2};
11709 int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
11710 int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
11711
11712 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11713 int64_t zd_expected_s[] = {-2147418113, -183, 133}; // 0x8000ffff
11714 int64_t zd_expected_d[] = {2147549183, -183, 133}; // 0x8000ffff
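  // For example, in the highest lane shown the dot product is
  // 4 * (-128 * -128) = 0x10000, so INT32_MAX + 0x10000 wraps to 0x8000ffff
  // (-2147418113) in an S lane but is 2147549183 in a D lane.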
11715
11716 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11717 int64_t zdnm_expected_s[] = {-2147418113, 980, 572};
11718 int64_t zdnm_expected_d[] = {2147549183, 980, 572};
11719
11720 SdotUdotHelper(config,
11721 kSRegSize,
11722 zd_inputs,
11723 za_inputs,
11724 zn_inputs,
11725 zm_inputs,
11726 zd_expected_s,
11727 zdnm_expected_s,
11728 true);
11729
11730 SdotUdotHelper(config,
11731 kDRegSize,
11732 zd_inputs,
11733 za_inputs,
11734 zn_inputs,
11735 zm_inputs,
11736 zd_expected_d,
11737 zdnm_expected_d,
11738 true);
11739 }
11740
11741 TEST_SVE(sve_udot) {
11742 int64_t zd_inputs[] = {0x33, 0xee, 0xff};
11743 int64_t za_inputs[] = {INT32_MAX, -3, 2};
11744 int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
11745 int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
11746
11747 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11748 int64_t zd_expected_s[] = {0x8000ffff, 0x00001749, 0x0000f085};
11749 int64_t zd_expected_d[] = {0x000000047c00ffff,
11750 0x000000000017ff49,
11751 0x00000000fff00085};
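  // With D-sized lanes the sources are H-sized, so -128 is read as 0xff80 and,
  // for example, the highest lane is INT32_MAX + 4 * (0xff80 * 0xff80), which
  // is 0x47c00ffff; with S-sized lanes the B-sized sources give 0x8000ffff.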
11752
11753 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11754 int64_t zdnm_expected_s[] = {0x8000ffff, 0x000101d4, 0x0001d03c};
11755 int64_t zdnm_expected_d[] = {0x000000047c00ffff,
11756 0x00000000fffe03d4,
11757 0x00000001ffce023c};
11758
11759 SdotUdotHelper(config,
11760 kSRegSize,
11761 zd_inputs,
11762 za_inputs,
11763 zn_inputs,
11764 zm_inputs,
11765 zd_expected_s,
11766 zdnm_expected_s,
11767 false);
11768
11769 SdotUdotHelper(config,
11770 kDRegSize,
11771 zd_inputs,
11772 za_inputs,
11773 zn_inputs,
11774 zm_inputs,
11775 zd_expected_d,
11776 zdnm_expected_d,
11777 false);
11778 }
11779
11780 TEST_SVE(sve_sdot_indexed_s) {
11781 int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
11782 int64_t za_inputs[] = {0, 1, 2, 3};
11783 int64_t zn_inputs[] =
11784 {-1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4};
11785 int64_t zm_inputs[] =
11786 {127, 127, 127, 127, -128, -128, -128, -128, -1, -1, -1, -1, 0, 0, 0, 0};
11787
11788 constexpr int s = kQRegSize / kSRegSize;
11789
11790 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11791 int64_t zd_expected_s[][s] = {{0, 1, 2, 3}, // Generated from zm[0]
11792 {4, 9, 14, 19},
11793 {512, 1025, 1538, 2051},
11794 {-508, -1015, -1522, -2029}};
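  // For example, with index 1 the selected zm group is {-1, -1, -1, -1}, so
  // lane 0 (za = 3, zn = {-4, -4, -4, -4}) gets 3 + 4 * (-4 * -1) = 19.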
11795
11796 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11797 int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
11798 {12, 25, 38, 51},
11799 {8, 17, 26, 35},
11800 {4, 9, 14, 19}};
11801
11802 for (unsigned i = 0; i < s; i++) {
11803 SdotUdotHelper(config,
11804 kSRegSize,
11805 zd_inputs,
11806 za_inputs,
11807 zn_inputs,
11808 zm_inputs,
11809 zd_expected_s[i],
11810 zdnm_expected_s[i],
11811 true,
11812 i);
11813 }
11814 }
11815
11816 TEST_SVE(sve_sdot_indexed_d) {
11817 int64_t zd_inputs[] = {0xff, 0xff};
11818 int64_t za_inputs[] = {0, 1};
11819 int64_t zn_inputs[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11820 int64_t zm_inputs[] = {-128, -128, -128, -128, 127, 127, 127, 127};
11821
11822 constexpr int d = kQRegSize / kDRegSize;
11823
11824 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11825 int64_t zd_expected_d[][d] = {{-508, -507}, // Generated from zm[0]
11826 {512, 513}};
11827
11828 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11829 int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
11830
11831 for (unsigned i = 0; i < d; i++) {
11832 SdotUdotHelper(config,
11833 kDRegSize,
11834 zd_inputs,
11835 za_inputs,
11836 zn_inputs,
11837 zm_inputs,
11838 zd_expected_d[i],
11839 zdnm_expected_d[i],
11840 true,
11841 i);
11842 }
11843 }
11844
11845 TEST_SVE(sve_udot_indexed_s) {
11846 int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
11847 int64_t za_inputs[] = {0, 1, 2, 3};
11848 int64_t zn_inputs[] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4};
11849 int64_t zm_inputs[] =
11850 {127, 127, 127, 127, 255, 255, 255, 255, 1, 1, 1, 1, 0, 0, 0, 0};
11851
11852 constexpr int s = kQRegSize / kSRegSize;
11853
11854 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11855 int64_t zd_expected_s[][s] = {{0, 1, 2, 3},
11856 {4, 9, 14, 19},
11857 {1020, 2041, 3062, 4083},
11858 {508, 1017, 1526, 2035}};
11859
11860 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11861 int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
11862 {12, 25, 38, 51},
11863 {8, 17, 26, 35},
11864 {4, 9, 14, 19}};
11865
11866 for (unsigned i = 0; i < s; i++) {
11867 SdotUdotHelper(config,
11868 kSRegSize,
11869 zd_inputs,
11870 za_inputs,
11871 zn_inputs,
11872 zm_inputs,
11873 zd_expected_s[i],
11874 zdnm_expected_s[i],
11875 false,
11876 i);
11877 }
11878 }
11879
11880 TEST_SVE(sve_udot_indexed_d) {
11881 int64_t zd_inputs[] = {0xff, 0xff};
11882 int64_t za_inputs[] = {0, 1};
11883 int64_t zn_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1};
11884 int64_t zm_inputs[] = {255, 255, 255, 255, 127, 127, 127, 127};
11885
11886 constexpr int d = kQRegSize / kDRegSize;
11887
11888 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11889 int64_t zd_expected_d[][d] = {{508, 509}, {1020, 1021}};
11890
11891 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11892 int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
11893
11894 for (unsigned i = 0; i < d; i++) {
11895 SdotUdotHelper(config,
11896 kDRegSize,
11897 zd_inputs,
11898 za_inputs,
11899 zn_inputs,
11900 zm_inputs,
11901 zd_expected_d[i],
11902 zdnm_expected_d[i],
11903 false,
11904 i);
11905 }
11906 }
11907
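// Add the index of the 128-bit segment to each lane of `src`: lanes in the
// first segment are unchanged, lanes in the second segment are incremented by
// one, and so on. This makes per-segment patterns easy to construct below.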
11908 static void IntSegmentPatternHelper(MacroAssembler* masm,
11909 const ZRegister& dst,
11910 const ZRegister& src) {
11911 VIXL_ASSERT(AreSameLaneSize(dst, src));
11912 UseScratchRegisterScope temps(masm);
11913 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
11914 masm->Index(ztmp, 0, 1);
11915 masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
11916 masm->Add(dst, src, ztmp);
11917 }
11918
11919 TEST_SVE(sve_sdot_udot_indexed_s) {
11920 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11921 START();
11922
11923 const int multiplier = 2;
11924 __ Dup(z9.VnS(), multiplier);
11925
11926 __ Ptrue(p0.VnB());
11927 __ Index(z29.VnS(), 4, 1);
11928
11929 // z29 = [... 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]
11930 __ And(z29.VnS(), z29.VnS(), 3);
11931
11932 // p7 = [... 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]
11933 __ Cmple(p7.VnS(), p0.Zeroing(), z29.VnS(), 0);
11934
11935 // p6 = [... 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
11936 __ Cmple(p6.VnS(), p0.Zeroing(), z29.VnS(), 1);
11937
11938 // p5 = [... 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]
11939 __ Cmple(p5.VnS(), p0.Zeroing(), z29.VnS(), 2);
11940
11941 __ Index(z28.VnB(), 1, 1);
11942 __ Dup(z27.VnS(), z28.VnS(), 0);
11943
11944 // z27 = [... 3, 2, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1]
11945 IntSegmentPatternHelper(&masm, z27.VnB(), z27.VnB());
11946
11947 // z27 = [... 6, 4, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2]
11948 __ Mul(z27.VnS(), p7.Merging(), z27.VnS(), z9.VnS());
11949
11950 // z27 = [... 12, 8, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4]
11951 __ Mul(z27.VnS(), p6.Merging(), z27.VnS(), z9.VnS());
11952
11953 // 2nd segment | 1st segment |
11954 // v v
11955 // z27 = [... 24, 16, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4, 32, 24, 16, 8]
11956 __ Mul(z27.VnS(), p5.Merging(), z27.VnS(), z9.VnS());
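  // Within each 128-bit segment, the S-lane group selected by index i is
  // 2^(3-i) times the base byte pattern, so scaling the index-i dot product by
  // 2^i below should make all four results identical.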
11957
11958 __ Dup(z0.VnS(), 0);
11959 __ Dup(z1.VnS(), 0);
11960 __ Dup(z2.VnS(), 0);
11961 __ Dup(z3.VnS(), 0);
11962 __ Dup(z4.VnS(), 0);
11963 __ Dup(z5.VnS(), 0);
11964
11965   // Skip lanes from the 129th onwards, since their values overflow after the
11966   // number sequence is created by `index`.
11967 __ Cmpls(p3.VnB(), p0.Zeroing(), z28.VnB(), 128);
11968 __ Mov(z0.VnB(), p3.Merging(), z27.VnB());
11969 __ Mov(z1.VnB(), p3.Merging(), z28.VnB());
11970
11971 __ Dup(z2.VnS(), 0);
11972 __ Dup(z3.VnS(), 0);
11973 __ Dup(z4.VnS(), 0);
11974 __ Dup(z5.VnS(), 0);
11975
11976 __ Udot(z2.VnS(), z2.VnS(), z1.VnB(), z0.VnB(), 0);
11977
11978 __ Udot(z3.VnS(), z3.VnS(), z1.VnB(), z0.VnB(), 1);
11979 __ Mul(z3.VnS(), z3.VnS(), 2);
11980
11981 __ Udot(z4.VnS(), z4.VnS(), z1.VnB(), z0.VnB(), 2);
11982 __ Mul(z4.VnS(), z4.VnS(), 4);
11983
11984 __ Udot(z5.VnS(), z5.VnS(), z1.VnB(), z0.VnB(), 3);
11985 __ Mul(z5.VnS(), z5.VnS(), 8);
11986
11987 __ Dup(z7.VnS(), 0);
11988 __ Dup(z8.VnS(), 0);
11989 __ Dup(z9.VnS(), 0);
11990 __ Dup(z10.VnS(), 0);
11991
11992   // Negate the all-positive vector to test the signed dot product.
11993 __ Neg(z6.VnB(), p0.Merging(), z0.VnB());
11994 __ Sdot(z7.VnS(), z7.VnS(), z1.VnB(), z6.VnB(), 0);
11995
11996 __ Sdot(z8.VnS(), z8.VnS(), z1.VnB(), z6.VnB(), 1);
11997 __ Mul(z8.VnS(), z8.VnS(), 2);
11998
11999 __ Sdot(z9.VnS(), z9.VnS(), z1.VnB(), z6.VnB(), 2);
12000 __ Mul(z9.VnS(), z9.VnS(), 4);
12001
12002 __ Sdot(z10.VnS(), z10.VnS(), z1.VnB(), z6.VnB(), 3);
12003 __ Mul(z10.VnS(), z10.VnS(), 8);
12004
12005 END();
12006
12007 if (CAN_RUN()) {
12008 RUN();
12009
12010     // Only the first 128-bit segment of the destination register is compared
12011     // directly; the other generated results cross-check the remaining segments.
12012 // s_lane[0] = (1 * 8) + (2 * 16) + (3 * 24) + (4 * 32) = 240
12013 // ...
12014 // s_lane[3] = (13 * 8) + (14 * 16) + (15 * 24) + (16 * 32) = 1200
12015 int udot_expected[] = {1200, 880, 560, 240};
12016 ASSERT_EQUAL_SVE(udot_expected, z2.VnS());
12017 ASSERT_EQUAL_SVE(z2.VnS(), z3.VnS());
12018 ASSERT_EQUAL_SVE(z2.VnS(), z4.VnS());
12019 ASSERT_EQUAL_SVE(z2.VnS(), z5.VnS());
12020
12021 int sdot_expected[] = {-1200, -880, -560, -240};
12022 ASSERT_EQUAL_SVE(sdot_expected, z7.VnS());
12023 ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
12024 ASSERT_EQUAL_SVE(z7.VnS(), z9.VnS());
12025 ASSERT_EQUAL_SVE(z7.VnS(), z10.VnS());
12026 }
12027 }
12028
12029 TEST_SVE(sve_sdot_udot_indexed_d) {
12030 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12031 START();
12032
12033 const int multiplier = 2;
12034 __ Dup(z9.VnD(), multiplier);
12035
12036 __ Ptrue(p0.VnD());
12037 __ Pfalse(p1.VnD());
12038
12039 // p2 = [..., 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
12040 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
12041
12042 __ Index(z1.VnH(), 1, 1);
12043 __ Dup(z0.VnD(), z1.VnD(), 0);
12044
12045 // z0 = [... 5, 4, 3, 2, 5, 4, 3, 2, 4, 3, 2, 1, 4, 3, 2, 1]
12046 IntSegmentPatternHelper(&masm, z0.VnH(), z0.VnH());
12047
12048 // 2nd segment | 1st segment |
12049 // v v
12050 // z0 = [... 5, 4, 3, 2, 10, 8, 6, 4, 4, 3, 2, 1, 8, 6, 4, 2]
12051 __ Mul(z0.VnD(), p2.Merging(), z0.VnD(), z9.VnD());
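  // The H-lane group selected by index 0 is twice the group selected by
  // index 1, so multiplying the index-1 results by `multiplier` below should
  // reproduce the index-0 results.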
12052
12053 __ Dup(z3.VnD(), 0);
12054 __ Dup(z4.VnD(), 0);
12055
12056 __ Udot(z3.VnD(), z3.VnD(), z1.VnH(), z0.VnH(), 0);
12057
12058 __ Udot(z4.VnD(), z4.VnD(), z1.VnH(), z0.VnH(), 1);
12059 __ Mul(z4.VnD(), z4.VnD(), multiplier);
12060
12061 __ Dup(z12.VnD(), 0);
12062 __ Dup(z13.VnD(), 0);
12063
12064 __ Ptrue(p4.VnH());
12065 __ Neg(z10.VnH(), p4.Merging(), z0.VnH());
12066
12067 __ Sdot(z12.VnD(), z12.VnD(), z1.VnH(), z10.VnH(), 0);
12068
12069 __ Sdot(z13.VnD(), z13.VnD(), z1.VnH(), z10.VnH(), 1);
12070 __ Mul(z13.VnD(), z13.VnD(), multiplier);
12071
12072 END();
12073
12074 if (CAN_RUN()) {
12075 RUN();
12076
12077     // Only the first 128-bit segment of the destination register is compared
12078     // directly; the other generated results cross-check the remaining segments.
12079 // d_lane[0] = (1 * 2) + (2 * 4) + (3 * 6) + (4 * 8) = 60
12080 // d_lane[1] = (5 * 2) + (6 * 4) + (7 * 6) + (8 * 8) = 140
12081 uint64_t udot_expected[] = {416, 304, 140, 60};
12082 ASSERT_EQUAL_SVE(udot_expected, z3.VnD());
12083 ASSERT_EQUAL_SVE(z3.VnD(), z4.VnD());
12084
12085 int64_t sdot_expected[] = {-416, -304, -140, -60};
12086 ASSERT_EQUAL_SVE(sdot_expected, z12.VnD());
12087 ASSERT_EQUAL_SVE(z12.VnD(), z13.VnD());
12088 }
12089 }
12090
12091 template <typename T, size_t N>
12092 static void FPToRawbitsWithSize(const T (&inputs)[N],
12093 uint64_t* outputs,
12094 unsigned size_in_bits) {
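  // Convert each floating-point input to its raw encoding at the requested
  // lane size, ready to be inserted into a Z register with InsrHelper.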
12095 for (size_t i = 0; i < N; i++) {
12096 outputs[i] = vixl::FPToRawbitsWithSize(size_in_bits, inputs[i]);
12097 }
12098 }
12099
12100 template <typename Ti, typename Te, size_t N>
12101 static void FPBinArithHelper(Test* config,
12102 ArithFn macro,
12103 int lane_size_in_bits,
12104 const Ti (&zn_inputs)[N],
12105 const Ti (&zm_inputs)[N],
12106 const Te (&zd_expected)[N]) {
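  // Convert the inputs to raw encodings of the requested lane size, insert
  // them into Z registers, apply the unpredicated macro and compare the result
  // with the expected raw bits.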
12107 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12108
12109 START();
12110
12111 ZRegister zd = z29.WithLaneSize(lane_size_in_bits);
12112 ZRegister zn = z30.WithLaneSize(lane_size_in_bits);
12113 ZRegister zm = z31.WithLaneSize(lane_size_in_bits);
12114
12115 uint64_t zn_rawbits[N];
12116 uint64_t zm_rawbits[N];
12117
12118 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
12119 FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
12120
12121 InsrHelper(&masm, zn, zn_rawbits);
12122 InsrHelper(&masm, zm, zm_rawbits);
12123
12124 (masm.*macro)(zd, zn, zm);
12125
12126 END();
12127
12128 if (CAN_RUN()) {
12129 RUN();
12130
12131 ASSERT_EQUAL_SVE(zd_expected, zd);
12132 }
12133 }
12134
12135 TEST_SVE(sve_fp_arithmetic_unpredicated_fadd) {
12136 double zn_inputs[] = {24.0,
12137 5.5,
12138 0.0,
12139 3.875,
12140 2.125,
12141 kFP64PositiveInfinity,
12142 kFP64NegativeInfinity};
12143
12144 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12145
12146 ArithFn fn = &MacroAssembler::Fadd;
12147
12148 uint16_t expected_h[] = {Float16ToRawbits(Float16(1048.0)),
12149 Float16ToRawbits(Float16(2053.5)),
12150 Float16ToRawbits(Float16(0.1)),
12151 Float16ToRawbits(Float16(-0.875)),
12152 Float16ToRawbits(Float16(14.465)),
12153 Float16ToRawbits(kFP16PositiveInfinity),
12154 Float16ToRawbits(kFP16NegativeInfinity)};
12155
12156 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
12157
12158 uint32_t expected_s[] = {FloatToRawbits(1048.0f),
12159 FloatToRawbits(2053.5f),
12160 FloatToRawbits(0.1f),
12161 FloatToRawbits(-0.875f),
12162 FloatToRawbits(14.465f),
12163 FloatToRawbits(kFP32PositiveInfinity),
12164 FloatToRawbits(kFP32NegativeInfinity)};
12165
12166 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
12167
12168 uint64_t expected_d[] = {DoubleToRawbits(1048.0),
12169 DoubleToRawbits(2053.5),
12170 DoubleToRawbits(0.1),
12171 DoubleToRawbits(-0.875),
12172 DoubleToRawbits(14.465),
12173 DoubleToRawbits(kFP64PositiveInfinity),
12174 DoubleToRawbits(kFP64NegativeInfinity)};
12175
12176 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
12177 }
12178
12179 TEST_SVE(sve_fp_arithmetic_unpredicated_fsub) {
12180 double zn_inputs[] = {24.0,
12181 5.5,
12182 0.0,
12183 3.875,
12184 2.125,
12185 kFP64PositiveInfinity,
12186 kFP64NegativeInfinity};
12187
12188 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12189
12190 ArithFn fn = &MacroAssembler::Fsub;
12191
12192 uint16_t expected_h[] = {Float16ToRawbits(Float16(-1000.0)),
12193 Float16ToRawbits(Float16(-2042.5)),
12194 Float16ToRawbits(Float16(-0.1)),
12195 Float16ToRawbits(Float16(8.625)),
12196 Float16ToRawbits(Float16(-10.215)),
12197 Float16ToRawbits(kFP16PositiveInfinity),
12198 Float16ToRawbits(kFP16NegativeInfinity)};
12199
12200 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
12201
12202 uint32_t expected_s[] = {FloatToRawbits(-1000.0),
12203 FloatToRawbits(-2042.5),
12204 FloatToRawbits(-0.1),
12205 FloatToRawbits(8.625),
12206 FloatToRawbits(-10.215),
12207 FloatToRawbits(kFP32PositiveInfinity),
12208 FloatToRawbits(kFP32NegativeInfinity)};
12209
12210 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
12211
12212 uint64_t expected_d[] = {DoubleToRawbits(-1000.0),
12213 DoubleToRawbits(-2042.5),
12214 DoubleToRawbits(-0.1),
12215 DoubleToRawbits(8.625),
12216 DoubleToRawbits(-10.215),
12217 DoubleToRawbits(kFP64PositiveInfinity),
12218 DoubleToRawbits(kFP64NegativeInfinity)};
12219
12220 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
12221 }
12222
12223 TEST_SVE(sve_fp_arithmetic_unpredicated_fmul) {
12224 double zn_inputs[] = {24.0,
12225 5.5,
12226 0.0,
12227 3.875,
12228 2.125,
12229 kFP64PositiveInfinity,
12230 kFP64NegativeInfinity};
12231
12232 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12233
12234 ArithFn fn = &MacroAssembler::Fmul;
12235
12236 uint16_t expected_h[] = {Float16ToRawbits(Float16(24576.0)),
12237 Float16ToRawbits(Float16(11264.0)),
12238 Float16ToRawbits(Float16(0.0)),
12239 Float16ToRawbits(Float16(-18.4)),
12240 Float16ToRawbits(Float16(26.23)),
12241 Float16ToRawbits(kFP16PositiveInfinity),
12242 Float16ToRawbits(kFP16PositiveInfinity)};
12243
12244 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
12245
12246 uint32_t expected_s[] = {FloatToRawbits(24576.0),
12247 FloatToRawbits(11264.0),
12248 FloatToRawbits(0.0),
12249 FloatToRawbits(-18.40625),
12250 FloatToRawbits(26.2225),
12251 FloatToRawbits(kFP32PositiveInfinity),
12252 FloatToRawbits(kFP32PositiveInfinity)};
12253
12254 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
12255
12256 uint64_t expected_d[] = {DoubleToRawbits(24576.0),
12257 DoubleToRawbits(11264.0),
12258 DoubleToRawbits(0.0),
12259 DoubleToRawbits(-18.40625),
12260 DoubleToRawbits(26.2225),
12261 DoubleToRawbits(kFP64PositiveInfinity),
12262 DoubleToRawbits(kFP64PositiveInfinity)};
12263
12264 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
12265 }
12266
12267 typedef void (MacroAssembler::*FPArithPredicatedFn)(
12268 const ZRegister& zd,
12269 const PRegisterM& pg,
12270 const ZRegister& zn,
12271 const ZRegister& zm,
12272 FPMacroNaNPropagationOption nan_option);
12273
12274 typedef void (MacroAssembler::*FPArithPredicatedNoNaNOptFn)(
12275 const ZRegister& zd,
12276 const PRegisterM& pg,
12277 const ZRegister& zn,
12278 const ZRegister& zm);
12279
12280 template <typename Ti, typename Te, size_t N>
12281 static void FPBinArithHelper(
12282 Test* config,
12283 FPArithPredicatedFn macro,
12284 FPArithPredicatedNoNaNOptFn macro_nonan,
12285 unsigned lane_size_in_bits,
12286 const Ti (&zd_inputs)[N],
12287 const int (&pg_inputs)[N],
12288 const Ti (&zn_inputs)[N],
12289 const Ti (&zm_inputs)[N],
12290 const Te (&zd_expected)[N],
12291 FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
12292 VIXL_ASSERT((macro == NULL) ^ (macro_nonan == NULL));
12293 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12294 START();
12295
12296 // Avoid choosing default scratch registers.
12297 ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
12298 ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
12299 ZRegister zm = z28.WithLaneSize(lane_size_in_bits);
12300
12301 uint64_t zn_inputs_rawbits[N];
12302 uint64_t zm_inputs_rawbits[N];
12303 uint64_t zd_inputs_rawbits[N];
12304
12305 FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
12306 FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
12307 FPToRawbitsWithSize(zd_inputs, zd_inputs_rawbits, lane_size_in_bits);
12308
12309 InsrHelper(&masm, zn, zn_inputs_rawbits);
12310 InsrHelper(&masm, zm, zm_inputs_rawbits);
12311 InsrHelper(&masm, zd, zd_inputs_rawbits);
12312
12313 PRegisterWithLaneSize pg = p0.WithLaneSize(lane_size_in_bits);
12314 Initialise(&masm, pg, pg_inputs);
12315
12316 // `instr` zdn, pg, zdn, zm
12317 ZRegister dn_result = z0.WithLaneSize(lane_size_in_bits);
12318 __ Mov(dn_result, zn);
12319 if (macro_nonan == NULL) {
12320 (masm.*macro)(dn_result, pg.Merging(), dn_result, zm, nan_option);
12321 } else {
12322 (masm.*macro_nonan)(dn_result, pg.Merging(), dn_result, zm);
12323 }
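  // Illustrative sketch (not generated by the test itself): when zd aliases
  // zn, the destructive SVE encoding can be used directly, e.g. for Fadd:
  //   fadd dn_result.<T>, pg/m, dn_result.<T>, zm.<T>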
12324
12325 // When zd aliases zm, the instruction macro (`Instr`) swaps the operand
12326 // order if the operation is commutative; otherwise it falls back to the
12327 // reversed form of the instruction, such as fdivr.
12328 // `instr` zdm, pg, zn, zdm
12329 ZRegister dm_result = z1.WithLaneSize(lane_size_in_bits);
12330 __ Mov(dm_result, zm);
12331 if (macro_nonan == NULL) {
12332 (masm.*macro)(dm_result, pg.Merging(), zn, dm_result, nan_option);
12333 } else {
12334 (masm.*macro_nonan)(dm_result, pg.Merging(), zn, dm_result);
12335 }
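  // Illustrative sketch (not generated by the test itself): for a
  // non-commutative operation such as Fdiv, the aliased call above cannot be
  // encoded as-is, so the macro is expected to fall back to the reversed
  // form, e.g.:
  //   fdivr dm_result.<T>, pg/m, dm_result.<T>, zn.<T>
  // For a commutative operation such as Fadd, swapping zn and zm is enough.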
12336
12337 // The instruction macro (`Instr`) automatically selects between `instr`
12338 // and movprfx + `instr`, depending on whether zd and zn are aliased.
12339 // A generated movprfx instruction is predicated, using the same governing
12340 // predicate register, so initialise the destination register first to keep
12341 // the inactive lanes predictable.
12342 // `instr` zd, pg, zn, zm
12343 ZRegister d_result = z2.WithLaneSize(lane_size_in_bits);
12344 __ Mov(d_result, zd);
12345 if (macro_nonan == NULL) {
12346 (masm.*macro)(d_result, pg.Merging(), zn, zm, nan_option);
12347 } else {
12348 (masm.*macro_nonan)(d_result, pg.Merging(), zn, zm);
12349 }
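  // Illustrative sketch (not generated by the test itself): with three
  // distinct registers, the macro is expected to expand to a predicated
  // movprfx followed by the destructive instruction, e.g. for Fadd:
  //   movprfx d_result.<T>, pg/m, zn.<T>
  //   fadd    d_result.<T>, pg/m, d_result.<T>, zm.<T>
  // Initialising d_result from zd above keeps the inactive lanes equal to
  // zd_inputs, which is what ASSERT_EQUAL_SVE checks once the test has run.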
12350
12351 END();
12352
12353 if (CAN_RUN()) {
12354 RUN();
12355
12356 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
12357 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
12358 if (!core.HasSVELane(dn_result, lane)) break;
12359 if ((pg_inputs[i] & 1) != 0) {
12360 ASSERT_EQUAL_SVE_LANE(zd_expected[i], dn_result, lane);
12361 } else {
12362 ASSERT_EQUAL_SVE_LANE(zn_inputs_rawbits[i], dn_result, lane);
12363 }
12364 }
12365
12366 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
12367 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
12368 if (!core.HasSVELane(dm_result, lane)) break;
12369 if ((pg_inputs[i] & 1) != 0) {
12370 ASSERT_EQUAL_SVE_LANE(zd_expected[i], dm_result, lane);
12371 } else {
12372 ASSERT_EQUAL_SVE_LANE(zm_inputs_rawbits[i], dm_result, lane);
12373 }
12374 }
12375
12376 ASSERT_EQUAL_SVE(zd_expected, d_result);
12377 }
12378 }
12379
12380 TEST_SVE(sve_binary_arithmetic_predicated_fdiv) {
12381 // The same inputs are shared across the different precision tests.
12382 double zd_in[] = {0.1, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9};
12383
12384 double zn_in[] = {24.0,
12385 24.0,
12386 -2.0,
12387 -2.0,
12388 5.5,
12389 5.5,
12390 kFP64PositiveInfinity,
12391 kFP64PositiveInfinity,
12392 kFP64NegativeInfinity,
12393 kFP64NegativeInfinity};
12394
12395 double zm_in[] = {-2.0, -2.0, 24.0, 24.0, 0.5, 0.5, 0.65, 0.65, 24.0, 24.0};
12396
12397 int pg_in[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
12398
12399 uint16_t exp_h[] = {Float16ToRawbits(Float16(0.1)),
12400 Float16ToRawbits(Float16(-12.0)),
12401 Float16ToRawbits(Float16(2.2)),
12402 Float16ToRawbits(Float16(-0.0833)),
12403 Float16ToRawbits(Float16(4.4)),
12404 Float16ToRawbits(Float16(11.0)),
12405 Float16ToRawbits(Float16(6.6)),
12406 Float16ToRawbits(kFP16PositiveInfinity),
12407 Float16ToRawbits(Float16(8.8)),
12408 Float16ToRawbits(kFP16NegativeInfinity)};
12409
12410 FPBinArithHelper(config,
12411 NULL,
12412 &MacroAssembler::Fdiv,
12413 kHRegSize,
12414 zd_in,
12415 pg_in,
12416 zn_in,
12417 zm_in,
12418 exp_h);
12419
12420 uint32_t exp_s[] = {FloatToRawbits(0.1),
12421 FloatToRawbits(-12.0),
12422 FloatToRawbits(2.2),
12423 0xbdaaaaab,
12424 FloatToRawbits(4.4),
12425 FloatToRawbits(11.0),
12426 FloatToRawbits(6.6),
12427 FloatToRawbits(kFP32PositiveInfinity),
12428 FloatToRawbits(8.8),
12429 FloatToRawbits(kFP32NegativeInfinity)};
12430
12431 FPBinArithHelper(config,
12432 NULL,
12433 &MacroAssembler::Fdiv,
12434 kSRegSize,
12435 zd_in,
12436 pg_in,
12437 zn_in,
12438 zm_in,
12439 exp_s);
12440
12441 uint64_t exp_d[] = {DoubleToRawbits(0.1),
12442 DoubleToRawbits(-12.0),
12443 DoubleToRawbits(2.2),
12444 0xbfb5555555555555,
12445 DoubleToRawbits(4.4),
12446 DoubleToRawbits(11.0),
12447 DoubleToRawbits(6.6),
12448 DoubleToRawbits(kFP64PositiveInfinity),
12449 DoubleToRawbits(8.8),
12450 DoubleToRawbits(kFP64NegativeInfinity)};
12451
12452 FPBinArithHelper(config,
12453 NULL,
12454 &MacroAssembler::Fdiv,
12455 kDRegSize,
12456 zd_in,
12457 pg_in,
12458 zn_in,
12459 zm_in,
12460 exp_d);
12461 }
12462
12463 TEST_SVE(sve_select) {
12464 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12465 START();
12466
12467 uint64_t in0[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
12468 uint64_t in1[] = {0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa};
12469
12470 // For simplicity, we re-use the same pg for various lane sizes. Only the
12471 // predicate bit of each lane's lowest-numbered byte is significant, so the
12472 // B-lane pattern below collapses to: D lanes: 1, 1, 0; S lanes: 1, 1, 1, 0, 0;
12473 // H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
12474 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
12475 Initialise(&masm, p0.VnB(), pg_in);
12476 PRegisterM pg = p0.Merging();
12477
12478 InsrHelper(&masm, z30.VnD(), in0);
12479 InsrHelper(&masm, z31.VnD(), in1);
12480
12481 __ Sel(z0.VnB(), pg, z30.VnB(), z31.VnB());
12482 __ Sel(z1.VnH(), pg, z30.VnH(), z31.VnH());
12483 __ Sel(z2.VnS(), pg, z30.VnS(), z31.VnS());
12484 __ Sel(z3.VnD(), pg, z30.VnD(), z31.VnD());
12485
12486 END();
12487
12488 if (CAN_RUN()) {
12489 RUN();
12490
12491 uint64_t expected_z0[] = {0xaaaaaaaa05aa07f8,
12492 0xfeaaaaf0aac3870f,
12493 0xaaaa56aa9abcdeaa};
12494 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
12495
12496 uint64_t expected_z1[] = {0xaaaaaaaaaaaa07f8,
12497 0xaaaaf8f0e1c3870f,
12498 0xaaaaaaaa9abcaaaa};
12499 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
12500
12501 uint64_t expected_z2[] = {0xaaaaaaaa05f607f8,
12502 0xfefcf8f0e1c3870f,
12503 0xaaaaaaaaaaaaaaaa};
12504 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
12505
12506 uint64_t expected_z3[] = {0x01f203f405f607f8,
12507 0xfefcf8f0e1c3870f,
12508 0xaaaaaaaaaaaaaaaa};
12509 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
12510 }
12511 }
12512
12513 TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_h) {
12514 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12515 double zn_inputs[] = {-2.1,
12516 8.5,
12517 225.5,
12518 0.0,
12519 8.8,
12520 -4.75,
12521 kFP64PositiveInfinity,
12522 kFP64NegativeInfinity};
12523 double zm_inputs[] = {-2.0,
12524 -13.0,
12525 24.0,
12526 0.01,
12527 0.5,
12528 300.75,
12529 kFP64NegativeInfinity,
12530 kFP64PositiveInfinity};
12531 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12532
12533 uint16_t zd_expected_max[] = {Float16ToRawbits(Float16(-2.0)),
12534 Float16ToRawbits(Float16(8.5)),
12535 Float16ToRawbits(Float16(3.3)),
12536 Float16ToRawbits(Float16(0.01)),
12537 Float16ToRawbits(Float16(5.5)),
12538 Float16ToRawbits(Float16(300.75)),
12539 Float16ToRawbits(kFP16PositiveInfinity),
12540 Float16ToRawbits(kFP16PositiveInfinity)};
12541 FPBinArithHelper(config,
12542 &MacroAssembler::Fmax,
12543 NULL,
12544 kHRegSize,
12545 zd_inputs,
12546 pg_inputs,
12547 zn_inputs,
12548 zm_inputs,
12549 zd_expected_max);
12550
12551 uint16_t zd_expected_min[] = {Float16ToRawbits(Float16(-2.1)),
12552 Float16ToRawbits(Float16(-13.0)),
12553 Float16ToRawbits(Float16(3.3)),
12554 Float16ToRawbits(Float16(0.0)),
12555 Float16ToRawbits(Float16(5.5)),
12556 Float16ToRawbits(Float16(-4.75)),
12557 Float16ToRawbits(kFP16NegativeInfinity),
12558 Float16ToRawbits(kFP16NegativeInfinity)};
12559 FPBinArithHelper(config,
12560 &MacroAssembler::Fmin,
12561 NULL,
12562 kHRegSize,
12563 zd_inputs,
12564 pg_inputs,
12565 zn_inputs,
12566 zm_inputs,
12567 zd_expected_min);
12568 }
12569
12570 TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_s) {
12571 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12572 double zn_inputs[] = {-2.1,
12573 8.5,
12574 225.5,
12575 0.0,
12576 8.8,
12577 -4.75,
12578 kFP64PositiveInfinity,
12579 kFP64NegativeInfinity};
12580 double zm_inputs[] = {-2.0,
12581 -13.0,
12582 24.0,
12583 0.01,
12584 0.5,
12585 300.75,
12586 kFP64NegativeInfinity,
12587 kFP64PositiveInfinity};
12588 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12589
12590 uint32_t zd_expected_max[] = {FloatToRawbits(-2.0),
12591 FloatToRawbits(8.5),
12592 FloatToRawbits(3.3),
12593 FloatToRawbits(0.01),
12594 FloatToRawbits(5.5),
12595 FloatToRawbits(300.75),
12596 FloatToRawbits(kFP32PositiveInfinity),
12597 FloatToRawbits(kFP32PositiveInfinity)};
12598 FPBinArithHelper(config,
12599 &MacroAssembler::Fmax,
12600 NULL,
12601 kSRegSize,
12602 zd_inputs,
12603 pg_inputs,
12604 zn_inputs,
12605 zm_inputs,
12606 zd_expected_max);
12607
12608 uint32_t zd_expected_min[] = {FloatToRawbits(-2.1),
12609 FloatToRawbits(-13.0),
12610 FloatToRawbits(3.3),
12611 FloatToRawbits(0.0),
12612 FloatToRawbits(5.5),
12613 FloatToRawbits(-4.75),
12614 FloatToRawbits(kFP32NegativeInfinity),
12615 FloatToRawbits(kFP32NegativeInfinity)};
12616 FPBinArithHelper(config,
12617 &MacroAssembler::Fmin,
12618 NULL,
12619 kSRegSize,
12620 zd_inputs,
12621 pg_inputs,
12622 zn_inputs,
12623 zm_inputs,
12624 zd_expected_min);
12625 }
12626
12627 TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_d) {
12628 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12629 double zn_inputs[] = {-2.1,
12630 8.5,
12631 225.5,
12632 0.0,
12633 8.8,
12634 -4.75,
12635 kFP64PositiveInfinity,
12636 kFP64NegativeInfinity};
12637 double zm_inputs[] = {-2.0,
12638 -13.0,
12639 24.0,
12640 0.01,
12641 0.5,
12642 300.75,
12643 kFP64NegativeInfinity,
12644 kFP64PositiveInfinity};
12645 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12646
12647 uint64_t zd_expected_max[] = {DoubleToRawbits(-2.0),
12648 DoubleToRawbits(8.5),
12649 DoubleToRawbits(3.3),
12650 DoubleToRawbits(0.01),
12651 DoubleToRawbits(5.5),
12652 DoubleToRawbits(300.75),
12653 DoubleToRawbits(kFP64PositiveInfinity),
12654 DoubleToRawbits(kFP64PositiveInfinity)};
12655 FPBinArithHelper(config,
12656 &MacroAssembler::Fmax,
12657 NULL,
12658 kDRegSize,
12659 zd_inputs,
12660 pg_inputs,
12661 zn_inputs,
12662 zm_inputs,
12663 zd_expected_max);
12664
12665 uint64_t zd_expected_min[] = {DoubleToRawbits(-2.1),
12666 DoubleToRawbits(-13.0),
12667 DoubleToRawbits(3.3),
12668 DoubleToRawbits(0.0),
12669 DoubleToRawbits(5.5),
12670 DoubleToRawbits(-4.75),
12671 DoubleToRawbits(kFP64NegativeInfinity),
12672 DoubleToRawbits(kFP64NegativeInfinity)};
12673 FPBinArithHelper(config,
12674 &MacroAssembler::Fmin,
12675 NULL,
12676 kDRegSize,
12677 zd_inputs,
12678 pg_inputs,
12679 zn_inputs,
12680 zm_inputs,
12681 zd_expected_min);
12682 }
12683
12684 template <typename T, size_t N>
12685 static void BitwiseShiftImmHelper(Test* config,
12686 int lane_size_in_bits,
12687 const T (&zn_inputs)[N],
12688 int shift) {
12689 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12690 START();
12691
12692 ZRegister zd_asr = z25.WithLaneSize(lane_size_in_bits);
12693 ZRegister zd_lsr = z26.WithLaneSize(lane_size_in_bits);
12694 ZRegister zd_lsl = z27.WithLaneSize(lane_size_in_bits);
12695 ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
12696
12697 InsrHelper(&masm, zn, zn_inputs);
12698
12699 __ Asr(zd_asr, zn, shift);
12700 __ Lsr(zd_lsr, zn, shift);
12701 __ Lsl(zd_lsl, zn, shift - 1); // Lsl's immediate range is 0 to lane_size-1 (Asr/Lsr use 1 to lane_size).
12702
12703 END();
12704
12705 if (CAN_RUN()) {
12706 RUN();
12707
12708 const uint64_t mask = GetUintMask(lane_size_in_bits);
12709 for (int i = 0; i < static_cast<int>(N); i++) {
12710 int lane = N - i - 1;
12711 if (!core.HasSVELane(zd_asr, lane)) break;
12712 bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
12713 uint64_t result;
12714 if (shift >= lane_size_in_bits) {
12715 result = is_negative ? mask : 0;
12716 } else {
12717 result = zn_inputs[i] >> shift;
12718 if (is_negative) {
12719 result |= mask << (lane_size_in_bits - shift);
12720 result &= mask;
12721 }
12722 }
12723 ASSERT_EQUAL_SVE_LANE(result, zd_asr, lane);
12724 }
12725
12726 for (int i = 0; i < static_cast<int>(N); i++) {
12727 int lane = N - i - 1;
12728 if (!core.HasSVELane(zd_lsr, lane)) break;
12729 uint64_t result =
12730 (shift >= lane_size_in_bits) ? 0 : zn_inputs[i] >> shift;
12731 ASSERT_EQUAL_SVE_LANE(result, zd_lsr, lane);
12732 }
12733
12734 for (int i = 0; i < static_cast<int>(N); i++) {
12735 int lane = N - i - 1;
12736 if (!core.HasSVELane(zd_lsl, lane)) break;
12737 uint64_t result =
12738 (shift > lane_size_in_bits) ? 0 : zn_inputs[i] << (shift - 1);
12739 ASSERT_EQUAL_SVE_LANE(result & mask, zd_lsl, lane);
12740 }
12741 }
12742 }
12743
12744 TEST_SVE(sve_bitwise_shift_imm_unpredicated) {
12745 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12746 int shift_b[] = {1, 3, 5, 8};
12747 for (size_t i = 0; i < ArrayLength(shift_b); i++) {
12748 BitwiseShiftImmHelper(config, kBRegSize, inputs_b, shift_b[i]);
12749 }
12750
12751 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233};
12752 int shift_h[] = {1, 8, 11, 16};
12753 for (size_t i = 0; i < ArrayLength(shift_h); i++) {
12754 BitwiseShiftImmHelper(config, kHRegSize, inputs_h, shift_h[i]);
12755 }
12756
12757 uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233};
12758 int shift_s[] = {1, 9, 17, 32};
12759 for (size_t i = 0; i < ArrayLength(shift_s); i++) {
12760 BitwiseShiftImmHelper(config, kSRegSize, inputs_s, shift_s[i]);
12761 }
12762
12763 uint64_t inputs_d[] = {0xfedcba98fedcba98,
12764 0xfffa5555aaaaaaaa,
12765 0x0011223344aafe80};
12766 int shift_d[] = {1, 23, 45, 64};
12767 for (size_t i = 0; i < ArrayLength(shift_d); i++) {
12768 BitwiseShiftImmHelper(config, kDRegSize, inputs_d, shift_d[i]);
12769 }
12770 }
12771
12772 template <typename T, typename R, size_t N>
12773 static void BitwiseShiftWideElementsHelper(Test* config,
12774 Shift shift_type,
12775 int lane_size_in_bits,
12776 const T (&zn_inputs)[N],
12777 const R& zm_inputs,
12778 const T (&zd_expected)[N]) {
12779 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12780 START();
12781
12782 ArithFn macro;
12783 // A logical shift (left or right) by the full lane width yields 0, so
12784 // initialise the array to 0; only the ASR case needs explicit values.
12785 uint64_t zd_expected_max_shift_amount[N] = {0};
12786 switch (shift_type) {
12787 case ASR: {
12788 macro = &MacroAssembler::Asr;
12789 uint64_t mask = GetUintMask(lane_size_in_bits);
12790 for (size_t i = 0; i < ArrayLength(zn_inputs); i++) {
12791 bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
12792 zd_expected_max_shift_amount[i] = is_negative ? mask : 0;
12793 }
12794 break;
12795 }
12796 case LSR:
12797 macro = &MacroAssembler::Lsr;
12798 break;
12799 case LSL:
12800 macro = &MacroAssembler::Lsl;
12801 break;
12802 default:
12803 VIXL_UNIMPLEMENTED();
12804 macro = NULL;
12805 break;
12806 }
12807
12808 ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
12809 ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
12810 ZRegister zm = z28.WithLaneSize(kDRegSize);
12811
12812 InsrHelper(&masm, zn, zn_inputs);
12813 InsrHelper(&masm, zm, zm_inputs);
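  // In the wide-element shift forms, each 64-bit element of zm supplies the
  // shift amount for every lane_size_in_bits-wide lane within that 64-bit
  // block, so zm_inputs only needs (N * lane_size_in_bits) / 64 entries.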
12814
12815 (masm.*macro)(zd, zn, zm);
12816
12817 ZRegister zm_max_shift_amount = z25.WithLaneSize(kDRegSize);
12818 ZRegister zd_max_shift_amount = z24.WithLaneSize(lane_size_in_bits);
12819
12820 __ Dup(zm_max_shift_amount, lane_size_in_bits);
12821 (masm.*macro)(zd_max_shift_amount, zn, zm_max_shift_amount);
12822
12823 ZRegister zm_out_of_range = z23.WithLaneSize(kDRegSize);
12824 ZRegister zd_out_of_range = z22.WithLaneSize(lane_size_in_bits);
12825
12826 __ Dup(zm_out_of_range, GetUintMask(lane_size_in_bits));
12827 (masm.*macro)(zd_out_of_range, zn, zm_out_of_range);
12828
12829 END();
12830
12831 if (CAN_RUN()) {
12832 RUN();
12833
12834 ASSERT_EQUAL_SVE(zd_expected, zd);
12835 ASSERT_EQUAL_SVE(zd_expected_max_shift_amount, zd_max_shift_amount);
12836 ASSERT_EQUAL_SVE(zd_max_shift_amount, zd_out_of_range);
12837 }
12838 }
12839
12840 TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_asr) {
12841 // clang-format off
12842 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12843 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12844 int shift_b[] = {1, 3};
12845 uint64_t expected_b[] = {0xff, 0xee, 0xdd, 0xcc, 0xff, 0x2a, 0xd5, 0xc0,
12846 0xff, 0xfb, 0xf7, 0xf3, 0xff, 0x0a, 0xf5, 0xf0};
12847 BitwiseShiftWideElementsHelper(config,
12848 ASR,
12849 kBRegSize,
12850 inputs_b,
12851 shift_b,
12852 expected_b);
12853
12854 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12855 0xfedc, 0xfa55, 0x0011, 0x2233,
12856 0xfedc, 0xfa55, 0x0011, 0x2233};
12857 int shift_h[] = {1, 8, 11};
12858 uint64_t expected_h[] = {0xff6e, 0xfd2a, 0x0008, 0x1119,
12859 0xfffe, 0xfffa, 0x0000, 0x0022,
12860 0xffff, 0xffff, 0x0000, 0x0004};
12861 BitwiseShiftWideElementsHelper(config,
12862 ASR,
12863 kHRegSize,
12864 inputs_h,
12865 shift_h,
12866 expected_h);
12867
12868 uint64_t inputs_s[] =
12869 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12870 int shift_s[] = {1, 9, 23};
12871 uint64_t expected_s[] =
12872 {0xff6e5d4c, 0xfffd2ad5, 0x00000891, 0x000091a2, 0xffffff55, 0xffffff11};
12873 BitwiseShiftWideElementsHelper(config,
12874 ASR,
12875 kSRegSize,
12876 inputs_s,
12877 shift_s,
12878 expected_s);
12879 // clang-format on
12880 }
12881
12882 TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsr) {
12883 // clang-format off
12884 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12885 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12886 int shift_b[] = {1, 3};
12887 uint64_t expected_b[] = {0x7f, 0x6e, 0x5d, 0x4c, 0x7f, 0x2a, 0x55, 0x40,
12888 0x1f, 0x1b, 0x17, 0x13, 0x1f, 0x0a, 0x15, 0x10};
12889
12890 BitwiseShiftWideElementsHelper(config,
12891 LSR,
12892 kBRegSize,
12893 inputs_b,
12894 shift_b,
12895 expected_b);
12896
12897 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12898 0xfedc, 0xfa55, 0x0011, 0x2233,
12899 0xfedc, 0xfa55, 0x0011, 0x2233};
12900 int shift_h[] = {1, 8, 11};
12901 uint64_t expected_h[] = {0x7f6e, 0x7d2a, 0x0008, 0x1119,
12902 0x00fe, 0x00fa, 0x0000, 0x0022,
12903 0x001f, 0x001f, 0x0000, 0x0004};
12904 BitwiseShiftWideElementsHelper(config,
12905 LSR,
12906 kHRegSize,
12907 inputs_h,
12908 shift_h,
12909 expected_h);
12910
12911 uint64_t inputs_s[] =
12912 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12913 int shift_s[] = {1, 9, 23};
12914 uint64_t expected_s[] =
12915 {0x7f6e5d4c, 0x7ffd2ad5, 0x00000891, 0x000091a2, 0x00000155, 0x00000111};
12916 BitwiseShiftWideElementsHelper(config,
12917 LSR,
12918 kSRegSize,
12919 inputs_s,
12920 shift_s,
12921 expected_s);
12922 // clang-format on
12923 }
12924
12925 TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsl) {
12926 // clang-format off
12927 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12928 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12929 int shift_b[] = {1, 5};
12930
12931 uint64_t expected_b[] = {0xfc, 0xb8, 0x74, 0x30, 0xfe, 0xaa, 0x54, 0x00,
12932 0xc0, 0x80, 0x40, 0x00, 0xe0, 0xa0, 0x40, 0x00};
12933
12934 BitwiseShiftWideElementsHelper(config,
12935 LSL,
12936 kBRegSize,
12937 inputs_b,
12938 shift_b,
12939 expected_b);
12940 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12941 0xfedc, 0xfa55, 0x0011, 0x2233,
12942 0xfedc, 0xfa55, 0x0011, 0x2233};
12943 int shift_h[] = {1, 2, 14};
12944
12945 uint64_t expected_h[] = {0xfdb8, 0xf4aa, 0x0022, 0x4466,
12946 0xfb70, 0xe954, 0x0044, 0x88cc,
12947 0x0000, 0x4000, 0x4000, 0xc000};
12948 BitwiseShiftWideElementsHelper(config,
12949 LSL,
12950 kHRegSize,
12951 inputs_h,
12952 shift_h,
12953 expected_h);
12954 uint64_t inputs_s[] =
12955 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12956 int shift_s[] = {1, 19, 26};
12957 uint64_t expected_s[] =
12958 {0xfdb97530, 0xfff4ab54, 0x11980000, 0x2b380000, 0xa8000000, 0x20000000};
12959 BitwiseShiftWideElementsHelper(config,
12960 LSL,
12961 kSRegSize,
12962 inputs_s,
12963 shift_s,
12964 expected_s);
12965
12966 // Test large shifts outside the range of the "unsigned" type.
12967 uint64_t inputs_b2[] = {1, 2, 4, 8, 3, 5, 7, 9,
12968 1, 2, 4, 8, 3, 5, 7, 9};
12969 uint64_t shift_b2[] = {1, 0x1000000001};
12970 uint64_t expected_b2[] = {2, 4, 8, 16, 6, 10, 14, 18,
12971 0, 0, 0, 0, 0, 0, 0, 0};
12972 BitwiseShiftWideElementsHelper(config, LSL, kBRegSize, inputs_b2, shift_b2,
12973 expected_b2);
12974
12975 // clang-format on
12976 }
12977
12978 TEST_SVE(sve_shift_by_vector) {
12979 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12980
12981 START();
12982 __ Ptrue(p0.VnB());
12983 __ Pfalse(p1.VnB());
12984 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
12985 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
12986 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
12987 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
12988
12989 __ Dup(z31.VnD(), 0x8000000080008080);
12990 __ Dup(z0.VnB(), -1);
12991
12992 __ Index(z1.VnB(), 0, 1);
12993 __ Dup(z2.VnB(), 0x55);
12994 __ Lsr(z2.VnB(), p2.Merging(), z0.VnB(), z1.VnB());
12995 __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnB());
12996 __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnB());
12997
12998 __ Index(z1.VnH(), 0, 1);
12999 __ Dup(z6.VnB(), 0x55);
13000 __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnH());
13001 __ Lsl(z6.VnH(), p3.Merging(), z0.VnH(), z1.VnH());
13002 __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnH());
13003
13004 __ Index(z1.VnS(), 0, 1);
13005 __ Dup(z10.VnB(), 0x55);
13006 __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
13007 __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
13008 __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnS());
13009
13010 __ Index(z1.VnD(), 0, 1);
13011 __ Lsr(z0.VnD(), p5.Merging(), z0.VnD(), z1.VnD());
13012 __ Lsl(z12.VnD(), p0.Merging(), z0.VnD(), z1.VnD());
13013 __ Asr(z13.VnD(), p0.Merging(), z31.VnD(), z1.VnD());
13014
13015 __ Dup(z11.VnD(), 0x100000001);
13016 __ Lsl(z14.VnD(), p0.Merging(), z1.VnD(), z11.VnD());
13017
13018 __ Index(z0.VnH(), 7, -1);
13019 __ Lsr(z0.VnH(), p0.Merging(), z31.VnH(), z0.VnH());
13020 END();
13021
13022 if (CAN_RUN()) {
13023 RUN();
13024
13025 uint64_t expected_z0[] = {0x8000000020001010, 0x0800000002000101};
13026 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13027 uint64_t expected_z2[] = {0x5500550055005500, 0x5503550f553f55ff};
13028 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13029 uint64_t expected_z3[] = {0x0000000000000000, 0x80c0e0f0f8fcfeff};
13030 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13031 uint64_t expected_z4[] = {0xff000000ff00ffff, 0xff000000f000c080};
13032 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13033 uint64_t expected_z5[] = {0x01ff03ff07ff0fff, 0x1fff3fff7fffffff};
13034 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13035 uint64_t expected_z6[] = {0x5555ffc05555fff0, 0x5555fffc5555ffff};
13036 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13037 uint64_t expected_z7[] = {0xff000000fc00f808, 0xf0000000c0008080};
13038 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13039 uint64_t expected_z8[] = {0x1fffffff3fffffff, 0x7fffffffffffffff};
13040 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13041 uint64_t expected_z9[] = {0xfffffff8fffffffc, 0xfffffffeffffffff};
13042 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13043 uint64_t expected_z10[] = {0x55555555e0002020, 0x5555555580008080};
13044 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13045 uint64_t expected_z12[] = {0xfffffffffffffffe, 0xffffffffffffffff};
13046 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
13047 uint64_t expected_z13[] = {0xc000000040004040, 0x8000000080008080};
13048 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
13049 uint64_t expected_z14[] = {0, 0};
13050 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
13051 }
13052 }
13053
13054 TEST_SVE(sve_shift_by_wide_vector) {
13055 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13056
13057 START();
13058 __ Ptrue(p0.VnB());
13059 __ Pfalse(p1.VnB());
13060 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13061 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13062 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13063
13064 __ Dup(z31.VnD(), 0x8000000080008080);
13065 __ Dup(z0.VnB(), -1);
13066 __ Index(z1.VnD(), 1, 5);
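  // z1 holds the shift amounts as D-lane elements (1, 6, 11, ...); with the
  // wide-element forms used below, each 64-bit element of z1 shifts all of
  // the narrower lanes that fall within that 64-bit block.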
13067
13068 __ Dup(z2.VnB(), 0x55);
13069 __ Lsr(z2.VnB(), p2.Merging(), z2.VnB(), z1.VnD());
13070 __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnD());
13071 __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnD());
13072
13073 __ Dup(z6.VnB(), 0x55);
13074 __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnD());
13075 __ Lsl(z6.VnH(), p3.Merging(), z6.VnH(), z1.VnD());
13076 __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnD());
13077
13078 __ Dup(z10.VnB(), 0x55);
13079 __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
13080 __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
13081 __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnD());
13082 END();
13083
13084 if (CAN_RUN()) {
13085 RUN();
13086
13087 uint64_t expected_z2[] = {0x5501550155015501, 0x552a552a552a552a};
13088 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13089 uint64_t expected_z3[] = {0xc0c0c0c0c0c0c0c0, 0xfefefefefefefefe};
13090 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13091 uint64_t expected_z4[] = {0xfe000000fe00fefe, 0xc0000000c000c0c0};
13092 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13093 uint64_t expected_z5[] = {0x03ff03ff03ff03ff, 0x7fff7fff7fff7fff};
13094 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13095 uint64_t expected_z6[] = {0x5555554055555540, 0x5555aaaa5555aaaa};
13096 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13097 uint64_t expected_z7[] = {0xfe000000fe00fe02, 0xc0000000c000c040};
13098 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13099 uint64_t expected_z8[] = {0x03ffffff03ffffff, 0x7fffffff7fffffff};
13100 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13101 uint64_t expected_z9[] = {0xffffffc0ffffffc0, 0xfffffffefffffffe};
13102 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13103 uint64_t expected_z10[] = {0x55555555fe000202, 0x55555555c0004040};
13104 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13105 }
13106 }
13107
13108 TEST_SVE(sve_pred_shift_imm) {
13109 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13110
13111 START();
13112 __ Ptrue(p0.VnB());
13113 __ Pfalse(p1.VnB());
13114 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13115 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13116 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13117 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
13118
13119 __ Dup(z31.VnD(), 0x8000000080008080);
13120 __ Lsr(z0.VnB(), p0.Merging(), z31.VnB(), 1);
13121 __ Mov(z1, z0);
13122 __ Lsl(z1.VnB(), p2.Merging(), z1.VnB(), 1);
13123 __ Asr(z2.VnB(), p0.Merging(), z1.VnB(), 2);
13124
13125 __ Lsr(z3.VnH(), p0.Merging(), z31.VnH(), 2);
13126 __ Mov(z4, z3);
13127 __ Lsl(z4.VnH(), p3.Merging(), z4.VnH(), 2);
13128 __ Asr(z5.VnH(), p0.Merging(), z4.VnH(), 3);
13129
13130 __ Lsr(z6.VnS(), p0.Merging(), z31.VnS(), 3);
13131 __ Mov(z7, z6);
13132 __ Lsl(z7.VnS(), p4.Merging(), z7.VnS(), 3);
13133 __ Asr(z8.VnS(), p0.Merging(), z7.VnS(), 4);
13134
13135 __ Lsr(z9.VnD(), p0.Merging(), z31.VnD(), 4);
13136 __ Mov(z10, z9);
13137 __ Lsl(z10.VnD(), p5.Merging(), z10.VnD(), 4);
13138 __ Asr(z11.VnD(), p0.Merging(), z10.VnD(), 5);
13139 END();
13140
13141 if (CAN_RUN()) {
13142 RUN();
13143 uint64_t expected_z0[] = {0x4000000040004040, 0x4000000040004040};
13144 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13145 uint64_t expected_z1[] = {0x4000000040004080, 0x4000000040004080};
13146 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13147 uint64_t expected_z2[] = {0x10000000100010e0, 0x10000000100010e0};
13148 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13149 uint64_t expected_z3[] = {0x2000000020002020, 0x2000000020002020};
13150 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13151 uint64_t expected_z4[] = {0x2000000020008080, 0x2000000020008080};
13152 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13153 uint64_t expected_z5[] = {0x040000000400f010, 0x040000000400f010};
13154 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13155 uint64_t expected_z6[] = {0x1000000010001010, 0x1000000010001010};
13156 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13157 uint64_t expected_z7[] = {0x1000000080008080, 0x1000000080008080};
13158 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13159 uint64_t expected_z8[] = {0x01000000f8000808, 0x01000000f8000808};
13160 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13161 uint64_t expected_z9[] = {0x0800000008000808, 0x0800000008000808};
13162 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13163 uint64_t expected_z10[] = {0x0800000008000808, 0x8000000080008080};
13164 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13165 uint64_t expected_z11[] = {0x0040000000400040, 0xfc00000004000404};
13166 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13167 }
13168 }
13169
13170 TEST_SVE(sve_asrd) {
13171 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13172
13173 START();
13174 __ Ptrue(p0.VnB());
13175 __ Pfalse(p1.VnB());
13176 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13177 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13178 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13179 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
13180
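  // Asrd is an arithmetic shift right that rounds towards zero, i.e. a signed
  // division by 2^shift. Worked example for the B-lane case below: +125
  // (0x7d) >> 1 gives +62 (0x3e), and -125 (0x83) gives -62 (0xc2), whereas a
  // plain Asr of 0x83 by 1 would give -63 (0xc1).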
13181 __ Index(z31.VnB(), 0x7f - 3, 1);
13182 __ Asrd(z0.VnB(), p0.Merging(), z31.VnB(), 1);
13183 __ Mov(z1, z31);
13184 __ Asrd(z1.VnB(), p2.Merging(), z1.VnB(), 2);
13185 __ Asrd(z2.VnB(), p0.Merging(), z31.VnB(), 7);
13186 __ Asrd(z3.VnB(), p0.Merging(), z31.VnB(), 8);
13187
13188 __ Index(z31.VnH(), 0x7fff - 3, 1);
13189 __ Asrd(z4.VnH(), p0.Merging(), z31.VnH(), 1);
13190 __ Mov(z5, z31);
13191 __ Asrd(z5.VnH(), p3.Merging(), z5.VnH(), 2);
13192 __ Asrd(z6.VnH(), p0.Merging(), z31.VnH(), 15);
13193 __ Asrd(z7.VnH(), p0.Merging(), z31.VnH(), 16);
13194
13195 __ Index(z31.VnS(), 0x7fffffff - 1, 1);
13196 __ Asrd(z8.VnS(), p0.Merging(), z31.VnS(), 1);
13197 __ Mov(z9, z31);
13198 __ Asrd(z9.VnS(), p4.Merging(), z9.VnS(), 2);
13199 __ Asrd(z10.VnS(), p0.Merging(), z31.VnS(), 31);
13200 __ Asrd(z11.VnS(), p0.Merging(), z31.VnS(), 32);
13201
13202 __ Index(z31.VnD(), 0x7fffffffffffffff, 1);
13203 __ Asrd(z12.VnD(), p0.Merging(), z31.VnD(), 1);
13204 __ Mov(z13, z31);
13205 __ Asrd(z13.VnD(), p5.Merging(), z13.VnD(), 2);
13206 __ Asrd(z14.VnD(), p0.Merging(), z31.VnD(), 63);
13207 __ Asrd(z31.VnD(), p0.Merging(), z31.VnD(), 64);
13208 END();
13209
13210 if (CAN_RUN()) {
13211 RUN();
13212 uint64_t expected_z0[] = {0xc6c5c5c4c4c3c3c2, 0xc2c1c1c03f3f3e3e};
13213 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13214 uint64_t expected_z1[] = {0x8be389e287e285e1, 0x83e181e07f1f7d1f};
13215 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13216 uint64_t expected_z2[] = {0x0000000000000000, 0x000000ff00000000};
13217 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13218 uint64_t expected_z3[] = {0x0000000000000000, 0x0000000000000000};
13219 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13220 uint64_t expected_z4[] = {0xc002c001c001c000, 0x3fff3fff3ffe3ffe};
13221 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13222 uint64_t expected_z5[] = {0x8003e0018001e000, 0x7fff1fff7ffd1fff};
13223 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13224 uint64_t expected_z6[] = {0x000000000000ffff, 0x0000000000000000};
13225 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13226 uint64_t expected_z7[] = {0x0000000000000000, 0x0000000000000000};
13227 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13228 uint64_t expected_z8[] = {0xc0000001c0000000, 0x3fffffff3fffffff};
13229 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13230 uint64_t expected_z9[] = {0x80000001e0000000, 0x7fffffff1fffffff};
13231 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13232 uint64_t expected_z10[] = {0x00000000ffffffff, 0x0000000000000000};
13233 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13234 uint64_t expected_z11[] = {0x0000000000000000, 0x0000000000000000};
13235 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13236 uint64_t expected_z12[] = {0xc000000000000000, 0x3fffffffffffffff};
13237 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
13238 uint64_t expected_z13[] = {0x8000000000000000, 0x1fffffffffffffff};
13239 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
13240 uint64_t expected_z14[] = {0xffffffffffffffff, 0x0000000000000000};
13241 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
13242 uint64_t expected_z31[] = {0x0000000000000000, 0x0000000000000000};
13243 ASSERT_EQUAL_SVE(expected_z31, z31.VnD());
13244 }
13245 }
13246
13247 TEST_SVE(sve_setffr) {
13248 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13249 START();
13250
13251 __ Ptrue(p15.VnB());
13252 __ Setffr();
13253 __ Rdffr(p14.VnB());
13254
13255 END();
13256
13257 if (CAN_RUN()) {
13258 RUN();
13259
13260 ASSERT_EQUAL_SVE(p14.VnB(), p15.VnB());
13261 }
13262 }
13263
13264 static void WrffrHelper(Test* config, unsigned active_lanes) {
13265 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13266 START();
13267
13268 int inputs[kPRegMaxSize] = {0};
13269 VIXL_ASSERT(active_lanes <= kPRegMaxSize);
13270 for (unsigned i = 0; i < active_lanes; i++) {
13271 // The rightmost (highest-indexed) array element maps to the lowest-numbered
13272 // lane.
13273 inputs[kPRegMaxSize - i - 1] = 1;
13274 }
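  // For example, active_lanes == 3 leaves inputs[] zero everywhere except its
  // last three elements, which correspond to FFR lanes 0, 1 and 2.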
13275
13276 Initialise(&masm, p1.VnB(), inputs);
13277 __ Wrffr(p1.VnB());
13278 __ Rdffr(p2.VnB());
13279
13280 END();
13281
13282 if (CAN_RUN()) {
13283 RUN();
13284
13285 ASSERT_EQUAL_SVE(p1.VnB(), p2.VnB());
13286 }
13287 }
13288
13289 TEST_SVE(sve_wrffr) {
13290 int active_lanes_inputs[] = {0, 1, 7, 10, 32, 48, kPRegMaxSize};
13291 for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
13292 WrffrHelper(config, active_lanes_inputs[i]);
13293 }
13294 }
13295
13296 template <size_t N>
13297 static void RdffrHelper(Test* config,
13298 size_t active_lanes,
13299 const int (&pg_inputs)[N]) {
13300 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13301 START();
13302
13303 VIXL_ASSERT(active_lanes <= kPRegMaxSize);
13304
13305 // The rightmost (highest-indexed) array element maps to the lowest-numbered
13306 // lane.
13307 int pd[kPRegMaxSize] = {0};
13308 for (unsigned i = 0; i < active_lanes; i++) {
13309 pd[kPRegMaxSize - i - 1] = 1;
13310 }
13311
13312 int pg[kPRegMaxSize] = {0};
13313 for (unsigned i = 0; i < N; i++) {
13314 pg[kPRegMaxSize - i - 1] = pg_inputs[i];
13315 }
13316
13317 int pd_expected[kPRegMaxSize] = {0};
13318 for (unsigned i = 0; i < std::min(active_lanes, N); i++) {
13319 int lane = kPRegMaxSize - i - 1;
13320 pd_expected[lane] = pd[lane] & pg[lane];
13321 }
13322
13323 Initialise(&masm, p0.VnB(), pg);
13324 Initialise(&masm, p1.VnB(), pd);
13325
13326 // The unpredicated form of rdffr has been tested in `WrffrHelper`.
13327 __ Wrffr(p1.VnB());
13328 __ Rdffr(p14.VnB(), p0.Zeroing());
13329 __ Rdffrs(p13.VnB(), p0.Zeroing());
13330 __ Mrs(x8, NZCV);
13331
13332 END();
13333
13334 if (CAN_RUN()) {
13335 RUN();
13336
13337 ASSERT_EQUAL_SVE(pd_expected, p14.VnB());
13338 ASSERT_EQUAL_SVE(pd_expected, p13.VnB());
13339 StatusFlags nzcv_expected =
13340 GetPredTestFlags(pd_expected, pg, core.GetSVELaneCount(kBRegSize));
13341 ASSERT_EQUAL_64(nzcv_expected, x8);
13342 }
13343 }
13344
13345 TEST_SVE(sve_rdffr_rdffrs) {
13346 // clang-format off
13347 int active_lanes_inputs[] = {0, 1, 15, 26, 39, 47, kPRegMaxSize};
13348 int pg_inputs_0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13349 int pg_inputs_1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13350 int pg_inputs_2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13351 int pg_inputs_3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
13352 int pg_inputs_4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13353 // clang-format on
13354
13355 for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
13356 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_0);
13357 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_1);
13358 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_2);
13359 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_3);
13360 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_4);
13361 }
13362 }
13363
13364 typedef void (MacroAssembler::*BrkpFn)(const PRegisterWithLaneSize& pd,
13365 const PRegisterZ& pg,
13366 const PRegisterWithLaneSize& pn,
13367 const PRegisterWithLaneSize& pm);
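// Informally: Brkpa ("break after, propagating") and Brkpb ("break before,
// propagating") scan the active (per pg) elements of pm. If the last active
// element of pn is true, pd is set for active elements up to and including
// (Brkpa) or up to but not including (Brkpb) the first active true element of
// pm, and is clear everywhere else; if the last active element of pn is
// false, pd is entirely clear. The expected arrays in the tests below follow
// this pattern.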
13368
13369 template <typename Tg, typename Tn, typename Td>
13370 static void BrkpaBrkpbHelper(Test* config,
13371 BrkpFn macro,
13372 BrkpFn macro_set_flags,
13373 const Tg& pg_inputs,
13374 const Tn& pn_inputs,
13375 const Tn& pm_inputs,
13376 const Td& pd_expected) {
13377 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13378 START();
13379
13380 PRegister pg = p15;
13381 PRegister pn = p14;
13382 PRegister pm = p13;
13383 Initialise(&masm, pg.VnB(), pg_inputs);
13384 Initialise(&masm, pn.VnB(), pn_inputs);
13385 Initialise(&masm, pm.VnB(), pm_inputs);
13386
13387 // Initialise NZCV to an impossible value, to check that we actually write it.
13388 __ Mov(x10, NZCVFlag);
13389 __ Msr(NZCV, x10);
13390
13391 (masm.*macro_set_flags)(p0.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
13392 __ Mrs(x0, NZCV);
13393
13394 (masm.*macro)(p1.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
13395
13396 END();
13397
13398 if (CAN_RUN()) {
13399 RUN();
13400
13401 ASSERT_EQUAL_SVE(pd_expected, p0.VnB());
13402
13403 // Check that the flags were properly set.
13404 StatusFlags nzcv_expected =
13405 GetPredTestFlags(pd_expected,
13406 pg_inputs,
13407 core.GetSVELaneCount(kBRegSize));
13408 ASSERT_EQUAL_64(nzcv_expected, x0);
13409 ASSERT_EQUAL_SVE(p0.VnB(), p1.VnB());
13410 }
13411 }
13412
13413 template <typename Tg, typename Tn, typename Td>
13414 static void BrkpaHelper(Test* config,
13415 const Tg& pg_inputs,
13416 const Tn& pn_inputs,
13417 const Tn& pm_inputs,
13418 const Td& pd_expected) {
13419 BrkpaBrkpbHelper(config,
13420 &MacroAssembler::Brkpa,
13421 &MacroAssembler::Brkpas,
13422 pg_inputs,
13423 pn_inputs,
13424 pm_inputs,
13425 pd_expected);
13426 }
13427
13428 template <typename Tg, typename Tn, typename Td>
13429 static void BrkpbHelper(Test* config,
13430 const Tg& pg_inputs,
13431 const Tn& pn_inputs,
13432 const Tn& pm_inputs,
13433 const Td& pd_expected) {
13434 BrkpaBrkpbHelper(config,
13435 &MacroAssembler::Brkpb,
13436 &MacroAssembler::Brkpbs,
13437 pg_inputs,
13438 pn_inputs,
13439 pm_inputs,
13440 pd_expected);
13441 }
13442
13443 TEST_SVE(sve_brkpb) {
13444 // clang-format off
13445 // The last active element of `pn` is `true` in all vector length configurations.
13446 // | boundary of 128-bit VL.
13447 // v
13448 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13449 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13450 int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13451
13452 // | highest-numbered lane lowest-numbered lane |
13453 // v v
13454 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13455 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13456 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
13457
13458 int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13459 int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13460 int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13461
13462 // | first active
13463 // v
13464 int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
13465 // | first active
13466 // v
13467 int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13468 // | first active
13469 // v
13470 int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
13471
13472 BrkpbHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
13473 BrkpbHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
13474 BrkpbHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
13475
13476 // | first active
13477 // v
13478 int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13479 // | first active
13480 // v
13481 int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13482 // | first active
13483 // v
13484 int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
13485 BrkpbHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
13486 BrkpbHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
13487 BrkpbHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
13488
13489 // | first active
13490 // v
13491 int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
13492 // | first active
13493 // v
13494 int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
13495 // | first active
13496 // v
13497 int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13498 BrkpbHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
13499 BrkpbHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
13500 BrkpbHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
13501
13502 // The last active element of `pn` is `false` in all vector length configurations.
13503 // | last active lane when VL > 128 bits.
13504 // v
13505 // | last active lane when VL == 128 bits.
13506 // v
13507 int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13508 int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13509 BrkpbHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13510 BrkpbHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13511 BrkpbHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13512 // clang-format on
13513 }
13514
13515 TEST_SVE(sve_brkpa) {
13516 // clang-format off
13517 // The last active element of `pn` is `true` in all vector length configurations.
13518 // | boundary of 128-bit VL.
13519 // v
13520 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13521 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13522 int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13523
13524 // | highest-numbered lane lowest-numbered lane |
13525 // v v
13526 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13527 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13528 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
13529
13530 int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13531 int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13532 int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13533
13534 // | first active
13535 // v
13536 int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13537 // | first active
13538 // v
13539 int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13540 // | first active
13541 // v
13542 int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13543
13544 BrkpaHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
13545 BrkpaHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
13546 BrkpaHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
13547
13548 // | first active
13549 // v
13550 int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13551 // | first active
13552 // v
13553 int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13554 // | first active
13555 // v
13556 int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
13557 BrkpaHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
13558 BrkpaHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
13559 BrkpaHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
13560
13561 // | first active
13562 // v
13563 int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13564 // | first active
13565 // v
13566 int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13567 // | first active
13568 // v
13569 int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13570 BrkpaHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
13571 BrkpaHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
13572 BrkpaHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
13573
13574 // The last active element of `pn` is `false` in all vector length configurations.
13575 // | last active lane when VL > 128 bits.
13576 // v
13577 // | last active lane when VL == 128 bits.
13578 // v
13579 int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13580 int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13581 BrkpaHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13582 BrkpaHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13583 BrkpaHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13584 // clang-format on
13585 }
13586
13587 TEST_SVE(sve_rbit) {
13588 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13589 START();
13590
13591 uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13592 InsrHelper(&masm, z0.VnD(), inputs);
13593
13594 __ Ptrue(p1.VnB());
13595 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13596 Initialise(&masm, p2.VnB(), pred);
13597
13598 __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
13599 __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
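  // Rbit reverses the bit order within each lane, so applying it twice is the
  // identity; z0 is checked against the original inputs below.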
13600
13601 __ Rbit(z1.VnB(), p1.Merging(), z0.VnB());
13602 __ Rbit(z2.VnH(), p1.Merging(), z0.VnH());
13603 __ Rbit(z3.VnS(), p1.Merging(), z0.VnS());
13604 __ Rbit(z4.VnD(), p1.Merging(), z0.VnD());
13605
13606 __ Dup(z5.VnB(), 0x42);
13607 __ Rbit(z5.VnB(), p2.Merging(), z0.VnB());
13608 __ Dup(z6.VnB(), 0x42);
13609 __ Rbit(z6.VnS(), p2.Merging(), z0.VnS());
13610
13611 END();
13612
13613 if (CAN_RUN()) {
13614 RUN();
13615
13616 ASSERT_EQUAL_SVE(inputs, z0.VnD());
13617
13618 uint64_t expected_z1[] = {0x55555555aaaaaaaa, 0x5555aaaa55aa55aa};
13619 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13620 uint64_t expected_z2[] = {0x55555555aaaaaaaa, 0x5555aaaaaa55aa55};
13621 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13622 uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0xaaaa5555aa55aa55};
13623 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13624 uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0xaa55aa55aaaa5555};
13625 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13626 uint64_t expected_z5[] = {0x4255425542aa42aa, 0x4255424242aa42aa};
13627 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13628 uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0x42424242aa55aa55};
13629 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13630 }
13631 }
13632
13633 TEST_SVE(sve_rev_bhw) {
13634 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13635 START();
13636
13637 uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13638 InsrHelper(&masm, z0.VnD(), inputs);
13639
13640 __ Ptrue(p1.VnB());
13641 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13642 Initialise(&masm, p2.VnB(), pred);
13643
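  // Revb reverses the bytes within each H, S or D element, Revh reverses the
  // halfwords within each S or D element, and Revw reverses the words within
  // each D element. For example, Revb on H lanes maps the halfword 0xaa55 to
  // 0x55aa.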
13644 __ Revb(z1.VnH(), p1.Merging(), z0.VnH());
13645 __ Revb(z2.VnS(), p1.Merging(), z0.VnS());
13646 __ Revb(z3.VnD(), p1.Merging(), z0.VnD());
13647 __ Revh(z4.VnS(), p1.Merging(), z0.VnS());
13648 __ Revh(z5.VnD(), p1.Merging(), z0.VnD());
13649 __ Revw(z6.VnD(), p1.Merging(), z0.VnD());
13650
13651 __ Dup(z7.VnB(), 0x42);
13652 __ Revb(z7.VnH(), p2.Merging(), z0.VnH());
13653 __ Dup(z8.VnB(), 0x42);
13654 __ Revh(z8.VnS(), p2.Merging(), z0.VnS());
13655
13656 END();
13657
13658 if (CAN_RUN()) {
13659 RUN();
13660
13661 uint64_t expected_z1[] = {0xaaaaaaaa55555555, 0xaaaa555555aa55aa};
13662 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13663 uint64_t expected_z2[] = {0xaaaaaaaa55555555, 0x5555aaaa55aa55aa};
13664 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13665 uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0x55aa55aa5555aaaa};
13666 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13667 uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0x5555aaaaaa55aa55};
13668 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13669 uint64_t expected_z5[] = {0x55555555aaaaaaaa, 0xaa55aa555555aaaa};
13670 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13671 uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0xaa55aa55aaaa5555};
13672 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13673 uint64_t expected_z7[] = {0xaaaaaaaa55555555, 0xaaaa424255aa55aa};
13674 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13675 uint64_t expected_z8[] = {0xaaaaaaaa55555555, 0x42424242aa55aa55};
13676 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13677 }
13678 }
13679
13680 TEST_SVE(sve_ftssel) {
13681 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13682 START();
13683
13684 uint64_t in[] = {0x1111777766665555, 0xaaaabbbbccccdddd};
13685 uint64_t q[] = {0x0001000300000002, 0x0001000200000003};
13686 InsrHelper(&masm, z0.VnD(), in);
13687 InsrHelper(&masm, z1.VnD(), q);
13688
13689 __ Ftssel(z2.VnH(), z0.VnH(), z1.VnH());
13690 __ Ftssel(z3.VnS(), z0.VnS(), z1.VnS());
13691 __ Ftssel(z4.VnD(), z0.VnD(), z1.VnD());
13692
13693 END();
13694
13695 if (CAN_RUN()) {
13696 RUN();
13697
13698 uint64_t expected_z2[] = {0x3c00bc006666d555, 0x3c003bbbccccbc00};
13699 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13700 uint64_t expected_z3[] = {0xbf800000e6665555, 0x2aaabbbbbf800000};
13701 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13702 uint64_t expected_z4[] = {0x9111777766665555, 0xbff0000000000000};
13703 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13704 }
13705 }
13706
13707 TEST_SVE(sve_fexpa) {
13708 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13709 START();
13710
13711 uint64_t in0[] = {0x3ff0000000000000, 0x3ff0000000011001};
13712 uint64_t in1[] = {0x3ff000000002200f, 0xbff000000003301f};
13713 uint64_t in2[] = {0xbff000000004403f, 0x3ff0000000055040};
13714 uint64_t in3[] = {0x3f800000bf800001, 0x3f80000f3f80001f};
13715 uint64_t in4[] = {0x3f80002f3f82203f, 0xbf8000403f833041};
13716 uint64_t in5[] = {0x3c003c01bc00bc07, 0x3c08bc0f3c1fbc20};
13717 InsrHelper(&masm, z0.VnD(), in0);
13718 InsrHelper(&masm, z1.VnD(), in1);
13719 InsrHelper(&masm, z2.VnD(), in2);
13720 InsrHelper(&masm, z3.VnD(), in3);
13721 InsrHelper(&masm, z4.VnD(), in4);
13722 InsrHelper(&masm, z5.VnD(), in5);
13723
13724 __ Fexpa(z6.VnD(), z0.VnD());
13725 __ Fexpa(z7.VnD(), z1.VnD());
13726 __ Fexpa(z8.VnD(), z2.VnD());
13727 __ Fexpa(z9.VnS(), z3.VnS());
13728 __ Fexpa(z10.VnS(), z4.VnS());
13729 __ Fexpa(z11.VnH(), z5.VnH());
13730
13731 END();
13732
13733 if (CAN_RUN()) {
13734 RUN();
13735 uint64_t expected_z6[] = {0x0000000000000000, 0x44002c9a3e778061};
13736 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13737 uint64_t expected_z7[] = {0x0802d285a6e4030b, 0x4c06623882552225};
13738 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13739 uint64_t expected_z8[] = {0x100fa7c1819e90d8, 0x5410000000000000};
13740 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13741 uint64_t expected_z9[] = {0x00000000000164d2, 0x0016942d003311c4};
13742 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13743 uint64_t expected_z10[] = {0x0054f35b407d3e0c, 0x00800000608164d2};
13744 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13745 uint64_t expected_z11[] = {0x00000016000000a8, 0x00c2018903d40400};
13746 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13747 }
13748 }
13749
13750 TEST_SVE(sve_rev_p) {
13751 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13752 START();
13753
13754 Initialise(&masm,
13755 p0.VnB(),
13756 0xabcdabcdabcdabcd,
13757 0xabcdabcdabcdabcd,
13758 0xabcdabcdabcdabcd,
13759 0xabcdabcdabcdabcd);
13760
13761 __ Rev(p1.VnB(), p0.VnB());
13762 __ Rev(p2.VnH(), p0.VnH());
13763 __ Rev(p3.VnS(), p0.VnS());
13764 __ Rev(p4.VnD(), p0.VnD());
13765
13766 END();
13767
13768 if (CAN_RUN()) {
13769 RUN();
13770
13771 int p1_expected[] = {1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1};
13772 ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
13773 int p2_expected[] = {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0};
13774 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13775 int p3_expected[] = {1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0};
13776 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13777 int p4_expected[] = {1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1};
13778 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13779 }
13780 }
13781
13782 TEST_SVE(sve_trn_p_bh) {
13783 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13784 START();
13785
13786 Initialise(&masm, p0.VnB(), 0xa5a55a5a);
13787 __ Pfalse(p1.VnB());
13788
13789 __ Trn1(p2.VnB(), p0.VnB(), p0.VnB());
13790 __ Trn2(p3.VnB(), p0.VnB(), p0.VnB());
13791 __ Trn1(p4.VnB(), p1.VnB(), p0.VnB());
13792 __ Trn2(p5.VnB(), p1.VnB(), p0.VnB());
13793 __ Trn1(p6.VnB(), p0.VnB(), p1.VnB());
13794 __ Trn2(p7.VnB(), p0.VnB(), p1.VnB());
13795
13796 __ Trn1(p8.VnH(), p0.VnH(), p0.VnH());
13797 __ Trn2(p9.VnH(), p0.VnH(), p0.VnH());
13798 __ Trn1(p10.VnH(), p1.VnH(), p0.VnH());
13799 __ Trn2(p11.VnH(), p1.VnH(), p0.VnH());
13800 __ Trn1(p12.VnH(), p0.VnH(), p1.VnH());
13801 __ Trn2(p13.VnH(), p0.VnH(), p1.VnH());
13802
13803 END();
13804
13805 if (CAN_RUN()) {
13806 RUN();
13807 int p2_expected[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
13808 int p3_expected[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
13809 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13810 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13811
13812 int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13813 int p5_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13814 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13815 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13816
13817 int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0};
13818 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
13819 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13820 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13821
13822 int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13823 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13824 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13825 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13826
13827 int p10_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13828 int p11_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13829 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13830 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13831
13832 int p12_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13833 int p13_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13834 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13835 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13836 }
13837 }
13838
13839 TEST_SVE(sve_trn_p_sd) {
13840 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13841 START();
13842
13843 Initialise(&masm, p0.VnB(), 0x55a55aaa);
13844 __ Pfalse(p1.VnB());
13845
13846 __ Trn1(p2.VnS(), p0.VnS(), p0.VnS());
13847 __ Trn2(p3.VnS(), p0.VnS(), p0.VnS());
13848 __ Trn1(p4.VnS(), p1.VnS(), p0.VnS());
13849 __ Trn2(p5.VnS(), p1.VnS(), p0.VnS());
13850 __ Trn1(p6.VnS(), p0.VnS(), p1.VnS());
13851 __ Trn2(p7.VnS(), p0.VnS(), p1.VnS());
13852
13853 __ Trn1(p8.VnD(), p0.VnD(), p0.VnD());
13854 __ Trn2(p9.VnD(), p0.VnD(), p0.VnD());
13855 __ Trn1(p10.VnD(), p1.VnD(), p0.VnD());
13856 __ Trn2(p11.VnD(), p1.VnD(), p0.VnD());
13857 __ Trn1(p12.VnD(), p0.VnD(), p1.VnD());
13858 __ Trn2(p13.VnD(), p0.VnD(), p1.VnD());
13859
13860 END();
13861
13862 if (CAN_RUN()) {
13863 RUN();
13864 int p2_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13865 int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13866 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13867 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13868
13869 int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13870 int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13871 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13872 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13873
13874 int p6_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13875 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13876 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13877 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13878
13879 int p8_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13880 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13881 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13882 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13883
13884 int p10_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13885 int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13886 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13887 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13888
13889 int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13890 int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13891 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13892 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13893 }
13894 }
13895
13896 TEST_SVE(sve_zip_p_bh) {
13897 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13898 START();
13899
13900 Initialise(&masm,
13901 p0.VnB(),
13902 0x5a5a5a5a5a5a5a5a,
13903 0x5a5a5a5a5a5a5a5a,
13904 0x5a5a5a5a5a5a5a5a,
13905 0x5a5a5a5a5a5a5a5a);
13906 __ Pfalse(p1.VnB());
13907
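// Zip1 interleaves the lanes from the low halves of the two source
// predicates; Zip2 interleaves the lanes from the high halves.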
13908 __ Zip1(p2.VnB(), p0.VnB(), p0.VnB());
13909 __ Zip2(p3.VnB(), p0.VnB(), p0.VnB());
13910 __ Zip1(p4.VnB(), p1.VnB(), p0.VnB());
13911 __ Zip2(p5.VnB(), p1.VnB(), p0.VnB());
13912 __ Zip1(p6.VnB(), p0.VnB(), p1.VnB());
13913 __ Zip2(p7.VnB(), p0.VnB(), p1.VnB());
13914
13915 __ Zip1(p8.VnH(), p0.VnH(), p0.VnH());
13916 __ Zip2(p9.VnH(), p0.VnH(), p0.VnH());
13917 __ Zip1(p10.VnH(), p1.VnH(), p0.VnH());
13918 __ Zip2(p11.VnH(), p1.VnH(), p0.VnH());
13919 __ Zip1(p12.VnH(), p0.VnH(), p1.VnH());
13920 __ Zip2(p13.VnH(), p0.VnH(), p1.VnH());
13921
13922 END();
13923
13924 if (CAN_RUN()) {
13925 RUN();
13926 int p2_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13927 int p3_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13928 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13929 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13930
13931 int p4_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13932 int p5_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13933 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13934 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13935
13936 int p6_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13937 int p7_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13938 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13939 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13940
13941 int p8_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13942 int p9_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13943 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13944 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13945
13946 int p10_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13947 int p11_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13948 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13949 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13950
13951 int p12_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13952 int p13_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13953 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13954 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13955 }
13956 }
13957
13958 TEST_SVE(sve_zip_p_sd) {
13959 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13960 START();
13961
13962 Initialise(&masm,
13963 p0.VnB(),
13964 0x5a5a5a5a5a5a5a5a,
13965 0x5a5a5a5a5a5a5a5a,
13966 0x5a5a5a5a5a5a5a5a,
13967 0x5a5a5a5a5a5a5a5a);
13968 __ Pfalse(p1.VnB());
13969
13970 __ Zip1(p2.VnS(), p0.VnS(), p0.VnS());
13971 __ Zip2(p3.VnS(), p0.VnS(), p0.VnS());
13972 __ Zip1(p4.VnS(), p1.VnS(), p0.VnS());
13973 __ Zip2(p5.VnS(), p1.VnS(), p0.VnS());
13974 __ Zip1(p6.VnS(), p0.VnS(), p1.VnS());
13975 __ Zip2(p7.VnS(), p0.VnS(), p1.VnS());
13976
13977 __ Zip1(p8.VnD(), p0.VnD(), p0.VnD());
13978 __ Zip2(p9.VnD(), p0.VnD(), p0.VnD());
13979 __ Zip1(p10.VnD(), p1.VnD(), p0.VnD());
13980 __ Zip2(p11.VnD(), p1.VnD(), p0.VnD());
13981 __ Zip1(p12.VnD(), p0.VnD(), p1.VnD());
13982 __ Zip2(p13.VnD(), p0.VnD(), p1.VnD());
13983
13984 END();
13985
13986 if (CAN_RUN()) {
13987 RUN();
13988 int p2_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13989 int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13990 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13991 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13992
13993 int p4_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13994 int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13995 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13996 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13997
13998 int p6_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13999 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
14000 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
14001 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
14002
14003 int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14004 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14005 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
14006 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
14007
14008 int p10_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14009 int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14010 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
14011 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
14012
14013 int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14014 int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14015 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
14016 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
14017 }
14018 }
14019
14020 TEST_SVE(sve_uzp_p) {
14021 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14022 START();
14023
14024 Initialise(&masm,
14025 p0.VnB(),
14026 0xf0f0ff00ffff0000,
14027 0x4242424242424242,
14028 0x5a5a5a5a5a5a5a5a,
14029 0x0123456789abcdef);
14030 __ Rev(p1.VnB(), p0.VnB());
14031
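// Uzp1/Uzp2 concatenate the even/odd-numbered lanes of their operands, so
// unzipping the zipped results should give back the original p0 and p1.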
14032 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
14033 __ Zip2(p3.VnB(), p0.VnB(), p1.VnB());
14034 __ Uzp1(p4.VnB(), p2.VnB(), p3.VnB());
14035 __ Uzp2(p5.VnB(), p2.VnB(), p3.VnB());
14036
14037 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH());
14038 __ Zip2(p3.VnH(), p0.VnH(), p1.VnH());
14039 __ Uzp1(p6.VnH(), p2.VnH(), p3.VnH());
14040 __ Uzp2(p7.VnH(), p2.VnH(), p3.VnH());
14041
14042 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14043 __ Zip2(p3.VnS(), p0.VnS(), p1.VnS());
14044 __ Uzp1(p8.VnS(), p2.VnS(), p3.VnS());
14045 __ Uzp2(p9.VnS(), p2.VnS(), p3.VnS());
14046
14047 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14048 __ Zip2(p3.VnD(), p0.VnD(), p1.VnD());
14049 __ Uzp1(p10.VnD(), p2.VnD(), p3.VnD());
14050 __ Uzp2(p11.VnD(), p2.VnD(), p3.VnD());
14051
14052 END();
14053
14054 if (CAN_RUN()) {
14055 RUN();
14056
14057 ASSERT_EQUAL_SVE(p0, p4);
14058 ASSERT_EQUAL_SVE(p1, p5);
14059 ASSERT_EQUAL_SVE(p0, p6);
14060 ASSERT_EQUAL_SVE(p1, p7);
14061 ASSERT_EQUAL_SVE(p0, p8);
14062 ASSERT_EQUAL_SVE(p1, p9);
14063 ASSERT_EQUAL_SVE(p0, p10);
14064 ASSERT_EQUAL_SVE(p1, p11);
14065 }
14066 }
14067
14068 TEST_SVE(sve_punpk) {
14069 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14070 START();
14071
14072 auto get_64_bits_at = [](int byte_index) -> uint64_t {
14073 // Each 8-bit chunk has the value 0x50 + the byte index of the chunk.
14074 return 0x5756555453525150 + (0x0101010101010101 * byte_index);
14075 };
14076
14077 Initialise(&masm,
14078 p0.VnB(),
14079 get_64_bits_at(24),
14080 get_64_bits_at(16),
14081 get_64_bits_at(8),
14082 get_64_bits_at(0));
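// Punpklo/Punpkhi unpack the low/high half of the source predicate, widening
// each byte-lane flag to the bottom bit of a halfword lane.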
14083 __ Punpklo(p1.VnH(), p0.VnB());
14084 __ Punpkhi(p2.VnH(), p0.VnB());
14085
14086 END();
14087
14088 if (CAN_RUN()) {
14089 RUN();
14090
14091 int pl = config->sve_vl_in_bits() / kZRegBitsPerPRegBit;
14092 // For simplicity, just test the bottom 64 H-sized lanes.
14093 uint64_t p1_h_bits = get_64_bits_at(0);
14094 uint64_t p2_h_bits = get_64_bits_at(pl / (2 * 8));
14095 int p1_expected[64];
14096 int p2_expected[64];
14097 for (size_t i = 0; i < 64; i++) {
14098 p1_expected[63 - i] = (p1_h_bits >> i) & 1;
14099 p2_expected[63 - i] = (p2_h_bits >> i) & 1;
14100 }
14101 // Testing `VnH` ensures that odd-numbered B lanes are zero.
14102 ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
14103 ASSERT_EQUAL_SVE(p2_expected, p2.VnH());
14104 }
14105 }
14106
14107 typedef void (MacroAssembler::*BrkFn)(const PRegisterWithLaneSize& pd,
14108 const PRegister& pg,
14109 const PRegisterWithLaneSize& pn);
14110
14111 typedef void (MacroAssembler::*BrksFn)(const PRegisterWithLaneSize& pd,
14112 const PRegisterZ& pg,
14113 const PRegisterWithLaneSize& pn);
14114
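// Brka ("break after") sets destination lanes up to and including the first
// active lane of pn that is true, and clears the rest; Brkb ("break before")
// stops just before that lane. With zeroing, inactive lanes are cleared; with
// merging, they keep the value already in pd. The Brkas/Brkbs forms also set
// NZCV from the resulting predicate.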
14115 template <typename T, size_t N>
14116 static void BrkaBrkbHelper(Test* config,
14117 BrkFn macro,
14118 BrksFn macro_set_flags,
14119 const T (&pd_inputs)[N],
14120 const T (&pg_inputs)[N],
14121 const T (&pn_inputs)[N],
14122 const T (&pd_z_expected)[N]) {
14123 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14124 START();
14125
14126 PRegister pg = p10;
14127 PRegister pn = p9;
14128 PRegister pd_z = p0;
14129 PRegister pd_z_s = p1;
14130 PRegister pd_m = p2;
14131 Initialise(&masm, pg.VnB(), pg_inputs);
14132 Initialise(&masm, pn.VnB(), pn_inputs);
14133 Initialise(&masm, pd_m.VnB(), pd_inputs);
14134
14135 // Initialise NZCV to an impossible value, to check that we actually write it.
14136 __ Mov(x10, NZCVFlag);
14137 __ Msr(NZCV, x10);
14138
14139 (masm.*macro)(pd_z.VnB(), pg.Zeroing(), pn.VnB());
14140 (masm.*macro_set_flags)(pd_z_s.VnB(), pg.Zeroing(), pn.VnB());
14141 __ Mrs(x0, NZCV);
14142
14143 (masm.*macro)(pd_m.VnB(), pg.Merging(), pn.VnB());
14144
14145 END();
14146
14147 if (CAN_RUN()) {
14148 RUN();
14149
14150 ASSERT_EQUAL_SVE(pd_z_expected, pd_z.VnB());
14151
14152 // Check that the flags were properly set.
14153 StatusFlags nzcv_expected =
14154 GetPredTestFlags(pd_z_expected,
14155 pg_inputs,
14156 core.GetSVELaneCount(kBRegSize));
14157 ASSERT_EQUAL_64(nzcv_expected, x0);
14158 ASSERT_EQUAL_SVE(pd_z.VnB(), pd_z_s.VnB());
14159
14160 T pd_m_expected[N];
14161 // Set expected `pd` result on merging predication.
14162 for (size_t i = 0; i < N; i++) {
14163 pd_m_expected[i] = pg_inputs[i] ? pd_z_expected[i] : pd_inputs[i];
14164 }
14165 ASSERT_EQUAL_SVE(pd_m_expected, pd_m.VnB());
14166 }
14167 }
14168
14169 template <typename T>
14170 static void BrkaHelper(Test* config,
14171 const T& pd_inputs,
14172 const T& pg_inputs,
14173 const T& pn_inputs,
14174 const T& pd_expected) {
14175 BrkaBrkbHelper(config,
14176 &MacroAssembler::Brka,
14177 &MacroAssembler::Brkas,
14178 pd_inputs,
14179 pg_inputs,
14180 pn_inputs,
14181 pd_expected);
14182 }
14183
14184 TEST_SVE(sve_brka) {
14185 // clang-format off
14186 // | boundary of 128-bit VL.
14187 // v
14188 int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14189
14190 // | highest-numbered lane lowest-numbered lane |
14191 // v v
14192 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14193 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14194
14195 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14196 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14197 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14198
14199 // | first break
14200 // v
14201 int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
14202 // | first break
14203 // v
14204 int exp_1_2[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14205 // | first break
14206 // v
14207 int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14208
14209 BrkaHelper(config, pd, pg_1, pn_1, exp_1_1);
14210 BrkaHelper(config, pd, pg_1, pn_2, exp_1_2);
14211 BrkaHelper(config, pd, pg_1, pn_3, exp_1_3);
14212
14213 // | first break
14214 // v
14215 int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
14216 // | first break
14217 // v
14218 int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14219 // | first break
14220 // v
14221 int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
14222 BrkaHelper(config, pd, pg_2, pn_1, exp_2_1);
14223 BrkaHelper(config, pd, pg_2, pn_2, exp_2_2);
14224 BrkaHelper(config, pd, pg_2, pn_3, exp_2_3);
14225
14226 // An all-inactive governing predicate, with zeroing, sets the destination predicate to all-false.
14227 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14228 int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14229 BrkaHelper(config, pd, pg_3, pn_1, exp_3_x);
14230 BrkaHelper(config, pd, pg_3, pn_2, exp_3_x);
14231 BrkaHelper(config, pd, pg_3, pn_3, exp_3_x);
14232 // clang-format on
14233 }
14234
14235 template <typename T>
14236 static void BrkbHelper(Test* config,
14237 const T& pd_inputs,
14238 const T& pg_inputs,
14239 const T& pn_inputs,
14240 const T& pd_expected) {
14241 BrkaBrkbHelper(config,
14242 &MacroAssembler::Brkb,
14243 &MacroAssembler::Brkbs,
14244 pd_inputs,
14245 pg_inputs,
14246 pn_inputs,
14247 pd_expected);
14248 }
14249
14250 TEST_SVE(sve_brkb) {
14251 // clang-format off
14252 // | boundary of 128-bit VL.
14253 // v
14254 int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14255
14256 // | highest-numbered lane lowest-numbered lane |
14257 // v v
14258 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14259 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14260
14261 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14262 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14263 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14264
14265 // | first break
14266 // v
14267 int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
14268 // | first break
14269 // v
14270 int exp_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14271 // | first break
14272 // v
14273 int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
14274
14275 BrkbHelper(config, pd, pg_1, pn_1, exp_1_1);
14276 BrkbHelper(config, pd, pg_1, pn_2, exp_1_2);
14277 BrkbHelper(config, pd, pg_1, pn_3, exp_1_3);
14278
14279 // | first break
14280 // v
14281 int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
14282 // | first break
14283 // v
14284 int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14285 // | first break
14286 // v
14287 int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14288 BrkbHelper(config, pd, pg_2, pn_1, exp_2_1);
14289 BrkbHelper(config, pd, pg_2, pn_2, exp_2_2);
14290 BrkbHelper(config, pd, pg_2, pn_3, exp_2_3);
14291
14292 // An all-inactive governing predicate, with zeroing, sets the destination predicate to all-false.
14293 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14294 int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14295 BrkbHelper(config, pd, pg_3, pn_1, exp_3_x);
14296 BrkbHelper(config, pd, pg_3, pn_2, exp_3_x);
14297 BrkbHelper(config, pd, pg_3, pn_3, exp_3_x);
14298 // clang-format on
14299 }
14300
14301 typedef void (MacroAssembler::*BrknFn)(const PRegisterWithLaneSize& pd,
14302 const PRegisterZ& pg,
14303 const PRegisterWithLaneSize& pn,
14304 const PRegisterWithLaneSize& pm);
14305
14306 typedef void (MacroAssembler::*BrknsFn)(const PRegisterWithLaneSize& pd,
14307 const PRegisterZ& pg,
14308 const PRegisterWithLaneSize& pn,
14309 const PRegisterWithLaneSize& pm);
14310
14311 enum BrknDstPredicateState { kAllFalse, kUnchanged };
14312
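// Brkn propagates a break condition from one predicate partition to the next:
// if the last active lane of pn is true, the destination (which must also be
// the second source) is left unchanged, otherwise it is set to all-false.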
14313 template <typename T, size_t N>
14314 static void BrknHelper(Test* config,
14315 const T (&pd_inputs)[N],
14316 const T (&pg_inputs)[N],
14317 const T (&pn_inputs)[N],
14318 const T (&pm_inputs)[N],
14319 BrknDstPredicateState expected_pd_state) {
14320 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14321 START();
14322
14323 PRegister pg = p10;
14324 PRegister pn = p9;
14325 PRegister pm = p8;
14326 PRegister pdm = p0;
14327 PRegister pd = p1;
14328 PRegister pd_s = p2;
14329 Initialise(&masm, pg.VnB(), pg_inputs);
14330 Initialise(&masm, pn.VnB(), pn_inputs);
14331 Initialise(&masm, pm.VnB(), pm_inputs);
14332 Initialise(&masm, pdm.VnB(), pm_inputs);
14333 Initialise(&masm, pd.VnB(), pd_inputs);
14334 Initialise(&masm, pd_s.VnB(), pd_inputs);
14335
14336 // Initialise NZCV to an impossible value, to check that we actually write it.
14337 __ Mov(x10, NZCVFlag);
14338 __ Msr(NZCV, x10);
14339
14340 __ Brkn(pdm.VnB(), pg.Zeroing(), pn.VnB(), pdm.VnB());
14341 // The next calls use a distinct pd and pm, i.e. !pd.Aliases(pm).
14342 __ Brkn(pd.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
14343 __ Brkns(pd_s.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
14344 __ Mrs(x0, NZCV);
14345
14346 END();
14347
14348 if (CAN_RUN()) {
14349 RUN();
14350
14351 T all_false[N] = {0};
14352 if (expected_pd_state == kAllFalse) {
14353 ASSERT_EQUAL_SVE(all_false, pd.VnB());
14354 } else {
14355 ASSERT_EQUAL_SVE(pm_inputs, pd.VnB());
14356 }
14357 ASSERT_EQUAL_SVE(pm_inputs, pm.VnB());
14358
14359 T all_true[N];
14360 for (size_t i = 0; i < ArrayLength(all_true); i++) {
14361 all_true[i] = 1;
14362 }
14363
14364 // Check that the flags were properly set.
14365 StatusFlags nzcv_expected =
14366 GetPredTestFlags((expected_pd_state == kAllFalse) ? all_false
14367 : pm_inputs,
14368 all_true,
14369 core.GetSVELaneCount(kBRegSize));
14370 ASSERT_EQUAL_64(nzcv_expected, x0);
14371 ASSERT_EQUAL_SVE(pd.VnB(), pdm.VnB());
14372 ASSERT_EQUAL_SVE(pd.VnB(), pd_s.VnB());
14373 }
14374 }
14375
14376 TEST_SVE(sve_brkn) {
14377 int pd[] = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14378 int pm[] = {0, 1, 1, 1, 1, 0, 0, 1, 0, 1};
14379
14380 int pg_1[] = {1, 1, 0, 0, 1, 0, 1, 1, 0, 0};
14381 int pg_2[] = {0, 0, 0, 1, 1, 1, 0, 0, 1, 1};
14382 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14383
14384 int pn_1[] = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0};
14385 int pn_2[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
14386 int pn_3[] = {0, 0, 0, 0, 1, 1, 0, 0, 1, 1};
14387
14388 BrknHelper(config, pd, pg_1, pn_1, pm, kUnchanged);
14389 BrknHelper(config, pd, pg_1, pn_2, pm, kAllFalse);
14390 BrknHelper(config, pd, pg_1, pn_3, pm, kAllFalse);
14391
14392 BrknHelper(config, pd, pg_2, pn_1, pm, kAllFalse);
14393 BrknHelper(config, pd, pg_2, pn_2, pm, kUnchanged);
14394 BrknHelper(config, pd, pg_2, pn_3, pm, kAllFalse);
14395
14396 BrknHelper(config, pd, pg_3, pn_1, pm, kAllFalse);
14397 BrknHelper(config, pd, pg_3, pn_2, pm, kAllFalse);
14398 BrknHelper(config, pd, pg_3, pn_3, pm, kAllFalse);
14399 }
14400
14401 TEST_SVE(sve_trn) {
14402 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14403 START();
14404
14405 uint64_t in0[] = {0xffeeddccbbaa9988, 0x7766554433221100};
14406 uint64_t in1[] = {0xaa55aa55aa55aa55, 0x55aa55aa55aa55aa};
14407 InsrHelper(&masm, z0.VnD(), in0);
14408 InsrHelper(&masm, z1.VnD(), in1);
14409
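// Trn1/Trn2 build each destination lane pair from the even/odd-numbered lanes
// of the two source vectors, for each lane size.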
14410 __ Trn1(z2.VnB(), z0.VnB(), z1.VnB());
14411 __ Trn2(z3.VnB(), z0.VnB(), z1.VnB());
14412 __ Trn1(z4.VnH(), z0.VnH(), z1.VnH());
14413 __ Trn2(z5.VnH(), z0.VnH(), z1.VnH());
14414 __ Trn1(z6.VnS(), z0.VnS(), z1.VnS());
14415 __ Trn2(z7.VnS(), z0.VnS(), z1.VnS());
14416 __ Trn1(z8.VnD(), z0.VnD(), z1.VnD());
14417 __ Trn2(z9.VnD(), z0.VnD(), z1.VnD());
14418
14419 END();
14420
14421 if (CAN_RUN()) {
14422 RUN();
14423 uint64_t expected_z2[] = {0x55ee55cc55aa5588, 0xaa66aa44aa22aa00};
14424 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14425 uint64_t expected_z3[] = {0xaaffaaddaabbaa99, 0x5577555555335511};
14426 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14427 uint64_t expected_z4[] = {0xaa55ddccaa559988, 0x55aa554455aa1100};
14428 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14429 uint64_t expected_z5[] = {0xaa55ffeeaa55bbaa, 0x55aa776655aa3322};
14430 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14431 uint64_t expected_z6[] = {0xaa55aa55bbaa9988, 0x55aa55aa33221100};
14432 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14433 uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0x55aa55aa77665544};
14434 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14435 uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14436 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14437 uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14438 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14439 }
14440 }
14441
14442 TEST_SVE(sve_zip_uzp) {
14443 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14444 START();
14445
14446 __ Dup(z0.VnD(), 0xffeeddccbbaa9988);
14447 __ Insr(z0.VnD(), 0x7766554433221100);
14448 __ Dup(z1.VnD(), 0xaa55aa55aa55aa55);
14449 __ Insr(z1.VnD(), 0x55aa55aa55aa55aa);
14450
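// Zip1/Zip2 interleave the low/high halves of z0 and z1; Uzp1/Uzp2 then
// extract the even/odd-numbered lanes, which should undo the zip.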
14451 __ Zip1(z2.VnB(), z0.VnB(), z1.VnB());
14452 __ Zip2(z3.VnB(), z0.VnB(), z1.VnB());
14453 __ Zip1(z4.VnH(), z0.VnH(), z1.VnH());
14454 __ Zip2(z5.VnH(), z0.VnH(), z1.VnH());
14455 __ Zip1(z6.VnS(), z0.VnS(), z1.VnS());
14456 __ Zip2(z7.VnS(), z0.VnS(), z1.VnS());
14457 __ Zip1(z8.VnD(), z0.VnD(), z1.VnD());
14458 __ Zip2(z9.VnD(), z0.VnD(), z1.VnD());
14459
14460 __ Uzp1(z10.VnB(), z2.VnB(), z3.VnB());
14461 __ Uzp2(z11.VnB(), z2.VnB(), z3.VnB());
14462 __ Uzp1(z12.VnH(), z4.VnH(), z5.VnH());
14463 __ Uzp2(z13.VnH(), z4.VnH(), z5.VnH());
14464 __ Uzp1(z14.VnS(), z6.VnS(), z7.VnS());
14465 __ Uzp2(z15.VnS(), z6.VnS(), z7.VnS());
14466 __ Uzp1(z16.VnD(), z8.VnD(), z9.VnD());
14467 __ Uzp2(z17.VnD(), z8.VnD(), z9.VnD());
14468
14469 END();
14470
14471 if (CAN_RUN()) {
14472 RUN();
14473 uint64_t expected_z2[] = {0x5577aa665555aa44, 0x5533aa225511aa00};
14474 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14475 uint64_t expected_z3[] = {0xaaff55eeaadd55cc, 0xaabb55aaaa995588};
14476 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14477 uint64_t expected_z4[] = {0x55aa776655aa5544, 0x55aa332255aa1100};
14478 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14479 uint64_t expected_z5[] = {0xaa55ffeeaa55ddcc, 0xaa55bbaaaa559988};
14480 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14481 uint64_t expected_z6[] = {0x55aa55aa77665544, 0x55aa55aa33221100};
14482 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14483 uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0xaa55aa55bbaa9988};
14484 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14485 uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14486 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14487 uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14488 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14489
14490 // Check uzp is the opposite of zip.
14491 ASSERT_EQUAL_SVE(z0.VnD(), z10.VnD());
14492 ASSERT_EQUAL_SVE(z1.VnD(), z11.VnD());
14493 ASSERT_EQUAL_SVE(z0.VnD(), z12.VnD());
14494 ASSERT_EQUAL_SVE(z1.VnD(), z13.VnD());
14495 ASSERT_EQUAL_SVE(z0.VnD(), z14.VnD());
14496 ASSERT_EQUAL_SVE(z1.VnD(), z15.VnD());
14497 ASSERT_EQUAL_SVE(z0.VnD(), z16.VnD());
14498 ASSERT_EQUAL_SVE(z1.VnD(), z17.VnD());
14499 }
14500 }
14501
14502 TEST_SVE(sve_fcadd) {
14503 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14504 START();
14505
14506 __ Dup(z30.VnS(), 0);
14507
14508 __ Ptrue(p0.VnB());
14509 __ Pfalse(p1.VnB());
14510 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
14511 __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
14512
14513 __ Fdup(z0.VnH(), 10.0); // 10i + 10
14514 __ Fdup(z1.VnH(), 5.0); // 5i + 5
14515 __ Index(z7.VnH(), 1, 1);
14516 __ Scvtf(z7.VnH(), p0.Merging(), z7.VnH()); // Ai + B
14517
14518 __ Sel(z2.VnH(), p3, z1.VnH(), z30.VnH()); // 5i + 0
14519 __ Sel(z3.VnH(), p2, z1.VnH(), z30.VnH()); // 0i + 5
14520 __ Sel(z7.VnH(), p3, z7.VnH(), z0.VnH()); // Ai + 10
14521 __ Mov(z8, z7);
14522 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 2);
14523 __ Sel(z8.VnH(), p2, z8.VnH(), z30.VnH()); // 0i + A
14524
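// Fcadd treats each pair of adjacent lanes as a complex number (even lane
// real, odd lane imaginary) and adds the second operand rotated by 90 or 270
// degrees in the complex plane.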
14525 // (10i + 10) + rotate(5i + 0, 90)
14526 // = (10i + 10) + (0i - 5)
14527 // = 10i + 5
14528 __ Fcadd(z4.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90);
14529
14530 // (10i + 5) + rotate(0i + 5, 270)
14531 // = (10i + 5) + (-5i + 0)
14532 // = 5i + 5
14533 __ Fcadd(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH(), 270);
14534
14535 // The same calculation, but selecting real/imaginary using predication.
14536 __ Mov(z5, z0);
14537 __ Fcadd(z5.VnH(), p2.Merging(), z5.VnH(), z1.VnH(), 90);
14538 __ Fcadd(z5.VnH(), p3.Merging(), z5.VnH(), z1.VnH(), 270);
14539
14540 // Reference calculation: (10i + 10) - (5i + 5)
14541 __ Fsub(z6.VnH(), z0.VnH(), z1.VnH());
14542
14543 // Calculation using varying imaginary values.
14544 // (Ai + 10) + rotate(5i + 0, 90)
14545 // = (Ai + 10) + (0i - 5)
14546 // = Ai + 5
14547 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z2.VnH(), 90);
14548
14549 // (Ai + 5) + rotate(0i + A, 270)
14550 // = (Ai + 5) + (-Ai + 0)
14551 // = 5
14552 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z8.VnH(), 270);
14553
14554 // Repeated, but for wider elements.
14555 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14556 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14557 __ Fdup(z0.VnS(), 42.0);
14558 __ Fdup(z1.VnS(), 21.0);
14559 __ Index(z11.VnS(), 1, 1);
14560 __ Scvtf(z11.VnS(), p0.Merging(), z11.VnS());
14561 __ Sel(z2.VnS(), p3, z1.VnS(), z30.VnS());
14562 __ Sel(z29.VnS(), p2, z1.VnS(), z30.VnS());
14563 __ Sel(z11.VnS(), p3, z11.VnS(), z0.VnS());
14564 __ Mov(z12, z11);
14565 __ Ext(z12.VnB(), z12.VnB(), z12.VnB(), 4);
14566 __ Sel(z12.VnS(), p2, z12.VnS(), z30.VnS());
14567 __ Fcadd(z8.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90);
14568 __ Fcadd(z8.VnS(), p0.Merging(), z8.VnS(), z29.VnS(), 270);
14569 __ Mov(z9, z0);
14570 __ Fcadd(z9.VnS(), p2.Merging(), z9.VnS(), z1.VnS(), 90);
14571 __ Fcadd(z9.VnS(), p3.Merging(), z9.VnS(), z1.VnS(), 270);
14572 __ Fsub(z10.VnS(), z0.VnS(), z1.VnS());
14573 __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z2.VnS(), 90);
14574 __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z12.VnS(), 270);
14575
14576 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14577 __ Zip1(p3.VnD(), p1.VnD(), p0.VnD());
14578 __ Fdup(z0.VnD(), -42.0);
14579 __ Fdup(z1.VnD(), -21.0);
14580 __ Index(z15.VnD(), 1, 1);
14581 __ Scvtf(z15.VnD(), p0.Merging(), z15.VnD());
14582 __ Sel(z2.VnD(), p3, z1.VnD(), z30.VnD());
14583 __ Sel(z28.VnD(), p2, z1.VnD(), z30.VnD());
14584 __ Sel(z15.VnD(), p3, z15.VnD(), z0.VnD());
14585 __ Mov(z16, z15);
14586 __ Ext(z16.VnB(), z16.VnB(), z16.VnB(), 8);
14587 __ Sel(z16.VnD(), p2, z16.VnD(), z30.VnD());
14588 __ Fcadd(z12.VnD(), p0.Merging(), z0.VnD(), z2.VnD(), 90);
14589 __ Fcadd(z12.VnD(), p0.Merging(), z12.VnD(), z28.VnD(), 270);
14590 __ Mov(z13, z0);
14591 __ Fcadd(z13.VnD(), p2.Merging(), z13.VnD(), z1.VnD(), 90);
14592 __ Fcadd(z13.VnD(), p3.Merging(), z13.VnD(), z1.VnD(), 270);
14593 __ Fsub(z14.VnD(), z0.VnD(), z1.VnD());
14594 __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z2.VnD(), 90);
14595 __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z16.VnD(), 270);
14596 END();
14597
14598 if (CAN_RUN()) {
14599 RUN();
14600 ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14601 ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14602 ASSERT_EQUAL_SVE(z3.VnH(), z7.VnH());
14603 ASSERT_EQUAL_SVE(z10.VnS(), z8.VnS());
14604 ASSERT_EQUAL_SVE(z10.VnS(), z9.VnS());
14605 ASSERT_EQUAL_SVE(z29.VnS(), z11.VnS());
14606 ASSERT_EQUAL_SVE(z14.VnD(), z12.VnD());
14607 ASSERT_EQUAL_SVE(z14.VnD(), z13.VnD());
14608 ASSERT_EQUAL_SVE(z28.VnS(), z15.VnS());
14609 }
14610 }
14611
14612 TEST_SVE(sve_fcmla_index) {
14613 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14614 START();
14615
14616 __ Ptrue(p0.VnB());
14617
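// The indexed form of Fcmla multiplies each complex number in the first
// source operand by the complex number selected by the index from the
// corresponding 128-bit segment of the second operand, accumulating into the
// destination.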
14618 __ Fdup(z0.VnH(), 10.0);
14619 __ Fdup(z2.VnH(), 2.0);
14620 __ Zip1(z0.VnH(), z0.VnH(), z2.VnH());
14621
14622 // Duplicate complex numbers across z2 segments. First segment has 1i+0,
14623 // second has 3i+2, etc.
14624 __ Index(z1.VnH(), 0, 1);
14625 __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14626 __ Zip1(z2.VnS(), z1.VnS(), z1.VnS());
14627 __ Zip1(z2.VnS(), z2.VnS(), z2.VnS());
14628
14629 // Derive a vector from z2 where only the third element in each segment
14630 // contains a complex number, with other elements zero.
14631 __ Index(z3.VnS(), 0, 1);
14632 __ And(z3.VnS(), z3.VnS(), 3);
14633 __ Cmpeq(p2.VnS(), p0.Zeroing(), z3.VnS(), 2);
14634 __ Dup(z3.VnB(), 0);
14635 __ Sel(z3.VnS(), p2, z2.VnS(), z3.VnS());
14636
14637 // Use indexed complex multiply on this vector, indexing the third element.
14638 __ Dup(z4.VnH(), 0);
14639 __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 0);
14640 __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 90);
14641
14642 // Rotate the indexed complex number and repeat, negated, and with a different
14643 // index.
14644 __ Ext(z3.VnH(), z3.VnH(), z3.VnH(), 4);
14645 __ Dup(z5.VnH(), 0);
14646 __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 180);
14647 __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 270);
14648 __ Fneg(z5.VnH(), p0.Merging(), z5.VnH());
14649
14650 // Create a reference result from a vector complex multiply.
14651 __ Dup(z6.VnH(), 0);
14652 __ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 0);
14653 __ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 90);
14654
14655 // Repeated, but for wider elements.
14656 __ Fdup(z0.VnS(), 42.0);
14657 __ Fdup(z2.VnS(), 24.0);
14658 __ Zip1(z0.VnS(), z0.VnS(), z2.VnS());
14659 __ Index(z1.VnS(), -42, 13);
14660 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14661 __ Zip1(z2.VnD(), z1.VnD(), z1.VnD());
14662 __ Zip1(z2.VnD(), z2.VnD(), z2.VnD());
14663 __ Index(z3.VnD(), 0, 1);
14664 __ And(z3.VnD(), z3.VnD(), 1);
14665 __ Cmpeq(p2.VnD(), p0.Zeroing(), z3.VnD(), 1);
14666 __ Dup(z3.VnB(), 0);
14667 __ Sel(z3.VnD(), p2, z2.VnD(), z3.VnD());
14668 __ Dup(z7.VnS(), 0);
14669 __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 0);
14670 __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 90);
14671 __ Ext(z3.VnB(), z3.VnB(), z3.VnB(), 8);
14672 __ Dup(z8.VnS(), 0);
14673 __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 180);
14674 __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 270);
14675 __ Fneg(z8.VnS(), p0.Merging(), z8.VnS());
14676 __ Dup(z9.VnS(), 0);
14677 __ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 0);
14678 __ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 90);
14679 END();
14680
14681 if (CAN_RUN()) {
14682 RUN();
14683 ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14684 ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14685 ASSERT_EQUAL_SVE(z9.VnS(), z7.VnS());
14686 ASSERT_EQUAL_SVE(z9.VnS(), z8.VnS());
14687 }
14688 }
14689
14690 TEST_SVE(sve_fcmla) {
14691 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14692 START();
14693
14694 __ Ptrue(p0.VnB());
14695 __ Pfalse(p1.VnB());
14696 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
14697 __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
14698
14699 __ Fdup(z0.VnH(), 10.0);
14700 __ Fdup(z2.VnH(), 2.0);
14701
14702 // Create pairs of complex numbers, Ai + A. A is chosen to be non-zero, as
14703 // the later fneg will result in a failed comparison otherwise.
14704 __ Index(z1.VnH(), -4, 3);
14705 __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14706 __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14707 __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14708
14709 __ Sel(z3.VnH(), p2, z0.VnH(), z1.VnH()); // Ai + 10
14710 __ Sel(z4.VnH(), p2, z1.VnH(), z2.VnH()); // 2i + A
14711
14712 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); // Even complex numbers.
14713 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); // Odd complex numbers.
14714
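// A pair of Fcmla instructions with rotations 0 and 90 accumulates the full
// complex product of the two source operands; rotations 180 and 270
// accumulate the negated product.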
14715 // Calculate (Ai + 10) * (2i + A) = (20 + A^2)i + 8A, using predication to
14716 // select only the complex numbers in odd-numbered element pairs. This leaves
14717 // results in elements 2/3, 6/7, etc. with zero in elements 0/1, 4/5, etc.
14718 // ... 7 6 5 4 3 2 1 0 <-- element
14719 // ... | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | 0 | 0 | <-- value
14720 __ Dup(z5.VnH(), 0);
14721 __ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 0);
14722 __ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 90);
14723
14724 // Move the odd results to the even result positions.
14725 // ... 7 6 5 4 3 2 1 0 <-- element
14726 // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
14727 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 4);
14728
14729 // Calculate -(Ai + 10) * (2i + A) = -(20 + A^2)i - 8A for the even complex
14730 // numbers.
14731 // ... 7 6 5 4 3 2 1 0 <-- element
14732 // ... | 0 | 0 | -20-A^2 | -8A | 0 | 0 | -20-A^2 | -8A | <-- value
14733 __ Dup(z6.VnH(), 0);
14734 __ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 180);
14735 __ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 270);
14736
14737 // Negate the even results. The results in z6 should now match the results
14738 // computed earlier in z5.
14739 // ... 7 6 5 4 3 2 1 0 <-- element
14740 // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
14741 __ Fneg(z6.VnH(), p2.Merging(), z6.VnH());
14742
14743
14744 // Similarly, but for wider elements.
14745 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14746 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14747 __ Index(z1.VnS(), -4, 3);
14748 __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14749 __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14750 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14751 __ Fdup(z0.VnS(), 20.0);
14752 __ Fdup(z2.VnS(), 21.0);
14753 __ Sel(z3.VnS(), p2, z0.VnS(), z1.VnS());
14754 __ Sel(z4.VnS(), p2, z1.VnS(), z2.VnS());
14755 __ Punpklo(p2.VnH(), p2.VnB());
14756 __ Punpklo(p3.VnH(), p3.VnB());
14757 __ Dup(z7.VnS(), 0);
14758 __ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 0);
14759 __ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 90);
14760 __ Ext(z7.VnB(), z7.VnB(), z7.VnB(), 8);
14761 __ Dup(z8.VnS(), 0);
14762 __ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 180);
14763 __ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 270);
14764 __ Fneg(z8.VnS(), p2.Merging(), z8.VnS());
14765
14766 // Double-precision variant, computed for even-numbered lanes only.
14767 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14768 __ Index(z1.VnD(), -4, 3);
14769 __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14770 __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14771 __ Scvtf(z1.VnD(), p0.Merging(), z1.VnD());
14772 __ Fdup(z0.VnD(), 20.0);
14773 __ Fdup(z2.VnD(), 21.0);
14774 __ Sel(z3.VnD(), p2, z0.VnD(), z1.VnD());
14775 __ Sel(z4.VnD(), p2, z1.VnD(), z2.VnD());
14776 __ Punpklo(p2.VnH(), p2.VnB());
14777 __ Dup(z9.VnD(), 0);
14778 __ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 0);
14779 __ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 90);
14780 __ Dup(z10.VnD(), 0);
14781 __ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 180);
14782 __ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 270);
14783 __ Fneg(z10.VnD(), p2.Merging(), z10.VnD());
14784 END();
14785
14786 if (CAN_RUN()) {
14787 RUN();
14788 ASSERT_EQUAL_SVE(z5.VnH(), z6.VnH());
14789 ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
14790 ASSERT_EQUAL_SVE(z9.VnD(), z10.VnD());
14791 }
14792 }
14793
14794 // Create a pattern in dst where the value of each element in src is incremented
14795 // by the segment number. This allows varying a short input by a predictable
14796 // pattern for each segment.
14797 static void FPSegmentPatternHelper(MacroAssembler* masm,
14798 const ZRegister& dst,
14799 const PRegisterM& ptrue,
14800 const ZRegister& src) {
14801 VIXL_ASSERT(AreSameLaneSize(dst, src));
14802 UseScratchRegisterScope temps(masm);
14803 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
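// Index generates the lane numbers; shifting right by log2(lanes per 128-bit
// segment) turns each lane number into its segment number.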
14804 masm->Index(ztmp, 0, 1);
14805 masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
14806 masm->Scvtf(ztmp, ptrue, ztmp);
14807 masm->Fadd(dst, src, ztmp);
14808 }
14809
14810 TEST_SVE(sve_fpmul_index) {
14811 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14812 START();
14813
14814 uint64_t in0[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
14815 uint64_t in1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
14816
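// The indexed form of Fmul multiplies each lane of the first source by the
// lane at the given index within the corresponding 128-bit segment of the
// second source.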
14817 __ Ptrue(p0.VnB());
14818 // Replicate the indexed operand so that it fills vector lengths up to 2048 bits.
14819 for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i++) {
14820 InsrHelper(&masm, z25.VnD(), in0);
14821 }
14822 InsrHelper(&masm, z1.VnD(), in1);
14823
14824 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z25.VnH());
14825 __ Fmul(z2.VnH(), z1.VnH(), z0.VnH(), 0);
14826 __ Fmul(z3.VnH(), z1.VnH(), z0.VnH(), 1);
14827 __ Fmul(z4.VnH(), z1.VnH(), z0.VnH(), 4);
14828 __ Fmul(z5.VnH(), z1.VnH(), z0.VnH(), 7);
14829
14830 __ Fmul(z6.VnS(), z1.VnS(), z0.VnS(), 0);
14831 __ Fmul(z7.VnS(), z1.VnS(), z0.VnS(), 1);
14832 __ Fmul(z8.VnS(), z1.VnS(), z0.VnS(), 2);
14833 __ Fmul(z9.VnS(), z1.VnS(), z0.VnS(), 3);
14834
14835 __ Fmul(z10.VnD(), z1.VnD(), z0.VnD(), 0);
14836 __ Fmul(z11.VnD(), z1.VnD(), z0.VnD(), 1);
14837
14838 // Compute the results using other instructions.
14839 __ Dup(z12.VnH(), z25.VnH(), 0);
14840 FPSegmentPatternHelper(&masm, z12.VnH(), p0.Merging(), z12.VnH());
14841 __ Fmul(z12.VnH(), z1.VnH(), z12.VnH());
14842 __ Dup(z13.VnH(), z25.VnH(), 1);
14843 FPSegmentPatternHelper(&masm, z13.VnH(), p0.Merging(), z13.VnH());
14844 __ Fmul(z13.VnH(), z1.VnH(), z13.VnH());
14845 __ Dup(z14.VnH(), z25.VnH(), 4);
14846 FPSegmentPatternHelper(&masm, z14.VnH(), p0.Merging(), z14.VnH());
14847 __ Fmul(z14.VnH(), z1.VnH(), z14.VnH());
14848 __ Dup(z15.VnH(), z25.VnH(), 7);
14849 FPSegmentPatternHelper(&masm, z15.VnH(), p0.Merging(), z15.VnH());
14850 __ Fmul(z15.VnH(), z1.VnH(), z15.VnH());
14851
14852 __ Dup(z16.VnS(), z25.VnS(), 0);
14853 FPSegmentPatternHelper(&masm, z16.VnH(), p0.Merging(), z16.VnH());
14854 __ Fmul(z16.VnS(), z1.VnS(), z16.VnS());
14855 __ Dup(z17.VnS(), z25.VnS(), 1);
14856 FPSegmentPatternHelper(&masm, z17.VnH(), p0.Merging(), z17.VnH());
14857 __ Fmul(z17.VnS(), z1.VnS(), z17.VnS());
14858 __ Dup(z18.VnS(), z25.VnS(), 2);
14859 FPSegmentPatternHelper(&masm, z18.VnH(), p0.Merging(), z18.VnH());
14860 __ Fmul(z18.VnS(), z1.VnS(), z18.VnS());
14861 __ Dup(z19.VnS(), z25.VnS(), 3);
14862 FPSegmentPatternHelper(&masm, z19.VnH(), p0.Merging(), z19.VnH());
14863 __ Fmul(z19.VnS(), z1.VnS(), z19.VnS());
14864
14865 __ Dup(z20.VnD(), z25.VnD(), 0);
14866 FPSegmentPatternHelper(&masm, z20.VnH(), p0.Merging(), z20.VnH());
14867 __ Fmul(z20.VnD(), z1.VnD(), z20.VnD());
14868 __ Dup(z21.VnD(), z25.VnD(), 1);
14869 FPSegmentPatternHelper(&masm, z21.VnH(), p0.Merging(), z21.VnH());
14870 __ Fmul(z21.VnD(), z1.VnD(), z21.VnD());
14871
14872 END();
14873
14874 if (CAN_RUN()) {
14875 RUN();
14876 ASSERT_EQUAL_SVE(z12.VnH(), z2.VnH());
14877 ASSERT_EQUAL_SVE(z13.VnH(), z3.VnH());
14878 ASSERT_EQUAL_SVE(z14.VnH(), z4.VnH());
14879 ASSERT_EQUAL_SVE(z15.VnH(), z5.VnH());
14880 ASSERT_EQUAL_SVE(z16.VnS(), z6.VnS());
14881 ASSERT_EQUAL_SVE(z17.VnS(), z7.VnS());
14882 ASSERT_EQUAL_SVE(z18.VnS(), z8.VnS());
14883 ASSERT_EQUAL_SVE(z19.VnS(), z9.VnS());
14884 ASSERT_EQUAL_SVE(z20.VnD(), z10.VnD());
14885 ASSERT_EQUAL_SVE(z21.VnD(), z11.VnD());
14886 }
14887 }
14888
14889 TEST_SVE(sve_ftmad) {
14890 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14891 START();
14892
14893 uint64_t in_h0[] = {0x7c027e01fc02fe01,
14894 0x3c003c00bc00bc00,
14895 0x3c003c00bc00bc00};
14896 uint64_t in_h1[] = {0xfe01fc027e017e01,
14897 0x3c00bc003c00bc00,
14898 0x3c00bc003c00bc00};
14899 uint64_t in_s0[] = {0x7f800002ffc00001,
14900 0x3f8000003f800000,
14901 0xbf800000bf800000};
14902 uint64_t in_s1[] = {0xffc00001ffc00001,
14903 0x3f800000bf800000,
14904 0x3f800000bf800000};
14905 uint64_t in_d0[] = {0x7ff8000000000001,
14906 0x3ff0000000000000,
14907 0xbff0000000000000};
14908 uint64_t in_d1[] = {0xfff0000000000002,
14909 0xbff0000000000000,
14910 0x3ff0000000000000};
14911 InsrHelper(&masm, z0.VnD(), in_h0);
14912 InsrHelper(&masm, z1.VnD(), in_h1);
14913 InsrHelper(&masm, z2.VnD(), in_s0);
14914 InsrHelper(&masm, z3.VnD(), in_s1);
14915 InsrHelper(&masm, z4.VnD(), in_d0);
14916 InsrHelper(&masm, z5.VnD(), in_d1);
14917
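// Ftmad performs one step of a sin/cos polynomial evaluation: roughly,
// zd = zd * |zm| + c, where c is a hard-wired trigonometric series
// coefficient selected by the immediate index and by the sign of the zm
// element (see the Arm ARM for the exact coefficient tables).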
14918 __ Mov(z6, z0);
14919 __ Ftmad(z6.VnH(), z6.VnH(), z1.VnH(), 0);
14920 __ Mov(z7, z0);
14921 __ Ftmad(z7.VnH(), z7.VnH(), z1.VnH(), 1);
14922 __ Mov(z8, z0);
14923 __ Ftmad(z8.VnH(), z8.VnH(), z1.VnH(), 2);
14924
14925 __ Mov(z9, z2);
14926 __ Ftmad(z9.VnS(), z9.VnS(), z3.VnS(), 0);
14927 __ Mov(z10, z2);
14928 __ Ftmad(z10.VnS(), z10.VnS(), z3.VnS(), 3);
14929 __ Mov(z11, z2);
14930 __ Ftmad(z11.VnS(), z11.VnS(), z3.VnS(), 4);
14931
14932 __ Mov(z12, z4);
14933 __ Ftmad(z12.VnD(), z12.VnD(), z5.VnD(), 0);
14934 __ Mov(z13, z4);
14935 __ Ftmad(z13.VnD(), z13.VnD(), z5.VnD(), 5);
14936 __ Mov(z14, z4);
14937 __ Ftmad(z14.VnD(), z14.VnD(), z5.VnD(), 7);
14938
14939 END();
14940
14941 if (CAN_RUN()) {
14942 RUN();
14943 uint64_t expected_z6[] = {0x7e027e02fe02fe01,
14944 0x4000400000000000,
14945 0x4000400000000000};
14946 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14947 uint64_t expected_z7[] = {0x7e027e02fe02fe01,
14948 0x3aab3800bcabbe00,
14949 0x3aab3800bcabbe00};
14950 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14951 uint64_t expected_z8[] = {0x7e027e02fe02fe01,
14952 0x3c083c2abbefbbac,
14953 0x3c083c2abbefbbac};
14954 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14955 uint64_t expected_z9[] = {0x7fc00002ffc00001,
14956 0x4000000040000000,
14957 0x0000000000000000};
14958 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14959 uint64_t expected_z10[] = {0x7fc00002ffc00001,
14960 0x3f7ff2ff3f7fa4fc,
14961 0xbf800680bf802d82};
14962 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
14963 uint64_t expected_z11[] = {0x7fc00002ffc00001,
14964 0x3f8000173f8000cd,
14965 0xbf7fffd2bf7ffe66};
14966 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
14967 uint64_t expected_z12[] = {0x7ff8000000000002,
14968 0x4000000000000000,
14969 0x0000000000000000};
14970 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
14971 uint64_t expected_z13[] = {0x7ff8000000000002,
14972 0x3fefffff6c0d846c,
14973 0xbff0000006b978ae};
14974 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
14975 uint64_t expected_z14[] = {0x7ff8000000000002,
14976 0x3feffffffffe708a,
14977 0xbff0000000000000};
14978 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
14979 }
14980 }
14981
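// Apply a selection of predicated FP arithmetic macros (Fadd, Fsub, Fabd,
// Fmul, Fmulx, Fminnm, Fmaxnm) to `inputs` and to its lane-reversed copy,
// leaving the results in z2-z7, z9 and z10 for the callers to check.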
14982 static void BasicFPArithHelper(MacroAssembler* masm,
14983 int lane_size_in_bits,
14984 const uint64_t (&inputs)[2],
14985 const uint64_t (&inputs_fmulx)[2],
14986 const uint64_t (&inputs_nans)[2]) {
14987 int ls = lane_size_in_bits;
14988
14989 for (int i = 0; i < 16; i++) {
14990 InsrHelper(masm, z0.VnD(), inputs);
14991 }
14992 ZRegister rvrs = z1.WithLaneSize(ls);
14993 masm->Rev(rvrs, z0.WithLaneSize(ls));
14994
14995 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
14996 Initialise(masm, p2.VnB(), pred);
14997 PRegisterM p2m = p2.Merging();
14998
14999 masm->Mov(z2, z0);
15000 masm->Fadd(z2.WithLaneSize(ls),
15001 p2m,
15002 z2.WithLaneSize(ls),
15003 rvrs,
15004 FastNaNPropagation);
15005 masm->Mov(z3, z0);
15006 masm->Fsub(z3.WithLaneSize(ls), p2m, z3.WithLaneSize(ls), rvrs);
15007 masm->Mov(z4, z0);
15008 masm->Fsub(z4.WithLaneSize(ls), p2m, rvrs, z4.WithLaneSize(ls));
15009 masm->Mov(z5, z0);
15010 masm->Fabd(z5.WithLaneSize(ls),
15011 p2m,
15012 z5.WithLaneSize(ls),
15013 rvrs,
15014 FastNaNPropagation);
15015 masm->Mov(z6, z0);
15016 masm->Fmul(z6.WithLaneSize(ls),
15017 p2m,
15018 z6.WithLaneSize(ls),
15019 rvrs,
15020 FastNaNPropagation);
15021
15022 for (int i = 0; i < 16; i++) {
15023 InsrHelper(masm, z7.VnD(), inputs_fmulx);
15024 }
15025 masm->Rev(z8.WithLaneSize(ls), z7.WithLaneSize(ls));
15026 masm->Fmulx(z7.WithLaneSize(ls),
15027 p2m,
15028 z7.WithLaneSize(ls),
15029 z8.WithLaneSize(ls),
15030 FastNaNPropagation);
15031
15032 InsrHelper(masm, z8.VnD(), inputs_nans);
15033 masm->Mov(z9, z8);
15034 masm->Fminnm(z9.WithLaneSize(ls),
15035 p2m,
15036 z9.WithLaneSize(ls),
15037 rvrs,
15038 FastNaNPropagation);
15039 masm->Mov(z10, z8);
15040 masm->Fmaxnm(z10.WithLaneSize(ls),
15041 p2m,
15042 z10.WithLaneSize(ls),
15043 rvrs,
15044 FastNaNPropagation);
15045 }
15046
15047 TEST_SVE(sve_fp_arith_pred_h) {
15048 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15049 START();
15050
15051 uint64_t inputs[] = {0x4800470046004500, 0x4400420040003c00};
15052 uint64_t inputs_fmulx[] = {0x7c00fc007c00fc00, 0x0000800000008000};
15053 uint64_t inputs_nans[] = {0x7fffffff7fffffff, 0x7bfffbff7fbbfbff};
15054
15055 BasicFPArithHelper(&masm, kHRegSize, inputs, inputs_fmulx, inputs_nans);
15056
15057 END();
15058
15059 if (CAN_RUN()) {
15060 RUN();
15061 uint64_t expected_z2[] = {0x4880488048804880, 0x4880420048804880};
15062 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15063 uint64_t expected_z3[] = {0x4700450042003c00, 0xbc004200c500c700};
15064 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15065 uint64_t expected_z4[] = {0xc700c500c200bc00, 0x3c00420045004700};
15066 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15067 uint64_t expected_z5[] = {0x4700450042003c00, 0x3c00420045004700};
15068 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15069 uint64_t expected_z6[] = {0x48004b004c804d00, 0x4d0042004b004800};
15070 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15071 uint64_t expected_z7[] = {0xc000c000c000c000, 0xc0008000c000c000};
15072 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15073 uint64_t expected_z9[] = {0x3c00400042004400, 0x4500fbff4700fbff};
15074 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15075 uint64_t expected_z10[] = {0x3c00400042004400, 0x7bfffbff47004800};
15076 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15077 }
15078 }
15079
15080 TEST_SVE(sve_fp_arith_pred_s) {
15081 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15082 START();
15083
15084 uint64_t inputs[] = {0x4080000040400000, 0x400000003f800000};
15085 uint64_t inputs_fmulx[] = {0x7f800000ff800000, 0x0000000080000000};
15086 uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x41000000c1000000};
15087
15088 BasicFPArithHelper(&masm, kSRegSize, inputs, inputs_fmulx, inputs_nans);
15089
15090 END();
15091
15092 if (CAN_RUN()) {
15093 RUN();
15094 uint64_t expected_z2[] = {0x40a0000040a00000, 0x4000000040a00000};
15095 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15096 uint64_t expected_z3[] = {0x404000003f800000, 0x40000000c0400000};
15097 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15098 uint64_t expected_z4[] = {0xc0400000bf800000, 0x4000000040400000};
15099 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15100 uint64_t expected_z5[] = {0x404000003f800000, 0x4000000040400000};
15101 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15102 uint64_t expected_z6[] = {0x4080000040c00000, 0x4000000040800000};
15103 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15104 uint64_t expected_z7[] = {0xc0000000c0000000, 0x00000000c0000000};
15105 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15106 uint64_t expected_z9[] = {0x3f80000040000000, 0x41000000c1000000};
15107 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15108 uint64_t expected_z10[] = {0x3f80000040000000, 0x4100000040800000};
15109 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15110 }
15111 }
15112
15113 TEST_SVE(sve_fp_arith_pred_d) {
15114 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15115 START();
15116
15117 uint64_t inputs[] = {0x4000000000000000, 0x3ff0000000000000};
15118 uint64_t inputs_fmulx[] = {0x7ff0000000000000, 0x8000000000000000};
15119 uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x4100000000000000};
15120
15121 BasicFPArithHelper(&masm, kDRegSize, inputs, inputs_fmulx, inputs_nans);
15122
15123 END();
15124
15125 if (CAN_RUN()) {
15126 RUN();
15127 uint64_t expected_z2[] = {0x4008000000000000, 0x4008000000000000};
15128 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15129 uint64_t expected_z3[] = {0x3ff0000000000000, 0xbff0000000000000};
15130 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15131 uint64_t expected_z4[] = {0xbff0000000000000, 0x3ff0000000000000};
15132 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15133 uint64_t expected_z5[] = {0x3ff0000000000000, 0x3ff0000000000000};
15134 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15135 uint64_t expected_z6[] = {0x4000000000000000, 0x4000000000000000};
15136 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15137 uint64_t expected_z7[] = {0xc000000000000000, 0xc000000000000000};
15138 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15139 uint64_t expected_z9[] = {0x3ff0000000000000, 0x4000000000000000};
15140 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15141 uint64_t expected_z10[] = {0x3ff0000000000000, 0x4100000000000000};
15142 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15143 }
15144 }
15145
15146 TEST_SVE(sve_fp_arith_pred_imm) {
15147 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15148 START();
15149
15150 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
15151 Initialise(&masm, p0.VnB(), pred);
15152 PRegisterM p0m = p0.Merging();
15153 __ Ptrue(p1.VnB());
15154
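// The immediate forms of these instructions accept only a small set of
// encodable constants: 0.5 or 1.0 for Fadd/Fsub, 0.5 or 2.0 for Fmul, and
// 0.0 or 1.0 for Fmax/Fmin/Fmaxnm/Fminnm. Dividing 0.0 by 0.0 produces the
// default NaN, which is used below to check Fminnm/Fmaxnm NaN handling.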
15155 __ Fdup(z0.VnD(), 0.0);
15156
15157 __ Mov(z1, z0);
15158 __ Fdiv(z1.VnH(), p1.Merging(), z1.VnH(), z1.VnH());
15159 __ Mov(z2, z0);
15160 __ Fadd(z2.VnH(), p0m, z2.VnH(), 0.5);
15161 __ Mov(z3, z2);
15162 __ Fsub(z3.VnH(), p0m, z3.VnH(), 1.0);
15163 __ Mov(z4, z3);
15164 __ Fsub(z4.VnH(), p0m, 1.0, z4.VnH());
15165 __ Mov(z5, z4);
15166 __ Fmul(z5.VnH(), p0m, z5.VnH(), 2.0);
15167 __ Mov(z6, z1);
15168 __ Fminnm(z6.VnH(), p0m, z6.VnH(), 0.0);
15169 __ Mov(z7, z1);
15170 __ Fmaxnm(z7.VnH(), p0m, z7.VnH(), 1.0);
15171 __ Mov(z8, z5);
15172 __ Fmin(z8.VnH(), p0m, z8.VnH(), 1.0);
15173 __ Mov(z9, z5);
15174 __ Fmax(z9.VnH(), p0m, z9.VnH(), 0.0);
15175
15176 __ Mov(z11, z0);
15177 __ Fdiv(z11.VnS(), p1.Merging(), z11.VnS(), z11.VnS());
15178 __ Mov(z12, z0);
15179 __ Fadd(z12.VnS(), p0m, z12.VnS(), 0.5);
15180 __ Mov(z13, z12);
15181 __ Fsub(z13.VnS(), p0m, z13.VnS(), 1.0);
15182 __ Mov(z14, z13);
15183 __ Fsub(z14.VnS(), p0m, 1.0, z14.VnS());
15184 __ Mov(z15, z14);
15185 __ Fmul(z15.VnS(), p0m, z15.VnS(), 2.0);
15186 __ Mov(z16, z11);
15187 __ Fminnm(z16.VnS(), p0m, z16.VnS(), 0.0);
15188 __ Mov(z17, z11);
15189 __ Fmaxnm(z17.VnS(), p0m, z17.VnS(), 1.0);
15190 __ Mov(z18, z15);
15191 __ Fmin(z18.VnS(), p0m, z18.VnS(), 1.0);
15192 __ Mov(z19, z15);
15193 __ Fmax(z19.VnS(), p0m, z19.VnS(), 0.0);
15194
15195 __ Mov(z21, z0);
15196 __ Fdiv(z21.VnD(), p1.Merging(), z21.VnD(), z21.VnD());
15197 __ Mov(z22, z0);
15198 __ Fadd(z22.VnD(), p0m, z22.VnD(), 0.5);
15199 __ Mov(z23, z22);
15200 __ Fsub(z23.VnD(), p0m, z23.VnD(), 1.0);
15201 __ Mov(z24, z23);
15202 __ Fsub(z24.VnD(), p0m, 1.0, z24.VnD());
15203 __ Mov(z25, z24);
15204 __ Fmul(z25.VnD(), p0m, z25.VnD(), 2.0);
15205 __ Mov(z26, z21);
15206 __ Fminnm(z26.VnD(), p0m, z26.VnD(), 0.0);
15207 __ Mov(z27, z21);
15208 __ Fmaxnm(z27.VnD(), p0m, z27.VnD(), 1.0);
15209 __ Mov(z28, z25);
15210 __ Fmin(z28.VnD(), p0m, z28.VnD(), 1.0);
15211 __ Mov(z29, z25);
15212 __ Fmax(z29.VnD(), p0m, z29.VnD(), 0.0);
15213
15214 __ Index(z0.VnH(), -3, 1);
15215 __ Scvtf(z0.VnH(), p1.Merging(), z0.VnH());
15216 __ Fmax(z0.VnH(), p1.Merging(), z0.VnH(), 0.0);
15217 __ Index(z1.VnS(), -4, 2);
15218 __ Scvtf(z1.VnS(), p1.Merging(), z1.VnS());
15219 __ Fadd(z1.VnS(), p1.Merging(), z1.VnS(), 1.0);
15220
15221 END();
15222
15223 if (CAN_RUN()) {
15224 RUN();
15225 uint64_t expected_z2[] = {0x3800380038003800, 0x3800000038003800};
15226 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15227 uint64_t expected_z3[] = {0xb800b800b800b800, 0xb8000000b800b800};
15228 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15229 uint64_t expected_z4[] = {0x3e003e003e003e00, 0x3e0000003e003e00};
15230 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15231 uint64_t expected_z5[] = {0x4200420042004200, 0x4200000042004200};
15232 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15233 uint64_t expected_z6[] = {0x0000000000000000, 0x00007e0000000000};
15234 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15235 uint64_t expected_z7[] = {0x3c003c003c003c00, 0x3c007e003c003c00};
15236 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15237 uint64_t expected_z8[] = {0x3c003c003c003c00, 0x3c0000003c003c00};
15238 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15239 uint64_t expected_z9[] = {0x4200420042004200, 0x4200000042004200};
15240 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15241
15242 uint64_t expected_z12[] = {0x3f0000003f000000, 0x000000003f000000};
15243 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
15244 uint64_t expected_z13[] = {0xbf000000bf000000, 0x00000000bf000000};
15245 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
15246 uint64_t expected_z14[] = {0x3fc000003fc00000, 0x000000003fc00000};
15247 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
15248 uint64_t expected_z15[] = {0x4040000040400000, 0x0000000040400000};
15249 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
15250 uint64_t expected_z16[] = {0x0000000000000000, 0x7fc0000000000000};
15251 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
15252 uint64_t expected_z17[] = {0x3f8000003f800000, 0x7fc000003f800000};
15253 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
15254 uint64_t expected_z18[] = {0x3f8000003f800000, 0x000000003f800000};
15255 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
15256 uint64_t expected_z19[] = {0x4040000040400000, 0x0000000040400000};
15257 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
15258
15259 uint64_t expected_z22[] = {0x3fe0000000000000, 0x3fe0000000000000};
15260 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
15261 uint64_t expected_z23[] = {0xbfe0000000000000, 0xbfe0000000000000};
15262 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
15263 uint64_t expected_z24[] = {0x3ff8000000000000, 0x3ff8000000000000};
15264 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
15265 uint64_t expected_z25[] = {0x4008000000000000, 0x4008000000000000};
15266 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
15267 uint64_t expected_z26[] = {0x0000000000000000, 0x0000000000000000};
15268 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
15269 uint64_t expected_z27[] = {0x3ff0000000000000, 0x3ff0000000000000};
15270 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
15271 uint64_t expected_z28[] = {0x3ff0000000000000, 0x3ff0000000000000};
15272 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
15273 uint64_t expected_z29[] = {0x4008000000000000, 0x4008000000000000};
15274 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
15275 uint64_t expected_z0[] = {0x4400420040003c00, 0x0000000000000000};
15276 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
15277 uint64_t expected_z1[] = {0x404000003f800000, 0xbf800000c0400000};
15278 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
15279 }
15280 }
15281
15282 TEST_SVE(sve_fscale) {
15283 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15284 START();
15285
15286 uint64_t inputs_h[] = {0x4800470046004500, 0x4400420040003c00};
15287 InsrHelper(&masm, z0.VnD(), inputs_h);
15288 uint64_t inputs_s[] = {0x4080000040400000, 0x400000003f800000};
15289 InsrHelper(&masm, z1.VnD(), inputs_s);
15290 uint64_t inputs_d[] = {0x40f0000000000000, 0x4000000000000000};
15291 InsrHelper(&masm, z2.VnD(), inputs_d);
15292
15293 uint64_t scales[] = {0x00080002fff8fffe, 0x00100001fff0ffff};
15294 InsrHelper(&masm, z3.VnD(), scales);
15295
15296 __ Ptrue(p0.VnB());
15297 int pred[] = {0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1};
15298 Initialise(&masm, p1.VnB(), pred);
15299
15300 __ Mov(z4, z0);
15301 __ Fscale(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH());
15302 __ Mov(z5, z0);
15303 __ Fscale(z5.VnH(), p1.Merging(), z5.VnH(), z3.VnH());
15304
15305 __ Sunpklo(z3.VnS(), z3.VnH());
15306 __ Mov(z6, z1);
15307 __ Fscale(z6.VnS(), p0.Merging(), z6.VnS(), z3.VnS());
15308 __ Mov(z7, z1);
15309 __ Fscale(z7.VnS(), p1.Merging(), z7.VnS(), z3.VnS());
15310
15311 __ Sunpklo(z3.VnD(), z3.VnS());
15312 __ Mov(z8, z2);
15313 __ Fscale(z8.VnD(), p0.Merging(), z8.VnD(), z3.VnD());
15314 __ Mov(z9, z2);
15315 __ Fscale(z9.VnD(), p1.Merging(), z9.VnD(), z3.VnD());
15316
15317 // Test full double precision range scaling.
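// Scaling 2^-1022 (the smallest normal double) by 2^2045 yields 2^1023
// (0x7fe0000000000000), so this exercises the full double-precision exponent
// range; see expected_z11 below.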
15318 __ Dup(z10.VnD(), 2045);
15319 __ Dup(z11.VnD(), 0x0010000000000000); // 2^-1022
15320 __ Fscale(z11.VnD(), p0.Merging(), z11.VnD(), z10.VnD());
15321
15322 END();
15323
15324 if (CAN_RUN()) {
15325 RUN();
15326
15327 uint64_t expected_z4[] = {0x68004f0026003d00, 0x7c00460002003800};
15328 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15329 uint64_t expected_z5[] = {0x68004f0026004500, 0x7c00420002003800};
15330 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15331
15332 uint64_t expected_z6[] = {0x4880000040c00000, 0x380000003f000000};
15333 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15334 uint64_t expected_z7[] = {0x4880000040400000, 0x400000003f000000};
15335 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15336
15337 uint64_t expected_z8[] = {0x3ff0000000000000, 0x3ff0000000000000};
15338 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15339 uint64_t expected_z9[] = {0x40f0000000000000, 0x3ff0000000000000};
15340 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15341
15342 uint64_t expected_z11[] = {0x7fe0000000000000, 0x7fe0000000000000};
15343 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
15344 }
15345 }
15346
15347 typedef void (MacroAssembler::*FcvtFrintMFn)(const ZRegister& zd,
15348 const PRegisterM& pg,
15349 const ZRegister& zn);
15350
15351 typedef void (MacroAssembler::*FcvtFrintZFn)(const ZRegister& zd,
15352 const PRegisterZ& pg,
15353 const ZRegister& zn);
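// These pointer-to-member types let one helper drive several MacroAssembler
// conversion and rounding macros: a merging-form macro is invoked as
// `(masm.*macro_m)(zd, pg.Merging(), zn)` and a zeroing-form macro as
// `(masm.*macro_z)(zd, pg.Zeroing(), zn)`, as done in the helper below.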
15354
15355 template <typename F, size_t N>
15356 static void TestFcvtFrintHelper(Test* config,
15357 FcvtFrintMFn macro_m,
15358 FcvtFrintZFn macro_z,
15359 int dst_type_size_in_bits,
15360 int src_type_size_in_bits,
15361 const F (&zn_inputs)[N],
15362 const int (&pg_inputs)[N],
15363 const uint64_t (&zd_expected_all_active)[N]) {
15364 VIXL_ASSERT(macro_m != NULL);
15365 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15366 START();
15367
15368 // If the input and result types have a different size, the instruction
15369 // operates on elements of the larger of the two specified types, so that is
15370 // the lane size used for initialisation and checking below.
15371 int lane_size_in_bits =
15372 std::max(dst_type_size_in_bits, src_type_size_in_bits);
15373
15374 ZRegister zd_all_active = z25;
15375 ZRegister zd_merging = z26;
15376 ZRegister zn = z27;
15377
15378 uint64_t zn_rawbits[N];
15379 FPToRawbitsWithSize(zn_inputs, zn_rawbits, src_type_size_in_bits);
15380 InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_rawbits);
15381
15382 PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15383 __ Ptrue(pg_all_active);
15384
15385 // Test floating-point conversions with all lanes active.
15386 (masm.*macro_m)(zd_all_active.WithLaneSize(dst_type_size_in_bits),
15387 pg_all_active.Merging(),
15388 zn.WithLaneSize(src_type_size_in_bits));
15389
15390 PRegisterWithLaneSize pg_merging = p1.WithLaneSize(lane_size_in_bits);
15391 Initialise(&masm, pg_merging, pg_inputs);
15392
15393 __ Dup(zd_merging.VnD(), 0x0bad0bad0bad0bad);
15394
15395 // Use the same `zn` inputs to test floating-point conversions, but with some
15396 // lanes set inactive.
15397 (masm.*macro_m)(zd_merging.WithLaneSize(dst_type_size_in_bits),
15398 pg_merging.Merging(),
15399 zn.WithLaneSize(src_type_size_in_bits));
15400
15401 ZRegister zd_zeroing = z24;
15402 PRegisterWithLaneSize pg_zeroing = p1.WithLaneSize(lane_size_in_bits);
15403 Initialise(&masm, pg_zeroing, pg_inputs);
15404
15405 if (macro_z != NULL) {
15406 __ Dup(zd_zeroing.VnD(), 0x0bad0bad0bad0bad);
15407 (masm.*macro_z)(zd_zeroing.WithLaneSize(dst_type_size_in_bits),
15408 pg_zeroing.Zeroing(),
15409 zn.WithLaneSize(src_type_size_in_bits));
15410 }
15411
15412 END();
15413
15414 if (CAN_RUN()) {
15415 RUN();
15416
15417 ASSERT_EQUAL_SVE(zd_expected_all_active,
15418 zd_all_active.WithLaneSize(lane_size_in_bits));
15419
15420 uint64_t zd_expected_merging[N];
15421 for (unsigned i = 0; i < N; i++) {
15422 zd_expected_merging[i] =
15423 pg_inputs[i] ? zd_expected_all_active[i]
15424 : 0x0bad0bad0bad0bad & GetUintMask(lane_size_in_bits);
15425 }
15426 ASSERT_EQUAL_SVE(zd_expected_merging,
15427 zd_merging.WithLaneSize(lane_size_in_bits));
15428
15429 if (macro_z != NULL) {
15430 uint64_t zd_expected_zeroing[N] = {0};
15431 for (unsigned i = 0; i < N; i++) {
15432 if (pg_inputs[i]) {
15433 zd_expected_zeroing[i] = zd_expected_all_active[i];
15434 }
15435 }
15436 ASSERT_EQUAL_SVE(zd_expected_zeroing,
15437 zd_zeroing.WithLaneSize(lane_size_in_bits));
15438 }
15439 }
15440 }
15441
15442 template <typename F, size_t N>
15443 static void TestFcvtzHelper(Test* config,
15444 FcvtFrintMFn macro_m,
15445 int dst_type_size_in_bits,
15446 int src_type_size_in_bits,
15447 const F (&zn_inputs)[N],
15448 const int (&pg_inputs)[N],
15449 const uint64_t (&zd_expected_all_active)[N]) {
15450 TestFcvtFrintHelper(config,
15451 macro_m,
15452 // Fcvt variants have no zeroing predication form.
15453 NULL,
15454 dst_type_size_in_bits,
15455 src_type_size_in_bits,
15456 zn_inputs,
15457 pg_inputs,
15458 zd_expected_all_active);
15459 }
15460
15461 TEST_SVE(fcvtzs_fcvtzu_float16) {
15462 const double h_max_float16 = 0x7ff0; // Largest float16 < INT16_MAX.
15463 const double h_min_float16 = -h_max_float16; // Smallest float16 > INT16_MIN.
15464 const double largest_float16 = 0xffe0; // 65504
15465 const double smallest_float16 = -largest_float16;
15466 const double h_max_int_add_one = 0x8000;
15467
15468 double zn_inputs[] = {1.0,
15469 1.1,
15470 1.5,
15471 -1.5,
15472 h_max_float16,
15473 h_min_float16,
15474 largest_float16,
15475 smallest_float16,
15476 kFP64PositiveInfinity,
15477 kFP64NegativeInfinity,
15478 h_max_int_add_one};
15479
15480 int pg_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1};
15481
15482 uint64_t expected_fcvtzs_fp162h[] =
15483 {1, 1, 1, 0xffff, 0x7ff0, 0x8010, 0x7fff, 0x8000, 0x7fff, 0x8000, 0x7fff};
15484
15485 uint64_t expected_fcvtzu_fp162h[] =
15486 {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffff, 0, 0x8000};
15487
15488 // Float16 to 16-bit integers.
15489 TestFcvtzHelper(config,
15490 &MacroAssembler::Fcvtzs,
15491 kHRegSize,
15492 kHRegSize,
15493 zn_inputs,
15494 pg_inputs,
15495 expected_fcvtzs_fp162h);
15496
15497 TestFcvtzHelper(config,
15498 &MacroAssembler::Fcvtzu,
15499 kHRegSize,
15500 kHRegSize,
15501 zn_inputs,
15502 pg_inputs,
15503 expected_fcvtzu_fp162h);
15504
15505 uint64_t expected_fcvtzs_fp162w[] = {1,
15506 1,
15507 1,
15508 0xffffffff,
15509 0x7ff0,
15510 0xffff8010,
15511 0xffe0,
15512 0xffff0020,
15513 0x7fffffff,
15514 0x80000000,
15515 0x8000};
15516
15517 uint64_t expected_fcvtzu_fp162w[] =
15518 {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffff, 0, 0x8000};
15519
15520 // Float16 to 32-bit integers.
15521 TestFcvtzHelper(config,
15522 &MacroAssembler::Fcvtzs,
15523 kSRegSize,
15524 kHRegSize,
15525 zn_inputs,
15526 pg_inputs,
15527 expected_fcvtzs_fp162w);
15528
15529 TestFcvtzHelper(config,
15530 &MacroAssembler::Fcvtzu,
15531 kSRegSize,
15532 kHRegSize,
15533 zn_inputs,
15534 pg_inputs,
15535 expected_fcvtzu_fp162w);
15536
15537 uint64_t expected_fcvtzs_fp162x[] = {1,
15538 1,
15539 1,
15540 0xffffffffffffffff,
15541 0x7ff0,
15542 0xffffffffffff8010,
15543 0xffe0,
15544 0xffffffffffff0020,
15545 0x7fffffffffffffff,
15546 0x8000000000000000,
15547 0x8000};
15548
15549 uint64_t expected_fcvtzu_fp162x[] =
15550 {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffffffffffff, 0, 0x8000};
15551
15552 // Float16 to 64-bit integers.
15553 TestFcvtzHelper(config,
15554 &MacroAssembler::Fcvtzs,
15555 kDRegSize,
15556 kHRegSize,
15557 zn_inputs,
15558 pg_inputs,
15559 expected_fcvtzs_fp162x);
15560
15561 TestFcvtzHelper(config,
15562 &MacroAssembler::Fcvtzu,
15563 kDRegSize,
15564 kHRegSize,
15565 zn_inputs,
15566 pg_inputs,
15567 expected_fcvtzu_fp162x);
15568 }
15569
15570 TEST_SVE(fcvtzs_fcvtzu_float) {
15571 const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
15572 const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
15573 const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
15574 const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
15575 const double w_min_int_add_one = 0x80000000;
15576 const double x_max_int_add_one = 0x80000000'00000000;
15577
15578 double zn_inputs[] = {1.0,
15579 1.1,
15580 1.5,
15581 -1.5,
15582 w_max_float,
15583 w_min_float,
15584 x_max_float,
15585 x_min_float,
15586 kFP64PositiveInfinity,
15587 kFP64NegativeInfinity,
15588 w_min_int_add_one,
15589 x_max_int_add_one};
15590
15591 int pg_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1};
15592
15593 uint64_t expected_fcvtzs_s2w[] = {1,
15594 1,
15595 1,
15596 0xffffffff,
15597 0x7fffff80,
15598 0x80000080,
15599 0x7fffffff,
15600 0x80000000,
15601 0x7fffffff,
15602 0x80000000,
15603 0x7fffffff,
15604 0x7fffffff};
15605
15606 uint64_t expected_fcvtzu_s2w[] = {1,
15607 1,
15608 1,
15609 0,
15610 0x7fffff80,
15611 0,
15612 0xffffffff,
15613 0,
15614 0xffffffff,
15615 0,
15616 0x80000000,
15617 0xffffffff};
15618
15619 // Float to 32-bit integers.
15620 TestFcvtzHelper(config,
15621 &MacroAssembler::Fcvtzs,
15622 kSRegSize,
15623 kSRegSize,
15624 zn_inputs,
15625 pg_inputs,
15626 expected_fcvtzs_s2w);
15627
15628 TestFcvtzHelper(config,
15629 &MacroAssembler::Fcvtzu,
15630 kSRegSize,
15631 kSRegSize,
15632 zn_inputs,
15633 pg_inputs,
15634 expected_fcvtzu_s2w);
15635
15636 uint64_t expected_fcvtzs_s2x[] = {1,
15637 1,
15638 1,
15639 0xffffffffffffffff,
15640 0x7fffff80,
15641 0xffffffff80000080,
15642 0x7fffff8000000000,
15643 0x8000008000000000,
15644 0x7fffffffffffffff,
15645 0x8000000000000000,
15646 0x80000000,
15647 0x7fffffffffffffff};
15648
15649 uint64_t expected_fcvtzu_s2x[] = {1,
15650 1,
15651 1,
15652 0,
15653 0x7fffff80,
15654 0,
15655 0x7fffff8000000000,
15656 0,
15657 0xffffffffffffffff,
15658 0,
15659 0x80000000,
15660 0x8000000000000000};
15661
15662 // Float to 64-bit integers.
15663 TestFcvtzHelper(config,
15664 &MacroAssembler::Fcvtzs,
15665 kDRegSize,
15666 kSRegSize,
15667 zn_inputs,
15668 pg_inputs,
15669 expected_fcvtzs_s2x);
15670
15671 TestFcvtzHelper(config,
15672 &MacroAssembler::Fcvtzu,
15673 kDRegSize,
15674 kSRegSize,
15675 zn_inputs,
15676 pg_inputs,
15677 expected_fcvtzu_s2x);
15678 }
15679
15680 TEST_SVE(fcvtzs_fcvtzu_double) {
15681 const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
15682 const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
15683 const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
15684 const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
15685 const double w_max_double = kWMaxInt; // Largest double == INT32_MAX.
15686 const double w_min_double = -w_max_double; // Smallest double > INT32_MIN.
15687 const double x_max_double =
15688 0x7ffffffffffffc00; // Largest double < INT64_MAX.
15689 const double x_min_double = -x_max_double; // Smallest double > INT64_MIN.
15690 const double w_max_int_sub_one = kWMaxInt - 1;
15691 const double w_min_int_add_one = kWMinInt + 1;
15692 const double w_max_int_add_one = 0x80000000;
15693 const double x_max_int_add_one = 0x80000000'00000000;
15694
15695 double zn_inputs[] = {1.0,
15696 1.1,
15697 1.5,
15698 -1.5,
15699 w_max_float,
15700 w_min_float,
15701 x_max_float,
15702 x_min_float,
15703 w_max_double,
15704 w_min_double,
15705 x_max_double,
15706 x_min_double,
15707 kFP64PositiveInfinity,
15708 kFP64NegativeInfinity,
15709 w_max_int_sub_one,
15710 w_min_int_add_one,
15711 w_max_int_add_one,
15712 x_max_int_add_one};
15713
15714 int pg_inputs[] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0};
15715
15716 uint64_t expected_fcvtzs_d2w[] = {1,
15717 1,
15718 1,
15719 0xffffffffffffffff,
15720 0x7fffff80,
15721 0xffffffff80000080,
15722 0x7fffffff,
15723 0xffffffff80000000,
15724 0x7fffffff,
15725 0xffffffff80000001,
15726 0x7fffffff,
15727 0xffffffff80000000,
15728 0x7fffffff,
15729 0xffffffff80000000,
15730 0x7ffffffe,
15731 0xffffffff80000001,
15732 0x7fffffff,
15733 0x7fffffff};
15734
15735 uint64_t expected_fcvtzu_d2w[] = {1,
15736 1,
15737 1,
15738 0,
15739 0x7fffff80,
15740 0,
15741 0xffffffff,
15742 0,
15743 0x7fffffff,
15744 0,
15745 0xffffffff,
15746 0,
15747 0xffffffff,
15748 0,
15749 0x7ffffffe,
15750 0,
15751 0x80000000,
15752 0xffffffff};
15753
15754 // Double to 32-bit integers.
15755 TestFcvtzHelper(config,
15756 &MacroAssembler::Fcvtzs,
15757 kSRegSize,
15758 kDRegSize,
15759 zn_inputs,
15760 pg_inputs,
15761 expected_fcvtzs_d2w);
15762
15763 TestFcvtzHelper(config,
15764 &MacroAssembler::Fcvtzu,
15765 kSRegSize,
15766 kDRegSize,
15767 zn_inputs,
15768 pg_inputs,
15769 expected_fcvtzu_d2w);
15770
15771 uint64_t expected_fcvtzs_d2x[] = {1,
15772 1,
15773 1,
15774 0xffffffffffffffff,
15775 0x7fffff80,
15776 0xffffffff80000080,
15777 0x7fffff8000000000,
15778 0x8000008000000000,
15779 0x7fffffff,
15780 0xffffffff80000001,
15781 0x7ffffffffffffc00,
15782 0x8000000000000400,
15783 0x7fffffffffffffff,
15784 0x8000000000000000,
15785 0x7ffffffe,
15786 0xffffffff80000001,
15787 0x80000000,
15788 0x7fffffffffffffff};
15789
15790 uint64_t expected_fcvtzu_d2x[] = {1,
15791 1,
15792 1,
15793 0,
15794 0x7fffff80,
15795 0,
15796 0x7fffff8000000000,
15797 0,
15798 0x7fffffff,
15799 0,
15800 0x7ffffffffffffc00,
15801 0,
15802 0xffffffffffffffff,
15803 0,
15804 0x000000007ffffffe,
15805 0,
15806 0x80000000,
15807 0x8000000000000000};
15808
15809 // Double to 64-bit integers.
15810 TestFcvtzHelper(config,
15811 &MacroAssembler::Fcvtzs,
15812 kDRegSize,
15813 kDRegSize,
15814 zn_inputs,
15815 pg_inputs,
15816 expected_fcvtzs_d2x);
15817
15818 TestFcvtzHelper(config,
15819 &MacroAssembler::Fcvtzu,
15820 kDRegSize,
15821 kDRegSize,
15822 zn_inputs,
15823 pg_inputs,
15824 expected_fcvtzu_d2x);
15825 }
15826
15827 template <typename F, size_t N>
15828 static void TestFrintHelper(Test* config,
15829 FcvtFrintMFn macro_m,
15830 FcvtFrintZFn macro_z,
15831 int lane_size_in_bits,
15832 const F (&zn_inputs)[N],
15833 const int (&pg_inputs)[N],
15834 const F (&zd_expected)[N]) {
15835 uint64_t zd_expected_rawbits[N];
15836 FPToRawbitsWithSize(zd_expected, zd_expected_rawbits, lane_size_in_bits);
15837 TestFcvtFrintHelper(config,
15838 macro_m,
15839 macro_z,
15840 lane_size_in_bits,
15841 lane_size_in_bits,
15842 zn_inputs,
15843 pg_inputs,
15844 zd_expected_rawbits);
15845 }
15846
15847 TEST_SVE(frint) {
15848 const double inf_pos = kFP64PositiveInfinity;
15849 const double inf_neg = kFP64NegativeInfinity;
15850
15851 double zn_inputs[] =
15852 {1.1, 1.5, 1.9, 2.5, -1.5, -2.5, 0.0, -0.0, -0.2, inf_pos, inf_neg};
15853 double zd_expected_a[] =
15854 {1.0, 2.0, 2.0, 3.0, -2.0, -3.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15855 double zd_expected_i[] =
15856 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15857 double zd_expected_m[] =
15858 {1.0, 1.0, 1.0, 2.0, -2.0, -3.0, 0.0, -0.0, -1.0, inf_pos, inf_neg};
15859 double zd_expected_n[] =
15860 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15861 double zd_expected_p[] =
15862 {2.0, 2.0, 2.0, 3.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15863 double zd_expected_x[] =
15864 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15865 double zd_expected_z[] =
15866 {1.0, 1.0, 1.0, 2.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
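// The expected arrays follow the FRINT rounding variants: 'a' rounds ties away
// from zero, 'm' rounds towards minus infinity, 'n' rounds ties to even, 'p'
// rounds towards plus infinity, 'z' rounds towards zero, and 'i'/'x' use the
// current FPCR rounding mode (round-to-nearest-even in these tests).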
15867
15868 int pg_inputs[] = {0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0};
15869
15870 struct TestDataSet {
15871 FcvtFrintMFn macro_m; // merging form.
15872 FcvtFrintZFn macro_z; // zeroing form.
15873 double (&expected)[11];
15874 };
15875
15876 TestDataSet test_data[] =
15877 {{&MacroAssembler::Frinta, &MacroAssembler::Frinta, zd_expected_a},
15878 {&MacroAssembler::Frinti, &MacroAssembler::Frinti, zd_expected_i},
15879 {&MacroAssembler::Frintm, &MacroAssembler::Frintm, zd_expected_m},
15880 {&MacroAssembler::Frintn, &MacroAssembler::Frintn, zd_expected_n},
15881 {&MacroAssembler::Frintp, &MacroAssembler::Frintp, zd_expected_p},
15882 {&MacroAssembler::Frintx, &MacroAssembler::Frintx, zd_expected_x},
15883 {&MacroAssembler::Frintz, &MacroAssembler::Frintz, zd_expected_z}};
15884
15885 unsigned lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
15886
15887 for (size_t i = 0; i < sizeof(test_data) / sizeof(TestDataSet); i++) {
15888 for (size_t j = 0; j < ArrayLength(lane_sizes); j++) {
15889 TestFrintHelper(config,
15890 test_data[i].macro_m,
15891 test_data[i].macro_z,
15892 lane_sizes[j],
15893 zn_inputs,
15894 pg_inputs,
15895 test_data[i].expected);
15896 }
15897 }
15898 }
15899
15900 struct CvtfTestDataSet {
15901 uint64_t int_value;
15902 uint64_t scvtf_result;
15903 uint64_t ucvtf_result;
15904 };
15905
15906 template <size_t N>
15907 static void TestUScvtfHelper(Test* config,
15908 int dst_type_size_in_bits,
15909 int src_type_size_in_bits,
15910 const int (&pg_inputs)[N],
15911 const CvtfTestDataSet (&data_set)[N]) {
15912 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15913 START();
15914
15915 // Unpack the data from the array of structs into individual arrays to
15916 // simplify the testing.
15917 uint64_t zn_inputs[N];
15918 uint64_t expected_zd_scvtf_all_active[N];
15919 uint64_t expected_zd_ucvtf_all_active[N];
15920 for (size_t i = 0; i < N; i++) {
15921 zn_inputs[i] = data_set[i].int_value;
15922 expected_zd_scvtf_all_active[i] = data_set[i].scvtf_result;
15923 expected_zd_ucvtf_all_active[i] = data_set[i].ucvtf_result;
15924 }
15925
15926 // If the input and result types have a different size, the instruction
15927 // operates on elements of the largest specified type.
15928 int lane_size_in_bits =
15929 std::max(dst_type_size_in_bits, src_type_size_in_bits);
15930
15931 ZRegister zd_scvtf_all_active = z25;
15932 ZRegister zd_ucvtf_all_active = z26;
15933 ZRegister zn = z27;
15934 InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_inputs);
15935
15936 PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15937 __ Ptrue(pg_all_active);
15938
15939 // Test integer conversions with all lanes active.
15940 __ Scvtf(zd_scvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15941 pg_all_active.Merging(),
15942 zn.WithLaneSize(src_type_size_in_bits));
15943 __ Ucvtf(zd_ucvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15944 pg_all_active.Merging(),
15945 zn.WithLaneSize(src_type_size_in_bits));
15946
15947 ZRegister zd_scvtf_merged = z23;
15948 ZRegister zd_ucvtf_merged = z24;
15949
15950 PRegisterWithLaneSize pg_merged = p1.WithLaneSize(lane_size_in_bits);
15951 Initialise(&masm, pg_merged, pg_inputs);
15952
15953 uint64_t snan;
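// Pick a signalling NaN bit pattern of the current lane size; merging
// predication must leave inactive lanes holding this value, which the checks
// below rely on.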
15954 switch (lane_size_in_bits) {
15955 case kHRegSize:
15956 snan = 0x7c11;
15957 break;
15958 case kSRegSize:
15959 snan = 0x7f951111;
15960 break;
15961 case kDRegSize:
15962 snan = 0x7ff5555511111111;
15963 break;
15964 }
15965 __ Dup(zd_scvtf_merged.WithLaneSize(lane_size_in_bits), snan);
15966 __ Dup(zd_ucvtf_merged.WithLaneSize(lane_size_in_bits), snan);
15967
15968 // Use the same `zn` inputs to test integer conversions, but with some lanes
15969 // set inactive.
15970 __ Scvtf(zd_scvtf_merged.WithLaneSize(dst_type_size_in_bits),
15971 pg_merged.Merging(),
15972 zn.WithLaneSize(src_type_size_in_bits));
15973 __ Ucvtf(zd_ucvtf_merged.WithLaneSize(dst_type_size_in_bits),
15974 pg_merged.Merging(),
15975 zn.WithLaneSize(src_type_size_in_bits));
15976
15977 END();
15978
15979 if (CAN_RUN()) {
15980 RUN();
15981
15982 ASSERT_EQUAL_SVE(expected_zd_scvtf_all_active,
15983 zd_scvtf_all_active.WithLaneSize(lane_size_in_bits));
15984 ASSERT_EQUAL_SVE(expected_zd_ucvtf_all_active,
15985 zd_ucvtf_all_active.WithLaneSize(lane_size_in_bits));
15986
15987 uint64_t expected_zd_scvtf_merged[N];
15988 for (size_t i = 0; i < N; i++) {
15989 expected_zd_scvtf_merged[i] =
15990 pg_inputs[i] ? expected_zd_scvtf_all_active[i] : snan;
15991 }
15992 ASSERT_EQUAL_SVE(expected_zd_scvtf_merged,
15993 zd_scvtf_merged.WithLaneSize(lane_size_in_bits));
15994
15995 uint64_t expected_zd_ucvtf_merged[N];
15996 for (size_t i = 0; i < N; i++) {
15997 expected_zd_ucvtf_merged[i] =
15998 pg_inputs[i] ? expected_zd_ucvtf_all_active[i] : snan;
15999 }
16000 ASSERT_EQUAL_SVE(expected_zd_ucvtf_merged,
16001 zd_ucvtf_merged.WithLaneSize(lane_size_in_bits));
16002 }
16003 }
16004
16005 TEST_SVE(scvtf_ucvtf_h_s_d_to_float16) {
16006 // clang-format off
16007 CvtfTestDataSet data_set_1[] = {
16008 // Simple conversions of positive numbers which require no rounding; the
16009 // results should not depend on the rounding mode, and ucvtf and scvtf should
16010 // produce the same result.
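// For example, 0x0010 (16) converts to 0x4c00, the float16 encoding of 16.0.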
16011 {0x0000, 0x0000, 0x0000},
16012 {0x0001, 0x3c00, 0x3c00},
16013 {0x0010, 0x4c00, 0x4c00},
16014 {0x0080, 0x5800, 0x5800},
16015 {0x0400, 0x6400, 0x6400},
16016 // Conversions which require rounding.
16017 {0x4000, 0x7400, 0x7400},
16018 {0x4001, 0x7400, 0x7400},
16019 // Round up to produce a result that's too big for the input to represent.
16020 {0x7ff0, 0x77ff, 0x77ff},
16021 {0x7ff1, 0x77ff, 0x77ff},
16022 {0x7ffe, 0x7800, 0x7800},
16023 {0x7fff, 0x7800, 0x7800}};
16024 int pg_1[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
16025 TestUScvtfHelper(config, kHRegSize, kDRegSize, pg_1, data_set_1);
16026 TestUScvtfHelper(config, kHRegSize, kSRegSize, pg_1, data_set_1);
16027 TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_1, data_set_1);
16028
16029 CvtfTestDataSet data_set_2[] = {
16030 // Test mantissa extremities.
16031 {0x0401, 0x6401, 0x6401},
16032 {0x4020, 0x7402, 0x7402},
16033 // The largest int16_t that fits in a float16.
16034 {0xffef, 0xcc40, 0x7bff},
16035 // Values that would be negative if treated as an int16_t.
16036 {0xff00, 0xdc00, 0x7bf8},
16037 {0x8000, 0xf800, 0x7800},
16038 {0x8100, 0xf7f0, 0x7808},
16039 // Check for bit pattern reproduction.
16040 {0x0123, 0x5c8c, 0x5c8c},
16041 {0x0cde, 0x6a6f, 0x6a6f},
16042 // Simple conversions of negative int16_t values. These require no rounding,
16043 // and the results should not depend on the rounding mode.
16044 {0xf800, 0xe800, 0x7bc0},
16045 {0xfc00, 0xe400, 0x7be0},
16046 {0xc000, 0xf400, 0x7a00},
16047 // Check rounding of negative int16_t values.
16048 {0x8ffe, 0xf700, 0x7880},
16049 {0x8fff, 0xf700, 0x7880},
16050 {0xffee, 0xcc80, 0x7bff},
16051 {0xffef, 0xcc40, 0x7bff}};
16052 int pg_2[] = {1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1};
16053 // The `32-bit to float16` and `64-bit to float16` conversions of the above
16054 // values have already been covered by the `ucvtf` `16-bit to float16` tests.
16055 TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_2, data_set_2);
16056 // clang-format on
16057 }
16058
16059 TEST_SVE(scvtf_ucvtf_s_to_float) {
16060 // clang-format off
16061 int dst_lane_size = kSRegSize;
16062 int src_lane_size = kSRegSize;
16063
16064 // Simple conversions of positive numbers which require no rounding; the
16065 // results should not depend on the rounding mode, and ucvtf and scvtf should
16066 // produce the same result.
16067 CvtfTestDataSet data_set_1[] = {
16068 {0x00000000, 0x00000000, 0x00000000},
16069 {0x00000001, 0x3f800000, 0x3f800000},
16070 {0x00004000, 0x46800000, 0x46800000},
16071 {0x00010000, 0x47800000, 0x47800000},
16072 {0x40000000, 0x4e800000, 0x4e800000}};
16073 int pg_1[] = {1, 0, 1, 0, 0};
16074 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16075
16076 CvtfTestDataSet data_set_2[] = {
16077 // Test mantissa extremities.
16078 {0x00800001, 0x4b000001, 0x4b000001},
16079 {0x40400000, 0x4e808000, 0x4e808000},
16080 // The largest int32_t that fits in a float.
16081 {0x7fffff80, 0x4effffff, 0x4effffff},
16082 // Values that would be negative if treated as an int32_t.
16083 {0xffffffff, 0xbf800000, 0x4f800000},
16084 {0xffffff00, 0xc3800000, 0x4f7fffff},
16085 {0x80000000, 0xcf000000, 0x4f000000},
16086 {0x80000001, 0xcf000000, 0x4f000000},
16087 // Check for bit pattern reproduction.
16088 {0x089abcde, 0x4d09abce, 0x4d09abce},
16089 {0x12345678, 0x4d91a2b4, 0x4d91a2b4}};
16090 int pg_2[] = {1, 0, 1, 0, 1, 1, 1, 0, 0};
16091 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16092
16093 // Simple conversions of negative int32_t values. These require no rounding,
16094 // and the results should not depend on the rounding mode.
16095 CvtfTestDataSet data_set_3[] = {
16096 {0xffffc000, 0xc6800000, 0x4f7fffc0},
16097 {0xffff0000, 0xc7800000, 0x4f7fff00},
16098 {0xc0000000, 0xce800000, 0x4f400000},
16099 // Conversions which require rounding.
16100 {0x72800000, 0x4ee50000, 0x4ee50000},
16101 {0x72800001, 0x4ee50000, 0x4ee50000},
16102 {0x73000000, 0x4ee60000, 0x4ee60000},
16103 // Check rounding of negative int32_t values.
16104 {0x80000140, 0xcefffffe, 0x4f000001},
16105 {0x80000141, 0xcefffffd, 0x4f000001},
16106 {0x80000180, 0xcefffffd, 0x4f000002},
16107 // Round up to produce a result that's too big for the input to represent.
16108 {0x7fffffc0, 0x4f000000, 0x4f000000},
16109 {0x7fffffff, 0x4f000000, 0x4f000000}};
16110 int pg_3[] = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0};
16111 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16112 // clang-format on
16113 }
16114
16115 TEST_SVE(scvtf_ucvtf_d_to_float) {
16116 // clang-format off
16117 int dst_lane_size = kSRegSize;
16118 int src_lane_size = kDRegSize;
16119
16120 // Simple conversions of positive numbers which require no rounding; the
16121 // results should not depend on the rounding mode, and ucvtf and scvtf should
16122 // produce the same result.
16123 CvtfTestDataSet data_set_1[] = {
16124 {0x0000000000000000, 0x00000000, 0x00000000},
16125 {0x0000000000000001, 0x3f800000, 0x3f800000},
16126 {0x0000000040000000, 0x4e800000, 0x4e800000},
16127 {0x0000000100000000, 0x4f800000, 0x4f800000},
16128 {0x4000000000000000, 0x5e800000, 0x5e800000}};
16129 int pg_1[] = {1, 1, 0, 1, 0};
16130 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16131
16132 CvtfTestDataSet data_set_2[] = {
16133 // Test mantissa extremities.
16134 {0x0010000000000001, 0x59800000, 0x59800000},
16135 {0x4008000000000000, 0x5e801000, 0x5e801000},
16136 // The largest int32_t that fits in a float.
16137 {0x000000007fffff80, 0x4effffff, 0x4effffff},
16138 // Values that would be negative if treated as an int32_t.
16139 {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16140 {0x00000000ffffff00, 0x4f7fffff, 0x4f7fffff},
16141 {0x0000000080000000, 0x4f000000, 0x4f000000},
16142 {0x0000000080000100, 0x4f000001, 0x4f000001},
16143 // The largest int64_t that fits in a float.
16144 {0x7fffff8000000000, 0x5effffff, 0x5effffff},
16145 // Check for bit pattern reproduction.
16146 {0x0123456789abcde0, 0x5b91a2b4, 0x5b91a2b4},
16147 {0x0000000000876543, 0x4b076543, 0x4b076543}};
16148 int pg_2[] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1};
16149 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16150
16151 CvtfTestDataSet data_set_3[] = {
16152 // Simple conversions of negative int64_t values. These require no rounding,
16153 // and the results should not depend on the rounding mode.
16154 {0xffffffffc0000000, 0xce800000, 0x5f800000},
16155 {0xffffffff00000000, 0xcf800000, 0x5f800000},
16156 {0xc000000000000000, 0xde800000, 0x5f400000},
16157 // Conversions which require rounding.
16158 {0x0000800002800000, 0x57000002, 0x57000002},
16159 {0x0000800002800001, 0x57000003, 0x57000003},
16160 {0x0000800003000000, 0x57000003, 0x57000003},
16161 // Check rounding of negative int64_t values.
16162 {0x8000014000000000, 0xdefffffe, 0x5f000001},
16163 {0x8000014000000001, 0xdefffffd, 0x5f000001},
16164 {0x8000018000000000, 0xdefffffd, 0x5f000002},
16165 // Round up to produce a result that's too big for the input to represent.
16166 {0x00000000ffffff80, 0x4f800000, 0x4f800000},
16167 {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16168 {0xffffff8000000000, 0xd3000000, 0x5f800000},
16169 {0xffffffffffffffff, 0xbf800000, 0x5f800000}};
16170 int pg_3[] = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1};
16171 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16172 // clang-format on
16173 }
16174
16175 TEST_SVE(scvtf_ucvtf_d_to_double) {
16176 // clang-format off
16177 int dst_lane_size = kDRegSize;
16178 int src_lane_size = kDRegSize;
16179
16180 // Simple conversions of positive numbers which require no rounding; the
16181 // results should not depend on the rounding mode, and ucvtf and scvtf should
16182 // produce the same result.
16183 CvtfTestDataSet data_set_1[] = {
16184 {0x0000000000000000, 0x0000000000000000, 0x0000000000000000},
16185 {0x0000000000000001, 0x3ff0000000000000, 0x3ff0000000000000},
16186 {0x0000000040000000, 0x41d0000000000000, 0x41d0000000000000},
16187 {0x0000000100000000, 0x41f0000000000000, 0x41f0000000000000},
16188 {0x4000000000000000, 0x43d0000000000000, 0x43d0000000000000}};
16189 int pg_1[] = {0, 1, 1, 0, 0};
16190 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16191
16192 CvtfTestDataSet data_set_2[] = {
16193 // Test mantissa extremities.
16194 {0x0010000000000001, 0x4330000000000001, 0x4330000000000001},
16195 {0x4008000000000000, 0x43d0020000000000, 0x43d0020000000000},
16196 // The largest int32_t that fits in a double.
16197 {0x000000007fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16198 // Values that would be negative if treated as an int32_t.
16199 {0x00000000ffffffff, 0x41efffffffe00000, 0x41efffffffe00000},
16200 {0x0000000080000000, 0x41e0000000000000, 0x41e0000000000000},
16201 {0x0000000080000001, 0x41e0000000200000, 0x41e0000000200000},
16202 // The largest int64_t that fits in a double.
16203 {0x7ffffffffffffc00, 0x43dfffffffffffff, 0x43dfffffffffffff},
16204 // Check for bit pattern reproduction.
16205 {0x0123456789abcde0, 0x43723456789abcde, 0x43723456789abcde},
16206 {0x0000000012345678, 0x41b2345678000000, 0x41b2345678000000}};
16207 int pg_2[] = {1, 1, 1, 1, 1, 0, 0, 0, 0};
16208 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16209
16210 CvtfTestDataSet data_set_3[] = {
16211 // Simple conversions of negative int64_t values. These require no rounding,
16212 // and the results should not depend on the rounding mode.
16213 {0xffffffffc0000000, 0xc1d0000000000000, 0x43effffffff80000},
16214 {0xffffffff00000000, 0xc1f0000000000000, 0x43efffffffe00000},
16215 {0xc000000000000000, 0xc3d0000000000000, 0x43e8000000000000},
16216 // Conversions which require rounding.
16217 {0x1000000000000280, 0x43b0000000000002, 0x43b0000000000002},
16218 {0x1000000000000281, 0x43b0000000000003, 0x43b0000000000003},
16219 {0x1000000000000300, 0x43b0000000000003, 0x43b0000000000003},
16220 // Check rounding of negative int64_t values.
16221 {0x8000000000000a00, 0xc3dffffffffffffe, 0x43e0000000000001},
16222 {0x8000000000000a01, 0xc3dffffffffffffd, 0x43e0000000000001},
16223 {0x8000000000000c00, 0xc3dffffffffffffd, 0x43e0000000000002},
16224 // Round up to produce a result that's too big for the input to represent.
16225 {0x7ffffffffffffe00, 0x43e0000000000000, 0x43e0000000000000},
16226 {0x7fffffffffffffff, 0x43e0000000000000, 0x43e0000000000000},
16227 {0xfffffffffffffc00, 0xc090000000000000, 0x43f0000000000000},
16228 {0xffffffffffffffff, 0xbff0000000000000, 0x43f0000000000000}};
16229 int pg_3[] = {1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0};
16230 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16231 // clang-format on
16232 }
16233
16234 TEST_SVE(scvtf_ucvtf_s_to_double) {
16235 // clang-format off
16236 int dst_lane_size = kDRegSize;
16237 int src_lane_size = kSRegSize;
16238
16239 // Simple conversions of positive numbers which require no rounding; the
16240 // results should not depend on the rounding mode, and ucvtf and scvtf should
16241 // produce the same result.
16242 CvtfTestDataSet data_set_1[] = {
16243 {0x00000000, 0x0000000000000000, 0x0000000000000000},
16244 {0x00000001, 0x3ff0000000000000, 0x3ff0000000000000},
16245 {0x00004000, 0x40d0000000000000, 0x40d0000000000000},
16246 {0x00010000, 0x40f0000000000000, 0x40f0000000000000},
16247 {0x40000000, 0x41d0000000000000, 0x41d0000000000000}};
16248 int pg_1[] = {1, 0, 0, 0, 1};
16249 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16250
16251 CvtfTestDataSet data_set_2[] = {
16252 // Test mantissa extremities.
16253 {0x40000400, 0x41d0000100000000, 0x41d0000100000000},
16254 // The largest int32_t that fits in a double.
16255 {0x7fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16256 // Values that would be negative if treated as an int32_t.
16257 {0xffffffff, 0xbff0000000000000, 0x41efffffffe00000},
16258 {0x80000000, 0xc1e0000000000000, 0x41e0000000000000},
16259 {0x80000001, 0xc1dfffffffc00000, 0x41e0000000200000},
16260 // Check for bit pattern reproduction.
16261 {0x089abcde, 0x41a13579bc000000, 0x41a13579bc000000},
16262 {0x12345678, 0x41b2345678000000, 0x41b2345678000000},
16263 // Simple conversions of negative int32_t values. These require no rounding,
16264 // and the results should not depend on the rounding mode.
16265 {0xffffc000, 0xc0d0000000000000, 0x41effff800000000},
16266 {0xffff0000, 0xc0f0000000000000, 0x41efffe000000000},
16267 {0xc0000000, 0xc1d0000000000000, 0x41e8000000000000}};
16268 int pg_2[] = {1, 0, 1, 0, 0, 1, 1, 0, 1, 1};
16269 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16270
16271 // Note that the IEEE 754 double-precision format has a 52-bit fraction, so
16272 // all 32-bit integers are exactly representable in double.
16273 // clang-format on
16274 }
16275
16276 TEST_SVE(sve_fadda) {
16277 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
16278 CPUFeatures::kFP,
16279 CPUFeatures::kFPHalf);
16280 START();
16281
16282 __ Ptrue(p0.VnB());
16283 __ Pfalse(p1.VnB());
16284 __ Zip1(p1.VnH(), p0.VnH(), p1.VnH());
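// p1 now has every other H lane active (lanes 0, 2, 4, ...), which selects the
// odd values 1, 3, 5, ... from the index sequence accumulated into h4 below.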
16285
16286 __ Index(z0.VnS(), 3, 3);
16287 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16288 __ Fmov(s2, 2.0);
16289 __ Fadda(s2, p0, s2, z0.VnS());
16290
16291 __ Index(z0.VnD(), -7, -7);
16292 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16293 __ Fmov(d3, 3.0);
16294 __ Fadda(d3, p0, d3, z0.VnD());
16295
16296 __ Index(z0.VnH(), 1, 1);
16297 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16298 __ Fmov(h4, 0);
16299 __ Fadda(h4, p1, h4, z0.VnH());
16300 END();
16301
16302 if (CAN_RUN()) {
16303 RUN();
16304 // Sum of 1 .. n is (n + 1) * (n / 2), i.e. n(n+1)/2.
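// Here z0.VnS() held 3, 6, ..., 3n and the accumulator started at 2.0, so the
// expected total is 2 + 3 * n(n+1)/2.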
16305 int n = core.GetSVELaneCount(kSRegSize);
16306 ASSERT_EQUAL_FP32(2 + 3 * ((n + 1) * (n / 2)), s2);
16307
16308 n /= 2; // Half as many lanes.
16309 ASSERT_EQUAL_FP64(3 + -7 * ((n + 1) * (n / 2)), d3);
16310
16311 // Sum of first n odd numbers is n^2.
16312 n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
16313 ASSERT_EQUAL_FP16(Float16(n * n), h4);
16314 }
16315 }
16316
16317 TEST_SVE(sve_extract) {
16318 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16319 START();
16320
16321 __ Index(z0.VnB(), 0, 1);
16322
16323 __ Mov(z1, z0);
16324 __ Mov(z2, z0);
16325 __ Mov(z3, z0);
16326 __ Mov(z4, z0);
16327 __ Mov(z5, z0);
16328 __ Mov(z6, z0);
16329
16330 __ Ext(z1, z1, z0, 0);
16331 __ Ext(z2, z2, z0, 1);
16332 __ Ext(z3, z3, z0, 15);
16333 __ Ext(z4, z4, z0, 31);
16334 __ Ext(z5, z5, z0, 47);
16335 __ Ext(z6, z6, z0, 255);
16336
16337 END();
16338
16339 if (CAN_RUN()) {
16340 RUN();
16341
16342 ASSERT_EQUAL_SVE(z1, z0);
16343
16344 int lane_count = core.GetSVELaneCount(kBRegSize);
16345 if (lane_count == 16) {
16346 uint64_t z2_expected[] = {0x000f0e0d0c0b0a09, 0x0807060504030201};
16347 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16348 } else {
16349 uint64_t z2_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
16350 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16351 }
16352
16353 if (lane_count == 16) {
16354 uint64_t z3_expected[] = {0x0e0d0c0b0a090807, 0x060504030201000f};
16355 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16356 } else {
16357 uint64_t z3_expected[] = {0x1e1d1c1b1a191817, 0x161514131211100f};
16358 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16359 }
16360
16361 if (lane_count < 32) {
16362 ASSERT_EQUAL_SVE(z4, z0);
16363 } else if (lane_count == 32) {
16364 uint64_t z4_expected[] = {0x0e0d0c0b0a090807, 0x060504030201001f};
16365 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16366 } else {
16367 uint64_t z4_expected[] = {0x2e2d2c2b2a292827, 0x262524232221201f};
16368 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16369 }
16370
16371 if (lane_count < 48) {
16372 ASSERT_EQUAL_SVE(z5, z0);
16373 } else if (lane_count == 48) {
16374 uint64_t z5_expected[] = {0x0e0d0c0b0a090807, 0x060504030201002f};
16375 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16376 } else {
16377 uint64_t z5_expected[] = {0x3e3d3c3b3a393837, 0x363534333231302f};
16378 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16379 }
16380
16381 if (lane_count < 256) {
16382 ASSERT_EQUAL_SVE(z6, z0);
16383 } else {
16384 uint64_t z6_expected[] = {0x0e0d0c0b0a090807, 0x06050403020100ff};
16385 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16386 }
16387 }
16388 }
16389
16390 TEST_SVE(sve_fp_paired_across) {
16391 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16392
16393 START();
16394
16395 __ Ptrue(p0.VnB());
16396 __ Pfalse(p1.VnB());
16397 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
16398 __ Zip1(p3.VnD(), p0.VnD(), p1.VnD());
16399 __ Zip1(p4.VnH(), p0.VnH(), p1.VnH());
16400
16401 __ Index(z0.VnS(), 3, 3);
16402 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16403 __ Faddv(s1, p0, z0.VnS());
16404 __ Fminv(s2, p2, z0.VnS());
16405 __ Fmaxv(s3, p2, z0.VnS());
16406
16407 __ Index(z0.VnD(), -7, -7);
16408 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16409 __ Faddv(d4, p0, z0.VnD());
16410 __ Fminv(d5, p3, z0.VnD());
16411 __ Fmaxv(d6, p3, z0.VnD());
16412
16413 __ Index(z0.VnH(), 1, 1);
16414 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16415 __ Faddv(h7, p4, z0.VnH());
16416 __ Fminv(h8, p4, z0.VnH());
16417 __ Fmaxv(h9, p4, z0.VnH());
16418
16419 __ Dup(z10.VnH(), 0);
16420 __ Fdiv(z10.VnH(), p0.Merging(), z10.VnH(), z10.VnH());
16421 __ Insr(z10.VnH(), 0x5140);
16422 __ Insr(z10.VnH(), 0xd140);
16423 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 2);
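// z10.VnH() is now all NaN (from 0.0 / 0.0) except for one +42.0 and one -42.0
// lane, so the NaN-ignoring Fmaxnmv/Fminnmv reductions below should return
// +/-42 rather than NaN. The S and D blocks below repeat the same pattern.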
16424 __ Fmaxnmv(h11, p0, z10.VnH());
16425 __ Fmaxnmv(h12, p4, z10.VnH());
16426 __ Fminnmv(h13, p0, z10.VnH());
16427 __ Fminnmv(h14, p4, z10.VnH());
16428
16429 __ Dup(z10.VnS(), 0);
16430 __ Fdiv(z10.VnS(), p0.Merging(), z10.VnS(), z10.VnS());
16431 __ Insr(z10.VnS(), 0x42280000);
16432 __ Insr(z10.VnS(), 0xc2280000);
16433 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 4);
16434 __ Fmaxnmv(s15, p0, z10.VnS());
16435 __ Fmaxnmv(s16, p2, z10.VnS());
16436 __ Fminnmv(s17, p0, z10.VnS());
16437 __ Fminnmv(s18, p2, z10.VnS());
16438
16439 __ Dup(z10.VnD(), 0);
16440 __ Fdiv(z10.VnD(), p0.Merging(), z10.VnD(), z10.VnD());
16441 __ Insr(z10.VnD(), 0x4045000000000000);
16442 __ Insr(z10.VnD(), 0xc045000000000000);
16443 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 8);
16444 __ Fmaxnmv(d19, p0, z10.VnD());
16445 __ Fmaxnmv(d20, p3, z10.VnD());
16446 __ Fminnmv(d21, p0, z10.VnD());
16447 __ Fminnmv(d22, p3, z10.VnD());
16448 END();
16449
16450 if (CAN_RUN()) {
16451 RUN();
16452 // Sum of 1 .. n is (n + 1) * (n / 2), i.e. n(n+1)/2.
16453 int n = core.GetSVELaneCount(kSRegSize);
16454 ASSERT_EQUAL_FP32(3 * ((n + 1) * (n / 2)), s1);
16455 ASSERT_EQUAL_FP32(3, s2);
16456 ASSERT_EQUAL_FP32(3 * n - 3, s3);
16457
16458 n /= 2; // Half as many lanes.
16459 ASSERT_EQUAL_FP64(-7 * ((n + 1) * (n / 2)), d4);
16460 ASSERT_EQUAL_FP64(-7 * (n - 1), d5);
16461 ASSERT_EQUAL_FP64(-7, d6);
16462
16463 // Sum of first n odd numbers is n^2.
16464 n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
16465 ASSERT_EQUAL_FP16(Float16(n * n), h7);
16466 ASSERT_EQUAL_FP16(Float16(1), h8);
16467
16468 n = core.GetSVELaneCount(kHRegSize);
16469 ASSERT_EQUAL_FP16(Float16(n - 1), h9);
16470
16471 ASSERT_EQUAL_FP16(Float16(42), h11);
16472 ASSERT_EQUAL_FP16(Float16(42), h12);
16473 ASSERT_EQUAL_FP16(Float16(-42), h13);
16474 ASSERT_EQUAL_FP16(Float16(42), h14);
16475 ASSERT_EQUAL_FP32(42, s15);
16476 ASSERT_EQUAL_FP32(42, s16);
16477 ASSERT_EQUAL_FP32(-42, s17);
16478 ASSERT_EQUAL_FP32(42, s18);
16479 ASSERT_EQUAL_FP64(42, d19);
16480 ASSERT_EQUAL_FP64(42, d20);
16481 ASSERT_EQUAL_FP64(-42, d21);
16482 ASSERT_EQUAL_FP64(42, d22);
16483 }
16484 }
16485
16486 TEST_SVE(sve_frecpe_frsqrte) {
16487 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16488
16489 START();
16490
16491 __ Ptrue(p0.VnB());
16492
16493 __ Index(z0.VnH(), 0, 1);
16494 __ Fdup(z1.VnH(), Float16(1));
16495 __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16496 __ Insr(z1.VnH(), 0);
16497 __ Frsqrte(z2.VnH(), z1.VnH());
16498 __ Frecpe(z1.VnH(), z1.VnH());
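// z1 held 0 followed by ascending powers of two, so the estimates above are
// finite for every lane except the zero lane, whose reciprocal and reciprocal
// square root estimates are +infinity (0x7c00 in the expected values below).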
16499
16500 __ Index(z0.VnS(), 0, 1);
16501 __ Fdup(z3.VnS(), Float16(1));
16502 __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16503 __ Insr(z3.VnS(), 0);
16504 __ Frsqrte(z4.VnS(), z3.VnS());
16505 __ Frecpe(z3.VnS(), z3.VnS());
16506
16507 __ Index(z0.VnD(), 0, 1);
16508 __ Fdup(z5.VnD(), Float16(1));
16509 __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16510 __ Insr(z5.VnD(), 0);
16511 __ Frsqrte(z6.VnD(), z5.VnD());
16512 __ Frecpe(z5.VnD(), z5.VnD());
16513 END();
16514
16515 if (CAN_RUN()) {
16516 RUN();
16517 uint64_t z1_expected[] = {0x23fc27fc2bfc2ffc, 0x33fc37fc3bfc7c00};
16518 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16519 uint64_t z2_expected[] = {0x2ffc31a433fc35a4, 0x37fc39a43bfc7c00};
16520 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16521
16522 uint64_t z3_expected[] = {0x3e7f80003eff8000, 0x3f7f80007f800000};
16523 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16524 uint64_t z4_expected[] = {0x3eff80003f348000, 0x3f7f80007f800000};
16525 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16526
16527 uint64_t z5_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16528 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16529 uint64_t z6_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16530 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16531 }
16532 }
16533
16534 TEST_SVE(sve_frecps_frsqrts) {
16535 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16536
16537 START();
16538 __ Ptrue(p0.VnB());
16539
16540 __ Index(z0.VnH(), 0, -1);
16541 __ Fdup(z1.VnH(), Float16(1));
16542 __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16543 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16544 __ Insr(z1.VnH(), 0);
16545 __ Frsqrts(z2.VnH(), z1.VnH(), z0.VnH());
16546 __ Frecps(z1.VnH(), z1.VnH(), z0.VnH());
16547
16548 __ Index(z0.VnS(), 0, -1);
16549 __ Fdup(z3.VnS(), Float16(1));
16550 __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16551 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16552 __ Insr(z3.VnS(), 0);
16553 __ Frsqrts(z4.VnS(), z3.VnS(), z0.VnS());
16554 __ Frecps(z3.VnS(), z3.VnS(), z0.VnS());
16555
16556 __ Index(z0.VnD(), 0, -1);
16557 __ Fdup(z5.VnD(), Float16(1));
16558 __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16559 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16560 __ Insr(z5.VnD(), 0);
16561 __ Frsqrts(z6.VnD(), z5.VnD(), z0.VnD());
16562 __ Frecps(z5.VnD(), z5.VnD(), z0.VnD());
16563 END();
16564
16565 if (CAN_RUN()) {
16566 RUN();
16567 uint64_t z1_expected[] = {0x4038406040a04100, 0x4180420042004000};
16568 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16569 uint64_t z2_expected[] = {0x3e383e603ea03f00, 0x3f80400040003e00};
16570 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16571
16572 uint64_t z3_expected[] = {0x4030000040400000, 0x4040000040000000};
16573 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16574 uint64_t z4_expected[] = {0x3ff0000040000000, 0x400000003fc00000};
16575 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16576
16577 uint64_t z5_expected[] = {0x4008000000000000, 0x4000000000000000};
16578 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16579 uint64_t z6_expected[] = {0x4000000000000000, 0x3ff8000000000000};
16580 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16581 }
16582 }
16583
16584 TEST_SVE(sve_ftsmul) {
16585 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16586
16587 START();
16588 __ Ptrue(p0.VnB());
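// Ftsmul (trigonometric starting value multiply) squares each element of the
// first source, negating the result when bit 0 of the corresponding element of
// the second source is set; NaN inputs produce the default NaN, as checked in
// the expected values below.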
16589
16590 __ Index(z0.VnH(), 0, 1);
16591 __ Rev(z1.VnH(), z0.VnH());
16592 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16593 __ Dup(z2.VnH(), 0);
16594 __ Fdiv(z2.VnH(), p0.Merging(), z2.VnH(), z2.VnH());
16595 __ Ftsmul(z3.VnH(), z0.VnH(), z1.VnH());
16596 __ Ftsmul(z4.VnH(), z2.VnH(), z1.VnH());
16597
16598 __ Index(z0.VnS(), -7, 1);
16599 __ Rev(z1.VnS(), z0.VnS());
16600 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16601 __ Dup(z2.VnS(), 0);
16602 __ Fdiv(z2.VnS(), p0.Merging(), z2.VnS(), z2.VnS());
16603 __ Ftsmul(z5.VnS(), z0.VnS(), z1.VnS());
16604 __ Ftsmul(z6.VnS(), z2.VnS(), z1.VnS());
16605
16606 __ Index(z0.VnD(), 2, -1);
16607 __ Rev(z1.VnD(), z0.VnD());
16608 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16609 __ Dup(z2.VnD(), 0);
16610 __ Fdiv(z2.VnD(), p0.Merging(), z2.VnD(), z2.VnD());
16611 __ Ftsmul(z7.VnD(), z0.VnD(), z1.VnD());
16612 __ Ftsmul(z8.VnD(), z2.VnD(), z1.VnD());
16613 END();
16614
16615 if (CAN_RUN()) {
16616 RUN();
16617 uint64_t z3_expected[] = {0x5220d0804e40cc00, 0x4880c4003c008000};
16618 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16619 uint64_t z4_expected[] = {0x7e007e007e007e00, 0x7e007e007e007e00};
16620 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16621
16622 uint64_t z5_expected[] = {0xc180000041c80000, 0xc210000042440000};
16623 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16624 uint64_t z6_expected[] = {0x7fc000007fc00000, 0x7fc000007fc00000};
16625 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16626
16627 uint64_t z7_expected[] = {0x3ff0000000000000, 0xc010000000000000};
16628 ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
16629 uint64_t z8_expected[] = {0x7ff8000000000000, 0x7ff8000000000000};
16630 ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
16631 }
16632 }
16633
16634 typedef void (MacroAssembler::*FPMulAccFn)(
16635 const ZRegister& zd,
16636 const PRegisterM& pg,
16637 const ZRegister& za,
16638 const ZRegister& zn,
16639 const ZRegister& zm,
16640 FPMacroNaNPropagationOption nan_option);
16641
16642 // `pg_inputs` is used to check that predication is handled correctly
16643 // internally. It does not affect the `result` argument; `result` holds the
16644 // expected result under an all-true predicate.
16645 template <typename T, size_t N>
16646 static void FPMulAccHelper(
16647 Test* config,
16648 FPMulAccFn macro,
16649 unsigned lane_size_in_bits,
16650 const int (&pg_inputs)[N],
16651 const T (&za_inputs)[N],
16652 const T (&zn_inputs)[N],
16653 const T (&zm_inputs)[N],
16654 const uint64_t (&result)[N],
16655 FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
16656 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16657 START();
16658
16659 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
16660 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
16661 ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
16662 ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
16663
16664 uint64_t za_rawbits[N];
16665 uint64_t zn_rawbits[N];
16666 uint64_t zm_rawbits[N];
16667
16668 FPToRawbitsWithSize(za_inputs, za_rawbits, lane_size_in_bits);
16669 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
16670 FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
16671
16672 InsrHelper(&masm, za, za_rawbits);
16673 InsrHelper(&masm, zn, zn_rawbits);
16674 InsrHelper(&masm, zm, zm_rawbits);
16675
16676 // Initialize `zd` with a signalling NaN.
16677 uint64_t sn = GetSignallingNan(lane_size_in_bits);
16678 __ Mov(x29, sn);
16679 __ Dup(zd, x29);
16680
16681 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
16682
16683 // Fmla macro automatically selects between fmla, fmad and movprfx + fmla
16684 // Fmls `ditto` fmls, fmsb and movprfx + fmls
16685 // Fnmla `ditto` fnmla, fnmad and movprfx + fnmla
16686 // Fnmls `ditto` fnmls, fnmsb and movprfx + fnmls
16687 // based on what registers are aliased.
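// For example, `da_result` below aliases `za`, so the accumulating form can be
// encoded directly, `dn_result`/`dm_result` alias a multiplicand, and
// `d_result` aliases none of the inputs.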
16688 ZRegister da_result = z10.WithLaneSize(lane_size_in_bits);
16689 ZRegister dn_result = z11.WithLaneSize(lane_size_in_bits);
16690 ZRegister dm_result = z12.WithLaneSize(lane_size_in_bits);
16691 ZRegister d_result = z13.WithLaneSize(lane_size_in_bits);
16692
16693 __ Mov(da_result, za);
16694 (masm.*macro)(da_result, p0.Merging(), da_result, zn, zm, nan_option);
16695
16696 __ Mov(dn_result, zn);
16697 (masm.*macro)(dn_result, p0.Merging(), za, dn_result, zm, nan_option);
16698
16699 __ Mov(dm_result, zm);
16700 (masm.*macro)(dm_result, p0.Merging(), za, zn, dm_result, nan_option);
16701
16702 __ Mov(d_result, zd);
16703 (masm.*macro)(d_result, p0.Merging(), za, zn, zm, nan_option);
16704
16705 END();
16706
16707 if (CAN_RUN()) {
16708 RUN();
16709
16710 ASSERT_EQUAL_SVE(za_rawbits, za);
16711 ASSERT_EQUAL_SVE(zn_rawbits, zn);
16712 ASSERT_EQUAL_SVE(zm_rawbits, zm);
16713
16714 uint64_t da_expected[N];
16715 uint64_t dn_expected[N];
16716 uint64_t dm_expected[N];
16717 uint64_t d_expected[N];
16718 for (size_t i = 0; i < N; i++) {
16719 da_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : za_rawbits[i];
16720 dn_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zn_rawbits[i];
16721 dm_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zm_rawbits[i];
16722 d_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : sn;
16723 }
16724
16725 ASSERT_EQUAL_SVE(da_expected, da_result);
16726 ASSERT_EQUAL_SVE(dn_expected, dn_result);
16727 ASSERT_EQUAL_SVE(dm_expected, dm_result);
16728 ASSERT_EQUAL_SVE(d_expected, d_result);
16729 }
16730 }
16731
16732 TEST_SVE(sve_fmla_fmad) {
16733 // fmla : zd = za + zn * zm
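// For example, in the first element: -39.0 + (-5.0 * 9.0) = -84.0.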
16734 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16735 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16736 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16737 int pg_inputs[] = {1, 1, 0, 1};
16738
16739 uint64_t fmla_result_h[] = {Float16ToRawbits(Float16(-84.0)),
16740 Float16ToRawbits(Float16(101.0)),
16741 Float16ToRawbits(Float16(33.0)),
16742 Float16ToRawbits(Float16(42.0))};
16743
16744 // `fmad` has been tested in the helper.
16745 FPMulAccHelper(config,
16746 &MacroAssembler::Fmla,
16747 kHRegSize,
16748 pg_inputs,
16749 za_inputs,
16750 zn_inputs,
16751 zm_inputs,
16752 fmla_result_h);
16753
16754 uint64_t fmla_result_s[] = {FloatToRawbits(-84.0f),
16755 FloatToRawbits(101.0f),
16756 FloatToRawbits(33.0f),
16757 FloatToRawbits(42.0f)};
16758
16759 FPMulAccHelper(config,
16760 &MacroAssembler::Fmla,
16761 kSRegSize,
16762 pg_inputs,
16763 za_inputs,
16764 zn_inputs,
16765 zm_inputs,
16766 fmla_result_s);
16767
16768 uint64_t fmla_result_d[] = {DoubleToRawbits(-84.0),
16769 DoubleToRawbits(101.0),
16770 DoubleToRawbits(33.0),
16771 DoubleToRawbits(42.0)};
16772
16773 FPMulAccHelper(config,
16774 &MacroAssembler::Fmla,
16775 kDRegSize,
16776 pg_inputs,
16777 za_inputs,
16778 zn_inputs,
16779 zm_inputs,
16780 fmla_result_d);
16781 }
16782
16783 TEST_SVE(sve_fmls_fmsb) {
16784 // fmls : zd = za - zn * zm
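// For example, in the first element: -39.0 - (-5.0 * 9.0) = 6.0.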
16785 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16786 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16787 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16788 int pg_inputs[] = {1, 0, 1, 1};
16789
16790 uint64_t fmls_result_h[] = {Float16ToRawbits(Float16(6.0)),
16791 Float16ToRawbits(Float16(-99.0)),
16792 Float16ToRawbits(Float16(-39.0)),
16793 Float16ToRawbits(Float16(-38.0))};
16794
16795 // `fmsb` has been tested in the helper.
16796 FPMulAccHelper(config,
16797 &MacroAssembler::Fmls,
16798 kHRegSize,
16799 pg_inputs,
16800 za_inputs,
16801 zn_inputs,
16802 zm_inputs,
16803 fmls_result_h);
16804
16805 uint64_t fmls_result_s[] = {FloatToRawbits(6.0f),
16806 FloatToRawbits(-99.0f),
16807 FloatToRawbits(-39.0f),
16808 FloatToRawbits(-38.0f)};
16809
16810 FPMulAccHelper(config,
16811 &MacroAssembler::Fmls,
16812 kSRegSize,
16813 pg_inputs,
16814 za_inputs,
16815 zn_inputs,
16816 zm_inputs,
16817 fmls_result_s);
16818
16819 uint64_t fmls_result_d[] = {DoubleToRawbits(6.0),
16820 DoubleToRawbits(-99.0),
16821 DoubleToRawbits(-39.0),
16822 DoubleToRawbits(-38.0)};
16823
16824 FPMulAccHelper(config,
16825 &MacroAssembler::Fmls,
16826 kDRegSize,
16827 pg_inputs,
16828 za_inputs,
16829 zn_inputs,
16830 zm_inputs,
16831 fmls_result_d);
16832 }
16833
16834 TEST_SVE(sve_fnmla_fnmad) {
16835 // fnmla : zd = -za - zn * zm
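// For example, in the first element: -(-39.0) - (-5.0 * 9.0) = 84.0.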
16836 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16837 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16838 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16839 int pg_inputs[] = {0, 1, 1, 1};
16840
16841 uint64_t fnmla_result_h[] = {Float16ToRawbits(Float16(84.0)),
16842 Float16ToRawbits(Float16(-101.0)),
16843 Float16ToRawbits(Float16(-33.0)),
16844 Float16ToRawbits(Float16(-42.0))};
16845
16846 // `fnmad` has been tested in the helper.
16847 FPMulAccHelper(config,
16848 &MacroAssembler::Fnmla,
16849 kHRegSize,
16850 pg_inputs,
16851 za_inputs,
16852 zn_inputs,
16853 zm_inputs,
16854 fnmla_result_h);
16855
16856 uint64_t fnmla_result_s[] = {FloatToRawbits(84.0f),
16857 FloatToRawbits(-101.0f),
16858 FloatToRawbits(-33.0f),
16859 FloatToRawbits(-42.0f)};
16860
16861 FPMulAccHelper(config,
16862 &MacroAssembler::Fnmla,
16863 kSRegSize,
16864 pg_inputs,
16865 za_inputs,
16866 zn_inputs,
16867 zm_inputs,
16868 fnmla_result_s);
16869
16870 uint64_t fnmla_result_d[] = {DoubleToRawbits(84.0),
16871 DoubleToRawbits(-101.0),
16872 DoubleToRawbits(-33.0),
16873 DoubleToRawbits(-42.0)};
16874
16875 FPMulAccHelper(config,
16876 &MacroAssembler::Fnmla,
16877 kDRegSize,
16878 pg_inputs,
16879 za_inputs,
16880 zn_inputs,
16881 zm_inputs,
16882 fnmla_result_d);
16883 }
16884
16885 TEST_SVE(sve_fnmls_fnmsb) {
16886 // fnmls : zd = -za + zn * zm
16887 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16888 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16889 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16890 int pg_inputs[] = {1, 1, 1, 0};
16891
16892 uint64_t fnmls_result_h[] = {Float16ToRawbits(Float16(-6.0)),
16893 Float16ToRawbits(Float16(99.0)),
16894 Float16ToRawbits(Float16(39.0)),
16895 Float16ToRawbits(Float16(38.0))};
16896
16897 // `fnmsb` has been tested in the helper.
16898 FPMulAccHelper(config,
16899 &MacroAssembler::Fnmls,
16900 kHRegSize,
16901 pg_inputs,
16902 za_inputs,
16903 zn_inputs,
16904 zm_inputs,
16905 fnmls_result_h);
16906
16907 uint64_t fnmls_result_s[] = {FloatToRawbits(-6.0f),
16908 FloatToRawbits(99.0f),
16909 FloatToRawbits(39.0f),
16910 FloatToRawbits(38.0f)};
16911
16912 FPMulAccHelper(config,
16913 &MacroAssembler::Fnmls,
16914 kSRegSize,
16915 pg_inputs,
16916 za_inputs,
16917 zn_inputs,
16918 zm_inputs,
16919 fnmls_result_s);
16920
16921 uint64_t fnmls_result_d[] = {DoubleToRawbits(-6.0),
16922 DoubleToRawbits(99.0),
16923 DoubleToRawbits(39.0),
16924 DoubleToRawbits(38.0)};
16925
16926 FPMulAccHelper(config,
16927 &MacroAssembler::Fnmls,
16928 kDRegSize,
16929 pg_inputs,
16930 za_inputs,
16931 zn_inputs,
16932 zm_inputs,
16933 fnmls_result_d);
16934 }
16935
16936 typedef void (MacroAssembler::*FPMulAccIdxFn)(const ZRegister& zd,
16937 const ZRegister& za,
16938 const ZRegister& zn,
16939 const ZRegister& zm,
16940 int index);
16941
16942 template <typename T, size_t N>
16943 static void FPMulAccIdxHelper(Test* config,
16944 FPMulAccFn macro,
16945 FPMulAccIdxFn macro_idx,
16946 const T (&za_inputs)[N],
16947 const T (&zn_inputs)[N],
16948 const T (&zm_inputs)[N]) {
16949 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16950 START();
16951
16952 __ Ptrue(p0.VnB());
16953
16954 // Repeat indexed vector across up to 2048-bit VL.
16955 for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i += N) {
16956 InsrHelper(&masm, z30.VnD(), zm_inputs);
16957 }
16958
16959 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z30.VnH());
16960
16961 InsrHelper(&masm, z1.VnD(), zn_inputs);
16962 InsrHelper(&masm, z2.VnD(), za_inputs);
16963
16964 __ Mov(z3, z0);
16965 (masm.*macro_idx)(z3.VnH(), z2.VnH(), z1.VnH(), z3.VnH(), 0); // zd == zm
16966 __ Mov(z4, z1);
16967 (masm.*macro_idx)(z4.VnH(), z2.VnH(), z4.VnH(), z0.VnH(), 1); // zd == zn
16968 __ Mov(z5, z2);
16969 (masm.*macro_idx)(z5.VnH(), z5.VnH(), z1.VnH(), z0.VnH(), 4); // zd == za
16970 (masm.*macro_idx)(z6.VnH(), z2.VnH(), z1.VnH(), z0.VnH(), 7);
16971
16972 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z30.VnS());
16973
16974 __ Mov(z7, z0);
16975 (masm.*macro_idx)(z7.VnS(), z2.VnS(), z1.VnS(), z7.VnS(), 0); // zd == zm
16976 __ Mov(z8, z1);
16977 (masm.*macro_idx)(z8.VnS(), z2.VnS(), z8.VnS(), z0.VnS(), 1); // zd == zn
16978 __ Mov(z9, z2);
16979 (masm.*macro_idx)(z9.VnS(), z9.VnS(), z1.VnS(), z0.VnS(), 2); // zd == za
16980 (masm.*macro_idx)(z10.VnS(), z2.VnS(), z1.VnS(), z0.VnS(), 3);
16981
16982 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
16983
16984 __ Mov(z11, z0);
16985 (masm.*macro_idx)(z11.VnD(), z2.VnD(), z1.VnD(), z11.VnD(), 0); // zd == zm
16986 __ Mov(z12, z1);
16987 (masm.*macro_idx)(z12.VnD(), z2.VnD(), z12.VnD(), z0.VnD(), 1); // zd == zn
16988 __ Mov(z13, z2);
16989 (masm.*macro_idx)(z13.VnD(), z13.VnD(), z1.VnD(), z0.VnD(), 0); // zd == za
16990 __ Mov(z14, z0);
16991 // zd == zn == zm
16992 (masm.*macro_idx)(z14.VnD(), z2.VnD(), z14.VnD(), z14.VnD(), 1);
16993
16994 // The indexed forms of Fmla and Fmls do not swap their arguments, so pass
16995 // strict NaN propagation mode to ensure that the vector-form macros below
16996 // never swap their arguments either.
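// (Illustrative note, not from the original test: argument order matters for
// NaN propagation because FPProcessNaNs gives earlier operands priority. If zn
// and zm hold two different quiet NaNs, the propagated payload comes from zn,
// so a macro that silently swapped the multiplicands would change the result.)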
16997 FPMacroNaNPropagationOption option = StrictNaNPropagation;
16998 // Compute the results using other instructions.
16999 __ Dup(z0.VnH(), z30.VnH(), 0);
17000 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17001 (masm.*macro)(z15.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17002 __ Dup(z0.VnH(), z30.VnH(), 1);
17003 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17004 (masm.*macro)(z16.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17005 __ Dup(z0.VnH(), z30.VnH(), 4);
17006 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17007 (masm.*macro)(z17.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17008 __ Dup(z0.VnH(), z30.VnH(), 7);
17009 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17010 (masm.*macro)(z18.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17011
17012 __ Dup(z0.VnS(), z30.VnS(), 0);
17013 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17014 (masm.*macro)(z19.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17015 __ Dup(z0.VnS(), z30.VnS(), 1);
17016 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17017 (masm.*macro)(z20.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17018 __ Dup(z0.VnS(), z30.VnS(), 2);
17019 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17020 (masm.*macro)(z21.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17021 __ Dup(z0.VnS(), z30.VnS(), 3);
17022 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17023 (masm.*macro)(z22.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17024
17025 __ Dup(z0.VnD(), z30.VnD(), 0);
17026 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
17027 (masm.*macro)(z23.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
17028 __ Dup(z0.VnD(), z30.VnD(), 1);
17029 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
17030 (masm.*macro)(z24.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
17031 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
17032 __ Dup(z29.VnD(), z30.VnD(), 1);
17033 FPSegmentPatternHelper(&masm, z29.VnD(), p0.Merging(), z29.VnD());
17034 (masm.*macro)(z25.VnD(), p0.Merging(), z2.VnD(), z0.VnD(), z29.VnD(), option);
17035
17036 END();
17037
17038 if (CAN_RUN()) {
17039 RUN();
17040
17041 ASSERT_EQUAL_SVE(z15.VnH(), z3.VnH());
17042 ASSERT_EQUAL_SVE(z16.VnH(), z4.VnH());
17043 ASSERT_EQUAL_SVE(z17.VnH(), z5.VnH());
17044 ASSERT_EQUAL_SVE(z18.VnH(), z6.VnH());
17045
17046 ASSERT_EQUAL_SVE(z19.VnS(), z7.VnS());
17047 ASSERT_EQUAL_SVE(z20.VnS(), z8.VnS());
17048 ASSERT_EQUAL_SVE(z21.VnS(), z9.VnS());
17049 ASSERT_EQUAL_SVE(z22.VnS(), z10.VnS());
17050
17051 ASSERT_EQUAL_SVE(z23.VnD(), z11.VnD());
17052 ASSERT_EQUAL_SVE(z24.VnD(), z12.VnD());
17053 ASSERT_EQUAL_SVE(z11.VnD(), z13.VnD());
17054 ASSERT_EQUAL_SVE(z25.VnD(), z14.VnD());
17055 }
17056 }
17057
17058 TEST_SVE(sve_fmla_fmls_index) {
17059 uint64_t zm_inputs_1[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
17060 uint64_t zn_inputs_1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
17061 uint64_t za_inputs_1[] = {0x3c004000bc00c000, 0x64006800e400e800};
17062
17063 // Using the vector form of Fmla and Fmls to verify the indexed form.
17064 FPMulAccIdxHelper(config,
17065 &MacroAssembler::Fmla, // vector form
17066 &MacroAssembler::Fmla, // indexed form
17067 za_inputs_1,
17068 zn_inputs_1,
17069 zm_inputs_1);
17070
17071 FPMulAccIdxHelper(config,
17072 &MacroAssembler::Fmls, // vector form
17073 &MacroAssembler::Fmls, // indexed form
17074 za_inputs_1,
17075 zn_inputs_1,
17076 zm_inputs_1);
17077
17078 uint64_t zm_inputs_2[] = {0x7ff5555511111111, // NaN
17079 0xfff0000000000000}; // Infinity
17080 uint64_t zn_inputs_2[] = {0x7f9511117fc00000, // NaN
17081 0x7f800000ff800000}; // Infinity
17082 uint64_t za_inputs_2[] = {0x7c11000000007e00, // NaN
17083 0x000000007c00fc00}; // Infinity
17084 FPMulAccIdxHelper(config,
17085 &MacroAssembler::Fmla, // vector form
17086 &MacroAssembler::Fmla, // indexed form
17087 za_inputs_2,
17088 zn_inputs_2,
17089 zm_inputs_2);
17090
17091 FPMulAccIdxHelper(config,
17092 &MacroAssembler::Fmls, // vector form
17093 &MacroAssembler::Fmls, // indexed form
17094 za_inputs_2,
17095 zn_inputs_2,
17096 zm_inputs_2);
17097 }
17098
17099 // Execute a number of instructions which all use ProcessNaNs, and check that
17100 // they all propagate NaNs correctly.
17101 template <typename Ti, typename Td, size_t N>
17102 static void ProcessNaNsHelper(Test* config,
17103 int lane_size_in_bits,
17104 const Ti (&zn_inputs)[N],
17105 const Ti (&zm_inputs)[N],
17106 const Td (&zd_expected)[N],
17107 FPMacroNaNPropagationOption nan_option) {
17108 ArithFn arith_unpredicated_macro[] = {&MacroAssembler::Fadd,
17109 &MacroAssembler::Fsub,
17110 &MacroAssembler::Fmul};
17111
17112 for (size_t i = 0; i < ArrayLength(arith_unpredicated_macro); i++) {
17113 FPBinArithHelper(config,
17114 arith_unpredicated_macro[i],
17115 lane_size_in_bits,
17116 zn_inputs,
17117 zm_inputs,
17118 zd_expected);
17119 }
17120
17121 FPArithPredicatedFn arith_predicated_macro[] = {&MacroAssembler::Fmax,
17122 &MacroAssembler::Fmin};
17123 int pg_inputs[N];
17124 // With an all-true predicate, this helper aims to compare with special
17125 // numbers.
17126 for (size_t i = 0; i < N; i++) {
17127 pg_inputs[i] = 1;
17128 }
17129
17130 // fdivr propagates the quotient (Zm) preferentially, so we don't actually
17131 // need any special handling for StrictNaNPropagation.
17132 FPBinArithHelper(config,
17133 NULL,
17134 &MacroAssembler::Fdiv,
17135 lane_size_in_bits,
17136 // With an all-true predicate, the value in zd is
17137 // irrelevant to the operations.
17138 zn_inputs,
17139 pg_inputs,
17140 zn_inputs,
17141 zm_inputs,
17142 zd_expected);
17143
17144 for (size_t i = 0; i < ArrayLength(arith_predicated_macro); i++) {
17145 FPBinArithHelper(config,
17146 arith_predicated_macro[i],
17147 NULL,
17148 lane_size_in_bits,
17149 // With an all-true predicate, the value in zd is
17150 // irrelevant to the operations.
17151 zn_inputs,
17152 pg_inputs,
17153 zn_inputs,
17154 zm_inputs,
17155 zd_expected,
17156 nan_option);
17157 }
17158 }
17159
17160 template <typename Ti, typename Td, size_t N>
17161 static void ProcessNaNsHelper3(Test* config,
17162 int lane_size_in_bits,
17163 const Ti (&za_inputs)[N],
17164 const Ti (&zn_inputs)[N],
17165 const Ti (&zm_inputs)[N],
17166 const Td (&zd_expected_fmla)[N],
17167 const Td (&zd_expected_fmls)[N],
17168 const Td (&zd_expected_fnmla)[N],
17169 const Td (&zd_expected_fnmls)[N],
17170 FPMacroNaNPropagationOption nan_option) {
17171 int pg_inputs[N];
17172 // With an all-true predicate, this helper aims to compare with special
17173 // numbers.
17174 for (size_t i = 0; i < N; i++) {
17175 pg_inputs[i] = 1;
17176 }
17177
17178 FPMulAccHelper(config,
17179 &MacroAssembler::Fmla,
17180 lane_size_in_bits,
17181 pg_inputs,
17182 za_inputs,
17183 zn_inputs,
17184 zm_inputs,
17185 zd_expected_fmla,
17186 nan_option);
17187
17188 FPMulAccHelper(config,
17189 &MacroAssembler::Fmls,
17190 lane_size_in_bits,
17191 pg_inputs,
17192 za_inputs,
17193 zn_inputs,
17194 zm_inputs,
17195 zd_expected_fmls,
17196 nan_option);
17197
17198 FPMulAccHelper(config,
17199 &MacroAssembler::Fnmla,
17200 lane_size_in_bits,
17201 pg_inputs,
17202 za_inputs,
17203 zn_inputs,
17204 zm_inputs,
17205 zd_expected_fnmla,
17206 nan_option);
17207
17208 FPMulAccHelper(config,
17209 &MacroAssembler::Fnmls,
17210 lane_size_in_bits,
17211 pg_inputs,
17212 za_inputs,
17213 zn_inputs,
17214 zm_inputs,
17215 zd_expected_fnmls,
17216 nan_option);
17217 }
17218
17219 TEST_SVE(sve_process_nans_double) {
17220 // Use non-standard NaNs to check that the payload bits are preserved.
17221 double sa = RawbitsToDouble(0x7ff5555511111111);
17222 double sn = RawbitsToDouble(0x7ff5555522222222);
17223 double sm = RawbitsToDouble(0x7ff5555533333333);
17224 double qa = RawbitsToDouble(0x7ffaaaaa11111111);
17225 double qn = RawbitsToDouble(0x7ffaaaaa22222222);
17226 double qm = RawbitsToDouble(0x7ffaaaaa33333333);
17227 VIXL_ASSERT(IsSignallingNaN(sa));
17228 VIXL_ASSERT(IsSignallingNaN(sn));
17229 VIXL_ASSERT(IsSignallingNaN(sm));
17230 VIXL_ASSERT(IsQuietNaN(qa));
17231 VIXL_ASSERT(IsQuietNaN(qn));
17232 VIXL_ASSERT(IsQuietNaN(qm));
17233
17234 // The input NaNs after passing through ProcessNaN.
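// (Note, not from the original comment: ProcessNaN quietens a signalling NaN
// by setting the most significant fraction bit, which is bit 51 for double
// precision, e.g. 0x7ff5555511111111 | (UINT64_C(1) << 51) is
// 0x7ffd555511111111. Quiet NaNs pass through unchanged.)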
17235 uint64_t sa_proc = 0x7ffd555511111111;
17236 uint64_t sn_proc = 0x7ffd555522222222;
17237 uint64_t sm_proc = 0x7ffd555533333333;
17238 uint64_t qa_proc = DoubleToRawbits(qa);
17239 uint64_t qn_proc = DoubleToRawbits(qn);
17240 uint64_t qm_proc = DoubleToRawbits(qm);
17241 uint64_t sa_proc_n = sa_proc ^ kDSignMask;
17242 uint64_t sn_proc_n = sn_proc ^ kDSignMask;
17243 uint64_t qa_proc_n = qa_proc ^ kDSignMask;
17244 uint64_t qn_proc_n = qn_proc ^ kDSignMask;
17245
17246 // Quiet NaNs are propagated.
17247 double zn_inputs_1[] = {qn, 0.0, 0.0, qm, qn, qm};
17248 double zm_inputs_1[] = {0.0, qn, qm, 0.0, qm, qn};
17249 uint64_t zd_expected_1[] =
17250 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17251
17252 ProcessNaNsHelper(config,
17253 kDRegSize,
17254 zn_inputs_1,
17255 zm_inputs_1,
17256 zd_expected_1,
17257 StrictNaNPropagation);
17258
17259 // Signalling NaNs are propagated.
17260 double zn_inputs_2[] = {sn, 0.0, 0.0, sm, sn, sm};
17261 double zm_inputs_2[] = {0.0, sn, sm, 0.0, sm, sn};
17262 uint64_t zd_expected_2[] =
17263 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17264 ProcessNaNsHelper(config,
17265 kDRegSize,
17266 zn_inputs_2,
17267 zm_inputs_2,
17268 zd_expected_2,
17269 StrictNaNPropagation);
17270
17271 // Signalling NaNs take precedence over quiet NaNs.
17272 double zn_inputs_3[] = {sn, qn, sn, sn, qn};
17273 double zm_inputs_3[] = {qm, sm, sm, qn, sn};
17274 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17275 ProcessNaNsHelper(config,
17276 kDRegSize,
17277 zn_inputs_3,
17278 zm_inputs_3,
17279 zd_expected_3,
17280 StrictNaNPropagation);
17281
17282 double za_inputs_4[] = {qa, qa, 0.0, 0.0, qa, qa};
17283 double zn_inputs_4[] = {qn, 0.0, 0.0, qn, qn, qn};
17284 double zm_inputs_4[] = {0.0, qm, qm, qm, qm, 0.0};
17285
17286 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17287 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17288 // If `m` is propagated, its sign is never inverted.
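// (Worked example, not from the original test, for lane 0 of the arrays
// below: both za and zn hold quiet NaNs, and the addend qa takes priority
// over qn; fnmla negates the addend, so the propagated NaN is qa with its
// sign bit flipped, i.e. qa_proc_n.)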
17289 uint64_t zd_expected_fmla_4[] =
17290 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17291 uint64_t zd_expected_fmls_4[] =
17292 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17293 uint64_t zd_expected_fnmla_4[] =
17294 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17295 uint64_t zd_expected_fnmls_4[] =
17296 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17297
17298 ProcessNaNsHelper3(config,
17299 kDRegSize,
17300 za_inputs_4,
17301 zn_inputs_4,
17302 zm_inputs_4,
17303 zd_expected_fmla_4,
17304 zd_expected_fmls_4,
17305 zd_expected_fnmla_4,
17306 zd_expected_fnmls_4,
17307 StrictNaNPropagation);
17308
17309 // Signalling NaNs take precedence over quiet NaNs.
17310 double za_inputs_5[] = {qa, qa, sa, sa, sa};
17311 double zn_inputs_5[] = {qn, sn, sn, sn, qn};
17312 double zm_inputs_5[] = {sm, qm, sm, qa, sm};
17313 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17314 uint64_t zd_expected_fmls_5[] = {sm_proc,
17315 sn_proc_n,
17316 sa_proc,
17317 sa_proc,
17318 sa_proc};
17319 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17320 sn_proc_n,
17321 sa_proc_n,
17322 sa_proc_n,
17323 sa_proc_n};
17324 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17325 sn_proc,
17326 sa_proc_n,
17327 sa_proc_n,
17328 sa_proc_n};
17329
17330 ProcessNaNsHelper3(config,
17331 kDRegSize,
17332 za_inputs_5,
17333 zn_inputs_5,
17334 zm_inputs_5,
17335 zd_expected_fmla_5,
17336 zd_expected_fmls_5,
17337 zd_expected_fnmla_5,
17338 zd_expected_fnmls_5,
17339 StrictNaNPropagation);
17340
17341 const double inf = kFP64PositiveInfinity;
17342 const double inf_n = kFP64NegativeInfinity;
17343 uint64_t inf_proc = DoubleToRawbits(inf);
17344 uint64_t inf_proc_n = DoubleToRawbits(inf_n);
17345 uint64_t d_inf_proc = DoubleToRawbits(kFP64DefaultNaN);
17346
17347 double za_inputs_6[] = {qa, qa, 0.0f, -0.0f, qa, sa};
17348 double zn_inputs_6[] = {inf, -0.0f, -0.0f, inf, inf_n, inf};
17349 double zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
17350
17351 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17352 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17353 // quiet_nan.
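// (Note, not from the original comment: in lane 5 the addend is a signalling
// NaN, which is processed before the (inf * 0.0) check, so sa is propagated
// (quietened) rather than replaced by the default NaN.)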
17354 uint64_t zd_expected_fmla_6[] =
17355 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17356 uint64_t zd_expected_fmls_6[] =
17357 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17358 uint64_t zd_expected_fnmla_6[] =
17359 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17360 uint64_t zd_expected_fnmls_6[] =
17361 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17362
17363 ProcessNaNsHelper3(config,
17364 kDRegSize,
17365 za_inputs_6,
17366 zn_inputs_6,
17367 zm_inputs_6,
17368 zd_expected_fmla_6,
17369 zd_expected_fmls_6,
17370 zd_expected_fnmla_6,
17371 zd_expected_fnmls_6,
17372 StrictNaNPropagation);
17373 }
17374
17375 TEST_SVE(sve_process_nans_float) {
17376 // Use non-standard NaNs to check that the payload bits are preserved.
17377 float sa = RawbitsToFloat(0x7f951111);
17378 float sn = RawbitsToFloat(0x7f952222);
17379 float sm = RawbitsToFloat(0x7f953333);
17380 float qa = RawbitsToFloat(0x7fea1111);
17381 float qn = RawbitsToFloat(0x7fea2222);
17382 float qm = RawbitsToFloat(0x7fea3333);
17383 VIXL_ASSERT(IsSignallingNaN(sa));
17384 VIXL_ASSERT(IsSignallingNaN(sn));
17385 VIXL_ASSERT(IsSignallingNaN(sm));
17386 VIXL_ASSERT(IsQuietNaN(qa));
17387 VIXL_ASSERT(IsQuietNaN(qn));
17388 VIXL_ASSERT(IsQuietNaN(qm));
17389
17390 // The input NaNs after passing through ProcessNaN.
17391 uint32_t sa_proc = 0x7fd51111;
17392 uint32_t sn_proc = 0x7fd52222;
17393 uint32_t sm_proc = 0x7fd53333;
17394 uint32_t qa_proc = FloatToRawbits(qa);
17395 uint32_t qn_proc = FloatToRawbits(qn);
17396 uint32_t qm_proc = FloatToRawbits(qm);
17397 uint32_t sa_proc_n = sa_proc ^ kSSignMask;
17398 uint32_t sn_proc_n = sn_proc ^ kSSignMask;
17399 uint32_t qa_proc_n = qa_proc ^ kSSignMask;
17400 uint32_t qn_proc_n = qn_proc ^ kSSignMask;
17401
17402 // Quiet NaNs are propagated.
17403 float zn_inputs_1[] = {qn, 0.0f, 0.0f, qm, qn, qm};
17404 float zm_inputs_1[] = {0.0f, qn, qm, 0.0f, qm, qn};
17405 uint64_t zd_expected_1[] =
17406 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17407
17408 ProcessNaNsHelper(config,
17409 kSRegSize,
17410 zn_inputs_1,
17411 zm_inputs_1,
17412 zd_expected_1,
17413 StrictNaNPropagation);
17414
17415 // Signalling NaNs are propagated.
17416 float zn_inputs_2[] = {sn, 0.0f, 0.0f, sm, sn, sm};
17417 float zm_inputs_2[] = {0.0f, sn, sm, 0.0f, sm, sn};
17418 uint64_t zd_expected_2[] =
17419 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17420 ProcessNaNsHelper(config,
17421 kSRegSize,
17422 zn_inputs_2,
17423 zm_inputs_2,
17424 zd_expected_2,
17425 StrictNaNPropagation);
17426
17427 // Signalling NaNs take precedence over quiet NaNs.
17428 float zn_inputs_3[] = {sn, qn, sn, sn, qn};
17429 float zm_inputs_3[] = {qm, sm, sm, qn, sn};
17430 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17431 ProcessNaNsHelper(config,
17432 kSRegSize,
17433 zn_inputs_3,
17434 zm_inputs_3,
17435 zd_expected_3,
17436 StrictNaNPropagation);
17437
17438 float za_inputs_4[] = {qa, qa, 0.0f, 0.0f, qa, qa};
17439 float zn_inputs_4[] = {qn, 0.0f, 0.0f, qn, qn, qn};
17440 float zm_inputs_4[] = {0.0f, qm, qm, qm, qm, 0.0f};
17441
17442 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17443 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17444 // If `m` is propagated, its sign is never inverted.
17445 uint64_t zd_expected_fmla_4[] =
17446 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17447 uint64_t zd_expected_fmls_4[] =
17448 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17449 uint64_t zd_expected_fnmla_4[] =
17450 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17451 uint64_t zd_expected_fnmls_4[] =
17452 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17453
17454 ProcessNaNsHelper3(config,
17455 kSRegSize,
17456 za_inputs_4,
17457 zn_inputs_4,
17458 zm_inputs_4,
17459 zd_expected_fmla_4,
17460 zd_expected_fmls_4,
17461 zd_expected_fnmla_4,
17462 zd_expected_fnmls_4,
17463 StrictNaNPropagation);
17464
17465 // Signalling NaNs take precedence over quiet NaNs.
17466 float za_inputs_5[] = {qa, qa, sa, sa, sa};
17467 float zn_inputs_5[] = {qn, sn, sn, sn, qn};
17468 float zm_inputs_5[] = {sm, qm, sm, qa, sm};
17469 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17470 uint64_t zd_expected_fmls_5[] = {sm_proc,
17471 sn_proc_n,
17472 sa_proc,
17473 sa_proc,
17474 sa_proc};
17475 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17476 sn_proc_n,
17477 sa_proc_n,
17478 sa_proc_n,
17479 sa_proc_n};
17480 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17481 sn_proc,
17482 sa_proc_n,
17483 sa_proc_n,
17484 sa_proc_n};
17485
17486 ProcessNaNsHelper3(config,
17487 kSRegSize,
17488 za_inputs_5,
17489 zn_inputs_5,
17490 zm_inputs_5,
17491 zd_expected_fmla_5,
17492 zd_expected_fmls_5,
17493 zd_expected_fnmla_5,
17494 zd_expected_fnmls_5,
17495 StrictNaNPropagation);
17496
17497 const float inf = kFP32PositiveInfinity;
17498 const float inf_n = kFP32NegativeInfinity;
17499 uint32_t inf_proc = FloatToRawbits(inf);
17500 uint32_t inf_proc_n = FloatToRawbits(inf_n);
17501 uint32_t d_inf_proc = FloatToRawbits(kFP32DefaultNaN);
17502
17503 float za_inputs_6[] = {qa, qa, 0.0f, 0.0f, qa, sa};
17504 float zn_inputs_6[] = {inf, 0.0f, 0.0f, inf, inf_n, inf};
17505 float zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
17506
17507 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17508 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17509 // quiet_nan.
17510 uint64_t zd_expected_fmla_6[] =
17511 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17512 uint64_t zd_expected_fmls_6[] =
17513 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17514 uint64_t zd_expected_fnmla_6[] =
17515 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17516 uint64_t zd_expected_fnmls_6[] =
17517 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17518
17519 ProcessNaNsHelper3(config,
17520 kSRegSize,
17521 za_inputs_6,
17522 zn_inputs_6,
17523 zm_inputs_6,
17524 zd_expected_fmla_6,
17525 zd_expected_fmls_6,
17526 zd_expected_fnmla_6,
17527 zd_expected_fnmls_6,
17528 StrictNaNPropagation);
17529 }
17530
17531 TEST_SVE(sve_process_nans_half) {
17532 // Use non-standard NaNs to check that the payload bits are preserved.
17533 Float16 sa(RawbitsToFloat16(0x7c11));
17534 Float16 sn(RawbitsToFloat16(0x7c22));
17535 Float16 sm(RawbitsToFloat16(0x7c33));
17536 Float16 qa(RawbitsToFloat16(0x7e44));
17537 Float16 qn(RawbitsToFloat16(0x7e55));
17538 Float16 qm(RawbitsToFloat16(0x7e66));
17539 VIXL_ASSERT(IsSignallingNaN(sa));
17540 VIXL_ASSERT(IsSignallingNaN(sn));
17541 VIXL_ASSERT(IsSignallingNaN(sm));
17542 VIXL_ASSERT(IsQuietNaN(qa));
17543 VIXL_ASSERT(IsQuietNaN(qn));
17544 VIXL_ASSERT(IsQuietNaN(qm));
17545
17546 // The input NaNs after passing through ProcessNaN.
17547 uint16_t sa_proc = 0x7e11;
17548 uint16_t sn_proc = 0x7e22;
17549 uint16_t sm_proc = 0x7e33;
17550 uint16_t qa_proc = Float16ToRawbits(qa);
17551 uint16_t qn_proc = Float16ToRawbits(qn);
17552 uint16_t qm_proc = Float16ToRawbits(qm);
17553 uint16_t sa_proc_n = sa_proc ^ kHSignMask;
17554 uint16_t sn_proc_n = sn_proc ^ kHSignMask;
17555 uint16_t qa_proc_n = qa_proc ^ kHSignMask;
17556 uint16_t qn_proc_n = qn_proc ^ kHSignMask;
17557 Float16 zero(0.0);
17558
17559 // Quiet NaNs are propagated.
17560 Float16 zn_inputs_1[] = {qn, zero, zero, qm, qn, qm};
17561 Float16 zm_inputs_1[] = {zero, qn, qm, zero, qm, qn};
17562 uint64_t zd_expected_1[] =
17563 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17564
17565 ProcessNaNsHelper(config,
17566 kHRegSize,
17567 zn_inputs_1,
17568 zm_inputs_1,
17569 zd_expected_1,
17570 StrictNaNPropagation);
17571
17572 // Signalling NaNs are propagated.
17573 Float16 zn_inputs_2[] = {sn, zero, zero, sm, sn, sm};
17574 Float16 zm_inputs_2[] = {zero, sn, sm, zero, sm, sn};
17575 uint64_t zd_expected_2[] =
17576 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17577 ProcessNaNsHelper(config,
17578 kHRegSize,
17579 zn_inputs_2,
17580 zm_inputs_2,
17581 zd_expected_2,
17582 StrictNaNPropagation);
17583
17584 // Signalling NaNs take precedence over quiet NaNs.
17585 Float16 zn_inputs_3[] = {sn, qn, sn, sn, qn};
17586 Float16 zm_inputs_3[] = {qm, sm, sm, qn, sn};
17587 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17588 ProcessNaNsHelper(config,
17589 kHRegSize,
17590 zn_inputs_3,
17591 zm_inputs_3,
17592 zd_expected_3,
17593 StrictNaNPropagation);
17594
17595 Float16 za_inputs_4[] = {qa, qa, zero, zero, qa, qa};
17596 Float16 zn_inputs_4[] = {qn, zero, zero, qn, qn, qn};
17597 Float16 zm_inputs_4[] = {zero, qm, qm, qm, qm, zero};
17598
17599 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17600 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17601 // If `m` is propagated, its sign is never inverted.
17602 uint64_t zd_expected_fmla_4[] =
17603 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17604 uint64_t zd_expected_fmls_4[] =
17605 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17606 uint64_t zd_expected_fnmla_4[] =
17607 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17608 uint64_t zd_expected_fnmls_4[] =
17609 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17610
17611 ProcessNaNsHelper3(config,
17612 kHRegSize,
17613 za_inputs_4,
17614 zn_inputs_4,
17615 zm_inputs_4,
17616 zd_expected_fmla_4,
17617 zd_expected_fmls_4,
17618 zd_expected_fnmla_4,
17619 zd_expected_fnmls_4,
17620 StrictNaNPropagation);
17621
17622 // Signalling NaNs take precedence over quiet NaNs.
17623 Float16 za_inputs_5[] = {qa, qa, sa, sa, sa};
17624 Float16 zn_inputs_5[] = {qn, sn, sn, sn, qn};
17625 Float16 zm_inputs_5[] = {sm, qm, sm, qa, sm};
17626 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17627 uint64_t zd_expected_fmls_5[] = {sm_proc,
17628 sn_proc_n,
17629 sa_proc,
17630 sa_proc,
17631 sa_proc};
17632 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17633 sn_proc_n,
17634 sa_proc_n,
17635 sa_proc_n,
17636 sa_proc_n};
17637 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17638 sn_proc,
17639 sa_proc_n,
17640 sa_proc_n,
17641 sa_proc_n};
17642
17643 ProcessNaNsHelper3(config,
17644 kHRegSize,
17645 za_inputs_5,
17646 zn_inputs_5,
17647 zm_inputs_5,
17648 zd_expected_fmla_5,
17649 zd_expected_fmls_5,
17650 zd_expected_fnmla_5,
17651 zd_expected_fnmls_5,
17652 StrictNaNPropagation);
17653
17654 const Float16 inf = kFP16PositiveInfinity;
17655 const Float16 inf_n = kFP16NegativeInfinity;
17656 uint64_t inf_proc = Float16ToRawbits(inf);
17657 uint64_t inf_proc_n = Float16ToRawbits(inf_n);
17658 uint64_t d_inf_proc = Float16ToRawbits(kFP16DefaultNaN);
17659
17660 Float16 za_inputs_6[] = {qa, qa, zero, zero, qa, sa};
17661 Float16 zn_inputs_6[] = {inf, zero, zero, inf, inf_n, inf};
17662 Float16 zm_inputs_6[] = {zero, inf_n, inf, inf, inf, zero};
17663
17664 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17665 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17666 // quiet_nan.
17667 uint64_t zd_expected_fmla_6[] =
17668 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17669 uint64_t zd_expected_fmls_6[] =
17670 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17671 uint64_t zd_expected_fnmla_6[] =
17672 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17673 uint64_t zd_expected_fnmls_6[] =
17674 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17675
17676 ProcessNaNsHelper3(config,
17677 kHRegSize,
17678 za_inputs_6,
17679 zn_inputs_6,
17680 zm_inputs_6,
17681 zd_expected_fmla_6,
17682 zd_expected_fmls_6,
17683 zd_expected_fnmla_6,
17684 zd_expected_fnmls_6,
17685 StrictNaNPropagation);
17686 }
17687
17688 typedef void (MacroAssembler::*FCmpFn)(const PRegisterWithLaneSize& pd,
17689 const PRegisterZ& pg,
17690 const ZRegister& zn,
17691 const ZRegister& zm);
17692
17693 typedef void (MacroAssembler::*FCmpZeroFn)(const PRegisterWithLaneSize& pd,
17694 const PRegisterZ& pg,
17695 const ZRegister& zn,
17696 double zero);
17697
17698 typedef void (MacroAssembler::*CmpFn)(const PRegisterWithLaneSize& pd,
17699 const PRegisterZ& pg,
17700 const ZRegister& zn,
17701 const ZRegister& zm);
17702
17703 static FCmpFn GetFpAbsCompareFn(Condition cond) {
17704 switch (cond) {
17705 case ge:
17706 return &MacroAssembler::Facge;
17707 case gt:
17708 return &MacroAssembler::Facgt;
17709 case le:
17710 return &MacroAssembler::Facle;
17711 case lt:
17712 return &MacroAssembler::Faclt;
17713 default:
17714 VIXL_UNIMPLEMENTED();
17715 return NULL;
17716 }
17717 }
17718
17719 static FCmpFn GetFpCompareFn(Condition cond) {
17720 switch (cond) {
17721 case ge:
17722 return &MacroAssembler::Fcmge;
17723 case gt:
17724 return &MacroAssembler::Fcmgt;
17725 case le:
17726 return &MacroAssembler::Fcmle;
17727 case lt:
17728 return &MacroAssembler::Fcmlt;
17729 case eq:
17730 return &MacroAssembler::Fcmeq;
17731 case ne:
17732 return &MacroAssembler::Fcmne;
17733 case uo:
17734 return &MacroAssembler::Fcmuo;
17735 default:
17736 VIXL_UNIMPLEMENTED();
17737 return NULL;
17738 }
17739 }
17740
17741 static FCmpZeroFn GetFpCompareZeroFn(Condition cond) {
17742 switch (cond) {
17743 case ge:
17744 return &MacroAssembler::Fcmge;
17745 case gt:
17746 return &MacroAssembler::Fcmgt;
17747 case le:
17748 return &MacroAssembler::Fcmle;
17749 case lt:
17750 return &MacroAssembler::Fcmlt;
17751 case eq:
17752 return &MacroAssembler::Fcmeq;
17753 case ne:
17754 return &MacroAssembler::Fcmne;
17755 default:
17756 VIXL_UNIMPLEMENTED();
17757 return NULL;
17758 }
17759 }
17760
17761 static CmpFn GetIntCompareFn(Condition cond) {
17762 switch (cond) {
17763 case ge:
17764 return &MacroAssembler::Cmpge;
17765 case gt:
17766 return &MacroAssembler::Cmpgt;
17767 case le:
17768 return &MacroAssembler::Cmple;
17769 case lt:
17770 return &MacroAssembler::Cmplt;
17771 case eq:
17772 return &MacroAssembler::Cmpeq;
17773 case ne:
17774 return &MacroAssembler::Cmpne;
17775 default:
17776 VIXL_UNIMPLEMENTED();
17777 return NULL;
17778 }
17779 }
17780
17781 template <size_t N>
17782 static void TestFpCompareHelper(Test* config,
17783 int lane_size_in_bits,
17784 Condition cond,
17785 const double (&zn_inputs)[N],
17786 const double (&zm_inputs)[N],
17787 const int (&pd_expected)[N],
17788 bool is_absolute = false) {
17789 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17790 START();
17791
17792 ZRegister zt_int_1 = z1.WithLaneSize(lane_size_in_bits);
17793 ZRegister zt_int_2 = z2.WithLaneSize(lane_size_in_bits);
17794 ZRegister zt_int_3 = z3.WithLaneSize(lane_size_in_bits);
17795 ZRegister zt_fp_1 = z11.WithLaneSize(lane_size_in_bits);
17796 ZRegister zt_fp_2 = z12.WithLaneSize(lane_size_in_bits);
17797 ZRegister zt_fp_3 = z13.WithLaneSize(lane_size_in_bits);
17798 ZRegister fp_one = z31.WithLaneSize(lane_size_in_bits);
17799
17800 PRegisterWithLaneSize pd_result_int_1 = p15.WithLaneSize(lane_size_in_bits);
17801 PRegisterWithLaneSize pd_result_fp_1 = p14.WithLaneSize(lane_size_in_bits);
17802 PRegisterWithLaneSize pd_result_int_2 = p13.WithLaneSize(lane_size_in_bits);
17803 PRegisterWithLaneSize pd_result_fp_2 = p12.WithLaneSize(lane_size_in_bits);
17804
17805 FCmpFn fcmp = is_absolute ? GetFpAbsCompareFn(cond) : GetFpCompareFn(cond);
17806 __ Ptrue(p1.VnB());
17807
17808 if (cond != uo) {
17809 int pg_inputs[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1};
17810 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
17811
17812 __ Fdup(fp_one, 0.1f);
17813
17814 __ Index(zt_int_1, 3, 3);
17815 __ Scvtf(zt_fp_1, p0.Merging(), zt_int_1);
17816 __ Fadd(zt_fp_1, zt_fp_1, fp_one);
17817
17818 __ Index(zt_int_2, 3, -10);
17819 __ Scvtf(zt_fp_2, p0.Merging(), zt_int_2);
17820 __ Fadd(zt_fp_2, zt_fp_2, fp_one);
17821
17822 __ Index(zt_int_3, 3, 2);
17823 __ Scvtf(zt_fp_3, p0.Merging(), zt_int_3);
17824 __ Fadd(zt_fp_3, zt_fp_3, fp_one);
17825
17826
17827 // There is no absolute comparison for integer types, so use `abs` with
17828 // `cmp<cc>` to synthesize the expected result for `fac<cc>`.
17829 if (is_absolute == true) {
17830 __ Abs(zt_int_2, p1.Merging(), zt_int_2);
17831 }
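// (Note, not from the original test: only zt_int_2 needs Abs here; zt_int_1
// and zt_int_3 are generated with non-negative strides from 3, so their lanes
// are already non-negative for the vector lengths exercised.)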
17832
17833 CmpFn cmp = GetIntCompareFn(cond);
17834 (masm.*cmp)(pd_result_int_1, p0.Zeroing(), zt_int_1, zt_int_2);
17835 (masm.*fcmp)(pd_result_fp_1, p0.Zeroing(), zt_fp_1, zt_fp_2);
17836
17837 (masm.*cmp)(pd_result_int_2, p0.Zeroing(), zt_int_1, zt_int_3);
17838 (masm.*fcmp)(pd_result_fp_2, p0.Zeroing(), zt_fp_1, zt_fp_3);
17839 }
17840
17841 uint64_t zn_inputs_rawbits[N];
17842 uint64_t zm_inputs_rawbits[N];
17843 FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
17844 FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
17845
17846 ZRegister zn_fp = z14.WithLaneSize(lane_size_in_bits);
17847 ZRegister zm_fp = z15.WithLaneSize(lane_size_in_bits);
17848 InsrHelper(&masm, zn_fp, zn_inputs_rawbits);
17849 InsrHelper(&masm, zm_fp, zm_inputs_rawbits);
17850
17851 PRegisterWithLaneSize pd_result_fp_3 = p11.WithLaneSize(lane_size_in_bits);
17852 (masm.*fcmp)(pd_result_fp_3, p1.Zeroing(), zn_fp, zm_fp);
17853
17854 END();
17855
17856 if (CAN_RUN()) {
17857 RUN();
17858
17859 if (cond != uo) {
17860 ASSERT_EQUAL_SVE(pd_result_int_1, pd_result_fp_1);
17861 ASSERT_EQUAL_SVE(pd_result_int_2, pd_result_fp_2);
17862 }
17863 ASSERT_EQUAL_SVE(pd_expected, pd_result_fp_3);
17864 }
17865 }
17866
17867 TEST_SVE(sve_fp_compare_vectors) {
17868 double inf_p = kFP64PositiveInfinity;
17869 double inf_n = kFP64NegativeInfinity;
17870 double nan = kFP64DefaultNaN;
17871
17872 // Normal floating point comparison has been tested in the helper.
17873 double zn[] = {0.0, inf_n, 1.0, inf_p, inf_p, nan, 0.0, nan};
17874 double zm[] = {-0.0, inf_n, inf_n, -2.0, inf_n, nan, nan, inf_p};
17875
17876 int pd_fcm_gt[] = {0, 0, 1, 1, 1, 0, 0, 0};
17877 int pd_fcm_lt[] = {0, 0, 0, 0, 0, 0, 0, 0};
17878 int pd_fcm_ge[] = {1, 1, 1, 1, 1, 0, 0, 0};
17879 int pd_fcm_le[] = {1, 1, 0, 0, 0, 0, 0, 0};
17880 int pd_fcm_eq[] = {1, 1, 0, 0, 0, 0, 0, 0};
17881 int pd_fcm_ne[] = {0, 0, 1, 1, 1, 1, 1, 1};
17882 int pd_fcm_uo[] = {0, 0, 0, 0, 0, 1, 1, 1};
17883 int pd_fac_gt[] = {0, 0, 0, 1, 0, 0, 0, 0};
17884 int pd_fac_lt[] = {0, 0, 1, 0, 0, 0, 0, 0};
17885 int pd_fac_ge[] = {1, 1, 0, 1, 1, 0, 0, 0};
17886 int pd_fac_le[] = {1, 1, 1, 0, 1, 0, 0, 0};
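// (Note, not from the original test: the last three lanes involve NaN
// operands, which compare as unordered, so Fcm{gt,lt,ge,le,eq} and
// Fac{gt,lt,ge,le} are false there, while Fcmne and Fcmuo are true.)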
17887
17888 int lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
17889
17890 for (size_t i = 0; i < ArrayLength(lane_sizes); i++) {
17891 int lane_size = lane_sizes[i];
17892 // Test floating-point compare vectors.
17893 TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fcm_gt);
17894 TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fcm_lt);
17895 TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fcm_ge);
17896 TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fcm_le);
17897 TestFpCompareHelper(config, lane_size, eq, zn, zm, pd_fcm_eq);
17898 TestFpCompareHelper(config, lane_size, ne, zn, zm, pd_fcm_ne);
17899 TestFpCompareHelper(config, lane_size, uo, zn, zm, pd_fcm_uo);
17900
17901 // Test floating-point absolute compare vectors.
17902 TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fac_gt, true);
17903 TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fac_lt, true);
17904 TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fac_ge, true);
17905 TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fac_le, true);
17906 }
17907 }
17908
17909 template <size_t N, typename T>
17910 static void TestFpCompareZeroHelper(Test* config,
17911 int lane_size_in_bits,
17912 Condition cond,
17913 const T (&zn_inputs)[N],
17914 const int (&pd_expected)[N]) {
17915 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17916 START();
17917
17918 ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
17919 PRegisterWithLaneSize pd = p14.WithLaneSize(lane_size_in_bits);
17920
17921 uint64_t zn_rawbits[N];
17922 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
17923 InsrHelper(&masm, zn, zn_rawbits);
17924
17925 __ Ptrue(p0.VnB());
17926 (masm.*GetFpCompareZeroFn(cond))(pd, p0.Zeroing(), zn, 0.0);
17927
17928 END();
17929
17930 if (CAN_RUN()) {
17931 RUN();
17932
17933 ASSERT_EQUAL_SVE(pd_expected, pd);
17934 }
17935 }
17936
17937 TEST_SVE(sve_fp_compare_vector_zero) {
17938 Float16 fp16_inf_p = kFP16PositiveInfinity;
17939 Float16 fp16_inf_n = kFP16NegativeInfinity;
17940 Float16 fp16_dn = kFP16DefaultNaN;
17941 Float16 fp16_sn = RawbitsToFloat16(0x7c22);
17942 Float16 fp16_qn = RawbitsToFloat16(0x7e55);
17943
17944 float fp32_inf_p = kFP32PositiveInfinity;
17945 float fp32_inf_n = kFP32NegativeInfinity;
17946 float fp32_dn = kFP32DefaultNaN;
17947 float fp32_sn = RawbitsToFloat(0x7f952222);
17948 float fp32_qn = RawbitsToFloat(0x7fea2222);
17949
17950 double fp64_inf_p = kFP64PositiveInfinity;
17951 double fp64_inf_n = kFP64NegativeInfinity;
17952 double fp64_dn = kFP64DefaultNaN;
17953 double fp64_sn = RawbitsToDouble(0x7ff5555511111111);
17954 double fp64_qn = RawbitsToDouble(0x7ffaaaaa11111111);
17955
17956 // Normal floating point comparison has been tested in the non-zero form.
17957 Float16 zn_inputs_h[] = {Float16(0.0),
17958 Float16(-0.0),
17959 fp16_inf_p,
17960 fp16_inf_n,
17961 fp16_dn,
17962 fp16_sn,
17963 fp16_qn};
17964 float zn_inputs_s[] =
17965 {0.0, -0.0, fp32_inf_p, fp32_inf_n, fp32_dn, fp32_sn, fp32_qn};
17966 double zn_inputs_d[] =
17967 {0.0, -0.0, fp64_inf_p, fp64_inf_n, fp64_dn, fp64_sn, fp64_qn};
17968
17969 int pd_expected_gt[] = {0, 0, 1, 0, 0, 0, 0};
17970 int pd_expected_lt[] = {0, 0, 0, 1, 0, 0, 0};
17971 int pd_expected_ge[] = {1, 1, 1, 0, 0, 0, 0};
17972 int pd_expected_le[] = {1, 1, 0, 1, 0, 0, 0};
17973 int pd_expected_eq[] = {1, 1, 0, 0, 0, 0, 0};
17974 int pd_expected_ne[] = {0, 0, 1, 1, 1, 1, 1};
17975
17976 TestFpCompareZeroHelper(config, kDRegSize, gt, zn_inputs_d, pd_expected_gt);
17977 TestFpCompareZeroHelper(config, kDRegSize, lt, zn_inputs_d, pd_expected_lt);
17978 TestFpCompareZeroHelper(config, kDRegSize, ge, zn_inputs_d, pd_expected_ge);
17979 TestFpCompareZeroHelper(config, kDRegSize, le, zn_inputs_d, pd_expected_le);
17980 TestFpCompareZeroHelper(config, kDRegSize, eq, zn_inputs_d, pd_expected_eq);
17981 TestFpCompareZeroHelper(config, kDRegSize, ne, zn_inputs_d, pd_expected_ne);
17982
17983 TestFpCompareZeroHelper(config, kSRegSize, gt, zn_inputs_s, pd_expected_gt);
17984 TestFpCompareZeroHelper(config, kSRegSize, lt, zn_inputs_s, pd_expected_lt);
17985 TestFpCompareZeroHelper(config, kSRegSize, ge, zn_inputs_s, pd_expected_ge);
17986 TestFpCompareZeroHelper(config, kSRegSize, le, zn_inputs_s, pd_expected_le);
17987 TestFpCompareZeroHelper(config, kSRegSize, eq, zn_inputs_s, pd_expected_eq);
17988 TestFpCompareZeroHelper(config, kSRegSize, ne, zn_inputs_s, pd_expected_ne);
17989
17990 TestFpCompareZeroHelper(config, kHRegSize, gt, zn_inputs_h, pd_expected_gt);
17991 TestFpCompareZeroHelper(config, kHRegSize, lt, zn_inputs_h, pd_expected_lt);
17992 TestFpCompareZeroHelper(config, kHRegSize, ge, zn_inputs_h, pd_expected_ge);
17993 TestFpCompareZeroHelper(config, kHRegSize, le, zn_inputs_h, pd_expected_le);
17994 TestFpCompareZeroHelper(config, kHRegSize, eq, zn_inputs_h, pd_expected_eq);
17995 TestFpCompareZeroHelper(config, kHRegSize, ne, zn_inputs_h, pd_expected_ne);
17996 }
17997
17998 typedef void (MacroAssembler::*FPUnaryMFn)(const ZRegister& zd,
17999 const PRegisterM& pg,
18000 const ZRegister& zn);
18001
18002 typedef void (MacroAssembler::*FPUnaryZFn)(const ZRegister& zd,
18003 const PRegisterZ& pg,
18004 const ZRegister& zn);
18005
18006 template <size_t N, size_t M>
18007 static void TestFPUnaryPredicatedHelper(Test* config,
18008 int src_size_in_bits,
18009 int dst_size_in_bits,
18010 uint64_t (&zn_inputs)[N],
18011 const uint64_t (&pg_inputs)[M],
18012 const uint64_t (&zd_expected)[N],
18013 FPUnaryMFn macro_m,
18014 FPUnaryZFn macro_z) {
18015 // Provide the full predicate input.
18016 VIXL_ASSERT(M == (kPRegMaxSize / kDRegSize));
18017 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18018 START();
18019
18020 int ds = dst_size_in_bits;
18021 int ss = src_size_in_bits;
18022 int ls = std::max(ss, ds);
18023
18024 // When the destination type is larger than the source type, fill the bits
18025 // above each source element with noise values, which should be ignored.
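// (Illustrative note, not from the original comment: for an H-to-S conversion
// ss is 16, so bits [31:16] of each 32-bit lane receive sn + i, a distinct
// noise pattern per lane, while the conversion only reads bits [15:0].)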
18026 if (ds > ss) {
18027 VIXL_ASSERT(ss < 64);
18028 uint64_t zn_inputs_mod[N];
18029 uint64_t sn = GetSignallingNan(ss);
18030 for (unsigned i = 0; i < N; i++) {
18031 zn_inputs_mod[i] = zn_inputs[i] | ((sn + i) << ss);
18032 }
18033 InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs_mod);
18034 } else {
18035 InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs);
18036 }
18037
18038 // Make a copy so we can check that constructive operations preserve zn.
18039 __ Mov(z28, z29);
18040
18041 // Run the operation on all lanes.
18042 __ Ptrue(p0.WithLaneSize(ls));
18043 (masm.*macro_m)(z27.WithLaneSize(ds), p0.Merging(), z28.WithLaneSize(ss));
18044
18045 Initialise(&masm,
18046 p1.VnB(),
18047 pg_inputs[3],
18048 pg_inputs[2],
18049 pg_inputs[1],
18050 pg_inputs[0]);
18051
18052 // Clear the irrelevant lanes.
18053 __ Index(z31.WithLaneSize(ls), 0, 1);
18054 __ Cmplt(p1.WithLaneSize(ls), p1.Zeroing(), z31.WithLaneSize(ls), N);
18055
18056 // Check merging predication.
18057 __ Index(z11.WithLaneSize(ls), 42, 1);
18058 // Preserve the base value so we can derive the expected result.
18059 __ Mov(z21, z11);
18060 __ Mov(z9, z11);
18061 (masm.*macro_m)(z11.WithLaneSize(ds), p1.Merging(), z28.WithLaneSize(ss));
18062
18063 // Generate expected values using explicit merging operations.
18064 InsrHelper(&masm, z25.WithLaneSize(ls), zd_expected);
18065 __ Mov(z21.WithLaneSize(ls), p1.Merging(), z25.WithLaneSize(ls));
18066
18067 // Check zeroing predication.
18068 __ Index(z12.WithLaneSize(ds), 42, -1);
18069 (masm.*macro_z)(z12.WithLaneSize(ds), p1.Zeroing(), z28.WithLaneSize(ss));
18070
18071 // Generate expected values using explicit zeroing operations.
18072 InsrHelper(&masm, z30.WithLaneSize(ls), zd_expected);
18073 // Emulate zeroing predication.
18074 __ Dup(z22.WithLaneSize(ls), 0);
18075 __ Mov(z22.WithLaneSize(ls), p1.Merging(), z30.WithLaneSize(ls));
18076
18077 // Check an in-place update.
18078 __ Mov(z9.WithLaneSize(ls), p1.Merging(), z28.WithLaneSize(ls));
18079 (masm.*macro_m)(z9.WithLaneSize(ds), p1.Merging(), z9.WithLaneSize(ss));
18080
18081 END();
18082
18083 if (CAN_RUN()) {
18084 RUN();
18085
18086 // Check all lanes.
18087 ASSERT_EQUAL_SVE(zd_expected, z27.WithLaneSize(ls));
18088
18089 // Check that constructive operations preserve their inputs.
18090 ASSERT_EQUAL_SVE(z28, z29);
18091
18092 // Check merging predication.
18093 ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z11.WithLaneSize(ls));
18094
18095 // Check zeroing predication.
18096 ASSERT_EQUAL_SVE(z22.WithLaneSize(ls), z12.WithLaneSize(ls));
18097
18098 // Check in-place operation where zd == zn.
18099 ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z9.WithLaneSize(ls));
18100 }
18101 }
18102
18103 template <size_t N, typename T>
18104 static void TestFPUnaryPredicatedHelper(Test* config,
18105 int src_size_in_bits,
18106 int dst_size_in_bits,
18107 T (&zn_inputs)[N],
18108 const T (&zd_expected)[N],
18109 FPUnaryMFn macro_m,
18110 FPUnaryZFn macro_z) {
18111 uint64_t pg_inputs[] = {0xa55aa55aa55aa55a,
18112 0xa55aa55aa55aa55a,
18113 0xa55aa55aa55aa55a,
18114 0xa55aa55aa55aa55a};
18115
18116 TestFPUnaryPredicatedHelper(config,
18117 src_size_in_bits,
18118 dst_size_in_bits,
18119 zn_inputs,
18120 pg_inputs,
18121 zd_expected,
18122 macro_m,
18123 macro_z);
18124
18125 // The complement of the above predicate, to get full input coverage.
18126 uint64_t pg_c_inputs[] = {0x5aa55aa55aa55aa5,
18127 0x5aa55aa55aa55aa5,
18128 0x5aa55aa55aa55aa5,
18129 0x5aa55aa55aa55aa5};
18130
18131 TestFPUnaryPredicatedHelper(config,
18132 src_size_in_bits,
18133 dst_size_in_bits,
18134 zn_inputs,
18135 pg_c_inputs,
18136 zd_expected,
18137 macro_m,
18138 macro_z);
18139 }
18140
18141 template <size_t N, typename T>
18142 static void TestFcvtHelper(Test* config,
18143 int src_size_in_bits,
18144 int dst_size_in_bits,
18145 T (&zn_inputs)[N],
18146 const T (&zd_expected)[N]) {
18147 TestFPUnaryPredicatedHelper(config,
18148 src_size_in_bits,
18149 dst_size_in_bits,
18150 zn_inputs,
18151 zd_expected,
18152 &MacroAssembler::Fcvt, // Merging form.
18153 &MacroAssembler::Fcvt); // Zeroing form.
18154 }
18155
18156 TEST_SVE(sve_fcvt) {
18157 uint64_t h_vals[] = {0x7c00,
18158 0xfc00,
18159 0,
18160 0x8000,
18161 0x7bff, // Max half precision.
18162 0x0400, // Min positive normal.
18163 0x03ff, // Max subnormal.
18164 0x0001}; // Min positive subnormal.
18165
18166 uint64_t s_vals[] = {0x7f800000,
18167 0xff800000,
18168 0,
18169 0x80000000,
18170 0x477fe000,
18171 0x38800000,
18172 0x387fc000,
18173 0x33800000};
18174
18175 uint64_t d_vals[] = {0x7ff0000000000000,
18176 0xfff0000000000000,
18177 0,
18178 0x8000000000000000,
18179 0x40effc0000000000,
18180 0x3f10000000000000,
18181 0x3f0ff80000000000,
18182 0x3e70000000000000};
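// (Note, not from the original test: each row of h_vals, s_vals and d_vals is
// the same value, exactly representable in all three formats, so the arrays
// serve as both inputs and expected outputs in either conversion direction.)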
18183
18184 TestFcvtHelper(config, kHRegSize, kSRegSize, h_vals, s_vals);
18185 TestFcvtHelper(config, kSRegSize, kHRegSize, s_vals, h_vals);
18186 TestFcvtHelper(config, kSRegSize, kDRegSize, s_vals, d_vals);
18187 TestFcvtHelper(config, kDRegSize, kSRegSize, d_vals, s_vals);
18188 TestFcvtHelper(config, kHRegSize, kDRegSize, h_vals, d_vals);
18189 TestFcvtHelper(config, kDRegSize, kHRegSize, d_vals, h_vals);
18190 }
18191
18192 TEST_SVE(sve_fcvt_nan) {
18193 uint64_t h_inputs[] = {0x7e55, // Quiet NaN.
18194 0x7c22}; // Signalling NaN.
18195
18196 uint64_t h2s_expected[] = {0x7fcaa000, 0x7fc44000};
18197
18198 uint64_t h2d_expected[] = {0x7ff9540000000000, 0x7ff8880000000000};
18199
18200 uint64_t s_inputs[] = {0x7fc12345, // Quiet NaN.
18201 0x7f812345}; // Signalling NaN.
18202
18203 uint64_t s2h_expected[] = {0x7e09, 0x7e09};
18204
18205 uint64_t s2d_expected[] = {0x7ff82468a0000000, 0x7ff82468a0000000};
18206
18207 uint64_t d_inputs[] = {0x7ffaaaaa22222222, // Quiet NaN.
18208 0x7ff5555511111111}; // Signalling NaN.
18209
18210 uint64_t d2h_expected[] = {0x7eaa, 0x7f55};
18211
18212 uint64_t d2s_expected[] = {0x7fd55551, 0x7feaaaa8};
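// (Note, not from the original test: widening a NaN left-aligns its payload
// in the new fraction field and zero-fills the rest, narrowing keeps only the
// most significant fraction bits, and signalling NaNs are quietened first,
// e.g. 0x7e55 (H) -> 0x7fcaa000 (S) and 0x7fc12345 (S) -> 0x7e09 (H).)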
18213
18214 TestFcvtHelper(config, kHRegSize, kSRegSize, h_inputs, h2s_expected);
18215 TestFcvtHelper(config, kSRegSize, kHRegSize, s_inputs, s2h_expected);
18216 TestFcvtHelper(config, kHRegSize, kDRegSize, h_inputs, h2d_expected);
18217 TestFcvtHelper(config, kDRegSize, kHRegSize, d_inputs, d2h_expected);
18218 TestFcvtHelper(config, kSRegSize, kDRegSize, s_inputs, s2d_expected);
18219 TestFcvtHelper(config, kDRegSize, kSRegSize, d_inputs, d2s_expected);
18220 }
18221
18222 template <size_t N, typename T>
18223 static void TestFrecpxHelper(Test* config,
18224 int lane_size_in_bits,
18225 T (&zn_inputs)[N],
18226 const T (&zd_expected)[N]) {
18227 TestFPUnaryPredicatedHelper(config,
18228 lane_size_in_bits,
18229 lane_size_in_bits,
18230 zn_inputs,
18231 zd_expected,
18232 &MacroAssembler::Frecpx, // Merging form.
18233 &MacroAssembler::Frecpx); // Zeroing form.
18234 }
18235
18236 TEST_SVE(sve_frecpx_h) {
18237 uint64_t zn_inputs[] = {Float16ToRawbits(kFP16PositiveInfinity),
18238 Float16ToRawbits(kFP16NegativeInfinity),
18239 Float16ToRawbits(Float16(0.0)),
18240 Float16ToRawbits(Float16(-0.0)),
18241 0x0001, // Smallest positive subnormal number.
18242 0x03ff, // Largest subnormal number.
18243 0x0400, // Smallest positive normal number.
18244 0x7bff, // Largest normal number.
18245 0x3bff, // Largest number less than one.
18246 0x3c01, // Smallest number larger than one.
18247 0x7c22, // Signalling NaN.
18248 0x7e55}; // Quiet NaN.
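// (Note, not from the original test: frecpx clears the fraction and, for
// inputs with a non-zero exponent field, bitwise-inverts that field, e.g.
// 0x3bff (exponent 0b01110) -> 0x4400 (exponent 0b10001). Inputs with a zero
// exponent field (zeros and subnormals) map to 0x7800/0xf800, and NaNs are
// quietened.)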
18249
18250 uint64_t zd_expected[] = {0,
18251 0x8000,
18252 0x7800,
18253 0xf800,
18254 // The exponent field of subnormal numbers is zero.
18255 0x7800,
18256 0x7800,
18257 0x7800,
18258 0x0400,
18259 0x4400,
18260 0x4000,
18261 0x7e22, // To quiet NaN.
18262 0x7e55};
18263
18264 TestFrecpxHelper(config, kHRegSize, zn_inputs, zd_expected);
18265 }
18266
18267 TEST_SVE(sve_frecpx_s) {
18268 uint64_t zn_inputs[] = {FloatToRawbits(kFP32PositiveInfinity),
18269 FloatToRawbits(kFP32NegativeInfinity),
18270 FloatToRawbits(65504), // Max half precision.
18271 FloatToRawbits(6.10352e-5), // Min positive normal.
18272 FloatToRawbits(6.09756e-5), // Max subnormal.
18273 FloatToRawbits(
18274 5.96046e-8), // Min positive subnormal.
18275 FloatToRawbits(5e-9), // Not representable -> zero.
18276 FloatToRawbits(-0.0),
18277 FloatToRawbits(0.0),
18278 0x7f952222, // Signalling NaN.
18279 0x7fea2222}; // Quiet NaN.
18280
18281 uint64_t zd_expected[] = {0, // 0.0
18282 0x80000000, // -0.0
18283 0x38800000, // 6.10352e-05
18284 0x47000000, // 32768
18285 0x47800000, // 65536
18286 0x4c800000, // 6.71089e+07
18287 0x4e000000, // 5.36871e+08
18288 0xff000000, // -1.70141e+38
18289 0x7f000000, // 1.70141e+38
18290 0x7fd52222,
18291 0x7fea2222};
18292
18293 TestFrecpxHelper(config, kSRegSize, zn_inputs, zd_expected);
18294 }
18295
18296 TEST_SVE(sve_frecpx_d) {
18297 uint64_t zn_inputs[] = {DoubleToRawbits(kFP64PositiveInfinity),
18298 DoubleToRawbits(kFP64NegativeInfinity),
18299 DoubleToRawbits(65504), // Max half precision.
18300 DoubleToRawbits(6.10352e-5), // Min positive normal.
18301 DoubleToRawbits(6.09756e-5), // Max subnormal.
18302 DoubleToRawbits(
18303 5.96046e-8), // Min positive subnormal.
18304 DoubleToRawbits(5e-9), // Not representable -> zero.
18305 DoubleToRawbits(-0.0),
18306 DoubleToRawbits(0.0),
18307 0x7ff5555511111111, // Signalling NaN.
18308 0x7ffaaaaa11111111}; // Quiet NaN.
18309
18310 uint64_t zd_expected[] = {0, // 0.0
18311 0x8000000000000000, // -0.0
18312 0x3f10000000000000, // 6.10352e-05
18313 0x40e0000000000000, // 32768
18314 0x40f0000000000000, // 65536
18315 0x4190000000000000, // 6.71089e+07
18316 0x41c0000000000000, // 5.36871e+08
18317 0xffe0000000000000, // -1.70141e+38
18318 0x7fe0000000000000, // 1.70141e+38
18319 0x7ffd555511111111,
18320 0x7ffaaaaa11111111};
18321
18322 TestFrecpxHelper(config, kDRegSize, zn_inputs, zd_expected);
18323 }
18324
18325 template <size_t N, typename T>
18326 static void TestFsqrtHelper(Test* config,
18327 int lane_size_in_bits,
18328 T (&zn_inputs)[N],
18329 const T (&zd_expected)[N]) {
18330 TestFPUnaryPredicatedHelper(config,
18331 lane_size_in_bits,
18332 lane_size_in_bits,
18333 zn_inputs,
18334 zd_expected,
18335 &MacroAssembler::Fsqrt, // Merging form.
18336 &MacroAssembler::Fsqrt); // Zeroing form.
18337 }
18338
18339 TEST_SVE(sve_fsqrt_h) {
18340 uint64_t zn_inputs[] =
18341 {Float16ToRawbits(Float16(0.0)),
18342 Float16ToRawbits(Float16(-0.0)),
18343 Float16ToRawbits(Float16(1.0)),
18344 Float16ToRawbits(Float16(65025.0)),
18345 Float16ToRawbits(kFP16PositiveInfinity),
18346 Float16ToRawbits(kFP16NegativeInfinity),
18347 Float16ToRawbits(Float16(6.10352e-5)), // Min normal positive.
18348 Float16ToRawbits(Float16(65504.0)), // Max normal positive float.
18349 Float16ToRawbits(Float16(6.09756e-5)), // Max subnormal.
18350 Float16ToRawbits(Float16(5.96046e-8)), // Min subnormal positive.
18351 0x7c22, // Signalling NaN
18352 0x7e55}; // Quiet NaN
18353
18354 uint64_t zd_expected[] = {Float16ToRawbits(Float16(0.0)),
18355 Float16ToRawbits(Float16(-0.0)),
18356 Float16ToRawbits(Float16(1.0)),
18357 Float16ToRawbits(Float16(255.0)),
18358 Float16ToRawbits(kFP16PositiveInfinity),
18359 Float16ToRawbits(kFP16DefaultNaN),
18360 0x2000,
18361 0x5bff,
18362 0x1fff,
18363 0x0c00,
18364 0x7e22, // To quiet NaN.
18365 0x7e55};
18366
18367 TestFsqrtHelper(config, kHRegSize, zn_inputs, zd_expected);
18368 }
18369
18370 TEST_SVE(sve_fsqrt_s) {
18371 uint64_t zn_inputs[] = {FloatToRawbits(0.0f),
18372 FloatToRawbits(-0.0f),
18373 FloatToRawbits(1.0f),
18374 FloatToRawbits(65536.0f),
18375 FloatToRawbits(kFP32PositiveInfinity),
18376 FloatToRawbits(kFP32NegativeInfinity),
18377 0x00800000, // Min normal positive, ~1.17e-38
18378 0x7f7fffff, // Max normal positive, ~3.40e+38
18379 0x00000001, // Min subnormal positive, ~1.40e-45
18380 0x007fffff, // Max subnormal, ~1.17e-38
18381 0x7f951111, // Signalling NaN
18382 0x7fea1111}; // Quiet NaN
18383
18384 uint64_t zd_expected[] = {FloatToRawbits(0.0f),
18385 FloatToRawbits(-0.0f),
18386 FloatToRawbits(1.0f),
18387 FloatToRawbits(256.0f),
18388 FloatToRawbits(kFP32PositiveInfinity),
18389 FloatToRawbits(kFP32DefaultNaN),
18390 0x20000000, // ~1.08e-19
18391 0x5f7fffff, // ~1.84e+19
18392 0x1a3504f3, // ~3.74e-23
18393 0x1fffffff, // ~1.08e-19
18394 0x7fd51111, // To quiet NaN.
18395 0x7fea1111};
18396
18397 TestFsqrtHelper(config, kSRegSize, zn_inputs, zd_expected);
18398 }
18399
18400 TEST_SVE(sve_fsqrt_d) {
18401 uint64_t zn_inputs[] =
18402 {DoubleToRawbits(0.0),
18403 DoubleToRawbits(-0.0),
18404 DoubleToRawbits(1.0),
18405 DoubleToRawbits(65536.0),
18406 DoubleToRawbits(kFP64PositiveInfinity),
18407 DoubleToRawbits(kFP64NegativeInfinity),
18408 0x0010000000000000, // Min normal positive, ~2.22e-308
18409 0x7fefffffffffffff, // Max normal positive, ~1.79e+308
18410 0x0000000000000001, // Min subnormal positive, 5e-324
18411 0x000fffffffffffff, // Max subnormal, ~2.22e-308
18412 0x7ff5555511111111,
18413 0x7ffaaaaa11111111};
18414
18415 uint64_t zd_expected[] = {DoubleToRawbits(0.0),
18416 DoubleToRawbits(-0.0),
18417 DoubleToRawbits(1.0),
18418 DoubleToRawbits(256.0),
18419 DoubleToRawbits(kFP64PositiveInfinity),
18420 DoubleToRawbits(kFP64DefaultNaN),
18421 0x2000000000000000, // ~1.49e-154
18422 0x5fefffffffffffff, // ~1.34e+154
18423 0x1e60000000000000, // ~2.22e-162
18424 0x1fffffffffffffff, // ~1.49e-154
18425 0x7ffd555511111111, // To quiet NaN.
18426 0x7ffaaaaa11111111};
18427
18428 TestFsqrtHelper(config, kDRegSize, zn_inputs, zd_expected);
18429 }
18430
18431 TEST_SVE(sve_adr) {
18432 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18433 START();
18434
18435 __ Index(z0.VnD(), 0x10000000f0000000, 0x1000);
18436 __ Index(z1.VnD(), 1, 3);
18437 __ Index(z2.VnS(), -1, -1);
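// Adr computes base + (offset << shift) for each lane. The UXTW/SXTW forms
// use only the low 32 bits of each 64-bit offset, zero- or sign-extended,
// while the .S forms operate directly on 32-bit lanes.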
18438 __ Adr(z3.VnD(), SVEMemOperand(z0.VnD(), z1.VnD()));
18439 __ Adr(z4.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 1));
18440 __ Adr(z5.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 2));
18441 __ Adr(z6.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 3));
18442 __ Adr(z7.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW));
18443 __ Adr(z8.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 1));
18444 __ Adr(z9.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 2));
18445 __ Adr(z10.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 3));
18446 __ Adr(z11.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW));
18447 __ Adr(z12.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 1));
18448 __ Adr(z13.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 2));
18449 __ Adr(z14.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 3));
18450 __ Adr(z15.VnS(), SVEMemOperand(z0.VnS(), z2.VnS()));
18451 __ Adr(z16.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 1));
18452 __ Adr(z17.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 2));
18453 __ Adr(z18.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 3));
18454
18455 END();
18456
18457 if (CAN_RUN()) {
18458 RUN();
18459 uint64_t expected_z3[] = {0x10000000f0001004, 0x10000000f0000001};
18460 uint64_t expected_z4[] = {0x10000000f0001008, 0x10000000f0000002};
18461 uint64_t expected_z5[] = {0x10000000f0001010, 0x10000000f0000004};
18462 uint64_t expected_z6[] = {0x10000000f0001020, 0x10000000f0000008};
18463 uint64_t expected_z7[] = {0x10000001f0000ffd, 0x10000001efffffff};
18464 uint64_t expected_z8[] = {0x10000002f0000ffa, 0x10000002effffffe};
18465 uint64_t expected_z9[] = {0x10000004f0000ff4, 0x10000004effffffc};
18466 uint64_t expected_z10[] = {0x10000008f0000fe8, 0x10000008effffff8};
18467 uint64_t expected_z11[] = {0x10000000f0000ffd, 0x10000000efffffff};
18468 uint64_t expected_z12[] = {0x10000000f0000ffa, 0x10000000effffffe};
18469 uint64_t expected_z13[] = {0x10000000f0000ff4, 0x10000000effffffc};
18470 uint64_t expected_z14[] = {0x10000000f0000fe8, 0x10000000effffff8};
18471 uint64_t expected_z15[] = {0x0ffffffcf0000ffd, 0x0ffffffeefffffff};
18472 uint64_t expected_z16[] = {0x0ffffff8f0000ffa, 0x0ffffffceffffffe};
18473 uint64_t expected_z17[] = {0x0ffffff0f0000ff4, 0x0ffffff8effffffc};
18474 uint64_t expected_z18[] = {0x0fffffe0f0000fe8, 0x0ffffff0effffff8};
18475
18476 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
18477 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
18478 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
18479 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
18480 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
18481 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
18482 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
18483 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
18484 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
18485 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
18486 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
18487 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
18488 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
18489 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
18490 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
18491 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
18492 }
18493 }
18494
18495 // Test load-and-broadcast instructions by comparing their results with those
18496 // of a set of equivalent scalar loads.
18497 template <typename F>
18498 static void LoadBcastHelper(Test* config,
18499 unsigned msize_in_bits,
18500 unsigned esize_in_bits,
18501 F sve_ld1,
18502 bool is_signed) {
18503 VIXL_ASSERT((esize_in_bits == kBRegSize) || (esize_in_bits == kHRegSize) ||
18504 (esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
18505 static const unsigned kMaxLaneCount = kZRegMaxSize / kBRegSize;
18506
18507 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18508 START();
18509
18510 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
18511 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
18512 int vl = config->sve_vl_in_bytes();
18513
18514 uint64_t offsets[kMaxLaneCount];
18515 uint64_t buffer_size = vl * 64;
18516 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
18517 BufferFillingHelper(data,
18518 buffer_size,
18519 msize_in_bytes,
18520 kMaxLaneCount,
18521 offsets);
18522
18523 for (unsigned i = 0; i < (kMaxLaneCount / 2); i++) {
18524 // Assign encodable offsets to the first half of the offset array so that
18525 // both encodable and unencodable offsets are tested.
18526 // Note that the immediate offset field is only six bits wide.
18527 offsets[i] = (offsets[i] % (UINT64_C(1) << 6)) * msize_in_bytes;
18528 }
18529
18530 ZRegister zn = z0.WithLaneSize(esize_in_bits);
18531 ZRegister zn_ref = z4.WithLaneSize(esize_in_bits);
18532
18533 PRegisterZ pg = p0.Zeroing();
18534 Initialise(&masm,
18535 pg,
18536 0x9abcdef012345678,
18537 0xabcdef0123456789,
18538 0xf4f3f1f0fefdfcfa,
18539 0xf9f8f6f5f3f2f0ff);
18540
18541 __ Mov(x2, data);
18542 uint64_t enablable_offset = offsets[0];
18543 // A simple check that the operation is correct for a single offset.
18544 (masm.*sve_ld1)(zn, pg, SVEMemOperand(x2, enablable_offset));
18545
18546 // Generate a reference result using scalar loads.
18547 uint64_t address = data + enablable_offset;
18548 uint64_t duplicated_addresses[kMaxLaneCount];
18549 for (unsigned i = 0; i < kMaxLaneCount; i++) {
18550 duplicated_addresses[i] = address;
18551 }
18552
18553 ScalarLoadHelper(&masm,
18554 vl,
18555 duplicated_addresses,
18556 zn_ref,
18557 pg,
18558 esize_in_bits,
18559 msize_in_bits,
18560 is_signed);
18561
18562 ZRegister zn_agg = z10.WithLaneSize(esize_in_bits);
18563 ZRegister zn_agg_ref = z11.WithLaneSize(esize_in_bits);
18564 ZRegister zn_temp = z12.WithLaneSize(esize_in_bits);
18565
18566 __ Dup(zn_agg, 0);
18567 __ Dup(zn_agg_ref, 0);
18568
18569 // Check that the operation is correct for different offsets.
18570 for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
18571 (masm.*sve_ld1)(zn_temp, pg, SVEMemOperand(x2, offsets[i]));
18572 __ Lastb(x1, pg, zn_temp);
18573 __ Insr(zn_agg, x1);
18574
18575 __ Mov(x3, data + offsets[i]);
18576 ScalarLoadHelper(&masm, x1, x3, msize_in_bits, is_signed);
18577 __ Insr(zn_agg_ref, x1);
18578 }
18579
18580 END();
18581
18582 if (CAN_RUN()) {
18583 RUN();
18584
18585 ASSERT_EQUAL_SVE(zn_ref, zn);
18586 ASSERT_EQUAL_SVE(zn_agg_ref, zn_agg);
18587 }
18588
18589 free(reinterpret_cast<void*>(data));
18590 }
18591
18592 TEST_SVE(sve_ld1rb) {
18593 LoadBcastHelper(config, kBRegSize, kBRegSize, &MacroAssembler::Ld1rb, false);
18594 LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rb, false);
18595 LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rb, false);
18596 LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rb, false);
18597 }
18598
18599 TEST_SVE(sve_ld1rh) {
18600 LoadBcastHelper(config, kHRegSize, kHRegSize, &MacroAssembler::Ld1rh, false);
18601 LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rh, false);
18602 LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rh, false);
18603 }
18604
18605 TEST_SVE(sve_ld1rw) {
18606 LoadBcastHelper(config, kSRegSize, kSRegSize, &MacroAssembler::Ld1rw, false);
18607 LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rw, false);
18608 }
18609
18610 TEST_SVE(sve_ld1rd) {
18611 LoadBcastHelper(config, kDRegSize, kDRegSize, &MacroAssembler::Ld1rd, false);
18612 }
18613
18614 TEST_SVE(sve_ld1rsb) {
18615 LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rsb, true);
18616 LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rsb, true);
18617 LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rsb, true);
18618 }
18619
18620 TEST_SVE(sve_ld1rsh) {
18621 LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rsh, true);
18622 LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rsh, true);
18623 }
18624
18625 TEST_SVE(sve_ld1rsw) {
18626 LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rsw, true);
18627 }
18628
18629 TEST_SVE(sve_prefetch_offset) {
18630 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18631
18632 START();
18633
18634 __ Prfb(PLDL1KEEP, p5, SVEMemOperand(z30.VnS(), 0));
18635 __ Prfb(PLDL1STRM, p5, SVEMemOperand(x28, -11, SVE_MUL_VL));
18636 __ Prfb(PLDL2KEEP, p6, SVEMemOperand(x30, x29));
18637 __ Prfb(PLDL2STRM, p6, SVEMemOperand(x7, z12.VnS(), UXTW));
18638 __ Prfh(PSTL2KEEP, p6, SVEMemOperand(z0.VnS(), 28));
18639 __ Prfh(PSTL2STRM, p4, SVEMemOperand(x17, -3, SVE_MUL_VL));
18640 __ Prfh(PSTL3KEEP, p3, SVEMemOperand(x0, x0, LSL, 1));
18641 __ Prfh(PSTL3STRM, p4, SVEMemOperand(x20, z0.VnD(), LSL, 1));
18642 __ Prfw(PLDL1KEEP, p3, SVEMemOperand(z23.VnD(), 5));
18643 __ Prfw(PLDL1STRM, p1, SVEMemOperand(x4, 10, SVE_MUL_VL));
18644 __ Prfw(PLDL2KEEP, p2, SVEMemOperand(x22, x22, LSL, 2));
18645 __ Prfw(PLDL2STRM, p1, SVEMemOperand(x2, z6.VnS(), SXTW, 2));
18646 __ Prfd(PLDL3KEEP, p5, SVEMemOperand(z11.VnD(), 9));
18647 __ Prfd(PLDL3STRM, p3, SVEMemOperand(x0, -24, SVE_MUL_VL));
18648 __ Prfd(PSTL1KEEP, p7, SVEMemOperand(x5, x5, LSL, 3));
18649 __ Prfd(PSTL1STRM, p1, SVEMemOperand(x19, z18.VnS(), SXTW, 3));
18650
18651 END();
18652 if (CAN_RUN()) {
18653 RUN();
18654 }
18655 }
18656
18657 TEST_SVE(sve2_match_nmatch) {
18658 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18659
18660 START();
18661
18662 __ Ptrue(p0.VnB());
18663 __ Ptrue(p1.VnH());
18664 __ Ptrue(p2.VnS());
18665
18666 // Vector to search is bytes 0 - 7, repeating every eight bytes.
18667 __ Index(z0.VnB(), 0, 1);
18668 __ Dup(z0.VnD(), z0.VnD(), 0);
18669
18670 // Elements to find are (repeated) bytes 0 - 3 in the first segment, 4 - 7
18671 // in the second, 8 - 11 in the third, etc.
18672 __ Index(z1.VnB(), 0, 1);
18673 __ Lsr(z1.VnB(), z1.VnB(), 2);
18674
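// Match sets an active lane of the result when the corresponding element of
// the first source appears anywhere in the same 128-bit segment of the second
// source; Nmatch is the inverse for the active lanes.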
18675 __ Match(p3.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
18676 __ Match(p4.VnB(), p1.Zeroing(), z0.VnB(), z1.VnB());
18677 __ Nmatch(p0.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
18678
18679 __ Uunpklo(z0.VnH(), z0.VnB());
18680 __ Uunpklo(z1.VnH(), z1.VnB());
18681
18682 __ Match(p5.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
18683 __ Match(p6.VnH(), p2.Zeroing(), z0.VnH(), z1.VnH());
18684 __ Nmatch(p1.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
18685
18686 END();
18687 if (CAN_RUN()) {
18688 RUN();
18689
18690 int p3_exp[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
18691 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
18692 ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
18693 int p4_exp[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
18694 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
18695 ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
18696 int p0_exp[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
18697 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
18698 ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
18699
18700 int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
18701 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1};
18702 ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
18703 int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
18704 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
18705 ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
18706 int p1_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
18707 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0};
18708 ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
18709 }
18710 }
18711
18712 TEST_SVE(sve2_saba_uaba) {
18713 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18714
18715 START();
18716
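// Uaba/Saba accumulate the absolute difference of the last two operands into
// the destination. For example, with z0 = {0, 1, 2, ...} and z1 = 0xff in
// every byte, z2 = 1 + (0xff - i) = -i per byte, which is the Index(0, -1)
// sequence that z0 is set to below for the comparison.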
18717 __ Index(z0.VnB(), 0, 1);
18718 __ Dup(z1.VnB(), 0xff);
18719 __ Dup(z2.VnB(), 1);
18720 __ Uaba(z2.VnB(), z2.VnB(), z0.VnB(), z1.VnB());
18721 __ Index(z0.VnB(), 0, -1);
18722
18723 __ Index(z3.VnH(), 0, 1);
18724 __ Index(z4.VnH(), 1, 1);
18725 __ Uaba(z3.VnH(), z3.VnH(), z3.VnH(), z4.VnH());
18726
18727 __ Index(z5.VnS(), 3, 6);
18728 __ Index(z6.VnS(), 5, 6);
18729 __ Uaba(z5.VnS(), z5.VnS(), z5.VnS(), z6.VnS());
18730
18731 __ Index(z7.VnD(), 424, 12);
18732 __ Index(z8.VnD(), 4242, 12);
18733 __ Uaba(z7.VnD(), z7.VnD(), z7.VnD(), z8.VnD());
18734
18735 __ Index(z9.VnH(), -1, -1);
18736 __ Dup(z10.VnB(), 0);
18737 __ Saba(z10.VnB(), z10.VnB(), z9.VnB(), z10.VnB());
18738 __ Index(z11.VnH(), 0x0101, 1);
18739
18740 __ Index(z12.VnH(), 0, 1);
18741 __ Index(z13.VnH(), 0, -1);
18742 __ Saba(z13.VnH(), z13.VnH(), z12.VnH(), z13.VnH());
18743
18744 __ Index(z14.VnS(), 0, 2);
18745 __ Index(z15.VnS(), 0, -2);
18746 __ Saba(z15.VnS(), z15.VnS(), z14.VnS(), z15.VnS());
18747
18748 __ Index(z16.VnD(), 0, 42);
18749 __ Index(z17.VnD(), 0, -42);
18750 __ Saba(z17.VnD(), z17.VnD(), z16.VnD(), z17.VnD());
18751
18752 END();
18753
18754 if (CAN_RUN()) {
18755 RUN();
18756
18757 ASSERT_EQUAL_SVE(z0, z2);
18758 ASSERT_EQUAL_SVE(z3, z4);
18759 ASSERT_EQUAL_SVE(z5, z6);
18760 ASSERT_EQUAL_SVE(z7, z8);
18761
18762 ASSERT_EQUAL_SVE(z10, z11);
18763 ASSERT_EQUAL_SVE(z12, z13);
18764 ASSERT_EQUAL_SVE(z14, z15);
18765 ASSERT_EQUAL_SVE(z16, z17);
18766 }
18767 }
18768
18769 TEST_SVE(sve2_integer_multiply_long_vector) {
18770 // The test only checks Sqdmullb/t and Pmullb/t; the other instructions in the
18771 // group select and operate on elements in the same way.
18772 int32_t zn_inputs_s[] =
18773 {1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
18774
18775 int32_t zm_inputs_s[] =
18776 {1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
18777 int64_t sqdmullb_vec_expected_d[] =
18778 {-8, -32, -72, -128, RawbitsToInt64(0x8000000100000000), INT64_MAX};
18779
18780 uint64_t sqdmullt_vec_expected_d[] =
18781 {2, 18, 50, 98, 0x8000000100000000, 0x7ffffffe00000002};
18782
18783 uint64_t pmullb_vec_expected_d[] = {0x00000001fffffffc,
18784 0x00000003fffffff0,
18785 0x000000020000001c,
18786 0x00000007ffffffc0,
18787 0x3fffffff80000000,
18788 0x4000000000000000};
18789
18790 uint64_t pmullt_vec_expected_d[] = {0x05,
18791 0x11,
18792 0x15,
18793 0x3fffffff80000000,
18794 0x1555555555555555};
18795
18796 uint64_t sqdmullb_idx_expected_d[] = {0xfffffffffffffff8,
18797 0xfffffffffffffff0,
18798 0xffffffffffffffb8,
18799 0xffffffffffffffa0,
18800 0x8000000100000000,
18801 INT64_MAX};
18802
18803 uint64_t sqdmullt_idx_expected_d[] =
18804 {8, // 2 * zn[11] * zm[8] = 2 * 4 * 1
18805 24, // 2 * zn[9] * zm[8] = 2 * 4 * 3
18806 80, // 2 * zn[7] * zm[4] = 2 * 8 * 5
18807 112, // 2 * zn[5] * zm[4] = 2 * 8 * 7
18808 0x7fffffffffffffff, // 2 * zn[3] * zm[0]
18809 0x8000000100000000}; // 2 * zn[1] * zm[0]
18810
18811 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18812 START();
18813
18814 InsrHelper(&masm, z31.VnS(), zn_inputs_s);
18815 InsrHelper(&masm, z30.VnS(), zm_inputs_s);
18816
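// The 'b' forms operate on even-numbered (bottom) source elements and the 't'
// forms on odd-numbered (top) elements, widening the result to twice the
// element size. Sqdmull* also doubles and saturates the product, while Pmull*
// performs a polynomial (carry-less) multiplication.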
18817 __ Sqdmullb(z1.VnD(), z31.VnS(), z30.VnS());
18818 __ Sqdmullt(z2.VnD(), z31.VnS(), z30.VnS());
18819
18820 __ Pmullb(z3.VnD(), z31.VnS(), z30.VnS());
18821 __ Pmullt(z4.VnD(), z31.VnS(), z30.VnS());
18822
18823 __ Mov(z7, z30);
18824 __ Mov(z8, z31);
18825 __ Sqdmullb(z5.VnD(), z8.VnS(), z7.VnS(), 2);
18826 __ Sqdmullt(z6.VnD(), z8.VnS(), z7.VnS(), 0);
18827
18828 END();
18829
18830 if (CAN_RUN()) {
18831 RUN();
18832
18833 ASSERT_EQUAL_SVE(sqdmullb_vec_expected_d, z1.VnD());
18834 ASSERT_EQUAL_SVE(sqdmullt_vec_expected_d, z2.VnD());
18835 ASSERT_EQUAL_SVE(pmullb_vec_expected_d, z3.VnD());
18836 ASSERT_EQUAL_SVE(pmullt_vec_expected_d, z4.VnD());
18837 ASSERT_EQUAL_SVE(sqdmullb_idx_expected_d, z5.VnD());
18838 ASSERT_EQUAL_SVE(sqdmullt_idx_expected_d, z6.VnD());
18839 }
18840 }
18841
18842 TEST_SVE(sve2_integer_multiply_add_long_vector) {
18843 int32_t zn_inputs_s[] =
18844 {1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
18845
18846 int32_t zm_inputs_s[] =
18847 {1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
18848
18849 int64_t sqdmlalb_vec_expected_d[] =
18850 {-3, -28, -69, -126, RawbitsToInt64(0x8000000100000001), INT64_MAX};
18851
18852 int64_t sqdmlalt_vec_expected_d[] = {-3,
18853 14,
18854 47,
18855 96,
18856 RawbitsToInt64(0x80000000ffffffff),
18857 static_cast<int64_t>(
18858 0x7ffffffe00000002)};
18859
18860 int64_t sqdmlalb_idx_expected_d[] =
18861 {-11, // za.d[5] + 2 * zn.s[10] * zm.s[8] = 5 + 2 * -2 * 4
18862 -28, // za.d[4] + 2 * zn.s[8] * zm.s[8] = 4 + 2 * -4 * 4
18863 -93, // za.d[3] + 2 * zn.s[6] * zm.s[4] = 3 + 2 * -6 * 8
18864 -126, // za.d[2] + 2 * zn.s[4] * zm.s[4] = 2 + 2 * -8 * 8
18865 RawbitsToInt64(0x8000000100000001),
18866 INT64_MAX};
18867
18868 int64_t sqdmlalt_idx_expected_d[] =
18869 {1, // za.d[5] + 2 * zn.s[11] * zm.s[9] = -5 + 2 * 1 * 3
18870 14, // za.d[4] + 2 * zn.s[9] * zm.s[9] = -4 + 2 * 3 * 3
18871 67, // za.d[3] + 2 * zn.s[7] * zm.s[5] = -3 + 2 * 5 * 7
18872 96, // za.d[2] + 2 * zn.s[5] * zm.s[5] = -2 + 2 * 7 * 7
18873 RawbitsToInt64(0x80000000ffffffff),
18874 static_cast<int64_t>(0x7ffffffe00000002)};
18875
18876 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18877 START();
18878
18879 InsrHelper(&masm, z0.VnS(), zn_inputs_s);
18880 InsrHelper(&masm, z1.VnS(), zm_inputs_s);
18881 __ Index(z2.VnD(), 0, 1);
18882 __ Index(z3.VnD(), 0, -1);
18883
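// With these inputs, each Sqdmlsl* below subtracts the same doubled product
// that the preceding Sqdmlal* added, so z29/z28 (and z25/z24) should end up
// equal to the original accumulators in z2 and z3.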
18884 __ Mov(z31, z2);
18885 __ Sqdmlalb(z31.VnD(), z31.VnD(), z0.VnS(), z1.VnS());
18886 __ Mov(z30, z3);
18887 __ Sqdmlalt(z30.VnD(), z30.VnD(), z0.VnS(), z1.VnS());
18888 __ Mov(z29, z31);
18889 __ Sqdmlslb(z29.VnD(), z29.VnD(), z0.VnS(), z1.VnS());
18890 __ Mov(z28, z30);
18891 __ Sqdmlslt(z28.VnD(), z28.VnD(), z0.VnS(), z1.VnS());
18892
18893 __ Sqdmlalb(z27.VnD(), z2.VnD(), z0.VnS(), z1.VnS());
18894 __ Sqdmlalt(z26.VnD(), z3.VnD(), z0.VnS(), z1.VnS());
18895 __ Sqdmlslb(z25.VnD(), z27.VnD(), z0.VnS(), z1.VnS());
18896 __ Sqdmlslt(z24.VnD(), z26.VnD(), z0.VnS(), z1.VnS());
18897
18898 __ Mov(z23, z2);
18899 __ Sqdmlalb(z23.VnD(), z23.VnD(), z0.VnS(), z1.VnS(), 0);
18900 __ Mov(z22, z3);
18901 __ Sqdmlalt(z22.VnD(), z22.VnD(), z0.VnS(), z1.VnS(), 1);
18902 __ Mov(z21, z23);
18903 __ Sqdmlslb(z21.VnD(), z21.VnD(), z0.VnS(), z1.VnS(), 0);
18904 __ Mov(z20, z22);
18905 __ Sqdmlslt(z20.VnD(), z20.VnD(), z0.VnS(), z1.VnS(), 1);
18906
18907
18908 END();
18909
18910 if (CAN_RUN()) {
18911 RUN();
18912
18913 ASSERT_EQUAL_SVE(sqdmlalb_vec_expected_d, z31.VnD());
18914 ASSERT_EQUAL_SVE(sqdmlalt_vec_expected_d, z30.VnD());
18915 ASSERT_EQUAL_SVE(z2, z29);
18916 ASSERT_EQUAL_SVE(z3, z28);
18917
18918 ASSERT_EQUAL_SVE(z31, z27);
18919 ASSERT_EQUAL_SVE(z30, z26);
18920 ASSERT_EQUAL_SVE(z29, z25);
18921 ASSERT_EQUAL_SVE(z28, z24);
18922
18923 ASSERT_EQUAL_SVE(sqdmlalb_idx_expected_d, z23.VnD());
18924 ASSERT_EQUAL_SVE(sqdmlalt_idx_expected_d, z22.VnD());
18925 ASSERT_EQUAL_SVE(z2, z21);
18926 ASSERT_EQUAL_SVE(z3, z20);
18927 }
18928 }
18929
18930 TEST_SVE(sve2_ldnt1) {
18931 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18932 START();
18933
18934 int data_size = kZRegMaxSizeInBytes * 4;
18935 uint8_t* data = new uint8_t[data_size];
18936 for (int i = 0; i < data_size; i++) {
18937 data[i] = i & 0xff;
18938 }
18939
18940 // Set the base half-way through the buffer so we can use negative indices.
18941 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
18942 __ Index(z30.VnD(), x0, 1);
18943 __ Ptrue(p0.VnB());
18944 __ Punpklo(p1.VnH(), p0.VnB());
18945 __ Punpklo(p2.VnH(), p1.VnB());
18946 __ Punpklo(p3.VnH(), p2.VnB());
18947 __ Punpklo(p4.VnH(), p3.VnB());
18948
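// Each Ldnt1* (vector-plus-scalar) below computes the same addresses as the
// corresponding Ld1* gather (scalar-plus-vector), so each pair of loads
// should produce identical results.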
18949 __ Mov(x1, 1);
18950 __ Ldnt1b(z0.VnD(), p1.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18951 __ Ld1b(z1.VnD(), p1.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18952
18953 __ Mov(x1, -4);
18954 __ Ldnt1h(z2.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18955 __ Ld1h(z3.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18956
18957 __ Mov(x1, 16);
18958 __ Ldnt1w(z4.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18959 __ Ld1w(z5.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18960
18961 __ Mov(x1, -16);
18962 __ Ldnt1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18963 __ Ld1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18964
18965 __ Mov(x1, 1);
18966 __ Ldnt1sb(z8.VnD(), p0.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18967 __ Ld1sb(z9.VnD(), p0.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18968
18969 __ Mov(x1, -4);
18970 __ Ldnt1sh(z10.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18971 __ Ld1sh(z11.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18972
18973 __ Mov(x1, 16);
18974 __ Ldnt1sw(z12.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18975 __ Ld1sw(z13.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18976
18977 END();
18978
18979 if (CAN_RUN()) {
18980 RUN();
18981 ASSERT_EQUAL_SVE(z0, z1);
18982 ASSERT_EQUAL_SVE(z2, z3);
18983 ASSERT_EQUAL_SVE(z4, z5);
18984 ASSERT_EQUAL_SVE(z6, z7);
18985 ASSERT_EQUAL_SVE(z8, z9);
18986 ASSERT_EQUAL_SVE(z10, z11);
18987 ASSERT_EQUAL_SVE(z12, z13);
18988 }
18989 }
18990
18991 TEST_SVE(sve2_stnt1) {
18992 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18993 START();
18994
18995 int data_size = kZRegMaxSizeInBytes * 4;
18996 uint8_t* data = new uint8_t[data_size];
18997
18998 // Set the base half-way through the buffer so we can use negative indices.
18999 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
19000 __ Ptrue(p0.VnB());
19001 __ Punpklo(p1.VnH(), p0.VnB());
19002 __ Punpklo(p2.VnH(), p1.VnB());
19003 __ Punpklo(p3.VnH(), p2.VnB());
19004 __ Punpklo(p4.VnH(), p3.VnB());
19005 __ Dup(z0.VnB(), 0xaa);
19006 __ Dup(z1.VnB(), 0x55);
19007 __ Rdvl(x1, 1);
19008 __ Mov(x3, 0);
19009
19010 // Put store addresses into z30, and a small offset in x4.
19011 __ Index(z30.VnD(), x0, 1);
19012 __ Mov(x4, 2);
19013
19014 // Store an entire vector of 0xaa to the buffer, then a smaller scatter store
19015 // of 0x55 using Stnt1b.
19016 __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19017 __ Stnt1b(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19018
19019 // Load the entire vector back from the buffer.
19020 __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19021
19022 // Construct a predicate that reflects the number of bytes stored by Stnt1b,
19023 // based on the current VL, and use Sel to obtain a reference vector for
19024 // comparison.
19025 __ Lsr(x2, x1, 3);
19026 __ Whilelo(p5.VnB(), x3, x2);
19027 __ Sel(z3.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19028
19029 // Repeat for larger element sizes.
19030 __ Mov(x4, -4);
19031 __ Index(z30.VnD(), x0, 2);
19032 __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19033 __ Stnt1h(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19034 __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19035 __ Lsr(x2, x1, 2);
19036 __ Whilelo(p5.VnB(), x3, x2);
19037 __ Sel(z5.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19038
19039 __ Mov(x4, 16);
19040 __ Index(z30.VnD(), x0, 4);
19041 __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19042 __ Stnt1w(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19043 __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19044 __ Lsr(x2, x1, 1);
19045 __ Whilelo(p5.VnB(), x3, x2);
19046 __ Sel(z7.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19047
19048 __ Mov(x4, -16);
19049 __ Index(z30.VnD(), x0, 8);
19050 __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19051 __ Stnt1d(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19052 __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19053 __ Whilelo(p5.VnB(), x3, x1);
19054 __ Sel(z9.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19055 END();
19056
19057 if (CAN_RUN()) {
19058 RUN();
19059 ASSERT_EQUAL_SVE(z2, z3);
19060 ASSERT_EQUAL_SVE(z4, z5);
19061 ASSERT_EQUAL_SVE(z6, z7);
19062 ASSERT_EQUAL_SVE(z8, z9);
19063 }
19064 }
19065
19066 TEST_SVE(sve2_while_simple) {
19067 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19068
19069 START();
19070 __ Mov(x0, 1);
19071 __ Mov(x1, 0);
19072 __ Mov(x2, 3);
19073
19074 __ Whilehi(p0.VnB(), x0, x1);
19075 __ Whilehs(p1.VnB(), x0, x1);
19076 __ Whilehi(p2.VnB(), x2, x1);
19077 __ Whilehs(p3.VnB(), x2, x1);
19078 __ Whilehi(p4.VnB(), x2, x0);
19079 __ Whilehs(p5.VnB(), x2, x0);
19080
19081 __ Whilegt(p6.VnB(), x0, x1);
19082 __ Whilege(p7.VnB(), x0, x1);
19083 __ Whilegt(p8.VnB(), x2, x1);
19084 __ Whilege(p9.VnB(), x2, x1);
19085 __ Whilegt(p10.VnB(), x2, x0);
19086 __ Whilege(p11.VnB(), x2, x0);
19087
19088 __ Mov(x4, 0x80000000);
19089 __ Mov(x5, 0x80000001);
19090 __ Whilege(p12.VnB(), w5, w4);
19091 __ Whilegt(p13.VnB(), w5, w4);
19092
19093 __ Mov(x6, 0x8000000000000000);
19094 __ Mov(x7, 0x8000000000000001);
19095 __ Whilege(p14.VnB(), x7, x6);
19096 __ Whilegt(p15.VnB(), x7, x6);
19097
19098 for (int i = 0; i < 16; i++) {
19099 __ Rev(PRegister(i).VnB(), PRegister(i).VnB());
19100 }
19101
19102 END();
19103
19104 if (CAN_RUN()) {
19105 RUN();
19106 int p0_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19107 int p1_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19108 int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19109 int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19110 int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
19111 int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19112 int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19113 int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
19114 int p8_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19115 int p9_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
19116 int p10_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
19117 int p11_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19118 int p12_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19119 int p13_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19120 int p14_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19121 int p15_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19122
19123 ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
19124 ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
19125 ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
19126 ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
19127 ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
19128 ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
19129 ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
19130 ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
19131 ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
19132 ASSERT_EQUAL_SVE(p9_exp, p9.VnB());
19133 ASSERT_EQUAL_SVE(p10_exp, p10.VnB());
19134 ASSERT_EQUAL_SVE(p11_exp, p11.VnB());
19135 ASSERT_EQUAL_SVE(p12_exp, p12.VnB());
19136 ASSERT_EQUAL_SVE(p13_exp, p13.VnB());
19137 ASSERT_EQUAL_SVE(p14_exp, p14.VnB());
19138 ASSERT_EQUAL_SVE(p15_exp, p15.VnB());
19139 }
19140 }
19141
19142 TEST_SVE(sve2_whilerw_whilewr_simple) {
19143 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19144
19145 START();
19146 __ Mov(x0, 0);
19147 __ Mov(x1, 1);
19148 __ Mov(x2, 3);
19149
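// Whilerw activates the elements below the absolute distance (in elements)
// between the two addresses, and Whilewr those below the distance from the
// first address up to the second; equal addresses (or, for Whilewr, a second
// address at or below the first) give an all-true predicate.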
19150 __ Whilerw(p0.VnB(), x0, x0);
19151 __ Whilerw(p1.VnB(), x0, x1);
19152 __ Whilerw(p2.VnB(), x1, x0);
19153
19154 __ Whilewr(p3.VnB(), x0, x0);
19155 __ Whilewr(p4.VnB(), x0, x1);
19156 __ Whilewr(p5.VnB(), x1, x0);
19157
19158 __ Whilewr(p6.VnH(), x1, x1);
19159 __ Whilewr(p7.VnH(), x1, x2);
19160 __ Whilewr(p8.VnH(), x2, x1);
19161
19162 END();
19163
19164 if (CAN_RUN()) {
19165 RUN();
19166 int p0_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19167 ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
19168 int p1_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19169 ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
19170 int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19171 ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
19172 int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19173 ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
19174 int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19175 ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
19176 int p5_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19177 ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
19178 int p6_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
19179 ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
19180 int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19181 ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
19182 int p8_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
19183 ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
19184 }
19185 }
19186
19187 TEST_SVE(sve2_sqrdcmlah) {
19188 int32_t zn_inputs[] = {-1, -2, -3, -4, 1, 2, 3, 4};
19189 int32_t zm_inputs[] = {-1, -2, 3, 4, 1, 2, -3, -4};
19190 int32_t za_inputs[] = {1, 2, 3, 4, 5, 6, 7, 8};
19191 int32_t zd_000_expected[] =
19192 {1025, 2050, -6141, -8188, 1029, 2054, -6137, -8184};
19193 int32_t zd_090_expected[] =
19194 {1025, -510, -6141, 4612, 1029, -506, -6137, 4616};
19195 int32_t zd_180_expected[] =
19196 {-1023, -2046, 6147, 8196, -1019, -2042, 6151, 8200};
19197 int32_t zd_270_expected[] =
19198 {-1023, 514, 6147, -4604, -1019, 518, 6151, -4600};
19199 int32_t zd_0_270_expected[] =
19200 {2049, -1534, 6147, -4604, 2053, -1530, 6151, -4600};
19201 int32_t zd_3_090_expected[] =
19202 {1025, -510, 3075, -1532, 1029, -506, 3079, -1528};
19203
19204 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19205 START();
19206
19207 InsrHelper(&masm, z0.VnS(), zn_inputs);
19208 InsrHelper(&masm, z1.VnS(), zm_inputs);
19209 InsrHelper(&masm, z31.VnS(), za_inputs);
19210
19211 // The operand values are small, so shift them left by an arbitrary amount so
19212 // that they have a visible effect on the result in the destination.
19213 int shift = 20;
19214 __ Lsl(z0.VnS(), z0.VnS(), shift);
19215 __ Lsl(z1.VnS(), z1.VnS(), shift);
19216
19217 __ Mov(z10, z31);
19218 __ Sqrdcmlah(z10.VnS(), z10.VnS(), z0.VnS(), z1.VnS(), 0);
19219
19220 __ Mov(z11, z31);
19221 __ Sqrdcmlah(z11.VnS(), z11.VnS(), z0.VnS(), z1.VnS(), 90);
19222
19223 __ Mov(z12, z31);
19224 __ Sqrdcmlah(z12.VnS(), z12.VnS(), z0.VnS(), z1.VnS(), 180);
19225
19226 __ Mov(z13, z31);
19227 __ Sqrdcmlah(z13.VnS(), z13.VnS(), z0.VnS(), z1.VnS(), 270);
19228
19229 __ Sqrdcmlah(z14.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 0);
19230 __ Sqrdcmlah(z15.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 90);
19231 __ Sqrdcmlah(z16.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 180);
19232 __ Sqrdcmlah(z17.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 270);
19233
19234 __ Mov(z18, z31);
19235 __ Sqrdcmlah(z18.VnS(), z18.VnS(), z0.VnS(), z1.VnS(), 0, 270);
19236
19237 __ Mov(z19, z31);
19238 __ Sqrdcmlah(z19.VnS(), z19.VnS(), z0.VnS(), z1.VnS(), 1, 90);
19239
19240 END();
19241
19242 if (CAN_RUN()) {
19243 RUN();
19244
19245 ASSERT_EQUAL_SVE(zd_000_expected, z10.VnS());
19246 ASSERT_EQUAL_SVE(zd_090_expected, z11.VnS());
19247 ASSERT_EQUAL_SVE(zd_180_expected, z12.VnS());
19248 ASSERT_EQUAL_SVE(zd_270_expected, z13.VnS());
19249
19250 ASSERT_EQUAL_SVE(z14, z10);
19251 ASSERT_EQUAL_SVE(z15, z11);
19252 ASSERT_EQUAL_SVE(z16, z12);
19253 ASSERT_EQUAL_SVE(z17, z13);
19254
19255 ASSERT_EQUAL_SVE(zd_0_270_expected, z18.VnS());
19256 ASSERT_EQUAL_SVE(zd_3_090_expected, z19.VnS());
19257 }
19258 }
19259
19260 TEST_SVE(sve2_sqrdmlah) {
19261 uint16_t zn_inputs_h[] = {0x7ffe, 0x7ffd, 0x7ffd, 0x7ffd, 0x8000,
19262 0x7fff, 0x7ffe, 0x7ffe, 0x8001, 0x8000,
19263 0x7ffd, 0x7ffd, 0x7ffd, 0x5555, 0x5555,
19264 0x5555, 0x8000, 0x8000, 0xaaaa, 0x8001};
19265
19266 uint16_t zm_inputs_h[] = {0x7ffd, 0x7fff, 0x7ffe, 0x7ffd, 0x8001,
19267 0x7fff, 0x7fff, 0x7ffe, 0x8000, 0x8000,
19268 0xaaaa, 0x0001, 0x0001, 0xaaaa, 0xaaaa,
19269 0xcccc, 0x8000, 0x8000, 0x8000, 0x8001};
19270
19271 uint16_t za_inputs_h[] = {0x1010, 0x1010, 0x1010, 0x1010, 0x1010,
19272 0x1010, 0x1010, 0x1010, 0x8000, 0x8011,
19273 0x8006, 0xff7d, 0xfeff, 0xaabc, 0xaabb,
19274 0x9c72, 0x8000, 0x0000, 0x8000, 0xffff};
19275
19276 uint16_t zd_expected_h[] = {0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
19277 0x7fff, 0x7fff, 0x7fff, 0xffff, 0x0011,
19278 0x8000, 0xff7e, 0xff00, 0x8000, 0x8000,
19279 0x8000, 0x0000, 0x7fff, 0xd556, 0x7ffd};
19280
19281 uint32_t zn_inputs_s[] = {0x04000000,
19282 0x80000000,
19283 0x04000000,
19284 0x80000000,
19285 0x80000000,
19286 0x80000001,
19287 0x7fffffff,
19288 0x80000000,
19289 0x7ffffffe,
19290 0x7ffffffd,
19291 0x7ffffffd,
19292 0x7ffffffd};
19293
19294 uint32_t zm_inputs_s[] = {0x00000020,
19295 0x80000000,
19296 0x00000010,
19297 0x80000000,
19298 0x7fffffff,
19299 0x80000000,
19300 0x80000000,
19301 0x80000001,
19302 0x7ffffffd,
19303 0x7fffffff,
19304 0x7ffffffe,
19305 0x7ffffffd};
19306
19307 uint32_t za_inputs_s[] = {0x00000000,
19308 0x00000000,
19309 0x00000020,
19310 0x00108000,
19311 0x00000000,
19312 0x00000001,
19313 0x00000000,
19314 0x00000001,
19315 0x10101010,
19316 0x10101010,
19317 0x10101010,
19318 0x10101010};
19319
19320 uint32_t zd_expected_s[] = {0x00000001,
19321 0x7fffffff,
19322 0x00000021,
19323 0x7fffffff,
19324 0x80000001,
19325 0x7fffffff,
19326 0x80000001,
19327 0x7fffffff,
19328 0x7fffffff,
19329 0x7fffffff,
19330 0x7fffffff,
19331 0x7fffffff};
19332
19333 uint64_t zn_inputs_d[] = {0x0400000000000000, 0x8000000000000000,
19334 0x0400000000000000, 0x8000000000000000,
19335 0x8000000000000000, 0x8000000000000001,
19336 0x7fffffffffffffff, 0x8000000000000000,
19337 0x7ffffffffffffffe, 0x7ffffffffffffffd,
19338 0x7ffffffffffffffd, 0x7ffffffffffffffd,
19339 0xf1299accc9186169, 0xd529d2675ee9da21,
19340 0x1a10b5d60b92dcf9, 0xfb1d358e0e6455b1,
19341 0x8eb7721078bdc589, 0x4171509750ded141,
19342 0x8eb7721078bdc589, 0x4171509750ded141};
19343
19344 uint64_t zm_inputs_d[] = {0x0000000000000020, 0x8000000000000000,
19345 0x0000000000000010, 0x8000000000000000,
19346 0x7fffffffffffffff, 0x8000000000000000,
19347 0x8000000000000000, 0x8000000000000001,
19348 0x7ffffffffffffffd, 0x7fffffffffffffff,
19349 0x7ffffffffffffffe, 0x7ffffffffffffffd,
19350 0x30b940efe73f180e, 0x3bc1ff1e52a99b66,
19351 0x40de5c9793535a5e, 0x24752faf47bdddb6,
19352 0x162663016b07e5ae, 0x1de34b56f3d22006,
19353 0x8eb7721078bdc589, 0x4171509750ded141};
19354
19355 uint64_t za_inputs_d[] = {0x0000000000000000, 0x0000000000000000,
19356 0x0000000000000020, 0x0010108000000000,
19357 0x0000000000000000, 0x0000000000000001,
19358 0x0000000000000000, 0x0000000000000001,
19359 0x1010101010101010, 0x1010101010101010,
19360 0x1010101010101010, 0x1010101010101010,
19361 0xb18253371b2c2c77, 0xa70de31e6645eaef,
19362 0xda817198c0318487, 0x9fd9e6b8e04b42ff,
19363 0xced1f6b7119ab197, 0x01ae051a85509b0f,
19364 0x01a211e9352f7927, 0x7667b70a5b13749f};
19365
19366 uint64_t zd_expected_d[] = {0x0000000000000001, 0x7fffffffffffffff,
19367 0x0000000000000021, 0x7fffffffffffffff,
19368 0x8000000000000001, 0x7fffffffffffffff,
19369 0x8000000000000001, 0x7fffffffffffffff,
19370 0x7fffffffffffffff, 0x7fffffffffffffff,
19371 0x7fffffffffffffff, 0x7fffffffffffffff,
19372 0xabdc73dea0d72a35, 0x930e3dc877301966,
19373 0xe7b7145a059f8a9f, 0x9e75a4a9d10cf8af,
19374 0xbb378528642d2581, 0x10f5e6d693ffddf3,
19375 0x65e455a46adc091c, 0x7fffffffffffffff};
19376
19377 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19378 START();
19379
19380 InsrHelper(&masm, z0.VnH(), zn_inputs_h);
19381 InsrHelper(&masm, z1.VnH(), zm_inputs_h);
19382 InsrHelper(&masm, z2.VnH(), za_inputs_h);
19383
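// Sqrdmlah is a signed saturating rounding doubling multiply-accumulate
// returning the high half of the result, so most of the extreme inputs above
// saturate to the largest or smallest value for the element size.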
19384 __ Sqrdmlah(z2.VnH(), z2.VnH(), z0.VnH(), z1.VnH());
19385
19386 InsrHelper(&masm, z3.VnS(), zn_inputs_s);
19387 InsrHelper(&masm, z4.VnS(), zm_inputs_s);
19388 InsrHelper(&masm, z5.VnS(), za_inputs_s);
19389
19390 __ Sqrdmlah(z5.VnS(), z5.VnS(), z3.VnS(), z4.VnS());
19391
19392 InsrHelper(&masm, z6.VnD(), zn_inputs_d);
19393 InsrHelper(&masm, z7.VnD(), zm_inputs_d);
19394 InsrHelper(&masm, z8.VnD(), za_inputs_d);
19395
19396 __ Sqrdmlah(z8.VnD(), z8.VnD(), z6.VnD(), z7.VnD());
19397
19398 END();
19399
19400 if (CAN_RUN()) {
19401 RUN();
19402 ASSERT_EQUAL_SVE(zd_expected_h, z2.VnH());
19403 ASSERT_EQUAL_SVE(zd_expected_s, z5.VnS());
19404 ASSERT_EQUAL_SVE(zd_expected_d, z8.VnD());
19405 }
19406 }
19407
19408 TEST_SVE(sve2_cmla) {
19409 int32_t zn_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
19410 int32_t zm_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
19411 int32_t zda_inputs_s[] = {1, 2, 3, 4, 5, 6, 7, 8};
19412 int32_t zd_000_expected[] = {9, 18, 51, 68, 13, 22, 55, 72};
19413 int32_t zd_090_expected[] = {9, -2, 51, -32, 13, 2, 55, -28};
19414 int32_t zd_180_expected[] = {-7, -14, -45, -60, -3, -10, -41, -56};
19415 int32_t zd_270_expected[] = {-7, 6, -45, 40, -3, 10, -41, 44};
19416
19417 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19418 START();
19419
19420 InsrHelper(&masm, z31.VnS(), zn_inputs_s);
19421 InsrHelper(&masm, z30.VnS(), zm_inputs_s);
19422
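// Cmla treats each pair of elements as a complex number (even lane real, odd
// lane imaginary) and accumulates the product of the two sources, with the
// second source rotated by 0, 90, 180 or 270 degrees.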
19423 InsrHelper(&masm, z0.VnS(), zda_inputs_s);
19424 __ Mov(z29, z0);
19425 __ Cmla(z0.VnS(), z0.VnS(), z31.VnS(), z30.VnS(), 0);
19426
19427 InsrHelper(&masm, z1.VnS(), zda_inputs_s);
19428 __ Mov(z28, z1);
19429 __ Cmla(z1.VnS(), z1.VnS(), z31.VnS(), z30.VnS(), 90);
19430
19431 InsrHelper(&masm, z2.VnS(), zda_inputs_s);
19432 __ Mov(z27, z2);
19433 __ Cmla(z2.VnS(), z2.VnS(), z31.VnS(), z30.VnS(), 180);
19434
19435 InsrHelper(&masm, z3.VnS(), zda_inputs_s);
19436 __ Mov(z26, z3);
19437 __ Cmla(z3.VnS(), z3.VnS(), z31.VnS(), z30.VnS(), 270);
19438
19439 __ Cmla(z4.VnS(), z29.VnS(), z31.VnS(), z30.VnS(), 0);
19440 __ Cmla(z5.VnS(), z28.VnS(), z31.VnS(), z30.VnS(), 90);
19441 __ Cmla(z6.VnS(), z27.VnS(), z31.VnS(), z30.VnS(), 180);
19442 __ Cmla(z7.VnS(), z26.VnS(), z31.VnS(), z30.VnS(), 270);
19443
19444 END();
19445
19446 if (CAN_RUN()) {
19447 RUN();
19448
19449 ASSERT_EQUAL_SVE(zd_000_expected, z0.VnS());
19450 ASSERT_EQUAL_SVE(zd_090_expected, z1.VnS());
19451 ASSERT_EQUAL_SVE(zd_180_expected, z2.VnS());
19452 ASSERT_EQUAL_SVE(zd_270_expected, z3.VnS());
19453
19454 ASSERT_EQUAL_SVE(z4, z0);
19455 ASSERT_EQUAL_SVE(z5, z1);
19456 ASSERT_EQUAL_SVE(z6, z2);
19457 ASSERT_EQUAL_SVE(z7, z3);
19458 }
19459 }
19460
19461 TEST_SVE(sve2_integer_saturating_multiply_add_long) {
19462 int32_t zn_bottom_inputs[] =
19463 {-2, -4, -6, -8, INT32_MAX, INT32_MIN, INT32_MIN};
19464
19465 int32_t zm_top_inputs[] = {1, 3, 5, 7, INT32_MAX, INT32_MAX, INT32_MIN};
19466
19467 int64_t sqdmlalbt_expected[] = {2,
19468 -19,
19469 -56,
19470 -109,
19471 static_cast<int64_t>(0x7ffffffe00000004),
19472 RawbitsToInt64(0x8000000100000001),
19473 INT64_MAX};
19474
19475 int64_t sqdmlslbt_expected[] = {-2,
19476 19,
19477 56,
19478 109,
19479 RawbitsToInt64(0x80000001fffffffc),
19480 static_cast<int64_t>(0x7ffffffeffffffff),
19481 RawbitsToInt64(0x8000000000000001)};
19482
19483 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19484 START();
19485
19486 InsrHelper(&masm, z31.VnS(), zn_bottom_inputs);
19487 InsrHelper(&masm, z30.VnS(), zm_top_inputs);
19488
19489 __ Dup(z29.VnD(), 0);
19490 __ Zip1(z31.VnS(), z31.VnS(), z29.VnS());
19491 __ Zip1(z30.VnS(), z29.VnS(), z30.VnS());
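// z31 now holds its inputs in the even (bottom) 32-bit elements and z30 holds
// its inputs in the odd (top) elements, as consumed by the bottom-by-top
// Sqdmlalbt and Sqdmlslbt forms.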
19492
19493 // Initialise inputs for za.
19494 __ Index(z1.VnD(), 0, 1);
19495 __ Index(z2.VnD(), 0, -1);
19496
19497 __ Sqdmlalbt(z1.VnD(), z1.VnD(), z31.VnS(), z30.VnS());
19498 __ Sqdmlslbt(z2.VnD(), z2.VnD(), z31.VnS(), z30.VnS());
19499
19500 END();
19501
19502 if (CAN_RUN()) {
19503 RUN();
19504
19505 ASSERT_EQUAL_SVE(sqdmlalbt_expected, z1.VnD());
19506 ASSERT_EQUAL_SVE(sqdmlslbt_expected, z2.VnD());
19507 }
19508 }
19509
19510 TEST_SVE(sve2_floating_point_multiply_add_long_vector) {
19511 uint16_t zn_inputs[] = {Float16ToRawbits(Float16(1000)),
19512 Float16ToRawbits(Float16(2000)),
19513 Float16ToRawbits(Float16(0.5)),
19514 Float16ToRawbits(Float16(-0.5)),
19515 Float16ToRawbits(Float16(14)),
19516 Float16ToRawbits(Float16(-14)),
19517 Float16ToRawbits(kFP16PositiveInfinity),
19518 Float16ToRawbits(kFP16NegativeInfinity)};
19519
19520 uint16_t zm_inputs[] = {Float16ToRawbits(Float16(10)),
19521 Float16ToRawbits(Float16(-10)),
19522 Float16ToRawbits(Float16(10)),
19523 Float16ToRawbits(Float16(-10)),
19524 Float16ToRawbits(Float16(10)),
19525 Float16ToRawbits(Float16(-10)),
19526 Float16ToRawbits(Float16(10)),
19527 Float16ToRawbits(Float16(-10))};
19528
19529 uint32_t za_inputs[] = {FloatToRawbits(1.0f),
19530 FloatToRawbits(-1.0f),
19531 FloatToRawbits(1.0f),
19532 FloatToRawbits(-1.0f)};
19533
19534 uint32_t fmlalb_zd_expected[] = {0xc69c3e00, // -19999
19535 0x40800000, // 4
19536 0x430d0000, // 141
19537 FloatToRawbits(kFP32PositiveInfinity)};
19538
19539 uint32_t fmlalt_zd_expected[] = {0x461c4400, // 10001
19540 0x40800000, // 4
19541 0x430d0000, // 141
19542 FloatToRawbits(kFP32PositiveInfinity)};
19543
19544 uint32_t fmlslb_zd_expected[] = {0x469c4200, // 20001
19545 0xc0c00000, // -6
19546 0xc30b0000, // -139
19547 FloatToRawbits(kFP32NegativeInfinity)};
19548
19549 uint32_t fmlslt_zd_expected[] = {0xc61c3c00, // -9999
19550 0xc0c00000, // -6
19551 0xc30b0000, // -139
19552 FloatToRawbits(kFP32NegativeInfinity)};
19553
19554 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19555 START();
19556
19557 InsrHelper(&masm, z31.VnH(), zn_inputs);
19558 InsrHelper(&masm, z30.VnH(), zm_inputs);
19559 InsrHelper(&masm, z29.VnS(), za_inputs);
19560
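// Fmlalb/Fmlalt multiply the even (bottom) or odd (top) half-precision
// elements, widen the products to single precision and add them to the
// accumulator; Fmlslb/Fmlslt subtract the products instead.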
19561 __ Mov(z0, z29);
19562 __ Fmlalb(z0.VnS(), z0.VnS(), z31.VnH(), z30.VnH());
19563
19564 __ Mov(z1, z29);
19565 __ Fmlalt(z1.VnS(), z1.VnS(), z31.VnH(), z30.VnH());
19566
19567 __ Mov(z2, z29);
19568 __ Fmlslb(z2.VnS(), z2.VnS(), z31.VnH(), z30.VnH());
19569
19570 __ Mov(z3, z29);
19571 __ Fmlslt(z3.VnS(), z3.VnS(), z31.VnH(), z30.VnH());
19572
19573 __ Fmlalb(z4.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19574 __ Fmlalt(z5.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19575 __ Fmlslb(z6.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19576 __ Fmlslt(z7.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19577
19578 END();
19579
19580 if (CAN_RUN()) {
19581 RUN();
19582
19583 ASSERT_EQUAL_SVE(fmlalb_zd_expected, z0.VnS());
19584 ASSERT_EQUAL_SVE(fmlalt_zd_expected, z1.VnS());
19585 ASSERT_EQUAL_SVE(fmlslb_zd_expected, z2.VnS());
19586 ASSERT_EQUAL_SVE(fmlslt_zd_expected, z3.VnS());
19587
19588 ASSERT_EQUAL_SVE(z4, z0);
19589 ASSERT_EQUAL_SVE(z5, z1);
19590 ASSERT_EQUAL_SVE(z6, z2);
19591 ASSERT_EQUAL_SVE(z7, z3);
19592 }
19593 }
19594
19595 TEST_SVE(sve2_flogb_simple) {
19596 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19597
19598 START();
19599 __ Ptrue(p0.VnB());
19600 __ Index(z0.VnS(), -4, 1);
19601 __ Mov(z1.VnS(), 0);
19602 __ Mov(z2.VnD(), 0x000fffffffffffff);
19603 __ Mov(z3.VnD(), 0x0010000000000000);
19604 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
19605 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
19606 __ Fdiv(z1.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
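// Flogb writes the signed base-2 exponent of each element as an integer;
// zero and NaN inputs give the most negative integer, infinities the most
// positive.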
19607 __ Flogb(z0.VnS(), p0.Merging(), z0.VnS());
19608 __ Flogb(z1.VnS(), p0.Merging(), z1.VnS());
19609 __ Flogb(z2.VnD(), p0.Merging(), z2.VnD());
19610 __ Flogb(z3.VnD(), p0.Merging(), z3.VnD());
19611 END();
19612
19613 if (CAN_RUN()) {
19614 RUN();
19615 uint64_t expected_z0[] = {0x0000000200000002,
19616 0x0000000200000002,
19617 0x0000000100000001,
19618 0x0000000080000000,
19619 0x0000000000000001,
19620 0x0000000100000002};
19621 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
19622
19623 uint64_t expected_z1[] = {0x7fffffff7fffffff,
19624 0x7fffffff7fffffff,
19625 0x7fffffff7fffffff,
19626 0x7fffffff80000000,
19627 0x7fffffff7fffffff,
19628 0x7fffffff7fffffff};
19629 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
19630
19631 uint64_t expected_z2[] = {0xfffffffffffffc01,
19632 0xfffffffffffffc01,
19633 0xfffffffffffffc01,
19634 0xfffffffffffffc01};
19635 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
19636
19637 uint64_t expected_z3[] = {0xfffffffffffffc02,
19638 0xfffffffffffffc02,
19639 0xfffffffffffffc02,
19640 0xfffffffffffffc02};
19641 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
19642 }
19643 }
19644
19645 TEST_SVE(neon_matmul) {
19646 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
19647 CPUFeatures::kSVEI8MM,
19648 CPUFeatures::kNEON,
19649 CPUFeatures::kI8MM);
19650
19651 // Test Neon integer matrix multiply against SVE.
19652 START();
19653 __ Movi(v0.V2D(), 0xffeeddccbbaa9988, 0x77665544332211);
19654 __ Movi(v1.V2D(), 0xaa5555aa55555555, 0x55aaaa55aaaaaa);
19655 __ Movi(v2.V2D(), 0, 0);
19656 __ Movi(v3.V2D(), 0, 0);
19657 __ Movi(v4.V2D(), 0, 0);
19658 __ Movi(v5.V2D(), 0, 0);
19659 __ Movi(v6.V2D(), 0, 0);
19660 __ Movi(v7.V2D(), 0, 0);
19661
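// Each *mmla instruction multiplies a 2x8 matrix of byte elements by an 8x2
// matrix from each 128-bit segment, accumulating into a 2x2 matrix of 32-bit
// elements: Smmla is signed by signed, Ummla unsigned by unsigned, and Usmmla
// unsigned by signed.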
19662 __ Smmla(v2.V4S(), v0.V16B(), v1.V16B());
19663 __ Smmla(z3.VnS(), z3.VnS(), z0.VnB(), z1.VnB());
19664 __ Ummla(v4.V4S(), v0.V16B(), v1.V16B());
19665 __ Ummla(z5.VnS(), z5.VnS(), z0.VnB(), z1.VnB());
19666 __ Usmmla(v6.V4S(), v0.V16B(), v1.V16B());
19667 __ Usmmla(z7.VnS(), z7.VnS(), z0.VnB(), z1.VnB());
19668 END();
19669
19670 if (CAN_RUN()) {
19671 RUN();
19672
19673 // The inputs as Z registers are zero beyond the least-significant 128 bits,
19674 // so the Neon and SVE results should be equal for any VL.
19675 ASSERT_EQUAL_SVE(z3, z2);
19676 ASSERT_EQUAL_SVE(z5, z4);
19677 ASSERT_EQUAL_SVE(z7, z6);
19678 }
19679 }
19680
19681 TEST_SVE(sudot_usdot) {
19682 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
19683 CPUFeatures::kSVE2,
19684 CPUFeatures::kSVEI8MM);
19685
19686 START();
19687 __ Ptrue(p0.VnB());
19688 __ Index(z0.VnS(), -424242, 77777);
19689 __ Index(z1.VnB(), 127, -1);
19690 __ Sqabs(z1.VnB(), p0.Merging(), z1.VnB());
19691 __ Index(z2.VnB(), 0, 1);
19692 __ Sqabs(z2.VnB(), p0.Merging(), z2.VnB());
19693 __ Index(z3.VnB(), -128, 1);
19694 __ Mov(z4.VnD(), 0);
19695
19696 // Test Usdot against Udot/Sdot over the range of inputs where they should be
19697 // equal.
19698 __ Usdot(z5.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
19699 __ Udot(z6.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
19700 __ Usdot(z7.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
19701 __ Sdot(z8.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
19702
19703 // Construct values which, when interpreted correctly as signed/unsigned,
19704 // should give a zero result for dot product.
19705 __ Mov(z10.VnS(), 0x8101ff40); // [-127, 1, -1, 64] as signed bytes.
19706 __ Mov(z11.VnS(), 0x02fe8002); // [2, 254, 128, 2] as unsigned bytes.
19707 __ Usdot(z12.VnS(), z4.VnS(), z11.VnB(), z10.VnB());
19708 __ Usdot(z13.VnS(), z4.VnS(), z10.VnB(), z11.VnB());
19709
19710 // Construct a vector with duplicated values across segments. This allows
19711 // testing indexed dot product against the already tested variant.
19712 __ Mov(z14.VnS(), 1);
19713 __ Mul(z15.VnS(), z14.VnS(), z3.VnS(), 1);
19714
19715 __ Usdot(z16.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
19716 __ Usdot(z17.VnS(), z0.VnS(), z3.VnB(), z15.VnB());
19717 __ Sudot(z18.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
19718 __ Usdot(z19.VnS(), z0.VnS(), z15.VnB(), z3.VnB());
19719 END();
19720
19721 if (CAN_RUN()) {
19722 RUN();
19723 ASSERT_EQUAL_SVE(z6, z5);
19724 ASSERT_EQUAL_SVE(z8, z7);
19725 ASSERT_EQUAL_SVE(z4, z12);
19726
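// Usdot(z13, ...) treats z10 as unsigned bytes [129, 1, 255, 64] and z11 as
// signed bytes [2, -2, -128, 2], giving
// 129 * 2 + 1 * (-2) + 255 * (-128) + 64 * 2 = -32256 = 0xffff8200 per lane.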
19727 uint64_t z13_expected[] = {0xffff8200ffff8200, 0xffff8200ffff8200};
19728 ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
19729
19730 ASSERT_EQUAL_SVE(z17, z16);
19731 ASSERT_EQUAL_SVE(z19, z18);
19732 }
19733 }
19734
19735 // Manually constructed simulator test to avoid creating a VL128 variant.
19736
19737 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
19738 void Testsve_fmatmul(Test* config) {
19739 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
19740
19741 // Only double-precision matrix multiply is tested here. Single-precision is
19742 // tested in the simulator tests using a generated sequence. The (templated)
19743 // code used in the simulator for both cases is the same, which is why the
19744 // tests here don't need to be comprehensive.
19745 START();
19746 Label vl_too_short;
19747 __ Rdvl(x0, 1);
19748 __ Cmp(x0, 32);
19749 __ B(lt, &vl_too_short); // Skip testing VL128.
19750
19751 __ Fdup(z0.VnD(), 1.0);
19752 __ Fdup(z1.VnD(), 2.0);
19753 __ Mov(z2.VnD(), 0);
19754
19755 // Build 2x2 identity matrix in z3.
19756 Label iden_loop;
19757 __ Lsr(x0, x0, 5);
19758 __ Bind(&iden_loop);
19759 __ Insr(z3.VnD(), d0);
19760 __ Insr(z3.VnD(), d2);
19761 __ Insr(z3.VnD(), d2);
19762 __ Insr(z3.VnD(), d0);
19763 __ Sub(x0, x0, 1);
19764 __ Cbnz(x0, &iden_loop);
19765
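// Fmmla treats each 256-bit segment as a 2x2 matrix of doubles: z1 becomes
// z1 + z0 * z0 (all 4.0), and multiplying by the identity matrix in z3 leaves
// it unchanged, so z2 should equal z1.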
19766 __ Fmmla(z1.VnD(), z1.VnD(), z0.VnD(), z0.VnD());
19767 __ Fmmla(z2.VnD(), z2.VnD(), z1.VnD(), z3.VnD());
19768
19769 __ Ptrue(p0.VnB());
19770 __ Index(z4.VnD(), -8, 3);
19771 __ Scvtf(z4.VnD(), p0.Merging(), z4.VnD());
19772 __ Mov(z5.VnD(), 0);
19773 __ Fmmla(z4.VnD(), z4.VnD(), z4.VnD(), z4.VnD());
19774 __ Fmmla(z5.VnD(), z5.VnD(), z4.VnD(), z3.VnD());
19775
19776 __ Bind(&vl_too_short);
19777 END();
19778
19779 if (CAN_RUN()) {
19780 RUN();
19781
19782 int vl = core.GetSVELaneCount(kBRegSize) * 8;
19783 if (vl >= 256) {
19784 ASSERT_EQUAL_SVE(z1, z2);
19785 ASSERT_EQUAL_SVE(z4, z5);
19786
19787 switch (vl) {
19788 case 256:
19789 case 384: {
19790 // All results are 4.0 (1 * 1 + 1 * 1 + 2). Results for elements beyond the
19791 // largest multiple of 256 bits within the VL should be zero.
19792 uint64_t z1_expected[] = {0x0000000000000000,
19793 0x0000000000000000,
19794 0x4010000000000000,
19795 0x4010000000000000,
19796 0x4010000000000000,
19797 0x4010000000000000};
19798 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
19799
19800 uint64_t z4_expected[] = {0x0000000000000000,
19801 0x0000000000000000,
19802 0x4018000000000000, // 6.0
19803 0x4022000000000000, // 9.0
19804 0x4018000000000000, // 6.0
19805 0x4054400000000000}; // 81.0
19806 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
19807 break;
19808 }
19809 case 2048: {
19810 uint64_t z1_expected[] =
19811 {0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19812 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19813 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19814 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19815 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19816 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19817 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19818 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19819 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19820 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19821 0x4010000000000000, 0x4010000000000000};
19822 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
19823
19824 uint64_t z4_expected[] = {
19825 0x40cb690000000000, 0x40c9728000000000, 0x40c9710000000000,
19826 0x40c79e8000000000, 0x40c41f0000000000, 0x40c2708000000000,
19827 0x40c26f0000000000, 0x40c0e48000000000, 0x40bbea0000000000,
19828 0x40b91d0000000000, 0x40b91a0000000000, 0x40b6950000000000,
19829 0x40b1d60000000000, 0x40af320000000000, 0x40af2c0000000000,
19830 0x40ab420000000000, 0x40a4040000000000, 0x40a0aa0000000000,
19831 0x40a0a40000000000, 0x409bb40000000000, 0x4091b80000000000,
19832 0x408a880000000000, 0x408a700000000000, 0x4083c80000000000,
19833 0x4071a00000000000, 0x4061a00000000000, 0x4061400000000000,
19834 0x4051400000000000, 0x4018000000000000, 0x4022000000000000,
19835 0x4018000000000000, 0x4054400000000000,
19836 };
19837 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
19838 break;
19839 }
19840 default:
19841 printf("WARNING: Some tests skipped due to unexpected VL.\n");
19842 break;
19843 }
19844 }
19845 }
19846 }
19847 Test* test_sve_fmatmul_list[] =
19848 {Test::MakeSVETest(256, "AARCH64_ASM_sve_fmatmul_vl256", &Testsve_fmatmul),
19849 Test::MakeSVETest(384, "AARCH64_ASM_sve_fmatmul_vl384", &Testsve_fmatmul),
19850 Test::MakeSVETest(2048,
19851 "AARCH64_ASM_sve_fmatmul_vl2048",
19852 &Testsve_fmatmul)};
19853
19854 void Testsve_ld1ro(Test* config) {
19855 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
19856 START();
19857
19858 int data_size = (kQRegSizeInBytes + 128) * 4;
19859 uint8_t* data = new uint8_t[data_size];
19860 for (int i = 0; i < data_size; i++) {
19861 data[i] = i & 0xff;
19862 }
19863
19864 // Set the base to just past half-way through the buffer so we can use
19865 // negative indices.
19866 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[7 + data_size / 2]));
19867
19868 __ Index(z0.VnB(), 0, 1);
19869 __ Ptrue(p0.VnB());
19870 __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
19871 __ Pfalse(p1.VnB());
19872 __ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
19873 __ Ptrue(p2.VnB());
19874
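// Each Ld1ro* loads a 256-bit block and replicates it to every 256-bit
// segment of the vector. Each pair below uses the immediate and the
// register-offset form of the same address, so the results should match.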
19875 __ Mov(x1, -32);
19876 __ Ld1rob(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, -32));
19877 __ Ld1rob(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
19878
19879 __ Mov(x1, 64 / 2);
19880 __ Ld1roh(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, 64));
19881 __ Ld1roh(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
19882
19883 __ Mov(x1, -96 / 4);
19884 __ Ld1row(z4.VnS(), p2.Zeroing(), SVEMemOperand(x0, -96));
19885 __ Ld1row(z5.VnS(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
19886
19887 __ Mov(x1, 128 / 8);
19888 __ Ld1rod(z6.VnD(), p2.Zeroing(), SVEMemOperand(x0, 128));
19889 __ Ld1rod(z7.VnD(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
19890
19891 // Check that all 256-bit segments match by rotating the vector by one
19892 // segment, eoring, and orring across the vector.
19893 __ Dup(z11.VnQ(), z0.VnQ(), 2);
19894 __ Mov(z8, z0);
19895 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
19896 __ Eor(z8.VnB(), z8.VnB(), z0.VnB());
19897 __ Orv(b9, p2, z8.VnB());
19898
19899 __ Mov(z8, z2);
19900 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
19901 __ Eor(z8.VnB(), z8.VnB(), z2.VnB());
19902 __ Orv(b8, p2, z8.VnB());
19903 __ Orr(z9, z9, z8);
19904
19905 __ Mov(z8, z4);
19906 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
19907 __ Eor(z8.VnB(), z8.VnB(), z4.VnB());
19908 __ Orv(b8, p2, z8.VnB());
19909 __ Orr(z9, z9, z8);
19910
19911 __ Mov(z8, z6);
19912 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
19913 __ Eor(z8.VnB(), z8.VnB(), z6.VnB());
19914 __ Orv(b8, p2, z8.VnB());
19915 __ Orr(z9, z9, z8);
19916
19917 END();
19918
19919 if (CAN_RUN()) {
19920 RUN();
19921
19922 int vl = core.GetSVELaneCount(kBRegSize) * 8;
19923 if (vl >= 256) {
19924 ASSERT_EQUAL_SVE(z0, z1);
19925 ASSERT_EQUAL_SVE(z2, z3);
19926 ASSERT_EQUAL_SVE(z4, z5);
19927 ASSERT_EQUAL_SVE(z6, z7);
19928
19929 switch (vl) {
19930 case 256:
19931 case 2048: {
19932 // Check the result of the rotate/eor sequence.
19933 uint64_t expected_z9[] = {0, 0};
19934 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
19935 break;
19936 }
19937 case 384: {
19938 // For a VL that isn't a multiple of 256, the top 128 bits must be zero,
19939 // which breaks the rotate/eor sequence. Check the results explicitly.
19940 uint64_t z0_expected[] = {0x0000000000000000,
19941 0x0000000000000000,
19942 0x0000000000000000,
19943 0x0000000000000000,
19944 0x0000000000000000,
19945 0x000d000b00090007};
19946 uint64_t z2_expected[] = {0x0000000000000000,
19947 0x0000000000000000,
19948 0x868584838281807f,
19949 0x7e7d7c7b7a797877,
19950 0x767574737271706f,
19951 0x6e6d6c6b6a696867};
19952 uint64_t z4_expected[] = {0x0000000000000000,
19953 0x0000000000000000,
19954 0xe6e5e4e3e2e1e0df,
19955 0xdedddcdbdad9d8d7,
19956 0xd6d5d4d3d2d1d0cf,
19957 0xcecdcccbcac9c8c7};
19958 uint64_t z6_expected[] = {0x0000000000000000,
19959 0x0000000000000000,
19960 0xc6c5c4c3c2c1c0bf,
19961 0xbebdbcbbbab9b8b7,
19962 0xb6b5b4b3b2b1b0af,
19963 0xaeadacabaaa9a8a7};
19964 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
19965 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
19966 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
19967 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
19968 break;
19969 }
19970 default:
19971 printf("WARNING: Some tests skipped due to unexpected VL.\n");
19972 break;
19973 }
19974 }
19975 }
19976 }
19977 Test* test_sve_ld1ro_list[] =
19978 {Test::MakeSVETest(256, "AARCH64_ASM_sve_ld1ro_vl256", &Testsve_ld1ro),
19979 Test::MakeSVETest(384, "AARCH64_ASM_sve_ld1ro_vl384", &Testsve_ld1ro),
19980 Test::MakeSVETest(2048, "AARCH64_ASM_sve_ld1ro_vl2048", &Testsve_ld1ro)};
19981 #endif
19982
19983 } // namespace aarch64
19984 } // namespace vixl
19985