// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <sys/mman.h>
#include <unistd.h>

#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>

#include "test-runner.h"
#include "test-utils.h"
#include "aarch64/test-utils-aarch64.h"

#include "aarch64/cpu-aarch64.h"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#include "aarch64/simulator-aarch64.h"
#include "test-assembler-aarch64.h"

#define TEST_SVE(name) TEST_SVE_INNER("ASM", name)

namespace vixl {
namespace aarch64 {

// Conveniently initialise P registers with scalar bit patterns. The destination
// lane size is ignored. This is optimised for call-site clarity, not generated
// code quality.
//
// Usage:
//
//    Initialise(&masm, p0, 0x1234);  // Sets p0 = 0b'0001'0010'0011'0100
void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value3,
                uint64_t value2,
                uint64_t value1,
                uint64_t value0) {
  // Generate a literal pool, as in the array form.
  UseScratchRegisterScope temps(masm);
  Register temp = temps.AcquireX();
  Label data;
  Label done;

  masm->Adr(temp, &data);
  masm->Ldr(pd, SVEMemOperand(temp));
  masm->B(&done);
  {
    ExactAssemblyScope total(masm, kPRegMaxSizeInBytes);
    masm->bind(&data);
    masm->dc64(value0);
    masm->dc64(value1);
    masm->dc64(value2);
    masm->dc64(value3);
  }
  masm->Bind(&done);
}
void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value2,
                uint64_t value1,
                uint64_t value0) {
  Initialise(masm, pd, 0, value2, value1, value0);
}
void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value1,
                uint64_t value0) {
  Initialise(masm, pd, 0, 0, value1, value0);
}
void Initialise(MacroAssembler* masm, const PRegister& pd, uint64_t value0) {
  Initialise(masm, pd, 0, 0, 0, value0);
}
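
// A usage sketch for the multi-word forms above (illustrative only, not used
// by the tests): `value0` supplies the least-significant 64 bits of the
// predicate, `value1` the next 64 bits, and so on.
//
//    Initialise(&masm, p0, 0x1, 0x2);
//    // P register bits [63:0] = 0x2, bits [127:64] = 0x1, higher bits zero.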

// Conveniently initialise P registers by lane. This is optimised for call-site
// clarity, not generated code quality.
//
// Usage:
//
//     int values[] = { 0x0, 0x1, 0x2 };
//     Initialise(&masm, p0.VnS(), values);  // Sets p0 = 0b'0000'0001'0010
//
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane. Unspecified lanes are set to 0 (inactive).
//
// Each element of the `values` array is mapped onto a lane in `pd`. The
// architecture only respects the lowest bit, and writes zero to the upper
// bits, but other (encodable) values can be specified if required by the test.
template <typename T, size_t N>
void Initialise(MacroAssembler* masm,
                const PRegisterWithLaneSize& pd,
                const T (&values)[N]) {
  // Turn the array into 64-bit chunks.
  uint64_t chunks[4] = {0, 0, 0, 0};
  VIXL_STATIC_ASSERT(sizeof(chunks) == kPRegMaxSizeInBytes);

  int p_bits_per_lane = pd.GetLaneSizeInBits() / kZRegBitsPerPRegBit;
  VIXL_ASSERT((64 % p_bits_per_lane) == 0);
  VIXL_ASSERT((N * p_bits_per_lane) <= kPRegMaxSize);

  uint64_t p_lane_mask = GetUintMask(p_bits_per_lane);

  VIXL_STATIC_ASSERT(N <= kPRegMaxSize);
  size_t bit = 0;
  for (int n = static_cast<int>(N - 1); n >= 0; n--) {
    VIXL_ASSERT(bit < (sizeof(chunks) * kBitsPerByte));
    uint64_t value = values[n] & p_lane_mask;
    chunks[bit / 64] |= value << (bit % 64);
    bit += p_bits_per_lane;
  }

  Initialise(masm, pd, chunks[3], chunks[2], chunks[1], chunks[0]);
}
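
// For example (a worked sketch of the packing above, mirroring the p5 case in
// sve_test_infrastructure_p): with S-sized lanes, each lane occupies four
// predicate bits.
//
//    int values[] = {0xc, 0x7, 0x9, 0x6, 0xf};
//    Initialise(&masm, p5.VnS(), values);
//    // values[4] (0xf) fills predicate bits [3:0], values[3] (0x6) fills
//    // bits [7:4], and so on, so chunks[0] == 0xc796f and chunks[1..3] == 0.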

// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_z) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  __ Mov(x0, 0x0123456789abcdef);

  // Test basic `Insr` behaviour.
  __ Insr(z0.VnB(), 1);
  __ Insr(z0.VnB(), 2);
  __ Insr(z0.VnB(), x0);
  __ Insr(z0.VnB(), -42);
  __ Insr(z0.VnB(), 0);
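
  // Note that each `Insr` moves the existing lanes up by one index and
  // writes its operand to lane 0, so the most recent insertion ends up in
  // the lowest-numbered lane (as the assertions below rely on).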

  // Test array inputs.
  int z1_inputs[] = {3, 4, 5, -42, 0};
  InsrHelper(&masm, z1.VnH(), z1_inputs);

  // Test that sign-extension works as intended for various lane sizes.
  __ Dup(z2.VnD(), 0);            // Clear the register first.
  __ Insr(z2.VnB(), -42);         //                       0xd6
  __ Insr(z2.VnB(), 0xfe);        //                       0xfe
  __ Insr(z2.VnH(), -42);         //                     0xffd6
  __ Insr(z2.VnH(), 0xfedc);      //                     0xfedc
  __ Insr(z2.VnS(), -42);         //                 0xffffffd6
  __ Insr(z2.VnS(), 0xfedcba98);  //                 0xfedcba98
  // Use another register for VnD(), so we can support 128-bit Z registers.
  __ Insr(z3.VnD(), -42);                 // 0xffffffffffffffd6
  __ Insr(z3.VnD(), 0xfedcba9876543210);  // 0xfedcba9876543210

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that array checks work properly on a register initialised
    // lane-by-lane.
    int z0_inputs_b[] = {0x01, 0x02, 0xef, 0xd6, 0x00};
    ASSERT_EQUAL_SVE(z0_inputs_b, z0.VnB());

    // Test that lane-by-lane checks work properly on a register initialised
    // by array.
    for (size_t i = 0; i < ArrayLength(z1_inputs); i++) {
      // The rightmost (highest-indexed) array element maps to the
      // lowest-numbered lane.
      int lane = static_cast<int>(ArrayLength(z1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(z1_inputs[i], z1.VnH(), lane);
    }

    uint64_t z2_inputs_d[] = {0x0000d6feffd6fedc, 0xffffffd6fedcba98};
    ASSERT_EQUAL_SVE(z2_inputs_d, z2.VnD());
    uint64_t z3_inputs_d[] = {0xffffffffffffffd6, 0xfedcba9876543210};
    ASSERT_EQUAL_SVE(z3_inputs_d, z3.VnD());
  }
}

// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_p) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // Simple cases: move boolean (0 or 1) values.

  int p0_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
  Initialise(&masm, p0.VnB(), p0_inputs);

  int p1_inputs[] = {1, 0, 1, 1, 0, 1, 1, 1};
  Initialise(&masm, p1.VnH(), p1_inputs);

  int p2_inputs[] = {1, 1, 0, 1};
  Initialise(&masm, p2.VnS(), p2_inputs);

  int p3_inputs[] = {0, 1};
  Initialise(&masm, p3.VnD(), p3_inputs);

  // Advanced cases: move numeric value into architecturally-ignored bits.

  // B-sized lanes get one bit in a P register, so there are no ignored bits.

  // H-sized lanes get two bits in a P register.
  int p4_inputs[] = {0x3, 0x2, 0x1, 0x0, 0x1, 0x2, 0x3};
  Initialise(&masm, p4.VnH(), p4_inputs);

  // S-sized lanes get four bits in a P register.
  int p5_inputs[] = {0xc, 0x7, 0x9, 0x6, 0xf};
  Initialise(&masm, p5.VnS(), p5_inputs);

  // D-sized lanes get eight bits in a P register.
  int p6_inputs[] = {0x81, 0xcc, 0x55};
  Initialise(&masm, p6.VnD(), p6_inputs);

  // The largest possible P register has 32 bytes.
  int p7_inputs[] = {0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                     0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
                     0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f};
  Initialise(&masm, p7.VnD(), p7_inputs);

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that lane-by-lane checks work properly. The rightmost
    // (highest-indexed) array element maps to the lowest-numbered lane.
    for (size_t i = 0; i < ArrayLength(p0_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p0_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p0_inputs[i], p0.VnB(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p1_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p1_inputs[i], p1.VnH(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p2_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p2_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p2_inputs[i], p2.VnS(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p3_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p3_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p3_inputs[i], p3.VnD(), lane);
    }

    // Test that array checks work properly on predicates initialised with a
    // possibly-different lane size.
    // 0b...11'10'01'00'01'10'11
    int p4_expected[] = {0x39, 0x1b};
    ASSERT_EQUAL_SVE(p4_expected, p4.VnD());

    ASSERT_EQUAL_SVE(p5_inputs, p5.VnS());

    // 0b...10000001'11001100'01010101
    int p6_expected[] = {2, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p6_expected, p6.VnH());

    // 0b...10011100'10011101'10011110'10011111
    int p7_expected[] = {1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
                         1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
  }
}

// Test that writes to V registers clear the high bits of the corresponding Z
// register.
TEST_SVE(sve_v_write_clear) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON,
                          CPUFeatures::kFP,
                          CPUFeatures::kSVE);
  START();

  // The Simulator has two mechanisms for writing V registers:
  //  - Write*Register, calling through to SimRegisterBase::Write.
  //  - LogicVRegister::ClearForWrite followed by one or more lane updates.
  // Try to cover both variants.

  // Prepare some known inputs.
  uint8_t data[kQRegSizeInBytes];
  for (size_t i = 0; i < kQRegSizeInBytes; i++) {
    data[i] = 42 + i;
  }
  __ Mov(x10, reinterpret_cast<uintptr_t>(data));
  __ Fmov(d30, 42.0);

  // Use Index to label the lane indices, so failures are easy to detect and
  // diagnose.
  __ Index(z0.VnB(), 0, 1);
  __ Index(z1.VnB(), 0, 1);
  __ Index(z2.VnB(), 0, 1);
  __ Index(z3.VnB(), 0, 1);
  __ Index(z4.VnB(), 0, 1);

  __ Index(z10.VnB(), 0, -1);
  __ Index(z11.VnB(), 0, -1);
  __ Index(z12.VnB(), 0, -1);
  __ Index(z13.VnB(), 0, -1);
  __ Index(z14.VnB(), 0, -1);

  // Instructions using Write*Register (and SimRegisterBase::Write).
  __ Ldr(b0, MemOperand(x10));
  __ Fcvt(h1, d30);
  __ Fmov(s2, 1.5f);
  __ Fmov(d3, d30);
  __ Ldr(q4, MemOperand(x10));

  // Instructions using LogicVRegister::ClearForWrite.
  // These also (incidentally) test that across-lane instructions correctly
  // ignore the high-order Z register lanes.
  __ Sminv(b10, v10.V16B());
  __ Addv(h11, v11.V4H());
  __ Saddlv(s12, v12.V8H());
  __ Dup(v13.V8B(), b13, kDRegSizeInBytes);
  __ Uaddl(v14.V8H(), v14.V8B(), v14.V8B());

  END();

  if (CAN_RUN()) {
    RUN();

    // Check the Q part first.
    ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000002a, v0);
    ASSERT_EQUAL_128(0x0000000000000000, 0x0000000000005140, v1);  // 42.0 (f16)
    ASSERT_EQUAL_128(0x0000000000000000, 0x000000003fc00000, v2);  // 1.5 (f32)
    ASSERT_EQUAL_128(0x0000000000000000, 0x4045000000000000, v3);  // 42.0 (f64)
    ASSERT_EQUAL_128(0x3938373635343332, 0x31302f2e2d2c2b2a, v4);
    ASSERT_EQUAL_128(0x0000000000000000, 0x00000000000000f1, v10);  // -15
    //  0xf9fa + 0xfbfc + 0xfdfe + 0xff00 -> 0xf2f4
    ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000f2f4, v11);
    //  0xfffff1f2 + 0xfffff3f4 + ... + 0xfffffdfe + 0xffffff00 -> 0xffffc6c8
    ASSERT_EQUAL_128(0x0000000000000000, 0x00000000ffffc6c8, v12);
    ASSERT_EQUAL_128(0x0000000000000000, 0xf8f8f8f8f8f8f8f8, v13);  // [-8] x 8
    //    [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
    //  + [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
    // -> [0x01f2, 0x01f4, 0x01f6, 0x01f8, 0x01fa, 0x01fc, 0x01fe, 0x0000]
    ASSERT_EQUAL_128(0x01f201f401f601f8, 0x01fa01fc01fe0000, v14);

    // Check that the upper lanes are all clear.
    for (int i = kQRegSizeInBytes; i < core.GetSVELaneCount(kBRegSize); i++) {
      ASSERT_EQUAL_SVE_LANE(0x00, z0.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z1.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z2.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z3.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z4.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z10.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z11.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z12.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z13.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z14.VnB(), i);
    }
  }
}

static void MlaMlsHelper(Test* config, unsigned lane_size_in_bits) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int zd_inputs[] = {0xbb, 0xcc, 0xdd, 0xee};
  int za_inputs[] = {-39, 1, -3, 2};
  int zn_inputs[] = {-5, -20, 9, 8};
  int zm_inputs[] = {9, -5, 4, 5};

  ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
  ZRegister za = z1.WithLaneSize(lane_size_in_bits);
  ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
  ZRegister zm = z3.WithLaneSize(lane_size_in_bits);

  // TODO: Use a simple `Dup` once it accepts arbitrary immediates.
  InsrHelper(&masm, zd, zd_inputs);
  InsrHelper(&masm, za, za_inputs);
  InsrHelper(&masm, zn, zn_inputs);
  InsrHelper(&masm, zm, zm_inputs);

  int p0_inputs[] = {1, 1, 0, 1};
  int p1_inputs[] = {1, 0, 1, 1};
  int p2_inputs[] = {0, 1, 1, 1};
  int p3_inputs[] = {1, 1, 1, 0};

  Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), p0_inputs);
  Initialise(&masm, p1.WithLaneSize(lane_size_in_bits), p1_inputs);
  Initialise(&masm, p2.WithLaneSize(lane_size_in_bits), p2_inputs);
  Initialise(&masm, p3.WithLaneSize(lane_size_in_bits), p3_inputs);

  // The Mla macro automatically selects between mla, mad and movprfx + mla
  // based on what registers are aliased.
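  // For example, a rough sketch of the selection (assumed here; this test
  // checks results, not the exact encoding chosen):
  //   Mla(zd, pg, zd, zn, zm)  ->  mla zd, pg/m, zn, zm
  //   Mla(zd, pg, za, zd, zm)  ->  mad zd, pg/m, zm, za
  //   Mla(zd, pg, za, zn, zm)  ->  movprfx zd, pg/m, za
  //                                mla zd, pg/m, zn, zm  (for a distinct zd)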
  ZRegister mla_da_result = z10.WithLaneSize(lane_size_in_bits);
  ZRegister mla_dn_result = z11.WithLaneSize(lane_size_in_bits);
  ZRegister mla_dm_result = z12.WithLaneSize(lane_size_in_bits);
  ZRegister mla_d_result = z13.WithLaneSize(lane_size_in_bits);

  __ Mov(mla_da_result, za);
  __ Mla(mla_da_result, p0.Merging(), mla_da_result, zn, zm);

  __ Mov(mla_dn_result, zn);
  __ Mla(mla_dn_result, p1.Merging(), za, mla_dn_result, zm);

  __ Mov(mla_dm_result, zm);
  __ Mla(mla_dm_result, p2.Merging(), za, zn, mla_dm_result);

  __ Mov(mla_d_result, zd);
  __ Mla(mla_d_result, p3.Merging(), za, zn, zm);

  // The Mls macro automatically selects between mls, msb and movprfx + mls
  // based on what registers are aliased.
  ZRegister mls_da_result = z20.WithLaneSize(lane_size_in_bits);
  ZRegister mls_dn_result = z21.WithLaneSize(lane_size_in_bits);
  ZRegister mls_dm_result = z22.WithLaneSize(lane_size_in_bits);
  ZRegister mls_d_result = z23.WithLaneSize(lane_size_in_bits);

  __ Mov(mls_da_result, za);
  __ Mls(mls_da_result, p0.Merging(), mls_da_result, zn, zm);

  __ Mov(mls_dn_result, zn);
  __ Mls(mls_dn_result, p1.Merging(), za, mls_dn_result, zm);

  __ Mov(mls_dm_result, zm);
  __ Mls(mls_dm_result, p2.Merging(), za, zn, mls_dm_result);

  __ Mov(mls_d_result, zd);
  __ Mls(mls_d_result, p3.Merging(), za, zn, zm);

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
    ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits));
    ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits));

    int mla[] = {-84, 101, 33, 42};
    int mls[] = {6, -99, -39, -38};
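
    // Each active lane computes mla[i] = za[i] + (zn[i] * zm[i]) and
    // mls[i] = za[i] - (zn[i] * zm[i]); for example, mla[0] is
    // -39 + (-5 * 9) = -84, and mls[0] is -39 - (-5 * 9) = 6.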

    int mla_da_expected[] = {mla[0], mla[1], za_inputs[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_da_expected, mla_da_result);

    int mla_dn_expected[] = {mla[0], zn_inputs[1], mla[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_dn_expected, mla_dn_result);

    int mla_dm_expected[] = {zm_inputs[0], mla[1], mla[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_dm_expected, mla_dm_result);

    int mla_d_expected[] = {mla[0], mla[1], mla[2], zd_inputs[3]};
    ASSERT_EQUAL_SVE(mla_d_expected, mla_d_result);

    int mls_da_expected[] = {mls[0], mls[1], za_inputs[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_da_expected, mls_da_result);

    int mls_dn_expected[] = {mls[0], zn_inputs[1], mls[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_dn_expected, mls_dn_result);

    int mls_dm_expected[] = {zm_inputs[0], mls[1], mls[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_dm_expected, mls_dm_result);

    int mls_d_expected[] = {mls[0], mls[1], mls[2], zd_inputs[3]};
    ASSERT_EQUAL_SVE(mls_d_expected, mls_d_result);
  }
}

TEST_SVE(sve_mla_mls_b) { MlaMlsHelper(config, kBRegSize); }
TEST_SVE(sve_mla_mls_h) { MlaMlsHelper(config, kHRegSize); }
TEST_SVE(sve_mla_mls_s) { MlaMlsHelper(config, kSRegSize); }
TEST_SVE(sve_mla_mls_d) { MlaMlsHelper(config, kDRegSize); }

TEST_SVE(sve_bitwise_unpredicate_logical) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  uint64_t z8_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
  InsrHelper(&masm, z8.VnD(), z8_inputs);
  uint64_t z15_inputs[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff};
  InsrHelper(&masm, z15.VnD(), z15_inputs);

  __ And(z1.VnD(), z8.VnD(), z15.VnD());
  __ Bic(z2.VnD(), z8.VnD(), z15.VnD());
  __ Eor(z3.VnD(), z8.VnD(), z15.VnD());
  __ Orr(z4.VnD(), z8.VnD(), z15.VnD());

  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z1_expected[] = {0xfedcaa8854540000, 0x0000454588aacdef};
    uint64_t z2_expected[] = {0x0000101022003210, 0x0123002201010000};
    uint64_t z3_expected[] = {0x01235476ab89fedc, 0xcdef98ba67453210};
    uint64_t z4_expected[] = {0xfffffefeffddfedc, 0xcdefddffefefffff};

    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
  }
}

TEST_SVE(sve_last_r) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());
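
  // `Lastb` extracts the last active element of a vector, and `Lasta` the
  // element after it. When no elements are active (as with p1, above),
  // `Lastb` reads the highest-numbered lane and `Lasta` wraps around to
  // lane 0.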

  __ Index(z0.VnB(), 0x10, 1);
  __ Lasta(x1, p1, z0.VnB());
  __ Lastb(x2, p1, z0.VnB());
  __ Lasta(x3, p2, z0.VnB());
  __ Lastb(x4, p2, z0.VnB());
  __ Lasta(x5, p3, z0.VnB());
  __ Lastb(x6, p3, z0.VnB());
  __ Lasta(x7, p4, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Lasta(x9, p1, z0.VnH());
  __ Lastb(x10, p3, z0.VnH());
  __ Lasta(x12, p4, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Lastb(x13, p1, z0.VnS());
  __ Lasta(x14, p2, z0.VnS());
  __ Lastb(x18, p4, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Lasta(x19, p1, z0.VnD());
  __ Lastb(x20, p3, z0.VnD());
  __ Lasta(x21, p3, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_64(0x0000000000000010, x1);
    ASSERT_EQUAL_64(0x0000000000000011, x3);
    ASSERT_EQUAL_64(0x0000000000000010, x4);
    ASSERT_EQUAL_64(0x0000000000000019, x5);
    ASSERT_EQUAL_64(0x0000000000000018, x6);
    ASSERT_EQUAL_64(0x0000000000000010, x7);
    ASSERT_EQUAL_64(0x0000000000001110, x9);
    ASSERT_EQUAL_64(0x0000000000001110, x12);
    ASSERT_EQUAL_64(0x0000000011111111, x14);
    ASSERT_EQUAL_64(0x1111111111111110, x19);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_64(0x000000000000001f, x2);
        ASSERT_EQUAL_64(0x0000000000001116, x10);
        ASSERT_EQUAL_64(0x0000000011111113, x13);
        ASSERT_EQUAL_64(0x0000000011111113, x18);
        ASSERT_EQUAL_64(0x1111111111111111, x20);
        ASSERT_EQUAL_64(0x1111111111111110, x21);
        break;
      case 384:
        ASSERT_EQUAL_64(0x000000000000003f, x2);
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        ASSERT_EQUAL_64(0x000000001111111b, x13);
        ASSERT_EQUAL_64(0x000000001111111b, x18);
        ASSERT_EQUAL_64(0x1111111111111112, x20);
        ASSERT_EQUAL_64(0x1111111111111113, x21);
        break;
      case 2048:
        ASSERT_EQUAL_64(0x000000000000000f, x2);
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        ASSERT_EQUAL_64(0x000000001111114f, x13);
        ASSERT_EQUAL_64(0x000000001111114f, x18);
        ASSERT_EQUAL_64(0x1111111111111112, x20);
        ASSERT_EQUAL_64(0x1111111111111113, x21);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_last_v) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Lasta(b1, p1, z0.VnB());
  __ Lastb(b2, p1, z0.VnB());
  __ Lasta(b3, p2, z0.VnB());
  __ Lastb(b4, p2, z0.VnB());
  __ Lasta(b5, p3, z0.VnB());
  __ Lastb(b6, p3, z0.VnB());
  __ Lasta(b7, p4, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Lasta(h9, p1, z0.VnH());
  __ Lastb(h10, p3, z0.VnH());
  __ Lasta(h12, p4, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Lastb(s13, p1, z0.VnS());
  __ Lasta(s14, p2, z0.VnS());
  __ Lastb(s18, p4, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Lasta(d19, p1, z0.VnD());
  __ Lastb(d20, p3, z0.VnD());
  __ Lasta(d21, p3, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_128(0, 0x0000000000000010, q1);
    ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
    ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
    ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
    ASSERT_EQUAL_128(0, 0x0000000000001110, q9);
    ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
    ASSERT_EQUAL_128(0, 0x0000000011111111, q14);
    ASSERT_EQUAL_128(0, 0x1111111111111110, q19);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_128(0, 0x000000000000001f, q2);
        ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
        ASSERT_EQUAL_128(0, 0x0000000011111113, q13);
        ASSERT_EQUAL_128(0, 0x0000000011111113, q18);
        ASSERT_EQUAL_128(0, 0x1111111111111111, q20);
        ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
        break;
      case 384:
        ASSERT_EQUAL_128(0, 0x000000000000003f, q2);
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        ASSERT_EQUAL_128(0, 0x000000001111111b, q13);
        ASSERT_EQUAL_128(0, 0x000000001111111b, q18);
        ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
        ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
        break;
      case 2048:
        ASSERT_EQUAL_128(0, 0x000000000000000f, q2);
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        ASSERT_EQUAL_128(0, 0x000000001111114f, q13);
        ASSERT_EQUAL_128(0, 0x000000001111114f, q18);
        ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
        ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_clast_r) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());
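
  // `Clasta` and `Clastb` behave like `Lasta` and `Lastb`, except that when
  // no elements are active they keep the scalar operand (truncated and
  // zero-extended to the lane size) instead of wrapping around.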

  __ Index(z0.VnB(), 0x10, 1);
  __ Mov(x1, -1);
  __ Mov(x2, -1);
  __ Clasta(x1, p1, x1, z0.VnB());
  __ Clastb(x2, p1, x2, z0.VnB());
  __ Clasta(x3, p2, x3, z0.VnB());
  __ Clastb(x4, p2, x4, z0.VnB());
  __ Clasta(x5, p3, x5, z0.VnB());
  __ Clastb(x6, p3, x6, z0.VnB());
  __ Clasta(x7, p4, x7, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Mov(x9, -1);
  __ Clasta(x9, p1, x9, z0.VnH());
  __ Clastb(x10, p3, x10, z0.VnH());
  __ Clasta(x12, p4, x12, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Mov(x13, -1);
  __ Clasta(x13, p1, x13, z0.VnS());
  __ Clastb(x14, p2, x14, z0.VnS());
  __ Clasta(x18, p4, x18, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Mov(x19, -1);
  __ Clasta(x19, p1, x19, z0.VnD());
  __ Clastb(x20, p2, x20, z0.VnD());
  __ Clasta(x21, p4, x21, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();
    ASSERT_EQUAL_64(0x00000000000000ff, x1);
    ASSERT_EQUAL_64(0x00000000000000ff, x2);
    ASSERT_EQUAL_64(0x0000000000000011, x3);
    ASSERT_EQUAL_64(0x0000000000000010, x4);
    ASSERT_EQUAL_64(0x0000000000000019, x5);
    ASSERT_EQUAL_64(0x0000000000000018, x6);
    ASSERT_EQUAL_64(0x0000000000000010, x7);
    ASSERT_EQUAL_64(0x000000000000ffff, x9);
    ASSERT_EQUAL_64(0x0000000000001110, x12);
    ASSERT_EQUAL_64(0x00000000ffffffff, x13);
    ASSERT_EQUAL_64(0x0000000011111110, x14);
    ASSERT_EQUAL_64(0x0000000011111110, x18);
    ASSERT_EQUAL_64(0xffffffffffffffff, x19);
    ASSERT_EQUAL_64(0x1111111111111110, x20);
    ASSERT_EQUAL_64(0x1111111111111110, x21);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_64(0x0000000000001116, x10);
        break;
      case 384:
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        break;
      case 2048:
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_clast_v) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Dup(z1.VnB(), -1);
  __ Dup(z2.VnB(), -1);
  __ Clasta(b1, p1, b1, z0.VnB());
  __ Clastb(b2, p1, b2, z0.VnB());
  __ Clasta(b3, p2, b3, z0.VnB());
  __ Clastb(b4, p2, b4, z0.VnB());
  __ Clasta(b5, p3, b5, z0.VnB());
  __ Clastb(b6, p3, b6, z0.VnB());
  __ Clasta(b7, p4, b7, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Dup(z9.VnB(), -1);
  __ Clasta(h9, p1, h9, z0.VnH());
  __ Clastb(h10, p3, h10, z0.VnH());
  __ Clasta(h12, p4, h12, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Dup(z13.VnB(), -1);
  __ Clasta(s13, p1, s13, z0.VnS());
  __ Clastb(s14, p2, s14, z0.VnS());
  __ Clasta(s18, p4, s18, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Dup(z19.VnB(), -1);
  __ Clasta(d19, p1, d19, z0.VnD());
  __ Clastb(d20, p2, d20, z0.VnD());
  __ Clasta(d21, p4, d21, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();
    ASSERT_EQUAL_128(0, 0x00000000000000ff, q1);
    ASSERT_EQUAL_128(0, 0x00000000000000ff, q2);
    ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
    ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
    ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
    ASSERT_EQUAL_128(0, 0x000000000000ffff, q9);
    ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
    ASSERT_EQUAL_128(0, 0x00000000ffffffff, q13);
    ASSERT_EQUAL_128(0, 0x0000000011111110, q14);
    ASSERT_EQUAL_128(0, 0x0000000011111110, q18);
    ASSERT_EQUAL_128(0, 0xffffffffffffffff, q19);
    ASSERT_EQUAL_128(0, 0x1111111111111110, q20);
    ASSERT_EQUAL_128(0, 0x1111111111111110, q21);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
        break;
      case 384:
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        break;
      case 2048:
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_clast_z) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Dup(z1.VnB(), 0xff);
  __ Dup(z2.VnB(), 0xff);
  __ Clasta(z1.VnB(), p1, z1.VnB(), z0.VnB());
  __ Clastb(z2.VnB(), p1, z2.VnB(), z0.VnB());
  __ Clasta(z3.VnB(), p2, z3.VnB(), z0.VnB());
  __ Clastb(z4.VnB(), p2, z4.VnB(), z0.VnB());
  __ Clasta(z5.VnB(), p3, z5.VnB(), z0.VnB());
  __ Clastb(z6.VnB(), p3, z6.VnB(), z0.VnB());
  __ Clasta(z7.VnB(), p4, z7.VnB(), z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Dup(z9.VnB(), 0xff);
  __ Clasta(z9.VnH(), p1, z9.VnH(), z0.VnH());
  __ Clastb(z10.VnH(), p3, z10.VnH(), z0.VnH());
  __ Clasta(z12.VnH(), p4, z12.VnH(), z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Dup(z13.VnB(), 0xff);
  __ Clasta(z13.VnS(), p1, z13.VnS(), z0.VnS());
  __ Clastb(z14.VnS(), p2, z14.VnS(), z0.VnS());
  __ Clasta(z16.VnS(), p4, z16.VnS(), z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Dup(z17.VnB(), 0xff);
  __ Clasta(z17.VnD(), p1, z17.VnD(), z0.VnD());
  __ Clastb(z18.VnD(), p2, z18.VnD(), z0.VnD());
  __ Clasta(z20.VnD(), p4, z20.VnD(), z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z1_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z2_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z3_expected[] = {0x1111111111111111, 0x1111111111111111};
    uint64_t z4_expected[] = {0x1010101010101010, 0x1010101010101010};
    uint64_t z5_expected[] = {0x1919191919191919, 0x1919191919191919};
    uint64_t z6_expected[] = {0x1818181818181818, 0x1818181818181818};
    uint64_t z7_expected[] = {0x1010101010101010, 0x1010101010101010};
    uint64_t z9_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z12_expected[] = {0x1110111011101110, 0x1110111011101110};
    uint64_t z13_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z14_expected[] = {0x1111111011111110, 0x1111111011111110};
    uint64_t z16_expected[] = {0x1111111011111110, 0x1111111011111110};
    uint64_t z17_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z18_expected[] = {0x1111111111111110, 0x1111111111111110};
    uint64_t z20_expected[] = {0x1111111111111110, 0x1111111111111110};

    uint64_t z10_expected_vl128[] = {0x1116111611161116, 0x1116111611161116};
    uint64_t z10_expected_vl_long[] = {0x1118111811181118, 0x1118111811181118};

    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
    ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
    ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
    ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
    ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
    ASSERT_EQUAL_SVE(z12_expected, z12.VnD());
    ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
    ASSERT_EQUAL_SVE(z14_expected, z14.VnD());
    ASSERT_EQUAL_SVE(z16_expected, z16.VnD());
    ASSERT_EQUAL_SVE(z17_expected, z17.VnD());
    ASSERT_EQUAL_SVE(z18_expected, z18.VnD());
    ASSERT_EQUAL_SVE(z20_expected, z20.VnD());

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_SVE(z10_expected_vl128, z10.VnD());
        break;
      case 384:
      case 2048:
        ASSERT_EQUAL_SVE(z10_expected_vl_long, z10.VnD());
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_compact) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
  __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
  __ Zip1(p4.VnD(), p0.VnD(), p1.VnD());
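
  // `Compact` copies the active elements of the source to the
  // lowest-numbered lanes of the destination, in order, and zeroes the
  // remaining lanes. Only S- and D-sized lanes are supported.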

  __ Index(z0.VnS(), 0x11111111, 0x11111111);
  __ Mov(q0, q0);
  __ Compact(z1.VnS(), p0, z0.VnS());
  __ Compact(z2.VnS(), p2, z0.VnS());
  __ Compact(z0.VnS(), p3, z0.VnS());

  __ Index(z3.VnD(), 0x1111111111111111, 0x1111111111111111);
  __ Mov(q3, q3);
  __ Compact(z4.VnD(), p0, z3.VnD());
  __ Compact(z5.VnD(), p1, z3.VnD());
  __ Compact(z6.VnD(), p4, z3.VnD());

  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z1_expected[] = {0x4444444433333333, 0x2222222211111111};
    uint64_t z2_expected[] = {0x0000000000000000, 0x3333333311111111};
    uint64_t z0_expected[] = {0x0000000000000000, 0x4444444422222222};
    uint64_t z4_expected[] = {0x2222222222222222, 0x1111111111111111};
    uint64_t z5_expected[] = {0x0000000000000000, 0x0000000000000000};
    uint64_t z6_expected[] = {0x0000000000000000, 0x1111111111111111};
    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
    ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
    ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
  }
}

TEST_SVE(sve_splice) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  int p2b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
  int p4b_inputs[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  int p5b_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0};
  int p6b_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0};
  Initialise(&masm, p2.VnB(), p2b_inputs);
  Initialise(&masm, p3.VnB(), p3b_inputs);
  Initialise(&masm, p4.VnB(), p4b_inputs);
  Initialise(&masm, p5.VnB(), p5b_inputs);
  Initialise(&masm, p6.VnB(), p6b_inputs);
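
  // `Splice` copies the segment of the first source from its first active
  // element through its last active element into the lowest-numbered lanes
  // of the destination, then fills the remaining lanes from the lowest lanes
  // of the second source.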

  __ Index(z30.VnB(), 1, 1);

  __ Index(z0.VnB(), -1, -1);
  __ Splice(z0.VnB(), p0, z0.VnB(), z30.VnB());
  __ Index(z1.VnB(), -1, -1);
  __ Splice(z1.VnB(), p1, z1.VnB(), z30.VnB());
  __ Index(z2.VnB(), -1, -1);
  __ Splice(z2.VnB(), p2, z2.VnB(), z30.VnB());
  __ Index(z3.VnB(), -1, -1);
  __ Splice(z3.VnB(), p3, z3.VnB(), z30.VnB());
  __ Index(z4.VnB(), -1, -1);
  __ Splice(z4.VnB(), p4, z4.VnB(), z30.VnB());
  __ Index(z5.VnB(), -1, -1);
  __ Splice(z5.VnB(), p5, z5.VnB(), z30.VnB());
  __ Index(z6.VnB(), -1, -1);
  __ Splice(z6.VnB(), p6, z6.VnB(), z30.VnB());

  int p2h_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0};
  int p3h_inputs[] = {0, 0, 1, 0, 0, 0, 1, 0};
  Initialise(&masm, p2.VnH(), p2h_inputs);
  Initialise(&masm, p3.VnH(), p3h_inputs);

  __ Index(z30.VnH(), 1, 1);
  __ Index(z29.VnH(), -1, -1);
  __ Splice(z7.VnH(), p2, z29.VnH(), z30.VnH());
  __ Splice(z8.VnH(), p3, z29.VnH(), z30.VnH());

  int p2s_inputs[] = {0, 0, 1, 0};
  int p3s_inputs[] = {1, 0, 1, 0};
  Initialise(&masm, p2.VnS(), p2s_inputs);
  Initialise(&masm, p3.VnS(), p3s_inputs);

  __ Index(z30.VnS(), 1, 1);
  __ Index(z29.VnS(), -1, -1);
  __ Splice(z9.VnS(), p2, z29.VnS(), z30.VnS());
  __ Splice(z10.VnS(), p3, z29.VnS(), z30.VnS());

  int p2d_inputs[] = {0, 1};
  int p3d_inputs[] = {1, 0};
  Initialise(&masm, p2.VnD(), p2d_inputs);
  Initialise(&masm, p3.VnD(), p3d_inputs);

  __ Index(z30.VnD(), 1, 1);
  __ Index(z29.VnD(), -1, -1);
  __ Splice(z11.VnD(), p2, z29.VnD(), z30.VnD());
  __ Splice(z30.VnD(), p3, z29.VnD(), z30.VnD());

  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z0_expected[] = {0xf0f1f2f3f4f5f6f7, 0xf8f9fafbfcfdfeff};
    uint64_t z1_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
    uint64_t z2_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201ff};
    uint64_t z3_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201fe};
    uint64_t z4_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201f0};
    uint64_t z5_expected[] = {0x0c0b0a0908070605, 0x04030201f6f7f8f9};
    uint64_t z6_expected[] = {0x01f0f1f2f3f4f5f6, 0xf7f8f9fafbfcfdfe};
    uint64_t z7_expected[] = {0x0007000600050004, 0x000300020001fffe};
    uint64_t z8_expected[] = {0x000300020001fffa, 0xfffbfffcfffdfffe};
    uint64_t z9_expected[] = {0x0000000300000002, 0x00000001fffffffe};
    uint64_t z10_expected[] = {0x00000001fffffffc, 0xfffffffdfffffffe};
    uint64_t z11_expected[] = {0x0000000000000001, 0xffffffffffffffff};
    uint64_t z30_expected[] = {0x0000000000000001, 0xfffffffffffffffe};

    ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
    ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
    ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
    ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
    ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
    ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
    ASSERT_EQUAL_SVE(z10_expected, z10.VnD());
    ASSERT_EQUAL_SVE(z11_expected, z11.VnD());
    ASSERT_EQUAL_SVE(z30_expected, z30.VnD());
  }
}

TEST_SVE(sve_predicate_logical) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // 0b...01011010'10110111
  int p10_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1};  // Pm
  // 0b...11011001'01010010
  int p11_inputs[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0};  // Pn
  // 0b...01010101'10110010
  int p12_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};  // pg

  Initialise(&masm, p10.VnB(), p10_inputs);
  Initialise(&masm, p11.VnB(), p11_inputs);
  Initialise(&masm, p12.VnB(), p12_inputs);

  __ Ands(p0.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Mrs(x0, NZCV);
  __ Bics(p1.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Mrs(x1, NZCV);
  __ Eor(p2.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Nand(p3.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Nor(p4.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Orn(p5.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Orr(p6.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Sel(p7.VnB(), p12, p11.VnB(), p10.VnB());

  END();

  if (CAN_RUN()) {
    RUN();

    // 0b...01010000'00010010
    int p0_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
    // 0b...00000001'00000000
    int p1_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0};
    // 0b...00000001'10100000
    int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
    // 0b...00000101'10100000
    int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
    // 0b...00000100'00000000
    int p4_expected[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    // 0b...01010101'00010010
    int p5_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
    // 0b...01010001'10110010
    int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
    // 0b...01011011'00010111
    int p7_expected[] = {0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1};

    ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
    ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
    ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
    ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
    ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
    ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
    ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
    ASSERT_EQUAL_SVE(p7_expected, p7.VnB());

    ASSERT_EQUAL_32(SVEFirstFlag, w0);
    ASSERT_EQUAL_32(SVENotLastFlag, w1);
  }
}

TEST_SVE(sve_int_compare_vectors) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int z10_inputs[] = {0x00, 0x80, 0xff, 0x7f, 0x00, 0x00, 0x00, 0xff};
  int z11_inputs[] = {0x00, 0x00, 0x00, 0x00, 0x80, 0xff, 0x7f, 0xfe};
  int p0_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
  InsrHelper(&masm, z10.VnB(), z10_inputs);
  InsrHelper(&masm, z11.VnB(), z11_inputs);
  Initialise(&masm, p0.VnB(), p0_inputs);

  __ Cmphs(p6.VnB(), p0.Zeroing(), z10.VnB(), z11.VnB());
  __ Mrs(x6, NZCV);

  uint64_t z12_inputs[] = {0xffffffffffffffff, 0x8000000000000000};
  uint64_t z13_inputs[] = {0x0000000000000000, 0x8000000000000000};
  int p1_inputs[] = {1, 1};
  InsrHelper(&masm, z12.VnD(), z12_inputs);
  InsrHelper(&masm, z13.VnD(), z13_inputs);
  Initialise(&masm, p1.VnD(), p1_inputs);

  __ Cmphi(p7.VnD(), p1.Zeroing(), z12.VnD(), z13.VnD());
  __ Mrs(x7, NZCV);

  int z14_inputs[] = {0, 32767, -1, -32767, 0, 0, 0, 32766};
  int z15_inputs[] = {0, 0, 0, 0, 32767, -1, -32767, 32767};

  int p2_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
  InsrHelper(&masm, z14.VnH(), z14_inputs);
  InsrHelper(&masm, z15.VnH(), z15_inputs);
  Initialise(&masm, p2.VnH(), p2_inputs);

  __ Cmpge(p8.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
  __ Mrs(x8, NZCV);

  __ Cmpeq(p9.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
  __ Mrs(x9, NZCV);

  int z16_inputs[] = {0, -1, 0, 0};
  int z17_inputs[] = {0, 0, 2147483647, -2147483648};
  int p3_inputs[] = {1, 1, 1, 1};
  InsrHelper(&masm, z16.VnS(), z16_inputs);
  InsrHelper(&masm, z17.VnS(), z17_inputs);
  Initialise(&masm, p3.VnS(), p3_inputs);

  __ Cmpgt(p10.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
  __ Mrs(x10, NZCV);

  __ Cmpne(p11.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
  __ Mrs(x11, NZCV);

  // Test the architectural aliases (same results, operands reversed).
  __ Cmpls(p12.VnB(), p0.Zeroing(), z11.VnB(), z10.VnB());  // HS
  __ Cmplo(p13.VnD(), p1.Zeroing(), z13.VnD(), z12.VnD());  // HI
  __ Cmple(p14.VnH(), p2.Zeroing(), z15.VnH(), z14.VnH());  // GE
  __ Cmplt(p15.VnS(), p3.Zeroing(), z17.VnS(), z16.VnS());  // GT

  END();

  if (CAN_RUN()) {
    RUN();

    int p6_expected[] = {1, 0, 1, 1, 0, 0, 0, 1};
    for (size_t i = 0; i < ArrayLength(p6_expected); i++) {
      int lane = static_cast<int>(ArrayLength(p6_expected) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p6_expected[i], p6.VnB(), lane);
    }

    int p7_expected[] = {1, 0};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnD());

    int p8_expected[] = {1, 0, 0, 0, 0, 1, 1, 0};
    ASSERT_EQUAL_SVE(p8_expected, p8.VnH());

    int p9_expected[] = {1, 0, 0, 0, 0, 0, 0, 0};
    ASSERT_EQUAL_SVE(p9_expected, p9.VnH());

    int p10_expected[] = {0, 0, 0, 1};
    ASSERT_EQUAL_SVE(p10_expected, p10.VnS());

    int p11_expected[] = {0, 1, 1, 1};
    ASSERT_EQUAL_SVE(p11_expected, p11.VnS());

    // Reuse the expected results to verify the architectural aliases.
    ASSERT_EQUAL_SVE(p6_expected, p12.VnB());
    ASSERT_EQUAL_SVE(p7_expected, p13.VnD());
    ASSERT_EQUAL_SVE(p8_expected, p14.VnH());
    ASSERT_EQUAL_SVE(p10_expected, p15.VnS());

    ASSERT_EQUAL_32(SVEFirstFlag, w6);
    ASSERT_EQUAL_32(NoFlag, w7);
    ASSERT_EQUAL_32(NoFlag, w8);
    ASSERT_EQUAL_32(NoFlag, w9);
    ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
  }
}

TEST_SVE(sve_int_compare_vectors_wide_elements) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

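  // In these wide-element forms, each 64-bit element of the second source is
  // compared against every lane of the first source that occupies the same
  // 64-bit segment of the vector.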
1233   int src1_inputs_1[] = {0, 1, -1, -128, 127, 100, -66};
1234   int src2_inputs_1[] = {0, -1};
1235   int mask_inputs_1[] = {1, 1, 1, 1, 1, 0, 1};
1236   InsrHelper(&masm, z13.VnB(), src1_inputs_1);
1237   InsrHelper(&masm, z19.VnD(), src2_inputs_1);
1238   Initialise(&masm, p0.VnB(), mask_inputs_1);
1239 
1240   __ Cmpge(p2.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1241   __ Mrs(x2, NZCV);
1242   __ Cmpgt(p3.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1243   __ Mrs(x3, NZCV);
1244 
1245   int src1_inputs_2[] = {0, 32767, -1, -32767, 1, 1234, 0, 32766};
1246   int src2_inputs_2[] = {0, -32767};
1247   int mask_inputs_2[] = {1, 0, 1, 1, 1, 1, 1, 1};
1248   InsrHelper(&masm, z13.VnH(), src1_inputs_2);
1249   InsrHelper(&masm, z19.VnD(), src2_inputs_2);
1250   Initialise(&masm, p0.VnH(), mask_inputs_2);
1251 
1252   __ Cmple(p4.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
1253   __ Mrs(x4, NZCV);
1254   __ Cmplt(p5.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
1255   __ Mrs(x5, NZCV);
1256 
1257   int src1_inputs_3[] = {0, -1, 2147483647, -2147483648};
1258   int src2_inputs_3[] = {0, -2147483648};
1259   int mask_inputs_3[] = {1, 1, 1, 1};
1260   InsrHelper(&masm, z13.VnS(), src1_inputs_3);
1261   InsrHelper(&masm, z19.VnD(), src2_inputs_3);
1262   Initialise(&masm, p0.VnS(), mask_inputs_3);
1263 
1264   __ Cmpeq(p6.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1265   __ Mrs(x6, NZCV);
1266   __ Cmpne(p7.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1267   __ Mrs(x7, NZCV);
1268 
1269   int src1_inputs_4[] = {0x00, 0x80, 0x7f, 0xff, 0x7f, 0xf0, 0x0f, 0x55};
1270   int src2_inputs_4[] = {0x00, 0x7f};
1271   int mask_inputs_4[] = {1, 1, 1, 1, 0, 1, 1, 1};
1272   InsrHelper(&masm, z13.VnB(), src1_inputs_4);
1273   InsrHelper(&masm, z19.VnD(), src2_inputs_4);
1274   Initialise(&masm, p0.VnB(), mask_inputs_4);
1275 
1276   __ Cmplo(p8.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1277   __ Mrs(x8, NZCV);
1278   __ Cmpls(p9.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1279   __ Mrs(x9, NZCV);
1280 
1281   int src1_inputs_5[] = {0x0000, 0x8000, 0x7fff, 0xffff};
1282   int src2_inputs_5[] = {0x8000, 0xffff};
1283   int mask_inputs_5[] = {1, 1, 1, 1};
1284   InsrHelper(&masm, z13.VnS(), src1_inputs_5);
1285   InsrHelper(&masm, z19.VnD(), src2_inputs_5);
1286   Initialise(&masm, p0.VnS(), mask_inputs_5);
1287 
1288   __ Cmphi(p10.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1289   __ Mrs(x10, NZCV);
1290   __ Cmphs(p11.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1291   __ Mrs(x11, NZCV);
1292 
1293   END();
1294 
1295   if (CAN_RUN()) {
1296     RUN();
1297     int p2_expected[] = {1, 1, 1, 0, 1, 0, 0};
1298     ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
1299 
1300     int p3_expected[] = {1, 1, 0, 0, 1, 0, 0};
1301     ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
1302 
1303     int p4_expected[] = {0x1, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
1304     ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
1305 
1306     int p5_expected[] = {0x0, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
1307     ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
1308 
1309     int p6_expected[] = {0x1, 0x0, 0x0, 0x1};
1310     ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
1311 
1312     int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
1313     ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
1314 
1315     int p8_expected[] = {1, 0, 0, 0, 0, 0, 1, 1};
1316     ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
1317 
1318     int p9_expected[] = {1, 0, 1, 0, 0, 0, 1, 1};
1319     ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
1320 
1321     int p10_expected[] = {0x0, 0x0, 0x0, 0x0};
1322     ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
1323 
1324     int p11_expected[] = {0x0, 0x1, 0x0, 0x1};
1325     ASSERT_EQUAL_SVE(p11_expected, p11.VnS());
1326 
1327     ASSERT_EQUAL_32(NoFlag, w2);
1328     ASSERT_EQUAL_32(NoFlag, w3);
1329     ASSERT_EQUAL_32(NoFlag, w4);
1330     ASSERT_EQUAL_32(SVENotLastFlag, w5);
1331     ASSERT_EQUAL_32(SVEFirstFlag, w6);
1332     ASSERT_EQUAL_32(SVENotLastFlag, w7);
1333     ASSERT_EQUAL_32(SVEFirstFlag, w8);
1334     ASSERT_EQUAL_32(SVEFirstFlag, w9);
1335     ASSERT_EQUAL_32(SVENotLastFlag | SVENoneFlag, w10);
1336     ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w11);
1337   }
1338 }
1339 
TEST_SVE(sve_bitwise_imm)1340 TEST_SVE(sve_bitwise_imm) {
1341   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1342   START();
1343 
1344   // clang-format off
1345   uint64_t z21_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
1346   uint32_t z22_inputs[] = {0xfedcba98, 0x76543210, 0x01234567, 0x89abcdef};
1347   uint16_t z23_inputs[] = {0xfedc, 0xba98, 0x7654, 0x3210,
1348                            0x0123, 0x4567, 0x89ab, 0xcdef};
1349   uint8_t z24_inputs[] = {0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
1350                           0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
1351   // clang-format on
1352 
1353   InsrHelper(&masm, z1.VnD(), z21_inputs);
1354   InsrHelper(&masm, z2.VnS(), z22_inputs);
1355   InsrHelper(&masm, z3.VnH(), z23_inputs);
1356   InsrHelper(&masm, z4.VnB(), z24_inputs);
1357 
1358   __ And(z1.VnD(), z1.VnD(), 0x0000ffff0000ffff);
1359   __ And(z2.VnS(), z2.VnS(), 0xff0000ff);
1360   __ And(z3.VnH(), z3.VnH(), 0x0ff0);
1361   __ And(z4.VnB(), z4.VnB(), 0x3f);
1362 
1363   InsrHelper(&masm, z5.VnD(), z21_inputs);
1364   InsrHelper(&masm, z6.VnS(), z22_inputs);
1365   InsrHelper(&masm, z7.VnH(), z23_inputs);
1366   InsrHelper(&masm, z8.VnB(), z24_inputs);
1367 
1368   __ Eor(z5.VnD(), z5.VnD(), 0x0000ffff0000ffff);
1369   __ Eor(z6.VnS(), z6.VnS(), 0xff0000ff);
1370   __ Eor(z7.VnH(), z7.VnH(), 0x0ff0);
1371   __ Eor(z8.VnB(), z8.VnB(), 0x3f);
1372 
1373   InsrHelper(&masm, z9.VnD(), z21_inputs);
1374   InsrHelper(&masm, z10.VnS(), z22_inputs);
1375   InsrHelper(&masm, z11.VnH(), z23_inputs);
1376   InsrHelper(&masm, z12.VnB(), z24_inputs);
1377 
1378   __ Orr(z9.VnD(), z9.VnD(), 0x0000ffff0000ffff);
1379   __ Orr(z10.VnS(), z10.VnS(), 0xff0000ff);
1380   __ Orr(z11.VnH(), z11.VnH(), 0x0ff0);
1381   __ Orr(z12.VnB(), z12.VnB(), 0x3f);
1382 
1383   {
1384     // The `Dup` macro maps onto either `dup` or `dupm`, but has its own test,
1385     // so here we test `dupm` directly.
1386     ExactAssemblyScope guard(&masm, 4 * kInstructionSize);
1387     __ dupm(z13.VnD(), 0x7ffffff800000000);
1388     __ dupm(z14.VnS(), 0x7ffc7ffc);
1389     __ dupm(z15.VnH(), 0x3ffc);
1390     __ dupm(z16.VnB(), 0xc3);
1391   }
1392 
1393   END();
1394 
1395   if (CAN_RUN()) {
1396     RUN();
1397 
1398     // clang-format off
1399     uint64_t z1_expected[] = {0x0000ba9800003210, 0x000045670000cdef};
1400     uint32_t z2_expected[] = {0xfe000098, 0x76000010, 0x01000067, 0x890000ef};
1401     uint16_t z3_expected[] = {0x0ed0, 0x0a90, 0x0650, 0x0210,
1402                               0x0120, 0x0560, 0x09a0, 0x0de0};
1403     uint8_t z4_expected[] = {0x3e, 0x1c, 0x3a, 0x18, 0x36, 0x14, 0x32, 0x10,
1404                              0x01, 0x23, 0x05, 0x27, 0x09, 0x2b, 0x0d, 0x2f};
1405 
1406     ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1407     ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1408     ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1409     ASSERT_EQUAL_SVE(z4_expected, z4.VnB());
1410 
1411     uint64_t z5_expected[] = {0xfedc45677654cdef, 0x0123ba9889ab3210};
1412     uint32_t z6_expected[] = {0x01dcba67, 0x895432ef, 0xfe234598, 0x76abcd10};
1413     uint16_t z7_expected[] = {0xf12c, 0xb568, 0x79a4, 0x3de0,
1414                               0x0ed3, 0x4a97, 0x865b, 0xc21f};
1415     uint8_t z8_expected[] = {0xc1, 0xe3, 0x85, 0xa7, 0x49, 0x6b, 0x0d, 0x2f,
1416                              0x3e, 0x1c, 0x7a, 0x58, 0xb6, 0x94, 0xf2, 0xd0};
1417 
1418     ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1419     ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1420     ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1421     ASSERT_EQUAL_SVE(z8_expected, z8.VnB());
1422 
1423     uint64_t z9_expected[] = {0xfedcffff7654ffff, 0x0123ffff89abffff};
1424     uint32_t z10_expected[] = {0xffdcbaff, 0xff5432ff, 0xff2345ff, 0xffabcdff};
1425     uint16_t z11_expected[] = {0xfffc, 0xbff8, 0x7ff4, 0x3ff0,
1426                                0x0ff3, 0x4ff7, 0x8ffb, 0xcfff};
1427     uint8_t z12_expected[] = {0xff, 0xff, 0xbf, 0xbf, 0x7f, 0x7f, 0x3f, 0x3f,
1428                               0x3f, 0x3f, 0x7f, 0x7f, 0xbf, 0xbf, 0xff, 0xff};
1429 
1430     ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
1431     ASSERT_EQUAL_SVE(z10_expected, z10.VnS());
1432     ASSERT_EQUAL_SVE(z11_expected, z11.VnH());
1433     ASSERT_EQUAL_SVE(z12_expected, z12.VnB());
1434 
1435     uint64_t z13_expected[] = {0x7ffffff800000000, 0x7ffffff800000000};
1436     uint32_t z14_expected[] = {0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc};
1437     uint16_t z15_expected[] = {0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc,
1438                                0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc};
1439     ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
1440     ASSERT_EQUAL_SVE(z14_expected, z14.VnS());
1441     ASSERT_EQUAL_SVE(z15_expected, z15.VnH());
1442     // clang-format on
1443   }
1444 }
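
// The logical-immediate forms above (and `dupm`) accept only "bitmask
// immediates": a repeating element in which the set bits form one contiguous,
// possibly rotated, run. A minimal sketch of the encodability test follows;
// it is illustrative only, not VIXL's implementation, and the helper name is
// invented for this example.
static bool IsBitmaskImmediate64(uint64_t value) {
  for (int e = 64; e >= 2; e /= 2) {
    uint64_t mask = (e == 64) ? ~UINT64_C(0) : ((UINT64_C(1) << e) - 1);
    uint64_t x = value & mask;
    // The value must repeat with period e...
    bool repeats = true;
    for (int i = e; i < 64; i += e) {
      if (((value >> i) & mask) != x) {
        repeats = false;
        break;
      }
    }
    // ...must not be all-zeros or all-ones...
    if (!repeats || (x == 0) || (x == mask)) continue;
    // ...and a single circular run of ones has exactly two 0<->1 boundaries.
    uint64_t rot1 = ((x >> 1) | (x << (e - 1))) & mask;
    int boundaries = 0;
    for (uint64_t diff = x ^ rot1; diff != 0; diff >>= 1) {
      boundaries += static_cast<int>(diff & 1);
    }
    if (boundaries == 2) return true;
  }
  return false;
}
// E.g. IsBitmaskImmediate64(0x0000ffff0000ffff) is true (used by `And`
// above), while IsBitmaskImmediate64(0x1234) is false, which is why `Dup`
// must synthesise that value in the next test.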
1445 
1446 TEST_SVE(sve_dup_imm) {
1447   // The `Dup` macro can generate `dup` or `dupm`, and can synthesise
1448   // immediates that neither instruction can encode.
1449 
1450   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1451   START();
1452 
1453   // Encodable with `dup` (shift 0).
1454   __ Dup(z0.VnD(), -1);
1455   __ Dup(z1.VnS(), 0x7f);
1456   __ Dup(z2.VnH(), -0x80);
1457   __ Dup(z3.VnB(), 42);
1458 
1459   // Encodable with `dup` (shift 8).
1460   __ Dup(z4.VnD(), -42 * 256);
1461   __ Dup(z5.VnS(), -0x8000);
1462   __ Dup(z6.VnH(), 0x7f00);
1463   // B-sized lanes cannot take a shift of 8.
1464 
1465   // Encodable with `dupm` (but not `dup`).
1466   __ Dup(z10.VnD(), 0x3fc);
1467   __ Dup(z11.VnS(), -516097);  // 0xfff81fff, as a signed int.
1468   __ Dup(z12.VnH(), 0x0001);
1469   // All values that fit B-sized lanes are encodable with `dup`.
1470 
1471   // Cases that require immediate synthesis.
1472   __ Dup(z20.VnD(), 0x1234);
1473   __ Dup(z21.VnD(), -4242);
1474   __ Dup(z22.VnD(), 0xfedcba9876543210);
1475   __ Dup(z23.VnS(), 0x01020304);
1476   __ Dup(z24.VnS(), -0x01020304);
1477   __ Dup(z25.VnH(), 0x3c38);
1478   // All values that fit B-sized lanes are directly encodable.
1479 
1480   END();
1481 
1482   if (CAN_RUN()) {
1483     RUN();
1484 
1485     ASSERT_EQUAL_SVE(0xffffffffffffffff, z0.VnD());
1486     ASSERT_EQUAL_SVE(0x0000007f, z1.VnS());
1487     ASSERT_EQUAL_SVE(0xff80, z2.VnH());
1488     ASSERT_EQUAL_SVE(0x2a, z3.VnB());
1489 
1490     ASSERT_EQUAL_SVE(0xffffffffffffd600, z4.VnD());
1491     ASSERT_EQUAL_SVE(0xffff8000, z5.VnS());
1492     ASSERT_EQUAL_SVE(0x7f00, z6.VnH());
1493 
1494     ASSERT_EQUAL_SVE(0x00000000000003fc, z10.VnD());
1495     ASSERT_EQUAL_SVE(0xfff81fff, z11.VnS());
1496     ASSERT_EQUAL_SVE(0x0001, z12.VnH());
1497 
1498     ASSERT_EQUAL_SVE(0x1234, z20.VnD());
1499     ASSERT_EQUAL_SVE(0xffffffffffffef6e, z21.VnD());
1500     ASSERT_EQUAL_SVE(0xfedcba9876543210, z22.VnD());
1501     ASSERT_EQUAL_SVE(0x01020304, z23.VnS());
1502     ASSERT_EQUAL_SVE(0xfefdfcfc, z24.VnS());
1503     ASSERT_EQUAL_SVE(0x3c38, z25.VnH());
1504   }
1505 }
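
// When an immediate fits neither `dup` nor `dupm`, one plausible synthesis
// (a sketch of the idea, not necessarily the exact expansion the
// MacroAssembler emits; the helper name is invented for this example) is to
// materialise the value in a scratch core register and broadcast it:
static void DupViaScratch(MacroAssembler* masm,
                          const ZRegister& zd,
                          uint64_t imm) {
  UseScratchRegisterScope temps(masm);
  Register scratch = temps.AcquireX();
  masm->Mov(scratch, imm);  // `Mov` can synthesise any 64-bit immediate.
  masm->Dup(zd, scratch);   // Broadcast the scalar to every lane of zd.
}
// E.g. DupViaScratch(&masm, z22.VnD(), 0xfedcba9876543210) would produce the
// same lanes as the corresponding `Dup` call in the test above.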
1506 
1507 TEST_SVE(sve_inc_dec_p_scalar) {
1508   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1509   START();
1510 
1511   int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1512   Initialise(&masm, p0.VnB(), p0_inputs);
1513 
1514   int p0_b_count = 9;
1515   int p0_h_count = 5;
1516   int p0_s_count = 3;
1517   int p0_d_count = 2;
1518 
1519   // 64-bit operations preserve their high bits.
1520   __ Mov(x0, 0x123456780000002a);
1521   __ Decp(x0, p0.VnB());
1522 
1523   __ Mov(x1, 0x123456780000002a);
1524   __ Incp(x1, p0.VnH());
1525 
1526   // Check that saturation does not occur.
1527   __ Mov(x10, 1);
1528   __ Decp(x10, p0.VnS());
1529 
1530   __ Mov(x11, UINT64_MAX);
1531   __ Incp(x11, p0.VnD());
1532 
1533   __ Mov(x12, INT64_MAX);
1534   __ Incp(x12, p0.VnB());
1535 
1536   // With an all-true predicate, these instructions increment or decrement by
1537   // the vector length.
1538   __ Ptrue(p15.VnB());
1539 
1540   __ Mov(x20, 0x4000000000000000);
1541   __ Decp(x20, p15.VnB());
1542 
1543   __ Mov(x21, 0x4000000000000000);
1544   __ Incp(x21, p15.VnH());
1545 
1546   END();
1547   if (CAN_RUN()) {
1548     RUN();
1549 
1550     ASSERT_EQUAL_64(0x123456780000002a - p0_b_count, x0);
1551     ASSERT_EQUAL_64(0x123456780000002a + p0_h_count, x1);
1552 
1553     ASSERT_EQUAL_64(UINT64_C(1) - p0_s_count, x10);
1554     ASSERT_EQUAL_64(UINT64_MAX + p0_d_count, x11);
1555     ASSERT_EQUAL_64(static_cast<uint64_t>(INT64_MAX) + p0_b_count, x12);
1556 
1557     ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1558     ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1559   }
1560 }
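
// `Incp` and `Decp` add or subtract the number of active lanes in the
// governing predicate, using plain modular arithmetic (hence "saturation
// does not occur" above). A reference sketch, with an illustrative helper
// name that is not part of VIXL:
static int CountActiveLanes(const int* lanes, int lane_count) {
  int count = 0;
  for (int i = 0; i < lane_count; i++) count += (lanes[i] & 1);
  return count;
}
// E.g. CountActiveLanes over the sixteen p0_inputs above gives 9, matching
// p0_b_count, so `Decp(x0, p0.VnB())` computes x0 - 9 modulo 2^64.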
1561 
1562 TEST_SVE(sve_sqinc_sqdec_p_scalar) {
1563   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1564   START();
1565 
1566   int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1567   Initialise(&masm, p0.VnB(), p0_inputs);
1568 
1569   int p0_b_count = 9;
1570   int p0_h_count = 5;
1571   int p0_s_count = 3;
1572   int p0_d_count = 2;
1573 
1574   uint64_t placeholder_high = 0x1234567800000000;
1575 
1576   // 64-bit operations preserve their high bits.
1577   __ Mov(x0, placeholder_high + 42);
1578   __ Sqdecp(x0, p0.VnB());
1579 
1580   __ Mov(x1, placeholder_high + 42);
1581   __ Sqincp(x1, p0.VnH());
1582 
1583   // 32-bit operations sign-extend into their high bits.
1584   __ Mov(x2, placeholder_high + 42);
1585   __ Sqdecp(x2, p0.VnS(), w2);
1586 
1587   __ Mov(x3, placeholder_high + 42);
1588   __ Sqincp(x3, p0.VnD(), w3);
1589 
1590   __ Mov(x4, placeholder_high + 1);
1591   __ Sqdecp(x4, p0.VnS(), w4);
1592 
1593   __ Mov(x5, placeholder_high - 1);
1594   __ Sqincp(x5, p0.VnD(), w5);
1595 
1596   // Check that saturation behaves correctly.
1597   __ Mov(x10, 0x8000000000000001);  // INT64_MIN + 1
1598   __ Sqdecp(x10, p0.VnB());
1599 
1600   __ Mov(x11, placeholder_high + 0x80000001);  // INT32_MIN + 1
1601   __ Sqdecp(x11, p0.VnH(), w11);
1602 
1603   __ Mov(x12, 1);
1604   __ Sqdecp(x12, p0.VnS());
1605 
1606   __ Mov(x13, placeholder_high + 1);
1607   __ Sqdecp(x13, p0.VnD(), w13);
1608 
1609   __ Mov(x14, 0x7ffffffffffffffe);  // INT64_MAX - 1
1610   __ Sqincp(x14, p0.VnB());
1611 
1612   __ Mov(x15, placeholder_high + 0x7ffffffe);  // INT32_MAX - 1
1613   __ Sqincp(x15, p0.VnH(), w15);
1614 
1615   // Don't use x16 and x17 since they are scratch registers by default.
1616 
1617   __ Mov(x18, 0xffffffffffffffff);
1618   __ Sqincp(x18, p0.VnS());
1619 
1620   __ Mov(x19, placeholder_high + 0xffffffff);
1621   __ Sqincp(x19, p0.VnD(), w19);
1622 
1623   __ Mov(x20, placeholder_high + 0xffffffff);
1624   __ Sqdecp(x20, p0.VnB(), w20);
1625 
1626   // With an all-true predicate, these instructions increment or decrement by
1627   // the vector length.
1628   __ Ptrue(p15.VnB());
1629 
1630   __ Mov(x21, 0);
1631   __ Sqdecp(x21, p15.VnB());
1632 
1633   __ Mov(x22, 0);
1634   __ Sqincp(x22, p15.VnH());
1635 
1636   __ Mov(x23, placeholder_high);
1637   __ Sqdecp(x23, p15.VnS(), w23);
1638 
1639   __ Mov(x24, placeholder_high);
1640   __ Sqincp(x24, p15.VnD(), w24);
1641 
1642   END();
1643   if (CAN_RUN()) {
1644     RUN();
1645 
1646     // 64-bit operations preserve their high bits.
1647     ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
1648     ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
1649 
1650     // 32-bit operations sign-extend into their high bits.
1651     ASSERT_EQUAL_64(42 - p0_s_count, x2);
1652     ASSERT_EQUAL_64(42 + p0_d_count, x3);
1653     ASSERT_EQUAL_64(0xffffffff00000000 | (1 - p0_s_count), x4);
1654     ASSERT_EQUAL_64(p0_d_count - 1, x5);
1655 
1656     // Check that saturation behaves correctly.
1657     ASSERT_EQUAL_64(INT64_MIN, x10);
1658     ASSERT_EQUAL_64(INT32_MIN, x11);
1659     ASSERT_EQUAL_64(1 - p0_s_count, x12);
1660     ASSERT_EQUAL_64(1 - p0_d_count, x13);
1661     ASSERT_EQUAL_64(INT64_MAX, x14);
1662     ASSERT_EQUAL_64(INT32_MAX, x15);
1663     ASSERT_EQUAL_64(p0_s_count - 1, x18);
1664     ASSERT_EQUAL_64(p0_d_count - 1, x19);
1665     ASSERT_EQUAL_64(-1 - p0_b_count, x20);
1666 
1667     // Check all-true predicates.
1668     ASSERT_EQUAL_64(-core.GetSVELaneCount(kBRegSize), x21);
1669     ASSERT_EQUAL_64(core.GetSVELaneCount(kHRegSize), x22);
1670     ASSERT_EQUAL_64(-core.GetSVELaneCount(kSRegSize), x23);
1671     ASSERT_EQUAL_64(core.GetSVELaneCount(kDRegSize), x24);
1672   }
1673 }
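
// The signed saturating forms clamp instead of wrapping. A reference sketch
// of the 64-bit behaviour (illustrative, not a VIXL API):
static int64_t SignedSatAdd64(int64_t acc, int64_t delta) {
  if ((delta > 0) && (acc > (INT64_MAX - delta))) return INT64_MAX;
  if ((delta < 0) && (acc < (INT64_MIN - delta))) return INT64_MIN;
  return acc + delta;
}
// E.g. SignedSatAdd64(INT64_MIN + 1, -p0_b_count) saturates to INT64_MIN, as
// checked for x10 above; the 32-bit forms behave analogously on 32-bit
// values and then sign-extend the result into the high bits.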
1674 
1675 TEST_SVE(sve_uqinc_uqdec_p_scalar) {
1676   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1677   START();
1678 
1679   int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1680   Initialise(&masm, p0.VnB(), p0_inputs);
1681 
1682   int p0_b_count = 9;
1683   int p0_h_count = 5;
1684   int p0_s_count = 3;
1685   int p0_d_count = 2;
1686 
1687   uint64_t placeholder_high = 0x1234567800000000;
1688 
1689   // 64-bit operations preserve their high bits.
1690   __ Mov(x0, placeholder_high + 42);
1691   __ Uqdecp(x0, p0.VnB());
1692 
1693   __ Mov(x1, placeholder_high + 42);
1694   __ Uqincp(x1, p0.VnH());
1695 
1696   // 32-bit operations zero-extend into their high bits.
1697   __ Mov(x2, placeholder_high + 42);
1698   __ Uqdecp(x2, p0.VnS(), w2);
1699 
1700   __ Mov(x3, placeholder_high + 42);
1701   __ Uqincp(x3, p0.VnD(), w3);
1702 
1703   __ Mov(x4, placeholder_high + 0x80000001);
1704   __ Uqdecp(x4, p0.VnS(), w4);
1705 
1706   __ Mov(x5, placeholder_high + 0x7fffffff);
1707   __ Uqincp(x5, p0.VnD(), w5);
1708 
1709   // Check that saturation behaves correctly.
1710   __ Mov(x10, 1);
1711   __ Uqdecp(x10, p0.VnB(), x10);
1712 
1713   __ Mov(x11, placeholder_high + 1);
1714   __ Uqdecp(x11, p0.VnH(), w11);
1715 
1716   __ Mov(x12, 0x8000000000000000);  // INT64_MAX + 1
1717   __ Uqdecp(x12, p0.VnS(), x12);
1718 
1719   __ Mov(x13, placeholder_high + 0x80000000);  // INT32_MAX + 1
1720   __ Uqdecp(x13, p0.VnD(), w13);
1721 
1722   __ Mov(x14, 0xfffffffffffffffe);  // UINT64_MAX - 1
1723   __ Uqincp(x14, p0.VnB(), x14);
1724 
1725   __ Mov(x15, placeholder_high + 0xfffffffe);  // UINT32_MAX - 1
1726   __ Uqincp(x15, p0.VnH(), w15);
1727 
1728   // Don't use x16 and x17 since they are scratch registers by default.
1729 
1730   __ Mov(x18, 0x7ffffffffffffffe);  // INT64_MAX - 1
1731   __ Uqincp(x18, p0.VnS(), x18);
1732 
1733   __ Mov(x19, placeholder_high + 0x7ffffffe);  // INT32_MAX - 1
1734   __ Uqincp(x19, p0.VnD(), w19);
1735 
1736   // With an all-true predicate, these instructions increment or decrement by
1737   // the vector length.
1738   __ Ptrue(p15.VnB());
1739 
1740   __ Mov(x20, 0x4000000000000000);
1741   __ Uqdecp(x20, p15.VnB(), x20);
1742 
1743   __ Mov(x21, 0x4000000000000000);
1744   __ Uqincp(x21, p15.VnH(), x21);
1745 
1746   __ Mov(x22, placeholder_high + 0x40000000);
1747   __ Uqdecp(x22, p15.VnS(), w22);
1748 
1749   __ Mov(x23, placeholder_high + 0x40000000);
1750   __ Uqincp(x23, p15.VnD(), w23);
1751 
1752   END();
1753   if (CAN_RUN()) {
1754     RUN();
1755 
1756     // 64-bit operations preserve their high bits.
1757     ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
1758     ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
1759 
1760     // 32-bit operations zero-extend into their high bits.
1761     ASSERT_EQUAL_64(42 - p0_s_count, x2);
1762     ASSERT_EQUAL_64(42 + p0_d_count, x3);
1763     ASSERT_EQUAL_64(UINT64_C(0x80000001) - p0_s_count, x4);
1764     ASSERT_EQUAL_64(UINT64_C(0x7fffffff) + p0_d_count, x5);
1765 
1766     // Check that saturation behaves correctly.
1767     ASSERT_EQUAL_64(0, x10);
1768     ASSERT_EQUAL_64(0, x11);
1769     ASSERT_EQUAL_64(0x8000000000000000 - p0_s_count, x12);
1770     ASSERT_EQUAL_64(UINT64_C(0x80000000) - p0_d_count, x13);
1771     ASSERT_EQUAL_64(UINT64_MAX, x14);
1772     ASSERT_EQUAL_64(UINT32_MAX, x15);
1773     ASSERT_EQUAL_64(0x7ffffffffffffffe + p0_s_count, x18);
1774     ASSERT_EQUAL_64(UINT64_C(0x7ffffffe) + p0_d_count, x19);
1775 
1776     // Check all-true predicates.
1777     ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1778     ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1779     ASSERT_EQUAL_64(0x40000000 - core.GetSVELaneCount(kSRegSize), x22);
1780     ASSERT_EQUAL_64(0x40000000 + core.GetSVELaneCount(kDRegSize), x23);
1781   }
1782 }
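
// The unsigned saturating forms clamp to [0, UINT64_MAX]. A reference sketch
// (illustrative helper names, not VIXL APIs):
static uint64_t UnsignedSatDec64(uint64_t acc, uint64_t count) {
  return (acc < count) ? 0 : (acc - count);
}
static uint64_t UnsignedSatInc64(uint64_t acc, uint64_t count) {
  return (acc > (UINT64_MAX - count)) ? UINT64_MAX : (acc + count);
}
// E.g. UnsignedSatDec64(1, p0_b_count) is 0 (x10 above), and
// UnsignedSatInc64(UINT64_MAX - 1, p0_b_count) is UINT64_MAX (x14).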
1783 
1784 TEST_SVE(sve_inc_dec_p_vector) {
1785   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1786   START();
1787 
1788   // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
1789   int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1790   Initialise(&masm, p0.VnB(), p0_inputs);
1791 
1792   // Check that saturation does not occur.
1793 
1794   int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
1795   InsrHelper(&masm, z0.VnD(), z0_inputs);
1796 
1797   int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
1798   InsrHelper(&masm, z1.VnD(), z1_inputs);
1799 
1800   int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
1801   InsrHelper(&masm, z2.VnS(), z2_inputs);
1802 
1803   int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
1804   InsrHelper(&masm, z3.VnH(), z3_inputs);
1805 
1806   // The MacroAssembler implements non-destructive operations using movprfx.
1807   __ Decp(z10.VnD(), p0, z0.VnD());
1808   __ Decp(z11.VnD(), p0, z1.VnD());
1809   __ Decp(z12.VnS(), p0, z2.VnS());
1810   __ Decp(z13.VnH(), p0, z3.VnH());
1811 
1812   __ Incp(z14.VnD(), p0, z0.VnD());
1813   __ Incp(z15.VnD(), p0, z1.VnD());
1814   __ Incp(z16.VnS(), p0, z2.VnS());
1815   __ Incp(z17.VnH(), p0, z3.VnH());
1816 
1817   // Also test destructive forms.
1818   __ Mov(z4, z0);
1819   __ Mov(z5, z1);
1820   __ Mov(z6, z2);
1821   __ Mov(z7, z3);
1822 
1823   __ Decp(z0.VnD(), p0);
1824   __ Decp(z1.VnD(), p0);
1825   __ Decp(z2.VnS(), p0);
1826   __ Decp(z3.VnH(), p0);
1827 
1828   __ Incp(z4.VnD(), p0);
1829   __ Incp(z5.VnD(), p0);
1830   __ Incp(z6.VnS(), p0);
1831   __ Incp(z7.VnH(), p0);
1832 
1833   END();
1834   if (CAN_RUN()) {
1835     RUN();
1836 
1837     // z0_inputs[...] - number of active D lanes (2)
1838     int64_t z0_expected[] = {0x1234567800000040, -2, -1, 0x7ffffffffffffffe};
1839     ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1840 
1841     // z1_inputs[...] - number of active D lanes (2)
1842     int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
1843     ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1844 
1845     // z2_inputs[...] - number of active S lanes (3)
1846     int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, 0x7ffffffd};
1847     ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1848 
1849     // z3_inputs[...] - number of active H lanes (5)
1850     int16_t z3_expected[] = {0x1225, -5, -4, -6, 0x7ffb, 0x7ffa};
1851     ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1852 
1853     // z0_inputs[...] + number of active D lanes (2)
1854     uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
1855     ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
1856 
1857     // z1_inputs[...] + number of active D lanes (2)
1858     uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, 0x8000000000000001};
1859     ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1860 
1861     // z2_inputs[...] + number of active S lanes (3)
1862     uint32_t z6_expected[] = {0x12340045, 3, 2, 4, 0x80000002, 0x80000003};
1863     ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1864 
1865     // z3_inputs[...] + number of active H lanes (5)
1866     uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, 0x8004};
1867     ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1868 
1869     // Check that the non-destructive macros produced the same results.
1870     ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
1871     ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
1872     ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
1873     ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
1874     ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
1875     ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
1876     ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
1877     ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
1878   }
1879 }
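
// The non-destructive `Decp`/`Incp` forms used above have no
// destination-plus-source encoding, so the MacroAssembler pairs the
// destructive instruction with `movprfx`. A plausible expansion for
// `Decp(z10.VnD(), p0, z0.VnD())` (a sketch of the idea, not necessarily
// the exact code emitted):
//
//     movprfx z10, z0    // Copy the source into the destination...
//     decp z10.d, p0     // ...then apply the destructive operation.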
1880 
1881 TEST_SVE(sve_inc_dec_ptrue_vector) {
1882   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1883   START();
1884 
1885   // With an all-true predicate, these instructions increment or decrement by
1886   // the vector length.
1887   __ Ptrue(p15.VnB());
1888 
1889   __ Dup(z0.VnD(), 0);
1890   __ Decp(z0.VnD(), p15);
1891 
1892   __ Dup(z1.VnS(), 0);
1893   __ Decp(z1.VnS(), p15);
1894 
1895   __ Dup(z2.VnH(), 0);
1896   __ Decp(z2.VnH(), p15);
1897 
1898   __ Dup(z3.VnD(), 0);
1899   __ Incp(z3.VnD(), p15);
1900 
1901   __ Dup(z4.VnS(), 0);
1902   __ Incp(z4.VnS(), p15);
1903 
1904   __ Dup(z5.VnH(), 0);
1905   __ Incp(z5.VnH(), p15);
1906 
1907   END();
1908   if (CAN_RUN()) {
1909     RUN();
1910 
1911     int d_lane_count = core.GetSVELaneCount(kDRegSize);
1912     int s_lane_count = core.GetSVELaneCount(kSRegSize);
1913     int h_lane_count = core.GetSVELaneCount(kHRegSize);
1914 
1915     for (int i = 0; i < d_lane_count; i++) {
1916       ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
1917       ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
1918     }
1919 
1920     for (int i = 0; i < s_lane_count; i++) {
1921       ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
1922       ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
1923     }
1924 
1925     for (int i = 0; i < h_lane_count; i++) {
1926       ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
1927       ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
1928     }
1929   }
1930 }
1931 
1932 TEST_SVE(sve_sqinc_sqdec_p_vector) {
1933   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1934   START();
1935 
1936   // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
1937   int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1938   Initialise(&masm, p0.VnB(), p0_inputs);
1939 
1940   // Check that saturation behaves correctly.
1941 
1942   int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
1943   InsrHelper(&masm, z0.VnD(), z0_inputs);
1944 
1945   int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
1946   InsrHelper(&masm, z1.VnD(), z1_inputs);
1947 
1948   int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
1949   InsrHelper(&masm, z2.VnS(), z2_inputs);
1950 
1951   int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
1952   InsrHelper(&masm, z3.VnH(), z3_inputs);
1953 
1954   // The MacroAssembler implements non-destructive operations using movprfx.
1955   __ Sqdecp(z10.VnD(), p0, z0.VnD());
1956   __ Sqdecp(z11.VnD(), p0, z1.VnD());
1957   __ Sqdecp(z12.VnS(), p0, z2.VnS());
1958   __ Sqdecp(z13.VnH(), p0, z3.VnH());
1959 
1960   __ Sqincp(z14.VnD(), p0, z0.VnD());
1961   __ Sqincp(z15.VnD(), p0, z1.VnD());
1962   __ Sqincp(z16.VnS(), p0, z2.VnS());
1963   __ Sqincp(z17.VnH(), p0, z3.VnH());
1964 
1965   // Also test destructive forms.
1966   __ Mov(z4, z0);
1967   __ Mov(z5, z1);
1968   __ Mov(z6, z2);
1969   __ Mov(z7, z3);
1970 
1971   __ Sqdecp(z0.VnD(), p0);
1972   __ Sqdecp(z1.VnD(), p0);
1973   __ Sqdecp(z2.VnS(), p0);
1974   __ Sqdecp(z3.VnH(), p0);
1975 
1976   __ Sqincp(z4.VnD(), p0);
1977   __ Sqincp(z5.VnD(), p0);
1978   __ Sqincp(z6.VnS(), p0);
1979   __ Sqincp(z7.VnH(), p0);
1980 
1981   END();
1982   if (CAN_RUN()) {
1983     RUN();
1984 
1985     // z0_inputs[...] - number of active D lanes (2)
1986     int64_t z0_expected[] = {0x1234567800000040, -2, -1, INT64_MIN};
1987     ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1988 
1989     // z1_inputs[...] - number of active D lanes (2)
1990     int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
1991     ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1992 
1993     // z2_inputs[...] - number of active S lanes (3)
1994     int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, INT32_MIN};
1995     ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1996 
1997     // z3_inputs[...] - number of active H lanes (5)
1998     int16_t z3_expected[] = {0x1225, -5, -4, -6, INT16_MIN, 0x7ffa};
1999     ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
2000 
2001     // z0_inputs[...] + number of active D lanes (2)
2002     uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2003     ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2004 
2005     // z1_inputs[...] + number of active D lanes (2)
2006     uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, INT64_MAX};
2007     ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2008 
2009     // z2_inputs[...] + number of active S lanes (3)
2010     uint32_t z6_expected[] = {0x12340045, 3, 2, 4, INT32_MAX, 0x80000003};
2011     ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2012 
2013     // z3_inputs[...] + number of active H lanes (5)
2014     uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, INT16_MAX};
2015     ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2016 
2017     // Check that the non-destructive macros produced the same results.
2018     ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2019     ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2020     ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2021     ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2022     ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2023     ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2024     ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2025     ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2026   }
2027 }
2028 
2029 TEST_SVE(sve_sqinc_sqdec_ptrue_vector) {
2030   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2031   START();
2032 
2033   // With an all-true predicate, these instructions increment or decrement by
2034   // the vector length.
2035   __ Ptrue(p15.VnB());
2036 
2037   __ Dup(z0.VnD(), 0);
2038   __ Sqdecp(z0.VnD(), p15);
2039 
2040   __ Dup(z1.VnS(), 0);
2041   __ Sqdecp(z1.VnS(), p15);
2042 
2043   __ Dup(z2.VnH(), 0);
2044   __ Sqdecp(z2.VnH(), p15);
2045 
2046   __ Dup(z3.VnD(), 0);
2047   __ Sqincp(z3.VnD(), p15);
2048 
2049   __ Dup(z4.VnS(), 0);
2050   __ Sqincp(z4.VnS(), p15);
2051 
2052   __ Dup(z5.VnH(), 0);
2053   __ Sqincp(z5.VnH(), p15);
2054 
2055   END();
2056   if (CAN_RUN()) {
2057     RUN();
2058 
2059     int d_lane_count = core.GetSVELaneCount(kDRegSize);
2060     int s_lane_count = core.GetSVELaneCount(kSRegSize);
2061     int h_lane_count = core.GetSVELaneCount(kHRegSize);
2062 
2063     for (int i = 0; i < d_lane_count; i++) {
2064       ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
2065       ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
2066     }
2067 
2068     for (int i = 0; i < s_lane_count; i++) {
2069       ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
2070       ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
2071     }
2072 
2073     for (int i = 0; i < h_lane_count; i++) {
2074       ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
2075       ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
2076     }
2077   }
2078 }
2079 
2080 TEST_SVE(sve_uqinc_uqdec_p_vector) {
2081   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2082   START();
2083 
2084   // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
2085   int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
2086   Initialise(&masm, p0.VnB(), p0_inputs);
2087 
2088   // Check that saturation behaves correctly.
2089 
2090   uint64_t z0_inputs[] = {0x1234567800000042, 0, 1, 0x8000000000000000};
2091   InsrHelper(&masm, z0.VnD(), z0_inputs);
2092 
2093   uint64_t z1_inputs[] = {0x12345678ffffff2a, 0, UINT64_MAX, INT64_MAX};
2094   InsrHelper(&masm, z1.VnD(), z1_inputs);
2095 
2096   uint32_t z2_inputs[] = {0x12340042, 0, UINT32_MAX, 1, INT32_MAX, 0x80000000};
2097   InsrHelper(&masm, z2.VnS(), z2_inputs);
2098 
2099   uint16_t z3_inputs[] = {0x122a, 0, 1, UINT16_MAX, 0x8000, INT16_MAX};
2100   InsrHelper(&masm, z3.VnH(), z3_inputs);
2101 
2102   // The MacroAssembler implements non-destructive operations using movprfx.
2103   __ Uqdecp(z10.VnD(), p0, z0.VnD());
2104   __ Uqdecp(z11.VnD(), p0, z1.VnD());
2105   __ Uqdecp(z12.VnS(), p0, z2.VnS());
2106   __ Uqdecp(z13.VnH(), p0, z3.VnH());
2107 
2108   __ Uqincp(z14.VnD(), p0, z0.VnD());
2109   __ Uqincp(z15.VnD(), p0, z1.VnD());
2110   __ Uqincp(z16.VnS(), p0, z2.VnS());
2111   __ Uqincp(z17.VnH(), p0, z3.VnH());
2112 
2113   // Also test destructive forms.
2114   __ Mov(z4, z0);
2115   __ Mov(z5, z1);
2116   __ Mov(z6, z2);
2117   __ Mov(z7, z3);
2118 
2119   __ Uqdecp(z0.VnD(), p0);
2120   __ Uqdecp(z1.VnD(), p0);
2121   __ Uqdecp(z2.VnS(), p0);
2122   __ Uqdecp(z3.VnH(), p0);
2123 
2124   __ Uqincp(z4.VnD(), p0);
2125   __ Uqincp(z5.VnD(), p0);
2126   __ Uqincp(z6.VnS(), p0);
2127   __ Uqincp(z7.VnH(), p0);
2128 
2129   END();
2130   if (CAN_RUN()) {
2131     RUN();
2132 
2133     // z0_inputs[...] - number of active D lanes (2)
2134     uint64_t z0_expected[] = {0x1234567800000040, 0, 0, 0x7ffffffffffffffe};
2135     ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
2136 
2137     // z1_inputs[...] - number of active D lanes (2)
2138     uint64_t z1_expected[] = {0x12345678ffffff28,
2139                               0,
2140                               0xfffffffffffffffd,
2141                               0x7ffffffffffffffd};
2142     ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
2143 
2144     // z2_inputs[...] - number of active S lanes (3)
2145     uint32_t z2_expected[] =
2146         {0x1234003f, 0, 0xfffffffc, 0, 0x7ffffffc, 0x7ffffffd};
2147     ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
2148 
2149     // z3_inputs[...] - number of active H lanes (5)
2150     uint16_t z3_expected[] = {0x1225, 0, 0, 0xfffa, 0x7ffb, 0x7ffa};
2151     ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
2152 
2153     // z0_inputs[...] + number of active D lanes (2)
2154     uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2155     ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2156 
2157     // z1_inputs[...] + number of active D lanes (2)
2158     uint64_t z5_expected[] = {0x12345678ffffff2c,
2159                               2,
2160                               UINT64_MAX,
2161                               0x8000000000000001};
2162     ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2163 
2164     // z2_inputs[...] + number of active S lanes (3)
2165     uint32_t z6_expected[] =
2166         {0x12340045, 3, UINT32_MAX, 4, 0x80000002, 0x80000003};
2167     ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2168 
2169     // z3_inputs[...] + number of active H lanes (5)
2170     uint16_t z7_expected[] = {0x122f, 5, 6, UINT16_MAX, 0x8005, 0x8004};
2171     ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2172 
2173     // Check that the non-destructive macros produced the same results.
2174     ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2175     ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2176     ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2177     ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2178     ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2179     ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2180     ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2181     ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2182   }
2183 }
2184 
2185 TEST_SVE(sve_uqinc_uqdec_ptrue_vector) {
2186   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2187   START();
2188 
2189   // With an all-true predicate, these instructions increment or decrement by
2190   // the vector length.
2191   __ Ptrue(p15.VnB());
2192 
2193   __ Mov(x0, 0x1234567800000000);
2194   __ Mov(x1, 0x12340000);
2195   __ Mov(x2, 0x1200);
2196 
2197   __ Dup(z0.VnD(), x0);
2198   __ Uqdecp(z0.VnD(), p15);
2199 
2200   __ Dup(z1.VnS(), x1);
2201   __ Uqdecp(z1.VnS(), p15);
2202 
2203   __ Dup(z2.VnH(), x2);
2204   __ Uqdecp(z2.VnH(), p15);
2205 
2206   __ Dup(z3.VnD(), x0);
2207   __ Uqincp(z3.VnD(), p15);
2208 
2209   __ Dup(z4.VnS(), x1);
2210   __ Uqincp(z4.VnS(), p15);
2211 
2212   __ Dup(z5.VnH(), x2);
2213   __ Uqincp(z5.VnH(), p15);
2214 
2215   END();
2216   if (CAN_RUN()) {
2217     RUN();
2218 
2219     int d_lane_count = core.GetSVELaneCount(kDRegSize);
2220     int s_lane_count = core.GetSVELaneCount(kSRegSize);
2221     int h_lane_count = core.GetSVELaneCount(kHRegSize);
2222 
2223     for (int i = 0; i < d_lane_count; i++) {
2224       ASSERT_EQUAL_SVE_LANE(0x1234567800000000 - d_lane_count, z0.VnD(), i);
2225       ASSERT_EQUAL_SVE_LANE(0x1234567800000000 + d_lane_count, z3.VnD(), i);
2226     }
2227 
2228     for (int i = 0; i < s_lane_count; i++) {
2229       ASSERT_EQUAL_SVE_LANE(0x12340000 - s_lane_count, z1.VnS(), i);
2230       ASSERT_EQUAL_SVE_LANE(0x12340000 + s_lane_count, z4.VnS(), i);
2231     }
2232 
2233     for (int i = 0; i < h_lane_count; i++) {
2234       ASSERT_EQUAL_SVE_LANE(0x1200 - h_lane_count, z2.VnH(), i);
2235       ASSERT_EQUAL_SVE_LANE(0x1200 + h_lane_count, z5.VnH(), i);
2236     }
2237   }
2238 }
2239 
2240 TEST_SVE(sve_index) {
2241   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2242   START();
2243 
2244   // Simple cases.
2245   __ Index(z0.VnB(), 0, 1);
2246   __ Index(z1.VnH(), 1, 1);
2247   __ Index(z2.VnS(), 2, 1);
2248   __ Index(z3.VnD(), 3, 1);
2249 
2250   // Synthesised immediates.
2251   __ Index(z4.VnB(), 42, -1);
2252   __ Index(z5.VnH(), -1, 42);
2253   __ Index(z6.VnS(), 42, 42);
2254 
2255   // Register arguments.
2256   __ Mov(x0, 42);
2257   __ Mov(x1, -3);
2258   __ Index(z10.VnD(), x0, x1);
2259   __ Index(z11.VnB(), w0, w1);
2260   // The register size should correspond to the lane size, but VIXL allows any
2261   // register at least as big as the lane size.
2262   __ Index(z12.VnB(), x0, x1);
2263   __ Index(z13.VnH(), w0, x1);
2264   __ Index(z14.VnS(), x0, w1);
2265 
2266   // Integer overflow.
2267   __ Index(z20.VnB(), UINT8_MAX - 2, 2);
2268   __ Index(z21.VnH(), 7, -3);
2269   __ Index(z22.VnS(), INT32_MAX - 2, 1);
2270   __ Index(z23.VnD(), INT64_MIN + 6, -7);
2271 
2272   END();
2273 
2274   if (CAN_RUN()) {
2275     RUN();
2276 
2277     int b_lane_count = core.GetSVELaneCount(kBRegSize);
2278     int h_lane_count = core.GetSVELaneCount(kHRegSize);
2279     int s_lane_count = core.GetSVELaneCount(kSRegSize);
2280     int d_lane_count = core.GetSVELaneCount(kDRegSize);
2281 
2282     uint64_t b_mask = GetUintMask(kBRegSize);
2283     uint64_t h_mask = GetUintMask(kHRegSize);
2284     uint64_t s_mask = GetUintMask(kSRegSize);
2285     uint64_t d_mask = GetUintMask(kDRegSize);
2286 
2287     // Simple cases.
2288     for (int i = 0; i < b_lane_count; i++) {
2289       ASSERT_EQUAL_SVE_LANE((0 + i) & b_mask, z0.VnB(), i);
2290     }
2291     for (int i = 0; i < h_lane_count; i++) {
2292       ASSERT_EQUAL_SVE_LANE((1 + i) & h_mask, z1.VnH(), i);
2293     }
2294     for (int i = 0; i < s_lane_count; i++) {
2295       ASSERT_EQUAL_SVE_LANE((2 + i) & s_mask, z2.VnS(), i);
2296     }
2297     for (int i = 0; i < d_lane_count; i++) {
2298       ASSERT_EQUAL_SVE_LANE((3 + i) & d_mask, z3.VnD(), i);
2299     }
2300 
2301     // Synthesised immediates.
2302     for (int i = 0; i < b_lane_count; i++) {
2303       ASSERT_EQUAL_SVE_LANE((42 - i) & b_mask, z4.VnB(), i);
2304     }
2305     for (int i = 0; i < h_lane_count; i++) {
2306       ASSERT_EQUAL_SVE_LANE((-1 + (42 * i)) & h_mask, z5.VnH(), i);
2307     }
2308     for (int i = 0; i < s_lane_count; i++) {
2309       ASSERT_EQUAL_SVE_LANE((42 + (42 * i)) & s_mask, z6.VnS(), i);
2310     }
2311 
2312     // Register arguments.
2313     for (int i = 0; i < d_lane_count; i++) {
2314       ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & d_mask, z10.VnD(), i);
2315     }
2316     for (int i = 0; i < b_lane_count; i++) {
2317       ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z11.VnB(), i);
2318     }
2319     for (int i = 0; i < b_lane_count; i++) {
2320       ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z12.VnB(), i);
2321     }
2322     for (int i = 0; i < h_lane_count; i++) {
2323       ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & h_mask, z13.VnH(), i);
2324     }
2325     for (int i = 0; i < s_lane_count; i++) {
2326       ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & s_mask, z14.VnS(), i);
2327     }
2328 
2329     // Integer overflow.
2330     uint8_t expected_z20[] = {0x05, 0x03, 0x01, 0xff, 0xfd};
2331     ASSERT_EQUAL_SVE(expected_z20, z20.VnB());
2332     uint16_t expected_z21[] = {0xfffb, 0xfffe, 0x0001, 0x0004, 0x0007};
2333     ASSERT_EQUAL_SVE(expected_z21, z21.VnH());
2334     uint32_t expected_z22[] = {0x80000000, 0x7fffffff, 0x7ffffffe, 0x7ffffffd};
2335     ASSERT_EQUAL_SVE(expected_z22, z22.VnS());
2336     uint64_t expected_z23[] = {0x7fffffffffffffff, 0x8000000000000006};
2337     ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
2338   }
2339 }
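
// `Index` writes an arithmetic sequence: lane i receives start + i * step,
// truncated to the lane size, which is why the checks above mask with
// GetUintMask(). A reference sketch (illustrative helper name):
static uint64_t IndexLane(uint64_t start, uint64_t step, int i,
                          uint64_t lane_mask) {
  return (start + (step * static_cast<uint64_t>(i))) & lane_mask;
}
// E.g. IndexLane(42, static_cast<uint64_t>(-3), i, b_mask) reproduces the
// z11 lanes above, wrapping modulo 256 once the sequence goes below zero.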
2340 
2341 TEST(sve_int_compare_count_and_limit_scalars) {
2342   SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2343   START();
2344 
2345   __ Mov(w20, 0xfffffffd);
2346   __ Mov(w21, 0xffffffff);
2347 
2348   __ Whilele(p0.VnB(), w20, w21);
2349   __ Mrs(x0, NZCV);
2350   __ Whilele(p1.VnH(), w20, w21);
2351   __ Mrs(x1, NZCV);
2352 
2353   __ Mov(w20, 0xffffffff);
2354   __ Mov(w21, 0x00000000);
2355 
2356   __ Whilelt(p2.VnS(), w20, w21);
2357   __ Mrs(x2, NZCV);
2358   __ Whilelt(p3.VnD(), w20, w21);
2359   __ Mrs(x3, NZCV);
2360 
2361   __ Mov(w20, 0xfffffffd);
2362   __ Mov(w21, 0xffffffff);
2363 
2364   __ Whilels(p4.VnB(), w20, w21);
2365   __ Mrs(x4, NZCV);
2366   __ Whilels(p5.VnH(), w20, w21);
2367   __ Mrs(x5, NZCV);
2368 
2369   __ Mov(w20, 0xffffffff);
2370   __ Mov(w21, 0x00000000);
2371 
2372   __ Whilelo(p6.VnS(), w20, w21);
2373   __ Mrs(x6, NZCV);
2374   __ Whilelo(p7.VnD(), w20, w21);
2375   __ Mrs(x7, NZCV);
2376 
2377   __ Mov(x20, 0xfffffffffffffffd);
2378   __ Mov(x21, 0xffffffffffffffff);
2379 
2380   __ Whilele(p8.VnB(), x20, x21);
2381   __ Mrs(x8, NZCV);
2382   __ Whilele(p9.VnH(), x20, x21);
2383   __ Mrs(x9, NZCV);
2384 
2385   __ Mov(x20, 0xffffffffffffffff);
2386   __ Mov(x21, 0x0000000000000000);
2387 
2388   __ Whilelt(p10.VnS(), x20, x21);
2389   __ Mrs(x10, NZCV);
2390   __ Whilelt(p11.VnD(), x20, x21);
2391   __ Mrs(x11, NZCV);
2392 
2393   __ Mov(x20, 0xfffffffffffffffd);
2394   __ Mov(x21, 0xffffffffffffffff);
2395 
2396   __ Whilels(p12.VnB(), x20, x21);
2397   __ Mrs(x12, NZCV);
2398   __ Whilels(p13.VnH(), x20, x21);
2399   __ Mrs(x13, NZCV);
2400 
2401   __ Mov(x20, 0xffffffffffffffff);
2402   __ Mov(x21, 0x0000000000000000);
2403 
2404   __ Whilelo(p14.VnS(), x20, x21);
2405   __ Mrs(x14, NZCV);
2406   __ Whilelo(p15.VnD(), x20, x21);
2407   __ Mrs(x15, NZCV);
2408 
2409   END();
2410 
2411   if (CAN_RUN()) {
2412     RUN();
2413 
2414     // 0b...00000000'00000111
2415     int p0_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2416     ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
2417 
2418     // 0b...00000000'00010101
2419     int p1_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2420     ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
2421 
2422     int p2_expected[] = {0x0, 0x0, 0x0, 0x1};
2423     ASSERT_EQUAL_SVE(p2_expected, p2.VnS());
2424 
2425     int p3_expected[] = {0x00, 0x01};
2426     ASSERT_EQUAL_SVE(p3_expected, p3.VnD());
2427 
2428     // 0b...11111111'11111111
2429     int p4_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2430     ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
2431 
2432     // 0b...01010101'01010101
2433     int p5_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2434     ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2435 
2436     int p6_expected[] = {0x0, 0x0, 0x0, 0x0};
2437     ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2438 
2439     int p7_expected[] = {0x00, 0x00};
2440     ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
2441 
2442     // 0b...00000000'00000111
2443     int p8_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2444     ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
2445 
2446     // 0b...00000000'00010101
2447     int p9_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2448     ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
2449 
2450     int p10_expected[] = {0x0, 0x0, 0x0, 0x1};
2451     ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
2452 
2453     int p11_expected[] = {0x00, 0x01};
2454     ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2455 
2456     // 0b...11111111'11111111
2457     int p12_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2458     ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
2459 
2460     // 0b...01010101'01010101
2461     int p13_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2462     ASSERT_EQUAL_SVE(p13_expected, p13.VnH());
2463 
2464     int p14_expected[] = {0x0, 0x0, 0x0, 0x0};
2465     ASSERT_EQUAL_SVE(p14_expected, p14.VnS());
2466 
2467     int p15_expected[] = {0x00, 0x00};
2468     ASSERT_EQUAL_SVE(p15_expected, p15.VnD());
2469 
2470     ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w0);
2471     ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w1);
2472     ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w2);
2473     ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w3);
2474     ASSERT_EQUAL_32(SVEFirstFlag, w4);
2475     ASSERT_EQUAL_32(SVEFirstFlag, w5);
2476     ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w6);
2477     ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w7);
2478     ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w8);
2479     ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w9);
2480     ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
2481     ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w11);
2482     ASSERT_EQUAL_32(SVEFirstFlag, w12);
2483     ASSERT_EQUAL_32(SVEFirstFlag, w13);
2484     ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w14);
2485     ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w15);
2486   }
2487 }
2488 
2489 TEST(sve_int_compare_count_and_limit_scalars_regression_test) {
2490   SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2491   START();
2492 
2493   __ Mov(w0, 0x7ffffffd);
2494   __ Mov(w1, 0x7fffffff);
2495   __ Whilele(p0.VnB(), w0, w1);
2496 
2497   END();
2498 
2499   if (CAN_RUN()) {
2500     RUN();
2501 
2502     int p0_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2503     ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
2504   }
2505 }
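
// A reference sketch of the `whilele` lane predicate (signed, 32-bit scalar
// form; illustrative, not VIXL's implementation). The first operand is
// incremented with the wrapping semantics of its register width, which is
// exactly what the regression test above exercises: 0x7ffffffd + 3 wraps
// negative and so stays <= 0x7fffffff, making every lane active.
static bool WhileleLaneActive(uint32_t start, uint32_t limit, int i) {
  int32_t value = static_cast<int32_t>(start + static_cast<uint32_t>(i));
  return value <= static_cast<int32_t>(limit);
}
// The same model gives three active lanes for the 0xfffffffd..0xffffffff
// (-3..-1) case in the preceding test.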
2506 
2507 TEST(sve_int_compare_vectors_signed_imm) {
2508   SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2509   START();
2510 
2511   int z13_inputs[] = {0, 1, -1, -15, 126, -127, -126, -15};
2512   int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 1, 1};
2513   InsrHelper(&masm, z13.VnB(), z13_inputs);
2514   Initialise(&masm, p0.VnB(), mask_inputs1);
2515 
2516   __ Cmpeq(p2.VnB(), p0.Zeroing(), z13.VnB(), -15);
2517   __ Mrs(x2, NZCV);
2518   __ Cmpeq(p3.VnB(), p0.Zeroing(), z13.VnB(), -127);
2519 
2520   int z14_inputs[] = {0, 1, -1, -32767, -32766, 32767, 32766, 0};
2521   int mask_inputs2[] = {1, 1, 1, 0, 1, 1, 1, 1};
2522   InsrHelper(&masm, z14.VnH(), z14_inputs);
2523   Initialise(&masm, p0.VnH(), mask_inputs2);
2524 
2525   __ Cmpge(p4.VnH(), p0.Zeroing(), z14.VnH(), -1);
2526   __ Mrs(x4, NZCV);
2527   __ Cmpge(p5.VnH(), p0.Zeroing(), z14.VnH(), -32767);
2528 
2529   int z15_inputs[] = {0, 1, -1, INT_MIN};
2530   int mask_inputs3[] = {0, 1, 1, 1};
2531   InsrHelper(&masm, z15.VnS(), z15_inputs);
2532   Initialise(&masm, p0.VnS(), mask_inputs3);
2533 
2534   __ Cmpgt(p6.VnS(), p0.Zeroing(), z15.VnS(), 0);
2535   __ Mrs(x6, NZCV);
2536   __ Cmpgt(p7.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2537 
2538   __ Cmplt(p8.VnS(), p0.Zeroing(), z15.VnS(), 0);
2539   __ Mrs(x8, NZCV);
2540   __ Cmplt(p9.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2541 
2542   int64_t z16_inputs[] = {0, -1};
2543   int mask_inputs4[] = {1, 1};
2544   InsrHelper(&masm, z16.VnD(), z16_inputs);
2545   Initialise(&masm, p0.VnD(), mask_inputs4);
2546 
2547   __ Cmple(p10.VnD(), p0.Zeroing(), z16.VnD(), -1);
2548   __ Mrs(x10, NZCV);
2549   __ Cmple(p11.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MIN);
2550 
2551   __ Cmpne(p12.VnD(), p0.Zeroing(), z16.VnD(), -1);
2552   __ Mrs(x12, NZCV);
2553   __ Cmpne(p13.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MAX);
2554 
2555   END();
2556 
2557   if (CAN_RUN()) {
2558     RUN();
2559 
2560     int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1};
2561     ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2562 
2563     int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 0};
2564     ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2565 
2566     int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1, 0x1};
2567     ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2568 
2569     int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1};
2570     ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2571 
2572     int p6_expected[] = {0x0, 0x1, 0x0, 0x0};
2573     ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2574 
2575     int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
2576     ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2577 
2578     int p8_expected[] = {0x0, 0x0, 0x1, 0x1};
2579     ASSERT_EQUAL_SVE(p8_expected, p8.VnS());
2580 
2581     int p9_expected[] = {0x0, 0x0, 0x0, 0x1};
2582     ASSERT_EQUAL_SVE(p9_expected, p9.VnS());
2583 
2584     int p10_expected[] = {0x00, 0x01};
2585     ASSERT_EQUAL_SVE(p10_expected, p10.VnD());
2586 
2587     int p11_expected[] = {0x00, 0x00};
2588     ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2589 
2590     int p12_expected[] = {0x01, 0x00};
2591     ASSERT_EQUAL_SVE(p12_expected, p12.VnD());
2592 
2593     int p13_expected[] = {0x01, 0x01};
2594     ASSERT_EQUAL_SVE(p13_expected, p13.VnD());
2595 
2596     ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w2);
2597     ASSERT_EQUAL_32(SVEFirstFlag, w4);
2598     ASSERT_EQUAL_32(NoFlag, w6);
2599     ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2600     ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w10);
2601     ASSERT_EQUAL_32(NoFlag, w12);
2602   }
2603 }
2604 
2605 TEST(sve_int_compare_vectors_unsigned_imm) {
2606   SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2607   START();
2608 
2609   uint32_t src1_inputs[] = {0xf7, 0x0f, 0x8f, 0x1f, 0x83, 0x12, 0x00, 0xf1};
2610   int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 0, 1};
2611   InsrHelper(&masm, z13.VnB(), src1_inputs);
2612   Initialise(&masm, p0.VnB(), mask_inputs1);
2613 
2614   __ Cmphi(p2.VnB(), p0.Zeroing(), z13.VnB(), 0x0f);
2615   __ Mrs(x2, NZCV);
2616   __ Cmphi(p3.VnB(), p0.Zeroing(), z13.VnB(), 0xf0);
2617 
2618   uint32_t src2_inputs[] = {0xffff, 0x8000, 0x1fff, 0x0000, 0x1234};
2619   int mask_inputs2[] = {1, 1, 1, 1, 0};
2620   InsrHelper(&masm, z13.VnH(), src2_inputs);
2621   Initialise(&masm, p0.VnH(), mask_inputs2);
2622 
2623   __ Cmphs(p4.VnH(), p0.Zeroing(), z13.VnH(), 0x1f);
2624   __ Mrs(x4, NZCV);
2625   __ Cmphs(p5.VnH(), p0.Zeroing(), z13.VnH(), 0x1fff);
2626 
2627   uint32_t src3_inputs[] = {0xffffffff, 0xfedcba98, 0x0000ffff, 0x00000000};
2628   int mask_inputs3[] = {1, 1, 1, 1};
2629   InsrHelper(&masm, z13.VnS(), src3_inputs);
2630   Initialise(&masm, p0.VnS(), mask_inputs3);
2631 
2632   __ Cmplo(p6.VnS(), p0.Zeroing(), z13.VnS(), 0x3f);
2633   __ Mrs(x6, NZCV);
2634   __ Cmplo(p7.VnS(), p0.Zeroing(), z13.VnS(), 0x3f3f3f3f);
2635 
2636   uint64_t src4_inputs[] = {0xffffffffffffffff, 0x0000000000000000};
2637   int mask_inputs4[] = {1, 1};
2638   InsrHelper(&masm, z13.VnD(), src4_inputs);
2639   Initialise(&masm, p0.VnD(), mask_inputs4);
2640 
2641   __ Cmpls(p8.VnD(), p0.Zeroing(), z13.VnD(), 0x2f);
2642   __ Mrs(x8, NZCV);
2643   __ Cmpls(p9.VnD(), p0.Zeroing(), z13.VnD(), 0x800000000000000);
2644 
2645   END();
2646 
2647   if (CAN_RUN()) {
2648     RUN();
2649 
2650     int p2_expected[] = {1, 0, 1, 0, 1, 1, 0, 1};
2651     ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2652 
2653     int p3_expected[] = {1, 0, 0, 0, 0, 0, 0, 1};
2654     ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2655 
2656     int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2657     ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2658 
2659     int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2660     ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2661 
2662     int p6_expected[] = {0x0, 0x0, 0x0, 0x1};
2663     ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2664 
2665     int p7_expected[] = {0x0, 0x0, 0x1, 0x1};
2666     ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2667 
2668     int p8_expected[] = {0x00, 0x01};
2669     ASSERT_EQUAL_SVE(p8_expected, p8.VnD());
2670 
2671     int p9_expected[] = {0x00, 0x01};
2672     ASSERT_EQUAL_SVE(p9_expected, p9.VnD());
2673 
2674     ASSERT_EQUAL_32(SVEFirstFlag, w2);
2675     ASSERT_EQUAL_32(NoFlag, w4);
2676     ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w6);
2677     ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2678   }
2679 }
2680 
2681 TEST(sve_int_compare_conditionally_terminate_scalars) {
2682   SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2683   START();
2684 
2685   __ Mov(x0, 0xfedcba9887654321);
2686   __ Mov(x1, 0x1000100010001000);
2687 
2688   // Initialise Z and C. These are preserved by cterm*, and the V flag is set to
2689   // !C if the condition does not hold.
2690   __ Mov(x10, NoFlag);
2691   __ Msr(NZCV, x10);
2692 
2693   __ Ctermeq(w0, w0);
2694   __ Mrs(x2, NZCV);
2695   __ Ctermeq(x0, x1);
2696   __ Mrs(x3, NZCV);
2697   __ Ctermne(x0, x0);
2698   __ Mrs(x4, NZCV);
2699   __ Ctermne(w0, w1);
2700   __ Mrs(x5, NZCV);
2701 
2702   // As above, but with all flags initially set.
2703   __ Mov(x10, NZCVFlag);
2704   __ Msr(NZCV, x10);
2705 
2706   __ Ctermeq(w0, w0);
2707   __ Mrs(x6, NZCV);
2708   __ Ctermeq(x0, x1);
2709   __ Mrs(x7, NZCV);
2710   __ Ctermne(x0, x0);
2711   __ Mrs(x8, NZCV);
2712   __ Ctermne(w0, w1);
2713   __ Mrs(x9, NZCV);
2714 
2715   END();
2716 
2717   if (CAN_RUN()) {
2718     RUN();
2719 
2720     ASSERT_EQUAL_32(SVEFirstFlag, w2);
2721     ASSERT_EQUAL_32(VFlag, w3);
2722     ASSERT_EQUAL_32(VFlag, w4);
2723     ASSERT_EQUAL_32(SVEFirstFlag, w5);
2724 
2725     ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w6);
2726     ASSERT_EQUAL_32(ZCFlag, w7);
2727     ASSERT_EQUAL_32(ZCFlag, w8);
2728     ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w9);
2729   }
2730 }
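
// A reference sketch of the CTERM flag update, as exercised above: Z and C
// are left unchanged; if the termination condition holds, N is set (reported
// as SVEFirstFlag in these tests) and V is cleared, otherwise N is cleared
// and V becomes !C. Illustrative only, not a VIXL API:
static uint32_t CtermFlags(uint32_t nzcv, bool condition_holds) {
  uint32_t zc = nzcv & (ZFlag | CFlag);  // Preserved in all cases.
  if (condition_holds) return zc | NFlag;
  return zc | (((nzcv & CFlag) == 0) ? VFlag : 0);
}
// E.g. CtermFlags(NoFlag, true) is NFlag (w2 above), and
// CtermFlags(NZCVFlag, false) is ZCFlag (w7 above).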
2731 
2732 // Work out what the architectural `PredTest` pseudocode should produce for the
2733 // given result and governing predicate.
2734 template <typename Tg, typename Td, int N>
2735 static StatusFlags GetPredTestFlags(const Td (&pd)[N],
2736                                     const Tg (&pg)[N],
2737                                     int vl) {
2738   int first = -1;
2739   int last = -1;
2740   bool any_active = false;
2741 
2742   // Only consider potentially-active lanes.
2743   int start = (N > vl) ? (N - vl) : 0;
2744   for (int i = start; i < N; i++) {
2745     if ((pg[i] & 1) == 1) {
2746       // Look for the first and last active lanes.
2747       // Note that the 'first' lane is the one with the highest index.
2748       if (last < 0) last = i;
2749       first = i;
2750       // Look for any active lanes that are also active in pd.
2751       if ((pd[i] & 1) == 1) any_active = true;
2752     }
2753   }
2754 
2755   uint32_t flags = 0;
2756   if ((first >= 0) && ((pd[first] & 1) == 1)) flags |= SVEFirstFlag;
2757   if (!any_active) flags |= SVENoneFlag;
2758   if ((last < 0) || ((pd[last] & 1) == 0)) flags |= SVENotLastFlag;
2759   return static_cast<StatusFlags>(flags);
2760 }
2761 
2762 typedef void (MacroAssembler::*PfirstPnextFn)(const PRegisterWithLaneSize& pd,
2763                                               const PRegister& pg,
2764                                               const PRegisterWithLaneSize& pn);
2765 template <typename Tg, typename Tn, typename Td>
2766 static void PfirstPnextHelper(Test* config,
2767                               PfirstPnextFn macro,
2768                               unsigned lane_size_in_bits,
2769                               const Tg& pg_inputs,
2770                               const Tn& pn_inputs,
2771                               const Td& pd_expected) {
2772   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2773   START();
2774 
2775   PRegister pg = p15;
2776   PRegister pn = p14;
2777   Initialise(&masm, pg.WithLaneSize(lane_size_in_bits), pg_inputs);
2778   Initialise(&masm, pn.WithLaneSize(lane_size_in_bits), pn_inputs);
2779 
2780   // Initialise NZCV to an impossible value, to check that we actually write it.
2781   __ Mov(x10, NZCVFlag);
2782 
2783   // If pd.Is(pn), the MacroAssembler simply passes the arguments directly to
2784   // the Assembler.
2785   __ Msr(NZCV, x10);
2786   __ Mov(p0, pn);
2787   (masm.*macro)(p0.WithLaneSize(lane_size_in_bits),
2788                 pg,
2789                 p0.WithLaneSize(lane_size_in_bits));
2790   __ Mrs(x0, NZCV);
2791 
2792   // The MacroAssembler supports non-destructive use.
2793   __ Msr(NZCV, x10);
2794   (masm.*macro)(p1.WithLaneSize(lane_size_in_bits),
2795                 pg,
2796                 pn.WithLaneSize(lane_size_in_bits));
2797   __ Mrs(x1, NZCV);
2798 
2799   // If pd.Aliases(pg), the macro requires a scratch register.
2800   {
2801     UseScratchRegisterScope temps(&masm);
2802     temps.Include(p13);
2803     __ Msr(NZCV, x10);
2804     __ Mov(p2, p15);
2805     (masm.*macro)(p2.WithLaneSize(lane_size_in_bits),
2806                   p2,
2807                   pn.WithLaneSize(lane_size_in_bits));
2808     __ Mrs(x2, NZCV);
2809   }
2810 
2811   END();
2812 
2813   if (CAN_RUN()) {
2814     RUN();
2815 
2816     // Check that the inputs weren't modified.
2817     ASSERT_EQUAL_SVE(pn_inputs, pn.WithLaneSize(lane_size_in_bits));
2818     ASSERT_EQUAL_SVE(pg_inputs, pg.WithLaneSize(lane_size_in_bits));
2819 
2820     // Check the primary operation.
2821     ASSERT_EQUAL_SVE(pd_expected, p0.WithLaneSize(lane_size_in_bits));
2822     ASSERT_EQUAL_SVE(pd_expected, p1.WithLaneSize(lane_size_in_bits));
2823     ASSERT_EQUAL_SVE(pd_expected, p2.WithLaneSize(lane_size_in_bits));
2824 
2825     // Check that the flags were properly set.
2826     StatusFlags nzcv_expected =
2827         GetPredTestFlags(pd_expected,
2828                          pg_inputs,
2829                          core.GetSVELaneCount(kBRegSize));
2830     ASSERT_EQUAL_64(nzcv_expected, x0);
2831     ASSERT_EQUAL_64(nzcv_expected, x1);
2832     ASSERT_EQUAL_64(nzcv_expected, x2);
2833   }
2834 }
2835 
2836 template <typename Tg, typename Tn, typename Td>
2837 static void PfirstHelper(Test* config,
2838                          const Tg& pg_inputs,
2839                          const Tn& pn_inputs,
2840                          const Td& pd_expected) {
2841   PfirstPnextHelper(config,
2842                     &MacroAssembler::Pfirst,
2843                     kBRegSize,  // pfirst only accepts B-sized lanes.
2844                     pg_inputs,
2845                     pn_inputs,
2846                     pd_expected);
2847 }
2848 
2849 template <typename Tg, typename Tn, typename Td>
2850 static void PnextHelper(Test* config,
2851                         unsigned lane_size_in_bits,
2852                         const Tg& pg_inputs,
2853                         const Tn& pn_inputs,
2854                         const Td& pd_expected) {
2855   PfirstPnextHelper(config,
2856                     &MacroAssembler::Pnext,
2857                     lane_size_in_bits,
2858                     pg_inputs,
2859                     pn_inputs,
2860                     pd_expected);
2861 }
2862 
2863 TEST_SVE(sve_pfirst) {
2864   // Provide more lanes than kPRegMinSize (to check propagation if we have a
2865   // large VL), but few enough to make the test easy to read.
2866   int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2867   int in1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2868   int in2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2869   int in3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2870   int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2871   VIXL_ASSERT(ArrayLength(in0) > kPRegMinSize);
2872 
2873   // Pfirst finds the first active lane in pg, and activates the corresponding
2874   // lane in pn (if it isn't already active).
2875 
2876   //                             The first active lane in in1 is here. |
2877   //                                                                   v
2878   int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
2879   int exp12[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0};
2880   int exp13[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2881   int exp14[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
2882   PfirstHelper(config, in1, in0, exp10);
2883   PfirstHelper(config, in1, in2, exp12);
2884   PfirstHelper(config, in1, in3, exp13);
2885   PfirstHelper(config, in1, in4, exp14);
2886 
2887   //                          The first active lane in in2 is here. |
2888   //                                                                v
2889   int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
2890   int exp21[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0};
2891   int exp23[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2892   int exp24[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
2893   PfirstHelper(config, in2, in0, exp20);
2894   PfirstHelper(config, in2, in1, exp21);
2895   PfirstHelper(config, in2, in3, exp23);
2896   PfirstHelper(config, in2, in4, exp24);
2897 
2898   //                                   The first active lane in in3 is here. |
2899   //                                                                         v
2900   int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
2901   int exp31[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
2902   int exp32[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
2903   int exp34[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
2904   PfirstHelper(config, in3, in0, exp30);
2905   PfirstHelper(config, in3, in1, exp31);
2906   PfirstHelper(config, in3, in2, exp32);
2907   PfirstHelper(config, in3, in4, exp34);
2908 
2909   //             | The first active lane in in4 is here.
2910   //             v
2911   int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2912   int exp41[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2913   int exp42[] = {1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2914   int exp43[] = {1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2915   PfirstHelper(config, in4, in0, exp40);
2916   PfirstHelper(config, in4, in1, exp41);
2917   PfirstHelper(config, in4, in2, exp42);
2918   PfirstHelper(config, in4, in3, exp43);
2919 
2920   // If pg is all inactive, the input is passed through unchanged.
2921   PfirstHelper(config, in0, in0, in0);
2922   PfirstHelper(config, in0, in1, in1);
2923   PfirstHelper(config, in0, in2, in2);
2924   PfirstHelper(config, in0, in3, in3);
2925 
2926   // If the values of pg and pn match, the value is passed through unchanged.
2927   PfirstHelper(config, in0, in0, in0);
2928   PfirstHelper(config, in1, in1, in1);
2929   PfirstHelper(config, in2, in2, in2);
2930   PfirstHelper(config, in3, in3, in3);
2931 }
2932 
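// A minimal reference model of the pfirst behaviour exercised above. This is
// an illustrative sketch only (it is not used by the tests; the simulator
// defines the authoritative semantics). As in the arrays above, lanes are
// listed highest-numbered first, so lane i of an N-element array is element
// (N - 1 - i).
template <size_t N>
void PfirstRef(const int (&pg)[N], const int (&pn)[N], int (&pd)[N]) {
  // Start with pd = pn.
  for (size_t i = 0; i < N; i++) pd[i] = pn[i];
  // Activate the lane of pd corresponding to the first (lowest-numbered)
  // active lane of pg, if pg has one. Scanning the array backwards visits
  // lane 0 first.
  for (size_t i = N; i-- > 0;) {
    if (pg[i] != 0) {
      pd[i] = 1;
      return;
    }
  }
}
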
2933 TEST_SVE(sve_pfirst_alias) {
2934   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2935   START();
2936 
2937   // Check that the Simulator behaves correctly when all arguments are aliased.
2938   int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
2939   int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
2940   int in_s[] = {0, 1, 1, 0};
2941   int in_d[] = {1, 1};
2942 
2943   Initialise(&masm, p0.VnB(), in_b);
2944   Initialise(&masm, p1.VnH(), in_h);
2945   Initialise(&masm, p2.VnS(), in_s);
2946   Initialise(&masm, p3.VnD(), in_d);
2947 
2948   // Initialise NZCV to an impossible value, to check that we actually write it.
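  // (NZCVFlag sets all four flags. An SVE predicate operation never produces
  // this combination: FIRST (N) and NONE (Z) are mutually exclusive, and V is
  // always clear.)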
2949   __ Mov(x10, NZCVFlag);
2950 
2951   __ Msr(NZCV, x10);
2952   __ Pfirst(p0.VnB(), p0, p0.VnB());
2953   __ Mrs(x0, NZCV);
2954 
2955   __ Msr(NZCV, x10);
2956   __ Pfirst(p1.VnB(), p1, p1.VnB());
2957   __ Mrs(x1, NZCV);
2958 
2959   __ Msr(NZCV, x10);
2960   __ Pfirst(p2.VnB(), p2, p2.VnB());
2961   __ Mrs(x2, NZCV);
2962 
2963   __ Msr(NZCV, x10);
2964   __ Pfirst(p3.VnB(), p3, p3.VnB());
2965   __ Mrs(x3, NZCV);
2966 
2967   END();
2968 
2969   if (CAN_RUN()) {
2970     RUN();
2971 
2972     // The first lane from pg is already active in pdn, so the P register should
2973     // be unchanged.
2974     ASSERT_EQUAL_SVE(in_b, p0.VnB());
2975     ASSERT_EQUAL_SVE(in_h, p1.VnH());
2976     ASSERT_EQUAL_SVE(in_s, p2.VnS());
2977     ASSERT_EQUAL_SVE(in_d, p3.VnD());
2978 
2979     ASSERT_EQUAL_64(SVEFirstFlag, x0);
2980     ASSERT_EQUAL_64(SVEFirstFlag, x1);
2981     ASSERT_EQUAL_64(SVEFirstFlag, x2);
2982     ASSERT_EQUAL_64(SVEFirstFlag, x3);
2983   }
2984 }
2985 
2986 TEST_SVE(sve_pnext_b) {
2987   // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
2988   // (to check propagation if we have a large VL), but few enough to make the
2989   // test easy to read.
2990   // For now, we just use kPRegMinSize so that the test works anywhere.
2991   int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2992   int in1[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2993   int in2[] = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2994   int in3[] = {0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1};
2995   int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2996 
2997   // Pnext activates the next element that is true in pg, after the last-active
2998   // element in pn. If all pn elements are false (as in in0), it starts looking
2999   // at element 0.
3000 
3001   // There are no active lanes in in0, so the result is simply the first active
3002   // lane from pg.
3003   int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3004   int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
3005   int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
3006   int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
3007   int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3008 
3009   //      The last active lane in in1 is here. |
3010   //                                           v
3011   int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3012   int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3013   int exp21[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3014   int exp31[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3015   int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3016 
3017   //                | The last active lane in in2 is here.
3018   //                v
3019   int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3020   int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3021   int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3022   int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3023   int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3024 
3025   //                               | The last active lane in in3 is here.
3026   //                               v
3027   int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3028   int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3029   int exp23[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3030   int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3031   int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3032 
3033   //             | The last active lane in in4 is here.
3034   //             v
3035   int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3036   int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3037   int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3038   int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3039   int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3040 
3041   PnextHelper(config, kBRegSize, in0, in0, exp00);
3042   PnextHelper(config, kBRegSize, in1, in0, exp10);
3043   PnextHelper(config, kBRegSize, in2, in0, exp20);
3044   PnextHelper(config, kBRegSize, in3, in0, exp30);
3045   PnextHelper(config, kBRegSize, in4, in0, exp40);
3046 
3047   PnextHelper(config, kBRegSize, in0, in1, exp01);
3048   PnextHelper(config, kBRegSize, in1, in1, exp11);
3049   PnextHelper(config, kBRegSize, in2, in1, exp21);
3050   PnextHelper(config, kBRegSize, in3, in1, exp31);
3051   PnextHelper(config, kBRegSize, in4, in1, exp41);
3052 
3053   PnextHelper(config, kBRegSize, in0, in2, exp02);
3054   PnextHelper(config, kBRegSize, in1, in2, exp12);
3055   PnextHelper(config, kBRegSize, in2, in2, exp22);
3056   PnextHelper(config, kBRegSize, in3, in2, exp32);
3057   PnextHelper(config, kBRegSize, in4, in2, exp42);
3058 
3059   PnextHelper(config, kBRegSize, in0, in3, exp03);
3060   PnextHelper(config, kBRegSize, in1, in3, exp13);
3061   PnextHelper(config, kBRegSize, in2, in3, exp23);
3062   PnextHelper(config, kBRegSize, in3, in3, exp33);
3063   PnextHelper(config, kBRegSize, in4, in3, exp43);
3064 
3065   PnextHelper(config, kBRegSize, in0, in4, exp04);
3066   PnextHelper(config, kBRegSize, in1, in4, exp14);
3067   PnextHelper(config, kBRegSize, in2, in4, exp24);
3068   PnextHelper(config, kBRegSize, in3, in4, exp34);
3069   PnextHelper(config, kBRegSize, in4, in4, exp44);
3070 }
3071 
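// A minimal reference model of the pnext behaviour exercised by these tests;
// an illustrative sketch only (the simulator is authoritative). Arrays list
// the highest-numbered lane first, so lane i is element (N - 1 - i). A lane
// counts as active only if the first (lowest) bit of its field is set, which
// is why bit 0 is tested below; this matters for the wider-lane tests that
// follow, where fields can hold values such as 2 or 3.
template <size_t N>
void PnextRef(const int (&pg)[N], const int (&pn)[N], int (&pd)[N]) {
  // Find the last (highest-numbered) active lane of pn; -1 if there is none,
  // so that the search below starts at lane 0.
  int last = -1;
  for (size_t i = 0; i < N; i++) {
    if ((pn[i] & 1) != 0) {
      last = static_cast<int>(N - 1 - i);
      break;
    }
  }
  // The result has at most one active lane: the first active lane of pg
  // strictly above `last`.
  for (size_t i = 0; i < N; i++) pd[i] = 0;
  for (int lane = last + 1; lane < static_cast<int>(N); lane++) {
    size_t element = N - 1 - static_cast<size_t>(lane);
    if ((pg[element] & 1) != 0) {
      pd[element] = 1;
      return;
    }
  }
}
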
3072 TEST_SVE(sve_pnext_h) {
3073   // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3074   // (to check propagation if we have a large VL), but few enough to make the
3075   // test easy to read.
3076   // For now, we just use kPRegMinSize so that the test works anywhere.
3077   int in0[] = {0, 0, 0, 0, 0, 0, 0, 0};
3078   int in1[] = {0, 0, 0, 1, 0, 2, 1, 0};
3079   int in2[] = {0, 1, 2, 0, 2, 0, 2, 0};
3080   int in3[] = {0, 0, 0, 3, 0, 0, 0, 3};
3081   int in4[] = {3, 0, 0, 0, 0, 0, 0, 0};
3082 
3083   // Pnext activates the next element that is true in pg, after the last-active
3084   // element in pn. If all pn elements are false (as in in0), it starts looking
3085   // at element 0.
3086   //
3087   // As for other SVE instructions, elements are only considered to be active if
3088   // the _first_ bit in each field is one. Other bits are ignored.
3089 
3090   // There are no active lanes in in0, so the result is simply the first active
3091   // lane from pg.
3092   int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0};
3093   int exp10[] = {0, 0, 0, 0, 0, 0, 1, 0};
3094   int exp20[] = {0, 1, 0, 0, 0, 0, 0, 0};
3095   int exp30[] = {0, 0, 0, 0, 0, 0, 0, 1};
3096   int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0};
3097 
3098   //                      | The last active lane in in1 is here.
3099   //                      v
3100   int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0};
3101   int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0};
3102   int exp21[] = {0, 1, 0, 0, 0, 0, 0, 0};
3103   int exp31[] = {0, 0, 0, 0, 0, 0, 0, 0};
3104   int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0};
3105 
3106   //                | The last active lane in in2 is here.
3107   //                v
3108   int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0};
3109   int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0};
3110   int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0};
3111   int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0};
3112   int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0};
3113 
3114   //                      | The last active lane in in3 is here.
3115   //                      v
3116   int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0};
3117   int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0};
3118   int exp23[] = {0, 1, 0, 0, 0, 0, 0, 0};
3119   int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0};
3120   int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0};
3121 
3122   //             | The last active lane in in4 is here.
3123   //             v
3124   int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0};
3125   int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0};
3126   int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0};
3127   int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0};
3128   int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0};
3129 
3130   PnextHelper(config, kHRegSize, in0, in0, exp00);
3131   PnextHelper(config, kHRegSize, in1, in0, exp10);
3132   PnextHelper(config, kHRegSize, in2, in0, exp20);
3133   PnextHelper(config, kHRegSize, in3, in0, exp30);
3134   PnextHelper(config, kHRegSize, in4, in0, exp40);
3135 
3136   PnextHelper(config, kHRegSize, in0, in1, exp01);
3137   PnextHelper(config, kHRegSize, in1, in1, exp11);
3138   PnextHelper(config, kHRegSize, in2, in1, exp21);
3139   PnextHelper(config, kHRegSize, in3, in1, exp31);
3140   PnextHelper(config, kHRegSize, in4, in1, exp41);
3141 
3142   PnextHelper(config, kHRegSize, in0, in2, exp02);
3143   PnextHelper(config, kHRegSize, in1, in2, exp12);
3144   PnextHelper(config, kHRegSize, in2, in2, exp22);
3145   PnextHelper(config, kHRegSize, in3, in2, exp32);
3146   PnextHelper(config, kHRegSize, in4, in2, exp42);
3147 
3148   PnextHelper(config, kHRegSize, in0, in3, exp03);
3149   PnextHelper(config, kHRegSize, in1, in3, exp13);
3150   PnextHelper(config, kHRegSize, in2, in3, exp23);
3151   PnextHelper(config, kHRegSize, in3, in3, exp33);
3152   PnextHelper(config, kHRegSize, in4, in3, exp43);
3153 
3154   PnextHelper(config, kHRegSize, in0, in4, exp04);
3155   PnextHelper(config, kHRegSize, in1, in4, exp14);
3156   PnextHelper(config, kHRegSize, in2, in4, exp24);
3157   PnextHelper(config, kHRegSize, in3, in4, exp34);
3158   PnextHelper(config, kHRegSize, in4, in4, exp44);
3159 }
3160 
3161 TEST_SVE(sve_pnext_s) {
3162   // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3163   // (to check propagation if we have a large VL), but few enough to make the
3164   // test easy to read.
3165   // For now, we just use kPRegMinSize so that the test works anywhere.
3166   int in0[] = {0xe, 0xc, 0x8, 0x0};
3167   int in1[] = {0x0, 0x2, 0x0, 0x1};
3168   int in2[] = {0x0, 0x1, 0xf, 0x0};
3169   int in3[] = {0xf, 0x0, 0x0, 0x0};
3170 
3171   // Pnext activates the next element that is true in pg, after the last-active
3172   // element in pn. If all pn elements are false (as in in0), it starts looking
3173   // at element 0.
3174   //
3175   // As with other SVE instructions, elements are only considered to be active if
3176   // the _first_ bit in each field is one. Other bits are ignored.
3177 
3178   // There are no active lanes in in0, so the result is simply the first active
3179   // lane from pg.
3180   int exp00[] = {0, 0, 0, 0};
3181   int exp10[] = {0, 0, 0, 1};
3182   int exp20[] = {0, 0, 1, 0};
3183   int exp30[] = {1, 0, 0, 0};
3184 
3185   //                      | The last active lane in in1 is here.
3186   //                      v
3187   int exp01[] = {0, 0, 0, 0};
3188   int exp11[] = {0, 0, 0, 0};
3189   int exp21[] = {0, 0, 1, 0};
3190   int exp31[] = {1, 0, 0, 0};
3191 
3192   //                | The last active lane in in2 is here.
3193   //                v
3194   int exp02[] = {0, 0, 0, 0};
3195   int exp12[] = {0, 0, 0, 0};
3196   int exp22[] = {0, 0, 0, 0};
3197   int exp32[] = {1, 0, 0, 0};
3198 
3199   //             | The last active lane in in3 is here.
3200   //             v
3201   int exp03[] = {0, 0, 0, 0};
3202   int exp13[] = {0, 0, 0, 0};
3203   int exp23[] = {0, 0, 0, 0};
3204   int exp33[] = {0, 0, 0, 0};
3205 
3206   PnextHelper(config, kSRegSize, in0, in0, exp00);
3207   PnextHelper(config, kSRegSize, in1, in0, exp10);
3208   PnextHelper(config, kSRegSize, in2, in0, exp20);
3209   PnextHelper(config, kSRegSize, in3, in0, exp30);
3210 
3211   PnextHelper(config, kSRegSize, in0, in1, exp01);
3212   PnextHelper(config, kSRegSize, in1, in1, exp11);
3213   PnextHelper(config, kSRegSize, in2, in1, exp21);
3214   PnextHelper(config, kSRegSize, in3, in1, exp31);
3215 
3216   PnextHelper(config, kSRegSize, in0, in2, exp02);
3217   PnextHelper(config, kSRegSize, in1, in2, exp12);
3218   PnextHelper(config, kSRegSize, in2, in2, exp22);
3219   PnextHelper(config, kSRegSize, in3, in2, exp32);
3220 
3221   PnextHelper(config, kSRegSize, in0, in3, exp03);
3222   PnextHelper(config, kSRegSize, in1, in3, exp13);
3223   PnextHelper(config, kSRegSize, in2, in3, exp23);
3224   PnextHelper(config, kSRegSize, in3, in3, exp33);
3225 }
3226 
3227 TEST_SVE(sve_pnext_d) {
3228   // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3229   // (to check propagation if we have a large VL), but few enough to make the
3230   // test easy to read.
3231   // For now, we just use kPRegMinSize so that the test works anywhere.
3232   int in0[] = {0xfe, 0xf0};
3233   int in1[] = {0x00, 0x55};
3234   int in2[] = {0x33, 0xff};
3235 
3236   // Pnext activates the next element that is true in pg, after the last-active
3237   // element in pn. If all pn elements are false (as in in0), it starts looking
3238   // at element 0.
3239   //
3240   // As with other SVE instructions, elements are only considered to be active if
3241   // the _first_ bit in each field is one. Other bits are ignored.
3242 
3243   // There are no active lanes in in0, so the result is simply the first active
3244   // lane from pg.
3245   int exp00[] = {0, 0};
3246   int exp10[] = {0, 1};
3247   int exp20[] = {0, 1};
3248 
3249   //                | The last active lane in in1 is here.
3250   //                v
3251   int exp01[] = {0, 0};
3252   int exp11[] = {0, 0};
3253   int exp21[] = {1, 0};
3254 
3255   //             | The last active lane in in2 is here.
3256   //             v
3257   int exp02[] = {0, 0};
3258   int exp12[] = {0, 0};
3259   int exp22[] = {0, 0};
3260 
3261   PnextHelper(config, kDRegSize, in0, in0, exp00);
3262   PnextHelper(config, kDRegSize, in1, in0, exp10);
3263   PnextHelper(config, kDRegSize, in2, in0, exp20);
3264 
3265   PnextHelper(config, kDRegSize, in0, in1, exp01);
3266   PnextHelper(config, kDRegSize, in1, in1, exp11);
3267   PnextHelper(config, kDRegSize, in2, in1, exp21);
3268 
3269   PnextHelper(config, kDRegSize, in0, in2, exp02);
3270   PnextHelper(config, kDRegSize, in1, in2, exp12);
3271   PnextHelper(config, kDRegSize, in2, in2, exp22);
3272 }
3273 
3274 TEST_SVE(sve_pnext_alias) {
3275   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3276   START();
3277 
3278   // Check that the Simulator behaves correctly when all arguments are aliased.
3279   int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
3280   int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
3281   int in_s[] = {0, 1, 1, 0};
3282   int in_d[] = {1, 1};
3283 
3284   Initialise(&masm, p0.VnB(), in_b);
3285   Initialise(&masm, p1.VnH(), in_h);
3286   Initialise(&masm, p2.VnS(), in_s);
3287   Initialise(&masm, p3.VnD(), in_d);
3288 
3289   // Initialise NZCV to an impossible value, to check that we actually write it.
3290   __ Mov(x10, NZCVFlag);
3291 
3292   __ Msr(NZCV, x10);
3293   __ Pnext(p0.VnB(), p0, p0.VnB());
3294   __ Mrs(x0, NZCV);
3295 
3296   __ Msr(NZCV, x10);
3297   __ Pnext(p1.VnB(), p1, p1.VnB());
3298   __ Mrs(x1, NZCV);
3299 
3300   __ Msr(NZCV, x10);
3301   __ Pnext(p2.VnB(), p2, p2.VnB());
3302   __ Mrs(x2, NZCV);
3303 
3304   __ Msr(NZCV, x10);
3305   __ Pnext(p3.VnB(), p3, p3.VnB());
3306   __ Mrs(x3, NZCV);
3307 
3308   END();
3309 
3310   if (CAN_RUN()) {
3311     RUN();
3312 
3313     // Since pg.Is(pdn), there can be no active lanes in pg above the last
3314     // active lane in pdn, so the result should always be zero.
3315     ASSERT_EQUAL_SVE(0, p0.VnB());
3316     ASSERT_EQUAL_SVE(0, p1.VnH());
3317     ASSERT_EQUAL_SVE(0, p2.VnS());
3318     ASSERT_EQUAL_SVE(0, p3.VnD());
3319 
3320     ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x0);
3321     ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x1);
3322     ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x2);
3323     ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x3);
3324   }
3325 }
3326 
3327 static void PtrueHelper(Test* config,
3328                         unsigned lane_size_in_bits,
3329                         FlagsUpdate s = LeaveFlags) {
3330   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3331   START();
3332 
3333   PRegisterWithLaneSize p[kNumberOfPRegisters];
3334   for (unsigned i = 0; i < kNumberOfPRegisters; i++) {
3335     p[i] = PRegister(i).WithLaneSize(lane_size_in_bits);
3336   }
3337 
3338   // Initialise NZCV to an impossible value, to check that we actually write it.
3339   StatusFlags nzcv_unmodified = NZCVFlag;
3340   __ Mov(x20, nzcv_unmodified);
3341 
3342   // We don't have enough registers to conveniently test every pattern, so take
3343   // samples from each group.
3344   __ Msr(NZCV, x20);
3345   __ Ptrue(p[0], SVE_POW2, s);
3346   __ Mrs(x0, NZCV);
3347 
3348   __ Msr(NZCV, x20);
3349   __ Ptrue(p[1], SVE_VL1, s);
3350   __ Mrs(x1, NZCV);
3351 
3352   __ Msr(NZCV, x20);
3353   __ Ptrue(p[2], SVE_VL2, s);
3354   __ Mrs(x2, NZCV);
3355 
3356   __ Msr(NZCV, x20);
3357   __ Ptrue(p[3], SVE_VL5, s);
3358   __ Mrs(x3, NZCV);
3359 
3360   __ Msr(NZCV, x20);
3361   __ Ptrue(p[4], SVE_VL6, s);
3362   __ Mrs(x4, NZCV);
3363 
3364   __ Msr(NZCV, x20);
3365   __ Ptrue(p[5], SVE_VL8, s);
3366   __ Mrs(x5, NZCV);
3367 
3368   __ Msr(NZCV, x20);
3369   __ Ptrue(p[6], SVE_VL16, s);
3370   __ Mrs(x6, NZCV);
3371 
3372   __ Msr(NZCV, x20);
3373   __ Ptrue(p[7], SVE_VL64, s);
3374   __ Mrs(x7, NZCV);
3375 
3376   __ Msr(NZCV, x20);
3377   __ Ptrue(p[8], SVE_VL256, s);
3378   __ Mrs(x8, NZCV);
3379 
3380   {
3381     // We have to use the Assembler to encode values not defined by
3382     // SVEPredicateConstraint, so call `ptrue`/`ptrues` directly.
3383     typedef void (
3384         MacroAssembler::*AssemblePtrueFn)(const PRegisterWithLaneSize& pd,
3385                                           int pattern);
3386     AssemblePtrueFn assemble = &MacroAssembler::ptrue;
3387     if (s == SetFlags) {
3388       assemble = &MacroAssembler::ptrues;
3389     }
3390 
3391     ExactAssemblyScope guard(&masm, 12 * kInstructionSize);
3392     __ msr(NZCV, x20);
3393     (masm.*assemble)(p[9], 0xe);
3394     __ mrs(x9, NZCV);
3395 
3396     __ msr(NZCV, x20);
3397     (masm.*assemble)(p[10], 0x16);
3398     __ mrs(x10, NZCV);
3399 
3400     __ msr(NZCV, x20);
3401     (masm.*assemble)(p[11], 0x1a);
3402     __ mrs(x11, NZCV);
3403 
3404     __ msr(NZCV, x20);
3405     (masm.*assemble)(p[12], 0x1c);
3406     __ mrs(x12, NZCV);
3407   }
3408 
3409   __ Msr(NZCV, x20);
3410   __ Ptrue(p[13], SVE_MUL4, s);
3411   __ Mrs(x13, NZCV);
3412 
3413   __ Msr(NZCV, x20);
3414   __ Ptrue(p[14], SVE_MUL3, s);
3415   __ Mrs(x14, NZCV);
3416 
3417   __ Msr(NZCV, x20);
3418   __ Ptrue(p[15], SVE_ALL, s);
3419   __ Mrs(x15, NZCV);
3420 
3421   END();
3422 
3423   if (CAN_RUN()) {
3424     RUN();
3425 
3426     int all = core.GetSVELaneCount(lane_size_in_bits);
3427     int pow2 = 1 << HighestSetBitPosition(all);
3428     int mul4 = all - (all % 4);
3429     int mul3 = all - (all % 3);
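    // For example, with all = 6 lanes (e.g. D lanes at a 384-bit VL):
    // pow2 = 4, mul4 = 4 and mul3 = 6.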
3430 
3431     // Check P register results.
3432     for (int i = 0; i < all; i++) {
3433       ASSERT_EQUAL_SVE_LANE(i < pow2, p[0], i);
3434       ASSERT_EQUAL_SVE_LANE((all >= 1) && (i < 1), p[1], i);
3435       ASSERT_EQUAL_SVE_LANE((all >= 2) && (i < 2), p[2], i);
3436       ASSERT_EQUAL_SVE_LANE((all >= 5) && (i < 5), p[3], i);
3437       ASSERT_EQUAL_SVE_LANE((all >= 6) && (i < 6), p[4], i);
3438       ASSERT_EQUAL_SVE_LANE((all >= 8) && (i < 8), p[5], i);
3439       ASSERT_EQUAL_SVE_LANE((all >= 16) && (i < 16), p[6], i);
3440       ASSERT_EQUAL_SVE_LANE((all >= 64) && (i < 64), p[7], i);
3441       ASSERT_EQUAL_SVE_LANE((all >= 256) && (i < 256), p[8], i);
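      // 0xe, 0x16, 0x1a and 0x1c are unallocated pattern encodings, so they
      // set no lanes.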
3442       ASSERT_EQUAL_SVE_LANE(false, p[9], i);
3443       ASSERT_EQUAL_SVE_LANE(false, p[10], i);
3444       ASSERT_EQUAL_SVE_LANE(false, p[11], i);
3445       ASSERT_EQUAL_SVE_LANE(false, p[12], i);
3446       ASSERT_EQUAL_SVE_LANE(i < mul4, p[13], i);
3447       ASSERT_EQUAL_SVE_LANE(i < mul3, p[14], i);
3448       ASSERT_EQUAL_SVE_LANE(true, p[15], i);
3449     }
3450 
3451     // Check NZCV results.
3452     if (s == LeaveFlags) {
3453       // No flags should have been updated.
3454       for (int i = 0; i <= 15; i++) {
3455         ASSERT_EQUAL_64(nzcv_unmodified, XRegister(i));
3456       }
3457     } else {
3458       StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
3459       StatusFlags nonzero = SVEFirstFlag;
3460 
3461       // POW2
3462       ASSERT_EQUAL_64(nonzero, x0);
3463       // VL*
3464       ASSERT_EQUAL_64((all >= 1) ? nonzero : zero, x1);
3465       ASSERT_EQUAL_64((all >= 2) ? nonzero : zero, x2);
3466       ASSERT_EQUAL_64((all >= 5) ? nonzero : zero, x3);
3467       ASSERT_EQUAL_64((all >= 6) ? nonzero : zero, x4);
3468       ASSERT_EQUAL_64((all >= 8) ? nonzero : zero, x5);
3469       ASSERT_EQUAL_64((all >= 16) ? nonzero : zero, x6);
3470       ASSERT_EQUAL_64((all >= 64) ? nonzero : zero, x7);
3471       ASSERT_EQUAL_64((all >= 256) ? nonzero : zero, x8);
3472       // #uimm5
3473       ASSERT_EQUAL_64(zero, x9);
3474       ASSERT_EQUAL_64(zero, x10);
3475       ASSERT_EQUAL_64(zero, x11);
3476       ASSERT_EQUAL_64(zero, x12);
3477       // MUL*
3478       ASSERT_EQUAL_64((all >= 4) ? nonzero : zero, x13);
3479       ASSERT_EQUAL_64((all >= 3) ? nonzero : zero, x14);
3480       // ALL
3481       ASSERT_EQUAL_64(nonzero, x15);
3482     }
3483   }
3484 }
3485 
3486 TEST_SVE(sve_ptrue_b) { PtrueHelper(config, kBRegSize, LeaveFlags); }
3487 TEST_SVE(sve_ptrue_h) { PtrueHelper(config, kHRegSize, LeaveFlags); }
3488 TEST_SVE(sve_ptrue_s) { PtrueHelper(config, kSRegSize, LeaveFlags); }
3489 TEST_SVE(sve_ptrue_d) { PtrueHelper(config, kDRegSize, LeaveFlags); }
3490 
3491 TEST_SVE(sve_ptrues_b) { PtrueHelper(config, kBRegSize, SetFlags); }
3492 TEST_SVE(sve_ptrues_h) { PtrueHelper(config, kHRegSize, SetFlags); }
3493 TEST_SVE(sve_ptrues_s) { PtrueHelper(config, kSRegSize, SetFlags); }
3494 TEST_SVE(sve_ptrues_d) { PtrueHelper(config, kDRegSize, SetFlags); }
3495 
3496 TEST_SVE(sve_pfalse) {
3497   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3498   START();
3499 
3500   // Initialise non-zero inputs.
3501   __ Ptrue(p0.VnB());
3502   __ Ptrue(p1.VnH());
3503   __ Ptrue(p2.VnS());
3504   __ Ptrue(p3.VnD());
3505 
3506   // The instruction only supports B-sized lanes, but the lane size has no
3507   // logical effect, so the MacroAssembler accepts anything.
3508   __ Pfalse(p0.VnB());
3509   __ Pfalse(p1.VnH());
3510   __ Pfalse(p2.VnS());
3511   __ Pfalse(p3.VnD());
3512 
3513   END();
3514 
3515   if (CAN_RUN()) {
3516     RUN();
3517 
3518     ASSERT_EQUAL_SVE(0, p0.VnB());
3519     ASSERT_EQUAL_SVE(0, p1.VnB());
3520     ASSERT_EQUAL_SVE(0, p2.VnB());
3521     ASSERT_EQUAL_SVE(0, p3.VnB());
3522   }
3523 }
3524 
3525 TEST_SVE(sve_ptest) {
3526   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3527   START();
3528 
3529   // Initialise NZCV to a known (impossible) value.
3530   StatusFlags nzcv_unmodified = NZCVFlag;
3531   __ Mov(x0, nzcv_unmodified);
3532   __ Msr(NZCV, x0);
3533 
3534   // Construct some test inputs.
3535   int in2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0};
3536   int in3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0};
3537   int in4[] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0};
3538   __ Pfalse(p0.VnB());
3539   __ Ptrue(p1.VnB());
3540   Initialise(&masm, p2.VnB(), in2);
3541   Initialise(&masm, p3.VnB(), in3);
3542   Initialise(&masm, p4.VnB(), in4);
3543 
3544   // All-inactive pg.
3545   __ Ptest(p0, p0.VnB());
3546   __ Mrs(x0, NZCV);
3547   __ Ptest(p0, p1.VnB());
3548   __ Mrs(x1, NZCV);
3549   __ Ptest(p0, p2.VnB());
3550   __ Mrs(x2, NZCV);
3551   __ Ptest(p0, p3.VnB());
3552   __ Mrs(x3, NZCV);
3553   __ Ptest(p0, p4.VnB());
3554   __ Mrs(x4, NZCV);
3555 
3556   // All-active pg.
3557   __ Ptest(p1, p0.VnB());
3558   __ Mrs(x5, NZCV);
3559   __ Ptest(p1, p1.VnB());
3560   __ Mrs(x6, NZCV);
3561   __ Ptest(p1, p2.VnB());
3562   __ Mrs(x7, NZCV);
3563   __ Ptest(p1, p3.VnB());
3564   __ Mrs(x8, NZCV);
3565   __ Ptest(p1, p4.VnB());
3566   __ Mrs(x9, NZCV);
3567 
3568   // Combinations of other inputs.
3569   __ Ptest(p2, p2.VnB());
3570   __ Mrs(x20, NZCV);
3571   __ Ptest(p2, p3.VnB());
3572   __ Mrs(x21, NZCV);
3573   __ Ptest(p2, p4.VnB());
3574   __ Mrs(x22, NZCV);
3575   __ Ptest(p3, p2.VnB());
3576   __ Mrs(x23, NZCV);
3577   __ Ptest(p3, p3.VnB());
3578   __ Mrs(x24, NZCV);
3579   __ Ptest(p3, p4.VnB());
3580   __ Mrs(x25, NZCV);
3581   __ Ptest(p4, p2.VnB());
3582   __ Mrs(x26, NZCV);
3583   __ Ptest(p4, p3.VnB());
3584   __ Mrs(x27, NZCV);
3585   __ Ptest(p4, p4.VnB());
3586   __ Mrs(x28, NZCV);
3587 
3588   END();
3589 
3590   if (CAN_RUN()) {
3591     RUN();
3592 
3593     StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
3594 
3595     // If pg is all inactive, the value of pn is irrelevant.
3596     ASSERT_EQUAL_64(zero, x0);
3597     ASSERT_EQUAL_64(zero, x1);
3598     ASSERT_EQUAL_64(zero, x2);
3599     ASSERT_EQUAL_64(zero, x3);
3600     ASSERT_EQUAL_64(zero, x4);
3601 
3602     // All-active pg.
3603     ASSERT_EQUAL_64(zero, x5);          // All-inactive pn.
3604     ASSERT_EQUAL_64(SVEFirstFlag, x6);  // All-active pn.
3605     // Other pn inputs are non-zero, but the first and last lanes are inactive.
3606     ASSERT_EQUAL_64(SVENotLastFlag, x7);
3607     ASSERT_EQUAL_64(SVENotLastFlag, x8);
3608     ASSERT_EQUAL_64(SVENotLastFlag, x9);
3609 
3610     // Other inputs.
3611     ASSERT_EQUAL_64(SVEFirstFlag, x20);  // pg: in2, pn: in2
3612     ASSERT_EQUAL_64(NoFlag, x21);        // pg: in2, pn: in3
3613     ASSERT_EQUAL_64(zero, x22);          // pg: in2, pn: in4
3614     ASSERT_EQUAL_64(static_cast<StatusFlags>(SVEFirstFlag | SVENotLastFlag),
3615                     x23);                // pg: in3, pn: in2
3616     ASSERT_EQUAL_64(SVEFirstFlag, x24);  // pg: in3, pn: in3
3617     ASSERT_EQUAL_64(zero, x25);          // pg: in3, pn: in4
3618     ASSERT_EQUAL_64(zero, x26);          // pg: in4, pn: in2
3619     ASSERT_EQUAL_64(zero, x27);          // pg: in4, pn: in3
3620     ASSERT_EQUAL_64(SVEFirstFlag, x28);  // pg: in4, pn: in4
3621   }
3622 }
3623 
3624 TEST_SVE(sve_cntp) {
3625   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3626   START();
3627 
3628   // There are {7, 5, 2, 1} active {B, H, S, D} lanes.
3629   int p0_inputs[] = {0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
3630   Initialise(&masm, p0.VnB(), p0_inputs);
3631 
3632   // With an all-true predicate, these instructions measure the vector length.
3633   __ Ptrue(p10.VnB());
3634   __ Ptrue(p11.VnH());
3635   __ Ptrue(p12.VnS());
3636   __ Ptrue(p13.VnD());
3637 
3638   // `ptrue p10.b` provides an all-active pg.
3639   __ Cntp(x10, p10, p10.VnB());
3640   __ Cntp(x11, p10, p11.VnH());
3641   __ Cntp(x12, p10, p12.VnS());
3642   __ Cntp(x13, p10, p13.VnD());
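  // For example, with a 256-bit vector, x10-x13 become 32, 16, 8 and 4.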
3643 
3644   // Check that the predicate mask is applied properly.
3645   __ Cntp(x14, p10, p10.VnB());
3646   __ Cntp(x15, p11, p10.VnB());
3647   __ Cntp(x16, p12, p10.VnB());
3648   __ Cntp(x17, p13, p10.VnB());
3649 
3650   // Check other patterns (including some ignored bits).
3651   __ Cntp(x0, p10, p0.VnB());
3652   __ Cntp(x1, p10, p0.VnH());
3653   __ Cntp(x2, p10, p0.VnS());
3654   __ Cntp(x3, p10, p0.VnD());
3655   __ Cntp(x4, p0, p10.VnB());
3656   __ Cntp(x5, p0, p10.VnH());
3657   __ Cntp(x6, p0, p10.VnS());
3658   __ Cntp(x7, p0, p10.VnD());
3659 
3660   END();
3661 
3662   if (CAN_RUN()) {
3663     RUN();
3664 
3665     int vl_b = core.GetSVELaneCount(kBRegSize);
3666     int vl_h = core.GetSVELaneCount(kHRegSize);
3667     int vl_s = core.GetSVELaneCount(kSRegSize);
3668     int vl_d = core.GetSVELaneCount(kDRegSize);
3669 
3670     // Check all-active predicates in various combinations.
3671     ASSERT_EQUAL_64(vl_b, x10);
3672     ASSERT_EQUAL_64(vl_h, x11);
3673     ASSERT_EQUAL_64(vl_s, x12);
3674     ASSERT_EQUAL_64(vl_d, x13);
3675 
3676     ASSERT_EQUAL_64(vl_b, x14);
3677     ASSERT_EQUAL_64(vl_h, x15);
3678     ASSERT_EQUAL_64(vl_s, x16);
3679     ASSERT_EQUAL_64(vl_d, x17);
3680 
3681     // Check that irrelevant bits are properly ignored.
3682     ASSERT_EQUAL_64(7, x0);
3683     ASSERT_EQUAL_64(5, x1);
3684     ASSERT_EQUAL_64(2, x2);
3685     ASSERT_EQUAL_64(1, x3);
3686 
3687     ASSERT_EQUAL_64(7, x4);
3688     ASSERT_EQUAL_64(5, x5);
3689     ASSERT_EQUAL_64(2, x6);
3690     ASSERT_EQUAL_64(1, x7);
3691   }
3692 }
3693 
3694 typedef void (MacroAssembler::*CntFn)(const Register& dst,
3695                                       int pattern,
3696                                       int multiplier);
3697 
3698 template <typename T>
3699 void GenerateCntSequence(MacroAssembler* masm,
3700                          CntFn cnt,
3701                          T acc_value,
3702                          int multiplier) {
3703   // Initialise accumulators.
3704   masm->Mov(x0, acc_value);
3705   masm->Mov(x1, acc_value);
3706   masm->Mov(x2, acc_value);
3707   masm->Mov(x3, acc_value);
3708   masm->Mov(x4, acc_value);
3709   masm->Mov(x5, acc_value);
3710   masm->Mov(x6, acc_value);
3711   masm->Mov(x7, acc_value);
3712   masm->Mov(x8, acc_value);
3713   masm->Mov(x9, acc_value);
3714   masm->Mov(x10, acc_value);
3715   masm->Mov(x11, acc_value);
3716   masm->Mov(x12, acc_value);
3717   masm->Mov(x13, acc_value);
3718   masm->Mov(x14, acc_value);
3719   masm->Mov(x15, acc_value);
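  // x16 and x17 are skipped, presumably because the MacroAssembler reserves
  // them as scratch registers (ip0 and ip1).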
3720   masm->Mov(x18, acc_value);
3721   masm->Mov(x19, acc_value);
3722   masm->Mov(x20, acc_value);
3723   masm->Mov(x21, acc_value);
3724 
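  // Register(code, size) selects the W or X view of the register, so the
  // counts below are written at the width of the accumulator type T.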
3725   (masm->*cnt)(Register(0, sizeof(T) * kBitsPerByte), SVE_POW2, multiplier);
3726   (masm->*cnt)(Register(1, sizeof(T) * kBitsPerByte), SVE_VL1, multiplier);
3727   (masm->*cnt)(Register(2, sizeof(T) * kBitsPerByte), SVE_VL2, multiplier);
3728   (masm->*cnt)(Register(3, sizeof(T) * kBitsPerByte), SVE_VL3, multiplier);
3729   (masm->*cnt)(Register(4, sizeof(T) * kBitsPerByte), SVE_VL4, multiplier);
3730   (masm->*cnt)(Register(5, sizeof(T) * kBitsPerByte), SVE_VL5, multiplier);
3731   (masm->*cnt)(Register(6, sizeof(T) * kBitsPerByte), SVE_VL6, multiplier);
3732   (masm->*cnt)(Register(7, sizeof(T) * kBitsPerByte), SVE_VL7, multiplier);
3733   (masm->*cnt)(Register(8, sizeof(T) * kBitsPerByte), SVE_VL8, multiplier);
3734   (masm->*cnt)(Register(9, sizeof(T) * kBitsPerByte), SVE_VL16, multiplier);
3735   (masm->*cnt)(Register(10, sizeof(T) * kBitsPerByte), SVE_VL32, multiplier);
3736   (masm->*cnt)(Register(11, sizeof(T) * kBitsPerByte), SVE_VL64, multiplier);
3737   (masm->*cnt)(Register(12, sizeof(T) * kBitsPerByte), SVE_VL128, multiplier);
3738   (masm->*cnt)(Register(13, sizeof(T) * kBitsPerByte), SVE_VL256, multiplier);
3739   (masm->*cnt)(Register(14, sizeof(T) * kBitsPerByte), 16, multiplier);
3740   (masm->*cnt)(Register(15, sizeof(T) * kBitsPerByte), 23, multiplier);
3741   (masm->*cnt)(Register(18, sizeof(T) * kBitsPerByte), 28, multiplier);
3742   (masm->*cnt)(Register(19, sizeof(T) * kBitsPerByte), SVE_MUL4, multiplier);
3743   (masm->*cnt)(Register(20, sizeof(T) * kBitsPerByte), SVE_MUL3, multiplier);
3744   (masm->*cnt)(Register(21, sizeof(T) * kBitsPerByte), SVE_ALL, multiplier);
3745 }
3746 
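// FixedVL returns `fixed` if the vector holds at least that many lanes of the
// given size, and zero otherwise, mirroring the SVE_VL<n> patterns. For
// example, FixedVL(16, all) is 16 when all >= 16, and 0 on shorter vectors.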
3747 int FixedVL(int fixed, int length) {
3748   VIXL_ASSERT(((fixed >= 1) && (fixed <= 8)) || (fixed == 16) ||
3749               (fixed == 32) || (fixed == 64) || (fixed == 128) ||
3750               (fixed == 256));
3751   return (length >= fixed) ? fixed : 0;
3752 }
3753 
3754 static void CntHelper(Test* config,
3755                       CntFn cnt,
3756                       int multiplier,
3757                       int lane_size_in_bits,
3758                       int64_t acc_value = 0,
3759                       bool is_increment = true) {
3760   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3761   START();
3762   GenerateCntSequence(&masm, cnt, acc_value, multiplier);
3763   END();
3764 
3765   if (CAN_RUN()) {
3766     RUN();
3767 
3768     int all = core.GetSVELaneCount(lane_size_in_bits);
3769     int pow2 = 1 << HighestSetBitPosition(all);
3770     int mul4 = all - (all % 4);
3771     int mul3 = all - (all % 3);
3772 
3773     multiplier = is_increment ? multiplier : -multiplier;
3774 
3775     ASSERT_EQUAL_64(acc_value + (multiplier * pow2), x0);
3776     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(1, all)), x1);
3777     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(2, all)), x2);
3778     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(3, all)), x3);
3779     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(4, all)), x4);
3780     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(5, all)), x5);
3781     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(6, all)), x6);
3782     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(7, all)), x7);
3783     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(8, all)), x8);
3784     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(16, all)), x9);
3785     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(32, all)), x10);
3786     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(64, all)), x11);
3787     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(128, all)), x12);
3788     ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(256, all)), x13);
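    // 16, 23 and 28 are unallocated pattern encodings; they count zero lanes,
    // so the accumulators are unchanged.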
3789     ASSERT_EQUAL_64(acc_value, x14);
3790     ASSERT_EQUAL_64(acc_value, x15);
3791     ASSERT_EQUAL_64(acc_value, x18);
3792     ASSERT_EQUAL_64(acc_value + (multiplier * mul4), x19);
3793     ASSERT_EQUAL_64(acc_value + (multiplier * mul3), x20);
3794     ASSERT_EQUAL_64(acc_value + (multiplier * all), x21);
3795   }
3796 }
3797 
3798 static void IncHelper(Test* config,
3799                       CntFn cnt,
3800                       int multiplier,
3801                       int lane_size_in_bits,
3802                       int64_t acc_value) {
3803   CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
3804 }
3805 
3806 static void DecHelper(Test* config,
3807                       CntFn cnt,
3808                       int multiplier,
3809                       int lane_size_in_bits,
3810                       int64_t acc_value) {
3811   CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
3812 }
3813 
3814 TEST_SVE(sve_cntb) {
3815   CntHelper(config, &MacroAssembler::Cntb, 1, kBRegSize);
3816   CntHelper(config, &MacroAssembler::Cntb, 2, kBRegSize);
3817   CntHelper(config, &MacroAssembler::Cntb, 15, kBRegSize);
3818   CntHelper(config, &MacroAssembler::Cntb, 16, kBRegSize);
3819 }
3820 
3821 TEST_SVE(sve_cnth) {
3822   CntHelper(config, &MacroAssembler::Cnth, 1, kHRegSize);
3823   CntHelper(config, &MacroAssembler::Cnth, 2, kHRegSize);
3824   CntHelper(config, &MacroAssembler::Cnth, 15, kHRegSize);
3825   CntHelper(config, &MacroAssembler::Cnth, 16, kHRegSize);
3826 }
3827 
3828 TEST_SVE(sve_cntw) {
3829   CntHelper(config, &MacroAssembler::Cntw, 1, kWRegSize);
3830   CntHelper(config, &MacroAssembler::Cntw, 2, kWRegSize);
3831   CntHelper(config, &MacroAssembler::Cntw, 15, kWRegSize);
3832   CntHelper(config, &MacroAssembler::Cntw, 16, kWRegSize);
3833 }
3834 
3835 TEST_SVE(sve_cntd) {
3836   CntHelper(config, &MacroAssembler::Cntd, 1, kDRegSize);
3837   CntHelper(config, &MacroAssembler::Cntd, 2, kDRegSize);
3838   CntHelper(config, &MacroAssembler::Cntd, 15, kDRegSize);
3839   CntHelper(config, &MacroAssembler::Cntd, 16, kDRegSize);
3840 }
3841 
3842 TEST_SVE(sve_decb) {
3843   DecHelper(config, &MacroAssembler::Decb, 1, kBRegSize, 42);
3844   DecHelper(config, &MacroAssembler::Decb, 2, kBRegSize, -1);
3845   DecHelper(config, &MacroAssembler::Decb, 15, kBRegSize, INT64_MIN);
3846   DecHelper(config, &MacroAssembler::Decb, 16, kBRegSize, -42);
3847 }
3848 
3849 TEST_SVE(sve_dech) {
3850   DecHelper(config, &MacroAssembler::Dech, 1, kHRegSize, 42);
3851   DecHelper(config, &MacroAssembler::Dech, 2, kHRegSize, -1);
3852   DecHelper(config, &MacroAssembler::Dech, 15, kHRegSize, INT64_MIN);
3853   DecHelper(config, &MacroAssembler::Dech, 16, kHRegSize, -42);
3854 }
3855 
3856 TEST_SVE(sve_decw) {
3857   DecHelper(config, &MacroAssembler::Decw, 1, kWRegSize, 42);
3858   DecHelper(config, &MacroAssembler::Decw, 2, kWRegSize, -1);
3859   DecHelper(config, &MacroAssembler::Decw, 15, kWRegSize, INT64_MIN);
3860   DecHelper(config, &MacroAssembler::Decw, 16, kWRegSize, -42);
3861 }
3862 
3863 TEST_SVE(sve_decd) {
3864   DecHelper(config, &MacroAssembler::Decd, 1, kDRegSize, 42);
3865   DecHelper(config, &MacroAssembler::Decd, 2, kDRegSize, -1);
3866   DecHelper(config, &MacroAssembler::Decd, 15, kDRegSize, INT64_MIN);
3867   DecHelper(config, &MacroAssembler::Decd, 16, kDRegSize, -42);
3868 }
3869 
3870 TEST_SVE(sve_incb) {
3871   IncHelper(config, &MacroAssembler::Incb, 1, kBRegSize, 42);
3872   IncHelper(config, &MacroAssembler::Incb, 2, kBRegSize, -1);
3873   IncHelper(config, &MacroAssembler::Incb, 15, kBRegSize, INT64_MAX);
3874   IncHelper(config, &MacroAssembler::Incb, 16, kBRegSize, -42);
3875 }
3876 
3877 TEST_SVE(sve_inch) {
3878   IncHelper(config, &MacroAssembler::Inch, 1, kHRegSize, 42);
3879   IncHelper(config, &MacroAssembler::Inch, 2, kHRegSize, -1);
3880   IncHelper(config, &MacroAssembler::Inch, 15, kHRegSize, INT64_MAX);
3881   IncHelper(config, &MacroAssembler::Inch, 16, kHRegSize, -42);
3882 }
3883 
3884 TEST_SVE(sve_incw) {
3885   IncHelper(config, &MacroAssembler::Incw, 1, kWRegSize, 42);
3886   IncHelper(config, &MacroAssembler::Incw, 2, kWRegSize, -1);
3887   IncHelper(config, &MacroAssembler::Incw, 15, kWRegSize, INT64_MAX);
3888   IncHelper(config, &MacroAssembler::Incw, 16, kWRegSize, -42);
3889 }
3890 
3891 TEST_SVE(sve_incd) {
3892   IncHelper(config, &MacroAssembler::Incd, 1, kDRegSize, 42);
3893   IncHelper(config, &MacroAssembler::Incd, 2, kDRegSize, -1);
3894   IncHelper(config, &MacroAssembler::Incd, 15, kDRegSize, INT64_MAX);
3895   IncHelper(config, &MacroAssembler::Incd, 16, kDRegSize, -42);
3896 }
3897 
3898 template <typename T>
3899 static T QAdd(T x, int y) {
3900   VIXL_ASSERT(y > INT_MIN);
3901   T result;
3902   T min = std::numeric_limits<T>::min();
3903   T max = std::numeric_limits<T>::max();
3904   if ((x >= 0) && (y >= 0)) {
3905     // For positive x and y, saturate at max.
3906     result = (max - x) < static_cast<T>(y) ? max : x + y;
3907   } else if ((y < 0) && ((x < 0) || (min == 0))) {
3908     // For negative y, where either x is negative or T is unsigned, saturate at min.
3909     result = (x - min) < static_cast<T>(-y) ? min : x + y;
3910   } else {
3911     result = x + y;
3912   }
3913   return result;
3914 }
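
// A few illustrative values (a sketch of the saturation behaviour above):
//   QAdd<uint8_t>(250, 10) == 255    // Saturates at the unsigned maximum.
//   QAdd<int8_t>(-120, -10) == -128  // Saturates at the signed minimum.
//   QAdd<uint8_t>(5, -10) == 0       // Unsigned values saturate at zero.
//   QAdd<int8_t>(100, -10) == 90     // In range: plain addition.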
3915 
3916 template <typename T>
3917 static void QIncDecHelper(Test* config,
3918                           CntFn cnt,
3919                           int multiplier,
3920                           int lane_size_in_bits,
3921                           T acc_value,
3922                           bool is_increment) {
3923   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3924   START();
3925   GenerateCntSequence(&masm, cnt, acc_value, multiplier);
3926   END();
3927 
3928   if (CAN_RUN()) {
3929     RUN();
3930 
3931     int all = core.GetSVELaneCount(lane_size_in_bits);
3932     int pow2 = 1 << HighestSetBitPosition(all);
3933     int mul4 = all - (all % 4);
3934     int mul3 = all - (all % 3);
3935 
3936     multiplier = is_increment ? multiplier : -multiplier;
3937 
3938     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
3939     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
3940     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
3941     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
3942     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
3943     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
3944     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
3945     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
3946     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
3947     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
3948     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
3949     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
3950     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
3951     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
3952     ASSERT_EQUAL_64(acc_value, x14);
3953     ASSERT_EQUAL_64(acc_value, x15);
3954     ASSERT_EQUAL_64(acc_value, x18);
3955     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
3956     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
3957     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
3958   }
3959 }
3960 
3961 template <typename T>
3962 static void QIncHelper(Test* config,
3963                        CntFn cnt,
3964                        int multiplier,
3965                        int lane_size_in_bits,
3966                        T acc_value) {
3967   QIncDecHelper<T>(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
3968 }
3969 
3970 template <typename T>
3971 static void QDecHelper(Test* config,
3972                        CntFn cnt,
3973                        int multiplier,
3974                        int lane_size_in_bits,
3975                        T acc_value) {
3976   QIncDecHelper<T>(config,
3977                    cnt,
3978                    multiplier,
3979                    lane_size_in_bits,
3980                    acc_value,
3981                    false);
3982 }
3983 
3984 TEST_SVE(sve_sqdecb) {
3985   int64_t bigneg = INT64_MIN + 42;
3986   int64_t bigpos = INT64_MAX - 42;
3987   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
3988   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 2, kBRegSize, bigneg);
3989   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
3990   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 16, kBRegSize, bigpos);
3991 }
3992 
3993 TEST_SVE(sve_sqdech) {
3994   int64_t bigneg = INT64_MIN + 42;
3995   int64_t bigpos = INT64_MAX - 42;
3996   QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
3997   QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 2, kHRegSize, bigneg);
3998   QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
3999   QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 16, kHRegSize, bigpos);
4000 }
4001 
4002 TEST_SVE(sve_sqdecw) {
4003   int64_t bigneg = INT64_MIN + 42;
4004   int64_t bigpos = INT64_MAX - 42;
4005   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4006   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 2, kWRegSize, bigneg);
4007   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4008   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 16, kWRegSize, bigpos);
4009 }
4010 
4011 TEST_SVE(sve_sqdecd) {
4012   int64_t bigneg = INT64_MIN + 42;
4013   int64_t bigpos = INT64_MAX - 42;
4014   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4015   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 2, kDRegSize, bigneg);
4016   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4017   QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 16, kDRegSize, bigpos);
4018 }
4019 
4020 TEST_SVE(sve_sqincb) {
4021   int64_t bigneg = INT64_MIN + 42;
4022   int64_t bigpos = INT64_MAX - 42;
4023   QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4024   QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 2, kBRegSize, bigneg);
4025   QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4026   QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 16, kBRegSize, bigpos);
4027 }
4028 
4029 TEST_SVE(sve_sqinch) {
4030   int64_t bigneg = INT64_MIN + 42;
4031   int64_t bigpos = INT64_MAX - 42;
4032   QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4033   QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 2, kHRegSize, bigneg);
4034   QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4035   QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 16, kHRegSize, bigpos);
4036 }
4037 
4038 TEST_SVE(sve_sqincw) {
4039   int64_t bigneg = INT64_MIN + 42;
4040   int64_t bigpos = INT64_MAX - 42;
4041   QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4042   QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 2, kWRegSize, bigneg);
4043   QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4044   QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 16, kWRegSize, bigpos);
4045 }
4046 
4047 TEST_SVE(sve_sqincd) {
4048   int64_t bigneg = INT64_MIN + 42;
4049   int64_t bigpos = INT64_MAX - 42;
4050   QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4051   QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 2, kDRegSize, bigneg);
4052   QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4053   QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 16, kDRegSize, bigpos);
4054 }
4055 
4056 TEST_SVE(sve_uqdecb) {
4057   uint32_t big32 = UINT32_MAX - 42;
4058   uint64_t big64 = UINT64_MAX - 42;
4059   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4060   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4061   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4062   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big32);
4063   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4064   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4065   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4066   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big64);
4067 }
4068 
4069 TEST_SVE(sve_uqdech) {
4070   uint32_t big32 = UINT32_MAX - 42;
4071   uint64_t big64 = UINT64_MAX - 42;
4072   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4073   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4074   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4075   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big32);
4076   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4077   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4078   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4079   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big64);
4080 }
4081 
4082 TEST_SVE(sve_uqdecw) {
4083   uint32_t big32 = UINT32_MAX - 42;
4084   uint64_t big64 = UINT64_MAX - 42;
4085   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4086   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4087   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4088   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big32);
4089   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4090   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4091   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4092   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big64);
4093 }
4094 
4095 TEST_SVE(sve_uqdecd) {
4096   uint32_t big32 = UINT32_MAX - 42;
4097   uint64_t big64 = UINT64_MAX - 42;
4098   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4099   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4100   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4101   QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big32);
4102   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4103   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4104   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4105   QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big64);
4106 }
4107 
4108 TEST_SVE(sve_uqincb) {
4109   uint32_t big32 = UINT32_MAX - 42;
4110   uint64_t big64 = UINT64_MAX - 42;
4111   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4112   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4113   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4114   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big32);
4115   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4116   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4117   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4118   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big64);
4119 }
4120 
4121 TEST_SVE(sve_uqinch) {
4122   uint32_t big32 = UINT32_MAX - 42;
4123   uint64_t big64 = UINT64_MAX - 42;
4124   QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4125   QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4126   QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4127   QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big32);
4128   QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4129   QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4130   QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4131   QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big64);
4132 }
4133 
4134 TEST_SVE(sve_uqincw) {
4135   uint32_t big32 = UINT32_MAX - 42;
4136   uint64_t big64 = UINT64_MAX - 42;
4137   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4138   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4139   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4140   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big32);
4141   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4142   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4143   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4144   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big64);
4145 }
4146 
4147 TEST_SVE(sve_uqincd) {
4148   uint32_t big32 = UINT32_MAX - 42;
4149   uint64_t big64 = UINT64_MAX - 42;
4150   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4151   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4152   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4153   QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big32);
4154   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4155   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4156   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4157   QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big64);
4158 }
4159 
4160 typedef void (MacroAssembler::*QIncDecXWFn)(const Register& dst,
4161                                             const Register& src,
4162                                             int pattern,
4163                                             int multiplier);
4164 
4165 static void QIncDecXWHelper(Test* config,
4166                             QIncDecXWFn cnt,
4167                             int multiplier,
4168                             int lane_size_in_bits,
4169                             int32_t acc_value,
4170                             bool is_increment) {
4171   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4172   START();
4173 
4174   // Initialise accumulators.
4175   __ Mov(x0, acc_value);
4176   __ Mov(x1, acc_value);
4177   __ Mov(x2, acc_value);
4178   __ Mov(x3, acc_value);
4179   __ Mov(x4, acc_value);
4180   __ Mov(x5, acc_value);
4181   __ Mov(x6, acc_value);
4182   __ Mov(x7, acc_value);
4183   __ Mov(x8, acc_value);
4184   __ Mov(x9, acc_value);
4185   __ Mov(x10, acc_value);
4186   __ Mov(x11, acc_value);
4187   __ Mov(x12, acc_value);
4188   __ Mov(x13, acc_value);
4189   __ Mov(x14, acc_value);
4190   __ Mov(x15, acc_value);
4191   __ Mov(x18, acc_value);
4192   __ Mov(x19, acc_value);
4193   __ Mov(x20, acc_value);
4194   __ Mov(x21, acc_value);
4195 
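  // The XW forms read the W (32-bit) view of the accumulator and write the
  // result to the full X register; for the signed variants tested below, the
  // 32-bit result is sign-extended into the X destination.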
4196   (masm.*cnt)(x0, w0, SVE_POW2, multiplier);
4197   (masm.*cnt)(x1, w1, SVE_VL1, multiplier);
4198   (masm.*cnt)(x2, w2, SVE_VL2, multiplier);
4199   (masm.*cnt)(x3, w3, SVE_VL3, multiplier);
4200   (masm.*cnt)(x4, w4, SVE_VL4, multiplier);
4201   (masm.*cnt)(x5, w5, SVE_VL5, multiplier);
4202   (masm.*cnt)(x6, w6, SVE_VL6, multiplier);
4203   (masm.*cnt)(x7, w7, SVE_VL7, multiplier);
4204   (masm.*cnt)(x8, w8, SVE_VL8, multiplier);
4205   (masm.*cnt)(x9, w9, SVE_VL16, multiplier);
4206   (masm.*cnt)(x10, w10, SVE_VL32, multiplier);
4207   (masm.*cnt)(x11, w11, SVE_VL64, multiplier);
4208   (masm.*cnt)(x12, w12, SVE_VL128, multiplier);
4209   (masm.*cnt)(x13, w13, SVE_VL256, multiplier);
4210   (masm.*cnt)(x14, w14, 16, multiplier);
4211   (masm.*cnt)(x15, w15, 23, multiplier);
4212   (masm.*cnt)(x18, w18, 28, multiplier);
4213   (masm.*cnt)(x19, w19, SVE_MUL4, multiplier);
4214   (masm.*cnt)(x20, w20, SVE_MUL3, multiplier);
4215   (masm.*cnt)(x21, w21, SVE_ALL, multiplier);
4216 
4217   END();
4218 
4219   if (CAN_RUN()) {
4220     RUN();
4221 
4222     int all = core.GetSVELaneCount(lane_size_in_bits);
4223     int pow2 = 1 << HighestSetBitPosition(all);
4224     int mul4 = all - (all % 4);
4225     int mul3 = all - (all % 3);
4226 
4227     multiplier = is_increment ? multiplier : -multiplier;
4228 
4229     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
4230     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
4231     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
4232     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
4233     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
4234     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
4235     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
4236     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
4237     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
4238     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
4239     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
4240     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
4241     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
4242     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
4243     ASSERT_EQUAL_64(acc_value, x14);
4244     ASSERT_EQUAL_64(acc_value, x15);
4245     ASSERT_EQUAL_64(acc_value, x18);
4246     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
4247     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
4248     ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
4249   }
4250 }
4251 
4252 static void QIncXWHelper(Test* config,
4253                          QIncDecXWFn cnt,
4254                          int multiplier,
4255                          int lane_size_in_bits,
4256                          int32_t acc_value) {
4257   QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
4258 }
4259 
4260 static void QDecXWHelper(Test* config,
4261                          QIncDecXWFn cnt,
4262                          int multiplier,
4263                          int lane_size_in_bits,
4264                          int32_t acc_value) {
4265   QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
4266 }
4267 
4268 TEST_SVE(sve_sqdecb_xw) {
4269   QDecXWHelper(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
4270   QDecXWHelper(config, &MacroAssembler::Sqdecb, 2, kBRegSize, INT32_MIN + 42);
4271   QDecXWHelper(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
4272   QDecXWHelper(config, &MacroAssembler::Sqdecb, 16, kBRegSize, INT32_MAX - 42);
4273 }
4274 
4275 TEST_SVE(sve_sqdech_xw) {
4276   QDecXWHelper(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
4277   QDecXWHelper(config, &MacroAssembler::Sqdech, 2, kHRegSize, INT32_MIN + 42);
4278   QDecXWHelper(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
4279   QDecXWHelper(config, &MacroAssembler::Sqdech, 16, kHRegSize, INT32_MAX - 42);
4280 }
4281 
4282 TEST_SVE(sve_sqdecw_xw) {
4283   QDecXWHelper(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4284   QDecXWHelper(config, &MacroAssembler::Sqdecw, 2, kWRegSize, INT32_MIN + 42);
4285   QDecXWHelper(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4286   QDecXWHelper(config, &MacroAssembler::Sqdecw, 16, kWRegSize, INT32_MAX - 42);
4287 }
4288 
4289 TEST_SVE(sve_sqdecd_xw) {
4290   QDecXWHelper(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4291   QDecXWHelper(config, &MacroAssembler::Sqdecd, 2, kDRegSize, INT32_MIN + 42);
4292   QDecXWHelper(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4293   QDecXWHelper(config, &MacroAssembler::Sqdecd, 16, kDRegSize, INT32_MAX - 42);
4294 }
4295 
4296 TEST_SVE(sve_sqincb_xw) {
4297   QIncXWHelper(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4298   QIncXWHelper(config, &MacroAssembler::Sqincb, 2, kBRegSize, INT32_MIN + 42);
4299   QIncXWHelper(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4300   QIncXWHelper(config, &MacroAssembler::Sqincb, 16, kBRegSize, INT32_MAX - 42);
4301 }
4302 
4303 TEST_SVE(sve_sqinch_xw) {
4304   QIncXWHelper(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4305   QIncXWHelper(config, &MacroAssembler::Sqinch, 2, kHRegSize, INT32_MIN + 42);
4306   QIncXWHelper(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4307   QIncXWHelper(config, &MacroAssembler::Sqinch, 16, kHRegSize, INT32_MAX - 42);
4308 }
4309 
4310 TEST_SVE(sve_sqincw_xw) {
4311   QIncXWHelper(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4312   QIncXWHelper(config, &MacroAssembler::Sqincw, 2, kWRegSize, INT32_MIN + 42);
4313   QIncXWHelper(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4314   QIncXWHelper(config, &MacroAssembler::Sqincw, 16, kWRegSize, INT32_MAX - 42);
4315 }
4316 
4317 TEST_SVE(sve_sqincd_xw) {
4318   QIncXWHelper(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4319   QIncXWHelper(config, &MacroAssembler::Sqincd, 2, kDRegSize, INT32_MIN + 42);
4320   QIncXWHelper(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4321   QIncXWHelper(config, &MacroAssembler::Sqincd, 16, kDRegSize, INT32_MAX - 42);
4322 }
4323 
4324 typedef void (MacroAssembler::*IncDecZFn)(const ZRegister& dst,
4325                                           int pattern,
4326                                           int multiplier);
4327 typedef void (MacroAssembler::*AddSubFn)(const ZRegister& dst,
4328                                          const ZRegister& src1,
4329                                          const ZRegister& src2);
4330 
4331 static void IncDecZHelper(Test* config,
4332                           IncDecZFn fn,
4333                           CntFn cnt,
4334                           AddSubFn addsub,
4335                           int multiplier,
4336                           int lane_size_in_bits) {
4337   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4338   START();
4339 
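  // Strategy: apply the inc/dec form under test directly to z16-z30, compute
  // the same counts into x0-x14 using the scalar `cnt` form, then broadcast
  // each count with `Dup` and fold it into z0-z14 with the matching add/sub.
  // If the two paths agree, each zN (for N < 15) should equal z(N + 16).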
4340   uint64_t acc_inputs[] = {0x7766554433221100,
4341                            0xffffffffffffffff,
4342                            0x0000000000000000,
4343                            0xffffffff0000ffff,
4344                            0x7fffffffffffffff,
4345                            0x8000000000000000,
4346                            0x7fffffff7fff7fff,
4347                            0x8000000080008000};
4348 
4349   for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
4350     for (int j = 0; j < 4; j++) {
4351       InsrHelper(&masm, ZRegister(i, kDRegSize), acc_inputs);
4352     }
4353   }
4354   for (unsigned i = 0; i < 15; i++) {
4355     __ Mov(XRegister(i), 0);
4356   }
4357 
4358   (masm.*fn)(z16.WithLaneSize(lane_size_in_bits), SVE_POW2, multiplier);
4359   (masm.*fn)(z17.WithLaneSize(lane_size_in_bits), SVE_VL1, multiplier);
4360   (masm.*fn)(z18.WithLaneSize(lane_size_in_bits), SVE_VL2, multiplier);
4361   (masm.*fn)(z19.WithLaneSize(lane_size_in_bits), SVE_VL3, multiplier);
4362   (masm.*fn)(z20.WithLaneSize(lane_size_in_bits), SVE_VL4, multiplier);
4363   (masm.*fn)(z21.WithLaneSize(lane_size_in_bits), SVE_VL7, multiplier);
4364   (masm.*fn)(z22.WithLaneSize(lane_size_in_bits), SVE_VL8, multiplier);
4365   (masm.*fn)(z23.WithLaneSize(lane_size_in_bits), SVE_VL16, multiplier);
4366   (masm.*fn)(z24.WithLaneSize(lane_size_in_bits), SVE_VL64, multiplier);
4367   (masm.*fn)(z25.WithLaneSize(lane_size_in_bits), SVE_VL256, multiplier);
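  // 16 and 28 are unallocated pattern encodings (element count of zero).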
4368   (masm.*fn)(z26.WithLaneSize(lane_size_in_bits), 16, multiplier);
4369   (masm.*fn)(z27.WithLaneSize(lane_size_in_bits), 28, multiplier);
4370   (masm.*fn)(z28.WithLaneSize(lane_size_in_bits), SVE_MUL3, multiplier);
4371   (masm.*fn)(z29.WithLaneSize(lane_size_in_bits), SVE_MUL4, multiplier);
4372   (masm.*fn)(z30.WithLaneSize(lane_size_in_bits), SVE_ALL, multiplier);
4373 
4374   // Compute the expected values using the equivalent scalar count
4374   // instructions.
4375   (masm.*cnt)(x0, SVE_POW2, multiplier);
4376   (masm.*cnt)(x1, SVE_VL1, multiplier);
4377   (masm.*cnt)(x2, SVE_VL2, multiplier);
4378   (masm.*cnt)(x3, SVE_VL3, multiplier);
4379   (masm.*cnt)(x4, SVE_VL4, multiplier);
4380   (masm.*cnt)(x5, SVE_VL7, multiplier);
4381   (masm.*cnt)(x6, SVE_VL8, multiplier);
4382   (masm.*cnt)(x7, SVE_VL16, multiplier);
4383   (masm.*cnt)(x8, SVE_VL64, multiplier);
4384   (masm.*cnt)(x9, SVE_VL256, multiplier);
4385   (masm.*cnt)(x10, 16, multiplier);
4386   (masm.*cnt)(x11, 28, multiplier);
4387   (masm.*cnt)(x12, SVE_MUL3, multiplier);
4388   (masm.*cnt)(x13, SVE_MUL4, multiplier);
4389   (masm.*cnt)(x14, SVE_ALL, multiplier);
4390 
4391   ZRegister zscratch = z15.WithLaneSize(lane_size_in_bits);
4392   for (unsigned i = 0; i < 15; i++) {
4393     ZRegister zsrcdst = ZRegister(i, lane_size_in_bits);
4394     Register x = Register(i, kXRegSize);
4395     __ Dup(zscratch, x);
4396     (masm.*addsub)(zsrcdst, zsrcdst, zscratch);
4397   }
4398 
4399   END();
4400 
4401   if (CAN_RUN()) {
4402     RUN();
4403 
4404     ASSERT_EQUAL_SVE(z0, z16);
4405     ASSERT_EQUAL_SVE(z1, z17);
4406     ASSERT_EQUAL_SVE(z2, z18);
4407     ASSERT_EQUAL_SVE(z3, z19);
4408     ASSERT_EQUAL_SVE(z4, z20);
4409     ASSERT_EQUAL_SVE(z5, z21);
4410     ASSERT_EQUAL_SVE(z6, z22);
4411     ASSERT_EQUAL_SVE(z7, z23);
4412     ASSERT_EQUAL_SVE(z8, z24);
4413     ASSERT_EQUAL_SVE(z9, z25);
4414     ASSERT_EQUAL_SVE(z10, z26);
4415     ASSERT_EQUAL_SVE(z11, z27);
4416     ASSERT_EQUAL_SVE(z12, z28);
4417     ASSERT_EQUAL_SVE(z13, z29);
4418     ASSERT_EQUAL_SVE(z14, z30);
4419   }
4420 }
4421 
4422 TEST_SVE(sve_inc_dec_vec) {
4423   CntFn cnth = &MacroAssembler::Cnth;
4424   CntFn cntw = &MacroAssembler::Cntw;
4425   CntFn cntd = &MacroAssembler::Cntd;
4426   AddSubFn sub = &MacroAssembler::Sub;
4427   AddSubFn add = &MacroAssembler::Add;
4428   for (int mult = 1; mult <= 16; mult += 5) {
4429     IncDecZHelper(config, &MacroAssembler::Dech, cnth, sub, mult, kHRegSize);
4430     IncDecZHelper(config, &MacroAssembler::Decw, cntw, sub, mult, kSRegSize);
4431     IncDecZHelper(config, &MacroAssembler::Decd, cntd, sub, mult, kDRegSize);
4432     IncDecZHelper(config, &MacroAssembler::Inch, cnth, add, mult, kHRegSize);
4433     IncDecZHelper(config, &MacroAssembler::Incw, cntw, add, mult, kSRegSize);
4434     IncDecZHelper(config, &MacroAssembler::Incd, cntd, add, mult, kDRegSize);
4435   }
4436 }
4437 
4438 TEST_SVE(sve_unsigned_sat_inc_dec_vec) {
4439   CntFn cnth = &MacroAssembler::Cnth;
4440   CntFn cntw = &MacroAssembler::Cntw;
4441   CntFn cntd = &MacroAssembler::Cntd;
4442   AddSubFn sub = &MacroAssembler::Uqsub;
4443   AddSubFn add = &MacroAssembler::Uqadd;
4444   for (int mult = 1; mult <= 16; mult += 5) {
4445     IncDecZHelper(config, &MacroAssembler::Uqdech, cnth, sub, mult, kHRegSize);
4446     IncDecZHelper(config, &MacroAssembler::Uqdecw, cntw, sub, mult, kSRegSize);
4447     IncDecZHelper(config, &MacroAssembler::Uqdecd, cntd, sub, mult, kDRegSize);
4448     IncDecZHelper(config, &MacroAssembler::Uqinch, cnth, add, mult, kHRegSize);
4449     IncDecZHelper(config, &MacroAssembler::Uqincw, cntw, add, mult, kSRegSize);
4450     IncDecZHelper(config, &MacroAssembler::Uqincd, cntd, add, mult, kDRegSize);
4451   }
4452 }
4453 
4454 TEST_SVE(sve_signed_sat_inc_dec_vec) {
4455   CntFn cnth = &MacroAssembler::Cnth;
4456   CntFn cntw = &MacroAssembler::Cntw;
4457   CntFn cntd = &MacroAssembler::Cntd;
4458   AddSubFn sub = &MacroAssembler::Sqsub;
4459   AddSubFn add = &MacroAssembler::Sqadd;
4460   for (int mult = 1; mult <= 16; mult += 5) {
4461     IncDecZHelper(config, &MacroAssembler::Sqdech, cnth, sub, mult, kHRegSize);
4462     IncDecZHelper(config, &MacroAssembler::Sqdecw, cntw, sub, mult, kSRegSize);
4463     IncDecZHelper(config, &MacroAssembler::Sqdecd, cntd, sub, mult, kDRegSize);
4464     IncDecZHelper(config, &MacroAssembler::Sqinch, cnth, add, mult, kHRegSize);
4465     IncDecZHelper(config, &MacroAssembler::Sqincw, cntw, add, mult, kSRegSize);
4466     IncDecZHelper(config, &MacroAssembler::Sqincd, cntd, add, mult, kDRegSize);
4467   }
4468 }
4469 
4470 typedef void (MacroAssembler::*ArithPredicatedFn)(const ZRegister& zd,
4471                                                   const PRegisterM& pg,
4472                                                   const ZRegister& zn,
4473                                                   const ZRegister& zm);
4474 
4475 template <typename Td, typename Tg, typename Tn>
4476 static void IntBinArithHelper(Test* config,
4477                               ArithPredicatedFn macro,
4478                               unsigned lane_size_in_bits,
4479                               const Tg& pg_inputs,
4480                               const Tn& zn_inputs,
4481                               const Tn& zm_inputs,
4482                               const Td& zd_expected) {
4483   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4484   START();
4485 
4486   ZRegister src_a = z31.WithLaneSize(lane_size_in_bits);
4487   ZRegister src_b = z27.WithLaneSize(lane_size_in_bits);
4488   InsrHelper(&masm, src_a, zn_inputs);
4489   InsrHelper(&masm, src_b, zm_inputs);
4490 
4491   Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
4492 
4493   ZRegister zd_1 = z0.WithLaneSize(lane_size_in_bits);
4494   ZRegister zd_2 = z1.WithLaneSize(lane_size_in_bits);
4495   ZRegister zd_3 = z2.WithLaneSize(lane_size_in_bits);
4496 
4497   // `instr` zd(dst), zd(src_a), zn(src_b)
4498   __ Mov(zd_1, src_a);
4499   (masm.*macro)(zd_1, p0.Merging(), zd_1, src_b);
4500 
4501   // `instr` zd(dst), zm(src_a), zd(src_b)
4502   // When zd and zm are aliased, the instruction macro (`Instr`) swaps the
4503   // operand order if the operation is commutative; otherwise it falls back
4504   // to the reversed form of the instruction, such as subr or divr.
4505   __ Mov(zd_2, src_b);
4506   (masm.*macro)(zd_2, p0.Merging(), src_a, zd_2);
4507 
4508   // `instr` zd(dst), zm(src_a), zn(src_b)
4509   // The instruction macro (`Instr`) automatically selects between `instr`
4510   // and movprfx + `instr`, depending on whether the zd and zn registers are
4511   // aliased. Any generated movprfx instruction is predicated, using the
4512   // same governing predicate register. To keep the result constant,
4513   // initialise the destination register first.
4514   __ Mov(zd_3, src_a);
4515   (masm.*macro)(zd_3, p0.Merging(), src_a, src_b);
4516 
4517   END();
4518 
4519   if (CAN_RUN()) {
4520     RUN();
4521     ASSERT_EQUAL_SVE(zd_expected, zd_1);
4522 
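    // Check the merging predication lane by lane: active lanes must hold the
    // expected result, and inactive lanes must retain the zn value that zd_1
    // was initialised with.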
4523     for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
4524       int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
4525       if (!core.HasSVELane(zd_1, lane)) break;
4526       if ((pg_inputs[i] & 1) != 0) {
4527         ASSERT_EQUAL_SVE_LANE(zd_expected[i], zd_1, lane);
4528       } else {
4529         ASSERT_EQUAL_SVE_LANE(zn_inputs[i], zd_1, lane);
4530       }
4531     }
4532 
4533     ASSERT_EQUAL_SVE(zd_expected, zd_3);
4534   }
4535 }
4536 
4537 TEST_SVE(sve_binary_arithmetic_predicated_add) {
4538   // clang-format off
4539   unsigned zn_b[] = {0x00, 0x01, 0x10, 0x81, 0xff, 0x0f, 0x01, 0x7f};
4540 
4541   unsigned zm_b[] = {0x00, 0x01, 0x10, 0x00, 0x81, 0x80, 0xff, 0xff};
4542 
4543   unsigned zn_h[] = {0x0000, 0x0123, 0x1010, 0x8181, 0xffff, 0x0f0f, 0x0101, 0x7f7f};
4544 
4545   unsigned zm_h[] = {0x0000, 0x0123, 0x1010, 0x0000, 0x8181, 0x8080, 0xffff, 0xffff};
4546 
4547   unsigned zn_s[] = {0x00000000, 0x01234567, 0x10101010, 0x81818181,
4548                      0xffffffff, 0x0f0f0f0f, 0x01010101, 0x7f7f7f7f};
4549 
4550   unsigned zm_s[] = {0x00000000, 0x01234567, 0x10101010, 0x00000000,
4551                      0x81818181, 0x80808080, 0xffffffff, 0xffffffff};
4552 
4553   uint64_t zn_d[] = {0x0000000000000000, 0x0123456789abcdef,
4554                      0x1010101010101010, 0x8181818181818181,
4555                      0xffffffffffffffff, 0x0f0f0f0f0f0f0f0f,
4556                      0x0101010101010101, 0x7f7f7f7fffffffff};
4557 
4558   uint64_t zm_d[] = {0x0000000000000000, 0x0123456789abcdef,
4559                      0x1010101010101010, 0x0000000000000000,
4560                      0x8181818181818181, 0x8080808080808080,
4561                      0xffffffffffffffff, 0xffffffffffffffff};
4562 
4563   int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4564   int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4565   int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4566   int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4567 
4568   unsigned add_exp_b[] = {0x00, 0x02, 0x20, 0x81, 0x80, 0x8f, 0x00, 0x7f};
4569 
4570   unsigned add_exp_h[] = {0x0000, 0x0246, 0x1010, 0x8181,
4571                           0x8180, 0x8f8f, 0x0101, 0x7f7e};
4572 
4573   unsigned add_exp_s[] = {0x00000000, 0x01234567, 0x20202020, 0x81818181,
4574                           0x81818180, 0x0f0f0f0f, 0x01010100, 0x7f7f7f7e};
4575 
4576   uint64_t add_exp_d[] = {0x0000000000000000, 0x02468acf13579bde,
4577                           0x2020202020202020, 0x8181818181818181,
4578                           0xffffffffffffffff, 0x8f8f8f8f8f8f8f8f,
4579                           0x0101010101010100, 0x7f7f7f7ffffffffe};
4580 
4581   ArithPredicatedFn fn = &MacroAssembler::Add;
4582   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, add_exp_b);
4583   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, add_exp_h);
4584   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, add_exp_s);
4585   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, add_exp_d);
4586 
4587   unsigned sub_exp_b[] = {0x00, 0x00, 0x00, 0x81, 0x7e, 0x8f, 0x02, 0x7f};
4588 
4589   unsigned sub_exp_h[] = {0x0000, 0x0000, 0x1010, 0x8181,
4590                           0x7e7e, 0x8e8f, 0x0101, 0x7f80};
4591 
4592   unsigned sub_exp_s[] = {0x00000000, 0x01234567, 0x00000000, 0x81818181,
4593                           0x7e7e7e7e, 0x0f0f0f0f, 0x01010102, 0x7f7f7f80};
4594 
4595   uint64_t sub_exp_d[] = {0x0000000000000000, 0x0000000000000000,
4596                           0x0000000000000000, 0x8181818181818181,
4597                           0xffffffffffffffff, 0x8e8e8e8e8e8e8e8f,
4598                           0x0101010101010102, 0x7f7f7f8000000000};
4599 
4600   fn = &MacroAssembler::Sub;
4601   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sub_exp_b);
4602   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sub_exp_h);
4603   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sub_exp_s);
4604   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sub_exp_d);
4605   // clang-format on
4606 }
4607 
4608 TEST_SVE(sve_binary_arithmetic_predicated_umin_umax_uabd) {
4609   // clang-format off
4610   unsigned zn_b[] = {0x00, 0xff, 0x0f, 0xff, 0xf0, 0x98, 0x55, 0x67};
4611 
4612   unsigned zm_b[] = {0x01, 0x00, 0x0e, 0xfe, 0xfe, 0xab, 0xcd, 0x78};
4613 
4614   unsigned zn_h[] = {0x0000, 0xffff, 0x00ff, 0xffff,
4615                      0xff00, 0xba98, 0x5555, 0x4567};
4616 
4617   unsigned zm_h[] = {0x0001, 0x0000, 0x00ee, 0xfffe,
4618                      0xfe00, 0xabab, 0xcdcd, 0x5678};
4619 
4620   unsigned zn_s[] = {0x00000000, 0xffffffff, 0x0000ffff, 0xffffffff,
4621                      0xffff0000, 0xfedcba98, 0x55555555, 0x01234567};
4622 
4623   unsigned zm_s[] = {0x00000001, 0x00000000, 0x0000eeee, 0xfffffffe,
4624                      0xfffe0000, 0xabababab, 0xcdcdcdcd, 0x12345678};
4625 
4626   uint64_t zn_d[] = {0x0000000000000000, 0xffffffffffffffff,
4627                      0x5555555555555555, 0x0000000001234567};
4628 
4629   uint64_t zm_d[] = {0x0000000000000001, 0x0000000000000000,
4630                      0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4631 
4632   int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4633   int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4634   int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4635   int pg_d[] = {1, 0, 1, 1};
4636 
4637   unsigned umax_exp_b[] = {0x01, 0xff, 0x0f, 0xff, 0xfe, 0xab, 0xcd, 0x67};
4638 
4639   unsigned umax_exp_h[] = {0x0001, 0xffff, 0x00ff, 0xffff,
4640                            0xff00, 0xba98, 0x5555, 0x5678};
4641 
4642   unsigned umax_exp_s[] = {0x00000001, 0xffffffff, 0x0000ffff, 0xffffffff,
4643                            0xffff0000, 0xfedcba98, 0xcdcdcdcd, 0x12345678};
4644 
4645   uint64_t umax_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4646                            0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4647 
4648   ArithPredicatedFn fn = &MacroAssembler::Umax;
4649   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umax_exp_b);
4650   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umax_exp_h);
4651   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umax_exp_s);
4652   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umax_exp_d);
4653 
4654   unsigned umin_exp_b[] = {0x00, 0x00, 0x0e, 0xff, 0xf0, 0x98, 0x55, 0x67};
4655 
4656   unsigned umin_exp_h[] = {0x0000, 0x0000, 0x00ff, 0xfffe,
4657                            0xfe00, 0xabab, 0x5555, 0x4567};
4658 
4659   unsigned umin_exp_s[] = {0x00000000, 0xffffffff, 0x0000eeee, 0xfffffffe,
4660                            0xfffe0000, 0xfedcba98, 0x55555555, 0x01234567};
4661 
4662   uint64_t umin_exp_d[] = {0x0000000000000000, 0xffffffffffffffff,
4663                            0x5555555555555555, 0x0000000001234567};
4664   fn = &MacroAssembler::Umin;
4665   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umin_exp_b);
4666   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umin_exp_h);
4667   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umin_exp_s);
4668   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umin_exp_d);
4669 
4670   unsigned uabd_exp_b[] = {0x01, 0xff, 0x01, 0xff, 0x0e, 0x13, 0x78, 0x67};
4671 
4672   unsigned uabd_exp_h[] = {0x0001, 0xffff, 0x00ff, 0x0001,
4673                            0x0100, 0x0eed, 0x5555, 0x1111};
4674 
4675   unsigned uabd_exp_s[] = {0x00000001, 0xffffffff, 0x00001111, 0x00000001,
4676                            0x00010000, 0xfedcba98, 0x78787878, 0x11111111};
4677 
4678   uint64_t uabd_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4679                            0x7878787878787878, 0x0000000011111111};
4680 
4681   fn = &MacroAssembler::Uabd;
4682   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, uabd_exp_b);
4683   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, uabd_exp_h);
4684   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, uabd_exp_s);
4685   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, uabd_exp_d);
4686   // clang-format on
4687 }
4688 
4689 TEST_SVE(sve_binary_arithmetic_predicated_smin_smax_sabd) {
4690   // clang-format off
4691   int zn_b[] = {0, -128, -128, -128, -128, 127, 127, 1};
4692 
4693   int zm_b[] = {-1, 0, -1, -127, 127, 126, -1, 0};
4694 
4695   int zn_h[] = {0, INT16_MIN, INT16_MIN, INT16_MIN,
4696                 INT16_MIN, INT16_MAX, INT16_MAX, 1};
4697 
4698   int zm_h[] = {-1, 0, -1, INT16_MIN + 1,
4699                 INT16_MAX, INT16_MAX - 1, -1, 0};
4700 
4701   int zn_s[] = {0, INT32_MIN, INT32_MIN, INT32_MIN,
4702                 INT32_MIN, INT32_MAX, INT32_MAX, 1};
4703 
4704   int zm_s[] = {-1, 0, -1, -INT32_MAX,
4705                 INT32_MAX, INT32_MAX - 1, -1, 0};
4706 
4707   int64_t zn_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4708                     INT64_MIN, INT64_MAX, INT64_MAX, 1};
4709 
4710   int64_t zm_d[] = {-1, 0, -1, INT64_MIN + 1,
4711                     INT64_MAX, INT64_MAX - 1, -1, 0};
4712 
4713   int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4714   int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4715   int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4716   int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4717 
4718   int smax_exp_b[] = {0, 0, -1, -128, 127, 127, 127, 1};
4719 
4720   int smax_exp_h[] = {0, 0, INT16_MIN, INT16_MIN + 1,
4721                       INT16_MAX, INT16_MAX, INT16_MAX, 1};
4722 
4723   int smax_exp_s[] = {0, INT32_MIN, -1, INT32_MIN + 1,
4724                       INT32_MAX, INT32_MAX, INT32_MAX, 1};
4725 
4726   int64_t smax_exp_d[] = {0, 0, -1, INT64_MIN + 1,
4727                           INT64_MIN, INT64_MAX, INT64_MAX, 1};
4728 
4729   ArithPredicatedFn fn = &MacroAssembler::Smax;
4730   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smax_exp_b);
4731   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smax_exp_h);
4732   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smax_exp_s);
4733   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smax_exp_d);
4734 
4735   int smin_exp_b[] = {-1, -128, -128, -128, -128, 126, -1, 1};
4736 
4737   int smin_exp_h[] = {-1, INT16_MIN, INT16_MIN, INT16_MIN,
4738                       INT16_MIN, INT16_MAX - 1, INT16_MAX, 0};
4739 
4740   int smin_exp_s[] = {-1, INT32_MIN, INT32_MIN, INT32_MIN,
4741                       INT32_MIN, INT32_MAX, -1, 0};
4742 
4743   int64_t smin_exp_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4744                           INT64_MIN, INT64_MAX - 1, -1, 0};
4745 
4746   fn = &MacroAssembler::Smin;
4747   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smin_exp_b);
4748   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smin_exp_h);
4749   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smin_exp_s);
4750   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smin_exp_d);
4751 
4752   unsigned sabd_exp_b[] = {1, 128, 127, 128, 255, 1, 128, 1};
4753 
4754   unsigned sabd_exp_h[] = {1, 0x8000, 0x8000, 1, 0xffff, 1, 0x7fff, 1};
4755 
4756   unsigned sabd_exp_s[] = {1, 0x80000000, 0x7fffffff, 1,
4757                            0xffffffff, 0x7fffffff, 0x80000000, 1};
4758 
4759   uint64_t sabd_exp_d[] = {0, 0x8000000000000000, 0x7fffffffffffffff, 1,
4760                            0x8000000000000000, 1, 0x8000000000000000, 1};
4761 
4762   fn = &MacroAssembler::Sabd;
4763   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sabd_exp_b);
4764   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sabd_exp_h);
4765   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sabd_exp_s);
4766   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sabd_exp_d);
4767   // clang-format on
4768 }
4769 
4770 TEST_SVE(sve_binary_arithmetic_predicated_mul_umulh) {
4771   // clang-format off
4772   unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4773 
4774   unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4775 
4776   unsigned zn_h[] = {0x0000, 0x0001, 0x0020, 0x0800,
4777                      0x8000, 0xff00, 0x5555, 0xaaaa};
4778 
4779   unsigned zm_h[] = {0x007f, 0x00cd, 0x0800, 0xffff,
4780                      0x5555, 0xaaaa, 0x0001, 0x1234};
4781 
4782   unsigned zn_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4783                      0x12345678, 0xffffffff, 0x55555555, 0xaaaaaaaa};
4784 
4785   unsigned zm_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4786                      0x12345678, 0x22223333, 0x55556666, 0x77778888};
4787 
4788   uint64_t zn_d[] = {0x0000000000000000, 0x5555555555555555,
4789                      0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa};
4790 
4791   uint64_t zm_d[] = {0x0000000000000000, 0x1111111133333333,
4792                      0xddddddddeeeeeeee, 0xaaaaaaaaaaaaaaaa};
4793 
4794   int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4795   int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4796   int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4797   int pg_d[] = {1, 1, 0, 1};
4798 
4799   unsigned mul_exp_b[] = {0x00, 0xcd, 0x00, 0xf8, 0x80, 0x56, 0x00, 0x50};
4800 
4801   unsigned mul_exp_h[] = {0x0000, 0x0001, 0x0000, 0xf800,
4802                           0x8000, 0xff00, 0x5555, 0x9e88};
4803 
4804   unsigned mul_exp_s[] = {0x00000000, 0x00000001, 0x00200020, 0x00400000,
4805                           0x1df4d840, 0xddddcccd, 0x55555555, 0xb05afa50};
4806 
4807   uint64_t mul_exp_d[] = {0x0000000000000000, 0xa4fa4fa4eeeeeeef,
4808                           0xffffffffffffffff, 0x38e38e38e38e38e4};
4809 
4810   ArithPredicatedFn fn = &MacroAssembler::Mul;
4811   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, mul_exp_b);
4812   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, mul_exp_h);
4813   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, mul_exp_s);
4814   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, mul_exp_d);
4815 
4816   unsigned umulh_exp_b[] = {0x00, 0x00, 0x10, 0x07, 0x80, 0xa9, 0x00, 0x05};
4817 
4818   unsigned umulh_exp_h[] = {0x0000, 0x0001, 0x0001, 0x07ff,
4819                             0x2aaa, 0xff00, 0x0000, 0x0c22};
4820 
4821   unsigned umulh_exp_s[] = {0x00000000, 0x00000000, 0x00200020, 0x00400080,
4822                             0x014b66dc, 0x22223332, 0x55555555, 0x4fa505af};
4823 
4824   uint64_t umulh_exp_d[] = {0x0000000000000000, 0x05b05b05bbbbbbbb,
4825                             0xffffffffffffffff, 0x71c71c71c71c71c6};
4826 
4827   fn = &MacroAssembler::Umulh;
4828   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umulh_exp_b);
4829   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umulh_exp_h);
4830   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umulh_exp_s);
4831   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umulh_exp_d);
4832   // clang-format on
4833 }
4834 
4835 TEST_SVE(sve_binary_arithmetic_predicated_smulh) {
4836   // clang-format off
4837   int zn_b[] = {0, 1, -1, INT8_MIN, INT8_MAX, -1, 100, -3};
4838 
4839   int zm_b[] = {0, INT8_MIN, INT8_MIN, INT8_MAX, INT8_MAX, -1, 2, 66};
4840 
4841   int zn_h[] = {0, 1, -1, INT16_MIN, INT16_MAX, -1, 10000, -3};
4842 
4843   int zm_h[] = {0, INT16_MIN, INT16_MIN, INT16_MAX, INT16_MAX, -1, 2, 6666};
4844 
4845   int zn_s[] = {0, 1, -1, INT32_MIN, INT32_MAX, -1, 100000000, -3};
4846 
4847   int zm_s[] = {0, INT32_MIN, INT32_MIN, INT32_MAX, INT32_MAX, -1, 2, 66666666};
4848 
4849   int64_t zn_d[] = {0, -1, INT64_MIN, INT64_MAX};
4850 
4851   int64_t zm_d[] = {INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX};
4852 
4853   int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4854   int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4855   int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4856   int pg_d[] = {1, 1, 0, 1};
4857 
4858   int exp_b[] = {0, -1, 0, -64, INT8_MAX, 0, 0, -1};
4859 
4860   int exp_h[] = {0, 1, 0, -16384, 16383, -1, 0, -1};
4861 
4862   int exp_s[] = {0, -1, -1, -1073741824, 1073741823, 0, 100000000, -1};
4863 
4864   int64_t exp_d[] = {0, -1, INT64_MIN, 4611686018427387903};
4865 
4866   ArithPredicatedFn fn = &MacroAssembler::Smulh;
4867   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, exp_b);
4868   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, exp_h);
4869   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
4870   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
4871   // clang-format on
4872 }
4873 
4874 TEST_SVE(sve_binary_arithmetic_predicated_logical) {
4875   // clang-format off
4876   unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4877   unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4878 
4879   unsigned zn_h[] = {0x0000, 0x0001, 0x2020, 0x0008,
4880                      0x8000, 0xffff, 0x5555, 0xaaaa};
4881   unsigned zm_h[] = {0x7fff, 0xabcd, 0x8000, 0xffff,
4882                      0x5555, 0xaaaa, 0x0000, 0x0800};
4883 
4884   unsigned zn_s[] = {0x00000001, 0x20200008, 0x8000ffff, 0x5555aaaa};
4885   unsigned zm_s[] = {0x7fffabcd, 0x8000ffff, 0x5555aaaa, 0x00000800};
4886 
4887   uint64_t zn_d[] = {0xfedcba9876543210, 0x0123456789abcdef,
4888                      0x0001200880ff55aa, 0x0022446688aaccee};
4889   uint64_t zm_d[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff,
4890                      0x7fcd80ff55aa0008, 0x1133557799bbddff};
4891 
4892   int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4893   int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4894   int pg_s[] = {1, 1, 1, 0};
4895   int pg_d[] = {1, 1, 0, 1};
4896 
4897   unsigned and_exp_b[] = {0x00, 0x01, 0x00, 0x08, 0x80, 0xaa, 0x00, 0x08};
4898 
4899   unsigned and_exp_h[] = {0x0000, 0x0001, 0x0000, 0x0008,
4900                           0x0000, 0xffff, 0x0000, 0x0800};
4901 
4902   unsigned and_exp_s[] = {0x00000001, 0x00000008, 0x0000aaaa, 0x5555aaaa};
4903 
4904   uint64_t and_exp_d[] = {0xfedcaa8854540000, 0x0000454588aacdef,
4905                           0x0001200880ff55aa, 0x0022446688aaccee};
4906 
4907   ArithPredicatedFn fn = &MacroAssembler::And;
4908   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, and_exp_b);
4909   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, and_exp_h);
4910   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, and_exp_s);
4911   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, and_exp_d);
4912 
4913   unsigned bic_exp_b[] = {0x00, 0x00, 0x20, 0x00, 0x80, 0x55, 0x55, 0xa2};
4914 
4915   unsigned bic_exp_h[] = {0x0000, 0x0001, 0x2020, 0x0000,
4916                           0x8000, 0xffff, 0x5555, 0xa2aa};
4917 
4918   unsigned bic_exp_s[] = {0x00000000, 0x20200000, 0x80005555, 0x5555aaaa};
4919 
4920   uint64_t bic_exp_d[] = {0x0000101022003210, 0x0123002201010000,
4921                           0x0001200880ff55aa, 0x0000000000000000};
4922 
4923   fn = &MacroAssembler::Bic;
4924   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, bic_exp_b);
4925   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, bic_exp_h);
4926   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, bic_exp_s);
4927   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, bic_exp_d);
4928 
4929   unsigned eor_exp_b[] = {0x00, 0xcc, 0xa0, 0xf7, 0x80, 0x55, 0x55, 0xa2};
4930 
4931   unsigned eor_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xfff7,
4932                           0xd555, 0xffff, 0x5555, 0xa2aa};
4933 
4934   unsigned eor_exp_s[] = {0x7fffabcc, 0xa020fff7, 0xd5555555, 0x5555aaaa};
4935 
4936   uint64_t eor_exp_d[] = {0x01235476ab89fedc, 0xcdef98ba67453210,
4937                           0x0001200880ff55aa, 0x1111111111111111};
4938 
4939   fn = &MacroAssembler::Eor;
4940   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, eor_exp_b);
4941   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, eor_exp_h);
4942   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, eor_exp_s);
4943   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, eor_exp_d);
4944 
4945   unsigned orr_exp_b[] = {0x00, 0xcd, 0xa0, 0xff, 0x80, 0xff, 0x55, 0xaa};
4946 
4947   unsigned orr_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xffff,
4948                           0xd555, 0xffff, 0x5555, 0xaaaa};
4949 
4950   unsigned orr_exp_s[] = {0x7fffabcd, 0xa020ffff, 0xd555ffff, 0x5555aaaa};
4951 
4952   uint64_t orr_exp_d[] = {0xfffffefeffddfedc, 0xcdefddffefefffff,
4953                           0x0001200880ff55aa, 0x1133557799bbddff};
4954 
4955   fn = &MacroAssembler::Orr;
4956   IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, orr_exp_b);
4957   IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, orr_exp_h);
4958   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, orr_exp_s);
4959   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, orr_exp_d);
4960   // clang-format on
4961 }
4962 
4963 TEST_SVE(sve_binary_arithmetic_predicated_sdiv) {
4964   // clang-format off
4965   int zn_s[] = {0, 1, -1, 2468,
4966                 INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX,
4967                 -11111111, 87654321, 0, 0};
4968 
4969   int zm_s[] = {1, -1, 1, 1234,
4970                 -1, INT32_MIN, 1, -1,
4971                 22222222, 80000000, -1, 0};
4972 
4973   int64_t zn_d[] = {0, 1, -1, 2468,
4974                     INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX,
4975                     -11111111, 87654321, 0, 0};
4976 
4977   int64_t zm_d[] = {1, -1, 1, 1234,
4978                     -1, INT64_MIN, 1, -1,
4979                     22222222, 80000000, -1, 0};
4980 
4981   int pg_s[] = {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0};
4982   int pg_d[] = {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1};
4983 
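  // Note the SDIV corner cases: division by zero yields zero, and
  // INT_MIN / -1 overflows and wraps back to INT_MIN.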
4984   int exp_s[] = {0, 1, -1, 2,
4985                  INT32_MIN, 0, INT32_MIN, -INT32_MAX,
4986                  0, 1, 0, 0};
4987 
4988   int64_t exp_d[] = {0, -1, -1, 2,
4989                      INT64_MIN, INT64_MAX, INT64_MIN, -INT64_MAX,
4990                      0, 1, 0, 0};
4991 
4992   ArithPredicatedFn fn = &MacroAssembler::Sdiv;
4993   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
4994   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
4995   // clang-format on
4996 }
4997 
4998 TEST_SVE(sve_binary_arithmetic_predicated_udiv) {
4999   // clang-format off
5000   unsigned zn_s[] = {0x00000000, 0x00000001, 0xffffffff, 0x80000000,
5001                      0xffffffff, 0x80000000, 0xffffffff, 0x0000f000};
5002 
5003   unsigned zm_s[] = {0x00000001, 0xffffffff, 0x80000000, 0x00000002,
5004                      0x00000000, 0x00000001, 0x00008000, 0xf0000000};
5005 
5006   uint64_t zn_d[] = {0x0000000000000000, 0x0000000000000001,
5007                      0xffffffffffffffff, 0x8000000000000000,
5008                      0xffffffffffffffff, 0x8000000000000000,
5009                      0xffffffffffffffff, 0xf0000000f0000000};
5010 
5011   uint64_t zm_d[] = {0x0000000000000001, 0xffffffff00000000,
5012                      0x8000000000000000, 0x0000000000000002,
5013                      0x8888888888888888, 0x0000000000000001,
5014                      0x0000000080000000, 0x00000000f0000000};
5015 
5016   int pg_s[] = {1, 1, 0, 1, 1, 0, 1, 1};
5017   int pg_d[] = {1, 0, 1, 1, 1, 1, 0, 1};
5018 
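  // As with SDIV, division by zero yields zero.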
5019   unsigned exp_s[] = {0x00000000, 0x00000000, 0xffffffff, 0x40000000,
5020                       0x00000000, 0x80000000, 0x0001ffff, 0x00000000};
5021 
5022   uint64_t exp_d[] = {0x0000000000000000, 0x0000000000000001,
5023                       0x0000000000000001, 0x4000000000000000,
5024                       0x0000000000000001, 0x8000000000000000,
5025                       0xffffffffffffffff, 0x0000000100000001};
5026 
5027   ArithPredicatedFn fn = &MacroAssembler::Udiv;
5028   IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
5029   IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
5030   // clang-format on
5031 }
5032 
5033 typedef void (MacroAssembler::*ArithFn)(const ZRegister& zd,
5034                                         const ZRegister& zn,
5035                                         const ZRegister& zm);
5036 
5037 template <typename T>
5038 static void IntArithHelper(Test* config,
5039                            ArithFn macro,
5040                            unsigned lane_size_in_bits,
5041                            const T& zn_inputs,
5042                            const T& zm_inputs,
5043                            const T& zd_expected) {
5044   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5045   START();
5046 
5047   ZRegister zn = z31.WithLaneSize(lane_size_in_bits);
5048   ZRegister zm = z27.WithLaneSize(lane_size_in_bits);
5049   InsrHelper(&masm, zn, zn_inputs);
5050   InsrHelper(&masm, zm, zm_inputs);
5051 
5052   ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
5053   (masm.*macro)(zd, zn, zm);
5054 
5055   END();
5056 
5057   if (CAN_RUN()) {
5058     RUN();
5059     ASSERT_EQUAL_SVE(zd_expected, zd);
5060   }
5061 }
5062 
5063 TEST_SVE(sve_arithmetic_unpredicated_add_sqadd_uqadd) {
5064   // clang-format off
5065   unsigned in_b[] = {0x81, 0x7f, 0x10, 0xaa, 0x55, 0xff, 0xf0};
5066   unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa, 0x5555, 0xffff, 0xf0f0};
5067   unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0x10001010, 0xaaaaaaaa, 0xf000f0f0};
5068   uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
5069                       0x1000000010001010, 0xf0000000f000f0f0};
5070 
5071   ArithFn fn = &MacroAssembler::Add;
5072 
5073   unsigned add_exp_b[] = {0x02, 0xfe, 0x20, 0x54, 0xaa, 0xfe, 0xe0};
5074   unsigned add_exp_h[] = {0x0302, 0xfefe, 0x2020, 0x5554, 0xaaaa, 0xfffe, 0xe1e0};
5075   unsigned add_exp_s[] = {0x00030302, 0xfffefefe, 0x20002020, 0x55555554, 0xe001e1e0};
5076   uint64_t add_exp_d[] = {0x0000000300030302, 0xfffffffefffefefe,
5077                           0x2000000020002020, 0xe0000001e001e1e0};
5078 
5079   IntArithHelper(config, fn, kBRegSize, in_b, in_b, add_exp_b);
5080   IntArithHelper(config, fn, kHRegSize, in_h, in_h, add_exp_h);
5081   IntArithHelper(config, fn, kSRegSize, in_s, in_s, add_exp_s);
5082   IntArithHelper(config, fn, kDRegSize, in_d, in_d, add_exp_d);
5083 
5084   fn = &MacroAssembler::Sqadd;
5085 
5086   unsigned sqadd_exp_b[] = {0x80, 0x7f, 0x20, 0x80, 0x7f, 0xfe, 0xe0};
5087   unsigned sqadd_exp_h[] = {0x8000, 0x7fff, 0x2020, 0x8000, 0x7fff, 0xfffe, 0xe1e0};
5088   unsigned sqadd_exp_s[] = {0x80000000, 0x7fffffff, 0x20002020, 0x80000000, 0xe001e1e0};
5089   uint64_t sqadd_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5090                             0x2000000020002020, 0xe0000001e001e1e0};
5091 
5092   IntArithHelper(config, fn, kBRegSize, in_b, in_b, sqadd_exp_b);
5093   IntArithHelper(config, fn, kHRegSize, in_h, in_h, sqadd_exp_h);
5094   IntArithHelper(config, fn, kSRegSize, in_s, in_s, sqadd_exp_s);
5095   IntArithHelper(config, fn, kDRegSize, in_d, in_d, sqadd_exp_d);
5096 
5097   fn = &MacroAssembler::Uqadd;
5098 
5099   unsigned uqadd_exp_b[] = {0xff, 0xfe, 0x20, 0xff, 0xaa, 0xff, 0xff};
5100   unsigned uqadd_exp_h[] = {0xffff, 0xfefe, 0x2020, 0xffff, 0xaaaa, 0xffff, 0xffff};
5101   unsigned uqadd_exp_s[] = {0xffffffff, 0xfffefefe, 0x20002020, 0xffffffff, 0xffffffff};
5102   uint64_t uqadd_exp_d[] = {0xffffffffffffffff, 0xfffffffefffefefe,
5103                             0x2000000020002020, 0xffffffffffffffff};
5104 
5105   IntArithHelper(config, fn, kBRegSize, in_b, in_b, uqadd_exp_b);
5106   IntArithHelper(config, fn, kHRegSize, in_h, in_h, uqadd_exp_h);
5107   IntArithHelper(config, fn, kSRegSize, in_s, in_s, uqadd_exp_s);
5108   IntArithHelper(config, fn, kDRegSize, in_d, in_d, uqadd_exp_d);
5109   // clang-format on
5110 }
5111 
5112 TEST_SVE(sve_arithmetic_unpredicated_sub_sqsub_uqsub) {
5113   // clang-format off
5114 
5115   unsigned ins1_b[] = {0x81, 0x7f, 0x7e, 0xaa};
5116   unsigned ins2_b[] = {0x10, 0xf0, 0xf0, 0x55};
5117 
5118   unsigned ins1_h[] = {0x8181, 0x7f7f, 0x7e7e, 0xaaaa};
5119   unsigned ins2_h[] = {0x1010, 0xf0f0, 0xf0f0, 0x5555};
5120 
5121   unsigned ins1_s[] = {0x80018181, 0x7fff7f7f, 0x7eee7e7e, 0xaaaaaaaa};
5122   unsigned ins2_s[] = {0x10001010, 0xf000f0f0, 0xf000f0f0, 0x55555555};
5123 
5124   uint64_t ins1_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
5125                        0x7eeeeeee7eee7e7e, 0xaaaaaaaaaaaaaaaa};
5126   uint64_t ins2_d[] = {0x1000000010001010, 0xf0000000f000f0f0,
5127                        0xf0000000f000f0f0, 0x5555555555555555};
5128 
5129   ArithFn fn = &MacroAssembler::Sub;
5130 
5131   unsigned ins1_sub_ins2_exp_b[] = {0x71, 0x8f, 0x8e, 0x55};
5132   unsigned ins1_sub_ins2_exp_h[] = {0x7171, 0x8e8f, 0x8d8e, 0x5555};
5133   unsigned ins1_sub_ins2_exp_s[] = {0x70017171, 0x8ffe8e8f, 0x8eed8d8e, 0x55555555};
5134   uint64_t ins1_sub_ins2_exp_d[] = {0x7000000170017171, 0x8ffffffe8ffe8e8f,
5135                                     0x8eeeeeed8eed8d8e, 0x5555555555555555};
5136 
5137   IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sub_ins2_exp_b);
5138   IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sub_ins2_exp_h);
5139   IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sub_ins2_exp_s);
5140   IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sub_ins2_exp_d);
5141 
5142   unsigned ins2_sub_ins1_exp_b[] = {0x8f, 0x71, 0x72, 0xab};
5143   unsigned ins2_sub_ins1_exp_h[] = {0x8e8f, 0x7171, 0x7272, 0xaaab};
5144   unsigned ins2_sub_ins1_exp_s[] = {0x8ffe8e8f, 0x70017171, 0x71127272, 0xaaaaaaab};
5145   uint64_t ins2_sub_ins1_exp_d[] = {0x8ffffffe8ffe8e8f, 0x7000000170017171,
5146                                     0x7111111271127272, 0xaaaaaaaaaaaaaaab};
5147 
5148   IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sub_ins1_exp_b);
5149   IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sub_ins1_exp_h);
5150   IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sub_ins1_exp_s);
5151   IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sub_ins1_exp_d);
5152 
5153   fn = &MacroAssembler::Sqsub;
5154 
5155   unsigned ins1_sqsub_ins2_exp_b[] = {0x80, 0x7f, 0x7f, 0x80};
5156   unsigned ins1_sqsub_ins2_exp_h[] = {0x8000, 0x7fff, 0x7fff, 0x8000};
5157   unsigned ins1_sqsub_ins2_exp_s[] = {0x80000000, 0x7fffffff, 0x7fffffff, 0x80000000};
5158   uint64_t ins1_sqsub_ins2_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5159                                       0x7fffffffffffffff, 0x8000000000000000};
5160 
5161   IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sqsub_ins2_exp_b);
5162   IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sqsub_ins2_exp_h);
5163   IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sqsub_ins2_exp_s);
5164   IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sqsub_ins2_exp_d);
5165 
5166   unsigned ins2_sqsub_ins1_exp_b[] = {0x7f, 0x80, 0x80, 0x7f};
5167   unsigned ins2_sqsub_ins1_exp_h[] = {0x7fff, 0x8000, 0x8000, 0x7fff};
5168   unsigned ins2_sqsub_ins1_exp_s[] = {0x7fffffff, 0x80000000, 0x80000000, 0x7fffffff};
5169   uint64_t ins2_sqsub_ins1_exp_d[] = {0x7fffffffffffffff, 0x8000000000000000,
5170                                       0x8000000000000000, 0x7fffffffffffffff};
5171 
5172   IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sqsub_ins1_exp_b);
5173   IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sqsub_ins1_exp_h);
5174   IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sqsub_ins1_exp_s);
5175   IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sqsub_ins1_exp_d);
5176 
5177   fn = &MacroAssembler::Uqsub;
5178 
5179   unsigned ins1_uqsub_ins2_exp_b[] = {0x71, 0x00, 0x00, 0x55};
5180   unsigned ins1_uqsub_ins2_exp_h[] = {0x7171, 0x0000, 0x0000, 0x5555};
5181   unsigned ins1_uqsub_ins2_exp_s[] = {0x70017171, 0x00000000, 0x00000000, 0x55555555};
5182   uint64_t ins1_uqsub_ins2_exp_d[] = {0x7000000170017171, 0x0000000000000000,
5183                                       0x0000000000000000, 0x5555555555555555};
5184 
5185   IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_uqsub_ins2_exp_b);
5186   IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_uqsub_ins2_exp_h);
5187   IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_uqsub_ins2_exp_s);
5188   IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_uqsub_ins2_exp_d);
5189 
5190   unsigned ins2_uqsub_ins1_exp_b[] = {0x00, 0x71, 0x72, 0x00};
5191   unsigned ins2_uqsub_ins1_exp_h[] = {0x0000, 0x7171, 0x7272, 0x0000};
5192   unsigned ins2_uqsub_ins1_exp_s[] = {0x00000000, 0x70017171, 0x71127272, 0x00000000};
5193   uint64_t ins2_uqsub_ins1_exp_d[] = {0x0000000000000000, 0x7000000170017171,
5194                                       0x7111111271127272, 0x0000000000000000};
5195 
5196   IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_uqsub_ins1_exp_b);
5197   IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_uqsub_ins1_exp_h);
5198   IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_uqsub_ins1_exp_s);
5199   IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_uqsub_ins1_exp_d);
5200   // clang-format on
5201 }
5202 
5203 TEST_SVE(sve_rdvl) {
5204   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5205   START();
5206 
5207   // Encodable multipliers.
5208   __ Rdvl(x0, 0);
5209   __ Rdvl(x1, 1);
5210   __ Rdvl(x2, 2);
5211   __ Rdvl(x3, 31);
5212   __ Rdvl(x4, -1);
5213   __ Rdvl(x5, -2);
5214   __ Rdvl(x6, -32);
5215 
5216   // For unencodable multipliers, the MacroAssembler uses a sequence of
5217   // instructions.
5218   __ Rdvl(x10, 32);
5219   __ Rdvl(x11, -33);
5220   __ Rdvl(x12, 42);
5221   __ Rdvl(x13, -42);
5222 
5223   // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5224   // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5225   // occurs in the macro.
5226   __ Rdvl(x14, 0x007fffffffffffff);
5227   __ Rdvl(x15, -0x0080000000000000);
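  // For example, at the architectural maximum VL of 256 bytes,
  // 0x007fffffffffffff * 256 = 0x7fffffffffffff00, which still fits in a
  // signed 64-bit register.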
5228 
5229   END();
5230 
5231   if (CAN_RUN()) {
5232     RUN();
5233 
5234     uint64_t vl = config->sve_vl_in_bytes();
5235 
5236     ASSERT_EQUAL_64(vl * 0, x0);
5237     ASSERT_EQUAL_64(vl * 1, x1);
5238     ASSERT_EQUAL_64(vl * 2, x2);
5239     ASSERT_EQUAL_64(vl * 31, x3);
5240     ASSERT_EQUAL_64(vl * -1, x4);
5241     ASSERT_EQUAL_64(vl * -2, x5);
5242     ASSERT_EQUAL_64(vl * -32, x6);
5243 
5244     ASSERT_EQUAL_64(vl * 32, x10);
5245     ASSERT_EQUAL_64(vl * -33, x11);
5246     ASSERT_EQUAL_64(vl * 42, x12);
5247     ASSERT_EQUAL_64(vl * -42, x13);
5248 
5249     ASSERT_EQUAL_64(vl * 0x007fffffffffffff, x14);
5250     ASSERT_EQUAL_64(vl * 0xff80000000000000, x15);
5251   }
5252 }
5253 
5254 TEST_SVE(sve_rdpl) {
5255   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5256   START();
5257 
5258   // There is no `rdpl` instruction, so the MacroAssembler maps `Rdpl` onto
5259   // Addpl(xd, xzr, ...).
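  // Note that in the `addpl` encoding register 31 refers to sp, not xzr, so
  // an xzr base cannot always be encoded directly; hence the `movz` sequences
  // mentioned below.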
5260 
5261   // Encodable multipliers (as `addvl`).
5262   __ Rdpl(x0, 0);
5263   __ Rdpl(x1, 8);
5264   __ Rdpl(x2, 248);
5265   __ Rdpl(x3, -8);
5266   __ Rdpl(x4, -256);
5267 
5268   // Encodable multipliers (as `movz` + `addpl`).
5269   __ Rdpl(x7, 31);
5270   __ Rdpl(x8, -31);
5271 
5272   // For unencodable multipliers, the MacroAssembler uses a sequence of
5273   // instructions.
5274   __ Rdpl(x10, 42);
5275   __ Rdpl(x11, -42);
5276 
5277   // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5278   // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5279   // occurs in the macro.
5280   __ Rdpl(x12, 0x007fffffffffffff);
5281   __ Rdpl(x13, -0x0080000000000000);
5282 
5283   END();
5284 
5285   if (CAN_RUN()) {
5286     RUN();
5287 
5288     uint64_t vl = config->sve_vl_in_bytes();
5289     VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5290     uint64_t pl = vl / kZRegBitsPerPRegBit;
5291 
5292     ASSERT_EQUAL_64(pl * 0, x0);
5293     ASSERT_EQUAL_64(pl * 8, x1);
5294     ASSERT_EQUAL_64(pl * 248, x2);
5295     ASSERT_EQUAL_64(pl * -8, x3);
5296     ASSERT_EQUAL_64(pl * -256, x4);
5297 
5298     ASSERT_EQUAL_64(pl * 31, x7);
5299     ASSERT_EQUAL_64(pl * -31, x8);
5300 
5301     ASSERT_EQUAL_64(pl * 42, x10);
5302     ASSERT_EQUAL_64(pl * -42, x11);
5303 
5304     ASSERT_EQUAL_64(pl * 0x007fffffffffffff, x12);
5305     ASSERT_EQUAL_64(pl * 0xff80000000000000, x13);
5306   }
5307 }
5308 
5309 TEST_SVE(sve_addvl) {
5310   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5311   START();
5312 
5313   uint64_t base = 0x1234567800000000;
5314   __ Mov(x30, base);
5315 
5316   // Encodable multipliers.
5317   __ Addvl(x0, x30, 0);
5318   __ Addvl(x1, x30, 1);
5319   __ Addvl(x2, x30, 31);
5320   __ Addvl(x3, x30, -1);
5321   __ Addvl(x4, x30, -32);
5322 
5323   // For unencodable multipliers, the MacroAssembler uses `Rdvl` and `Add`.
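  // A plausible expansion (the exact sequence is up to the macro) is
  // Rdvl(scratch, multiplier) followed by Add(xd, xn, scratch), where `Rdvl`
  // may itself expand to several instructions for unencodable multipliers.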
5324   __ Addvl(x5, x30, 32);
5325   __ Addvl(x6, x30, -33);
5326 
5327   // Test the limits of the multiplier supported by the `Rdvl` macro.
5328   __ Addvl(x7, x30, 0x007fffffffffffff);
5329   __ Addvl(x8, x30, -0x0080000000000000);
5330 
5331   // Check that xzr behaves correctly.
5332   __ Addvl(x9, xzr, 8);
5333   __ Addvl(x10, xzr, 42);
5334 
5335   // Check that sp behaves correctly with encodable and unencodable multipliers.
5336   __ Addvl(sp, sp, -5);
5337   __ Addvl(sp, sp, -37);
5338   __ Addvl(x11, sp, -2);
5339   __ Addvl(sp, x11, 2);
5340   __ Addvl(x12, sp, -42);
5341 
5342   // Restore the value of sp.
5343   __ Addvl(sp, x11, 39);
5344   __ Addvl(sp, sp, 5);
5345 
5346   // Adjust x11 and x12 to make the test sp-agnostic.
5347   __ Sub(x11, sp, x11);
5348   __ Sub(x12, sp, x12);
5349 
5350   // Check cases where xd.Is(xn). This stresses scratch register allocation.
5351   __ Mov(x20, x30);
5352   __ Mov(x21, x30);
5353   __ Mov(x22, x30);
5354   __ Addvl(x20, x20, 4);
5355   __ Addvl(x21, x21, 42);
5356   __ Addvl(x22, x22, -0x0080000000000000);
5357 
5358   END();
5359 
5360   if (CAN_RUN()) {
5361     RUN();
5362 
5363     uint64_t vl = config->sve_vl_in_bytes();
5364 
5365     ASSERT_EQUAL_64(base + (vl * 0), x0);
5366     ASSERT_EQUAL_64(base + (vl * 1), x1);
5367     ASSERT_EQUAL_64(base + (vl * 31), x2);
5368     ASSERT_EQUAL_64(base + (vl * -1), x3);
5369     ASSERT_EQUAL_64(base + (vl * -32), x4);
5370 
5371     ASSERT_EQUAL_64(base + (vl * 32), x5);
5372     ASSERT_EQUAL_64(base + (vl * -33), x6);
5373 
5374     ASSERT_EQUAL_64(base + (vl * 0x007fffffffffffff), x7);
5375     ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x8);
5376 
5377     ASSERT_EQUAL_64(vl * 8, x9);
5378     ASSERT_EQUAL_64(vl * 42, x10);
5379 
5380     ASSERT_EQUAL_64(vl * 44, x11);
5381     ASSERT_EQUAL_64(vl * 84, x12);
5382 
5383     ASSERT_EQUAL_64(base + (vl * 4), x20);
5384     ASSERT_EQUAL_64(base + (vl * 42), x21);
5385     ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x22);
5386 
5387     ASSERT_EQUAL_64(base, x30);
5388   }
5389 }
5390 
5391 TEST_SVE(sve_addpl) {
5392   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5393   START();
5394 
5395   uint64_t base = 0x1234567800000000;
5396   __ Mov(x30, base);
5397 
5398   // Encodable multipliers.
5399   __ Addpl(x0, x30, 0);
5400   __ Addpl(x1, x30, 1);
5401   __ Addpl(x2, x30, 31);
5402   __ Addpl(x3, x30, -1);
5403   __ Addpl(x4, x30, -32);
5404 
5405   // For unencodable multipliers, the MacroAssembler uses `Addvl` if it can, or
5406   // it falls back to `Rdvl` and `Add`.
5407   __ Addpl(x5, x30, 32);
5408   __ Addpl(x6, x30, -33);
5409 
5410   // Test the limits of the multiplier supported by the `Rdvl` macro.
5411   __ Addpl(x7, x30, 0x007fffffffffffff);
5412   __ Addpl(x8, x30, -0x0080000000000000);
5413 
5414   // Check that xzr behaves correctly.
5415   __ Addpl(x9, xzr, 8);
5416   __ Addpl(x10, xzr, 42);
5417 
5418   // Check that sp behaves correctly with encodable and unencodable multipliers.
5419   __ Addpl(sp, sp, -5);
5420   __ Addpl(sp, sp, -37);
5421   __ Addpl(x11, sp, -2);
5422   __ Addpl(sp, x11, 2);
5423   __ Addpl(x12, sp, -42);
5424 
5425   // Restore the value of sp.
5426   __ Addpl(sp, x11, 39);
5427   __ Addpl(sp, sp, 5);
5428 
5429   // Adjust x11 and x12 to make the test sp-agnostic.
5430   __ Sub(x11, sp, x11);
5431   __ Sub(x12, sp, x12);
5432 
5433   // Check cases where xd.Is(xn). This stresses scratch register allocation.
5434   __ Mov(x20, x30);
5435   __ Mov(x21, x30);
5436   __ Mov(x22, x30);
5437   __ Addpl(x20, x20, 4);
5438   __ Addpl(x21, x21, 42);
5439   __ Addpl(x22, x22, -0x0080000000000000);
5440 
5441   END();
5442 
5443   if (CAN_RUN()) {
5444     RUN();
5445 
5446     uint64_t vl = config->sve_vl_in_bytes();
5447     VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5448     uint64_t pl = vl / kZRegBitsPerPRegBit;
5449 
5450     ASSERT_EQUAL_64(base + (pl * 0), x0);
5451     ASSERT_EQUAL_64(base + (pl * 1), x1);
5452     ASSERT_EQUAL_64(base + (pl * 31), x2);
5453     ASSERT_EQUAL_64(base + (pl * -1), x3);
5454     ASSERT_EQUAL_64(base + (pl * -32), x4);
5455 
5456     ASSERT_EQUAL_64(base + (pl * 32), x5);
5457     ASSERT_EQUAL_64(base + (pl * -33), x6);
5458 
5459     ASSERT_EQUAL_64(base + (pl * 0x007fffffffffffff), x7);
5460     ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x8);
5461 
5462     ASSERT_EQUAL_64(pl * 8, x9);
5463     ASSERT_EQUAL_64(pl * 42, x10);
5464 
5465     ASSERT_EQUAL_64(pl * 44, x11);
5466     ASSERT_EQUAL_64(pl * 84, x12);
5467 
5468     ASSERT_EQUAL_64(base + (pl * 4), x20);
5469     ASSERT_EQUAL_64(base + (pl * 42), x21);
5470     ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x22);
5471 
5472     ASSERT_EQUAL_64(base, x30);
5473   }
5474 }
5475 
5476 TEST_SVE(sve_calculate_sve_address) {
5477 #pragma GCC diagnostic push
5478 #pragma GCC diagnostic ignored "-Wshadow"
5479 
5480   // Shadow the `MacroAssembler` type so that the test macros work without
5481   // modification.
5482   typedef CalculateSVEAddressMacroAssembler MacroAssembler;
5483 
5484   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5485   START();  // NOLINT(clang-diagnostic-local-type-template-args)
5486 
5487   uint64_t base = 0x1234567800000000;
5488   __ Mov(x28, base);
5489   __ Mov(x29, 48);
5490   __ Mov(x30, -48);
5491 
5492   // Simple scalar (or equivalent) cases.
5493 
5494   __ CalculateSVEAddress(x0, SVEMemOperand(x28));
5495   __ CalculateSVEAddress(x1, SVEMemOperand(x28, 0));
5496   __ CalculateSVEAddress(x2, SVEMemOperand(x28, 0, SVE_MUL_VL));
5497   __ CalculateSVEAddress(x3, SVEMemOperand(x28, 0, SVE_MUL_VL), 3);
5498   __ CalculateSVEAddress(x4, SVEMemOperand(x28, xzr));
5499   __ CalculateSVEAddress(x5, SVEMemOperand(x28, xzr, LSL, 42));
5500 
5501   // scalar-plus-immediate
5502 
5503   // Unscaled immediates, handled with `Add`.
5504   __ CalculateSVEAddress(x6, SVEMemOperand(x28, 42));
5505   __ CalculateSVEAddress(x7, SVEMemOperand(x28, -42));
5506   // Scaled immediates, handled with `Addvl` or `Addpl`.
5507   __ CalculateSVEAddress(x8, SVEMemOperand(x28, 31, SVE_MUL_VL), 0);
5508   __ CalculateSVEAddress(x9, SVEMemOperand(x28, -32, SVE_MUL_VL), 0);
5509   // Out of `addvl` or `addpl` range.
5510   __ CalculateSVEAddress(x10, SVEMemOperand(x28, 42, SVE_MUL_VL), 0);
5511   __ CalculateSVEAddress(x11, SVEMemOperand(x28, -42, SVE_MUL_VL), 0);
5512   // As above, for VL-based accesses smaller than a Z register.
5513   VIXL_STATIC_ASSERT(kZRegBitsPerPRegBitLog2 == 3);
5514   __ CalculateSVEAddress(x12, SVEMemOperand(x28, -32 * 8, SVE_MUL_VL), 3);
5515   __ CalculateSVEAddress(x13, SVEMemOperand(x28, -42 * 8, SVE_MUL_VL), 3);
5516   __ CalculateSVEAddress(x14, SVEMemOperand(x28, -32 * 4, SVE_MUL_VL), 2);
5517   __ CalculateSVEAddress(x15, SVEMemOperand(x28, -42 * 4, SVE_MUL_VL), 2);
5518   __ CalculateSVEAddress(x18, SVEMemOperand(x28, -32 * 2, SVE_MUL_VL), 1);
5519   __ CalculateSVEAddress(x19, SVEMemOperand(x28, -42 * 2, SVE_MUL_VL), 1);
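  // A worked example of the scaling (illustrative only): with the trailing
  // vl-divisor-log2 argument equal to 3, the addressed unit is VL / 8
  // bytes, so the x12 case computes (-32 * 8) * (VL / 8) == -32 * VL,
  // matching the assertion below.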
5520 
5521   // scalar-plus-scalar
5522 
5523   __ CalculateSVEAddress(x20, SVEMemOperand(x28, x29));
5524   __ CalculateSVEAddress(x21, SVEMemOperand(x28, x30));
5525   __ CalculateSVEAddress(x22, SVEMemOperand(x28, x29, LSL, 8));
5526   __ CalculateSVEAddress(x23, SVEMemOperand(x28, x30, LSL, 8));
5527 
5528   // In-place updates, to stress scratch register allocation.
5529 
5530   __ Mov(x24, 0xabcd000000000000);
5531   __ Mov(x25, 0xabcd101100000000);
5532   __ Mov(x26, 0xabcd202200000000);
5533   __ Mov(x27, 0xabcd303300000000);
5534   __ Mov(x28, 0xabcd404400000000);
5535   __ Mov(x29, 0xabcd505500000000);
5536 
5537   __ CalculateSVEAddress(x24, SVEMemOperand(x24));
5538   __ CalculateSVEAddress(x25, SVEMemOperand(x25, 0x42));
5539   __ CalculateSVEAddress(x26, SVEMemOperand(x26, 3, SVE_MUL_VL), 0);
5540   __ CalculateSVEAddress(x27, SVEMemOperand(x27, 0x42, SVE_MUL_VL), 3);
5541   __ CalculateSVEAddress(x28, SVEMemOperand(x28, x30));
5542   __ CalculateSVEAddress(x29, SVEMemOperand(x29, x30, LSL, 4));
5543 
5544   END();
5545 
5546   if (CAN_RUN()) {
5547     RUN();
5548 
5549     uint64_t vl = config->sve_vl_in_bytes();
5550     VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5551     uint64_t pl = vl / kZRegBitsPerPRegBit;
5552 
5553     // Simple scalar (or equivalent) cases.
5554     ASSERT_EQUAL_64(base, x0);
5555     ASSERT_EQUAL_64(base, x1);
5556     ASSERT_EQUAL_64(base, x2);
5557     ASSERT_EQUAL_64(base, x3);
5558     ASSERT_EQUAL_64(base, x4);
5559     ASSERT_EQUAL_64(base, x5);
5560 
5561     // scalar-plus-immediate
5562     ASSERT_EQUAL_64(base + 42, x6);
5563     ASSERT_EQUAL_64(base - 42, x7);
5564     ASSERT_EQUAL_64(base + (31 * vl), x8);
5565     ASSERT_EQUAL_64(base - (32 * vl), x9);
5566     ASSERT_EQUAL_64(base + (42 * vl), x10);
5567     ASSERT_EQUAL_64(base - (42 * vl), x11);
5568     ASSERT_EQUAL_64(base - (32 * vl), x12);
5569     ASSERT_EQUAL_64(base - (42 * vl), x13);
5570     ASSERT_EQUAL_64(base - (32 * vl), x14);
5571     ASSERT_EQUAL_64(base - (42 * vl), x15);
5572     ASSERT_EQUAL_64(base - (32 * vl), x18);
5573     ASSERT_EQUAL_64(base - (42 * vl), x19);
5574 
5575     // scalar-plus-scalar
5576     ASSERT_EQUAL_64(base + 48, x20);
5577     ASSERT_EQUAL_64(base - 48, x21);
5578     ASSERT_EQUAL_64(base + (48 << 8), x22);
5579     ASSERT_EQUAL_64(base - (48 << 8), x23);
5580 
5581     // In-place updates.
5582     ASSERT_EQUAL_64(0xabcd000000000000, x24);
5583     ASSERT_EQUAL_64(0xabcd101100000000 + 0x42, x25);
5584     ASSERT_EQUAL_64(0xabcd202200000000 + (3 * vl), x26);
5585     ASSERT_EQUAL_64(0xabcd303300000000 + (0x42 * pl), x27);
5586     ASSERT_EQUAL_64(0xabcd404400000000 - 48, x28);
5587     ASSERT_EQUAL_64(0xabcd505500000000 - (48 << 4), x29);
5588   }
5589 #pragma GCC diagnostic pop
5590 }
5591 
5592 TEST_SVE(sve_permute_vector_unpredicated) {
5593   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
5594   START();
5595 
5596   // Initialise registers with known values first.
5597   __ Dup(z1.VnB(), 0x11);
5598   __ Dup(z2.VnB(), 0x22);
5599   __ Dup(z3.VnB(), 0x33);
5600   __ Dup(z4.VnB(), 0x44);
5601 
5602   __ Mov(x0, 0x0123456789abcdef);
5603   __ Fmov(d0, RawbitsToDouble(0x7ffaaaaa22223456));
5604   __ Insr(z1.VnS(), w0);
5605   __ Insr(z2.VnD(), x0);
5606   __ Insr(z3.VnH(), h0);
5607   __ Insr(z4.VnD(), d0);
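  // `Insr` shifts the whole vector up by one lane and inserts the new value
  // at lane 0, so after `Insr(z2.VnD(), x0)` the lowest D lane of z2 holds
  // 0x0123456789abcdef and the remaining lanes keep their 0x22 fill, as
  // z2_expected below confirms.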
5608 
5609   uint64_t inputs[] = {0xfedcba9876543210,
5610                        0x0123456789abcdef,
5611                        0x8f8e8d8c8b8a8988,
5612                        0x8786858483828180};
5613 
5614   // Initialise a distinguishable value throughout the register first.
5615   __ Dup(z9.VnB(), 0xff);
5616   InsrHelper(&masm, z9.VnD(), inputs);
5617 
5618   __ Rev(z5.VnB(), z9.VnB());
5619   __ Rev(z6.VnH(), z9.VnH());
5620   __ Rev(z7.VnS(), z9.VnS());
5621   __ Rev(z8.VnD(), z9.VnD());
5622 
5623   int index[7] = {22, 7, 7, 3, 1, 1, 63};
5624   // Broadcasting data from within the input array.
5625   __ Dup(z10.VnB(), z9.VnB(), index[0]);
5626   __ Dup(z11.VnH(), z9.VnH(), index[1]);
5627   __ Dup(z12.VnS(), z9.VnS(), index[2]);
5628   __ Dup(z13.VnD(), z9.VnD(), index[3]);
5629   __ Dup(z14.VnQ(), z9.VnQ(), index[4]);
5630   // Test dst == src
5631   __ Mov(z15, z9);
5632   __ Dup(z15.VnS(), z15.VnS(), index[5]);
5633   // Selecting data beyond the input array.
5634   __ Dup(z16.VnB(), z9.VnB(), index[6]);
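  // An indexed `Dup` broadcasts zero when the element index lies outside
  // the current vector length, which is why each check below guards its
  // expected value with a VL comparison. E.g. with a 128-bit VL, B-lane
  // index 63 is out of range, so z16 should be all zeroes.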
5635 
5636   END();
5637 
5638   if (CAN_RUN()) {
5639     RUN();
5640 
5641     // Insr
5642     uint64_t z1_expected[] = {0x1111111111111111, 0x1111111189abcdef};
5643     uint64_t z2_expected[] = {0x2222222222222222, 0x0123456789abcdef};
5644     uint64_t z3_expected[] = {0x3333333333333333, 0x3333333333333456};
5645     uint64_t z4_expected[] = {0x4444444444444444, 0x7ffaaaaa22223456};
5646     ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
5647     ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
5648     ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
5649     ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
5650 
5651     // Rev
5652     int lane_count = core.GetSVELaneCount(kBRegSize);
5653     for (int i = 0; i < lane_count; i++) {
5654       uint64_t expected =
5655           core.zreg_lane(z5.GetCode(), kBRegSize, lane_count - i - 1);
5656       uint64_t input = core.zreg_lane(z9.GetCode(), kBRegSize, i);
5657       ASSERT_EQUAL_64(expected, input);
5658     }
5659 
5660     lane_count = core.GetSVELaneCount(kHRegSize);
5661     for (int i = 0; i < lane_count; i++) {
5662       uint64_t expected =
5663           core.zreg_lane(z6.GetCode(), kHRegSize, lane_count - i - 1);
5664       uint64_t input = core.zreg_lane(z9.GetCode(), kHRegSize, i);
5665       ASSERT_EQUAL_64(expected, input);
5666     }
5667 
5668     lane_count = core.GetSVELaneCount(kSRegSize);
5669     for (int i = 0; i < lane_count; i++) {
5670       uint64_t expected =
5671           core.zreg_lane(z7.GetCode(), kSRegSize, lane_count - i - 1);
5672       uint64_t input = core.zreg_lane(z9.GetCode(), kSRegSize, i);
5673       ASSERT_EQUAL_64(expected, input);
5674     }
5675 
5676     lane_count = core.GetSVELaneCount(kDRegSize);
5677     for (int i = 0; i < lane_count; i++) {
5678       uint64_t expected =
5679           core.zreg_lane(z8.GetCode(), kDRegSize, lane_count - i - 1);
5680       uint64_t input = core.zreg_lane(z9.GetCode(), kDRegSize, i);
5681       ASSERT_EQUAL_64(expected, input);
5682     }
5683 
5684     // Dup
5685     unsigned vl = config->sve_vl_in_bits();
5686     lane_count = core.GetSVELaneCount(kBRegSize);
5687     uint64_t expected_z10 = (vl > (index[0] * kBRegSize)) ? 0x23 : 0;
5688     for (int i = 0; i < lane_count; i++) {
5689       ASSERT_EQUAL_SVE_LANE(expected_z10, z10.VnB(), i);
5690     }
5691 
5692     lane_count = core.GetSVELaneCount(kHRegSize);
5693     uint64_t expected_z11 = (vl > (index[1] * kHRegSize)) ? 0x8f8e : 0;
5694     for (int i = 0; i < lane_count; i++) {
5695       ASSERT_EQUAL_SVE_LANE(expected_z11, z11.VnH(), i);
5696     }
5697 
5698     lane_count = core.GetSVELaneCount(kSRegSize);
5699     uint64_t expected_z12 = (vl > (index[2] * kSRegSize)) ? 0xfedcba98 : 0;
5700     for (int i = 0; i < lane_count; i++) {
5701       ASSERT_EQUAL_SVE_LANE(expected_z12, z12.VnS(), i);
5702     }
5703 
5704     lane_count = core.GetSVELaneCount(kDRegSize);
5705     uint64_t expected_z13 =
5706         (vl > (index[3] * kDRegSize)) ? 0xfedcba9876543210 : 0;
5707     for (int i = 0; i < lane_count; i++) {
5708       ASSERT_EQUAL_SVE_LANE(expected_z13, z13.VnD(), i);
5709     }
5710 
5711     lane_count = core.GetSVELaneCount(kDRegSize);
5712     uint64_t expected_z14_lo = 0;
5713     uint64_t expected_z14_hi = 0;
5714     if (vl > (index[4] * kQRegSize)) {
5715       expected_z14_lo = 0x0123456789abcdef;
5716       expected_z14_hi = 0xfedcba9876543210;
5717     }
5718     for (int i = 0; i < lane_count; i += 2) {
5719       ASSERT_EQUAL_SVE_LANE(expected_z14_lo, z14.VnD(), i);
5720       ASSERT_EQUAL_SVE_LANE(expected_z14_hi, z14.VnD(), i + 1);
5721     }
5722 
5723     lane_count = core.GetSVELaneCount(kSRegSize);
5724     uint64_t expected_z15 = (vl > (index[5] * kSRegSize)) ? 0x87868584 : 0;
5725     for (int i = 0; i < lane_count; i++) {
5726       ASSERT_EQUAL_SVE_LANE(expected_z15, z15.VnS(), i);
5727     }
5728 
5729     lane_count = core.GetSVELaneCount(kBRegSize);
5730     uint64_t expected_z16 = (vl > (index[6] * kBRegSize)) ? 0xff : 0;
5731     for (int i = 0; i < lane_count; i++) {
5732       ASSERT_EQUAL_SVE_LANE(expected_z16, z16.VnB(), i);
5733     }
5734   }
5735 }
5736 
5737 TEST_SVE(sve_permute_vector_unpredicated_unpack_vector_elements) {
5738   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5739   START();
5740 
5741   uint64_t z9_inputs[] = {0xfedcba9876543210,
5742                           0x0123456789abcdef,
5743                           0x8f8e8d8c8b8a8988,
5744                           0x8786858483828180};
5745   InsrHelper(&masm, z9.VnD(), z9_inputs);
5746 
5747   __ Sunpkhi(z10.VnH(), z9.VnB());
5748   __ Sunpkhi(z11.VnS(), z9.VnH());
5749   __ Sunpkhi(z12.VnD(), z9.VnS());
5750 
5751   __ Sunpklo(z13.VnH(), z9.VnB());
5752   __ Sunpklo(z14.VnS(), z9.VnH());
5753   __ Sunpklo(z15.VnD(), z9.VnS());
5754 
5755   __ Uunpkhi(z16.VnH(), z9.VnB());
5756   __ Uunpkhi(z17.VnS(), z9.VnH());
5757   __ Uunpkhi(z18.VnD(), z9.VnS());
5758 
5759   __ Uunpklo(z19.VnH(), z9.VnB());
5760   __ Uunpklo(z20.VnS(), z9.VnH());
5761   __ Uunpklo(z21.VnD(), z9.VnS());
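  // A worked example (derived from z9's lowest byte, 0x80): `Sunpklo`
  // sign-extends each low-half B lane, giving the H lane 0xff80, while
  // `Uunpklo` zero-extends the same byte, giving 0x0080.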
5762 
5763   // Test unpacking with same source and destination.
5764   __ Mov(z22, z9);
5765   __ Sunpklo(z22.VnH(), z22.VnB());
5766   __ Mov(z23, z9);
5767   __ Uunpklo(z23.VnH(), z23.VnB());
5768 
5769   END();
5770 
5771   if (CAN_RUN()) {
5772     RUN();
5773 
5774     // Sunpkhi
5775     int lane_count = core.GetSVELaneCount(kHRegSize);
5776     for (int i = lane_count - 1; i >= 0; i--) {
5777       uint16_t expected = core.zreg_lane<uint16_t>(z10.GetCode(), i);
5778       uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5779       uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5780       ASSERT_EQUAL_64(expected, input);
5781     }
5782 
5783     lane_count = core.GetSVELaneCount(kSRegSize);
5784     for (int i = lane_count - 1; i >= 0; i--) {
5785       uint32_t expected = core.zreg_lane<uint32_t>(z11.GetCode(), i);
5786       uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5787       uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5788       ASSERT_EQUAL_64(expected, input);
5789     }
5790 
5791     lane_count = core.GetSVELaneCount(kDRegSize);
5792     for (int i = lane_count - 1; i >= 0; i--) {
5793       uint64_t expected = core.zreg_lane<uint64_t>(z12.GetCode(), i);
5794       uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5795       uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5796       ASSERT_EQUAL_64(expected, input);
5797     }
5798 
5799     // Sunpklo
5800     lane_count = core.GetSVELaneCount(kHRegSize);
5801     for (int i = lane_count - 1; i >= 0; i--) {
5802       uint16_t expected = core.zreg_lane<uint16_t>(z13.GetCode(), i);
5803       uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5804       uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5805       ASSERT_EQUAL_64(expected, input);
5806     }
5807 
5808     lane_count = core.GetSVELaneCount(kSRegSize);
5809     for (int i = lane_count - 1; i >= 0; i--) {
5810       uint32_t expected = core.zreg_lane<uint32_t>(z14.GetCode(), i);
5811       uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5812       uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5813       ASSERT_EQUAL_64(expected, input);
5814     }
5815 
5816     lane_count = core.GetSVELaneCount(kDRegSize);
5817     for (int i = lane_count - 1; i >= 0; i--) {
5818       uint64_t expected = core.zreg_lane<uint64_t>(z15.GetCode(), i);
5819       uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5820       uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5821       ASSERT_EQUAL_64(expected, input);
5822     }
5823 
5824     // Uunpkhi
5825     lane_count = core.GetSVELaneCount(kHRegSize);
5826     for (int i = lane_count - 1; i >= 0; i--) {
5827       uint16_t expected = core.zreg_lane<uint16_t>(z16.GetCode(), i);
5828       uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5829       ASSERT_EQUAL_64(expected, input);
5830     }
5831 
5832     lane_count = core.GetSVELaneCount(kSRegSize);
5833     for (int i = lane_count - 1; i >= 0; i--) {
5834       uint32_t expected = core.zreg_lane<uint32_t>(z17.GetCode(), i);
5835       uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5836       ASSERT_EQUAL_64(expected, input);
5837     }
5838 
5839     lane_count = core.GetSVELaneCount(kDRegSize);
5840     for (int i = lane_count - 1; i >= 0; i--) {
5841       uint64_t expected = core.zreg_lane<uint64_t>(z18.GetCode(), i);
5842       uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5843       ASSERT_EQUAL_64(expected, input);
5844     }
5845 
5846     // Uunpklo
5847     lane_count = core.GetSVELaneCount(kHRegSize);
5848     for (int i = lane_count - 1; i >= 0; i--) {
5849       uint16_t expected = core.zreg_lane<uint16_t>(z19.GetCode(), i);
5850       uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5851       ASSERT_EQUAL_64(expected, input);
5852     }
5853 
5854     lane_count = core.GetSVELaneCount(kSRegSize);
5855     for (int i = lane_count - 1; i >= 0; i--) {
5856       uint32_t expected = core.zreg_lane<uint32_t>(z20.GetCode(), i);
5857       uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5858       ASSERT_EQUAL_64(expected, input);
5859     }
5860 
5861     lane_count = core.GetSVELaneCount(kDRegSize);
5862     for (int i = lane_count - 1; i >= 0; i--) {
5863       uint64_t expected = core.zreg_lane<uint64_t>(z21.GetCode(), i);
5864       uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5865       ASSERT_EQUAL_64(expected, input);
5866     }
5867 
5868     ASSERT_EQUAL_SVE(z13, z22);
5869     ASSERT_EQUAL_SVE(z19, z23);
5870   }
5871 }
5872 
5873 TEST_SVE(sve_cnot_not) {
5874   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5875   START();
5876 
5877   uint64_t in[] = {0x0000000000000000, 0x00000000e1c30000, 0x123456789abcdef0};
5878 
5879   // For simplicity, we re-use the same pg for various lane sizes.
5880   // For D lanes:         1,                      1,                      0
5881   // For S lanes:         1,          1,          1,          0,          0
5882   // For H lanes:   0,    1,    0,    1,    1,    1,    0,    0,    1,    0
5883   int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
5884   Initialise(&masm, p0.VnB(), pg_in);
5885   PRegisterM pg = p0.Merging();
5886 
5887   // These are merging operations, so we have to initialise the result register.
5888   // We use a mixture of constructive and destructive operations.
5889 
5890   InsrHelper(&masm, z31.VnD(), in);
5891   // Make a copy so we can check that constructive operations preserve zn.
5892   __ Mov(z30, z31);
5893 
5894   // For constructive operations, use a different initial result value.
5895   __ Index(z29.VnB(), 0, -1);
5896 
5897   __ Mov(z0, z31);
5898   __ Cnot(z0.VnB(), pg, z0.VnB());  // destructive
5899   __ Mov(z1, z29);
5900   __ Cnot(z1.VnH(), pg, z31.VnH());
5901   __ Mov(z2, z31);
5902   __ Cnot(z2.VnS(), pg, z2.VnS());  // destructive
5903   __ Mov(z3, z29);
5904   __ Cnot(z3.VnD(), pg, z31.VnD());
5905 
5906   __ Mov(z4, z29);
5907   __ Not(z4.VnB(), pg, z31.VnB());
5908   __ Mov(z5, z31);
5909   __ Not(z5.VnH(), pg, z5.VnH());  // destructive
5910   __ Mov(z6, z29);
5911   __ Not(z6.VnS(), pg, z31.VnS());
5912   __ Mov(z7, z31);
5913   __ Not(z7.VnD(), pg, z7.VnD());  // destructive
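  // For reference: `Cnot` is a logical NOT, setting each active lane to 1
  // if its input is zero and to 0 otherwise, while `Not` is a bitwise NOT,
  // inverting every bit of each active lane. E.g. a B lane holding 0xf0
  // gives 0x00 under `Cnot` and 0x0f under `Not`.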
5914 
5915   END();
5916 
5917   if (CAN_RUN()) {
5918     RUN();
5919 
5920     // Check that constructive operations preserve their inputs.
5921     ASSERT_EQUAL_SVE(z30, z31);
5922 
5923     // clang-format off
5924 
5925     // Cnot (B) destructive
5926     uint64_t expected_z0[] =
5927     // pg:  0 0 0 0 1 0 1 1     1 0 0 1 0 1 1 1     0 0 1 0 1 1 1 0
5928         {0x0000000001000101, 0x01000001e1000101, 0x12340078000000f0};
5929     ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
5930 
5931     // Cnot (H)
5932     uint64_t expected_z1[] =
5933     // pg:    0   0   0   1       0   1   1   1       0   0   1   0
5934         {0xe9eaebecedee0001, 0xf1f2000100000001, 0xf9fafbfc0000ff00};
5935     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
5936 
5937     // Cnot (S) destructive
5938     uint64_t expected_z2[] =
5939     // pg:        0       1           1       1           0       0
5940         {0x0000000000000001, 0x0000000100000000, 0x123456789abcdef0};
5941     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
5942 
5943     // Cnot (D)
5944     uint64_t expected_z3[] =
5945     // pg:                1                   1                   0
5946         {0x0000000000000001, 0x0000000000000000, 0xf9fafbfcfdfeff00};
5947     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
5948 
5949     // Not (B)
5950     uint64_t expected_z4[] =
5951     // pg:  0 0 0 0 1 0 1 1     1 0 0 1 0 1 1 1     0 0 1 0 1 1 1 0
5952         {0xe9eaebecffeeffff, 0xfff2f3fff53cffff, 0xf9faa9fc65432100};
5953     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
5954 
5955     // Not (H) destructive
5956     uint64_t expected_z5[] =
5957     // pg:    0   0   0   1       0   1   1   1       0   0   1   0
5958         {0x000000000000ffff, 0x0000ffff1e3cffff, 0x123456786543def0};
5959     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
5960 
5961     // Not (S)
5962     uint64_t expected_z6[] =
5963     // pg:        0       1           1       1           0       0
5964         {0xe9eaebecffffffff, 0xffffffff1e3cffff, 0xf9fafbfcfdfeff00};
5965     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
5966 
5967     // Not (D) destructive
5968     uint64_t expected_z7[] =
5969     // pg:                1                   1                   0
5970         {0xffffffffffffffff, 0xffffffff1e3cffff, 0x123456789abcdef0};
5971     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
5972 
5973     // clang-format on
5974   }
5975 }
5976 
5977 TEST_SVE(sve_fabs_fneg) {
5978   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5979   START();
5980 
5981   // Include FP64, FP32 and FP16 signalling NaNs. Most FP operations quieten
5982   // NaNs, but fabs and fneg do not.
5983   uint64_t in[] = {0xc04500004228d140,  // Recognisable (+/-42) values.
5984                    0xfff00000ff80fc01,  // Signalling NaNs.
5985                    0x123456789abcdef0};
5986 
5987   // For simplicity, we re-use the same pg for various lane sizes.
5988   // For D lanes:         1,                      1,                      0
5989   // For S lanes:         1,          1,          1,          0,          0
5990   // For H lanes:   0,    1,    0,    1,    1,    1,    0,    0,    1,    0
5991   int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
5992   Initialise(&masm, p0.VnB(), pg_in);
5993   PRegisterM pg = p0.Merging();
5994 
5995   // These are merging operations, so we have to initialise the result register.
5996   // We use a mixture of constructive and destructive operations.
5997 
5998   InsrHelper(&masm, z31.VnD(), in);
5999   // Make a copy so we can check that constructive operations preserve zn.
6000   __ Mov(z30, z31);
6001 
6002   // For constructive operations, use a different initial result value.
6003   __ Index(z29.VnB(), 0, -1);
6004 
6005   __ Mov(z0, z29);
6006   __ Fabs(z0.VnH(), pg, z31.VnH());
6007   __ Mov(z1, z31);
6008   __ Fabs(z1.VnS(), pg, z1.VnS());  // destructive
6009   __ Mov(z2, z29);
6010   __ Fabs(z2.VnD(), pg, z31.VnD());
6011 
6012   __ Mov(z3, z31);
6013   __ Fneg(z3.VnH(), pg, z3.VnH());  // destructive
6014   __ Mov(z4, z29);
6015   __ Fneg(z4.VnS(), pg, z31.VnS());
6016   __ Mov(z5, z31);
6017   __ Fneg(z5.VnD(), pg, z5.VnD());  // destructive
6018 
6019   END();
6020 
6021   if (CAN_RUN()) {
6022     RUN();
6023 
6024     // Check that constructive operations preserve their inputs.
6025     ASSERT_EQUAL_SVE(z30, z31);
6026 
6027     // clang-format off
6028 
6029     // Fabs (H)
6030     uint64_t expected_z0[] =
6031     // pg:    0   0   0   1       0   1   1   1       0   0   1   0
6032         {0xe9eaebecedee5140, 0xf1f200007f807c01, 0xf9fafbfc1abcff00};
6033     ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6034 
6035     // Fabs (S) destructive
6036     uint64_t expected_z1[] =
6037     // pg:        0       1           1       1           0       0
6038         {0xc04500004228d140, 0x7ff000007f80fc01, 0x123456789abcdef0};
6039     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6040 
6041     // Fabs (D)
6042     uint64_t expected_z2[] =
6043     // pg:                1                   1                   0
6044         {0x404500004228d140, 0x7ff00000ff80fc01, 0xf9fafbfcfdfeff00};
6045     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6046 
6047     // Fneg (H) destructive
6048     uint64_t expected_z3[] =
6049     // pg:    0   0   0   1       0   1   1   1       0   0   1   0
6050         {0xc045000042285140, 0xfff080007f807c01, 0x123456781abcdef0};
6051     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6052 
6053     // Fneg (S)
6054     uint64_t expected_z4[] =
6055     // pg:        0       1           1       1           0       0
6056         {0xe9eaebecc228d140, 0x7ff000007f80fc01, 0xf9fafbfcfdfeff00};
6057     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6058 
6059     // Fneg (D) destructive
6060     uint64_t expected_z5[] =
6061     // pg:                1                   1                   0
6062         {0x404500004228d140, 0x7ff00000ff80fc01, 0x123456789abcdef0};
6063     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6064 
6065     // clang-format on
6066   }
6067 }
6068 
6069 TEST_SVE(sve_cls_clz_cnt) {
6070   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6071   START();
6072 
6073   uint64_t in[] = {0x0000000000000000, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6074 
6075   // For simplicity, we re-use the same pg for various lane sizes.
6076   // For D lanes:         1,                      1,                      0
6077   // For S lanes:         1,          1,          1,          0,          0
6078   // For H lanes:   0,    1,    0,    1,    1,    1,    0,    0,    1,    0
6079   int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6080   Initialise(&masm, p0.VnB(), pg_in);
6081   PRegisterM pg = p0.Merging();
6082 
6083   // These are merging operations, so we have to initialise the result register.
6084   // We use a mixture of constructive and destructive operations.
6085 
6086   InsrHelper(&masm, z31.VnD(), in);
6087   // Make a copy so we can check that constructive operations preserve zn.
6088   __ Mov(z30, z31);
6089 
6090   // For constructive operations, use a different initial result value.
6091   __ Index(z29.VnB(), 0, -1);
6092 
6093   __ Mov(z0, z29);
6094   __ Cls(z0.VnB(), pg, z31.VnB());
6095   __ Mov(z1, z31);
6096   __ Clz(z1.VnH(), pg, z1.VnH());  // destructive
6097   __ Mov(z2, z29);
6098   __ Cnt(z2.VnS(), pg, z31.VnS());
6099   __ Mov(z3, z31);
6100   __ Cnt(z3.VnD(), pg, z3.VnD());  // destructive
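  // A worked example for a B lane holding 0x0f (0b00001111): `Cls` returns
  // 3 (three copies of the sign bit following bit 7), `Clz` returns 4
  // (four leading zero bits), and `Cnt` returns 4 (four set bits).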
6101 
6102   END();
6103 
6104   if (CAN_RUN()) {
6105     RUN();
6106     // Check that non-destructive operations preserve their inputs.
6107     ASSERT_EQUAL_SVE(z30, z31);
6108 
6109     // clang-format off
6110 
6111     // cls (B)
6112     uint8_t expected_z0[] =
6113     // pg:  0     0     0     0     1     0     1     1
6114     // pg:  1     0     0     1     0     1     1     1
6115     // pg:  0     0     1     0     1     1     1     0
6116         {0xe9, 0xea, 0xeb, 0xec,    7, 0xee,    7,    7,
6117             6, 0xf2, 0xf3,    3, 0xf5,    1,    0,    3,
6118          0xf9, 0xfa,    0, 0xfc,    0,    0,    1, 0x00};
6119     ASSERT_EQUAL_SVE(expected_z0, z0.VnB());
6120 
6121     // clz (H) destructive
6122     uint16_t expected_z1[] =
6123     // pg:    0       0       0       1
6124     // pg:    0       1       1       1
6125     // pg:    0       0       1       0
6126         {0x0000, 0x0000, 0x0000,     16,
6127          0xfefc,      0,      0,      0,
6128          0x1234, 0x5678,      0, 0xdef0};
6129     ASSERT_EQUAL_SVE(expected_z1, z1.VnH());
6130 
6131     // cnt (S)
6132     uint32_t expected_z2[] =
6133     // pg:        0           1
6134     // pg:        1           1
6135     // pg:        0           0
6136         {0xe9eaebec,          0,
6137                  22,         16,
6138          0xf9fafbfc, 0xfdfeff00};
6139     ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
6140 
6141     // cnt (D) destructive
6142     uint64_t expected_z3[] =
6143     // pg:                1                   1                   0
6144         {                 0,                 38, 0x123456789abcdef0};
6145     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6146 
6147     // clang-format on
6148   }
6149 }
6150 
6151 TEST_SVE(sve_sxt) {
6152   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6153   START();
6154 
6155   uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6156 
6157   // For simplicity, we re-use the same pg for various lane sizes.
6158   // For D lanes:         1,                      1,                      0
6159   // For S lanes:         1,          1,          1,          0,          0
6160   // For H lanes:   0,    1,    0,    1,    1,    1,    0,    0,    1,    0
6161   int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6162   Initialise(&masm, p0.VnB(), pg_in);
6163   PRegisterM pg = p0.Merging();
6164 
6165   // These are merging operations, so we have to initialise the result register.
6166   // We use a mixture of constructive and destructive operations.
6167 
6168   InsrHelper(&masm, z31.VnD(), in);
6169   // Make a copy so we can check that constructive operations preserve zn.
6170   __ Mov(z30, z31);
6171 
6172   // For constructive operations, use a different initial result value.
6173   __ Index(z29.VnB(), 0, -1);
6174 
6175   __ Mov(z0, z31);
6176   __ Sxtb(z0.VnH(), pg, z0.VnH());  // destructive
6177   __ Mov(z1, z29);
6178   __ Sxtb(z1.VnS(), pg, z31.VnS());
6179   __ Mov(z2, z31);
6180   __ Sxtb(z2.VnD(), pg, z2.VnD());  // destructive
6181   __ Mov(z3, z29);
6182   __ Sxth(z3.VnS(), pg, z31.VnS());
6183   __ Mov(z4, z31);
6184   __ Sxth(z4.VnD(), pg, z4.VnD());  // destructive
6185   __ Mov(z5, z29);
6186   __ Sxtw(z5.VnD(), pg, z31.VnD());
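  // For example: `Sxtb` on the H lane 0x07f8 sign-extends bit 7 of the low
  // byte, giving 0xfff8, as expected_z0 below shows.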
6187 
6188   END();
6189 
6190   if (CAN_RUN()) {
6191     RUN();
6192     // Check that constructive operations preserve their inputs.
6193     ASSERT_EQUAL_SVE(z30, z31);
6194 
6195     // clang-format off
6196 
6197     // Sxtb (H) destructive
6198     uint64_t expected_z0[] =
6199     // pg:    0   0   0   1       0   1   1   1       0   0   1   0
6200         {0x01f203f405f6fff8, 0xfefcfff0ffc3000f, 0x12345678ffbcdef0};
6201     ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6202 
6203     // Sxtb (S)
6204     uint64_t expected_z1[] =
6205     // pg:        0       1           1       1           0       0
6206         {0xe9eaebecfffffff8, 0xfffffff00000000f, 0xf9fafbfcfdfeff00};
6207     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6208 
6209     // Sxtb (D) destructive
6210     uint64_t expected_z2[] =
6211     // pg:                1                   1                   0
6212         {0xfffffffffffffff8, 0x000000000000000f, 0x123456789abcdef0};
6213     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6214 
6215     // Sxth (S)
6216     uint64_t expected_z3[] =
6217     // pg:        0       1           1       1           0       0
6218         {0xe9eaebec000007f8, 0xfffff8f0ffff870f, 0xf9fafbfcfdfeff00};
6219     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6220 
6221     // Sxth (D) destructive
6222     uint64_t expected_z4[] =
6223     // pg:                1                   1                   0
6224         {0x00000000000007f8, 0xffffffffffff870f, 0x123456789abcdef0};
6225     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6226 
6227     // Sxtw (D)
6228     uint64_t expected_z5[] =
6229     // pg:                1                   1                   0
6230         {0x0000000005f607f8, 0xffffffffe1c3870f, 0xf9fafbfcfdfeff00};
6231     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6232 
6233     // clang-format on
6234   }
6235 }
6236 
6237 TEST_SVE(sve_uxt) {
6238   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6239   START();
6240 
6241   uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6242 
6243   // For simplicity, we re-use the same pg for various lane sizes.
6244   // For D lanes:         1,                      1,                      0
6245   // For S lanes:         1,          1,          1,          0,          0
6246   // For H lanes:   0,    1,    0,    1,    1,    1,    0,    0,    1,    0
6247   int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6248   Initialise(&masm, p0.VnB(), pg_in);
6249   PRegisterM pg = p0.Merging();
6250 
6251   // These are merging operations, so we have to initialise the result register.
6252   // We use a mixture of constructive and destructive operations.
6253 
6254   InsrHelper(&masm, z31.VnD(), in);
6255   // Make a copy so we can check that constructive operations preserve zn.
6256   __ Mov(z30, z31);
6257 
6258   // For constructive operations, use a different initial result value.
6259   __ Index(z29.VnB(), 0, -1);
6260 
6261   __ Mov(z0, z29);
6262   __ Uxtb(z0.VnH(), pg, z31.VnH());
6263   __ Mov(z1, z31);
6264   __ Uxtb(z1.VnS(), pg, z1.VnS());  // destructive
6265   __ Mov(z2, z29);
6266   __ Uxtb(z2.VnD(), pg, z31.VnD());
6267   __ Mov(z3, z31);
6268   __ Uxth(z3.VnS(), pg, z3.VnS());  // destructive
6269   __ Mov(z4, z29);
6270   __ Uxth(z4.VnD(), pg, z31.VnD());
6271   __ Mov(z5, z31);
6272   __ Uxtw(z5.VnD(), pg, z5.VnD());  // destructive
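  // For example: `Uxtb` on the H lane 0x07f8 clears all bits above the low
  // byte, giving 0x00f8, as expected_z0 below shows.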
6273 
6274   END();
6275 
6276   if (CAN_RUN()) {
6277     RUN();
6278     // clang-format off
6279 
6280     // Uxtb (H)
6281     uint64_t expected_z0[] =
6282     // pg:    0   0   0   1       0   1   1   1       0   0   1   0
6283         {0xe9eaebecedee00f8, 0xf1f200f000c3000f, 0xf9fafbfc00bcff00};
6284     ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6285 
6286     // Uxtb (S) destructive
6287     uint64_t expected_z1[] =
6288     // pg:        0       1           1       1           0       0
6289         {0x01f203f4000000f8, 0x000000f00000000f, 0x123456789abcdef0};
6290     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6291 
6292     // Uxtb (D)
6293     uint64_t expected_z2[] =
6294     // pg:                1                   1                   0
6295         {0x00000000000000f8, 0x000000000000000f, 0xf9fafbfcfdfeff00};
6296     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6297 
6298     // Uxth (S) destructive
6299     uint64_t expected_z3[] =
6300     // pg:        0       1           1       1           0       0
6301         {0x01f203f4000007f8, 0x0000f8f00000870f, 0x123456789abcdef0};
6302     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6303 
6304     // Uxth (D)
6305     uint64_t expected_z4[] =
6306     // pg:                1                   1                   0
6307         {0x00000000000007f8, 0x000000000000870f, 0xf9fafbfcfdfeff00};
6308     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6309 
6310     // Uxtw (D) destructive
6311     uint64_t expected_z5[] =
6312     // pg:                1                   1                   0
6313         {0x0000000005f607f8, 0x00000000e1c3870f, 0x123456789abcdef0};
6314     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6315 
6316     // clang-format on
6317   }
6318 }
6319 
6320 TEST_SVE(sve_abs_neg) {
6321   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6322   START();
6323 
6324   uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6325 
6326   // For simplicity, we re-use the same pg for various lane sizes.
6327   // For D lanes:         1,                      1,                      0
6328   // For S lanes:         1,          1,          1,          0,          0
6329   // For H lanes:   0,    1,    0,    1,    1,    1,    0,    0,    1,    0
6330   int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6331   Initialise(&masm, p0.VnB(), pg_in);
6332   PRegisterM pg = p0.Merging();
6333 
6336   // These are merging operations, so we have to initialise the result register.
6337   // We use a mixture of constructive and destructive operations.
6338 
6339   InsrHelper(&masm, z31.VnD(), in);
6340   // Make a copy so we can check that constructive operations preserve zn.
6341   __ Mov(z30, z31);
6342 
6343   // For constructive operations, use a different initial result value.
6344   __ Index(z29.VnB(), 0, -1);
6345 
6346   __ Mov(z0, z31);
6347   __ Abs(z0.VnD(), pg, z0.VnD());  // destructive
6348   __ Mov(z1, z29);
6349   __ Abs(z1.VnB(), pg, z31.VnB());
6350 
6351   __ Mov(z2, z31);
6352   __ Neg(z2.VnH(), pg, z2.VnH());  // destructive
6353   __ Mov(z3, z29);
6354   __ Neg(z3.VnS(), pg, z31.VnS());
6355 
6356   // The unpredicated form of `Neg` is implemented using `subr`.
6357   __ Mov(z4, z31);
6358   __ Neg(z4.VnB(), z4.VnB());  // destructive
6359   __ Mov(z5, z29);
6360   __ Neg(z5.VnD(), z31.VnD());
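  // Per the comment above, a plausible emitted sequence for the
  // destructive case (the exact operand form is an assumption):
  //   Neg(z4.VnB(), z4.VnB())  =>  subr z4.b, z4.b, #0   // z4 = 0 - z4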
6361 
6362   END();
6363 
6364   if (CAN_RUN()) {
6365     RUN();
6366 
6367     ASSERT_EQUAL_SVE(z30, z31);
6368 
6369     // clang-format off
6370 
6371     // Abs (D) destructive
6372     uint64_t expected_z0[] =
6373     // pg:                1                   1                   0
6374         {0x01f203f405f607f8, 0x0103070f1e3c78f1, 0x123456789abcdef0};
6375     ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6376 
6377     // Abs (B)
6378     uint64_t expected_z1[] =
6379     // pg:  0 0 0 0 1 0 1 1     1 0 0 1 0 1 1 1     0 0 1 0 1 1 1 0
6380         {0xe9eaebec05ee0708, 0x02f2f310f53d790f, 0xf9fa56fc66442200};
6381     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6382 
6383     // Neg (H) destructive
6384     uint64_t expected_z2[] =
6385     // pg:    0   0   0   1       0   1   1   1       0   0   1   0
6386         {0x01f203f405f6f808, 0xfefc07101e3d78f1, 0x123456786544def0};
6387     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6388 
6389     // Neg (S)
6390     uint64_t expected_z3[] =
6391     // pg:        0       1           1       1           0       0
6392         {0xe9eaebecfa09f808, 0x010307101e3c78f1, 0xf9fafbfcfdfeff00};
6393     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6394 
6395     // Neg (B) destructive, unpredicated
6396     uint64_t expected_z4[] =
6397         {0xff0efd0cfb0af908, 0x020408101f3d79f1, 0xeeccaa8866442210};
6398     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6399 
6400     // Neg (D) unpredicated
6401     uint64_t expected_z5[] =
6402         {0xfe0dfc0bfa09f808, 0x0103070f1e3c78f1, 0xedcba98765432110};
6403     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6404 
6405     // clang-format on
6406   }
6407 }
6408 
6409 TEST_SVE(sve_cpy) {
6410   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
6411   START();
6412 
6413   // For simplicity, we re-use the same pg for various lane sizes.
6414   // For D lanes:         0,                      1,                      1
6415   // For S lanes:         0,          1,          1,          0,          1
6416   // For H lanes:   1,    0,    0,    1,    0,    1,    1,    0,    0,    1
6417   int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6418 
6419   PRegisterM pg = p7.Merging();
6420   Initialise(&masm, pg.VnB(), pg_in);
6421 
6422   // These are merging operations, so we have to initialise the result registers
6423   // for each operation.
6424   for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6425     __ Index(ZRegister(i, kBRegSize), 0, -1);
6426   }
6427 
6428   // Recognisable values to copy.
6429   __ Mov(x0, 0xdeadbeefdeadbe42);
6430   __ Mov(x1, 0xdeadbeefdead8421);
6431   __ Mov(x2, 0xdeadbeef80042001);
6432   __ Mov(x3, 0x8000000420000001);
6433 
6434   // Use NEON moves to avoid testing SVE `cpy` against itself.
6435   __ Dup(v28.V2D(), x0);
6436   __ Dup(v29.V2D(), x1);
6437   __ Dup(v30.V2D(), x2);
6438   __ Dup(v31.V2D(), x3);
6439 
6440   // Register forms (CPY_z_p_r)
6441   __ Cpy(z0.VnB(), pg, w0);
6442   __ Cpy(z1.VnH(), pg, x1);  // X registers are accepted for small lanes.
6443   __ Cpy(z2.VnS(), pg, w2);
6444   __ Cpy(z3.VnD(), pg, x3);
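  // `Cpy` writes only the active lanes; inactive lanes keep the `Index`
  // pattern set up above. Only the bits matching the lane size are used,
  // so the B-lane copy of w0 stores 0x42 and the H-lane copy of x1 stores
  // 0x8421, as the expected values below show.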
6445 
6446   // VRegister forms (CPY_z_p_v)
6447   __ Cpy(z4.VnB(), pg, b28);
6448   __ Cpy(z5.VnH(), pg, h29);
6449   __ Cpy(z6.VnS(), pg, s30);
6450   __ Cpy(z7.VnD(), pg, d31);
6451 
6452   // Check that we can copy the stack pointer.
6453   __ Mov(x10, sp);
6454   __ Mov(sp, 0xabcabcabcabcabca);  // Set sp to a known value.
6455   __ Cpy(z16.VnB(), pg, sp);
6456   __ Cpy(z17.VnH(), pg, wsp);
6457   __ Cpy(z18.VnS(), pg, wsp);
6458   __ Cpy(z19.VnD(), pg, sp);
6459   __ Mov(sp, x10);  // Restore sp.
6460 
6461   END();
6462 
6463   if (CAN_RUN()) {
6464     RUN();
6465     // clang-format off
6466 
6467     uint64_t expected_b[] =
6468     // pg:  0 0 0 0 1 1 1 0     1 0 0 1 1 0 1 1     0 1 0 0 0 0 0 1
6469         {0xe9eaebec424242f0, 0x42f2f34242f64242, 0xf942fbfcfdfeff42};
6470     ASSERT_EQUAL_SVE(expected_b, z0.VnD());
6471     ASSERT_EQUAL_SVE(expected_b, z4.VnD());
6472 
6473     uint64_t expected_h[] =
6474     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6475         {0xe9eaebec8421eff0, 0xf1f28421f5f68421, 0x8421fbfcfdfe8421};
6476     ASSERT_EQUAL_SVE(expected_h, z1.VnD());
6477     ASSERT_EQUAL_SVE(expected_h, z5.VnD());
6478 
6479     uint64_t expected_s[] =
6480     // pg:        0       0           1       1           0       1
6481         {0xe9eaebecedeeeff0, 0x8004200180042001, 0xf9fafbfc80042001};
6482     ASSERT_EQUAL_SVE(expected_s, z2.VnD());
6483     ASSERT_EQUAL_SVE(expected_s, z6.VnD());
6484 
6485     uint64_t expected_d[] =
6486     // pg:                0                   1                   1
6487         {0xe9eaebecedeeeff0, 0x8000000420000001, 0x8000000420000001};
6488     ASSERT_EQUAL_SVE(expected_d, z3.VnD());
6489     ASSERT_EQUAL_SVE(expected_d, z7.VnD());
6490 
6491 
6492     uint64_t expected_b_sp[] =
6493     // pg:  0 0 0 0 1 1 1 0     1 0 0 1 1 0 1 1     0 1 0 0 0 0 0 1
6494         {0xe9eaebeccacacaf0, 0xcaf2f3cacaf6caca, 0xf9cafbfcfdfeffca};
6495     ASSERT_EQUAL_SVE(expected_b_sp, z16.VnD());
6496 
6497     uint64_t expected_h_sp[] =
6498     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6499         {0xe9eaebecabcaeff0, 0xf1f2abcaf5f6abca, 0xabcafbfcfdfeabca};
6500     ASSERT_EQUAL_SVE(expected_h_sp, z17.VnD());
6501 
6502     uint64_t expected_s_sp[] =
6503     // pg:        0       0           1       1           0       1
6504         {0xe9eaebecedeeeff0, 0xcabcabcacabcabca, 0xf9fafbfccabcabca};
6505     ASSERT_EQUAL_SVE(expected_s_sp, z18.VnD());
6506 
6507     uint64_t expected_d_sp[] =
6508     // pg:                0                   1                   1
6509         {0xe9eaebecedeeeff0, 0xabcabcabcabcabca, 0xabcabcabcabcabca};
6510     ASSERT_EQUAL_SVE(expected_d_sp, z19.VnD());
6511 
6512     // clang-format on
6513   }
6514 }
6515 
6516 TEST_SVE(sve_cpy_imm) {
6517   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6518   START();
6519 
6520   // For simplicity, we re-use the same pg for various lane sizes.
6521   // For D lanes:         0,                      1,                      1
6522   // For S lanes:         0,          1,          1,          0,          1
6523   // For H lanes:   1,    0,    0,    1,    0,    1,    1,    0,    0,    1
6524   int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6525 
6526   PRegister pg = p7;
6527   Initialise(&masm, pg.VnB(), pg_in);
6528 
6529   // These are (mostly) merging operations, so we have to initialise the result
6530   // registers for each operation.
6531   for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6532     __ Index(ZRegister(i, kBRegSize), 0, -1);
6533   }
6534 
6535   // Encodable integer forms (CPY_z_p_i)
6536   __ Cpy(z0.VnB(), pg.Merging(), 0);
6537   __ Cpy(z1.VnB(), pg.Zeroing(), 42);
6538   __ Cpy(z2.VnB(), pg.Merging(), -42);
6539   __ Cpy(z3.VnB(), pg.Zeroing(), 0xff);
6540   __ Cpy(z4.VnH(), pg.Merging(), 127);
6541   __ Cpy(z5.VnS(), pg.Zeroing(), -128);
6542   __ Cpy(z6.VnD(), pg.Merging(), -1);
6543 
6544   // Forms encodable using fcpy.
6545   __ Cpy(z7.VnH(), pg.Merging(), Float16ToRawbits(Float16(-31.0)));
6546   __ Cpy(z8.VnS(), pg.Zeroing(), FloatToRawbits(2.0f));
6547   __ Cpy(z9.VnD(), pg.Merging(), DoubleToRawbits(-4.0));
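  // These bit patterns aren't encodable as CPY integer immediates, but each
  // matches an FP value that fits in fcpy's eight-bit immediate field, so
  // the MacroAssembler can (we assume) emit `fcpy` without needing a
  // scratch register.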
6548 
6549   // Other forms use a scratch register.
6550   __ Cpy(z10.VnH(), pg.Merging(), 0xff);
6551   __ Cpy(z11.VnD(), pg.Zeroing(), 0x0123456789abcdef);
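  // One plausible expansion for the 0xff H-lane case (the scratch register
  // shown is an assumption, not taken from this file):
  //   mov w16, #0xff
  //   cpy z10.h, p7/m, w16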
6552 
6553   END();
6554 
6555   if (CAN_RUN()) {
6556     RUN();
6557     // clang-format off
6558 
6559     uint64_t expected_z0[] =
6560     // pg:  0 0 0 0 1 1 1 0     1 0 0 1 1 0 1 1     0 1 0 0 0 0 0 1
6561         {0xe9eaebec000000f0, 0x00f2f30000f60000, 0xf900fbfcfdfeff00};
6562     ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6563 
6564     uint64_t expected_z1[] =
6565     // pg:  0 0 0 0 1 1 1 0     1 0 0 1 1 0 1 1     0 1 0 0 0 0 0 1
6566         {0x000000002a2a2a00, 0x2a00002a2a002a2a, 0x002a00000000002a};
6567     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6568 
6569     uint64_t expected_z2[] =
6570     // pg:  0 0 0 0 1 1 1 0     1 0 0 1 1 0 1 1     0 1 0 0 0 0 0 1
6571         {0xe9eaebecd6d6d6f0, 0xd6f2f3d6d6f6d6d6, 0xf9d6fbfcfdfeffd6};
6572     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6573 
6574     uint64_t expected_z3[] =
6575     // pg:  0 0 0 0 1 1 1 0     1 0 0 1 1 0 1 1     0 1 0 0 0 0 0 1
6576         {0x00000000ffffff00, 0xff0000ffff00ffff, 0x00ff0000000000ff};
6577     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6578 
6579     uint64_t expected_z4[] =
6580     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6581         {0xe9eaebec007feff0, 0xf1f2007ff5f6007f, 0x007ffbfcfdfe007f};
6582     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6583 
6584     uint64_t expected_z5[] =
6585     // pg:        0       0           1       1           0       1
6586         {0x0000000000000000, 0xffffff80ffffff80, 0x00000000ffffff80};
6587     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6588 
6589     uint64_t expected_z6[] =
6590     // pg:                0                   1                   1
6591         {0xe9eaebecedeeeff0, 0xffffffffffffffff, 0xffffffffffffffff};
6592     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6593 
6594     uint64_t expected_z7[] =
6595     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6596         {0xe9eaebeccfc0eff0, 0xf1f2cfc0f5f6cfc0, 0xcfc0fbfcfdfecfc0};
6597     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6598 
6599     uint64_t expected_z8[] =
6600     // pg:        0       0           1       1           0       1
6601         {0x0000000000000000, 0x4000000040000000, 0x0000000040000000};
6602     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6603 
6604     uint64_t expected_z9[] =
6605     // pg:                0                   1                   1
6606         {0xe9eaebecedeeeff0, 0xc010000000000000, 0xc010000000000000};
6607     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6608 
6609     uint64_t expected_z10[] =
6610     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6611         {0xe9eaebec00ffeff0, 0xf1f200fff5f600ff, 0x00fffbfcfdfe00ff};
6612     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6613 
6614     uint64_t expected_z11[] =
6615     // pg:                0                   1                   1
6616         {0x0000000000000000, 0x0123456789abcdef, 0x0123456789abcdef};
6617     ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6618 
6619     // clang-format on
6620   }
6621 }
6622 
6623 TEST_SVE(sve_fcpy_imm) {
6624   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6625   START();
6626 
6627   // For simplicity, we re-use the same pg for various lane sizes.
6628   // For D lanes:         0,                      1,                      1
6629   // For S lanes:         0,          1,          1,          0,          1
6630   // For H lanes:   1,    0,    0,    1,    0,    1,    1,    0,    0,    1
6631   int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6632 
6633   PRegister pg = p7;
6634   Initialise(&masm, pg.VnB(), pg_in);
6635 
6636   // These are (mostly) merging operations, so we have to initialise the result
6637   // registers for each operation.
6638   for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6639     __ Index(ZRegister(i, kBRegSize), 0, -1);
6640   }
6641 
6642   // Encodable floating-point forms (FCPY_z_p_i)
6643   __ Fcpy(z1.VnH(), pg.Merging(), Float16(1.0));
6644   __ Fcpy(z2.VnH(), pg.Merging(), -2.0f);
6645   __ Fcpy(z3.VnH(), pg.Merging(), 3.0);
6646   __ Fcpy(z4.VnS(), pg.Merging(), Float16(-4.0));
6647   __ Fcpy(z5.VnS(), pg.Merging(), 5.0f);
6648   __ Fcpy(z6.VnS(), pg.Merging(), 6.0);
6649   __ Fcpy(z7.VnD(), pg.Merging(), Float16(7.0));
6650   __ Fcpy(z8.VnD(), pg.Merging(), 8.0f);
6651   __ Fmov(z9.VnD(), pg.Merging(), -9.0);
6652 
6653   // Unencodable immediates.
6654   __ Fcpy(z10.VnS(), pg.Merging(), 0.0);
6655   __ Fcpy(z11.VnH(), pg.Merging(), Float16(42.0));
6656   __ Fcpy(z12.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000));  // NaN
6657   __ Fcpy(z13.VnH(), pg.Merging(), kFP64NegativeInfinity);
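  // The fcpy immediate is an eight-bit field encoding values of the form
  // +/-(16 + m) / 16 * 2^e, roughly +/-0.125 to +/-31.0. Zero, 42.0, NaNs
  // and infinities all fall outside it, so these cases need a different
  // strategy, such as a scratch register.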
6658 
6659   // Fmov alias.
6660   __ Fmov(z14.VnS(), pg.Merging(), 0.0);
6661   __ Fmov(z15.VnH(), pg.Merging(), Float16(42.0));
6662   __ Fmov(z16.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000));  // NaN
6663   __ Fmov(z17.VnH(), pg.Merging(), kFP64NegativeInfinity);
6664   END();
6665 
6666   if (CAN_RUN()) {
6667     RUN();
6668     // clang-format off
6669 
6670     // 1.0 as FP16: 0x3c00
6671     uint64_t expected_z1[] =
6672     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6673         {0xe9eaebec3c00eff0, 0xf1f23c00f5f63c00, 0x3c00fbfcfdfe3c00};
6674     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6675 
6676     // -2.0 as FP16: 0xc000
6677     uint64_t expected_z2[] =
6678     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6679         {0xe9eaebecc000eff0, 0xf1f2c000f5f6c000, 0xc000fbfcfdfec000};
6680     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6681 
6682     // 3.0 as FP16: 0x4200
6683     uint64_t expected_z3[] =
6684     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6685         {0xe9eaebec4200eff0, 0xf1f24200f5f64200, 0x4200fbfcfdfe4200};
6686     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6687 
6688     // -4.0 as FP32: 0xc0800000
6689     uint64_t expected_z4[] =
6690     // pg:        0       0           1       1           0       1
6691         {0xe9eaebecedeeeff0, 0xc0800000c0800000, 0xf9fafbfcc0800000};
6692     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6693 
6694     // 5.0 as FP32: 0x40a00000
6695     uint64_t expected_z5[] =
6696     // pg:        0       0           1       1           0       1
6697         {0xe9eaebecedeeeff0, 0x40a0000040a00000, 0xf9fafbfc40a00000};
6698     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6699 
6700     // 6.0 as FP32: 0x40c00000
6701     uint64_t expected_z6[] =
6702     // pg:        0       0           1       1           0       1
6703         {0xe9eaebecedeeeff0, 0x40c0000040c00000, 0xf9fafbfc40c00000};
6704     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6705 
6706     // 7.0 as FP64: 0x401c000000000000
6707     uint64_t expected_z7[] =
6708     // pg:                0                   1                   1
6709         {0xe9eaebecedeeeff0, 0x401c000000000000, 0x401c000000000000};
6710     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6711 
6712     // 8.0 as FP64: 0x4020000000000000
6713     uint64_t expected_z8[] =
6714     // pg:                0                   1                   1
6715         {0xe9eaebecedeeeff0, 0x4020000000000000, 0x4020000000000000};
6716     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6717 
6718     // -9.0 as FP64: 0xc022000000000000
6719     uint64_t expected_z9[] =
6720     // pg:                0                   1                   1
6721         {0xe9eaebecedeeeff0, 0xc022000000000000, 0xc022000000000000};
6722     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6723 
6724     // 0.0 as FP32: 0x00000000
6725     uint64_t expected_z10[] =
6726     // pg:        0       0           1       1           0       1
6727         {0xe9eaebecedeeeff0, 0x0000000000000000, 0xf9fafbfc00000000};
6728     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6729 
6730     // 42.0 as FP16: 0x5140
6731     uint64_t expected_z11[] =
6732     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6733         {0xe9eaebec5140eff0, 0xf1f25140f5f65140, 0x5140fbfcfdfe5140};
6734     ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6735 
6736     // Signalling NaN (with payload): 0x7ff0000012340000
6737     uint64_t expected_z12[] =
6738     // pg:                0                   1                   1
6739         {0xe9eaebecedeeeff0, 0x7ff0000012340000, 0x7ff0000012340000};
6740     ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
6741 
6742     // -infinity as FP16: 0xfc00
6743     uint64_t expected_z13[] =
6744     // pg:    0   0   1   0       0   1   0   1       1   0   0   1
6745         {0xe9eaebecfc00eff0, 0xf1f2fc00f5f6fc00, 0xfc00fbfcfdfefc00};
6746     ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
6747 
6748     ASSERT_EQUAL_SVE(z10.VnD(), z14.VnD());
6749     ASSERT_EQUAL_SVE(z11.VnD(), z15.VnD());
6750     ASSERT_EQUAL_SVE(z12.VnD(), z16.VnD());
6751     ASSERT_EQUAL_SVE(z13.VnD(), z17.VnD());
6752     // clang-format on
6753   }
6754 }
6755 
6756 TEST_SVE(sve_permute_vector_unpredicated_table_lookup) {
6757   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6758   START();
6759 
6760   uint64_t table_inputs[] = {0xffeeddccbbaa9988, 0x7766554433221100};
6761 
6762   int index_b[] = {255, 255, 11, 10, 15, 14, 13, 12, 1, 0, 4, 3, 7, 6, 5, 4};
6763 
6764   int index_h[] = {5, 6, 7, 8, 2, 3, 6, 4};
6765 
6766   int index_s[] = {1, 3, 2, 31, -1};
6767 
6768   int index_d[] = {31, 1};
6769   // Initialise the register with a value that doesn't exist in the table.
6770   // Initialize the register with a value that doesn't existed in the table.
6771   __ Dup(z9.VnB(), 0x1f);
6772   InsrHelper(&masm, z9.VnD(), table_inputs);
6773 
6774   ZRegister ind_b = z0.WithLaneSize(kBRegSize);
6775   ZRegister ind_h = z1.WithLaneSize(kHRegSize);
6776   ZRegister ind_s = z2.WithLaneSize(kSRegSize);
6777   ZRegister ind_d = z3.WithLaneSize(kDRegSize);
6778 
6779   InsrHelper(&masm, ind_b, index_b);
6780   InsrHelper(&masm, ind_h, index_h);
6781   InsrHelper(&masm, ind_s, index_s);
6782   InsrHelper(&masm, ind_d, index_d);
6783 
6784   __ Tbl(z26.VnB(), z9.VnB(), ind_b);
6785 
6786   __ Tbl(z27.VnH(), z9.VnH(), ind_h);
6787 
6788   __ Tbl(z28.VnS(), z9.VnS(), ind_s);
6789 
6790   __ Tbl(z29.VnD(), z9.VnD(), ind_d);
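  // `Tbl` selects source elements by index and yields zero for any index
  // outside the current vector length. E.g. with a 128-bit VL there are
  // only 16 B lanes, so index 255 reads as zero; on a 2048-bit
  // implementation lane 255 exists and holds the 0x1f fill value.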
6791 
6792   END();
6793 
6794   if (CAN_RUN()) {
6795     RUN();
6796 
6797     // clang-format off
6798     unsigned z26_expected[] = {0x1f, 0x1f, 0xbb, 0xaa, 0xff, 0xee, 0xdd, 0xcc,
6799                                0x11, 0x00, 0x44, 0x33, 0x77, 0x66, 0x55, 0x44};
6800 
6801     unsigned z27_expected[] = {0xbbaa, 0xddcc, 0xffee, 0x1f1f,
6802                                0x5544, 0x7766, 0xddcc, 0x9988};
6803 
6804     unsigned z28_expected[] =
6805        {0x77665544, 0xffeeddcc, 0xbbaa9988, 0x1f1f1f1f, 0x1f1f1f1f};
6806 
6807     uint64_t z29_expected[] = {0x1f1f1f1f1f1f1f1f, 0xffeeddccbbaa9988};
6808     // clang-format on
6809 
6810     unsigned vl = config->sve_vl_in_bits();
6811     for (size_t i = 0; i < ArrayLength(index_b); i++) {
6812       int lane = static_cast<int>(ArrayLength(index_b) - i - 1);
6813       if (!core.HasSVELane(z26.VnB(), lane)) break;
6814       uint64_t expected = (vl > (index_b[i] * kBRegSize)) ? z26_expected[i] : 0;
6815       ASSERT_EQUAL_SVE_LANE(expected, z26.VnB(), lane);
6816     }
6817 
6818     for (size_t i = 0; i < ArrayLength(index_h); i++) {
6819       int lane = static_cast<int>(ArrayLength(index_h) - i - 1);
6820       if (!core.HasSVELane(z27.VnH(), lane)) break;
6821       uint64_t expected = (vl > (index_h[i] * kHRegSize)) ? z27_expected[i] : 0;
6822       ASSERT_EQUAL_SVE_LANE(expected, z27.VnH(), lane);
6823     }
6824 
6825     for (size_t i = 0; i < ArrayLength(index_s); i++) {
6826       int lane = static_cast<int>(ArrayLength(index_s) - i - 1);
6827       if (!core.HasSVELane(z28.VnS(), lane)) break;
6828       uint64_t expected = (vl > (index_s[i] * kSRegSize)) ? z28_expected[i] : 0;
6829       ASSERT_EQUAL_SVE_LANE(expected, z28.VnS(), lane);
6830     }
6831 
6832     for (size_t i = 0; i < ArrayLength(index_d); i++) {
6833       int lane = static_cast<int>(ArrayLength(index_d) - i - 1);
6834       if (!core.HasSVELane(z29.VnD(), lane)) break;
6835       uint64_t expected = (vl > (index_d[i] * kDRegSize)) ? z29_expected[i] : 0;
6836       ASSERT_EQUAL_SVE_LANE(expected, z29.VnD(), lane);
6837     }
6838   }
6839 }
6840 
6841 TEST_SVE(ldr_str_z_bi) {
6842   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6843   START();
6844 
6845   int vl = config->sve_vl_in_bytes();
6846 
6847   // The immediate can address [-256, 255] times the VL, so allocate enough
6848   // space to exceed that in both directions.
6849   int data_size = vl * 1024;
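  // A worked example (illustrative only): with vl == 16, the encodable
  // window is [-256 * 16, 255 * 16] bytes, while the buffer spans
  // +/-512 * VL around the mid-point chosen below, exceeding that window
  // in both directions.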
6850 
6851   uint8_t* data = new uint8_t[data_size];
6852   memset(data, 0, data_size);
6853 
6854   // Set the base half-way through the buffer so we can use negative indices.
6855   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6856 
6857   __ Index(z1.VnB(), 1, 3);
6858   __ Index(z2.VnB(), 2, 5);
6859   __ Index(z3.VnB(), 3, 7);
6860   __ Index(z4.VnB(), 4, 11);
6861   __ Index(z5.VnB(), 5, 13);
6862   __ Index(z6.VnB(), 6, 2);
6863   __ Index(z7.VnB(), 7, 3);
6864   __ Index(z8.VnB(), 8, 5);
6865   __ Index(z9.VnB(), 9, 7);
6866 
6867   // Encodable cases.
6868   __ Str(z1, SVEMemOperand(x0));
6869   __ Str(z2, SVEMemOperand(x0, 2, SVE_MUL_VL));
6870   __ Str(z3, SVEMemOperand(x0, -3, SVE_MUL_VL));
6871   __ Str(z4, SVEMemOperand(x0, 255, SVE_MUL_VL));
6872   __ Str(z5, SVEMemOperand(x0, -256, SVE_MUL_VL));
6873 
6874   // Cases that fall back on `CalculateSVEAddress`.
6875   __ Str(z6, SVEMemOperand(x0, 6 * vl));
6876   __ Str(z7, SVEMemOperand(x0, -7 * vl));
6877   __ Str(z8, SVEMemOperand(x0, 314, SVE_MUL_VL));
6878   __ Str(z9, SVEMemOperand(x0, -314, SVE_MUL_VL));
6879 
6880   // Corresponding loads.
6881   __ Ldr(z11, SVEMemOperand(x0, xzr));  // Test xzr operand.
6882   __ Ldr(z12, SVEMemOperand(x0, 2, SVE_MUL_VL));
6883   __ Ldr(z13, SVEMemOperand(x0, -3, SVE_MUL_VL));
6884   __ Ldr(z14, SVEMemOperand(x0, 255, SVE_MUL_VL));
6885   __ Ldr(z15, SVEMemOperand(x0, -256, SVE_MUL_VL));
6886 
6887   __ Ldr(z16, SVEMemOperand(x0, 6 * vl));
6888   __ Ldr(z17, SVEMemOperand(x0, -7 * vl));
6889   __ Ldr(z18, SVEMemOperand(x0, 314, SVE_MUL_VL));
6890   __ Ldr(z19, SVEMemOperand(x0, -314, SVE_MUL_VL));
6891 
6892   END();
6893 
6894   if (CAN_RUN()) {
6895     RUN();
6896 
6897     uint8_t* expected = new uint8_t[data_size];
6898     memset(expected, 0, data_size);
6899     uint8_t* middle = &expected[data_size / 2];
6900 
6901     for (int i = 0; i < vl; i++) {
6902       middle[i] = (1 + (3 * i)) & 0xff;                 // z1
6903       middle[(2 * vl) + i] = (2 + (5 * i)) & 0xff;      // z2
6904       middle[(-3 * vl) + i] = (3 + (7 * i)) & 0xff;     // z3
6905       middle[(255 * vl) + i] = (4 + (11 * i)) & 0xff;   // z4
6906       middle[(-256 * vl) + i] = (5 + (13 * i)) & 0xff;  // z5
6907       middle[(6 * vl) + i] = (6 + (2 * i)) & 0xff;      // z6
6908       middle[(-7 * vl) + i] = (7 + (3 * i)) & 0xff;     // z7
6909       middle[(314 * vl) + i] = (8 + (5 * i)) & 0xff;    // z8
6910       middle[(-314 * vl) + i] = (9 + (7 * i)) & 0xff;   // z9
6911     }
6912 
6913     ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
6914 
6915     ASSERT_EQUAL_SVE(z1, z11);
6916     ASSERT_EQUAL_SVE(z2, z12);
6917     ASSERT_EQUAL_SVE(z3, z13);
6918     ASSERT_EQUAL_SVE(z4, z14);
6919     ASSERT_EQUAL_SVE(z5, z15);
6920     ASSERT_EQUAL_SVE(z6, z16);
6921     ASSERT_EQUAL_SVE(z7, z17);
6922     ASSERT_EQUAL_SVE(z8, z18);
6923     ASSERT_EQUAL_SVE(z9, z19);
6924 
6925     delete[] expected;
6926   }
6927   delete[] data;
6928 }
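
// A hedged sketch of the encodability split the test above exercises: LDR/STR
// (vector) takes a signed 9-bit immediate scaled by VL, so only offsets in
// [-256, 255] times VL encode directly; others (such as 314 above) fall back
// on `CalculateSVEAddress`. The helper name is ours, for illustration only.
static bool IsEncodableLdrStrZImmSketch(int64_t imm_mul_vl) {
  return (imm_mul_vl >= -256) && (imm_mul_vl <= 255);
}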
6929 
6930 TEST_SVE(ldr_str_p_bi) {
6931   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6932   START();
6933 
6934   int vl = config->sve_vl_in_bytes();
6935   VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
6936   int pl = vl / kZRegBitsPerPRegBit;
6937 
6938   // The immediate can address [-256, 255] times the PL, so allocate enough
6939   // space to exceed that in both directions.
6940   int data_size = pl * 1024;
6941 
6942   uint8_t* data = new uint8_t[data_size];
6943   memset(data, 0, data_size);
6944 
6945   // Set the base half-way through the buffer so we can use negative indices.
6946   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6947 
6948   uint64_t pattern[4] = {0x1010101011101111,
6949                          0x0010111011000101,
6950                          0x1001101110010110,
6951                          0x1010110101100011};
6952   for (int i = 8; i <= 15; i++) {
6953     // Initialise p8-p15 with a conveniently-recognisable, non-zero pattern.
6954     Initialise(&masm,
6955                PRegister(i),
6956                pattern[3] * i,
6957                pattern[2] * i,
6958                pattern[1] * i,
6959                pattern[0] * i);
6960   }
6961 
6962   // Encodable cases.
6963   __ Str(p8, SVEMemOperand(x0));
6964   __ Str(p9, SVEMemOperand(x0, 2, SVE_MUL_VL));
6965   __ Str(p10, SVEMemOperand(x0, -3, SVE_MUL_VL));
6966   __ Str(p11, SVEMemOperand(x0, 255, SVE_MUL_VL));
6967 
6968   // Cases that fall back on `CalculateSVEAddress`.
6969   __ Str(p12, SVEMemOperand(x0, 6 * pl));
6970   __ Str(p13, SVEMemOperand(x0, -7 * pl));
6971   __ Str(p14, SVEMemOperand(x0, 314, SVE_MUL_VL));
6972   __ Str(p15, SVEMemOperand(x0, -314, SVE_MUL_VL));
6973 
6974   // Corresponding loads.
6975   __ Ldr(p0, SVEMemOperand(x0));
6976   __ Ldr(p1, SVEMemOperand(x0, 2, SVE_MUL_VL));
6977   __ Ldr(p2, SVEMemOperand(x0, -3, SVE_MUL_VL));
6978   __ Ldr(p3, SVEMemOperand(x0, 255, SVE_MUL_VL));
6979 
6980   __ Ldr(p4, SVEMemOperand(x0, 6 * pl));
6981   __ Ldr(p5, SVEMemOperand(x0, -7 * pl));
6982   __ Ldr(p6, SVEMemOperand(x0, 314, SVE_MUL_VL));
6983   __ Ldr(p7, SVEMemOperand(x0, -314, SVE_MUL_VL));
6984 
6985   END();
6986 
6987   if (CAN_RUN()) {
6988     RUN();
6989 
6990     uint8_t* expected = new uint8_t[data_size];
6991     memset(expected, 0, data_size);
6992     uint8_t* middle = &expected[data_size / 2];
6993 
6994     for (int i = 0; i < pl; i++) {
6995       int bit_index = (i % sizeof(pattern[0])) * kBitsPerByte;
6996       size_t index = i / sizeof(pattern[0]);
6997       VIXL_ASSERT(index < ArrayLength(pattern));
6998       uint64_t byte = (pattern[index] >> bit_index) & 0xff;
6999       // Each byte of `pattern` can be multiplied by 15 without carry.
7000       VIXL_ASSERT((byte * 15) <= 0xff);
7001 
7002       middle[i] = byte * 8;                 // p8
7003       middle[(2 * pl) + i] = byte * 9;      // p9
7004       middle[(-3 * pl) + i] = byte * 10;    // p10
7005       middle[(255 * pl) + i] = byte * 11;   // p11
7006       middle[(6 * pl) + i] = byte * 12;     // p12
7007       middle[(-7 * pl) + i] = byte * 13;    // p13
7008       middle[(314 * pl) + i] = byte * 14;   // p14
7009       middle[(-314 * pl) + i] = byte * 15;  // p15
7010     }
7011 
7012     ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7013 
7014     ASSERT_EQUAL_SVE(p0, p8);
7015     ASSERT_EQUAL_SVE(p1, p9);
7016     ASSERT_EQUAL_SVE(p2, p10);
7017     ASSERT_EQUAL_SVE(p3, p11);
7018     ASSERT_EQUAL_SVE(p4, p12);
7019     ASSERT_EQUAL_SVE(p5, p13);
7020     ASSERT_EQUAL_SVE(p6, p14);
7021     ASSERT_EQUAL_SVE(p7, p15);
7022 
7023     delete[] expected;
7024   }
7025   delete[] data;
7026 }
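
// The reference loop above extracts byte `i` of the little-endian, 256-bit
// `pattern`. A hedged standalone restatement (helper name is ours; valid for
// 0 <= i < 32, and illustrative only):
static uint8_t PatternByteSketch(const uint64_t (&pattern)[4], int i) {
  // Each uint64_t holds eight bytes, least-significant byte first.
  return static_cast<uint8_t>(pattern[i / 8] >> ((i % 8) * 8));
}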
7027 
7028 template <typename T>
7029 static void MemoryWrite(uint8_t* base, int64_t offset, int64_t index, T data) {
7030   memcpy(base + offset + (index * sizeof(data)), &data, sizeof(data));
7031 }
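// For example, `MemoryWrite(base, 4, 1, static_cast<uint16_t>(0xabcd))`
// writes to `base + 4 + (1 * 2)`: on a little-endian host, base[6] becomes
// 0xcd and base[7] becomes 0xab. The tests below use this to build their
// reference buffers lane by lane.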
7032 
7033 TEST_SVE(sve_ld1_st1_contiguous) {
7034   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7035   START();
7036 
7037   int vl = config->sve_vl_in_bytes();
7038 
7039   // The immediate can address [-8, 7] times the VL, so allocate enough space to
7040   // exceed that in both directions.
7041   int data_size = vl * 128;
7042 
7043   uint8_t* data = new uint8_t[data_size];
7044   memset(data, 0, data_size);
7045 
7046   // Set the base half-way through the buffer so we can use negative indices.
7047   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7048 
7049   // Encodable scalar-plus-immediate cases.
7050   __ Index(z1.VnB(), 1, -3);
7051   __ Ptrue(p1.VnB());
7052   __ St1b(z1.VnB(), p1, SVEMemOperand(x0));
7053 
7054   __ Index(z2.VnH(), -2, 5);
7055   __ Ptrue(p2.VnH(), SVE_MUL3);
7056   __ St1b(z2.VnH(), p2, SVEMemOperand(x0, 7, SVE_MUL_VL));
7057 
7058   __ Index(z3.VnS(), 3, -7);
7059   __ Ptrue(p3.VnS(), SVE_POW2);
7060   __ St1h(z3.VnS(), p3, SVEMemOperand(x0, -8, SVE_MUL_VL));
7061 
7062   // Encodable scalar-plus-scalar cases.
7063   __ Index(z4.VnD(), -4, 11);
7064   __ Ptrue(p4.VnD(), SVE_VL3);
7065   __ Addvl(x1, x0, 8);  // Try not to overlap with VL-dependent cases.
7066   __ Mov(x2, 17);
7067   __ St1b(z4.VnD(), p4, SVEMemOperand(x1, x2));
7068 
7069   __ Index(z5.VnD(), 6, -2);
7070   __ Ptrue(p5.VnD(), SVE_VL16);
7071   __ Addvl(x3, x0, 10);  // Try not to overlap with VL-dependent cases.
7072   __ Mov(x4, 6);
7073   __ St1d(z5.VnD(), p5, SVEMemOperand(x3, x4, LSL, 3));
7074 
7075   // Unencodable cases fall back on `CalculateSVEAddress`.
7076   __ Index(z6.VnS(), -7, 3);
7077   // Setting SVE_ALL on B lanes checks that the Simulator ignores irrelevant
7078   // predicate bits when handling larger lanes.
7079   __ Ptrue(p6.VnB(), SVE_ALL);
7080   __ St1w(z6.VnS(), p6, SVEMemOperand(x0, 42, SVE_MUL_VL));
7081 
7082   __ Index(z7.VnD(), 32, -11);
7083   __ Ptrue(p7.VnD(), SVE_MUL4);
7084   __ St1w(z7.VnD(), p7, SVEMemOperand(x0, 22, SVE_MUL_VL));
7085 
7086   // Corresponding loads.
7087   __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0));
7088   __ Ld1b(z9.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7089   __ Ld1h(z10.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7090   __ Ld1b(z11.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7091   __ Ld1d(z12.VnD(), p5.Zeroing(), SVEMemOperand(x3, x4, LSL, 3));
7092   __ Ld1w(z13.VnS(), p6.Zeroing(), SVEMemOperand(x0, 42, SVE_MUL_VL));
7093 
7094   __ Ld1sb(z14.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7095   __ Ld1sh(z15.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7096   __ Ld1sb(z16.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7097   __ Ld1sw(z17.VnD(), p7.Zeroing(), SVEMemOperand(x0, 22, SVE_MUL_VL));
7098 
7099   // We can test ld1 by comparing the value loaded with the value stored. In
7100   // most cases, there are two complications:
7101   //  - Loads have zeroing predication, so we have to clear the inactive
7102   //    elements on our reference.
7103   //  - We have to replicate any sign- or zero-extension.
7104 
7105   // Ld1b(z8.VnB(), ...)
7106   __ Dup(z18.VnB(), 0);
7107   __ Mov(z18.VnB(), p1.Merging(), z1.VnB());
7108 
7109   // Ld1b(z9.VnH(), ...)
7110   __ Dup(z19.VnH(), 0);
7111   __ Uxtb(z19.VnH(), p2.Merging(), z2.VnH());
7112 
7113   // Ld1h(z10.VnS(), ...)
7114   __ Dup(z20.VnS(), 0);
7115   __ Uxth(z20.VnS(), p3.Merging(), z3.VnS());
7116 
7117   // Ld1b(z11.VnD(), ...)
7118   __ Dup(z21.VnD(), 0);
7119   __ Uxtb(z21.VnD(), p4.Merging(), z4.VnD());
7120 
7121   // Ld1d(z12.VnD(), ...)
7122   __ Dup(z22.VnD(), 0);
7123   __ Mov(z22.VnD(), p5.Merging(), z5.VnD());
7124 
7125   // Ld1w(z13.VnS(), ...)
7126   __ Dup(z23.VnS(), 0);
7127   __ Mov(z23.VnS(), p6.Merging(), z6.VnS());
7128 
7129   // Ld1sb(z14.VnH(), ...)
7130   __ Dup(z24.VnH(), 0);
7131   __ Sxtb(z24.VnH(), p2.Merging(), z2.VnH());
7132 
7133   // Ld1sh(z15.VnS(), ...)
7134   __ Dup(z25.VnS(), 0);
7135   __ Sxth(z25.VnS(), p3.Merging(), z3.VnS());
7136 
7137   // Ld1sb(z16.VnD(), ...)
7138   __ Dup(z26.VnD(), 0);
7139   __ Sxtb(z26.VnD(), p4.Merging(), z4.VnD());
7140 
7141   // Ld1sw(z17.VnD(), ...)
7142   __ Dup(z27.VnD(), 0);
7143   __ Sxtw(z27.VnD(), p7.Merging(), z7.VnD());
7144 
7145   END();
7146 
7147   if (CAN_RUN()) {
7148     RUN();
7149 
7150     uint8_t* expected = new uint8_t[data_size];
7151     memset(expected, 0, data_size);
7152     uint8_t* middle = &expected[data_size / 2];
7153 
7154     int vl_b = vl / kBRegSizeInBytes;
7155     int vl_h = vl / kHRegSizeInBytes;
7156     int vl_s = vl / kSRegSizeInBytes;
7157     int vl_d = vl / kDRegSizeInBytes;
7158 
7159     // Encodable cases.
7160 
7161     // st1b { z1.b }, SVE_ALL
7162     for (int i = 0; i < vl_b; i++) {
7163       MemoryWrite(middle, 0, i, static_cast<uint8_t>(1 - (3 * i)));
7164     }
7165 
7166     // st1b { z2.h }, SVE_MUL3
7167     int vl_h_mul3 = vl_h - (vl_h % 3);
7168     for (int i = 0; i < vl_h_mul3; i++) {
7169       int64_t offset = 7 * static_cast<int>(vl / (kHRegSize / kBRegSize));
7170       MemoryWrite(middle, offset, i, static_cast<uint8_t>(-2 + (5 * i)));
7171     }
7172 
7173     // st1h { z3.s }, SVE_POW2
7174     int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7175     for (int i = 0; i < vl_s_pow2; i++) {
7176       int64_t offset = -8 * static_cast<int>(vl / (kSRegSize / kHRegSize));
7177       MemoryWrite(middle, offset, i, static_cast<uint16_t>(3 - (7 * i)));
7178     }
7179 
7180     // st1b { z4.d }, SVE_VL3
7181     if (vl_d >= 3) {
7182       for (int i = 0; i < 3; i++) {
7183         MemoryWrite(middle,
7184                     (8 * vl) + 17,
7185                     i,
7186                     static_cast<uint8_t>(-4 + (11 * i)));
7187       }
7188     }
7189 
7190     // st1d { z5.d }, SVE_VL16
7191     if (vl_d >= 16) {
7192       for (int i = 0; i < 16; i++) {
7193         MemoryWrite(middle,
7194                     (10 * vl) + (6 * kDRegSizeInBytes),
7195                     i,
7196                     static_cast<uint64_t>(6 - (2 * i)));
7197       }
7198     }
7199 
7200     // Unencodable cases.
7201 
7202     // st1w { z6.s }, SVE_ALL
7203     for (int i = 0; i < vl_s; i++) {
7204       MemoryWrite(middle, 42 * vl, i, static_cast<uint32_t>(-7 + (3 * i)));
7205     }
7206 
7207     // st1w { z7.d }, SVE_MUL4
7208     int vl_d_mul4 = vl_d - (vl_d % 4);
7209     for (int i = 0; i < vl_d_mul4; i++) {
7210       int64_t offset = 22 * static_cast<int>(vl / (kDRegSize / kWRegSize));
7211       MemoryWrite(middle, offset, i, static_cast<uint32_t>(32 + (-11 * i)));
7212     }
7213 
7214     ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7215 
7216     // Check that we loaded back the expected values.
7217 
7218     ASSERT_EQUAL_SVE(z18, z8);
7219     ASSERT_EQUAL_SVE(z19, z9);
7220     ASSERT_EQUAL_SVE(z20, z10);
7221     ASSERT_EQUAL_SVE(z21, z11);
7222     ASSERT_EQUAL_SVE(z22, z12);
7223     ASSERT_EQUAL_SVE(z23, z13);
7224     ASSERT_EQUAL_SVE(z24, z14);
7225     ASSERT_EQUAL_SVE(z25, z15);
7226     ASSERT_EQUAL_SVE(z26, z16);
7227     ASSERT_EQUAL_SVE(z27, z17);
7228 
7229     delete[] expected;
7230   }
7231   delete[] data;
7232 }
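
// The expected-value loops above model the Ptrue patterns used in this test.
// A hedged summary of the resulting active-lane counts, as standalone helpers
// (names are ours; illustrative only, not used by the tests):
static int PtrueMul3CountSketch(int lanes) {
  return lanes - (lanes % 3);  // SVE_MUL3: largest multiple of three.
}
static int PtruePow2CountSketch(int lanes) {
  if (lanes < 1) return 0;
  int count = 1;
  while ((count * 2) <= lanes) count *= 2;  // SVE_POW2: largest power of two.
  return count;
}
static int PtrueFixedVlCountSketch(int lanes, int n) {
  return (lanes >= n) ? n : 0;  // SVE_VL<n>: exactly n lanes, or none.
}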
7233 
7234 TEST_SVE(sve_ld2_st2_scalar_plus_imm) {
7235   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7236   START();
7237 
7238   int vl = config->sve_vl_in_bytes();
7239 
7240   // The immediate can address [-16, 14] times the VL, so allocate enough space
7241   // to exceed that in both directions.
7242   int data_size = vl * 128;
7243 
7244   uint8_t* data = new uint8_t[data_size];
7245   memset(data, 0, data_size);
7246 
7247   // Set the base half-way through the buffer so we can use negative indices.
7248   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7249 
7250   __ Index(z14.VnB(), 1, -3);
7251   __ Index(z15.VnB(), 2, -3);
7252   __ Ptrue(p0.VnB());
7253   __ St2b(z14.VnB(), z15.VnB(), p0, SVEMemOperand(x0));
7254 
7255   __ Index(z16.VnH(), -2, 5);
7256   __ Index(z17.VnH(), -3, 5);
7257   __ Ptrue(p1.VnH(), SVE_MUL3);
7258   __ St2h(z16.VnH(), z17.VnH(), p1, SVEMemOperand(x0, 8, SVE_MUL_VL));
7259 
7260   // Wrap around from z31 to z0.
7261   __ Index(z31.VnS(), 3, -7);
7262   __ Index(z0.VnS(), 4, -7);
7263   __ Ptrue(p2.VnS(), SVE_POW2);
7264   __ St2w(z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL));
7265 
7266   __ Index(z18.VnD(), -7, 3);
7267   __ Index(z19.VnD(), -8, 3);
7268   // Sparse predication, including some irrelevant bits (0xe). To make the
7269   // results easy to check, activate each lane <n> where n is a multiple of 5.
7270   Initialise(&masm,
7271              p3,
7272              0xeee10000000001ee,
7273              0xeeeeeee100000000,
7274              0x01eeeeeeeee10000,
7275              0x000001eeeeeeeee1);
7276   __ St2d(z18.VnD(), z19.VnD(), p3, SVEMemOperand(x0, 14, SVE_MUL_VL));
7277 
7278   // We can test ld2 by comparing the values loaded with the values stored.
7279   // There are two complications:
7280   //  - Loads have zeroing predication, so we have to clear the inactive
7281   //    elements on our reference.
7282   //  - We want to test both loads and stores that span { z31, z0 }, so we have
7283   //    to move some values around.
7284   //
7285   // Registers z4-z11 will hold as-stored values (with inactive elements
7286   // cleared). Registers z20-z27 will hold the values that were loaded.
7287 
7288   // Ld2b(z14.VnB(), z15.VnB(), ...)
7289   __ Dup(z4.VnB(), 0);
7290   __ Dup(z5.VnB(), 0);
7291   __ Mov(z4.VnB(), p0.Merging(), z14.VnB());
7292   __ Mov(z5.VnB(), p0.Merging(), z15.VnB());
7293 
7294   // Ld2h(z16.VnH(), z17.VnH(), ...)
7295   __ Dup(z6.VnH(), 0);
7296   __ Dup(z7.VnH(), 0);
7297   __ Mov(z6.VnH(), p1.Merging(), z16.VnH());
7298   __ Mov(z7.VnH(), p1.Merging(), z17.VnH());
7299 
7300   // Ld2w(z31.VnS(), z0.VnS(), ...)
7301   __ Dup(z8.VnS(), 0);
7302   __ Dup(z9.VnS(), 0);
7303   __ Mov(z8.VnS(), p2.Merging(), z31.VnS());
7304   __ Mov(z9.VnS(), p2.Merging(), z0.VnS());
7305 
7306   // Ld2d(z18.VnD(), z19.VnD(), ...)
7307   __ Dup(z10.VnD(), 0);
7308   __ Dup(z11.VnD(), 0);
7309   __ Mov(z10.VnD(), p3.Merging(), z18.VnD());
7310   __ Mov(z11.VnD(), p3.Merging(), z19.VnD());
7311 
7312   // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7313   __ Ld2b(z31.VnB(), z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7314   __ Mov(z20, z31);
7315   __ Mov(z21, z0);
7316 
7317   __ Ld2h(z22.VnH(), z23.VnH(), p1.Zeroing(), SVEMemOperand(x0, 8, SVE_MUL_VL));
7318   __ Ld2w(z24.VnS(),
7319           z25.VnS(),
7320           p2.Zeroing(),
7321           SVEMemOperand(x0, -12, SVE_MUL_VL));
7322   __ Ld2d(z26.VnD(),
7323           z27.VnD(),
7324           p3.Zeroing(),
7325           SVEMemOperand(x0, 14, SVE_MUL_VL));
7326 
7327   END();
7328 
7329   if (CAN_RUN()) {
7330     RUN();
7331 
7332     uint8_t* expected = new uint8_t[data_size];
7333     memset(expected, 0, data_size);
7334     uint8_t* middle = &expected[data_size / 2];
7335 
7336     int vl_b = vl / kBRegSizeInBytes;
7337     int vl_h = vl / kHRegSizeInBytes;
7338     int vl_s = vl / kSRegSizeInBytes;
7339     int vl_d = vl / kDRegSizeInBytes;
7340 
7341     int reg_count = 2;
7342 
7343     // st2b { z14.b, z15.b }, SVE_ALL
7344     for (int i = 0; i < vl_b; i++) {
7345       uint8_t lane0 = 1 - (3 * i);
7346       uint8_t lane1 = 2 - (3 * i);
7347       MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7348       MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7349     }
7350 
7351     // st2h { z16.h, z17.h }, SVE_MUL3
7352     int vl_h_mul3 = vl_h - (vl_h % 3);
7353     for (int i = 0; i < vl_h_mul3; i++) {
7354       int64_t offset = 8 * vl;
7355       uint16_t lane0 = -2 + (5 * i);
7356       uint16_t lane1 = -3 + (5 * i);
7357       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7358       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7359     }
7360 
7361     // st2w { z31.s, z0.s }, SVE_POW2
7362     int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7363     for (int i = 0; i < vl_s_pow2; i++) {
7364       int64_t offset = -12 * vl;
7365       uint32_t lane0 = 3 - (7 * i);
7366       uint32_t lane1 = 4 - (7 * i);
7367       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7368       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7369     }
7370 
7371     // st2d { z18.d, z19.d }, ((i % 5) == 0)
7372     for (int i = 0; i < vl_d; i++) {
7373       if ((i % 5) == 0) {
7374         int64_t offset = 14 * vl;
7375         uint64_t lane0 = -7 + (3 * i);
7376         uint64_t lane1 = -8 + (3 * i);
7377         MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7378         MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7379       }
7380     }
7381 
7382     ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7383 
7384     // Check that we loaded back the expected values.
7385 
7386     // st2b/ld2b
7387     ASSERT_EQUAL_SVE(z4, z20);
7388     ASSERT_EQUAL_SVE(z5, z21);
7389 
7390     // st2h/ld2h
7391     ASSERT_EQUAL_SVE(z6, z22);
7392     ASSERT_EQUAL_SVE(z7, z23);
7393 
7394     // st2w/ld2w
7395     ASSERT_EQUAL_SVE(z8, z24);
7396     ASSERT_EQUAL_SVE(z9, z25);
7397 
7398     // st2d/ld2d
7399     ASSERT_EQUAL_SVE(z10, z26);
7400     ASSERT_EQUAL_SVE(z11, z27);
7401 
7402     delete[] expected;
7403   }
7404   delete[] data;
7405 }
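
// St2 interleaves its two registers: lane i of register r becomes element
// (i * 2) + r of the memory stream, which is exactly what the MemoryWrite
// calls above encode. A hedged scalar model of the matching de-interleave,
// ignoring predication (plain arrays stand in for Z registers):
template <typename T>
static void ModelLd2Sketch(const T* mem, T* zt0, T* zt1, int lanes) {
  for (int i = 0; i < lanes; i++) {
    zt0[i] = mem[(i * 2) + 0];
    zt1[i] = mem[(i * 2) + 1];
  }
}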
7406 
7407 TEST_SVE(sve_ld2_st2_scalar_plus_scalar) {
7408   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7409   START();
7410 
7411   int vl = config->sve_vl_in_bytes();
7412 
7413   // Allocate plenty of space to enable indexing in both directions.
7414   int data_size = vl * 128;
7415 
7416   uint8_t* data = new uint8_t[data_size];
7417   memset(data, 0, data_size);
7418 
7419   // Set the base half-way through the buffer so we can use negative indices.
7420   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7421 
7422   __ Index(z10.VnB(), -4, 11);
7423   __ Index(z11.VnB(), -5, 11);
7424   __ Ptrue(p7.VnB(), SVE_MUL4);
7425   __ Mov(x1, 0);
7426   __ St2b(z10.VnB(), z11.VnB(), p7, SVEMemOperand(x0, x1));
7427 
7428   __ Index(z12.VnH(), 6, -2);
7429   __ Index(z13.VnH(), 7, -2);
7430   __ Ptrue(p6.VnH(), SVE_VL16);
7431   __ Rdvl(x2, 3);  // Make offsets VL-dependent so we can avoid overlap.
7432   __ St2h(z12.VnH(), z13.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
7433 
7434   __ Index(z14.VnS(), -7, 3);
7435   __ Index(z15.VnS(), -8, 3);
7436   // Sparse predication, including some irrelevant bits (0xe). To make the
7437   // results easy to check, activate each lane <n> where n is a multiple of 5.
7438   Initialise(&masm,
7439              p5,
7440              0xeee1000010000100,
7441              0x001eeee100001000,
7442              0x0100001eeee10000,
7443              0x10000100001eeee1);
7444   __ Rdvl(x3, -3);
7445   __ St2w(z14.VnS(), z15.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
7446 
7447   // Wrap around from z31 to z0.
7448   __ Index(z31.VnD(), 32, -11);
7449   __ Index(z0.VnD(), 33, -11);
7450   __ Ptrue(p4.VnD(), SVE_MUL3);
7451   __ Rdvl(x4, 1);
7452   __ St2d(z31.VnD(), z0.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
7453 
7454   // We can test ld2 by comparing the values loaded with the values stored.
7455   // There are two complications:
7456   //  - Loads have zeroing predication, so we have to clear the inactive
7457   //    elements on our reference.
7458   //  - We want to test both loads and stores that span { z31, z0 }, so we have
7459   //    to move some values around.
7460   //
7461   // Registers z4-z11 will hold as-stored values (with inactive elements
7462   // cleared). Registers z20-z27 will hold the values that were loaded.
7463 
7464   // Ld2b(z20.VnB(), z21.VnB(), ...)
7465   __ Dup(z4.VnB(), 0);
7466   __ Dup(z5.VnB(), 0);
7467   __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7468   __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7469 
7470   // Ld2h(z22.VnH(), z23.VnH(), ...)
7471   __ Dup(z6.VnH(), 0);
7472   __ Dup(z7.VnH(), 0);
7473   __ Mov(z6.VnH(), p6.Merging(), z12.VnH());
7474   __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7475 
7476   // Ld2w(z24.VnS(), z25.VnS(), ...)
7477   __ Dup(z8.VnS(), 0);
7478   __ Dup(z9.VnS(), 0);
7479   __ Mov(z8.VnS(), p5.Merging(), z14.VnS());
7480   __ Mov(z9.VnS(), p5.Merging(), z15.VnS());
7481 
7482   // Ld2d(z31.VnD(), z0.VnD(), ...)
7483   __ Dup(z10.VnD(), 0);
7484   __ Dup(z11.VnD(), 0);
7485   __ Mov(z10.VnD(), p4.Merging(), z31.VnD());
7486   __ Mov(z11.VnD(), p4.Merging(), z0.VnD());
7487 
7488   // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7489   __ Ld2b(z31.VnB(), z0.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1));
7490   __ Mov(z20, z31);
7491   __ Mov(z21, z0);
7492 
7493   __ Ld2h(z22.VnH(), z23.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
7494   __ Ld2w(z24.VnS(), z25.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2));
7495   __ Ld2d(z26.VnD(), z27.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3));
7496 
7497   END();
7498 
7499   if (CAN_RUN()) {
7500     RUN();
7501 
7502     uint8_t* expected = new uint8_t[data_size];
7503     memset(expected, 0, data_size);
7504     uint8_t* middle = &expected[data_size / 2];
7505 
7506     int vl_b = vl / kBRegSizeInBytes;
7507     int vl_h = vl / kHRegSizeInBytes;
7508     int vl_s = vl / kSRegSizeInBytes;
7509     int vl_d = vl / kDRegSizeInBytes;
7510 
7511     int reg_count = 2;
7512 
7513     // st2b { z10.b, z11.b }, SVE_MUL4
7514     int vl_b_mul4 = vl_b - (vl_b % 4);
7515     for (int i = 0; i < vl_b_mul4; i++) {
7516       uint8_t lane0 = -4 + (11 * i);
7517       uint8_t lane1 = -5 + (11 * i);
7518       MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7519       MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7520     }
7521 
7522     // st2h { z12.h, z13.h }, SVE_VL16
7523     if (vl_h >= 16) {
7524       for (int i = 0; i < 16; i++) {
7525         int64_t offset = (3 << kHRegSizeInBytesLog2) * vl;
7526         uint16_t lane0 = 6 - (2 * i);
7527         uint16_t lane1 = 7 - (2 * i);
7528         MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7529         MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7530       }
7531     }
7532 
7533     // st2w { z14.s, z15.s }, ((i % 5) == 0)
7534     for (int i = 0; i < vl_s; i++) {
7535       if ((i % 5) == 0) {
7536         int64_t offset = -(3 << kSRegSizeInBytesLog2) * vl;
7537         uint32_t lane0 = -7 + (3 * i);
7538         uint32_t lane1 = -8 + (3 * i);
7539         MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7540         MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7541       }
7542     }
7543 
7544     // st2d { z31.d, z0.d }, SVE_MUL3
7545     int vl_d_mul3 = vl_d - (vl_d % 3);
7546     for (int i = 0; i < vl_d_mul3; i++) {
7547       int64_t offset = (1 << kDRegSizeInBytesLog2) * vl;
7548       uint64_t lane0 = 32 - (11 * i);
7549       uint64_t lane1 = 33 - (11 * i);
7550       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7551       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7552     }
7553 
7554     ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7555 
7556     // Check that we loaded back the expected values.
7557 
7558     // st2b/ld2b
7559     ASSERT_EQUAL_SVE(z4, z20);
7560     ASSERT_EQUAL_SVE(z5, z21);
7561 
7562     // st2h/ld2h
7563     ASSERT_EQUAL_SVE(z6, z22);
7564     ASSERT_EQUAL_SVE(z7, z23);
7565 
7566     // st2w/ld2w
7567     ASSERT_EQUAL_SVE(z8, z24);
7568     ASSERT_EQUAL_SVE(z9, z25);
7569 
7570     // st2d/ld2d
7571     ASSERT_EQUAL_SVE(z10, z26);
7572     ASSERT_EQUAL_SVE(z11, z27);
7573 
7574     delete[] expected;
7575   }
7576   delete[] data;
7577 }
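
// In the scalar-plus-scalar forms above, the effective byte address is
// xn + (xm << shift), where the shift matches the element-size log2, so
// Rdvl-generated offsets stay VL-scaled. A hedged model (name is ours):
static uintptr_t ScalarPlusScalarAddressSketch(uintptr_t xn,
                                               int64_t xm,
                                               unsigned shift) {
  // Shift in the unsigned domain so negative offsets wrap modulo 2^64.
  return xn + (static_cast<uintptr_t>(xm) << shift);
}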
7578 
7579 TEST_SVE(sve_ld3_st3_scalar_plus_imm) {
7580   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7581   START();
7582 
7583   int vl = config->sve_vl_in_bytes();
7584 
7585   // The immediate can address [-24, 21] times the VL, so allocate enough space
7586   // to exceed that in both directions.
7587   int data_size = vl * 128;
7588 
7589   uint8_t* data = new uint8_t[data_size];
7590   memset(data, 0, data_size);
7591 
7592   // Set the base half-way through the buffer so we can use negative indices.
7593   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7594 
7595   // We can test ld3 by comparing the values loaded with the values stored.
7596   // There are two complications:
7597   //  - Loads have zeroing predication, so we have to clear the inactive
7598   //    elements on our reference.
7599   //  - We want to test both loads and stores that span { z31, z0 }, so we have
7600   //    to move some values around.
7601   //
7602   // Registers z4-z15 will hold as-stored values (with inactive elements
7603   // cleared). Registers z16-z27 will hold the values that were loaded.
7604 
7605   __ Index(z10.VnB(), 1, -3);
7606   __ Index(z11.VnB(), 2, -3);
7607   __ Index(z12.VnB(), 3, -3);
7608   __ Ptrue(p0.VnB());
7609   __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p0, SVEMemOperand(x0));
7610   // Save the stored values for ld3 tests.
7611   __ Dup(z4.VnB(), 0);
7612   __ Dup(z5.VnB(), 0);
7613   __ Dup(z6.VnB(), 0);
7614   __ Mov(z4.VnB(), p0.Merging(), z10.VnB());
7615   __ Mov(z5.VnB(), p0.Merging(), z11.VnB());
7616   __ Mov(z6.VnB(), p0.Merging(), z12.VnB());
7617 
7618   // Wrap around from z31 to z0.
7619   __ Index(z31.VnH(), -2, 5);
7620   __ Index(z0.VnH(), -3, 5);
7621   __ Index(z1.VnH(), -4, 5);
7622   __ Ptrue(p1.VnH(), SVE_MUL3);
7623   __ St3h(z31.VnH(), z0.VnH(), z1.VnH(), p1, SVEMemOperand(x0, 9, SVE_MUL_VL));
7624   // Save the stored values for ld3 tests.
7625   __ Dup(z7.VnH(), 0);
7626   __ Dup(z8.VnH(), 0);
7627   __ Dup(z9.VnH(), 0);
7628   __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
7629   __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
7630   __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
7631 
7632   __ Index(z30.VnS(), 3, -7);
7633   __ Index(z31.VnS(), 4, -7);
7634   __ Index(z0.VnS(), 5, -7);
7635   __ Ptrue(p2.VnS(), SVE_POW2);
7636   __ St3w(z30.VnS(),
7637           z31.VnS(),
7638           z0.VnS(),
7639           p2,
7640           SVEMemOperand(x0, -12, SVE_MUL_VL));
7641   // Save the stored values for ld3 tests.
7642   __ Dup(z10.VnS(), 0);
7643   __ Dup(z11.VnS(), 0);
7644   __ Dup(z12.VnS(), 0);
7645   __ Mov(z10.VnS(), p2.Merging(), z30.VnS());
7646   __ Mov(z11.VnS(), p2.Merging(), z31.VnS());
7647   __ Mov(z12.VnS(), p2.Merging(), z0.VnS());
7648 
7649   __ Index(z0.VnD(), -7, 3);
7650   __ Index(z1.VnD(), -8, 3);
7651   __ Index(z2.VnD(), -9, 3);
7652   // Sparse predication, including some irrelevant bits (0xee). To make the
7653   // results easy to check, activate each lane <n> where n is a multiple of 5.
7654   Initialise(&masm,
7655              p3,
7656              0xeee10000000001ee,
7657              0xeeeeeee100000000,
7658              0x01eeeeeeeee10000,
7659              0x000001eeeeeeeee1);
7660   __ St3d(z0.VnD(), z1.VnD(), z2.VnD(), p3, SVEMemOperand(x0, 15, SVE_MUL_VL));
7661   // Save the stored values for ld3 tests.
7662   __ Dup(z13.VnD(), 0);
7663   __ Dup(z14.VnD(), 0);
7664   __ Dup(z15.VnD(), 0);
7665   __ Mov(z13.VnD(), p3.Merging(), z0.VnD());
7666   __ Mov(z14.VnD(), p3.Merging(), z1.VnD());
7667   __ Mov(z15.VnD(), p3.Merging(), z2.VnD());
7668 
7669   // Corresponding loads.
7670   // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7671   __ Ld3b(z31.VnB(), z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7672   __ Mov(z16, z31);
7673   __ Mov(z17, z0);
7674   __ Mov(z18, z1);
7675   __ Ld3h(z30.VnH(),
7676           z31.VnH(),
7677           z0.VnH(),
7678           p1.Zeroing(),
7679           SVEMemOperand(x0, 9, SVE_MUL_VL));
7680   __ Mov(z19, z30);
7681   __ Mov(z20, z31);
7682   __ Mov(z21, z0);
7683   __ Ld3w(z22.VnS(),
7684           z23.VnS(),
7685           z24.VnS(),
7686           p2.Zeroing(),
7687           SVEMemOperand(x0, -12, SVE_MUL_VL));
7688   __ Ld3d(z25.VnD(),
7689           z26.VnD(),
7690           z27.VnD(),
7691           p3.Zeroing(),
7692           SVEMemOperand(x0, 15, SVE_MUL_VL));
7693 
7694   END();
7695 
7696   if (CAN_RUN()) {
7697     RUN();
7698 
7699     uint8_t* expected = new uint8_t[data_size];
7700     memset(expected, 0, data_size);
7701     uint8_t* middle = &expected[data_size / 2];
7702 
7703     int vl_b = vl / kBRegSizeInBytes;
7704     int vl_h = vl / kHRegSizeInBytes;
7705     int vl_s = vl / kSRegSizeInBytes;
7706     int vl_d = vl / kDRegSizeInBytes;
7707 
7708     int reg_count = 3;
7709 
7710     // st3b { z10.b, z11.b, z12.b }, SVE_ALL
7711     for (int i = 0; i < vl_b; i++) {
7712       uint8_t lane0 = 1 - (3 * i);
7713       uint8_t lane1 = 2 - (3 * i);
7714       uint8_t lane2 = 3 - (3 * i);
7715       MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7716       MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7717       MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
7718     }
7719 
7720     // st3h { z31.h, z0.h, z1.h }, SVE_MUL3
7721     int vl_h_mul3 = vl_h - (vl_h % 3);
7722     for (int i = 0; i < vl_h_mul3; i++) {
7723       int64_t offset = 9 * vl;
7724       uint16_t lane0 = -2 + (5 * i);
7725       uint16_t lane1 = -3 + (5 * i);
7726       uint16_t lane2 = -4 + (5 * i);
7727       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7728       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7729       MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7730     }
7731 
7732     // st3w { z30.s, z31.s, z0.s }, SVE_POW2
7733     int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7734     for (int i = 0; i < vl_s_pow2; i++) {
7735       int64_t offset = -12 * vl;
7736       uint32_t lane0 = 3 - (7 * i);
7737       uint32_t lane1 = 4 - (7 * i);
7738       uint32_t lane2 = 5 - (7 * i);
7739       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7740       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7741       MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7742     }
7743 
7744     // st3d { z0.d, z1.d, z2.d }, ((i % 5) == 0)
7745     for (int i = 0; i < vl_d; i++) {
7746       if ((i % 5) == 0) {
7747         int64_t offset = 15 * vl;
7748         uint64_t lane0 = -7 + (3 * i);
7749         uint64_t lane1 = -8 + (3 * i);
7750         uint64_t lane2 = -9 + (3 * i);
7751         MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7752         MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7753         MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7754       }
7755     }
7756 
7757     ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7758 
7759     // Check that we loaded back the expected values.
7760 
7761     // st3b/ld3b
7762     ASSERT_EQUAL_SVE(z4, z16);
7763     ASSERT_EQUAL_SVE(z5, z17);
7764     ASSERT_EQUAL_SVE(z6, z18);
7765 
7766     // st3h/ld3h
7767     ASSERT_EQUAL_SVE(z7, z19);
7768     ASSERT_EQUAL_SVE(z8, z20);
7769     ASSERT_EQUAL_SVE(z9, z21);
7770 
7771     // st3w/ld3w
7772     ASSERT_EQUAL_SVE(z10, z22);
7773     ASSERT_EQUAL_SVE(z11, z23);
7774     ASSERT_EQUAL_SVE(z12, z24);
7775 
7776     // st3d/ld3d
7777     ASSERT_EQUAL_SVE(z13, z25);
7778     ASSERT_EQUAL_SVE(z14, z26);
7779     ASSERT_EQUAL_SVE(z15, z27);
7780 
7781     delete[] expected;
7782   }
7783   delete[] data;
7784 }
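
// Generalising the st2/st3 layouts: for stN, lane i of register r lands at
// element (i * N) + r of the stream. A hedged sketch of the store side,
// ignoring predication (plain arrays stand in for the register list; the
// helper is ours, for illustration only):
template <typename T>
static void ModelStNSketch(T* mem, const T* const* regs, int n, int lanes) {
  for (int i = 0; i < lanes; i++) {
    for (int r = 0; r < n; r++) {
      mem[(i * n) + r] = regs[r][i];
    }
  }
}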
7785 
7786 TEST_SVE(sve_ld3_st3_scalar_plus_scalar) {
7787   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7788   START();
7789 
7790   int vl = config->sve_vl_in_bytes();
7791 
7792   // Allocate plenty of space to enable indexing in both directions.
7793   int data_size = vl * 128;
7794 
7795   uint8_t* data = new uint8_t[data_size];
7796   memset(data, 0, data_size);
7797 
7798   // Set the base half-way through the buffer so we can use negative indices.
7799   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7800 
7801   // We can test ld3 by comparing the values loaded with the values stored.
7802   // There are two complications:
7803   //  - Loads have zeroing predication, so we have to clear the inactive
7804   //    elements on our reference.
7805   //  - We want to test both loads and stores that span { z31, z0 }, so we have
7806   //    to move some values around.
7807   //
7808   // Registers z4-z15 will hold as-stored values (with inactive elements
7809   // cleared). Registers z16-z27 will hold the values that were loaded.
7810 
7811   __ Index(z10.VnB(), -4, 11);
7812   __ Index(z11.VnB(), -5, 11);
7813   __ Index(z12.VnB(), -6, 11);
7814   __ Ptrue(p7.VnB(), SVE_MUL4);
7815   __ Rdvl(x1, -1);  // Make offsets VL-dependent so we can avoid overlap.
7816   __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p7, SVEMemOperand(x0, x1, LSL, 0));
7817   // Save the stored values for ld3 tests.
7818   __ Dup(z4.VnB(), 0);
7819   __ Dup(z5.VnB(), 0);
7820   __ Dup(z6.VnB(), 0);
7821   __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7822   __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7823   __ Mov(z6.VnB(), p7.Merging(), z12.VnB());
7824 
7825   __ Index(z13.VnH(), 6, -2);
7826   __ Index(z14.VnH(), 7, -2);
7827   __ Index(z15.VnH(), 8, -2);
7828   __ Ptrue(p6.VnH(), SVE_VL16);
7829   __ Rdvl(x2, 5);  // (5 * vl) << 1 = 10 * vl
7830   __ St3h(z13.VnH(), z14.VnH(), z15.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
7831   // Save the stored values for ld3 tests.
7832   __ Dup(z7.VnH(), 0);
7833   __ Dup(z8.VnH(), 0);
7834   __ Dup(z9.VnH(), 0);
7835   __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7836   __ Mov(z8.VnH(), p6.Merging(), z14.VnH());
7837   __ Mov(z9.VnH(), p6.Merging(), z15.VnH());
7838 
7839   // Wrap around from z31 to z0.
7840   __ Index(z30.VnS(), -7, 3);
7841   __ Index(z31.VnS(), -8, 3);
7842   __ Index(z0.VnS(), -9, 3);
7843   // Sparse predication, including some irrelevant bits (0xe). To make the
7844   // results easy to check, activate each lane <n> where n is a multiple of 5.
7845   Initialise(&masm,
7846              p5,
7847              0xeee1000010000100,
7848              0x001eeee100001000,
7849              0x0100001eeee10000,
7850              0x10000100001eeee1);
7851   __ Rdvl(x3, -5);  // -(5 * vl) << 2 = -20 * vl
7852   __ St3w(z30.VnS(), z31.VnS(), z0.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
7853   // Save the stored values for ld3 tests.
7854   __ Dup(z10.VnS(), 0);
7855   __ Dup(z11.VnS(), 0);
7856   __ Dup(z12.VnS(), 0);
7857   __ Mov(z10.VnS(), p5.Merging(), z30.VnS());
7858   __ Mov(z11.VnS(), p5.Merging(), z31.VnS());
7859   __ Mov(z12.VnS(), p5.Merging(), z0.VnS());
7860 
7861   __ Index(z31.VnD(), 32, -11);
7862   __ Index(z0.VnD(), 33, -11);
7863   __ Index(z1.VnD(), 34, -11);
7864   __ Ptrue(p4.VnD(), SVE_MUL3);
7865   __ Rdvl(x4, -1);  // -(1 * vl) << 3 = -8 * vl
7866   __ St3d(z31.VnD(), z0.VnD(), z1.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
7867   // Save the stored values for ld3 tests.
7868   __ Dup(z13.VnD(), 0);
7869   __ Dup(z14.VnD(), 0);
7870   __ Dup(z15.VnD(), 0);
7871   __ Mov(z13.VnD(), p4.Merging(), z31.VnD());
7872   __ Mov(z14.VnD(), p4.Merging(), z0.VnD());
7873   __ Mov(z15.VnD(), p4.Merging(), z1.VnD());
7874 
7875   // Corresponding loads.
7876   // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7877   __ Ld3b(z31.VnB(),
7878           z0.VnB(),
7879           z1.VnB(),
7880           p7.Zeroing(),
7881           SVEMemOperand(x0, x1, LSL, 0));
7882   __ Mov(z16, z31);
7883   __ Mov(z17, z0);
7884   __ Mov(z18, z1);
7885   __ Ld3h(z30.VnH(),
7886           z31.VnH(),
7887           z0.VnH(),
7888           p6.Zeroing(),
7889           SVEMemOperand(x0, x2, LSL, 1));
7890   __ Mov(z19, z30);
7891   __ Mov(z20, z31);
7892   __ Mov(z21, z0);
7893   __ Ld3w(z22.VnS(),
7894           z23.VnS(),
7895           z24.VnS(),
7896           p5.Zeroing(),
7897           SVEMemOperand(x0, x3, LSL, 2));
7898   __ Ld3d(z25.VnD(),
7899           z26.VnD(),
7900           z27.VnD(),
7901           p4.Zeroing(),
7902           SVEMemOperand(x0, x4, LSL, 3));
7903 
7904   END();
7905 
7906   if (CAN_RUN()) {
7907     RUN();
7908 
7909     uint8_t* expected = new uint8_t[data_size];
7910     memset(expected, 0, data_size);
7911     uint8_t* middle = &expected[data_size / 2];
7912 
7913     int vl_b = vl / kBRegSizeInBytes;
7914     int vl_h = vl / kHRegSizeInBytes;
7915     int vl_s = vl / kSRegSizeInBytes;
7916     int vl_d = vl / kDRegSizeInBytes;
7917 
7918     int reg_count = 3;
7919 
7920     // st3b { z10.b, z11.b, z12.b }, SVE_MUL4
7921     int vl_b_mul4 = vl_b - (vl_b % 4);
7922     for (int i = 0; i < vl_b_mul4; i++) {
7923       int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
7924       uint8_t lane0 = -4 + (11 * i);
7925       uint8_t lane1 = -5 + (11 * i);
7926       uint8_t lane2 = -6 + (11 * i);
7927       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7928       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7929       MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7930     }
7931 
7932     // st3h { z13.h, z14.h, z15.h }, SVE_VL16
7933     if (vl_h >= 16) {
7934       for (int i = 0; i < 16; i++) {
7935         int64_t offset = (5 << kHRegSizeInBytesLog2) * vl;
7936         uint16_t lane0 = 6 - (2 * i);
7937         uint16_t lane1 = 7 - (2 * i);
7938         uint16_t lane2 = 8 - (2 * i);
7939         MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7940         MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7941         MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7942       }
7943     }
7944 
7945     // st3w { z30.s, z31.s, z0.s }, ((i % 5) == 0)
7946     for (int i = 0; i < vl_s; i++) {
7947       if ((i % 5) == 0) {
7948         int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
7949         uint32_t lane0 = -7 + (3 * i);
7950         uint32_t lane1 = -8 + (3 * i);
7951         uint32_t lane2 = -9 + (3 * i);
7952         MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7953         MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7954         MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7955       }
7956     }
7957 
7958     // st3d { z31.d, z0.d, z1.d }, SVE_MUL3
7959     int vl_d_mul3 = vl_d - (vl_d % 3);
7960     for (int i = 0; i < vl_d_mul3; i++) {
7961       int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
7962       uint64_t lane0 = 32 - (11 * i);
7963       uint64_t lane1 = 33 - (11 * i);
7964       uint64_t lane2 = 34 - (11 * i);
7965       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7966       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7967       MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7968     }
7969 
7970     ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7971 
7972     // Check that we loaded back the expected values.
7973 
7974     // st3b/ld3b
7975     ASSERT_EQUAL_SVE(z4, z16);
7976     ASSERT_EQUAL_SVE(z5, z17);
7977     ASSERT_EQUAL_SVE(z6, z18);
7978 
7979     // st3h/ld3h
7980     ASSERT_EQUAL_SVE(z7, z19);
7981     ASSERT_EQUAL_SVE(z8, z20);
7982     ASSERT_EQUAL_SVE(z9, z21);
7983 
7984     // st3w/ld3w
7985     ASSERT_EQUAL_SVE(z10, z22);
7986     ASSERT_EQUAL_SVE(z11, z23);
7987     ASSERT_EQUAL_SVE(z12, z24);
7988 
7989     // st3d/ld3d
7990     ASSERT_EQUAL_SVE(z13, z25);
7991     ASSERT_EQUAL_SVE(z14, z26);
7992     ASSERT_EQUAL_SVE(z15, z27);
7993 
7994     delete[] expected;
7995   }
7996   delete[] data;
7997 }
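
// For the scalar-plus-immediate forms of ldN/stN, the immediate counts whole
// vector registers and must be a multiple of N in [N * -8, N * 7], hence the
// ranges [-16, 14], [-24, 21] and [-32, 28] quoted in these tests. A hedged
// sketch of the check (name is ours):
static bool IsEncodableLdStNImmSketch(int imm_mul_vl, int n) {
  return ((imm_mul_vl % n) == 0) && (imm_mul_vl >= (n * -8)) &&
         (imm_mul_vl <= (n * 7));
}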
7998 
7999 TEST_SVE(sve_ld4_st4_scalar_plus_imm) {
8000   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8001   START();
8002 
8003   int vl = config->sve_vl_in_bytes();
8004 
8005   // The immediate can address [-32, 28] times the VL, so allocate enough space
8006   // to exceed that in both directions.
8007   int data_size = vl * 128;
8008 
8009   uint8_t* data = new uint8_t[data_size];
8010   memset(data, 0, data_size);
8011 
8012   // Set the base half-way through the buffer so we can use negative indices.
8013   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8014 
8015   // We can test ld4 by comparing the values loaded with the values stored.
8016   // There are two complications:
8017   //  - Loads have zeroing predication, so we have to clear the inactive
8018   //    elements on our reference.
8019   //  - We want to test both loads and stores that span { z31, z0 }, so we have
8020   //    to move some values around.
8021   //
8022   // Registers z3-z18 will hold as-stored values (with inactive elements
8023   // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8024   // loaded.
8025 
8026   __ Index(z10.VnB(), 1, -7);
8027   __ Index(z11.VnB(), 2, -7);
8028   __ Index(z12.VnB(), 3, -7);
8029   __ Index(z13.VnB(), 4, -7);
8030   __ Ptrue(p0.VnB());
8031   __ St4b(z10.VnB(), z11.VnB(), z12.VnB(), z13.VnB(), p0, SVEMemOperand(x0));
8032   // Save the stored values for ld4 tests.
8033   __ Dup(z3.VnB(), 0);
8034   __ Dup(z4.VnB(), 0);
8035   __ Dup(z5.VnB(), 0);
8036   __ Dup(z6.VnB(), 0);
8037   __ Mov(z3.VnB(), p0.Merging(), z10.VnB());
8038   __ Mov(z4.VnB(), p0.Merging(), z11.VnB());
8039   __ Mov(z5.VnB(), p0.Merging(), z12.VnB());
8040   __ Mov(z6.VnB(), p0.Merging(), z13.VnB());
8041 
8042   // Wrap around from z31 to z0.
8043   __ Index(z31.VnH(), -2, 5);
8044   __ Index(z0.VnH(), -3, 5);
8045   __ Index(z1.VnH(), -4, 5);
8046   __ Index(z2.VnH(), -5, 5);
8047   __ Ptrue(p1.VnH(), SVE_MUL3);
8048   __ St4h(z31.VnH(),
8049           z0.VnH(),
8050           z1.VnH(),
8051           z2.VnH(),
8052           p1,
8053           SVEMemOperand(x0, 4, SVE_MUL_VL));
8054   // Save the stored values for ld4 tests.
8055   __ Dup(z7.VnH(), 0);
8056   __ Dup(z8.VnH(), 0);
8057   __ Dup(z9.VnH(), 0);
8058   __ Dup(z10.VnH(), 0);
8059   __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
8060   __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
8061   __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
8062   __ Mov(z10.VnH(), p1.Merging(), z2.VnH());
8063 
8064   // Wrap around from z31 to z0.
8065   __ Index(z29.VnS(), 2, -7);
8066   __ Index(z30.VnS(), 3, -7);
8067   __ Index(z31.VnS(), 4, -7);
8068   __ Index(z0.VnS(), 5, -7);
8069   __ Ptrue(p2.VnS(), SVE_POW2);
8070   __ St4w(z29.VnS(),
8071           z30.VnS(),
8072           z31.VnS(),
8073           z0.VnS(),
8074           p2,
8075           SVEMemOperand(x0, -12, SVE_MUL_VL));
8076   // Save the stored values for ld4 tests.
8077   __ Dup(z11.VnS(), 0);
8078   __ Dup(z12.VnS(), 0);
8079   __ Dup(z13.VnS(), 0);
8080   __ Dup(z14.VnS(), 0);
8081   __ Mov(z11.VnS(), p2.Merging(), z29.VnS());
8082   __ Mov(z12.VnS(), p2.Merging(), z30.VnS());
8083   __ Mov(z13.VnS(), p2.Merging(), z31.VnS());
8084   __ Mov(z14.VnS(), p2.Merging(), z0.VnS());
8085 
8086   __ Index(z20.VnD(), -7, 8);
8087   __ Index(z21.VnD(), -8, 8);
8088   __ Index(z22.VnD(), -9, 8);
8089   __ Index(z23.VnD(), -10, 8);
8090   // Sparse predication, including some irrelevant bits (0xee). To make the
8091   // results easy to check, activate each lane <n> where n is a multiple of 5.
8092   Initialise(&masm,
8093              p3,
8094              0xeee10000000001ee,
8095              0xeeeeeee100000000,
8096              0x01eeeeeeeee10000,
8097              0x000001eeeeeeeee1);
8098   __ St4d(z20.VnD(),
8099           z21.VnD(),
8100           z22.VnD(),
8101           z23.VnD(),
8102           p3,
8103           SVEMemOperand(x0, 16, SVE_MUL_VL));
8104   // Save the stored values for ld4 tests.
8105   __ Dup(z15.VnD(), 0);
8106   __ Dup(z16.VnD(), 0);
8107   __ Dup(z17.VnD(), 0);
8108   __ Dup(z18.VnD(), 0);
8109   __ Mov(z15.VnD(), p3.Merging(), z20.VnD());
8110   __ Mov(z16.VnD(), p3.Merging(), z21.VnD());
8111   __ Mov(z17.VnD(), p3.Merging(), z22.VnD());
8112   __ Mov(z18.VnD(), p3.Merging(), z23.VnD());
8113 
8114   // Corresponding loads.
8115   // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8116   __ Ld4b(z31.VnB(),
8117           z0.VnB(),
8118           z1.VnB(),
8119           z2.VnB(),
8120           p0.Zeroing(),
8121           SVEMemOperand(x0));
8122   __ Mov(z19, z31);
8123   __ Mov(z20, z0);
8124   __ Mov(z21, z1);
8125   __ Mov(z22, z2);
8126   __ Ld4h(z23.VnH(),
8127           z24.VnH(),
8128           z25.VnH(),
8129           z26.VnH(),
8130           p1.Zeroing(),
8131           SVEMemOperand(x0, 4, SVE_MUL_VL));
8132   __ Ld4w(z27.VnS(),
8133           z28.VnS(),
8134           z29.VnS(),
8135           z30.VnS(),
8136           p2.Zeroing(),
8137           SVEMemOperand(x0, -12, SVE_MUL_VL));
8138   // Wrap around from z31 to z0.
8139   __ Ld4d(z31.VnD(),
8140           z0.VnD(),
8141           z1.VnD(),
8142           z2.VnD(),
8143           p3.Zeroing(),
8144           SVEMemOperand(x0, 16, SVE_MUL_VL));
8145 
8146   END();
8147 
8148   if (CAN_RUN()) {
8149     RUN();
8150 
8151     uint8_t* expected = new uint8_t[data_size];
8152     memset(expected, 0, data_size);
8153     uint8_t* middle = &expected[data_size / 2];
8154 
8155     int vl_b = vl / kBRegSizeInBytes;
8156     int vl_h = vl / kHRegSizeInBytes;
8157     int vl_s = vl / kSRegSizeInBytes;
8158     int vl_d = vl / kDRegSizeInBytes;
8159 
8160     int reg_count = 4;
8161 
8162     // st4b { z10.b, z11.b, z12.b, z13.b }, SVE_ALL
8163     for (int i = 0; i < vl_b; i++) {
8164       uint8_t lane0 = 1 - (7 * i);
8165       uint8_t lane1 = 2 - (7 * i);
8166       uint8_t lane2 = 3 - (7 * i);
8167       uint8_t lane3 = 4 - (7 * i);
8168       MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
8169       MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
8170       MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
8171       MemoryWrite(middle, 0, (i * reg_count) + 3, lane3);
8172     }
8173 
8174     // st4h { z31.h, z0.h, z1.h, z2.h }, SVE_MUL3
8175     int vl_h_mul3 = vl_h - (vl_h % 3);
8176     for (int i = 0; i < vl_h_mul3; i++) {
8177       int64_t offset = 4 * vl;
8178       uint16_t lane0 = -2 + (5 * i);
8179       uint16_t lane1 = -3 + (5 * i);
8180       uint16_t lane2 = -4 + (5 * i);
8181       uint16_t lane3 = -5 + (5 * i);
8182       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8183       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8184       MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8185       MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8186     }
8187 
8188     // st4w { z29.s, z30.s, z31.s, z0.s }, SVE_POW2
8189     int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
8190     for (int i = 0; i < vl_s_pow2; i++) {
8191       int64_t offset = -12 * vl;
8192       uint32_t lane0 = 2 - (7 * i);
8193       uint32_t lane1 = 3 - (7 * i);
8194       uint32_t lane2 = 4 - (7 * i);
8195       uint32_t lane3 = 5 - (7 * i);
8196       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8197       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8198       MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8199       MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8200     }
8201 
8202     // st4d { z20.d, z21.d, z22.d, z23.d }, ((i % 5) == 0)
8203     for (int i = 0; i < vl_d; i++) {
8204       if ((i % 5) == 0) {
8205         int64_t offset = 16 * vl;
8206         uint64_t lane0 = -7 + (8 * i);
8207         uint64_t lane1 = -8 + (8 * i);
8208         uint64_t lane2 = -9 + (8 * i);
8209         uint64_t lane3 = -10 + (8 * i);
8210         MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8211         MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8212         MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8213         MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8214       }
8215     }
8216 
8217     ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8218 
8219     // Check that we loaded back the expected values.
8220 
8221     // st4b/ld4b
8222     ASSERT_EQUAL_SVE(z3, z19);
8223     ASSERT_EQUAL_SVE(z4, z20);
8224     ASSERT_EQUAL_SVE(z5, z21);
8225     ASSERT_EQUAL_SVE(z6, z22);
8226 
8227     // st4h/ld4h
8228     ASSERT_EQUAL_SVE(z7, z23);
8229     ASSERT_EQUAL_SVE(z8, z24);
8230     ASSERT_EQUAL_SVE(z9, z25);
8231     ASSERT_EQUAL_SVE(z10, z26);
8232 
8233     // st4w/ld4w
8234     ASSERT_EQUAL_SVE(z11, z27);
8235     ASSERT_EQUAL_SVE(z12, z28);
8236     ASSERT_EQUAL_SVE(z13, z29);
8237     ASSERT_EQUAL_SVE(z14, z30);
8238 
8239     // st4d/ld4d
8240     ASSERT_EQUAL_SVE(z15, z31);
8241     ASSERT_EQUAL_SVE(z16, z0);
8242     ASSERT_EQUAL_SVE(z17, z1);
8243     ASSERT_EQUAL_SVE(z18, z2);
8244 
8245     delete[] expected;
8246   }
8247   delete[] data;
8248 }
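
// The { z31, z0, ... } operands in these tests are deliberate: ldN/stN
// register lists are consecutive modulo 32, so a list may wrap from z31 back
// to z0. The loads route their results through other registers so the
// wrapped destinations can still be compared without overlap.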
8249 
8250 TEST_SVE(sve_ld4_st4_scalar_plus_scalar) {
8251   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8252   START();
8253 
8254   int vl = config->sve_vl_in_bytes();
8255 
8256   // Allocate plenty of space to enable indexing in both directions.
8257   int data_size = vl * 128;
8258 
8259   uint8_t* data = new uint8_t[data_size];
8260   memset(data, 0, data_size);
8261 
8262   // Set the base half-way through the buffer so we can use negative indices.
8263   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8264 
8265   // We can test ld4 by comparing the values loaded with the values stored.
8266   // There are two complications:
8267   //  - Loads have zeroing predication, so we have to clear the inactive
8268   //    elements on our reference.
8269   //  - We want to test both loads and stores that span { z31, z0 }, so we have
8270   //    to move some values around.
8271   //
8272   // Registers z3-z18 will hold as-stored values (with inactive elements
8273   // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8274   // loaded.
8275 
8276   __ Index(z19.VnB(), -4, 11);
8277   __ Index(z20.VnB(), -5, 11);
8278   __ Index(z21.VnB(), -6, 11);
8279   __ Index(z22.VnB(), -7, 11);
8280   __ Ptrue(p7.VnB(), SVE_MUL4);
8281   __ Rdvl(x1, -1);  // Make offsets VL-dependent so we can avoid overlap.
8282   __ St4b(z19.VnB(),
8283           z20.VnB(),
8284           z21.VnB(),
8285           z22.VnB(),
8286           p7,
8287           SVEMemOperand(x0, x1, LSL, 0));
8288   // Save the stored values for ld4 tests.
8289   __ Dup(z3.VnB(), 0);
8290   __ Dup(z4.VnB(), 0);
8291   __ Dup(z5.VnB(), 0);
8292   __ Dup(z6.VnB(), 0);
8293   __ Mov(z3.VnB(), p7.Merging(), z19.VnB());
8294   __ Mov(z4.VnB(), p7.Merging(), z20.VnB());
8295   __ Mov(z5.VnB(), p7.Merging(), z21.VnB());
8296   __ Mov(z6.VnB(), p7.Merging(), z22.VnB());
8297 
8298   __ Index(z23.VnH(), 6, -2);
8299   __ Index(z24.VnH(), 7, -2);
8300   __ Index(z25.VnH(), 8, -2);
8301   __ Index(z26.VnH(), 9, -2);
8302   __ Ptrue(p6.VnH(), SVE_VL16);
8303   __ Rdvl(x2, 7);  // (7 * vl) << 1 = 14 * vl
8304   __ St4h(z23.VnH(),
8305           z24.VnH(),
8306           z25.VnH(),
8307           z26.VnH(),
8308           p6,
8309           SVEMemOperand(x0, x2, LSL, 1));
8310   // Save the stored values for ld4 tests.
8311   __ Dup(z7.VnH(), 0);
8312   __ Dup(z8.VnH(), 0);
8313   __ Dup(z9.VnH(), 0);
8314   __ Dup(z10.VnH(), 0);
8315   __ Mov(z7.VnH(), p6.Merging(), z23.VnH());
8316   __ Mov(z8.VnH(), p6.Merging(), z24.VnH());
8317   __ Mov(z9.VnH(), p6.Merging(), z25.VnH());
8318   __ Mov(z10.VnH(), p6.Merging(), z26.VnH());
8319 
8320   // Wrap around from z31 to z0.
8321   __ Index(z29.VnS(), -6, 7);
8322   __ Index(z30.VnS(), -7, 7);
8323   __ Index(z31.VnS(), -8, 7);
8324   __ Index(z0.VnS(), -9, 7);
8325   // Sparse predication, including some irrelevant bits (0xe). To make the
8326   // results easy to check, activate each lane <n> where n is a multiple of 5.
8327   Initialise(&masm,
8328              p5,
8329              0xeee1000010000100,
8330              0x001eeee100001000,
8331              0x0100001eeee10000,
8332              0x10000100001eeee1);
8333   __ Rdvl(x3, -5);  // -(5 * vl) << 2 = -20 * vl
8334   __ St4w(z29.VnS(),
8335           z30.VnS(),
8336           z31.VnS(),
8337           z0.VnS(),
8338           p5,
8339           SVEMemOperand(x0, x3, LSL, 2));
8340   // Save the stored values for ld4 tests.
8341   __ Dup(z11.VnS(), 0);
8342   __ Dup(z12.VnS(), 0);
8343   __ Dup(z13.VnS(), 0);
8344   __ Dup(z14.VnS(), 0);
8345   __ Mov(z11.VnS(), p5.Merging(), z29.VnS());
8346   __ Mov(z12.VnS(), p5.Merging(), z30.VnS());
8347   __ Mov(z13.VnS(), p5.Merging(), z31.VnS());
8348   __ Mov(z14.VnS(), p5.Merging(), z0.VnS());
8349 
8350   __ Index(z31.VnD(), 32, -11);
8351   __ Index(z0.VnD(), 33, -11);
8352   __ Index(z1.VnD(), 34, -11);
8353   __ Index(z2.VnD(), 35, -11);
8354   __ Ptrue(p4.VnD(), SVE_MUL3);
8355   __ Rdvl(x4, -1);  // -(1 * vl) << 3 = -8 * vl
8356   __ St4d(z31.VnD(),
8357           z0.VnD(),
8358           z1.VnD(),
8359           z2.VnD(),
8360           p4,
8361           SVEMemOperand(x0, x4, LSL, 3));
8362   // Save the stored values for ld4 tests.
8363   __ Dup(z15.VnD(), 0);
8364   __ Dup(z16.VnD(), 0);
8365   __ Dup(z17.VnD(), 0);
8366   __ Dup(z18.VnD(), 0);
8367   __ Mov(z15.VnD(), p4.Merging(), z31.VnD());
8368   __ Mov(z16.VnD(), p4.Merging(), z0.VnD());
8369   __ Mov(z17.VnD(), p4.Merging(), z1.VnD());
8370   __ Mov(z18.VnD(), p4.Merging(), z2.VnD());
8371 
8372   // Corresponding loads.
8373   // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8374   __ Ld4b(z31.VnB(),
8375           z0.VnB(),
8376           z1.VnB(),
8377           z2.VnB(),
8378           p7.Zeroing(),
8379           SVEMemOperand(x0, x1, LSL, 0));
8380   __ Mov(z19, z31);
8381   __ Mov(z20, z0);
8382   __ Mov(z21, z1);
8383   __ Mov(z22, z2);
8384   __ Ld4h(z23.VnH(),
8385           z24.VnH(),
8386           z25.VnH(),
8387           z26.VnH(),
8388           p6.Zeroing(),
8389           SVEMemOperand(x0, x2, LSL, 1));
8390   __ Ld4w(z27.VnS(),
8391           z28.VnS(),
8392           z29.VnS(),
8393           z30.VnS(),
8394           p5.Zeroing(),
8395           SVEMemOperand(x0, x3, LSL, 2));
8396   // Wrap around from z31 to z0.
8397   __ Ld4d(z31.VnD(),
8398           z0.VnD(),
8399           z1.VnD(),
8400           z2.VnD(),
8401           p4.Zeroing(),
8402           SVEMemOperand(x0, x4, LSL, 3));
8403 
8404   END();
8405 
8406   if (CAN_RUN()) {
8407     RUN();
8408 
8409     uint8_t* expected = new uint8_t[data_size];
8410     memset(expected, 0, data_size);
8411     uint8_t* middle = &expected[data_size / 2];
8412 
8413     int vl_b = vl / kBRegSizeInBytes;
8414     int vl_h = vl / kHRegSizeInBytes;
8415     int vl_s = vl / kSRegSizeInBytes;
8416     int vl_d = vl / kDRegSizeInBytes;
8417 
8418     int reg_count = 4;
8419 
8420     // st4b { z19.b, z20.b, z21.b, z22.b }, SVE_MUL4
8421     int vl_b_mul4 = vl_b - (vl_b % 4);
8422     for (int i = 0; i < vl_b_mul4; i++) {
8423       int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
8424       uint8_t lane0 = -4 + (11 * i);
8425       uint8_t lane1 = -5 + (11 * i);
8426       uint8_t lane2 = -6 + (11 * i);
8427       uint8_t lane3 = -7 + (11 * i);
8428       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8429       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8430       MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8431       MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8432     }
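
    // St4 interleaves its four registers in memory:
    //   { zt1[0], zt2[0], zt3[0], zt4[0], zt1[1], zt2[1], ... }
    // hence lane i of register r is checked at element (i * reg_count) + r.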
8433 
8434     // st4h { z22.h, z23.h, z24.h, z25.h }, SVE_VL16
8435     if (vl_h >= 16) {
8436       for (int i = 0; i < 16; i++) {
8437         int64_t offset = (7 << kHRegSizeInBytesLog2) * vl;
8438         uint16_t lane0 = 6 - (2 * i);
8439         uint16_t lane1 = 7 - (2 * i);
8440         uint16_t lane2 = 8 - (2 * i);
8441         uint16_t lane3 = 9 - (2 * i);
8442         MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8443         MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8444         MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8445         MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8446       }
8447     }
8448 
8449     // st4w { z29.s, z30.s, z31.s, z0.s }, ((i % 5) == 0)
8450     for (int i = 0; i < vl_s; i++) {
8451       if ((i % 5) == 0) {
8452         int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
8453         uint32_t lane0 = -6 + (7 * i);
8454         uint32_t lane1 = -7 + (7 * i);
8455         uint32_t lane2 = -8 + (7 * i);
8456         uint32_t lane3 = -9 + (7 * i);
8457         MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8458         MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8459         MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8460         MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8461       }
8462     }
8463 
8464     // st4d { z31.d, z0.d, z1.d, z2.d }, SVE_MUL3
8465     int vl_d_mul3 = vl_d - (vl_d % 3);
8466     for (int i = 0; i < vl_d_mul3; i++) {
8467       int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
8468       uint64_t lane0 = 32 - (11 * i);
8469       uint64_t lane1 = 33 - (11 * i);
8470       uint64_t lane2 = 34 - (11 * i);
8471       uint64_t lane3 = 35 - (11 * i);
8472       MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8473       MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8474       MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8475       MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8476     }
8477 
8478     ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8479 
8480     // Check that we loaded back the expected values.
8481 
8482     // st4b/ld4b
8483     ASSERT_EQUAL_SVE(z3, z19);
8484     ASSERT_EQUAL_SVE(z4, z20);
8485     ASSERT_EQUAL_SVE(z5, z21);
8486     ASSERT_EQUAL_SVE(z6, z22);
8487 
8488     // st4h/ld4h
8489     ASSERT_EQUAL_SVE(z7, z23);
8490     ASSERT_EQUAL_SVE(z8, z24);
8491     ASSERT_EQUAL_SVE(z9, z25);
8492     ASSERT_EQUAL_SVE(z10, z26);
8493 
8494     // st4w/ld4w
8495     ASSERT_EQUAL_SVE(z11, z27);
8496     ASSERT_EQUAL_SVE(z12, z28);
8497     ASSERT_EQUAL_SVE(z13, z29);
8498     ASSERT_EQUAL_SVE(z14, z30);
8499 
8500     // st4d/ld4d
8501     ASSERT_EQUAL_SVE(z15, z31);
8502     ASSERT_EQUAL_SVE(z16, z0);
8503     ASSERT_EQUAL_SVE(z17, z1);
8504     ASSERT_EQUAL_SVE(z18, z2);
8505 
8506     delete[] expected;
8507   }
8508   delete[] data;
8509 }
8510 
8511 TEST_SVE(sve_ld234_st234_scalar_plus_scalar_sp) {
8512   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8513   START();
8514 
8515   // Check that the simulator correctly interprets rn == 31 as sp.
8516   // The indexing logic is the same regardless, so we just check one load and
8517   // store of each type.
8518 
8519   // There are no pre- or post-indexing modes, so reserve space first.
8520   __ ClaimVL(2 + 3 + 4);
8521 
8522   __ Index(z0.VnB(), 42, 2);
8523   __ Index(z1.VnB(), 43, 2);
8524   __ Ptrue(p0.VnB(), SVE_VL7);
8525   __ Rdvl(x0, 0);
8526   __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, x0));
8527 
8528   __ Index(z4.VnH(), 42, 3);
8529   __ Index(z5.VnH(), 43, 3);
8530   __ Index(z6.VnH(), 44, 3);
8531   __ Ptrue(p1.VnH(), SVE_POW2);
8532   __ Rdvl(x1, 2);
8533   __ Lsr(x1, x1, 1);
8534   __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, x1, LSL, 1));
8535 
8536   __ Index(z8.VnS(), 42, 4);
8537   __ Index(z9.VnS(), 43, 4);
8538   __ Index(z10.VnS(), 44, 4);
8539   __ Index(z11.VnS(), 45, 4);
8540   __ Ptrue(p2.VnS());
8541   __ Rdvl(x2, 2 + 3);
8542   __ Lsr(x2, x2, 2);
8543   __ St4w(z8.VnS(),
8544           z9.VnS(),
8545           z10.VnS(),
8546           z11.VnS(),
8547           p2,
8548           SVEMemOperand(sp, x2, LSL, 2));
8549 
8550   // Corresponding loads.
8551   // We have to explicitly zero inactive lanes in the reference values because
8552   // loads have zeroing predication.
8553   __ Dup(z12.VnB(), 0);
8554   __ Dup(z13.VnB(), 0);
8555   __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8556   __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8557   __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, x0));
8558 
8559   __ Dup(z16.VnH(), 0);
8560   __ Dup(z17.VnH(), 0);
8561   __ Dup(z18.VnH(), 0);
8562   __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8563   __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8564   __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8565   __ Ld3h(z4.VnH(),
8566           z5.VnH(),
8567           z6.VnH(),
8568           p1.Zeroing(),
8569           SVEMemOperand(sp, x1, LSL, 1));
8570 
8571   __ Dup(z20.VnS(), 0);
8572   __ Dup(z21.VnS(), 0);
8573   __ Dup(z22.VnS(), 0);
8574   __ Dup(z23.VnS(), 0);
8575   __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8576   __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8577   __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8578   __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8579   __ Ld4w(z8.VnS(),
8580           z9.VnS(),
8581           z10.VnS(),
8582           z11.VnS(),
8583           p2.Zeroing(),
8584           SVEMemOperand(sp, x2, LSL, 2));
8585 
8586   __ DropVL(2 + 3 + 4);
8587 
8588   END();
8589 
8590   if (CAN_RUN()) {
8591     RUN();
8592 
8593     // The most likely failure mode is that the simulator reads sp as xzr and
8594     // crashes on execution. We already test the address calculations separately,
8595     // and sp doesn't change this, so just test that we load the values we
8596     // stored.
8597 
8598     // st2b/ld2b
8599     ASSERT_EQUAL_SVE(z0, z12);
8600     ASSERT_EQUAL_SVE(z1, z13);
8601 
8602     // st3h/ld3h
8603     ASSERT_EQUAL_SVE(z4, z16);
8604     ASSERT_EQUAL_SVE(z5, z17);
8605     ASSERT_EQUAL_SVE(z6, z18);
8606 
8607     // st4w/ld4w
8608     ASSERT_EQUAL_SVE(z8, z20);
8609     ASSERT_EQUAL_SVE(z9, z21);
8610     ASSERT_EQUAL_SVE(z10, z22);
8611     ASSERT_EQUAL_SVE(z11, z23);
8612   }
8613 }
8614 
8615 TEST_SVE(sve_ld234_st234_scalar_plus_imm_sp) {
8616   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8617   START();
8618 
8619   // Check that the simulator correctly interprets rn == 31 as sp.
8620   // The indexing logic is the same regardless, so we just check one load and
8621   // store of each type.
8622 
8623   // There are no pre- or post-indexing modes, so reserve space first.
8624   // Note that the stores fill in an order that allows each immediate to be a
8625   // multiple of the number of registers.
8626   __ ClaimVL(4 + 2 + 3);
8627 
8628   __ Index(z0.VnB(), 42, 2);
8629   __ Index(z1.VnB(), 43, 2);
8630   __ Ptrue(p0.VnB(), SVE_POW2);
8631   __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, 4, SVE_MUL_VL));
8632 
8633   __ Index(z4.VnH(), 42, 3);
8634   __ Index(z5.VnH(), 43, 3);
8635   __ Index(z6.VnH(), 44, 3);
8636   __ Ptrue(p1.VnH(), SVE_VL7);
8637   __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, 6, SVE_MUL_VL));
8638 
8639   __ Index(z8.VnS(), 42, 4);
8640   __ Index(z9.VnS(), 43, 4);
8641   __ Index(z10.VnS(), 44, 4);
8642   __ Index(z11.VnS(), 45, 4);
8643   __ Ptrue(p2.VnS());
8644   __ St4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2, SVEMemOperand(sp));
8645 
8646   // Corresponding loads.
8647   // We have to explicitly zero inactive lanes in the reference values because
8648   // loads have zeroing predication.
8649   __ Dup(z12.VnB(), 0);
8650   __ Dup(z13.VnB(), 0);
8651   __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8652   __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8653   __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, 4, SVE_MUL_VL));
8654 
8655   __ Dup(z16.VnH(), 0);
8656   __ Dup(z17.VnH(), 0);
8657   __ Dup(z18.VnH(), 0);
8658   __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8659   __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8660   __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8661   __ Ld3h(z4.VnH(),
8662           z5.VnH(),
8663           z6.VnH(),
8664           p1.Zeroing(),
8665           SVEMemOperand(sp, 6, SVE_MUL_VL));
8666 
8667   __ Dup(z20.VnS(), 0);
8668   __ Dup(z21.VnS(), 0);
8669   __ Dup(z22.VnS(), 0);
8670   __ Dup(z23.VnS(), 0);
8671   __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8672   __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8673   __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8674   __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8675   __ Ld4w(z8.VnS(),
8676           z9.VnS(),
8677           z10.VnS(),
8678           z11.VnS(),
8679           p2.Zeroing(),
8680           SVEMemOperand(sp));
8681 
8682   __ DropVL(4 + 2 + 3);
8683 
8684   END();
8685 
8686   if (CAN_RUN()) {
8687     RUN();
8688 
8689     // The most likely failure mode is that the simulator reads sp as xzr and
8690     // crashes on execution. We already test the address calculations separately,
8691     // and sp doesn't change this, so just test that we load the values we
8692     // stored.
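
    // A sketch of those checks: the register assignments mirror
    // sve_ld234_st234_scalar_plus_scalar_sp above, so the same comparisons
    // apply.

    // st2b/ld2b
    ASSERT_EQUAL_SVE(z0, z12);
    ASSERT_EQUAL_SVE(z1, z13);

    // st3h/ld3h
    ASSERT_EQUAL_SVE(z4, z16);
    ASSERT_EQUAL_SVE(z5, z17);
    ASSERT_EQUAL_SVE(z6, z18);

    // st4w/ld4w
    ASSERT_EQUAL_SVE(z8, z20);
    ASSERT_EQUAL_SVE(z9, z21);
    ASSERT_EQUAL_SVE(z10, z22);
    ASSERT_EQUAL_SVE(z11, z23);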
8694   }
8695 }
8696 
8697 // Fill the input buffer with arbitrary data, and assign random offsets from
8698 // the base address of the buffer (and the corresponding absolute addresses)
8699 // to the output arguments if provided.
8700 static void BufferFillingHelper(uint64_t data_ptr,
8701                                 size_t buffer_size,
8702                                 unsigned lane_size_in_bytes,
8703                                 int lane_count,
8704                                 uint64_t* offsets,
8705                                 uint64_t* addresses = nullptr,
8706                                 uint64_t* max_address = nullptr) {
8707   // Use a fixed seed for nrand48() so that test runs are reproducible.
8708   unsigned short seed[3] = {1, 2, 3};  // NOLINT(runtime/int)
8709 
8710   // Fill a buffer with arbitrary data.
8711   for (size_t i = 0; i < buffer_size; i++) {
8712     uint8_t byte = nrand48(seed) & 0xff;
8713     memcpy(reinterpret_cast<void*>(data_ptr + i), &byte, 1);
8714   }
8715 
8716   if (max_address != nullptr) {
8717     *max_address = 0;
8718   }
8719 
8720   // Vectors of random addresses and offsets into the buffer.
8721   for (int i = 0; i < lane_count; i++) {
8722     uint64_t rnd = nrand48(seed);
8723     // Limit the range to the set of completely-accessible elements in memory.
8724     offsets[i] = rnd % (buffer_size - lane_size_in_bytes);
8725     if ((addresses != nullptr) && (max_address != nullptr)) {
8726       addresses[i] = data_ptr + offsets[i];
8727       *max_address = std::max(*max_address, addresses[i]);
8728     }
8729   }
8730 }
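
// A minimal usage sketch (the buffer pointer and counts are illustrative):
//
//   uint64_t offsets[4];
//   uint64_t addresses[4];
//   uint64_t max_address;
//   BufferFillingHelper(data, data_size, kDRegSizeInBytes, 4,
//                       offsets, addresses, &max_address);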
8731 
8732 static void ScalarLoadHelper(MacroAssembler* masm,
8733                              Register dst,
8734                              Register addr,
8735                              int msize_in_bits,
8736                              bool is_signed) {
8737   if (is_signed) {
8738     switch (msize_in_bits) {
8739       case kBRegSize:
8740         masm->Ldrsb(dst, MemOperand(addr));
8741         break;
8742       case kHRegSize:
8743         masm->Ldrsh(dst, MemOperand(addr));
8744         break;
8745       case kWRegSize:
8746         masm->Ldrsw(dst, MemOperand(addr));
8747         break;
8748       default:
8749         VIXL_UNIMPLEMENTED();
8750         break;
8751     }
8752   } else {
8753     switch (msize_in_bits) {
8754       case kBRegSize:
8755         masm->Ldrb(dst, MemOperand(addr));
8756         break;
8757       case kHRegSize:
8758         masm->Ldrh(dst, MemOperand(addr));
8759         break;
8760       case kWRegSize:
8761         masm->Ldr(dst.W(), MemOperand(addr));
8762         break;
8763       case kXRegSize:
8764         masm->Ldr(dst, MemOperand(addr));
8765         break;
8766       default:
8767         VIXL_UNIMPLEMENTED();
8768         break;
8769     }
8770   }
8771 }
8772 
8773 // Generate a reference result using scalar loads.
8774 // For now, this helper doesn't save and restore the caller's registers.
8775 // It clobbers z30, x28, x29 and p7.
8776 template <size_t N>
8777 static void ScalarLoadHelper(MacroAssembler* masm,
8778                              int vl,
8779                              const uint64_t (&addresses)[N],
8780                              const ZRegister& zt_ref,
8781                              const PRegisterZ& pg,
8782                              unsigned esize_in_bits,
8783                              unsigned msize_in_bits,
8784                              bool is_signed) {
8785   unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8786   ZRegister lane_numbers = z30.WithLaneSize(esize_in_bits);
8787   masm->Index(lane_numbers, 0, 1);
8788   masm->Dup(zt_ref, 0);
8789   for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
8790     masm->Mov(x29, addresses[N - i - 1]);
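    // Use x28 as a W register for element sizes up to 32 bits, and as an X
    // register for 64-bit elements.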
8791     Register rt(28, std::min(std::max(esize_in_bits, kSRegSize), kDRegSize));
8792     ScalarLoadHelper(masm, rt, x29, msize_in_bits, is_signed);
8793 
8794     // Emulate predication.
8795     masm->Cmpeq(p7.WithLaneSize(esize_in_bits), pg, lane_numbers, i);
8796     masm->Cpy(zt_ref, p7.Merging(), rt);
8797   }
8798 }
8799 
8800 typedef void (MacroAssembler::*Ld1Macro)(const ZRegister& zt,
8801                                          const PRegisterZ& pg,
8802                                          const SVEMemOperand& addr);
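
// Such a pointer is invoked with the `.*` operator; for example:
//
//   Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
//   (masm.*ldff1b)(z0.VnB(), all.Zeroing(), SVEMemOperand(x20));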
8803 
8804 template <typename T>
8805 static void Ldff1Helper(Test* config,
8806                         uintptr_t data,
8807                         unsigned msize_in_bits,
8808                         unsigned esize_in_bits,
8809                         CPURegister::RegisterType base_type,
8810                         Ld1Macro ldff1,
8811                         Ld1Macro ld1,
8812                         T mod,
8813                         bool scale = false) {
8814   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8815   START();
8816 
8817   int vl = config->sve_vl_in_bytes();
8818   size_t page_size = sysconf(_SC_PAGE_SIZE);
8819   VIXL_ASSERT(page_size > static_cast<size_t>(vl));
8820 
8821   unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8822   unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
8823   unsigned msize_in_bytes_log2 = std::log2(msize_in_bytes);
8824   VIXL_ASSERT(msize_in_bits <= esize_in_bits);
8825 
8826   PRegister all = p7;
8827   __ Ptrue(all.VnB());
8828 
8829   size_t offset_modifier = 0;
8830 
8831   // The highest address at which a load stopped. Every FF load should fault at
8832   // `data + page_size`, so this value should not exceed that value. However,
8833   // the architecture allows fault-tolerant loads to fault arbitrarily, so the
8834   // real value may be lower.
8835   //
8836   // This is used to check that the `mprotect` in the caller really does make
8837   // the second page inaccessible, and that each load's resulting FFR reflects that.
8838   Register limit = x22;
8839   __ Mov(limit, 0);
8840 
8841   // If the FFR grows unexpectedly, we increment this register by the
8842   // difference. FFR should never grow, except when explicitly set.
8843   Register ffr_grow_count = x23;
8844   __ Mov(ffr_grow_count, 0);
8845 
8846   // Set the offset so that the load is guaranteed to start in the
8847   // accessible page, but end in the inaccessible one.
8848   VIXL_ASSERT((page_size % msize_in_bytes) == 0);
8849   VIXL_ASSERT((vl % msize_in_bytes) == 0);
8850   size_t elements_per_page = page_size / msize_in_bytes;
8851   size_t elements_per_access = vl / esize_in_bytes;
8852   size_t min_offset = (elements_per_page - elements_per_access) + 1;
8853   size_t max_offset = elements_per_page - 1;
8854   size_t offset =
8855       min_offset + (offset_modifier % (max_offset - min_offset + 1));
8856   offset_modifier++;
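  // For example, with page_size = 4096, msize_in_bytes = 2 and a 32-byte VL
  // of 4-byte elements, elements_per_page = 2048 and elements_per_access = 8,
  // so offset lies in [2041, 2047] and the access straddles the page boundary.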
8857 
8858   __ Setffr();
8859   __ Mov(x20, data);
8860   __ Mov(x21, offset);
8861 
8862   if (base_type == CPURegister::kRegister) {
8863     // Scalar-plus-scalar mode.
8864     VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8865     VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8866                 (static_cast<int>(mod) == NO_SHIFT));
8867     (masm.*ldff1)(z0.WithLaneSize(esize_in_bits),
8868                   all.Zeroing(),
8869                   SVEMemOperand(x20, x21, mod, msize_in_bytes_log2));
8870   } else {
8871     VIXL_ASSERT(base_type == CPURegister::kZRegister);
8872     int offs_size;
8873     bool offs_is_unsigned;
8874     if (std::is_same<T, vixl::aarch64::Extend>::value) {
8875       // Scalar-plus-vector mode with a 32-bit (optionally unpacked) offset,
8876       // either unscaled or scaled.
8877       VIXL_ASSERT((static_cast<int>(mod) == SXTW) ||
8878                   (static_cast<int>(mod) == UXTW));
8879       if (scale) {
8880         // Gather first-fault byte loads don't support scaled offsets.
8881         VIXL_ASSERT(msize_in_bits != kBRegSize);
8882       }
8883       offs_is_unsigned = (static_cast<int>(mod) == UXTW);
8884       offs_size = kSRegSize;
8885 
8886     } else {
8887       // Scalar-plus-vector mode with 64-bit unscaled or scaled offset.
8888       VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8889       VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8890                   (static_cast<int>(mod) == NO_SHIFT));
8891       offs_is_unsigned = false;
8892       offs_size = kDRegSize;
8893     }
8894 
8895     // Generate the pattern "base address + (index << shift)". For
8896     // unscaled-offset operations, use `msize_in_bytes` as the step between
8897     // successive (decreasing) memory accesses; otherwise, step the index by 1
8898     // and let the shift scale it.
8899     int shift = scale ? msize_in_bytes_log2 : 0;
8900     int index_offset = msize_in_bytes >> shift;
8901     VIXL_ASSERT(index_offset > 0);
8902     uint64_t index = 0;
8903     uint64_t base_address = 0;
8904 
8905     if (offs_is_unsigned) {
8906       // Base address.
8907       base_address = data;
8908       // Maximum unsigned positive index.
8909       index = page_size >> shift;
8910 
8911     } else {
8912       // Base address.
8913       base_address = data + (2 * page_size);
8914       // The index -(page_size >> shift), encoded as an unsigned value.
8915       uint64_t uint_e_max =
8916           (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
8917       index = uint_e_max - (page_size >> shift) + 1;
8918     }
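    // For example, with SXTW, page_size = 4096 and shift = 0, the encoded
    // index is uint_e_max - 4095 (i.e. -4096), so the final lane accesses
    // (data + 2 * page_size) - 4096 = data + page_size: the first byte of
    // the inaccessible page.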
8919 
8920     __ Mov(x19, base_address);
8921     if ((offs_size == kSRegSize) && (esize_in_bits == kDRegSize)) {
8922       // In this case, the index values are sign- or zero-extended from 32 to
8923       // 64 bits. Assign an arbitrary value to the top 32 bits to check that
8924       // only the low 32 bits are used as the index.
8925       index |= 0x1234567800000000;
8926     }
8927 
8928     index -= index_offset * (elements_per_access - 1);
8929     __ Index(z17.WithLaneSize(esize_in_bits), index, index_offset);
8930 
8931     // Scalar plus vector mode.
8932     (masm.*
8933      ldff1)(z0.WithLaneSize(esize_in_bits),
8934             all.Zeroing(),
8935             SVEMemOperand(x19, z17.WithLaneSize(esize_in_bits), mod, shift));
8936   }
8937 
8938   __ Rdffrs(p0.VnB(), all.Zeroing());
8939 
8940   // Execute another Ldff1 with no offset, so that every element could be
8941   // read. It should respect FFR, and load no more than we loaded the
8942   // first time.
8943   (masm.*
8944    ldff1)(z16.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x20));
8945   __ Rdffrs(p1.VnB(), all.Zeroing());
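  // Compute max(0, CNTP(p1) - CNTP(p0)); Uqdecp saturates at zero, so x0
  // holds the number of lanes (if any) by which the FFR grew between reads.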
8946   __ Cntp(x0, all, p1.VnB());
8947   __ Uqdecp(x0, p0.VnB());
8948   __ Add(ffr_grow_count, ffr_grow_count, x0);
8949 
8950   // Use the FFR to predicate the normal load. If it wasn't properly set,
8951   // the normal load will abort.
8952   (masm.*ld1)(z16.WithLaneSize(esize_in_bits),
8953               p0.Zeroing(),
8954               SVEMemOperand(x20, x21, LSL, msize_in_bytes_log2));
8955 
8956   // Work out the address after the one that was just accessed.
8957   __ Incp(x21, p0.WithLaneSize(esize_in_bits));
8958   __ Add(x0, x20, Operand(x21, LSL, msize_in_bytes_log2));
8959   __ Cmp(limit, x0);
8960   __ Csel(limit, limit, x0, hs);
8961 
8962   // Clear lanes inactive in FFR. These have an undefined result.
8963   __ Not(p0.VnB(), all.Zeroing(), p0.VnB());
8964   __ Mov(z0.WithLaneSize(esize_in_bits), p0.Merging(), 0);
8965 
8966   END();
8967 
8968   if (CAN_RUN()) {
8969     RUN();
8970 
8971     uintptr_t expected_limit = data + page_size;
8972     uintptr_t measured_limit = core.xreg(limit.GetCode());
8973     VIXL_CHECK(measured_limit <= expected_limit);
8974     if (measured_limit < expected_limit) {
8975       // We can't fail the test for this case, but a warning is helpful for
8976       // manually-run tests.
8977       printf(
8978           "WARNING: All fault-tolerant loads detected faults before the\n"
8979           "expected limit. This is architecturally possible, but improbable,\n"
8980           "and could be a symptom of another problem.\n");
8981     }
8982 
8983     ASSERT_EQUAL_64(0, ffr_grow_count);
8984 
8985     ASSERT_EQUAL_SVE(z0.WithLaneSize(esize_in_bits),
8986                      z16.WithLaneSize(esize_in_bits));
8987   }
8988 }
8989 
8990 TEST_SVE(sve_ldff1_scalar_plus_scalar) {
8991   size_t page_size = sysconf(_SC_PAGE_SIZE);
8992   VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
8993 
8994   // Allocate two pages, then mprotect the second one to make it inaccessible.
8995   uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
8996                                                     page_size * 2,
8997                                                     PROT_READ | PROT_WRITE,
8998                                                     MAP_PRIVATE | MAP_ANONYMOUS,
8999                                                     -1,
9000                                                     0));
9001   mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9002 
9003   // Fill the accessible page with arbitrary data.
9004   for (size_t i = 0; i < page_size; i++) {
9005     // Reverse bits so we get a mixture of positive and negative values.
9006     uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9007     memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9008   }
9009 
9010   auto ldff1_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
9011                                                 config,
9012                                                 data,
9013                                                 std::placeholders::_1,
9014                                                 std::placeholders::_2,
9015                                                 CPURegister::kRegister,
9016                                                 std::placeholders::_3,
9017                                                 std::placeholders::_4,
9018                                                 NO_SHIFT,
9019                                                 false);
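  // The bound placeholders map to Ldff1Helper's (msize_in_bits, esize_in_bits,
  // ldff1, ld1) parameters, so each call below reads as
  // helper(msize, esize, ldff1_macro, ld1_macro).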
9020 
9021   Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9022   Ld1Macro ld1b = &MacroAssembler::Ld1b;
9023   ldff1_unscaled_offset_helper(kBRegSize, kBRegSize, ldff1b, ld1b);
9024   ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1b, ld1b);
9025   ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1b, ld1b);
9026   ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1b, ld1b);
9027 
9028   Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9029   Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9030   ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1sb, ld1sb);
9031   ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1sb, ld1sb);
9032   ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1sb, ld1sb);
9033 
9034   auto ldff1_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
9035                                               config,
9036                                               data,
9037                                               std::placeholders::_1,
9038                                               std::placeholders::_2,
9039                                               CPURegister::kRegister,
9040                                               std::placeholders::_3,
9041                                               std::placeholders::_4,
9042                                               LSL,
9043                                               true);
9044 
9045   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9046   Ld1Macro ld1h = &MacroAssembler::Ld1h;
9047   ldff1_scaled_offset_helper(kHRegSize, kHRegSize, ldff1h, ld1h);
9048   ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1h, ld1h);
9049   ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1h, ld1h);
9050 
9051   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9052   Ld1Macro ld1w = &MacroAssembler::Ld1w;
9053   ldff1_scaled_offset_helper(kSRegSize, kSRegSize, ldff1w, ld1w);
9054   ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1w, ld1w);
9055 
9056   Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9057   Ld1Macro ld1d = &MacroAssembler::Ld1d;
9058   ldff1_scaled_offset_helper(kDRegSize, kDRegSize, ldff1d, ld1d);
9059 
9060   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9061   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9062   ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1sh, ld1sh);
9063   ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1sh, ld1sh);
9064 
9065   Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9066   Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9067   ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1sw, ld1sw);
9068 
9069   munmap(reinterpret_cast<void*>(data), page_size * 2);
9070 }
9071 
9072 static void sve_ldff1_scalar_plus_vector_32_scaled_offset(Test* config,
9073                                                           uintptr_t data) {
9074   auto ldff1_32_scaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
9075                                                  config,
9076                                                  data,
9077                                                  std::placeholders::_1,
9078                                                  kSRegSize,
9079                                                  CPURegister::kZRegister,
9080                                                  std::placeholders::_2,
9081                                                  std::placeholders::_3,
9082                                                  std::placeholders::_4,
9083                                                  true);
9084   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9085   Ld1Macro ld1h = &MacroAssembler::Ld1h;
9086   ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9087   ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
9088 
9089   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9090   Ld1Macro ld1w = &MacroAssembler::Ld1w;
9091   ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9092   ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
9093 
9094   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9095   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9096   ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9097   ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
9098 }
9099 
9100 static void sve_ldff1_scalar_plus_vector_32_unscaled_offset(Test* config,
9101                                                             uintptr_t data) {
9102   auto ldff1_32_unscaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
9103                                                    config,
9104                                                    data,
9105                                                    std::placeholders::_1,
9106                                                    kSRegSize,
9107                                                    CPURegister::kZRegister,
9108                                                    std::placeholders::_2,
9109                                                    std::placeholders::_3,
9110                                                    std::placeholders::_4,
9111                                                    false);
9112 
9113   Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9114   Ld1Macro ld1b = &MacroAssembler::Ld1b;
9115   ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9116   ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
9117 
9118   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9119   Ld1Macro ld1h = &MacroAssembler::Ld1h;
9120   ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9121   ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
9122 
9123   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9124   Ld1Macro ld1w = &MacroAssembler::Ld1w;
9125   ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9126   ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
9127 
9128   Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9129   Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9130   ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9131   ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
9132 
9133   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9134   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9135   ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9136   ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
9137 }
9138 
9139 static void sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(
9140     Test* config, uintptr_t data) {
9141   auto ldff1_32_unpacked_scaled_offset_helper =
9142       std::bind(&Ldff1Helper<Extend>,
9143                 config,
9144                 data,
9145                 std::placeholders::_1,
9146                 kDRegSize,
9147                 CPURegister::kZRegister,
9148                 std::placeholders::_2,
9149                 std::placeholders::_3,
9150                 std::placeholders::_4,
9151                 true);
9152 
9153   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9154   Ld1Macro ld1h = &MacroAssembler::Ld1h;
9155   ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9156   ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
9157 
9158   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9159   Ld1Macro ld1w = &MacroAssembler::Ld1w;
9160   ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9161   ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
9162 
9163   Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9164   Ld1Macro ld1d = &MacroAssembler::Ld1d;
9165   ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9166   ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
9167 
9168   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9169   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9170   ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9171   ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
9172 
9173   Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9174   Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9175   ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9176   ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
9177 }
9178 
9179 static void sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(
9180     Test* config, uintptr_t data) {
9181   auto ldff1_32_unpacked_unscaled_offset_helper =
9182       std::bind(&Ldff1Helper<Extend>,
9183                 config,
9184                 data,
9185                 std::placeholders::_1,
9186                 kDRegSize,
9187                 CPURegister::kZRegister,
9188                 std::placeholders::_2,
9189                 std::placeholders::_3,
9190                 std::placeholders::_4,
9191                 false);
9192 
9193   Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9194   Ld1Macro ld1b = &MacroAssembler::Ld1b;
9195   ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9196   ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
9197 
9198   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9199   Ld1Macro ld1h = &MacroAssembler::Ld1h;
9200   ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9201   ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
9202 
9203   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9204   Ld1Macro ld1w = &MacroAssembler::Ld1w;
9205   ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9206   ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
9207 
9208   Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9209   Ld1Macro ld1d = &MacroAssembler::Ld1d;
9210   ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9211   ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
9212 
9213   Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9214   Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9215   ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9216   ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
9217 
9218   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9219   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9220   ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9221   ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
9222 
9223   Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9224   Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9225   ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9226   ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
9227 }
9228 
9229 static void sve_ldff1_scalar_plus_vector_64_scaled_offset(Test* config,
9230                                                           uintptr_t data) {
9231   auto ldff1_64_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
9232                                                  config,
9233                                                  data,
9234                                                  std::placeholders::_1,
9235                                                  kDRegSize,
9236                                                  CPURegister::kZRegister,
9237                                                  std::placeholders::_2,
9238                                                  std::placeholders::_3,
9239                                                  LSL,
9240                                                  true);
9241 
9242   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9243   Ld1Macro ld1h = &MacroAssembler::Ld1h;
9244   ldff1_64_scaled_offset_helper(kHRegSize, ldff1h, ld1h);
9245 
9246   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9247   Ld1Macro ld1w = &MacroAssembler::Ld1w;
9248   ldff1_64_scaled_offset_helper(kSRegSize, ldff1w, ld1w);
9249 
9250   Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9251   Ld1Macro ld1d = &MacroAssembler::Ld1d;
9252   ldff1_64_scaled_offset_helper(kDRegSize, ldff1d, ld1d);
9253 
9254   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9255   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9256   ldff1_64_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9257 
9258   Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9259   Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9260   ldff1_64_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9261 }
9262 
9263 static void sve_ldff1_scalar_plus_vector_64_unscaled_offset(Test* config,
9264                                                             uintptr_t data) {
9265   auto ldff1_64_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
9266                                                    config,
9267                                                    data,
9268                                                    std::placeholders::_1,
9269                                                    kDRegSize,
9270                                                    CPURegister::kZRegister,
9271                                                    std::placeholders::_2,
9272                                                    std::placeholders::_3,
9273                                                    NO_SHIFT,
9274                                                    false);
9275 
9276   Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9277   Ld1Macro ld1b = &MacroAssembler::Ld1b;
9278   ldff1_64_unscaled_offset_helper(kBRegSize, ldff1b, ld1b);
9279 
9280   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9281   Ld1Macro ld1h = &MacroAssembler::Ld1h;
9282   ldff1_64_unscaled_offset_helper(kHRegSize, ldff1h, ld1h);
9283 
9284   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9285   Ld1Macro ld1w = &MacroAssembler::Ld1w;
9286   ldff1_64_unscaled_offset_helper(kSRegSize, ldff1w, ld1w);
9287 
9288   Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9289   Ld1Macro ld1d = &MacroAssembler::Ld1d;
9290   ldff1_64_unscaled_offset_helper(kDRegSize, ldff1d, ld1d);
9291 
9292   Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9293   Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9294   ldff1_64_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb);
9295 
9296   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9297   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9298   ldff1_64_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9299 
9300   Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9301   Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9302   ldff1_64_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9303 }
9304 
9305 TEST_SVE(sve_ldff1_scalar_plus_vector) {
9306   size_t page_size = sysconf(_SC_PAGE_SIZE);
9307   VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9308 
9309   // Allocate two pages, then mprotect the second one to make it inaccessible.
9310   uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9311                                                     page_size * 2,
9312                                                     PROT_READ | PROT_WRITE,
9313                                                     MAP_PRIVATE | MAP_ANONYMOUS,
9314                                                     -1,
9315                                                     0));
9316   mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9317 
9318   // Fill the accessible page with arbitrary data.
9319   for (size_t i = 0; i < page_size; i++) {
9320     // Reverse bits so we get a mixture of positive and negative values.
9321     uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9322     memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9323   }
9324 
9325   sve_ldff1_scalar_plus_vector_32_scaled_offset(config, data);
9326   sve_ldff1_scalar_plus_vector_32_unscaled_offset(config, data);
9327   sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(config, data);
9328   sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(config, data);
9329   sve_ldff1_scalar_plus_vector_64_scaled_offset(config, data);
9330   sve_ldff1_scalar_plus_vector_64_unscaled_offset(config, data);
9331 
9332   munmap(reinterpret_cast<void*>(data), page_size * 2);
9333 }
9334 
9335 TEST_SVE(sve_ldnf1) {
9336   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
9337                           CPUFeatures::kNEON,
9338                           CPUFeatures::kFP);
9339   START();
9340 
9341   size_t page_size = sysconf(_SC_PAGE_SIZE);
9342   VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9343 
9344   // Allocate two pages, fill them with data, then mprotect the second one to
9345   // make it inaccessible.
9346   uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9347                                                     page_size * 2,
9348                                                     PROT_READ | PROT_WRITE,
9349                                                     MAP_PRIVATE | MAP_ANONYMOUS,
9350                                                     -1,
9351                                                     0));
9352 
9353   // Fill the pages with arbitrary data.
9354   for (size_t i = 0; i < page_size; i++) {
9355     // Reverse bits so we get a mixture of positive and negative values.
9356     uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9357     memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9358   }
9359 
9360   mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9361 
9362   __ Setffr();
9363   __ Ptrue(p0.VnB());
9364   __ Dup(z10.VnB(), 0);
9365 
9366   // Materialise an address that points to the last eight unprotected bytes.
9367   __ Mov(x0, data + page_size - (kQRegSizeInBytes / kBRegSizeInBytes) / 2);
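  // Here (kQRegSizeInBytes / kBRegSizeInBytes) / 2 = (16 / 1) / 2 = 8 bytes.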
9368 
9369   // Load, non-faulting, a vector of bytes from x0. At most, eight bytes will be
9370   // loaded, the rest being in a protected page.
9371   __ Ldnf1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
9372   __ Rdffr(p1.VnB());
9373   __ Setffr();
9374 
9375   // Create references using the FFR value in p1 to zero the undefined lanes.
9376   __ Sel(z0.VnB(), p1, z0.VnB(), z10.VnB());
9377   __ Ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0));
9378 
9379   // Repeat for larger elements and different addresses, giving different FFR
9380   // results.
9381   __ Add(x1, x0, 1);
9382   __ Ldnf1h(z1.VnH(), p0.Zeroing(), SVEMemOperand(x1));
9383   __ Rdffr(p1.VnB());
9384   __ Setffr();
9385   __ Sel(z1.VnH(), p1, z1.VnH(), z10.VnH());
9386   __ Ld1h(z21.VnH(), p1.Zeroing(), SVEMemOperand(x1));
9387 
9388   __ Add(x1, x0, 2);
9389   __ Ldnf1w(z2.VnS(), p0.Zeroing(), SVEMemOperand(x1));
9390   __ Rdffr(p1.VnB());
9391   __ Setffr();
9392   __ Sel(z2.VnS(), p1, z2.VnS(), z10.VnS());
9393   __ Ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x1));
9394 
9395   __ Sub(x1, x0, 1);
9396   __ Ldnf1d(z3.VnD(), p0.Zeroing(), SVEMemOperand(x1));
9397   __ Rdffr(p1.VnB());
9398   __ Setffr();
9399   __ Sel(z3.VnD(), p1, z3.VnD(), z10.VnD());
9400   __ Ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x1));
9401 
9402   // Load from the previous VL-sized area of memory. All of this should be in
9403   // the accessible page.
9404   __ Ldnf1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9405   __ Rdffr(p1.VnB());
9406   __ Setffr();
9407   __ Sel(z4.VnB(), p1, z4.VnB(), z10.VnB());
9408   __ Ld1b(z24.VnB(), p1.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9409 
9410   // Repeat partial load for larger element size.
9411   __ Mov(x0, data + page_size - (kQRegSizeInBytes / kSRegSizeInBytes) / 2);
9412   __ Ldnf1b(z5.VnS(), p0.Zeroing(), SVEMemOperand(x0));
9413   __ Rdffr(p1.VnB());
9414   __ Setffr();
9415   __ Sel(z5.VnS(), p1, z5.VnS(), z10.VnS());
9416   __ Ld1b(z25.VnS(), p1.Zeroing(), SVEMemOperand(x0));
9417 
9418   // Repeat for sign extension.
9419   __ Mov(x0, data + page_size - (kQRegSizeInBytes / kHRegSizeInBytes) / 2);
9420   __ Ldnf1sb(z6.VnH(), p0.Zeroing(), SVEMemOperand(x0));
9421   __ Rdffr(p1.VnB());
9422   __ Setffr();
9423   __ Sel(z6.VnH(), p1, z6.VnH(), z10.VnH());
9424   __ Ld1sb(z26.VnH(), p1.Zeroing(), SVEMemOperand(x0));
9425 
9426   END();
9427 
9428   if (CAN_RUN()) {
9429     RUN();
9430     ASSERT_EQUAL_SVE(z20, z0);
9431     ASSERT_EQUAL_SVE(z21, z1);
9432     ASSERT_EQUAL_SVE(z22, z2);
9433     ASSERT_EQUAL_SVE(z23, z3);
9434     ASSERT_EQUAL_SVE(z24, z4);
9435     ASSERT_EQUAL_SVE(z25, z5);
9436     ASSERT_EQUAL_SVE(z26, z6);
9437   }
9438 
9439   munmap(reinterpret_cast<void*>(data), page_size * 2);
9440 }
9441 
9442 // Emphasise testing whether the modifiers are propagated and simulated correctly.
9443 TEST_SVE(sve_ldff1_regression_test) {
9444   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9445   START();
9446 
9447   size_t page_size = sysconf(_SC_PAGE_SIZE);
9448   VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9449 
9450   uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9451                                                     page_size * 2,
9452                                                     PROT_READ | PROT_WRITE,
9453                                                     MAP_PRIVATE | MAP_ANONYMOUS,
9454                                                     -1,
9455                                                     0));
9456   uintptr_t middle = data + page_size;
9457   // Fill the accessible page with arbitrary data.
9458   for (size_t i = 0; i < page_size; i++) {
9459     // Reverse bits so we get a mixture of positive and negative values.
9460     uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9461     memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
9462     // Make roughly one bit differ in every byte, and copy the bytes in the
9463     // reverse direction, which is convenient for verifying loads at negative
9464     // indexes.
9465     byte += 1;
9466     memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9467   }
9468 
9469   PRegister all = p6;
9470   __ Ptrue(all.VnB());
9471 
9472   __ Mov(x0, middle);
9473   __ Index(z31.VnS(), 0, 3);
9474   __ Neg(z30.VnS(), z31.VnS());
9475 
9476   __ Setffr();
9477 
9478   // Scalar plus vector 32 unscaled offset
9479   __ Ldff1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9480   __ Ldff1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9481   __ Ldff1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9482   __ Ldff1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9483   __ Ldff1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9484 
9485   // Scalar plus vector 32 scaled offset
9486   __ Ldff1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9487   __ Ldff1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9488   __ Ldff1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9489 
9490   __ Index(z31.VnD(), 0, 3);
9491   __ Neg(z30.VnD(), z31.VnD());
9492 
9493   // Ensure only the low 32 bits are used for testing with positive index
9494   // values. This also tests that `uxtw` treats the indexes as unsigned.
9495   __ Mov(x3, 0x8000000080000000);
9496   __ Dup(z28.VnD(), x3);
9497   __ Sub(x2, x0, 0x80000000);
9498   __ Add(z29.VnD(), z31.VnD(), z28.VnD());
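  // Each 64-bit lane of z29 is now 0x8000000080000000 + (3 * i). With `uxtw`,
  // only the low 32 bits (0x80000000 + (3 * i)) are used, so the accessed
  // address is x2 + 0x80000000 + (3 * i) = x0 + (3 * i).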
9499 
9500   // Scalar plus vector 32 unpacked unscaled offset
9501   __ Ldff1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9502   __ Ldff1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9503   __ Ldff1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9504   __ Ldff1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9505   __ Ldff1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9506   __ Ldff1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9507 
9508   // Scalar plus vector 32 unpacked scaled offset
9509   __ Ldff1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9510   __ Ldff1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9511   __ Ldff1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9512   __ Ldff1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9513   __ Ldff1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9514 
9515   __ Sub(x0, x0, x3);
9516   // Note that `0x8000000080000000` has been added to the positive indexes. The
9517   // wrong address will be accessed if the index is treated as negative.
9518 
9519   // Scalar plus vector 64 unscaled offset
9520   __ Ldff1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9521   __ Ldff1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9522   __ Ldff1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9523   __ Ldff1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9524   __ Ldff1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9525 
9526   // Scalar plus vector 64 scaled offset
9527   __ Lsr(z29.VnD(), z28.VnD(), 1);  // Shift right to 0x4000000040000000
9528   __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9529   __ Ldff1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9530   __ Ldff1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9531 
9532   __ Lsr(z29.VnD(), z29.VnD(), 1);  // Shift right to 0x2000000020000000
9533   __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9534   __ Ldff1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9535   __ Ldff1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9536 
9537   __ Lsr(z29.VnD(), z29.VnD(), 1);  // Shift right to 0x1000000010000000
9538   __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9539   __ Ldff1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9540 
9541   __ Rdffr(p1.VnB());
9542   __ Cntp(x10, all, p1.VnB());
9543 
9544   END();
9545 
9546   if (CAN_RUN()) {
9547     RUN();
9548 
9549     int64_t loaded_data_in_bytes = core.xreg(x10.GetCode());
9550     // Only check 128 bits in this test.
9551     if (loaded_data_in_bytes < kQRegSizeInBytes) {
9552       // Report a warning when fault-tolerant loads fault before all the
9553       // expected loads have completed.
9554       printf(
9555           "WARNING: Fault-tolerant loads detected faults before the "
9556           "expected loads completed.\n");
9557       return;
9558     }
9559 
9560     // Scalar plus vector 32 unscaled offset
9561     uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9562     uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9563     uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9564     uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9565     uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9566 
9567     ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9568     ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9569     ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9570     ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9571     ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9572 
9573     // Scalar plus vector 32 scaled offset
9574     uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9575     uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9576     uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9577 
9578     ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9579     ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9580     ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9581 
9582     // Scalar plus vector 32 unpacked unscaled offset
9583     uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9584     uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9585     uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9586     uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9587     uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9588     uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9589 
9590     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9591     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9592     ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9593     ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9594     ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9595     ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9596 
9597     // Scalar plus vector 32 unpacked scaled offset
9598     uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9599     uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9600     uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9601     uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9602     uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9603 
9604     ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9605     ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9606     ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9607     ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9608     ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9609 
9610     // Scalar plus vector 64 unscaled offset
9611     uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9612     uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9613     uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9614     uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9615     uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9616 
9617     ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9618     ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9619     ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9620     ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9621     ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9622 
9623     uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9624     uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9625     uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9626     uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9627     uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9628 
9629     // Scalar plus vector 64 scaled offset
9630     ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9631     ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9632     ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9633     ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9634     ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9635   }
9636 }
9637 
9638 // Emphasise testing whether the modifiers are propagated and simulated correctly.
9639 TEST_SVE(sve_ld1_regression_test) {
9640   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9641   START();
9642 
9643   size_t page_size = sysconf(_SC_PAGE_SIZE);
9644   VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9645 
9646   uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9647                                                     page_size * 2,
9648                                                     PROT_READ | PROT_WRITE,
9649                                                     MAP_PRIVATE | MAP_ANONYMOUS,
9650                                                     -1,
9651                                                     0));
9652   uintptr_t middle = data + page_size;
9653   // Fill the accessible page with arbitrary data.
9654   for (size_t i = 0; i < page_size; i++) {
9655     // Reverse bits so we get a mixture of positive and negative values.
9656     uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9657     memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
9658     // Make roughly one bit differ in every byte, and copy the bytes in the
9659     // reverse direction, which is convenient for verifying loads at negative
9660     // indexes.
9661     byte += 1;
9662     memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9663   }
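  // For example, after this loop `middle[1]` holds ReverseBits(1) == 0x80 and
  // `middle[-1]` holds 0x81, while `middle[0]` holds 0x01 because the mirror
  // write at i == 0 overwrites the forward one.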
9664 
9665   PRegister all = p6;
9666   __ Ptrue(all.VnB());
9667 
9668   __ Mov(x0, middle);
9669   __ Index(z31.VnS(), 0, 3);
9670   __ Neg(z30.VnS(), z31.VnS());
9671 
9672   // Scalar plus vector 32 unscaled offset
9673   __ Ld1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9674   __ Ld1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9675   __ Ld1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9676   __ Ld1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9677   __ Ld1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9678 
9679   // Scalar plus vector 32 scaled offset
9680   __ Ld1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9681   __ Ld1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9682   __ Ld1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9683 
9684   __ Index(z31.VnD(), 0, 3);
9685   __ Neg(z30.VnD(), z31.VnD());
9686 
9687   // Ensure that only the low 32 bits are used when testing with positive index
9688   // values. This also tests that indexes are treated as positive in `uxtw` form.
9689   __ Mov(x3, 0x8000000080000000);
9690   __ Dup(z28.VnD(), x3);
9691   __ Sub(x2, x0, 0x80000000);
9692   __ Add(z29.VnD(), z31.VnD(), z28.VnD());
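  // Worked example for lane 1 (index 3): the `uxtw` form uses only the low 32
  // bits of z29, so the address is x2 + 0x80000003, which folds back to
  // middle + 3. An `sxtw` interpretation would subtract from x2 instead, and
  // access the wrong address.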
9693 
9694   // Scalar plus vector 32 unpacked unscaled offset
9695   __ Ld1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9696   __ Ld1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9697   __ Ld1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9698   __ Ld1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9699   __ Ld1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9700   __ Ld1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9701 
9702   // Scalar plus vector 32 unpacked scaled offset
9703   __ Ld1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9704   __ Ld1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9705   __ Ld1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9706   __ Ld1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9707   __ Ld1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9708 
9709   __ Sub(x0, x0, x3);
9710   // Note that `0x8000000080000000` has been added to the positive indexes, so
9711   // the wrong address will be accessed if an index is treated as negative.
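  // Worked example for lane 1 (index 3): x0 is now middle - 0x8000000080000000
  // modulo 2^64, and the lane value is 0x8000000080000003, so the sum wraps
  // back to middle + 3.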
9712 
9713   // Scalar plus vector 64 unscaled offset
9714   __ Ld1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9715   __ Ld1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9716   __ Ld1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9717   __ Ld1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9718   __ Ld1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9719 
9720   // Scalar plus vector 64 scaled offset
9721   __ Lsr(z29.VnD(), z28.VnD(), 1);  // Shift right to 0x4000000040000000
9722   __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9723   __ Ld1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9724   __ Ld1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9725 
9726   __ Lsr(z29.VnD(), z29.VnD(), 1);  // Shift right to 0x2000000020000000
9727   __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9728   __ Ld1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9729   __ Ld1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9730 
9731   __ Lsr(z29.VnD(), z29.VnD(), 1);  // Shift right to 0x1000000010000000
9732   __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9733   __ Ld1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9734 
9735   END();
9736 
9737   if (CAN_RUN()) {
9738     RUN();
9739 
9740     // Scalar plus vector 32 unscaled offset
9741     uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9742     uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9743     uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9744     uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9745     uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9746 
9747     ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9748     ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9749     ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9750     ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9751     ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9752 
9753     // Scalar plus vector 32 scaled offset
9754     uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9755     uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9756     uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9757 
9758     ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9759     ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9760     ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9761 
9762     // Scalar plus vector 32 unpacked unscaled offset
9763     uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9764     uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9765     uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9766     uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9767     uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9768     uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9769 
9770     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9771     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9772     ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9773     ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9774     ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9775     ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9776 
9777     // Scalar plus vector 32 unpacked scaled offset
9778     uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9779     uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9780     uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9781     uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9782     uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9783 
9784     ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9785     ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9786     ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9787     ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9788     ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9789 
9790     // Scalar plus vector 64 unscaled offset
9791     uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9792     uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9793     uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9794     uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9795     uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9796 
9797     ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9798     ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9799     ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9800     ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9801     ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9802 
9803     uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9804     uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9805     uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9806     uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9807     uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9808 
9809     // Scalar plus vector 64 scaled offset
9810     ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9811     ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9812     ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9813     ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9814     ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9815   }
9816 }
9817 
9818 // Test gather loads by comparing them with the result of a set of equivalent
9819 // scalar loads.
9820 template <typename T>
9821 static void GatherLoadScalarPlusVectorHelper(Test* config,
9822                                              unsigned msize_in_bits,
9823                                              unsigned esize_in_bits,
9824                                              Ld1Macro ld1,
9825                                              Ld1Macro ldff1,
9826                                              T mod,
9827                                              bool is_signed,
9828                                              bool is_scaled) {
9829   // SVE supports 32- and 64-bit addressing for gather loads.
9830   VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9831   static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9832 
9833   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9834   START();
9835 
9836   unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
9837   int vl = config->sve_vl_in_bytes();
9838 
9839   uint64_t addresses[kMaxLaneCount];
9840   uint64_t offsets[kMaxLaneCount];
9841   uint64_t max_address = 0;
9842   uint64_t buffer_size = vl * 64;
9843   uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
9844   // Fill the buffer with arbitrary data, and generate random addresses and
9845   // offsets into the buffer, returning them through the argument list.
9846   BufferFillingHelper(data,
9847                       buffer_size,
9848                       msize_in_bytes,
9849                       kMaxLaneCount,
9850                       offsets,
9851                       addresses,
9852                       &max_address);
9853 
9854   ZRegister zn = z0.WithLaneSize(esize_in_bits);
9855   ZRegister zt_ref = z1.WithLaneSize(esize_in_bits);
9856   ZRegister zt = z2.WithLaneSize(esize_in_bits);
9857   ZRegister zt_ff = z3.WithLaneSize(esize_in_bits);
9858   PRegisterWithLaneSize pg_ff = p1.WithLaneSize(esize_in_bits);
9859   PRegisterWithLaneSize pg_diff = p2.WithLaneSize(esize_in_bits);
9860 
9861   int shift = 0;
9862   if (is_scaled) {
9863     shift = std::log2(msize_in_bytes);
9864     for (unsigned i = 0; i < kMaxLaneCount; i++) {
9865       // Ensure the offsets are a multiple of the scale factor of the
9866       // operation.
9867       offsets[i] = (offsets[i] >> shift) << shift;
9868       addresses[i] = data + offsets[i];
9869     }
9870   }
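  // For example, with a 32-bit memory access size, shift == 2 and an offset of
  // 0x2f is aligned down to 0x2c.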
9871 
9872   PRegister all = p6;
9873   __ Ptrue(all.WithLaneSize(esize_in_bits));
9874 
9875   PRegisterZ pg = p0.Zeroing();
9876   Initialise(&masm,
9877              pg,
9878              0x9abcdef012345678,
9879              0xabcdef0123456789,
9880              0xf4f3f1f0fefdfcfa,
9881              0xf9f8f6f5f3f2f1ff);
9882 
9883   __ Mov(x0, data);
9884 
9885   // Generate a reference result using a sequence of equivalent scalar loads.
9886   ScalarLoadHelper(&masm,
9887                    vl,
9888                    addresses,
9889                    zt_ref,
9890                    pg,
9891                    esize_in_bits,
9892                    msize_in_bits,
9893                    is_signed);
9894 
9895   InsrHelper(&masm, zn, offsets);
9896   if (is_scaled) {
9897     // Scale down the offsets if testing scaled-offset operation.
9898     __ Lsr(zn, zn, shift);
9899   }
9900 
9901   (masm.*ld1)(zt, pg, SVEMemOperand(x0, zn, mod, shift));
9902 
9903   Register ffr_check_count = x17;
9904   __ Mov(ffr_check_count, 0);
9905 
9906   // Test the correctness of the data gathered from the different addresses.
9907   // The first-fault behaviour is tested more thoroughly in `Ldff1Helper`.
9908   __ Setffr();
9909   (masm.*ldff1)(zt_ff, pg, SVEMemOperand(x0, zn, mod, shift));
9910 
9911   // Compare these two vector registers, and accumulate the number of differing
9912   // lanes into `ffr_check_count`.
9913   __ Rdffrs(pg_ff.VnB(), all.Zeroing());
9914   __ Cmpeq(pg_diff, all.Zeroing(), zt_ref, zt_ff);
9915   __ Eor(pg_diff.VnB(), all.Zeroing(), pg_diff.VnB(), pg_ff.VnB());
9916   __ Incp(ffr_check_count, pg_diff);
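  // A scalar sketch of this check: for each lane i,
  //   if ((zt_ref[i] == zt_ff[i]) != FFR[i]) ffr_check_count++;
  // so the count is zero only if every FFR-valid lane matches the reference,
  // and every invalid lane does not.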
9917 
9918   END();
9919 
9920   if (CAN_RUN()) {
9921     RUN();
9922 
9923     ASSERT_EQUAL_SVE(zt_ref, zt);
9924     ASSERT_EQUAL_64(0, ffr_check_count);
9925   }
9926 
9927   free(reinterpret_cast<void*>(data));
9928 }
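
// A scalar model of the scalar-plus-vector address computation exercised by
// this helper. This is an illustrative sketch only (`GatherLaneAddress` is a
// hypothetical name, not used by the tests): the extend is applied before the
// scaling shift, and all arithmetic wraps modulo 2^64.
static inline uint64_t GatherLaneAddress(uint64_t base,
                                         uint64_t zm_lane,
                                         Extend mod,
                                         unsigned shift) {
  uint64_t offset = zm_lane;  // The 64-bit offset forms use the lane as-is.
  if (mod == UXTW) {
    offset = static_cast<uint32_t>(zm_lane);  // Zero-extend the low 32 bits.
  } else if (mod == SXTW) {
    // Sign-extend the low 32 bits.
    offset = static_cast<uint64_t>(
        static_cast<int64_t>(static_cast<int32_t>(zm_lane)));
  }
  return base + (offset << shift);
}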
9929 
9930 // Test gather loads by comparing them with the result of a set of equivalent
9931 // scalar loads.
9932 template <typename F>
9933 static void GatherLoadScalarPlusScalarOrImmHelper(Test* config,
9934                                                   unsigned msize_in_bits,
9935                                                   unsigned esize_in_bits,
9936                                                   F sve_ld1,
9937                                                   bool is_signed) {
9938   // SVE supports 32- and 64-bit addressing for gather loads.
9939   VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9940   static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9941 
9942   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9943   START();
9944 
9945   unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
9946   int vl = config->sve_vl_in_bytes();
9947 
9948   uint64_t addresses[kMaxLaneCount];
9949   uint64_t offsets[kMaxLaneCount];
9950   uint64_t max_address = 0;
9951   uint64_t buffer_size = vl * 64;
9952   uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
9953   BufferFillingHelper(data,
9954                       buffer_size,
9955                       msize_in_bytes,
9956                       kMaxLaneCount,
9957                       offsets,
9958                       addresses,
9959                       &max_address);
9960 
9961   // Maximised offsets, to ensure that the address calculation is modulo-2^64,
9962   // and that the vector addresses are not sign-extended.
9963   uint64_t uint_e_max = (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
9964   uint64_t maxed_offsets[kMaxLaneCount];
9965   uint64_t maxed_offsets_imm = max_address - uint_e_max;
9966   for (unsigned i = 0; i < kMaxLaneCount; i++) {
9967     maxed_offsets[i] = addresses[i] - maxed_offsets_imm;
9968   }
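  // For the 64-bit case, `maxed_offsets_imm` is `max_address + 1` modulo 2^64,
  // so each `maxed_offsets[i]` is a value close to UINT64_MAX; adding the
  // immediate back must wrap around to recover `addresses[i]` exactly.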
9969 
9970   ZRegister zn = z0.WithLaneSize(esize_in_bits);
9971   ZRegister zt_addresses = z1.WithLaneSize(esize_in_bits);
9972   ZRegister zt_offsets = z2.WithLaneSize(esize_in_bits);
9973   ZRegister zt_maxed = z3.WithLaneSize(esize_in_bits);
9974   ZRegister zt_ref = z4.WithLaneSize(esize_in_bits);
9975 
9976   PRegisterZ pg = p0.Zeroing();
9977   Initialise(&masm,
9978              pg,
9979              0x9abcdef012345678,
9980              0xabcdef0123456789,
9981              0xf4f3f1f0fefdfcfa,
9982              0xf9f8f6f5f3f2f0ff);
9983 
9984   // Execute each load.
9985 
9986   if (esize_in_bits == kDRegSize) {
9987     // Only test `addresses` if we can use 64-bit pointers. InsrHelper will fail
9988     // if any value won't fit in a lane of zn.
9989     InsrHelper(&masm, zn, addresses);
9990     (masm.*sve_ld1)(zt_addresses, pg, SVEMemOperand(zn));
9991   }
9992 
9993   InsrHelper(&masm, zn, offsets);
9994   (masm.*sve_ld1)(zt_offsets, pg, SVEMemOperand(zn, data));
9995 
9996   InsrHelper(&masm, zn, maxed_offsets);
9997   (masm.*sve_ld1)(zt_maxed, pg, SVEMemOperand(zn, maxed_offsets_imm));
9998 
9999   // Generate a reference result using scalar loads.
10000   ScalarLoadHelper(&masm,
10001                    vl,
10002                    addresses,
10003                    zt_ref,
10004                    pg,
10005                    esize_in_bits,
10006                    msize_in_bits,
10007                    is_signed);
10008 
10009   END();
10010 
10011   if (CAN_RUN()) {
10012     RUN();
10013 
10014     if (esize_in_bits == kDRegSize) {
10015       ASSERT_EQUAL_SVE(zt_ref, zt_addresses);
10016     }
10017     ASSERT_EQUAL_SVE(zt_ref, zt_offsets);
10018     ASSERT_EQUAL_SVE(zt_ref, zt_maxed);
10019   }
10020 
10021   free(reinterpret_cast<void*>(data));
10022 }
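
// A minimal self-check of the wrap-around property relied on by the "maxed
// offsets" above, in ordinary unsigned (modulo-2^64) C++ arithmetic. This is
// an illustrative sketch; `MaxedOffsetRoundTrips` is a hypothetical name, not
// used by the tests.
static inline bool MaxedOffsetRoundTrips(uint64_t address, uint64_t imm) {
  uint64_t maxed_offset = address - imm;   // May wrap past zero.
  return (imm + maxed_offset) == address;  // Always holds modulo 2^64.
}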
10023 
10024 TEST_SVE(sve_ld1b_64bit_vector_plus_immediate) {
10025   GatherLoadScalarPlusScalarOrImmHelper(config,
10026                                         kBRegSize,
10027                                         kDRegSize,
10028                                         &MacroAssembler::Ld1b,
10029                                         false);
10030 }
10031 
10032 TEST_SVE(sve_ld1h_64bit_vector_plus_immediate) {
10033   GatherLoadScalarPlusScalarOrImmHelper(config,
10034                                         kHRegSize,
10035                                         kDRegSize,
10036                                         &MacroAssembler::Ld1h,
10037                                         false);
10038 }
10039 
10040 TEST_SVE(sve_ld1w_64bit_vector_plus_immediate) {
10041   GatherLoadScalarPlusScalarOrImmHelper(config,
10042                                         kSRegSize,
10043                                         kDRegSize,
10044                                         &MacroAssembler::Ld1w,
10045                                         false);
10046 }
10047 
10048 TEST_SVE(sve_ld1d_64bit_vector_plus_immediate) {
10049   GatherLoadScalarPlusScalarOrImmHelper(config,
10050                                         kDRegSize,
10051                                         kDRegSize,
10052                                         &MacroAssembler::Ld1d,
10053                                         false);
10054 }
10055 
10056 TEST_SVE(sve_ld1sb_64bit_vector_plus_immediate) {
10057   GatherLoadScalarPlusScalarOrImmHelper(config,
10058                                         kBRegSize,
10059                                         kDRegSize,
10060                                         &MacroAssembler::Ld1sb,
10061                                         true);
10062 }
10063 
10064 TEST_SVE(sve_ld1sh_64bit_vector_plus_immediate) {
10065   GatherLoadScalarPlusScalarOrImmHelper(config,
10066                                         kHRegSize,
10067                                         kDRegSize,
10068                                         &MacroAssembler::Ld1sh,
10069                                         true);
10070 }
10071 
10072 TEST_SVE(sve_ld1sw_64bit_vector_plus_immediate) {
10073   GatherLoadScalarPlusScalarOrImmHelper(config,
10074                                         kSRegSize,
10075                                         kDRegSize,
10076                                         &MacroAssembler::Ld1sw,
10077                                         true);
10078 }
10079 
10080 TEST_SVE(sve_ld1b_32bit_vector_plus_immediate) {
10081   GatherLoadScalarPlusScalarOrImmHelper(config,
10082                                         kBRegSize,
10083                                         kSRegSize,
10084                                         &MacroAssembler::Ld1b,
10085                                         false);
10086 }
10087 
10088 TEST_SVE(sve_ld1h_32bit_vector_plus_immediate) {
10089   GatherLoadScalarPlusScalarOrImmHelper(config,
10090                                         kHRegSize,
10091                                         kSRegSize,
10092                                         &MacroAssembler::Ld1h,
10093                                         false);
10094 }
10095 
10096 TEST_SVE(sve_ld1w_32bit_vector_plus_immediate) {
10097   GatherLoadScalarPlusScalarOrImmHelper(config,
10098                                         kSRegSize,
10099                                         kSRegSize,
10100                                         &MacroAssembler::Ld1w,
10101                                         false);
10102 }
10103 
10104 TEST_SVE(sve_ld1sb_32bit_vector_plus_immediate) {
10105   GatherLoadScalarPlusScalarOrImmHelper(config,
10106                                         kBRegSize,
10107                                         kSRegSize,
10108                                         &MacroAssembler::Ld1sb,
10109                                         true);
10110 }
10111 
10112 TEST_SVE(sve_ld1sh_32bit_vector_plus_immediate) {
10113   GatherLoadScalarPlusScalarOrImmHelper(config,
10114                                         kHRegSize,
10115                                         kSRegSize,
10116                                         &MacroAssembler::Ld1sh,
10117                                         true);
10118 }
10119 
10120 TEST_SVE(sve_ld1_scalar_plus_vector_32_scaled_offset) {
10121   auto ld1_32_scaled_offset_helper =
10122       std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10123                 config,
10124                 std::placeholders::_1,
10125                 kSRegSize,
10126                 std::placeholders::_2,
10127                 std::placeholders::_3,
10128                 std::placeholders::_4,
10129                 std::placeholders::_5,
10130                 true);
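  // The `std::bind` expression is equivalent to a lambda along these lines
  // (a sketch; the name `helper` is illustrative):
  //
  //   auto helper = [config](unsigned msize_in_bits, Ld1Macro ld1,
  //                          Ld1Macro ldff1, Extend mod, bool is_signed) {
  //     GatherLoadScalarPlusVectorHelper(config, msize_in_bits, kSRegSize,
  //                                      ld1, ldff1, mod, is_signed, true);
  //   };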
10131 
10132   Ld1Macro ld1h = &MacroAssembler::Ld1h;
10133   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10134   ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10135   ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10136 
10137   Ld1Macro ld1w = &MacroAssembler::Ld1w;
10138   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10139   ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10140   ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10141 
10142   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10143   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10144   ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10145   ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10146 }
10147 
10148 TEST_SVE(sve_ld1_scalar_plus_vector_32_unscaled_offset) {
10149   auto ld1_32_unscaled_offset_helper =
10150       std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10151                 config,
10152                 std::placeholders::_1,
10153                 kSRegSize,
10154                 std::placeholders::_2,
10155                 std::placeholders::_3,
10156                 std::placeholders::_4,
10157                 std::placeholders::_5,
10158                 false);
10159 
10160   Ld1Macro ld1b = &MacroAssembler::Ld1b;
10161   Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10162   ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, UXTW, false);
10163   ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, SXTW, false);
10164 
10165   Ld1Macro ld1h = &MacroAssembler::Ld1h;
10166   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10167   ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10168   ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10169 
10170   Ld1Macro ld1w = &MacroAssembler::Ld1w;
10171   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10172   ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10173   ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10174 
10175   Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10176   Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10177   ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, UXTW, true);
10178   ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, SXTW, true);
10179 
10180   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10181   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10182   ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10183   ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10184 }
10185 
10186 TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_scaled_offset) {
10187   auto ld1_32_unpacked_scaled_offset_helper =
10188       std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10189                 config,
10190                 std::placeholders::_1,
10191                 kDRegSize,
10192                 std::placeholders::_2,
10193                 std::placeholders::_3,
10194                 std::placeholders::_4,
10195                 std::placeholders::_5,
10196                 true);
10197 
10198   Ld1Macro ld1h = &MacroAssembler::Ld1h;
10199   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10200   ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10201   ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10202 
10203   Ld1Macro ld1w = &MacroAssembler::Ld1w;
10204   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10205   ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10206   ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10207 
10208   Ld1Macro ld1d = &MacroAssembler::Ld1d;
10209   Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10210   ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10211   ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10212 
10213   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10214   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10215   ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10216   ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10217 
10218   Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10219   Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10220   ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10221   ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
10222 }
10223 
10224 TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_unscaled_offset) {
10225   auto ld1_32_unpacked_unscaled_offset_helper =
10226       std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10227                 config,
10228                 std::placeholders::_1,
10229                 kDRegSize,
10230                 std::placeholders::_2,
10231                 std::placeholders::_3,
10232                 std::placeholders::_4,
10233                 std::placeholders::_5,
10234                 false);
10235 
10236   Ld1Macro ld1h = &MacroAssembler::Ld1h;
10237   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10238   ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10239   ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10240 
10241   Ld1Macro ld1w = &MacroAssembler::Ld1w;
10242   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10243   ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10244   ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10245 
10246   Ld1Macro ld1d = &MacroAssembler::Ld1d;
10247   Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10248   ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10249   ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10250 
10251   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10252   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10253   ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10254   ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10255 
10256   Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10257   Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10258   ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10259   ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
10260 }
10261 
10262 TEST_SVE(sve_ld1_scalar_plus_vector_64_scaled_offset) {
10263   auto ld1_64_scaled_offset_helper =
10264       std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10265                 config,
10266                 std::placeholders::_1,
10267                 kDRegSize,
10268                 std::placeholders::_2,
10269                 std::placeholders::_3,
10270                 LSL,
10271                 std::placeholders::_4,
10272                 true);
10273 
10274   Ld1Macro ld1h = &MacroAssembler::Ld1h;
10275   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10276   ld1_64_scaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10277 
10278   Ld1Macro ld1w = &MacroAssembler::Ld1w;
10279   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10280   ld1_64_scaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10281 
10282   Ld1Macro ld1d = &MacroAssembler::Ld1d;
10283   Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10284   ld1_64_scaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10285 
10286   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10287   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10288   ld1_64_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10289 
10290   Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10291   Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10292   ld1_64_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
10293 }
10294 
10295 TEST_SVE(sve_ld1_scalar_plus_vector_64_unscaled_offset) {
10296   auto ld1_64_unscaled_offset_helper =
10297       std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10298                 config,
10299                 std::placeholders::_1,
10300                 kDRegSize,
10301                 std::placeholders::_2,
10302                 std::placeholders::_3,
10303                 NO_SHIFT,
10304                 std::placeholders::_4,
10305                 false);
10306 
10307   Ld1Macro ld1b = &MacroAssembler::Ld1b;
10308   Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10309   ld1_64_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, false);
10310 
10311   Ld1Macro ld1h = &MacroAssembler::Ld1h;
10312   Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10313   ld1_64_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10314 
10315   Ld1Macro ld1w = &MacroAssembler::Ld1w;
10316   Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10317   ld1_64_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10318 
10319   Ld1Macro ld1d = &MacroAssembler::Ld1d;
10320   Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10321   ld1_64_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10322 
10323   Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10324   Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10325   ld1_64_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, true);
10326 
10327   Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10328   Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10329   ld1_64_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10330 
10331   Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10332   Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10333   ld1_64_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
10334 }
10335 
10336 TEST_SVE(sve_ldnt1) {
10337   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10338   START();
10339 
10340   int data_size = kZRegMaxSizeInBytes * 16;
10341   uint8_t* data = new uint8_t[data_size];
10342   for (int i = 0; i < data_size; i++) {
10343     data[i] = i & 0xff;
10344   }
10345 
10346   // Set the base half-way through the buffer so we can use negative indices.
10347   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10348   __ Ptrue(p0.VnB());
10349   __ Punpklo(p1.VnH(), p0.VnB());
10350   __ Punpklo(p2.VnH(), p1.VnB());
10351   __ Punpklo(p3.VnH(), p2.VnB());
10352   __ Punpklo(p4.VnH(), p3.VnB());
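  // Each `Punpklo` halves the density of the set bits: viewed as byte-lane
  // predicates, p0 enables every byte, p1 every 2nd, p2 every 4th, p3 every
  // 8th and p4 every 16th, making them all-true predicates at H, S, D and
  // 128-bit granularity respectively.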
10353 
10354   __ Mov(x1, 42);
10355   __ Ld1b(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10356   __ Ldnt1b(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10357 
10358   __ Mov(x1, -21);
10359   __ Ld1h(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10360   __ Ldnt1h(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10361 
10362   __ Mov(x1, 10);
10363   __ Ld1w(z4.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10364   __ Ldnt1w(z5.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10365 
10366   __ Mov(x1, -5);
10367   __ Ld1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10368   __ Ldnt1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10369 
10370   __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10371   __ Ldnt1b(z9.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10372 
10373   __ Ld1h(z10.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10374   __ Ldnt1h(z11.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10375 
10376   __ Ld1w(z12.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10377   __ Ldnt1w(z13.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10378 
10379   __ Ld1d(z14.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10380   __ Ldnt1d(z15.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10381   END();
10382 
10383   if (CAN_RUN()) {
10384     RUN();
10385     ASSERT_EQUAL_SVE(z0, z1);
10386     ASSERT_EQUAL_SVE(z2, z3);
10387     ASSERT_EQUAL_SVE(z4, z5);
10388     ASSERT_EQUAL_SVE(z6, z7);
10389     ASSERT_EQUAL_SVE(z8, z9);
10390     ASSERT_EQUAL_SVE(z10, z11);
10391     ASSERT_EQUAL_SVE(z12, z13);
10392     ASSERT_EQUAL_SVE(z14, z15);
10393   }
  delete[] data;
10394 }
10395 
10396 TEST_SVE(sve_stnt1) {
10397   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10398   START();
10399 
10400   int data_size = kZRegMaxSizeInBytes * 16;
10401   uint8_t* data = new uint8_t[data_size];
10402 
10403   // Set the base half-way through the buffer so we can use negative indices.
10404   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10405   __ Ptrue(p0.VnB());
10406   __ Punpklo(p1.VnH(), p0.VnB());
10407   __ Punpklo(p2.VnH(), p1.VnB());
10408   __ Punpklo(p3.VnH(), p2.VnB());
10409   __ Punpklo(p4.VnH(), p3.VnB());
10410   __ Dup(z0.VnB(), 0x55);
10411   __ Index(z1.VnB(), 0, 1);
10412 
10413   // Store with all-true and patterned predication, load back, and create a
10414   // reference value for later comparison.
10415   __ Rdvl(x1, 1);
10416   __ Stnt1b(z0.VnB(), p0, SVEMemOperand(x0, x1));
10417   __ Stnt1b(z1.VnB(), p1, SVEMemOperand(x0, 1, SVE_MUL_VL));
10418   __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1));
10419   __ Sel(z3.VnB(), p1, z1.VnB(), z0.VnB());
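  // The memory at x0 + VL now holds z0's bytes, overwritten by z1's bytes
  // wherever p1 is true. z2 reloads that merged result, and `Sel` builds the
  // same pattern in a register: z3 = p1 ? z1 : z0, lane by lane.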
10420 
10421   // Repeated, with wider elements and different offsets.
10422   __ Rdvl(x1, -1);
10423   __ Lsr(x1, x1, 1);
10424   __ Stnt1h(z0.VnH(), p0, SVEMemOperand(x0, x1, LSL, 1));
10425   __ Stnt1h(z1.VnH(), p2, SVEMemOperand(x0, -1, SVE_MUL_VL));
10426   __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10427   __ Sel(z5.VnH(), p2, z1.VnH(), z0.VnH());
10428 
10429   __ Rdvl(x1, 7);
10430   __ Lsr(x1, x1, 2);
10431   __ Stnt1w(z0.VnS(), p0, SVEMemOperand(x0, x1, LSL, 2));
10432   __ Stnt1w(z1.VnS(), p3, SVEMemOperand(x0, 7, SVE_MUL_VL));
10433   __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10434   __ Sel(z7.VnS(), p3, z1.VnS(), z0.VnS());
10435 
10436   __ Rdvl(x1, -8);
10437   __ Lsr(x1, x1, 3);
10438   __ Stnt1d(z0.VnD(), p0, SVEMemOperand(x0, x1, LSL, 3));
10439   __ Stnt1d(z1.VnD(), p4, SVEMemOperand(x0, -8, SVE_MUL_VL));
10440   __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10441   __ Sel(z9.VnD(), p4, z1.VnD(), z0.VnD());
10442   END();
10443 
10444   if (CAN_RUN()) {
10445     RUN();
10446     ASSERT_EQUAL_SVE(z2, z3);
10447     ASSERT_EQUAL_SVE(z4, z5);
10448     ASSERT_EQUAL_SVE(z6, z7);
10449     ASSERT_EQUAL_SVE(z8, z9);
10450   }
  delete[] data;
10451 }
10452 
10453 TEST_SVE(sve_ld1rq) {
10454   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10455   START();
10456 
10457   int data_size = (kQRegSizeInBytes + 128) * 2;
10458   uint8_t* data = new uint8_t[data_size];
10459   for (int i = 0; i < data_size; i++) {
10460     data[i] = i & 0xff;
10461   }
10462 
10463   // Set the base half-way through the buffer so we can use negative indices.
10464   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10465 
10466   __ Index(z0.VnB(), 0, 1);
10467   __ Ptrue(p0.VnB());
10468   __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
10469   __ Pfalse(p1.VnB());
10470   __ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
10471 
10472   // Load and broadcast using scalar offsets.
10473   __ Mov(x1, -42);
10474   __ Ld1rqb(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10475 
10476   __ Add(x2, x0, 1);
10477   __ Mov(x1, -21);
10478   __ Punpklo(p2.VnH(), p1.VnB());
10479   __ Ld1rqh(z1.VnH(), p2.Zeroing(), SVEMemOperand(x2, x1, LSL, 1));
10480 
10481   __ Add(x2, x2, 1);
10482   __ Mov(x1, -10);
10483   __ Punpklo(p3.VnH(), p2.VnB());
10484   __ Ld1rqw(z2.VnS(), p3.Zeroing(), SVEMemOperand(x2, x1, LSL, 2));
10485 
10486   __ Add(x2, x2, 1);
10487   __ Mov(x1, 5);
10488   __ Punpklo(p4.VnH(), p3.VnB());
10489   __ Ld1rqd(z3.VnD(), p4.Zeroing(), SVEMemOperand(x2, x1, LSL, 3));
10490 
10491   // Check that all segments match by rotating the vector by one segment,
10492   // eoring, and orring across the vector.
10493   __ Mov(z4, z0);
10494   __ Ext(z4.VnB(), z4.VnB(), z4.VnB(), 16);
10495   __ Eor(z4.VnB(), z4.VnB(), z0.VnB());
10496   __ Orv(b4, p0, z4.VnB());
10497   __ Mov(z5, z1);
10498   __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
10499   __ Eor(z5.VnB(), z5.VnB(), z1.VnB());
10500   __ Orv(b5, p0, z5.VnB());
10501   __ Orr(z4, z4, z5);
10502   __ Mov(z5, z2);
10503   __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
10504   __ Eor(z5.VnB(), z5.VnB(), z2.VnB());
10505   __ Orv(b5, p0, z5.VnB());
10506   __ Orr(z4, z4, z5);
10507   __ Mov(z5, z3);
10508   __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
10509   __ Eor(z5.VnB(), z5.VnB(), z3.VnB());
10510   __ Orv(b5, p0, z5.VnB());
10511   __ Orr(z4, z4, z5);
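  // For example, with a 32-byte VL each vector holds two 16-byte segments;
  // `Ext` rotates by one segment, `Eor` produces zero only if the segments are
  // identical, and `Orv` reduces that to a single byte which must be zero.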
10512 
10513   // Load and broadcast the same values, using immediate offsets.
10514   __ Add(x1, x0, 6);
10515   __ Ld1rqb(z5.VnB(), p1.Zeroing(), SVEMemOperand(x1, -48));
10516   __ Add(x1, x0, -9);
10517   __ Ld1rqh(z6.VnH(), p2.Zeroing(), SVEMemOperand(x1, -32));
10518   __ Add(x1, x0, -70);
10519   __ Ld1rqw(z7.VnS(), p3.Zeroing(), SVEMemOperand(x1, 32));
10520   __ Add(x1, x0, 27);
10521   __ Ld1rqd(z8.VnD(), p4.Zeroing(), SVEMemOperand(x1, 16));
10522   END();
10523 
10524   if (CAN_RUN()) {
10525     RUN();
10526     uint64_t expected_z0[] = {0x0000000000000000, 0x006c006a00680066};
10527     uint64_t expected_z1[] = {0x000074730000706f, 0x00006c6b00006867};
10528     uint64_t expected_z2[] = {0x0000000075747372, 0x000000006d6c6b6a};
10529     uint64_t expected_z3[] = {0x0000000000000000, 0xc2c1c0bfbebdbcbb};
10530     uint64_t expected_z4[] = {0, 0};
10531     ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
10532     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
10533     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
10534     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
10535     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
10536     ASSERT_EQUAL_SVE(z0, z5);
10537     ASSERT_EQUAL_SVE(z1, z6);
10538     ASSERT_EQUAL_SVE(z2, z7);
10539     ASSERT_EQUAL_SVE(z3, z8);
10540   }
  delete[] data;
10541 }
10542 
10543 TEST_SVE(sve_st1_vec_imm) {
10544   SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
10545   START();
10546 
10547   // TODO: Use mmap() to request a buffer in the low 4GB, which allows testing
10548   // 32-bit address vectors.
10549   int data_size = kZRegMaxSizeInBytes * 16;
10550   uint8_t* data = new uint8_t[data_size];
10551 
10552   // Set the base to 16 bytes from the end of the buffer so we can use negative
10553   // indices.
10554   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size - 16]));
10555   __ Ptrue(p0.VnB());
10556 
10557   // Store a vector of index values in reverse order, using
10558   // vector-plus-immediate addressing to begin at byte 15, then storing to
10559   // bytes 14, 13, etc.
10560   __ Index(z1.VnD(), x0, -1);
10561   __ Index(z2.VnD(), 0, 1);
10562 
10563   // Iterate in order to store at least 16 bytes. The number of iterations
10564   // depends on VL, e.g. VL128 iterates eight times, storing bytes 15 and 14
10565   // on the first iteration, 13 and 12 on the next, etc.
10566   uint64_t dlanes = config->sve_vl_in_bytes() / kDRegSizeInBytes;
10567   for (int i = 15; i >= 0; i -= dlanes * kBRegSizeInBytes) {
10568     __ St1b(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10569     __ Incd(z2.VnD());
10570   }
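  // For example, with VL128 `dlanes` is two, so i steps over 15, 13, ..., 1
  // and each iteration scatters two of the incrementing index values: byte 15
  // receives 0, byte 14 receives 1, and so on down to byte 0 receiving 15.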
10571 
10572   // Reload the stored data, and build a reference for comparison. The reference
10573   // is truncated to a Q register, as only the least-significant 128 bits are
10574   // checked.
10575   __ Ldr(q4, MemOperand(x0));
10576   __ Index(z5.VnB(), 15, -1);
10577   __ Mov(q5, q5);
10578 
10579   // Repeat for wider elements.
10580   __ Index(z1.VnD(), x0, -2);  // Stepping by -2 for H-sized elements.
10581   __ Index(z2.VnD(), 0, 1);
10582   for (int i = 14; i >= 0; i -= dlanes * kHRegSizeInBytes) {
10583     __ St1h(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10584     __ Incd(z2.VnD());
10585   }
10586   __ Ldr(q6, MemOperand(x0));
10587   __ Index(z7.VnH(), 7, -1);
10588   __ Mov(q7, q7);
10589 
10590   __ Index(z1.VnD(), x0, -4);  // Stepping by -4 for S-sized elements.
10591   __ Index(z2.VnD(), 0, 1);
10592   for (int i = 12; i >= 0; i -= dlanes * kSRegSizeInBytes) {
10593     __ St1w(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10594     __ Incd(z2.VnD());
10595   }
10596   __ Ldr(q8, MemOperand(x0));
10597   __ Index(z9.VnS(), 3, -1);
10598   __ Mov(q9, q9);
10599 
10600   __ Index(z1.VnD(), x0, -8);  // Stepping by -8 for D-sized elements.
10601   __ Index(z2.VnD(), 0, 1);
10602   for (int i = 8; i >= 0; i -= dlanes * kDRegSizeInBytes) {
10603     __ St1d(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10604     __ Incd(z2.VnD());
10605   }
10606   __ Ldr(q10, MemOperand(x0));
10607   __ Index(z11.VnD(), 1, -1);
10608   __ Mov(q11, q11);
10609 
10610   // Test predication by storing only the even-numbered halfwords to memory,
10611   // at byte-separated addresses. The result should be the same as storing
10612   // even halfwords contiguously to memory.
10613   __ Pfalse(p1.VnB());
10614   __ Zip1(p1.VnD(), p0.VnD(), p1.VnD());
10615   __ Mov(x0, reinterpret_cast<uintptr_t>(data));
10616   __ Index(z1.VnD(), x0, 1);
10617   __ Index(z2.VnD(), 0x1000, 1);
10618   for (int i = 0; i < 16; i += dlanes) {
10619     __ St1h(z2.VnD(), p1, SVEMemOperand(z1.VnD(), i));
10620     __ Incd(z2.VnD());
10621   }
10622   __ Ldr(q2, MemOperand(x0));
10623   __ Index(z3.VnH(), 0x1000, 2);
10624   __ Mov(q3, q3);
10625 
10626   END();
10627 
10628   if (CAN_RUN()) {
10629     RUN();
10630 
10631     ASSERT_EQUAL_SVE(z3, z2);
10632     ASSERT_EQUAL_SVE(z5, z4);
10633     ASSERT_EQUAL_SVE(z7, z6);
10634     ASSERT_EQUAL_SVE(z9, z8);
10635     ASSERT_EQUAL_SVE(z11, z10);
10636   }
  delete[] data;
10637 }
10638 
10639 template <typename T>
10640 static void sve_st1_scalar_plus_vector_helper(Test* config,
10641                                               int esize_in_bits,
10642                                               T mod,
10643                                               bool is_scaled) {
10644   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10645   START();
10646 
10647   int vl = config->sve_vl_in_bytes();
10648   int data_size = vl * 160;
10649   uint8_t* data = new uint8_t[data_size];
10650   memset(data, 0, data_size);
10651   int vl_per_esize = vl / (esize_in_bits / kBitsPerByte);
10652 
10653   ZRegister zn_b = z0.WithLaneSize(esize_in_bits);
10654   ZRegister zn_h = z1.WithLaneSize(esize_in_bits);
10655   ZRegister zn_s = z2.WithLaneSize(esize_in_bits);
10656   ZRegister zn_d = z3.WithLaneSize(esize_in_bits);
10657 
10658   ZRegister zn_ld_b = z10.WithLaneSize(esize_in_bits);
10659   ZRegister zn_ld_h = z11.WithLaneSize(esize_in_bits);
10660   ZRegister zn_ld_s = z12.WithLaneSize(esize_in_bits);
10661   ZRegister zn_ld_d = z13.WithLaneSize(esize_in_bits);
10662   ZRegister offsets = z31.WithLaneSize(esize_in_bits);
10663 
10664   // Set the base half-way through the buffer so we can use negative indices.
10665   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10666   __ Ptrue(p6.WithLaneSize(esize_in_bits));
10667   __ Pfalse(p7.WithLaneSize(esize_in_bits));
10668   __ Zip1(p0.WithLaneSize(esize_in_bits),
10669           p6.WithLaneSize(esize_in_bits),
10670           p7.WithLaneSize(esize_in_bits));
10671   __ Zip1(p1.WithLaneSize(esize_in_bits),
10672           p7.WithLaneSize(esize_in_bits),
10673           p6.WithLaneSize(esize_in_bits));
10674 
10675   // `st1b` doesn't have the scaled-offset forms.
10676   if (is_scaled == false) {
10677     // Simply stepping the index by 2 to simulate a scatter memory access.
10678     __ Index(offsets, 1, 2);
10679     __ St1b(offsets, p0, SVEMemOperand(x0, offsets, mod));
10680     __ Ld1b(zn_ld_b, p0.Zeroing(), SVEMemOperand(x0, offsets, mod));
10681     __ Dup(zn_b, 0);
10682     __ Mov(zn_b, p0.Merging(), offsets);
10683   }
10684 
10685   // Store the values to an isolated range that doesn't overlap the other stores.
10686   int scale = is_scaled ? 1 : 0;
10687   __ Add(x1, x0, vl_per_esize * 4);
10688   __ Index(offsets, 6, 4);
10689   __ St1h(offsets, p0, SVEMemOperand(x1, offsets, mod, scale));
10690   __ Ld1h(zn_ld_h, p0.Zeroing(), SVEMemOperand(x1, offsets, mod, scale));
10691   __ Dup(zn_h, 0);
10692   __ Mov(zn_h, p0.Merging(), offsets);
10693 
10694   scale = is_scaled ? 2 : 0;
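  // `UINT64_MAX + n + 1` wraps to `n` modulo 2^64, so the next line computes
  // x2 = x0 - (vl_per_esize * 8); the `st1d` block below uses the same trick
  // for x3.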
10695   __ Add(x2, x0, UINT64_MAX + (vl_per_esize * -8) + 1);
10696   __ Index(offsets, 64, 8);
10697   if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10698       (static_cast<int>(mod) == SXTW)) {
10699     // Testing negative offsets.
10700     __ Neg(offsets, p6.Merging(), offsets);
10701   }
10702   __ St1w(offsets, p1, SVEMemOperand(x2, offsets, mod, scale));
10703   __ Ld1w(zn_ld_s, p1.Zeroing(), SVEMemOperand(x2, offsets, mod, scale));
10704   __ Dup(zn_s, 0);
10705   __ Mov(zn_s, p1.Merging(), offsets);
10706 
10707   if (esize_in_bits == kDRegSize) {
10708     // Test st1w by comparing each 32-bit value loaded with the corresponding
10709     // 32-bit value stored.
10710     __ Lsl(zn_s, zn_s, kSRegSize);
10711     __ Lsr(zn_s, zn_s, kSRegSize);
10712   }
10713 
10714   // `st1d` doesn't have the S-sized lane forms.
10715   if (esize_in_bits == kDRegSize) {
10716     scale = is_scaled ? 3 : 0;
10717     __ Add(x3, x0, UINT64_MAX + (vl_per_esize * -16) + 1);
10718     __ Index(offsets, 128, 16);
10719     if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10720         (static_cast<int>(mod) == SXTW)) {
10721       __ Neg(offsets, p6.Merging(), offsets);
10722     }
10723     __ St1d(offsets, p1, SVEMemOperand(x3, offsets, mod, scale));
10724     __ Ld1d(zn_ld_d, p1.Zeroing(), SVEMemOperand(x3, offsets, mod, scale));
10725     __ Dup(zn_d, 0);
10726     __ Mov(zn_d, p1.Merging(), offsets);
10727   }
10728 
10729   END();
10730 
10731   if (CAN_RUN()) {
10732     RUN();
10733 
10734     if (is_scaled == false) {
10735       ASSERT_EQUAL_SVE(zn_ld_b, zn_b);
10736     }
10737 
10738     ASSERT_EQUAL_SVE(zn_ld_h, zn_h);
10739     ASSERT_EQUAL_SVE(zn_ld_s, zn_s);
10740 
10741     if (esize_in_bits == kDRegSize) {
10742       ASSERT_EQUAL_SVE(zn_ld_d, zn_d);
10743     }
10744   }
10745 
10746   delete[] data;
10747 }
10748 
10749 TEST_SVE(sve_st1_sca_vec_32_unpacked_unscaled) {
10750   sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, false);
10751   sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, false);
10752 }
10753 
10754 TEST_SVE(sve_st1_sca_vec_32_unpacked_scaled) {
10755   sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, true);
10756   sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, true);
10757 }
10758 
10759 TEST_SVE(sve_st1_sca_vec_32_unscaled) {
10760   sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, false);
10761   sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, false);
10762 }
10763 
10764 TEST_SVE(sve_st1_sca_vec_32_scaled) {
10765   sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, true);
10766   sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, true);
10767 }
10768 
10769 TEST_SVE(sve_st1_sca_vec_64_scaled) {
10770   sve_st1_scalar_plus_vector_helper(config, kDRegSize, LSL, true);
10771 }
10772 
10773 TEST_SVE(sve_st1_sca_vec_64_unscaled) {
10774   sve_st1_scalar_plus_vector_helper(config, kDRegSize, NO_SHIFT, false);
10775 }
10776 
10777 typedef void (MacroAssembler::*IntWideImmFn)(const ZRegister& zd,
10778                                              const ZRegister& zn,
10779                                              const IntegerOperand imm);
10780 
10781 template <typename F, typename Td, typename Tn>
10782 static void IntWideImmHelper(Test* config,
10783                              F macro,
10784                              unsigned lane_size_in_bits,
10785                              const Tn& zn_inputs,
10786                              IntegerOperand imm,
10787                              const Td& zd_expected) {
10788   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10789   START();
10790 
10791   ZRegister zd1 = z0.WithLaneSize(lane_size_in_bits);
10792   InsrHelper(&masm, zd1, zn_inputs);
10793 
10794   // Also test with a different zn, to test the movprfx case.
10795   ZRegister zn = z1.WithLaneSize(lane_size_in_bits);
10796   InsrHelper(&masm, zn, zn_inputs);
10797   ZRegister zd2 = z2.WithLaneSize(lane_size_in_bits);
10798   ZRegister zn_copy = z3.WithSameLaneSizeAs(zn);
10799 
10800   // Make a copy so we can check that constructive operations preserve zn.
10801   __ Mov(zn_copy, zn);
10802 
10803   {
10804     UseScratchRegisterScope temps(&masm);
10805     // The MacroAssembler needs a P scratch register for some of these macros,
10806     // and it doesn't have one by default.
10807     temps.Include(p3);
10808 
10809     (masm.*macro)(zd1, zd1, imm);
10810     (masm.*macro)(zd2, zn, imm);
10811   }
10812 
10813   END();
10814 
10815   if (CAN_RUN()) {
10816     RUN();
10817 
10818     ASSERT_EQUAL_SVE(zd_expected, zd1);
10819 
10820     // Check that the result of the constructive form (which may use movprfx)
10821     // is the same as that of the destructive form above.
10822     ASSERT_EQUAL_SVE(zd_expected, zd2);
10823 
10824     ASSERT_EQUAL_SVE(zn_copy, zn);
10825   }
10826 }
10827 
10828 TEST_SVE(sve_int_wide_imm_unpredicated_smax) {
10829   int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10830   int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10831   int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10832   int64_t in_d[] = {1, 10, 10000, 1000000};
10833 
10834   IntWideImmFn fn = &MacroAssembler::Smax;
10835 
10836   int exp_b_1[] = {0, -1, 127, -1, 126, 1, -1, 55};
10837   int exp_h_1[] = {127, 127, 127, 127, INT16_MAX, 127, 127, 5555};
10838   int exp_s_1[] = {0, -128, 127, -128, INT32_MAX, 1, -1, 555555};
10839   int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10840 
10841   IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10842   IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10843   IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10844   IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10845 
10846   int exp_h_2[] = {0, -128, 127, -255, INT16_MAX, 1, -1, 5555};
10847   int exp_s_2[] = {2048, 2048, 2048, 2048, INT32_MAX, 2048, 2048, 555555};
10848   int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10849 
10850   // The immediate is in the range [-128, 127], but the macro is able to
10851   // synthesise unencodable immediates.
10852   // B-sized lanes cannot take an immediate out of the range [-128, 127].
10853   IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10854   IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10855   IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10856 }
10857 
10858 TEST_SVE(sve_int_wide_imm_unpredicated_smin) {
10859   int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10860   int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10861   int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10862   int64_t in_d[] = {1, 10, 10000, 1000000};
10863 
10864   IntWideImmFn fn = &MacroAssembler::Smin;
10865 
10866   int exp_b_1[] = {-1, -128, -1, -127, -1, -1, -1, -1};
10867   int exp_h_1[] = {0, -128, 127, INT16_MIN, 127, 1, -1, 127};
10868   int exp_s_1[] = {-128, -128, -128, INT32_MIN, -128, -128, -128, -128};
10869   int64_t exp_d_1[] = {1, 10, 99, 99};
10870 
10871   IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10872   IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10873   IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10874   IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10875 
10876   int exp_h_2[] = {-255, -255, -255, INT16_MIN, -255, -255, -255, -255};
10877   int exp_s_2[] = {0, -128, 127, INT32_MIN, 2048, 1, -1, 2048};
10878   int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10879 
10880   // The immediate is in the range [-128, 127], but the macro is able to
10881   // synthesise unencodable immediates.
10882   // B-sized lanes cannot take an immediate out of the range [-128, 127].
10883   IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10884   IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10885   IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10886 }
10887 
10888 TEST_SVE(sve_int_wide_imm_unpredicated_umax) {
10889   int in_b[] = {0, 255, 127, 0x80, 1, 55};
10890   int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10891   int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10892   int64_t in_d[] = {1, 10, 10000, 1000000};
10893 
10894   IntWideImmFn fn = &MacroAssembler::Umax;
10895 
10896   int exp_b_1[] = {17, 255, 127, 0x80, 17, 55};
10897   int exp_h_1[] = {127, 255, 127, INT16_MAX, 127, 5555};
10898   int exp_s_1[] = {255, 255, 255, INT32_MAX, 255, 555555};
10899   int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10900 
10901   IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10902   IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10903   IntWideImmHelper(config, fn, kSRegSize, in_s, 0xff, exp_s_1);
10904   IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10905 
10906   int exp_h_2[] = {511, 511, 511, INT16_MAX, 511, 5555};
10907   int exp_s_2[] = {2048, 2048, 2048, INT32_MAX, 2048, 555555};
10908   int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10909 
10910   // The immediate is in the range [0, 255], but the macro is able to
10911   // synthesise unencodable immediates.
10912   // B-sized lanes cannot take an immediate out of the range [0, 255].
10913   IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10914   IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10915   IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10916 }
10917 
10918 TEST_SVE(sve_int_wide_imm_unpredicated_umin) {
10919   int in_b[] = {0, 255, 127, 0x80, 1, 55};
10920   int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10921   int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10922   int64_t in_d[] = {1, 10, 10000, 1000000};
10923 
10924   IntWideImmFn fn = &MacroAssembler::Umin;
10925 
10926   int exp_b_1[] = {0, 17, 17, 17, 1, 17};
10927   int exp_h_1[] = {0, 127, 127, 127, 1, 127};
10928   int exp_s_1[] = {0, 255, 127, 255, 1, 255};
10929   int64_t exp_d_1[] = {1, 10, 99, 99};
10930 
10931   IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10932   IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10933   IntWideImmHelper(config, fn, kSRegSize, in_s, 255, exp_s_1);
10934   IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10935 
10936   int exp_h_2[] = {0, 255, 127, 511, 1, 511};
10937   int exp_s_2[] = {0, 255, 127, 2048, 1, 2048};
10938   int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10939 
10940   // The immediate is in the range [0, 255], but the macro is able to
10941   // synthesise unencodable immediates.
10942   // B-sized lanes cannot take an immediate out of the range [0, 255].
10943   IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10944   IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10945   IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10946 }
10947 
TEST_SVE(sve_int_wide_imm_unpredicated_mul)10948 TEST_SVE(sve_int_wide_imm_unpredicated_mul) {
10949   int in_b[] = {11, -1, 7, -3};
10950   int in_h[] = {111, -1, 17, -123};
10951   int in_s[] = {11111, -1, 117, -12345};
10952   int64_t in_d[] = {0x7fffffff, 0x80000000};
10953 
10954   IntWideImmFn fn = &MacroAssembler::Mul;
10955 
10956   int exp_b_1[] = {66, -6, 42, -18};
10957   int exp_h_1[] = {-14208, 128, -2176, 15744};
10958   int exp_s_1[] = {11111 * 127, -127, 117 * 127, -12345 * 127};
10959   int64_t exp_d_1[] = {0xfffffffe, 0x100000000};
10960 
10961   IntWideImmHelper(config, fn, kBRegSize, in_b, 6, exp_b_1);
10962   IntWideImmHelper(config, fn, kHRegSize, in_h, -128, exp_h_1);
10963   IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
10964   IntWideImmHelper(config, fn, kDRegSize, in_d, 2, exp_d_1);
10965 
10966   int exp_h_2[] = {-28305, 255, -4335, 31365};
10967   int exp_s_2[] = {22755328, -2048, 239616, -25282560};
10968   int64_t exp_d_2[] = {0x00000063ffffff38, 0x0000006400000000};
10969 
10970   // The immediate is in the range [-128, 127], but the macro is able to
10971   // synthesise unencodable immediates.
  // B-sized lanes cannot take an immediate out of the range [-128, 127].
  IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 200, exp_d_2);

  // Integer overflow on multiplication.
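  // For example, 11 * 0x7f = 0x575, which truncates to 0x75 in a B-sized
  // lane, and -1 * 0x7f = -127 = 0x81.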
  unsigned exp_b_3[] = {0x75, 0x81, 0x79, 0x83};

  IntWideImmHelper(config, fn, kBRegSize, in_b, 0x7f, exp_b_3);
}

TEST_SVE(sve_int_wide_imm_unpredicated_add) {
  unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
  unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
  unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
  uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};

  IntWideImmFn fn = &MacroAssembler::Add;

  unsigned exp_b_1[] = {0x02, 0x00, 0x91, 0x80};
  unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
  unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
  uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};

  // Encodable with `add` (shift 0).
  IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);

  unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
  unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
  uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};

  // Encodable with `add` (shift 8).
  // B-sized lanes cannot take a shift of 8.
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);

  unsigned exp_s_3[] = {0x80808181, 0x807e7f7f, 0xab29aaaa, 0xf07ff0f0};

  // The macro is able to synthesise unencodable immediates.
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);

  unsigned exp_b_4[] = {0x61, 0x5f, 0xf0, 0xdf};
  unsigned exp_h_4[] = {0x6181, 0x5f7f, 0xf010, 0x8aaa};
  unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
  uint64_t exp_d_4[] = {0x8000000180018180, 0x7fffffff7fff7f7e};

  // Negative immediates use `sub`.
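  // For example, the -0x20 immediate below is materialised as a subtraction
  // of 0x20, so the first B-sized lane becomes 0x81 - 0x20 = 0x61.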
  IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
  IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
  IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
  IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
}

TEST_SVE(sve_int_wide_imm_unpredicated_sqadd) {
  unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
  unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
  unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
  uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};

  IntWideImmFn fn = &MacroAssembler::Sqadd;

  unsigned exp_b_1[] = {0x02, 0x7f, 0x7f, 0x7f};
  unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
  unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
  uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};

  // Encodable with `sqadd` (shift 0).
  // Note that encodable immediates are unsigned, even for signed saturation.
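  // For example, 0x7f (127) + 129 saturates to 0x7f (INT8_MAX), while
  // 0x81 (-127) + 129 = 2 does not saturate.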
  IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);

  unsigned exp_h_2[] = {0x9181, 0x7fff, 0x2010, 0xbaaa};
  unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
  uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};

  // Encodable with `sqadd` (shift 8).
  // B-sized lanes cannot take a shift of 8.
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}

TEST_SVE(sve_int_wide_imm_unpredicated_uqadd) {
  unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
  unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
  unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
  uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};

  IntWideImmFn fn = &MacroAssembler::Uqadd;

  unsigned exp_b_1[] = {0xff, 0xff, 0x91, 0xff};
  unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
  unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
  uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};

  // Encodable with `uqadd` (shift 0).
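  // For example, 0x81 + 0x81 = 0x102, which saturates to 0xff (UINT8_MAX),
  // while 0x10 + 0x81 = 0x91 is representable and does not saturate.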
  IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);

  unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
  unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
  uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};

  // Encodable with `uqadd` (shift 8).
  // B-sized lanes cannot take a shift of 8.
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}

TEST_SVE(sve_int_wide_imm_unpredicated_sub) {
  unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
  unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
  unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
  uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};

  IntWideImmFn fn = &MacroAssembler::Sub;

  unsigned exp_b_1[] = {0x00, 0xfe, 0x8f, 0x7e};
  unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
  unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
  uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};

  // Encodable with `sub` (shift 0).
  IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);

  unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
  unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
  uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};

  // Encodable with `sub` (shift 8).
  // B-sized lanes cannot take a shift of 8.
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);

  unsigned exp_s_3[] = {0x7f828181, 0x7f807f7f, 0xaa2baaaa, 0xef81f0f0};

  // The macro is able to synthesise unencodable immediates.
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);

  unsigned exp_b_4[] = {0xa1, 0x9f, 0x30, 0x1f};
  unsigned exp_h_4[] = {0xa181, 0x9f7f, 0x3010, 0xcaaa};
  unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
  uint64_t exp_d_4[] = {0x8000000180018182, 0x7fffffff7fff7f80};

  // Negative immediates use `add`.
  IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
  IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
  IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
  IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
}

TEST_SVE(sve_int_wide_imm_unpredicated_sqsub) {
  unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
  unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
  unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
  uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};

  IntWideImmFn fn = &MacroAssembler::Sqsub;

  unsigned exp_b_1[] = {0x80, 0xfe, 0x8f, 0x80};
  unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
  unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
  uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};

  // Encodable with `sqsub` (shift 0).
  // Note that encodable immediates are unsigned, even for signed saturation.
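  // For example, 0x81 (-127) - 129 saturates to 0x80 (INT8_MIN), while
  // 0x7f (127) - 129 = -2 (0xfe) does not saturate.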
  IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);

  unsigned exp_h_2[] = {0x8000, 0x6f7f, 0x0010, 0x9aaa};
  unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
  uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};

  // Encodable with `sqsub` (shift 8).
  // B-sized lanes cannot take a shift of 8.
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}

TEST_SVE(sve_int_wide_imm_unpredicated_uqsub) {
  unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
  unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
  unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
  uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};

  IntWideImmFn fn = &MacroAssembler::Uqsub;

  unsigned exp_b_1[] = {0x00, 0x00, 0x00, 0x7e};
  unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
  unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
  uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};

  // Encodable with `uqsub` (shift 0).
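  // For example, 0x10 - 0x81 saturates to 0x00 (unsigned underflow), while
  // 0xff - 0x81 = 0x7e does not saturate.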
  IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);

  unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
  unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
  uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};

  // Encodable with `uqsub` (shift 8).
  // B-sized lanes cannot take a shift of 8.
  IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
  IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
  IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}

TEST_SVE(sve_int_wide_imm_unpredicated_subr) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // Encodable with `subr` (shift 0).
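  // With the immediate on the left, `Sub` computes zd[i] = imm - zn[i]. For
  // example, z0 below becomes 100 - (1, 2, 3, ...) = (99, 98, 97, ...).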
  __ Index(z0.VnD(), 1, 1);
  __ Sub(z0.VnD(), 100, z0.VnD());
  __ Index(z1.VnS(), 0x7f, 1);
  __ Sub(z1.VnS(), 0xf7, z1.VnS());
  __ Index(z2.VnH(), 0xaaaa, 0x2222);
  __ Sub(z2.VnH(), 0x80, z2.VnH());
  __ Index(z3.VnB(), 133, 1);
  __ Sub(z3.VnB(), 255, z3.VnB());

  // Encodable with `subr` (shift 8).
  __ Index(z4.VnD(), 256, -1);
  __ Sub(z4.VnD(), 42 * 256, z4.VnD());
  __ Index(z5.VnS(), 0x7878, 1);
  __ Sub(z5.VnS(), 0x8000, z5.VnS());
  __ Index(z6.VnH(), 0x30f0, -1);
  __ Sub(z6.VnH(), 0x7f00, z6.VnH());
  // B-sized lanes cannot take a shift of 8.

  // Select with movprfx.
  __ Index(z31.VnD(), 256, 4001);
  __ Sub(z7.VnD(), 42 * 256, z31.VnD());

  // Out of immediate encodable range of `sub`.
  __ Index(z30.VnS(), 0x11223344, 1);
  __ Sub(z8.VnS(), 0x88776655, z30.VnS());

  END();

  if (CAN_RUN()) {
    RUN();

    int expected_z0[] = {87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
    ASSERT_EQUAL_SVE(expected_z0, z0.VnD());

    int expected_z1[] = {0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78};
    ASSERT_EQUAL_SVE(expected_z1, z1.VnS());

    int expected_z2[] = {0xab2c, 0xcd4e, 0xef70, 0x1192, 0x33b4, 0x55d6};
    ASSERT_EQUAL_SVE(expected_z2, z2.VnH());

    int expected_z3[] = {0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a};
    ASSERT_EQUAL_SVE(expected_z3, z3.VnB());

    int expected_z4[] = {10502, 10501, 10500, 10499, 10498, 10497, 10496};
    ASSERT_EQUAL_SVE(expected_z4, z4.VnD());

    int expected_z5[] = {0x0783, 0x0784, 0x0785, 0x0786, 0x0787, 0x0788};
    ASSERT_EQUAL_SVE(expected_z5, z5.VnS());

    int expected_z6[] = {0x4e15, 0x4e14, 0x4e13, 0x4e12, 0x4e11, 0x4e10};
    ASSERT_EQUAL_SVE(expected_z6, z6.VnH());

    int expected_z7[] = {-13510, -9509, -5508, -1507, 2494, 6495, 10496};
    ASSERT_EQUAL_SVE(expected_z7, z7.VnD());

    int expected_z8[] = {0x7755330e, 0x7755330f, 0x77553310, 0x77553311};
    ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
  }
}

TEST_SVE(sve_int_wide_imm_unpredicated_fdup) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // Immediates which can be encoded in the instructions.
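  // These use the standard AArch64 8-bit FP immediate form, covering values
  // of the form (-1)^s * (1 + m/16) * 2^e with m in [0, 15] and e in [-3, 4];
  // for example, 3.875 = (1 + 15/16) * 2 and -4.75 = -(1 + 3/16) * 4.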
  __ Fdup(z0.VnH(), RawbitsToFloat16(0xc500));
  __ Fdup(z1.VnS(), Float16(2.0));
  __ Fdup(z2.VnD(), Float16(3.875));
  __ Fdup(z3.VnH(), 8.0f);
  __ Fdup(z4.VnS(), -4.75f);
  __ Fdup(z5.VnD(), 0.5f);
  __ Fdup(z6.VnH(), 1.0);
  __ Fdup(z7.VnS(), 2.125);
  __ Fdup(z8.VnD(), -13.0);

  // Immediates which cannot be encoded in the instructions.
  __ Fdup(z10.VnH(), Float16(0.0));
  __ Fdup(z11.VnH(), kFP16PositiveInfinity);
  __ Fdup(z12.VnS(), 255.0f);
  __ Fdup(z13.VnS(), kFP32NegativeInfinity);
  __ Fdup(z14.VnD(), 12.3456);
  __ Fdup(z15.VnD(), kFP64PositiveInfinity);

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(0xc500, z0.VnH());
    ASSERT_EQUAL_SVE(0x40000000, z1.VnS());
    ASSERT_EQUAL_SVE(0x400f000000000000, z2.VnD());
    ASSERT_EQUAL_SVE(0x4800, z3.VnH());
    ASSERT_EQUAL_SVE(FloatToRawbits(-4.75f), z4.VnS());
    ASSERT_EQUAL_SVE(DoubleToRawbits(0.5), z5.VnD());
    ASSERT_EQUAL_SVE(0x3c00, z6.VnH());
    ASSERT_EQUAL_SVE(FloatToRawbits(2.125f), z7.VnS());
    ASSERT_EQUAL_SVE(DoubleToRawbits(-13.0), z8.VnD());

    ASSERT_EQUAL_SVE(0x0000, z10.VnH());
    ASSERT_EQUAL_SVE(Float16ToRawbits(kFP16PositiveInfinity), z11.VnH());
    ASSERT_EQUAL_SVE(FloatToRawbits(255.0), z12.VnS());
    ASSERT_EQUAL_SVE(FloatToRawbits(kFP32NegativeInfinity), z13.VnS());
    ASSERT_EQUAL_SVE(DoubleToRawbits(12.3456), z14.VnD());
    ASSERT_EQUAL_SVE(DoubleToRawbits(kFP64PositiveInfinity), z15.VnD());
  }
}

TEST_SVE(sve_andv_eorv_orv) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  uint64_t in[] = {0x8899aabbccddeeff, 0x7777555533331111, 0x123456789abcdef0};
  InsrHelper(&masm, z31.VnD(), in);

  // For simplicity, we re-use the same pg for various lane sizes.
  // For D lanes:         1,                      1,                      0
  // For S lanes:         1,          1,          1,          0,          0
  // For H lanes:   0,    1,    0,    1,    1,    1,    0,    0,    1,    0
  int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
  Initialise(&masm, p0.VnB(), pg_in);

  // Make a copy so we can check that constructive operations preserve zn.
  __ Mov(z0, z31);
  __ Andv(b0, p0, z0.VnB());  // destructive
  __ Andv(h1, p0, z31.VnH());
  __ Mov(z2, z31);
  __ Andv(s2, p0, z2.VnS());  // destructive
  __ Andv(d3, p0, z31.VnD());

  __ Eorv(b4, p0, z31.VnB());
  __ Mov(z5, z31);
  __ Eorv(h5, p0, z5.VnH());  // destructive
  __ Eorv(s6, p0, z31.VnS());
  __ Mov(z7, z31);
  __ Eorv(d7, p0, z7.VnD());  // destructive

  __ Mov(z8, z31);
  __ Orv(b8, p0, z8.VnB());  // destructive
  __ Orv(h9, p0, z31.VnH());
  __ Mov(z10, z31);
  __ Orv(s10, p0, z10.VnS());  // destructive
  __ Orv(d11, p0, z31.VnD());

  END();

  if (CAN_RUN()) {
    RUN();

    if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
      ASSERT_EQUAL_64(0x10, d0);
      ASSERT_EQUAL_64(0x1010, d1);
      ASSERT_EQUAL_64(0x33331111, d2);
      ASSERT_EQUAL_64(0x7777555533331111, d3);
      ASSERT_EQUAL_64(0xbf, d4);
      ASSERT_EQUAL_64(0xedcb, d5);
      ASSERT_EQUAL_64(0x44444444, d6);
      ASSERT_EQUAL_64(0x7777555533331111, d7);
      ASSERT_EQUAL_64(0xff, d8);
      ASSERT_EQUAL_64(0xffff, d9);
      ASSERT_EQUAL_64(0x77775555, d10);
      ASSERT_EQUAL_64(0x7777555533331111, d11);
    } else {
      ASSERT_EQUAL_64(0, d0);
      ASSERT_EQUAL_64(0x0010, d1);
      ASSERT_EQUAL_64(0x00110011, d2);
      ASSERT_EQUAL_64(0x0011001100110011, d3);
      ASSERT_EQUAL_64(0x62, d4);
      ASSERT_EQUAL_64(0x0334, d5);
      ASSERT_EQUAL_64(0x8899aabb, d6);
      ASSERT_EQUAL_64(0xffeeffeeffeeffee, d7);
      ASSERT_EQUAL_64(0xff, d8);
      ASSERT_EQUAL_64(0xffff, d9);
      ASSERT_EQUAL_64(0xffffffff, d10);
      ASSERT_EQUAL_64(0xffffffffffffffff, d11);
    }

    // Check that the lanes above the top of the V register are all clear.
    for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
      ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z8.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z9.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z10.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z11.VnD(), i);
    }
  }
}


TEST_SVE(sve_saddv_uaddv) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  uint64_t in[] = {0x8899aabbccddeeff, 0x8182838485868788, 0x0807060504030201};
  InsrHelper(&masm, z31.VnD(), in);

  // For simplicity, we re-use the same pg for various lane sizes.
  // For D lanes:         1,                      1,                      0
  // For S lanes:         1,          1,          1,          0,          0
  // For H lanes:   0,    1,    0,    1,    1,    1,    0,    0,    1,    0
  int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
  Initialise(&masm, p0.VnB(), pg_in);

  // Make a copy so we can check that constructive operations preserve zn.
  __ Mov(z0, z31);
  __ Saddv(b0, p0, z0.VnB());  // destructive
  __ Saddv(h1, p0, z31.VnH());
  __ Mov(z2, z31);
  __ Saddv(s2, p0, z2.VnS());  // destructive

  __ Uaddv(b4, p0, z31.VnB());
  __ Mov(z5, z31);
  __ Uaddv(h5, p0, z5.VnH());  // destructive
  __ Uaddv(s6, p0, z31.VnS());
  __ Mov(z7, z31);
  __ Uaddv(d7, p0, z7.VnD());  // destructive

  END();

  if (CAN_RUN()) {
    RUN();

    if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
      // Saddv
      ASSERT_EQUAL_64(0xfffffffffffffda9, d0);
      ASSERT_EQUAL_64(0xfffffffffffe9495, d1);
      ASSERT_EQUAL_64(0xffffffff07090b0c, d2);
      // Uaddv
      ASSERT_EQUAL_64(0x00000000000002a9, d4);
      ASSERT_EQUAL_64(0x0000000000019495, d5);
      ASSERT_EQUAL_64(0x0000000107090b0c, d6);
      ASSERT_EQUAL_64(0x8182838485868788, d7);
    } else {
      // Saddv
      ASSERT_EQUAL_64(0xfffffffffffffd62, d0);
      ASSERT_EQUAL_64(0xfffffffffffe8394, d1);
      ASSERT_EQUAL_64(0xfffffffed3e6fa0b, d2);
      // Uaddv
      ASSERT_EQUAL_64(0x0000000000000562, d4);
      ASSERT_EQUAL_64(0x0000000000028394, d5);
      ASSERT_EQUAL_64(0x00000001d3e6fa0b, d6);
      ASSERT_EQUAL_64(0x0a1c2e4052647687, d7);
    }

    // Check that the lanes above the top of the V register are all clear.
    for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
      ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
    }
  }
}


TEST_SVE(sve_sminv_uminv) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
  InsrHelper(&masm, z31.VnD(), in);

  // For simplicity, we re-use the same pg for various lane sizes.
  // For D lanes:         1,                      0,                      1
  // For S lanes:         1,          1,          0,          0,          1
  // For H lanes:   1,    1,    0,    1,    1,    0,    0,    0,    1,    1
  int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
  Initialise(&masm, p0.VnB(), pg_in);

  // Make a copy so we can check that constructive operations preserve zn.
  __ Mov(z0, z31);
  __ Sminv(b0, p0, z0.VnB());  // destructive
  __ Sminv(h1, p0, z31.VnH());
  __ Mov(z2, z31);
  __ Sminv(s2, p0, z2.VnS());  // destructive
  __ Sminv(d3, p0, z31.VnD());

  __ Uminv(b4, p0, z31.VnB());
  __ Mov(z5, z31);
  __ Uminv(h5, p0, z5.VnH());  // destructive
  __ Uminv(s6, p0, z31.VnS());
  __ Mov(z7, z31);
  __ Uminv(d7, p0, z7.VnD());  // destructive

  END();

  if (CAN_RUN()) {
    RUN();

    if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
      // Sminv
      ASSERT_EQUAL_64(0xaa, d0);
      ASSERT_EQUAL_64(0xaabb, d1);
      ASSERT_EQUAL_64(0xaabbfc00, d2);
      ASSERT_EQUAL_64(0x00112233aabbfc00, d3);  // The smaller lane is inactive.
      // Uminv
      ASSERT_EQUAL_64(0, d4);
      ASSERT_EQUAL_64(0x2233, d5);
      ASSERT_EQUAL_64(0x112233, d6);
      ASSERT_EQUAL_64(0x00112233aabbfc00, d7);  // The smaller lane is inactive.
    } else {
      // Sminv
      ASSERT_EQUAL_64(0xaa, d0);
      ASSERT_EQUAL_64(0xaaaa, d1);
      ASSERT_EQUAL_64(0xaaaaaaaa, d2);
      ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d3);
      // Uminv
      ASSERT_EQUAL_64(0, d4);
      ASSERT_EQUAL_64(0x2233, d5);
      ASSERT_EQUAL_64(0x112233, d6);
      ASSERT_EQUAL_64(0x00112233aabbfc00, d7);
    }

    // Check that the lanes above the top of the V register are all clear.
    for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
      ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
    }
  }
}

TEST_SVE(sve_smaxv_umaxv) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
  InsrHelper(&masm, z31.VnD(), in);

  // For simplicity, we re-use the same pg for various lane sizes.
  // For D lanes:         1,                      0,                      1
  // For S lanes:         1,          1,          0,          0,          1
  // For H lanes:   1,    1,    0,    1,    1,    0,    0,    0,    1,    1
  int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
  Initialise(&masm, p0.VnB(), pg_in);

  // Make a copy so we can check that constructive operations preserve zn.
  __ Mov(z0, z31);
  __ Smaxv(b0, p0, z0.VnB());  // destructive
  __ Smaxv(h1, p0, z31.VnH());
  __ Mov(z2, z31);
  __ Smaxv(s2, p0, z2.VnS());  // destructive
  __ Smaxv(d3, p0, z31.VnD());

  __ Umaxv(b4, p0, z31.VnB());
  __ Mov(z5, z31);
  __ Umaxv(h5, p0, z5.VnH());  // destructive
  __ Umaxv(s6, p0, z31.VnS());
  __ Mov(z7, z31);
  __ Umaxv(d7, p0, z7.VnD());  // destructive

  END();

  if (CAN_RUN()) {
    RUN();

    if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
      // Smaxv
      ASSERT_EQUAL_64(0x33, d0);
      ASSERT_EQUAL_64(0x44aa, d1);
      ASSERT_EQUAL_64(0x112233, d2);
      ASSERT_EQUAL_64(0x112233aabbfc00, d3);
      // Umaxv
      ASSERT_EQUAL_64(0xfe, d4);
      ASSERT_EQUAL_64(0xfc00, d5);
      ASSERT_EQUAL_64(0xaabbfc00, d6);
      ASSERT_EQUAL_64(0x112233aabbfc00, d7);
    } else {
      // Smaxv
      ASSERT_EQUAL_64(0x33, d0);
      ASSERT_EQUAL_64(0x44aa, d1);
      ASSERT_EQUAL_64(0x112233, d2);
      ASSERT_EQUAL_64(0x00112233aabbfc00, d3);
      // Umaxv
      ASSERT_EQUAL_64(0xfe, d4);
      ASSERT_EQUAL_64(0xfc00, d5);
      ASSERT_EQUAL_64(0xaabbfc00, d6);
      ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d7);
    }

    // Check that the lanes above the top of the V register are all clear.
    for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
      ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
      ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
    }
  }
}

template <typename T, size_t M, size_t N>
static void SdotUdotHelper(Test* config,
                           unsigned lane_size_in_bits,
                           const T (&zd_inputs)[M],
                           const T (&za_inputs)[M],
                           const T (&zn_inputs)[N],
                           const T (&zm_inputs)[N],
                           const T (&zd_expected)[M],
                           const T (&zdnm_expected)[M],
                           bool is_signed,
                           int index = -1) {
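  // Each dot-product destination lane accumulates four products from
  // quarter-width source lanes:
  //   zd[i] = za[i] + (zn[4i] * zm[4i]) + ... + (zn[4i + 3] * zm[4i + 3])
  // which is why the zn/zm arrays need four times as many elements.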
  VIXL_STATIC_ASSERT(N == (M * 4));
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  auto dot_fn = [&](const ZRegister& zd,
                    const ZRegister& za,
                    const ZRegister& zn,
                    const ZRegister& zm,
                    bool is_signed_fn,
                    int index_fn) {
    if (is_signed_fn) {
      if (index_fn < 0) {
        __ Sdot(zd, za, zn, zm);
      } else {
        __ Sdot(zd, za, zn, zm, index_fn);
      }
    } else {
      if (index_fn < 0) {
        __ Udot(zd, za, zn, zm);
      } else {
        __ Udot(zd, za, zn, zm, index_fn);
      }
    }
  };

  ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
  ZRegister za = z1.WithLaneSize(lane_size_in_bits);
  ZRegister zn = z2.WithLaneSize(lane_size_in_bits / 4);
  ZRegister zm = z3.WithLaneSize(lane_size_in_bits / 4);

  InsrHelper(&masm, zd, zd_inputs);
  InsrHelper(&masm, za, za_inputs);
  InsrHelper(&masm, zn, zn_inputs);
  InsrHelper(&masm, zm, zm_inputs);

  // The Dot macro handles arbitrarily-aliased registers in the argument list.
  ZRegister dm_result = z4.WithLaneSize(lane_size_in_bits);
  ZRegister dnm_result = z5.WithLaneSize(lane_size_in_bits);
  ZRegister da_result = z6.WithLaneSize(lane_size_in_bits);
  ZRegister dn_result = z7.WithLaneSize(lane_size_in_bits);
  ZRegister d_result = z8.WithLaneSize(lane_size_in_bits);

  __ Mov(da_result, za);
  // zda = zda + (zn . zm)
  dot_fn(da_result, da_result, zn, zm, is_signed, index);

  __ Mov(dn_result, zn.WithSameLaneSizeAs(dn_result));
  // zdn = za + (zdn . zm)
  dot_fn(dn_result, za, dn_result.WithSameLaneSizeAs(zn), zm, is_signed, index);

  __ Mov(dm_result, zm.WithSameLaneSizeAs(dm_result));
  // zdm = za + (zn . zdm)
  dot_fn(dm_result, za, zn, dm_result.WithSameLaneSizeAs(zm), is_signed, index);

  __ Mov(d_result, zd);
  // zd = za + (zn . zm)
  dot_fn(d_result, za, zn, zm, is_signed, index);

  __ Mov(dnm_result, zn.WithSameLaneSizeAs(dnm_result));
  // zdnm = za + (zdnm . zdnm)
  dot_fn(dnm_result,
         za,
         dnm_result.WithSameLaneSizeAs(zn),
         dnm_result.WithSameLaneSizeAs(zm),
         is_signed,
         index);

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
    ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits / 4));
    ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits / 4));

    ASSERT_EQUAL_SVE(zd_expected, da_result);
    ASSERT_EQUAL_SVE(zd_expected, dn_result);
    ASSERT_EQUAL_SVE(zd_expected, dm_result);
    ASSERT_EQUAL_SVE(zd_expected, d_result);

    ASSERT_EQUAL_SVE(zdnm_expected, dnm_result);
  }
}

TEST_SVE(sve_sdot) {
  int64_t zd_inputs[] = {0x33, 0xee, 0xff};
  int64_t za_inputs[] = {INT32_MAX, -3, 2};
  int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
  int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};

  // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
  int64_t zd_expected_s[] = {-2147418113, -183, 133};  // 0x8000ffff
  int64_t zd_expected_d[] = {2147549183, -183, 133};   // 0x8000ffff
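  // For the first lane, INT32_MAX + 4 * (-128 * -128) = 0x8000ffff, which
  // wraps to a negative value in an S-sized lane but stays positive in a
  // D-sized lane.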

  // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
  int64_t zdnm_expected_s[] = {-2147418113, 980, 572};
  int64_t zdnm_expected_d[] = {2147549183, 980, 572};

  SdotUdotHelper(config,
                 kSRegSize,
                 zd_inputs,
                 za_inputs,
                 zn_inputs,
                 zm_inputs,
                 zd_expected_s,
                 zdnm_expected_s,
                 true);

  SdotUdotHelper(config,
                 kDRegSize,
                 zd_inputs,
                 za_inputs,
                 zn_inputs,
                 zm_inputs,
                 zd_expected_d,
                 zdnm_expected_d,
                 true);
}

TEST_SVE(sve_udot) {
  int64_t zd_inputs[] = {0x33, 0xee, 0xff};
  int64_t za_inputs[] = {INT32_MAX, -3, 2};
  int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
  int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};

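  // In the unsigned case, negative input lanes are reinterpreted as large
  // unsigned values; for example, -128 and -1 in B-sized lanes become
  // 0x80 (128) and 0xff (255).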
  // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
  int64_t zd_expected_s[] = {0x8000ffff, 0x00001749, 0x0000f085};
  int64_t zd_expected_d[] = {0x000000047c00ffff,
                             0x000000000017ff49,
                             0x00000000fff00085};

  // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
  int64_t zdnm_expected_s[] = {0x8000ffff, 0x000101d4, 0x0001d03c};
  int64_t zdnm_expected_d[] = {0x000000047c00ffff,
                               0x00000000fffe03d4,
                               0x00000001ffce023c};

  SdotUdotHelper(config,
                 kSRegSize,
                 zd_inputs,
                 za_inputs,
                 zn_inputs,
                 zm_inputs,
                 zd_expected_s,
                 zdnm_expected_s,
                 false);

  SdotUdotHelper(config,
                 kDRegSize,
                 zd_inputs,
                 za_inputs,
                 zn_inputs,
                 zm_inputs,
                 zd_expected_d,
                 zdnm_expected_d,
                 false);
}

TEST_SVE(sve_sdot_indexed_s) {
  int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
  int64_t za_inputs[] = {0, 1, 2, 3};
  int64_t zn_inputs[] =
      {-1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4};
  int64_t zm_inputs[] =
      {127, 127, 127, 127, -128, -128, -128, -128, -1, -1, -1, -1, 0, 0, 0, 0};

  constexpr int s = kQRegSize / kSRegSize;

  // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
  int64_t zd_expected_s[][s] = {{0, 1, 2, 3},  // Generated from zm[0]
                                {4, 9, 14, 19},
                                {512, 1025, 1538, 2051},
                                {-508, -1015, -1522, -2029}};

  // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
  int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
                                  {12, 25, 38, 51},
                                  {8, 17, 26, 35},
                                  {4, 9, 14, 19}};

  for (unsigned i = 0; i < s; i++) {
    SdotUdotHelper(config,
                   kSRegSize,
                   zd_inputs,
                   za_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_s[i],
                   zdnm_expected_s[i],
                   true,
                   i);
  }
}

TEST_SVE(sve_sdot_indexed_d) {
  int64_t zd_inputs[] = {0xff, 0xff};
  int64_t za_inputs[] = {0, 1};
  int64_t zn_inputs[] = {-1, -1, -1, -1, -1, -1, -1, -1};
  int64_t zm_inputs[] = {-128, -128, -128, -128, 127, 127, 127, 127};

  constexpr int d = kQRegSize / kDRegSize;

  // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
  int64_t zd_expected_d[][d] = {{-508, -507},  // Generated from zm[0]
                                {512, 513}};

  // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
  int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};

  for (unsigned i = 0; i < d; i++) {
    SdotUdotHelper(config,
                   kDRegSize,
                   zd_inputs,
                   za_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_d[i],
                   zdnm_expected_d[i],
                   true,
                   i);
  }
}

TEST_SVE(sve_udot_indexed_s) {
  int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
  int64_t za_inputs[] = {0, 1, 2, 3};
  int64_t zn_inputs[] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4};
  int64_t zm_inputs[] =
      {127, 127, 127, 127, 255, 255, 255, 255, 1, 1, 1, 1, 0, 0, 0, 0};

  constexpr int s = kQRegSize / kSRegSize;

  // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
  int64_t zd_expected_s[][s] = {{0, 1, 2, 3},
                                {4, 9, 14, 19},
                                {1020, 2041, 3062, 4083},
                                {508, 1017, 1526, 2035}};

  // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
  int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
                                  {12, 25, 38, 51},
                                  {8, 17, 26, 35},
                                  {4, 9, 14, 19}};

  for (unsigned i = 0; i < s; i++) {
    SdotUdotHelper(config,
                   kSRegSize,
                   zd_inputs,
                   za_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_s[i],
                   zdnm_expected_s[i],
                   false,
                   i);
  }
}

TEST_SVE(sve_udot_indexed_d) {
  int64_t zd_inputs[] = {0xff, 0xff};
  int64_t za_inputs[] = {0, 1};
  int64_t zn_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1};
  int64_t zm_inputs[] = {255, 255, 255, 255, 127, 127, 127, 127};

  constexpr int d = kQRegSize / kDRegSize;

  // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
  int64_t zd_expected_d[][d] = {{508, 509}, {1020, 1021}};

  // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
  int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};

  for (unsigned i = 0; i < d; i++) {
    SdotUdotHelper(config,
                   kDRegSize,
                   zd_inputs,
                   za_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_d[i],
                   zdnm_expected_d[i],
                   false,
                   i);
  }
}

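// Add each lane's 128-bit segment index to that lane, so that a pattern
// replicated across segments (e.g. by `Dup`) becomes distinguishable between
// segments: dst[i] = src[i] + (i / lanes_per_128_bit_segment).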
static void IntSegmentPatternHelper(MacroAssembler* masm,
                                    const ZRegister& dst,
                                    const ZRegister& src) {
  VIXL_ASSERT(AreSameLaneSize(dst, src));
  UseScratchRegisterScope temps(masm);
  ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
  masm->Index(ztmp, 0, 1);
  masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
  masm->Add(dst, src, ztmp);
}

TEST_SVE(sve_sdot_udot_indexed_s) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  const int multiplier = 2;
  __ Dup(z9.VnS(), multiplier);

  __ Ptrue(p0.VnB());
  __ Index(z29.VnS(), 4, 1);

  // z29 = [... 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]
  __ And(z29.VnS(), z29.VnS(), 3);

  // p7 = [... 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]
  __ Cmple(p7.VnS(), p0.Zeroing(), z29.VnS(), 0);

  // p6 = [... 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
  __ Cmple(p6.VnS(), p0.Zeroing(), z29.VnS(), 1);

  // p5 = [... 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]
  __ Cmple(p5.VnS(), p0.Zeroing(), z29.VnS(), 2);

  __ Index(z28.VnB(), 1, 1);
  __ Dup(z27.VnS(), z28.VnS(), 0);

  // z27 = [... 3, 2, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1]
  IntSegmentPatternHelper(&masm, z27.VnB(), z27.VnB());

  // z27 = [... 6, 4, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2]
  __ Mul(z27.VnS(), p7.Merging(), z27.VnS(), z9.VnS());

  // z27 = [... 12, 8, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4]
  __ Mul(z27.VnS(), p6.Merging(), z27.VnS(), z9.VnS());

  //     2nd segment |                                        1st segment |
  //                 v                                                    v
  // z27 = [... 24, 16, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4, 32, 24, 16, 8]
  __ Mul(z27.VnS(), p5.Merging(), z27.VnS(), z9.VnS());

  __ Dup(z0.VnS(), 0);
  __ Dup(z1.VnS(), 0);
  __ Dup(z2.VnS(), 0);
  __ Dup(z3.VnS(), 0);
  __ Dup(z4.VnS(), 0);
  __ Dup(z5.VnS(), 0);

  // Skip the lanes from the 129th onwards, since their values overflow when
  // the number sequence is created by `index`.
  __ Cmpls(p3.VnB(), p0.Zeroing(), z28.VnB(), 128);
  __ Mov(z0.VnB(), p3.Merging(), z27.VnB());
  __ Mov(z1.VnB(), p3.Merging(), z28.VnB());

  __ Dup(z2.VnS(), 0);
  __ Dup(z3.VnS(), 0);
  __ Dup(z4.VnS(), 0);
  __ Dup(z5.VnS(), 0);

  __ Udot(z2.VnS(), z2.VnS(), z1.VnB(), z0.VnB(), 0);

  __ Udot(z3.VnS(), z3.VnS(), z1.VnB(), z0.VnB(), 1);
  __ Mul(z3.VnS(), z3.VnS(), 2);

  __ Udot(z4.VnS(), z4.VnS(), z1.VnB(), z0.VnB(), 2);
  __ Mul(z4.VnS(), z4.VnS(), 4);

  __ Udot(z5.VnS(), z5.VnS(), z1.VnB(), z0.VnB(), 3);
  __ Mul(z5.VnS(), z5.VnS(), 8);

  __ Dup(z7.VnS(), 0);
  __ Dup(z8.VnS(), 0);
  __ Dup(z9.VnS(), 0);
  __ Dup(z10.VnS(), 0);

11993   __ Neg(z6.VnB(), p0.Merging(), z0.VnB());
11994   __ Sdot(z7.VnS(), z7.VnS(), z1.VnB(), z6.VnB(), 0);
11995 
11996   __ Sdot(z8.VnS(), z8.VnS(), z1.VnB(), z6.VnB(), 1);
11997   __ Mul(z8.VnS(), z8.VnS(), 2);
11998 
11999   __ Sdot(z9.VnS(), z9.VnS(), z1.VnB(), z6.VnB(), 2);
12000   __ Mul(z9.VnS(), z9.VnS(), 4);
12001 
12002   __ Sdot(z10.VnS(), z10.VnS(), z1.VnB(), z6.VnB(), 3);
12003   __ Mul(z10.VnS(), z10.VnS(), 8);
12004 
12005   END();
12006 
12007   if (CAN_RUN()) {
12008     RUN();
12009 
    // Compare only the first 128-bit segment of the destination register
    // directly; the results of the other generated instructions are used to
    // check the remaining segments.
    // s_lane[0] = (1 * 8) + (2 * 16) + (3 * 24) + (4 * 32) = 240
    // ...
    // s_lane[3] = (13 * 8) + (14 * 16) + (15 * 24) + (16 * 32) = 1200
    int udot_expected[] = {1200, 880, 560, 240};
    ASSERT_EQUAL_SVE(udot_expected, z2.VnS());
    ASSERT_EQUAL_SVE(z2.VnS(), z3.VnS());
    ASSERT_EQUAL_SVE(z2.VnS(), z4.VnS());
    ASSERT_EQUAL_SVE(z2.VnS(), z5.VnS());

    int sdot_expected[] = {-1200, -880, -560, -240};
    ASSERT_EQUAL_SVE(sdot_expected, z7.VnS());
    ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
    ASSERT_EQUAL_SVE(z7.VnS(), z9.VnS());
    ASSERT_EQUAL_SVE(z7.VnS(), z10.VnS());
  }
}

TEST_SVE(sve_sdot_udot_indexed_d) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  const int multiplier = 2;
  __ Dup(z9.VnD(), multiplier);

  __ Ptrue(p0.VnD());
  __ Pfalse(p1.VnD());

  // p2 = [..., 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
  __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());

  __ Index(z1.VnH(), 1, 1);
  __ Dup(z0.VnD(), z1.VnD(), 0);

  // z0 = [... 5, 4, 3, 2, 5, 4, 3, 2, 4, 3, 2, 1, 4, 3, 2, 1]
  IntSegmentPatternHelper(&masm, z0.VnH(), z0.VnH());

  //                     2nd segment |           1st segment |
  //                                 v                       v
  // z0 = [... 5, 4, 3, 2, 10, 8, 6, 4, 4, 3, 2, 1, 8, 6, 4, 2]
  __ Mul(z0.VnD(), p2.Merging(), z0.VnD(), z9.VnD());

  __ Dup(z3.VnD(), 0);
  __ Dup(z4.VnD(), 0);

  __ Udot(z3.VnD(), z3.VnD(), z1.VnH(), z0.VnH(), 0);

  __ Udot(z4.VnD(), z4.VnD(), z1.VnH(), z0.VnH(), 1);
  __ Mul(z4.VnD(), z4.VnD(), multiplier);

  __ Dup(z12.VnD(), 0);
  __ Dup(z13.VnD(), 0);

  __ Ptrue(p4.VnH());
  __ Neg(z10.VnH(), p4.Merging(), z0.VnH());

  __ Sdot(z12.VnD(), z12.VnD(), z1.VnH(), z10.VnH(), 0);

  __ Sdot(z13.VnD(), z13.VnD(), z1.VnH(), z10.VnH(), 1);
  __ Mul(z13.VnD(), z13.VnD(), multiplier);

  END();

  if (CAN_RUN()) {
    RUN();

    // Compare only the first 128-bit segment of the destination register
    // directly; the results of the other generated instructions are used to
    // check the remaining segments.
    // d_lane[0] = (1 * 2) + (2 * 4) + (3 * 6) + (4 * 8) = 60
    // d_lane[1] = (5 * 2) + (6 * 4) + (7 * 6) + (8 * 8) = 140
    uint64_t udot_expected[] = {416, 304, 140, 60};
    ASSERT_EQUAL_SVE(udot_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z3.VnD(), z4.VnD());

    int64_t sdot_expected[] = {-416, -304, -140, -60};
    ASSERT_EQUAL_SVE(sdot_expected, z12.VnD());
    ASSERT_EQUAL_SVE(z12.VnD(), z13.VnD());
  }
}

template <typename T, size_t N>
static void FPToRawbitsWithSize(const T (&inputs)[N],
                                uint64_t* outputs,
                                unsigned size_in_bits) {
  for (size_t i = 0; i < N; i++) {
    outputs[i] = vixl::FPToRawbitsWithSize(size_in_bits, inputs[i]);
  }
}

template <typename Ti, typename Te, size_t N>
static void FPBinArithHelper(Test* config,
                             ArithFn macro,
                             int lane_size_in_bits,
                             const Ti (&zn_inputs)[N],
                             const Ti (&zm_inputs)[N],
                             const Te (&zd_expected)[N]) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);

  START();

  ZRegister zd = z29.WithLaneSize(lane_size_in_bits);
  ZRegister zn = z30.WithLaneSize(lane_size_in_bits);
  ZRegister zm = z31.WithLaneSize(lane_size_in_bits);

  uint64_t zn_rawbits[N];
  uint64_t zm_rawbits[N];

  FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
  FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);

  InsrHelper(&masm, zn, zn_rawbits);
  InsrHelper(&masm, zm, zm_rawbits);

  (masm.*macro)(zd, zn, zm);

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(zd_expected, zd);
  }
}

TEST_SVE(sve_fp_arithmetic_unpredicated_fadd) {
  double zn_inputs[] = {24.0,
                        5.5,
                        0.0,
                        3.875,
                        2.125,
                        kFP64PositiveInfinity,
                        kFP64NegativeInfinity};

  double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};

  ArithFn fn = &MacroAssembler::Fadd;

  uint16_t expected_h[] = {Float16ToRawbits(Float16(1048.0)),
                           Float16ToRawbits(Float16(2053.5)),
                           Float16ToRawbits(Float16(0.1)),
                           Float16ToRawbits(Float16(-0.875)),
                           Float16ToRawbits(Float16(14.465)),
                           Float16ToRawbits(kFP16PositiveInfinity),
                           Float16ToRawbits(kFP16NegativeInfinity)};

  FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);

  uint32_t expected_s[] = {FloatToRawbits(1048.0f),
                           FloatToRawbits(2053.5f),
                           FloatToRawbits(0.1f),
                           FloatToRawbits(-0.875f),
                           FloatToRawbits(14.465f),
                           FloatToRawbits(kFP32PositiveInfinity),
                           FloatToRawbits(kFP32NegativeInfinity)};

  FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);

  uint64_t expected_d[] = {DoubleToRawbits(1048.0),
                           DoubleToRawbits(2053.5),
                           DoubleToRawbits(0.1),
                           DoubleToRawbits(-0.875),
                           DoubleToRawbits(14.465),
                           DoubleToRawbits(kFP64PositiveInfinity),
                           DoubleToRawbits(kFP64NegativeInfinity)};

  FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
}

TEST_SVE(sve_fp_arithmetic_unpredicated_fsub) {
  double zn_inputs[] = {24.0,
                        5.5,
                        0.0,
                        3.875,
                        2.125,
                        kFP64PositiveInfinity,
                        kFP64NegativeInfinity};

  double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};

  ArithFn fn = &MacroAssembler::Fsub;

  uint16_t expected_h[] = {Float16ToRawbits(Float16(-1000.0)),
                           Float16ToRawbits(Float16(-2042.5)),
                           Float16ToRawbits(Float16(-0.1)),
                           Float16ToRawbits(Float16(8.625)),
                           Float16ToRawbits(Float16(-10.215)),
                           Float16ToRawbits(kFP16PositiveInfinity),
                           Float16ToRawbits(kFP16NegativeInfinity)};

  FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);

  uint32_t expected_s[] = {FloatToRawbits(-1000.0),
                           FloatToRawbits(-2042.5),
                           FloatToRawbits(-0.1),
                           FloatToRawbits(8.625),
                           FloatToRawbits(-10.215),
                           FloatToRawbits(kFP32PositiveInfinity),
                           FloatToRawbits(kFP32NegativeInfinity)};

  FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);

  uint64_t expected_d[] = {DoubleToRawbits(-1000.0),
                           DoubleToRawbits(-2042.5),
                           DoubleToRawbits(-0.1),
                           DoubleToRawbits(8.625),
                           DoubleToRawbits(-10.215),
                           DoubleToRawbits(kFP64PositiveInfinity),
                           DoubleToRawbits(kFP64NegativeInfinity)};

  FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
}

TEST_SVE(sve_fp_arithmetic_unpredicated_fmul) {
  double zn_inputs[] = {24.0,
                        5.5,
                        0.0,
                        3.875,
                        2.125,
                        kFP64PositiveInfinity,
                        kFP64NegativeInfinity};

  double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};

  ArithFn fn = &MacroAssembler::Fmul;

  uint16_t expected_h[] = {Float16ToRawbits(Float16(24576.0)),
                           Float16ToRawbits(Float16(11264.0)),
                           Float16ToRawbits(Float16(0.0)),
                           Float16ToRawbits(Float16(-18.4)),
                           Float16ToRawbits(Float16(26.23)),
                           Float16ToRawbits(kFP16PositiveInfinity),
                           Float16ToRawbits(kFP16PositiveInfinity)};

  FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);

  uint32_t expected_s[] = {FloatToRawbits(24576.0),
                           FloatToRawbits(11264.0),
                           FloatToRawbits(0.0),
                           FloatToRawbits(-18.40625),
                           FloatToRawbits(26.2225),
                           FloatToRawbits(kFP32PositiveInfinity),
                           FloatToRawbits(kFP32PositiveInfinity)};

  FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);

  uint64_t expected_d[] = {DoubleToRawbits(24576.0),
                           DoubleToRawbits(11264.0),
                           DoubleToRawbits(0.0),
                           DoubleToRawbits(-18.40625),
                           DoubleToRawbits(26.2225),
                           DoubleToRawbits(kFP64PositiveInfinity),
                           DoubleToRawbits(kFP64PositiveInfinity)};

  FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
}

typedef void (MacroAssembler::*FPArithPredicatedFn)(
    const ZRegister& zd,
    const PRegisterM& pg,
    const ZRegister& zn,
    const ZRegister& zm,
    FPMacroNaNPropagationOption nan_option);

typedef void (MacroAssembler::*FPArithPredicatedNoNaNOptFn)(
    const ZRegister& zd,
    const PRegisterM& pg,
    const ZRegister& zn,
    const ZRegister& zm);

12280 template <typename Ti, typename Te, size_t N>
FPBinArithHelper(Test * config,FPArithPredicatedFn macro,FPArithPredicatedNoNaNOptFn macro_nonan,unsigned lane_size_in_bits,const Ti (& zd_inputs)[N],const int (& pg_inputs)[N],const Ti (& zn_inputs)[N],const Ti (& zm_inputs)[N],const Te (& zd_expected)[N],FPMacroNaNPropagationOption nan_option=FastNaNPropagation)12281 static void FPBinArithHelper(
12282     Test* config,
12283     FPArithPredicatedFn macro,
12284     FPArithPredicatedNoNaNOptFn macro_nonan,
12285     unsigned lane_size_in_bits,
12286     const Ti (&zd_inputs)[N],
12287     const int (&pg_inputs)[N],
12288     const Ti (&zn_inputs)[N],
12289     const Ti (&zm_inputs)[N],
12290     const Te (&zd_expected)[N],
12291     FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
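  // Exactly one of `macro` and `macro_nonan` must be provided, as the
  // assertion below enforces.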
  VIXL_ASSERT((macro == NULL) ^ (macro_nonan == NULL));
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // Avoid choosing default scratch registers.
  ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
  ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
  ZRegister zm = z28.WithLaneSize(lane_size_in_bits);

  uint64_t zn_inputs_rawbits[N];
  uint64_t zm_inputs_rawbits[N];
  uint64_t zd_inputs_rawbits[N];

  FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
  FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
  FPToRawbitsWithSize(zd_inputs, zd_inputs_rawbits, lane_size_in_bits);

  InsrHelper(&masm, zn, zn_inputs_rawbits);
  InsrHelper(&masm, zm, zm_inputs_rawbits);
  InsrHelper(&masm, zd, zd_inputs_rawbits);

  PRegisterWithLaneSize pg = p0.WithLaneSize(lane_size_in_bits);
  Initialise(&masm, pg, pg_inputs);

  // `instr` zdn, pg, zdn, zm
  ZRegister dn_result = z0.WithLaneSize(lane_size_in_bits);
  __ Mov(dn_result, zn);
  if (macro_nonan == NULL) {
    (masm.*macro)(dn_result, pg.Merging(), dn_result, zm, nan_option);
  } else {
    (masm.*macro_nonan)(dn_result, pg.Merging(), dn_result, zm);
  }

  // When the zd and zm registers are aliased, the `Instr` macro swaps the
  // operand order if the operation is commutative; otherwise, it falls back
  // to the reversed form of the instruction, such as fdivr.
  // `instr` zdm, pg, zn, zdm
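  // For example, `Fdiv(zd, pg, zn, zd)` cannot be encoded directly because
  // the destination aliases zm, so the macro is expected to emit something
  // like `fdivr zd, pg/m, zd, zn`, which computes zn / zd in each active lane.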
  ZRegister dm_result = z1.WithLaneSize(lane_size_in_bits);
  __ Mov(dm_result, zm);
  if (macro_nonan == NULL) {
    (masm.*macro)(dm_result, pg.Merging(), zn, dm_result, nan_option);
  } else {
    (masm.*macro_nonan)(dm_result, pg.Merging(), zn, dm_result);
  }

  // The `Instr` macro automatically selects between `instr` and
  // movprfx + `instr`, depending on whether the zd and zn registers are
  // aliased. A generated movprfx instruction is predicated, using the same
  // governing predicate register. To make the result predictable, initialise
  // the destination register first.
  // `instr` zd, pg, zn, zm
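  // As a sketch of the expansion (using fdiv as an example, and assuming all
  // three registers are distinct), the macro can emit:
  //   movprfx zd.T, pg/m, zn.T
  //   fdiv zd.T, pg/m, zd.T, zm.T
  // Inactive lanes keep zd's previous contents, which is why the destination
  // is given known initial values below.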
  ZRegister d_result = z2.WithLaneSize(lane_size_in_bits);
  __ Mov(d_result, zd);
  if (macro_nonan == NULL) {
    (masm.*macro)(d_result, pg.Merging(), zn, zm, nan_option);
  } else {
    (masm.*macro_nonan)(d_result, pg.Merging(), zn, zm);
  }

  END();

  if (CAN_RUN()) {
    RUN();

    for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
      int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
      if (!core.HasSVELane(dn_result, lane)) break;
      if ((pg_inputs[i] & 1) != 0) {
        ASSERT_EQUAL_SVE_LANE(zd_expected[i], dn_result, lane);
      } else {
        ASSERT_EQUAL_SVE_LANE(zn_inputs_rawbits[i], dn_result, lane);
      }
    }

    for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
      int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
      if (!core.HasSVELane(dm_result, lane)) break;
      if ((pg_inputs[i] & 1) != 0) {
        ASSERT_EQUAL_SVE_LANE(zd_expected[i], dm_result, lane);
      } else {
        ASSERT_EQUAL_SVE_LANE(zm_inputs_rawbits[i], dm_result, lane);
      }
    }

    ASSERT_EQUAL_SVE(zd_expected, d_result);
  }
}

TEST_SVE(sve_binary_arithmetic_predicated_fdiv) {
  // The inputs are shared with different precision tests.
  double zd_in[] = {0.1, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9};

  double zn_in[] = {24.0,
                    24.0,
                    -2.0,
                    -2.0,
                    5.5,
                    5.5,
                    kFP64PositiveInfinity,
                    kFP64PositiveInfinity,
                    kFP64NegativeInfinity,
                    kFP64NegativeInfinity};

  double zm_in[] = {-2.0, -2.0, 24.0, 24.0, 0.5, 0.5, 0.65, 0.65, 24.0, 24.0};

  int pg_in[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};

  uint16_t exp_h[] = {Float16ToRawbits(Float16(0.1)),
                      Float16ToRawbits(Float16(-12.0)),
                      Float16ToRawbits(Float16(2.2)),
                      Float16ToRawbits(Float16(-0.0833)),
                      Float16ToRawbits(Float16(4.4)),
                      Float16ToRawbits(Float16(11.0)),
                      Float16ToRawbits(Float16(6.6)),
                      Float16ToRawbits(kFP16PositiveInfinity),
                      Float16ToRawbits(Float16(8.8)),
                      Float16ToRawbits(kFP16NegativeInfinity)};

  FPBinArithHelper(config,
                   NULL,
                   &MacroAssembler::Fdiv,
                   kHRegSize,
                   zd_in,
                   pg_in,
                   zn_in,
                   zm_in,
                   exp_h);

  uint32_t exp_s[] = {FloatToRawbits(0.1),
                      FloatToRawbits(-12.0),
                      FloatToRawbits(2.2),
                      0xbdaaaaab,
                      FloatToRawbits(4.4),
                      FloatToRawbits(11.0),
                      FloatToRawbits(6.6),
                      FloatToRawbits(kFP32PositiveInfinity),
                      FloatToRawbits(8.8),
                      FloatToRawbits(kFP32NegativeInfinity)};

  FPBinArithHelper(config,
                   NULL,
                   &MacroAssembler::Fdiv,
                   kSRegSize,
                   zd_in,
                   pg_in,
                   zn_in,
                   zm_in,
                   exp_s);

  uint64_t exp_d[] = {DoubleToRawbits(0.1),
                      DoubleToRawbits(-12.0),
                      DoubleToRawbits(2.2),
                      0xbfb5555555555555,
                      DoubleToRawbits(4.4),
                      DoubleToRawbits(11.0),
                      DoubleToRawbits(6.6),
                      DoubleToRawbits(kFP64PositiveInfinity),
                      DoubleToRawbits(8.8),
                      DoubleToRawbits(kFP64NegativeInfinity)};

  FPBinArithHelper(config,
                   NULL,
                   &MacroAssembler::Fdiv,
                   kDRegSize,
                   zd_in,
                   pg_in,
                   zn_in,
                   zm_in,
                   exp_d);
}

TEST_SVE(sve_select) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  uint64_t in0[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
  uint64_t in1[] = {0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa};

  // For simplicity, we re-use the same pg for various lane sizes.
  // For D lanes:         1,                      1,                      0
  // For S lanes:         1,          1,          1,          0,          0
  // For H lanes:   0,    1,    0,    1,    1,    1,    0,    0,    1,    0
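  // For lanes wider than a byte, only the predicate bit corresponding to the
  // lane's lowest-addressed byte is significant; the other bits in the lane's
  // field are ignored.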
  int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
  Initialise(&masm, p0.VnB(), pg_in);
  PRegisterM pg = p0.Merging();

  InsrHelper(&masm, z30.VnD(), in0);
  InsrHelper(&masm, z31.VnD(), in1);

  __ Sel(z0.VnB(), pg, z30.VnB(), z31.VnB());
  __ Sel(z1.VnH(), pg, z30.VnH(), z31.VnH());
  __ Sel(z2.VnS(), pg, z30.VnS(), z31.VnS());
  __ Sel(z3.VnD(), pg, z30.VnD(), z31.VnD());

  END();

  if (CAN_RUN()) {
    RUN();

    uint64_t expected_z0[] = {0xaaaaaaaa05aa07f8,
                              0xfeaaaaf0aac3870f,
                              0xaaaa56aa9abcdeaa};
    ASSERT_EQUAL_SVE(expected_z0, z0.VnD());

    uint64_t expected_z1[] = {0xaaaaaaaaaaaa07f8,
                              0xaaaaf8f0e1c3870f,
                              0xaaaaaaaa9abcaaaa};
    ASSERT_EQUAL_SVE(expected_z1, z1.VnD());

    uint64_t expected_z2[] = {0xaaaaaaaa05f607f8,
                              0xfefcf8f0e1c3870f,
                              0xaaaaaaaaaaaaaaaa};
    ASSERT_EQUAL_SVE(expected_z2, z2.VnD());

    uint64_t expected_z3[] = {0x01f203f405f607f8,
                              0xfefcf8f0e1c3870f,
                              0xaaaaaaaaaaaaaaaa};
    ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
  }
}

TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_h) {
  double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
  double zn_inputs[] = {-2.1,
                        8.5,
                        225.5,
                        0.0,
                        8.8,
                        -4.75,
                        kFP64PositiveInfinity,
                        kFP64NegativeInfinity};
  double zm_inputs[] = {-2.0,
                        -13.0,
                        24.0,
                        0.01,
                        0.5,
                        300.75,
                        kFP64NegativeInfinity,
                        kFP64PositiveInfinity};
  int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};

  uint16_t zd_expected_max[] = {Float16ToRawbits(Float16(-2.0)),
                                Float16ToRawbits(Float16(8.5)),
                                Float16ToRawbits(Float16(3.3)),
                                Float16ToRawbits(Float16(0.01)),
                                Float16ToRawbits(Float16(5.5)),
                                Float16ToRawbits(Float16(300.75)),
                                Float16ToRawbits(kFP16PositiveInfinity),
                                Float16ToRawbits(kFP16PositiveInfinity)};
  FPBinArithHelper(config,
                   &MacroAssembler::Fmax,
                   NULL,
                   kHRegSize,
                   zd_inputs,
                   pg_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_max);

  uint16_t zd_expected_min[] = {Float16ToRawbits(Float16(-2.1)),
                                Float16ToRawbits(Float16(-13.0)),
                                Float16ToRawbits(Float16(3.3)),
                                Float16ToRawbits(Float16(0.0)),
                                Float16ToRawbits(Float16(5.5)),
                                Float16ToRawbits(Float16(-4.75)),
                                Float16ToRawbits(kFP16NegativeInfinity),
                                Float16ToRawbits(kFP16NegativeInfinity)};
  FPBinArithHelper(config,
                   &MacroAssembler::Fmin,
                   NULL,
                   kHRegSize,
                   zd_inputs,
                   pg_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_min);
}

TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_s) {
  double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
  double zn_inputs[] = {-2.1,
                        8.5,
                        225.5,
                        0.0,
                        8.8,
                        -4.75,
                        kFP64PositiveInfinity,
                        kFP64NegativeInfinity};
  double zm_inputs[] = {-2.0,
                        -13.0,
                        24.0,
                        0.01,
                        0.5,
                        300.75,
                        kFP64NegativeInfinity,
                        kFP64PositiveInfinity};
  int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};

  uint32_t zd_expected_max[] = {FloatToRawbits(-2.0),
                                FloatToRawbits(8.5),
                                FloatToRawbits(3.3),
                                FloatToRawbits(0.01),
                                FloatToRawbits(5.5),
                                FloatToRawbits(300.75),
                                FloatToRawbits(kFP32PositiveInfinity),
                                FloatToRawbits(kFP32PositiveInfinity)};
  FPBinArithHelper(config,
                   &MacroAssembler::Fmax,
                   NULL,
                   kSRegSize,
                   zd_inputs,
                   pg_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_max);

  uint32_t zd_expected_min[] = {FloatToRawbits(-2.1),
                                FloatToRawbits(-13.0),
                                FloatToRawbits(3.3),
                                FloatToRawbits(0.0),
                                FloatToRawbits(5.5),
                                FloatToRawbits(-4.75),
                                FloatToRawbits(kFP32NegativeInfinity),
                                FloatToRawbits(kFP32NegativeInfinity)};
  FPBinArithHelper(config,
                   &MacroAssembler::Fmin,
                   NULL,
                   kSRegSize,
                   zd_inputs,
                   pg_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_min);
}

TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_d) {
  double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
  double zn_inputs[] = {-2.1,
                        8.5,
                        225.5,
                        0.0,
                        8.8,
                        -4.75,
                        kFP64PositiveInfinity,
                        kFP64NegativeInfinity};
  double zm_inputs[] = {-2.0,
                        -13.0,
                        24.0,
                        0.01,
                        0.5,
                        300.75,
                        kFP64NegativeInfinity,
                        kFP64PositiveInfinity};
  int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};

  uint64_t zd_expected_max[] = {DoubleToRawbits(-2.0),
                                DoubleToRawbits(8.5),
                                DoubleToRawbits(3.3),
                                DoubleToRawbits(0.01),
                                DoubleToRawbits(5.5),
                                DoubleToRawbits(300.75),
                                DoubleToRawbits(kFP64PositiveInfinity),
                                DoubleToRawbits(kFP64PositiveInfinity)};
  FPBinArithHelper(config,
                   &MacroAssembler::Fmax,
                   NULL,
                   kDRegSize,
                   zd_inputs,
                   pg_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_max);

  uint64_t zd_expected_min[] = {DoubleToRawbits(-2.1),
                                DoubleToRawbits(-13.0),
                                DoubleToRawbits(3.3),
                                DoubleToRawbits(0.0),
                                DoubleToRawbits(5.5),
                                DoubleToRawbits(-4.75),
                                DoubleToRawbits(kFP64NegativeInfinity),
                                DoubleToRawbits(kFP64NegativeInfinity)};
  FPBinArithHelper(config,
                   &MacroAssembler::Fmin,
                   NULL,
                   kDRegSize,
                   zd_inputs,
                   pg_inputs,
                   zn_inputs,
                   zm_inputs,
                   zd_expected_min);
}

template <typename T, size_t N>
static void BitwiseShiftImmHelper(Test* config,
                                  int lane_size_in_bits,
                                  const T (&zn_inputs)[N],
                                  int shift) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  ZRegister zd_asr = z25.WithLaneSize(lane_size_in_bits);
  ZRegister zd_lsr = z26.WithLaneSize(lane_size_in_bits);
  ZRegister zd_lsl = z27.WithLaneSize(lane_size_in_bits);
  ZRegister zn = z28.WithLaneSize(lane_size_in_bits);

  InsrHelper(&masm, zn, zn_inputs);

  __ Asr(zd_asr, zn, shift);
  __ Lsr(zd_lsr, zn, shift);
  __ Lsl(zd_lsl, zn, shift - 1);  // Lsl supports shifts of 0 to lane_size - 1.

  END();

  if (CAN_RUN()) {
    RUN();

    const uint64_t mask = GetUintMask(lane_size_in_bits);
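    // Reconstruct the expected Asr result lane by lane, sign-extending the
    // shifted value when the input lane is negative.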
    for (int i = 0; i < static_cast<int>(N); i++) {
      int lane = N - i - 1;
      if (!core.HasSVELane(zd_asr, lane)) break;
      bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
      uint64_t result;
      if (shift >= lane_size_in_bits) {
        result = is_negative ? mask : 0;
      } else {
        result = zn_inputs[i] >> shift;
        if (is_negative) {
          result |= mask << (lane_size_in_bits - shift);
          result &= mask;
        }
      }
      ASSERT_EQUAL_SVE_LANE(result, zd_asr, lane);
    }

    for (int i = 0; i < static_cast<int>(N); i++) {
      int lane = N - i - 1;
      if (!core.HasSVELane(zd_lsr, lane)) break;
      uint64_t result =
          (shift >= lane_size_in_bits) ? 0 : zn_inputs[i] >> shift;
      ASSERT_EQUAL_SVE_LANE(result, zd_lsr, lane);
    }

    for (int i = 0; i < static_cast<int>(N); i++) {
      int lane = N - i - 1;
      if (!core.HasSVELane(zd_lsl, lane)) break;
      uint64_t result =
          (shift > lane_size_in_bits) ? 0 : zn_inputs[i] << (shift - 1);
      ASSERT_EQUAL_SVE_LANE(result & mask, zd_lsl, lane);
    }
  }
}

TEST_SVE(sve_bitwise_shift_imm_unpredicated) {
  uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
  int shift_b[] = {1, 3, 5, 8};
  for (size_t i = 0; i < ArrayLength(shift_b); i++) {
    BitwiseShiftImmHelper(config, kBRegSize, inputs_b, shift_b[i]);
  }

  uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233};
  int shift_h[] = {1, 8, 11, 16};
  for (size_t i = 0; i < ArrayLength(shift_h); i++) {
    BitwiseShiftImmHelper(config, kHRegSize, inputs_h, shift_h[i]);
  }

  uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233};
  int shift_s[] = {1, 9, 17, 32};
  for (size_t i = 0; i < ArrayLength(shift_s); i++) {
    BitwiseShiftImmHelper(config, kSRegSize, inputs_s, shift_s[i]);
  }

  uint64_t inputs_d[] = {0xfedcba98fedcba98,
                         0xfffa5555aaaaaaaa,
                         0x0011223344aafe80};
  int shift_d[] = {1, 23, 45, 64};
  for (size_t i = 0; i < ArrayLength(shift_d); i++) {
    BitwiseShiftImmHelper(config, kDRegSize, inputs_d, shift_d[i]);
  }
}

template <typename T, typename R, size_t N>
static void BitwiseShiftWideElementsHelper(Test* config,
                                           Shift shift_type,
                                           int lane_size_in_bits,
                                           const T (&zn_inputs)[N],
                                           const R& zm_inputs,
                                           const T (&zd_expected)[N]) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  ArithFn macro;
  // A logical shift left or right by the lane width produces zero, so
  // initialise the expected-result array to zero for convenience.
  uint64_t zd_expected_max_shift_amount[N] = {0};
  switch (shift_type) {
    case ASR: {
      macro = &MacroAssembler::Asr;
      uint64_t mask = GetUintMask(lane_size_in_bits);
      for (size_t i = 0; i < ArrayLength(zn_inputs); i++) {
        bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
        zd_expected_max_shift_amount[i] = is_negative ? mask : 0;
      }
      break;
    }
    case LSR:
      macro = &MacroAssembler::Lsr;
      break;
    case LSL:
      macro = &MacroAssembler::Lsl;
      break;
    default:
      VIXL_UNIMPLEMENTED();
      macro = NULL;
      break;
  }

  ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
  ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
  ZRegister zm = z28.WithLaneSize(kDRegSize);

  InsrHelper(&masm, zn, zn_inputs);
  InsrHelper(&masm, zm, zm_inputs);

  (masm.*macro)(zd, zn, zm);

  ZRegister zm_max_shift_amount = z25.WithLaneSize(kDRegSize);
  ZRegister zd_max_shift_amount = z24.WithLaneSize(lane_size_in_bits);

  __ Dup(zm_max_shift_amount, lane_size_in_bits);
  (masm.*macro)(zd_max_shift_amount, zn, zm_max_shift_amount);

  ZRegister zm_out_of_range = z23.WithLaneSize(kDRegSize);
  ZRegister zd_out_of_range = z22.WithLaneSize(lane_size_in_bits);

  __ Dup(zm_out_of_range, GetUintMask(lane_size_in_bits));
  (masm.*macro)(zd_out_of_range, zn, zm_out_of_range);
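  // Shift amounts larger than the lane width are expected to behave like a
  // shift by exactly the lane width, so zd_out_of_range should match
  // zd_max_shift_amount, as asserted below.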

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(zd_expected, zd);
    ASSERT_EQUAL_SVE(zd_expected_max_shift_amount, zd_max_shift_amount);
    ASSERT_EQUAL_SVE(zd_max_shift_amount, zd_out_of_range);
  }
}

TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_asr) {
  // clang-format off
  uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
                         0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
  int shift_b[] = {1, 3};
  uint64_t expected_b[] = {0xff, 0xee, 0xdd, 0xcc, 0xff, 0x2a, 0xd5, 0xc0,
                           0xff, 0xfb, 0xf7, 0xf3, 0xff, 0x0a, 0xf5, 0xf0};
  BitwiseShiftWideElementsHelper(config,
                                 ASR,
                                 kBRegSize,
                                 inputs_b,
                                 shift_b,
                                 expected_b);

  uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
                         0xfedc, 0xfa55, 0x0011, 0x2233,
                         0xfedc, 0xfa55, 0x0011, 0x2233};
  int shift_h[] = {1, 8, 11};
  uint64_t expected_h[] = {0xff6e, 0xfd2a, 0x0008, 0x1119,
                           0xfffe, 0xfffa, 0x0000, 0x0022,
                           0xffff, 0xffff, 0x0000, 0x0004};
  BitwiseShiftWideElementsHelper(config,
                                 ASR,
                                 kHRegSize,
                                 inputs_h,
                                 shift_h,
                                 expected_h);

  uint64_t inputs_s[] =
      {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
  int shift_s[] = {1, 9, 23};
  uint64_t expected_s[] =
      {0xff6e5d4c, 0xfffd2ad5, 0x00000891, 0x000091a2, 0xffffff55, 0xffffff11};
  BitwiseShiftWideElementsHelper(config,
                                 ASR,
                                 kSRegSize,
                                 inputs_s,
                                 shift_s,
                                 expected_s);
  // clang-format on
}

TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsr) {
  // clang-format off
  uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
                         0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
  int shift_b[] = {1, 3};
  uint64_t expected_b[] = {0x7f, 0x6e, 0x5d, 0x4c, 0x7f, 0x2a, 0x55, 0x40,
                           0x1f, 0x1b, 0x17, 0x13, 0x1f, 0x0a, 0x15, 0x10};

  BitwiseShiftWideElementsHelper(config,
                                 LSR,
                                 kBRegSize,
                                 inputs_b,
                                 shift_b,
                                 expected_b);

  uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
                         0xfedc, 0xfa55, 0x0011, 0x2233,
                         0xfedc, 0xfa55, 0x0011, 0x2233};
  int shift_h[] = {1, 8, 11};
  uint64_t expected_h[] = {0x7f6e, 0x7d2a, 0x0008, 0x1119,
                           0x00fe, 0x00fa, 0x0000, 0x0022,
                           0x001f, 0x001f, 0x0000, 0x0004};
  BitwiseShiftWideElementsHelper(config,
                                 LSR,
                                 kHRegSize,
                                 inputs_h,
                                 shift_h,
                                 expected_h);

  uint64_t inputs_s[] =
      {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
  int shift_s[] = {1, 9, 23};
  uint64_t expected_s[] =
      {0x7f6e5d4c, 0x7ffd2ad5, 0x00000891, 0x000091a2, 0x00000155, 0x00000111};
  BitwiseShiftWideElementsHelper(config,
                                 LSR,
                                 kSRegSize,
                                 inputs_s,
                                 shift_s,
                                 expected_s);
  // clang-format on
}

TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsl) {
  // clang-format off
  uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
                         0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
  int shift_b[] = {1, 5};

  uint64_t expected_b[] = {0xfc, 0xb8, 0x74, 0x30, 0xfe, 0xaa, 0x54, 0x00,
                           0xc0, 0x80, 0x40, 0x00, 0xe0, 0xa0, 0x40, 0x00};

  BitwiseShiftWideElementsHelper(config,
                                 LSL,
                                 kBRegSize,
                                 inputs_b,
                                 shift_b,
                                 expected_b);
  uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
                         0xfedc, 0xfa55, 0x0011, 0x2233,
                         0xfedc, 0xfa55, 0x0011, 0x2233};
  int shift_h[] = {1, 2, 14};

  uint64_t expected_h[] = {0xfdb8, 0xf4aa, 0x0022, 0x4466,
                           0xfb70, 0xe954, 0x0044, 0x88cc,
                           0x0000, 0x4000, 0x4000, 0xc000};
  BitwiseShiftWideElementsHelper(config,
                                 LSL,
                                 kHRegSize,
                                 inputs_h,
                                 shift_h,
                                 expected_h);
  uint64_t inputs_s[] =
      {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
  int shift_s[] = {1, 19, 26};
  uint64_t expected_s[] =
      {0xfdb97530, 0xfff4ab54, 0x11980000, 0x2b380000, 0xa8000000, 0x20000000};
  BitwiseShiftWideElementsHelper(config,
                                 LSL,
                                 kSRegSize,
                                 inputs_s,
                                 shift_s,
                                 expected_s);

  // Test large shifts outside the range of the "unsigned" type.
  uint64_t inputs_b2[] = {1, 2, 4, 8, 3, 5, 7, 9,
                          1, 2, 4, 8, 3, 5, 7, 9};
  uint64_t shift_b2[] = {1, 0x1000000001};
  uint64_t expected_b2[] = {2, 4, 8, 16, 6, 10, 14, 18,
                            0, 0, 0, 0, 0, 0, 0, 0};
  BitwiseShiftWideElementsHelper(config, LSL, kBRegSize, inputs_b2, shift_b2,
                                 expected_b2);
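  // The wide shift amount 0x1000000001 exceeds any lane width, so the B lanes
  // governed by that .D element are expected to be cleared to zero.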

  // clang-format on
}

TEST_SVE(sve_shift_by_vector) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);

  START();
  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
  __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
  __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
  __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());

  __ Dup(z31.VnD(), 0x8000000080008080);
  __ Dup(z0.VnB(), -1);

  __ Index(z1.VnB(), 0, 1);
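  // z1.b holds {0, 1, 2, ...}, so each lane is shifted by its own index.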
  __ Dup(z2.VnB(), 0x55);
  __ Lsr(z2.VnB(), p2.Merging(), z0.VnB(), z1.VnB());
  __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnB());
  __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnB());

  __ Index(z1.VnH(), 0, 1);
  __ Dup(z6.VnB(), 0x55);
  __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnH());
  __ Lsl(z6.VnH(), p3.Merging(), z0.VnH(), z1.VnH());
  __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnH());

  __ Index(z1.VnS(), 0, 1);
  __ Dup(z10.VnB(), 0x55);
  __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
  __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
  __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnS());

  __ Index(z1.VnD(), 0, 1);
  __ Lsr(z0.VnD(), p5.Merging(), z0.VnD(), z1.VnD());
  __ Lsl(z12.VnD(), p0.Merging(), z0.VnD(), z1.VnD());
  __ Asr(z13.VnD(), p0.Merging(), z31.VnD(), z1.VnD());

  __ Dup(z11.VnD(), 0x100000001);
  __ Lsl(z14.VnD(), p0.Merging(), z1.VnD(), z11.VnD());

  __ Index(z0.VnH(), 7, -1);
  __ Lsr(z0.VnH(), p0.Merging(), z31.VnH(), z0.VnH());
  END();

  if (CAN_RUN()) {
    RUN();

    uint64_t expected_z0[] = {0x8000000020001010, 0x0800000002000101};
    ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
    uint64_t expected_z2[] = {0x5500550055005500, 0x5503550f553f55ff};
    ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
    uint64_t expected_z3[] = {0x0000000000000000, 0x80c0e0f0f8fcfeff};
    ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
    uint64_t expected_z4[] = {0xff000000ff00ffff, 0xff000000f000c080};
    ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
    uint64_t expected_z5[] = {0x01ff03ff07ff0fff, 0x1fff3fff7fffffff};
    ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
    uint64_t expected_z6[] = {0x5555ffc05555fff0, 0x5555fffc5555ffff};
    ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
    uint64_t expected_z7[] = {0xff000000fc00f808, 0xf0000000c0008080};
    ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
    uint64_t expected_z8[] = {0x1fffffff3fffffff, 0x7fffffffffffffff};
    ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
    uint64_t expected_z9[] = {0xfffffff8fffffffc, 0xfffffffeffffffff};
    ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
    uint64_t expected_z10[] = {0x55555555e0002020, 0x5555555580008080};
    ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
    uint64_t expected_z12[] = {0xfffffffffffffffe, 0xffffffffffffffff};
    ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
    uint64_t expected_z13[] = {0xc000000040004040, 0x8000000080008080};
    ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
    uint64_t expected_z14[] = {0, 0};
    ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
  }
}

TEST_SVE(sve_shift_by_wide_vector) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);

  START();
  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
  __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
  __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());

  __ Dup(z31.VnD(), 0x8000000080008080);
  __ Dup(z0.VnB(), -1);
  __ Index(z1.VnD(), 1, 5);
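  // z1.d holds {1, 6, 11, ...}; every narrow lane within a given 64-bit block
  // is shifted by that block's single .D shift amount.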

  __ Dup(z2.VnB(), 0x55);
  __ Lsr(z2.VnB(), p2.Merging(), z2.VnB(), z1.VnD());
  __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnD());
  __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnD());

  __ Dup(z6.VnB(), 0x55);
  __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnD());
  __ Lsl(z6.VnH(), p3.Merging(), z6.VnH(), z1.VnD());
  __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnD());

  __ Dup(z10.VnB(), 0x55);
  __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
  __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
  __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnD());
  END();

  if (CAN_RUN()) {
    RUN();

    uint64_t expected_z2[] = {0x5501550155015501, 0x552a552a552a552a};
    ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
    uint64_t expected_z3[] = {0xc0c0c0c0c0c0c0c0, 0xfefefefefefefefe};
    ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
    uint64_t expected_z4[] = {0xfe000000fe00fefe, 0xc0000000c000c0c0};
    ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
    uint64_t expected_z5[] = {0x03ff03ff03ff03ff, 0x7fff7fff7fff7fff};
    ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
    uint64_t expected_z6[] = {0x5555554055555540, 0x5555aaaa5555aaaa};
    ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
    uint64_t expected_z7[] = {0xfe000000fe00fe02, 0xc0000000c000c040};
    ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
    uint64_t expected_z8[] = {0x03ffffff03ffffff, 0x7fffffff7fffffff};
    ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
    uint64_t expected_z9[] = {0xffffffc0ffffffc0, 0xfffffffefffffffe};
    ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
    uint64_t expected_z10[] = {0x55555555fe000202, 0x55555555c0004040};
    ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
  }
}

TEST_SVE(sve_pred_shift_imm) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);

  START();
  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
  __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
  __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
  __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());

  __ Dup(z31.VnD(), 0x8000000080008080);
  __ Lsr(z0.VnB(), p0.Merging(), z31.VnB(), 1);
  __ Mov(z1, z0);
  __ Lsl(z1.VnB(), p2.Merging(), z1.VnB(), 1);
  __ Asr(z2.VnB(), p0.Merging(), z1.VnB(), 2);

  __ Lsr(z3.VnH(), p0.Merging(), z31.VnH(), 2);
  __ Mov(z4, z3);
  __ Lsl(z4.VnH(), p3.Merging(), z4.VnH(), 2);
  __ Asr(z5.VnH(), p0.Merging(), z4.VnH(), 3);

  __ Lsr(z6.VnS(), p0.Merging(), z31.VnS(), 3);
  __ Mov(z7, z6);
  __ Lsl(z7.VnS(), p4.Merging(), z7.VnS(), 3);
  __ Asr(z8.VnS(), p0.Merging(), z7.VnS(), 4);

  __ Lsr(z9.VnD(), p0.Merging(), z31.VnD(), 4);
  __ Mov(z10, z9);
  __ Lsl(z10.VnD(), p5.Merging(), z10.VnD(), 4);
  __ Asr(z11.VnD(), p0.Merging(), z10.VnD(), 5);
  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t expected_z0[] = {0x4000000040004040, 0x4000000040004040};
    ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
    uint64_t expected_z1[] = {0x4000000040004080, 0x4000000040004080};
    ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
    uint64_t expected_z2[] = {0x10000000100010e0, 0x10000000100010e0};
    ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
    uint64_t expected_z3[] = {0x2000000020002020, 0x2000000020002020};
    ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
    uint64_t expected_z4[] = {0x2000000020008080, 0x2000000020008080};
    ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
    uint64_t expected_z5[] = {0x040000000400f010, 0x040000000400f010};
    ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
    uint64_t expected_z6[] = {0x1000000010001010, 0x1000000010001010};
    ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
    uint64_t expected_z7[] = {0x1000000080008080, 0x1000000080008080};
    ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
    uint64_t expected_z8[] = {0x01000000f8000808, 0x01000000f8000808};
    ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
    uint64_t expected_z9[] = {0x0800000008000808, 0x0800000008000808};
    ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
    uint64_t expected_z10[] = {0x0800000008000808, 0x8000000080008080};
    ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
    uint64_t expected_z11[] = {0x0040000000400040, 0xfc00000004000404};
    ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
  }
}

TEST_SVE(sve_asrd) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);

  START();
  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
  __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
  __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
  __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());

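  // Asrd is an arithmetic shift right for divide: it rounds towards zero, so
  // it behaves as a signed division by 2^shift.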
  __ Index(z31.VnB(), 0x7f - 3, 1);
  __ Asrd(z0.VnB(), p0.Merging(), z31.VnB(), 1);
  __ Mov(z1, z31);
  __ Asrd(z1.VnB(), p2.Merging(), z1.VnB(), 2);
  __ Asrd(z2.VnB(), p0.Merging(), z31.VnB(), 7);
  __ Asrd(z3.VnB(), p0.Merging(), z31.VnB(), 8);

  __ Index(z31.VnH(), 0x7fff - 3, 1);
  __ Asrd(z4.VnH(), p0.Merging(), z31.VnH(), 1);
  __ Mov(z5, z31);
  __ Asrd(z5.VnH(), p3.Merging(), z5.VnH(), 2);
  __ Asrd(z6.VnH(), p0.Merging(), z31.VnH(), 15);
  __ Asrd(z7.VnH(), p0.Merging(), z31.VnH(), 16);

  __ Index(z31.VnS(), 0x7fffffff - 1, 1);
  __ Asrd(z8.VnS(), p0.Merging(), z31.VnS(), 1);
  __ Mov(z9, z31);
  __ Asrd(z9.VnS(), p4.Merging(), z9.VnS(), 2);
  __ Asrd(z10.VnS(), p0.Merging(), z31.VnS(), 31);
  __ Asrd(z11.VnS(), p0.Merging(), z31.VnS(), 32);

  __ Index(z31.VnD(), 0x7fffffffffffffff, 1);
  __ Asrd(z12.VnD(), p0.Merging(), z31.VnD(), 1);
  __ Mov(z13, z31);
  __ Asrd(z13.VnD(), p5.Merging(), z13.VnD(), 2);
  __ Asrd(z14.VnD(), p0.Merging(), z31.VnD(), 63);
  __ Asrd(z31.VnD(), p0.Merging(), z31.VnD(), 64);
  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t expected_z0[] = {0xc6c5c5c4c4c3c3c2, 0xc2c1c1c03f3f3e3e};
    ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
    uint64_t expected_z1[] = {0x8be389e287e285e1, 0x83e181e07f1f7d1f};
    ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
    uint64_t expected_z2[] = {0x0000000000000000, 0x000000ff00000000};
    ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
    uint64_t expected_z3[] = {0x0000000000000000, 0x0000000000000000};
    ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
    uint64_t expected_z4[] = {0xc002c001c001c000, 0x3fff3fff3ffe3ffe};
    ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
    uint64_t expected_z5[] = {0x8003e0018001e000, 0x7fff1fff7ffd1fff};
    ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
    uint64_t expected_z6[] = {0x000000000000ffff, 0x0000000000000000};
    ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
    uint64_t expected_z7[] = {0x0000000000000000, 0x0000000000000000};
    ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
    uint64_t expected_z8[] = {0xc0000001c0000000, 0x3fffffff3fffffff};
    ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
    uint64_t expected_z9[] = {0x80000001e0000000, 0x7fffffff1fffffff};
    ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
    uint64_t expected_z10[] = {0x00000000ffffffff, 0x0000000000000000};
    ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
    uint64_t expected_z11[] = {0x0000000000000000, 0x0000000000000000};
    ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
    uint64_t expected_z12[] = {0xc000000000000000, 0x3fffffffffffffff};
    ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
    uint64_t expected_z13[] = {0x8000000000000000, 0x1fffffffffffffff};
    ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
    uint64_t expected_z14[] = {0xffffffffffffffff, 0x0000000000000000};
    ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
    uint64_t expected_z31[] = {0x0000000000000000, 0x0000000000000000};
    ASSERT_EQUAL_SVE(expected_z31, z31.VnD());
  }
}

TEST_SVE(sve_setffr) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

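  // Setffr sets every bit of the first-fault register (FFR); Rdffr copies the
  // FFR into a predicate, so p14 should match the all-true p15.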
  __ Ptrue(p15.VnB());
  __ Setffr();
  __ Rdffr(p14.VnB());

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(p14.VnB(), p15.VnB());
  }
}

static void WrffrHelper(Test* config, unsigned active_lanes) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int inputs[kPRegMaxSize] = {0};
  VIXL_ASSERT(active_lanes <= kPRegMaxSize);
  for (unsigned i = 0; i < active_lanes; i++) {
    // The rightmost (highest-indexed) array element maps to the lowest-numbered
    // lane.
    inputs[kPRegMaxSize - i - 1] = 1;
  }

  Initialise(&masm, p1.VnB(), inputs);
  __ Wrffr(p1.VnB());
  __ Rdffr(p2.VnB());

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(p1.VnB(), p2.VnB());
  }
}

TEST_SVE(sve_wrffr) {
  int active_lanes_inputs[] = {0, 1, 7, 10, 32, 48, kPRegMaxSize};
  for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
    WrffrHelper(config, active_lanes_inputs[i]);
  }
}

template <size_t N>
static void RdffrHelper(Test* config,
                        size_t active_lanes,
                        const int (&pg_inputs)[N]) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  VIXL_ASSERT(active_lanes <= kPRegMaxSize);

  // The rightmost (highest-indexed) array element maps to the lowest-numbered
  // lane.
  int pd[kPRegMaxSize] = {0};
  for (unsigned i = 0; i < active_lanes; i++) {
    pd[kPRegMaxSize - i - 1] = 1;
  }

  int pg[kPRegMaxSize] = {0};
  for (unsigned i = 0; i < N; i++) {
    pg[kPRegMaxSize - i - 1] = pg_inputs[i];
  }

  int pd_expected[kPRegMaxSize] = {0};
  for (unsigned i = 0; i < std::min(active_lanes, N); i++) {
    int lane = kPRegMaxSize - i - 1;
    pd_expected[lane] = pd[lane] & pg[lane];
  }

  Initialise(&masm, p0.VnB(), pg);
  Initialise(&masm, p1.VnB(), pd);

  // The unpredicated form of rdffr has been tested in `WrffrHelper`.
  __ Wrffr(p1.VnB());
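  // Rdffr with a governing predicate reads FFR AND pg; Rdffrs does the same
  // and also sets NZCV as for a predicate test.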
  __ Rdffr(p14.VnB(), p0.Zeroing());
  __ Rdffrs(p13.VnB(), p0.Zeroing());
  __ Mrs(x8, NZCV);

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(pd_expected, p14.VnB());
    ASSERT_EQUAL_SVE(pd_expected, p13.VnB());
    StatusFlags nzcv_expected =
        GetPredTestFlags(pd_expected, pg, core.GetSVELaneCount(kBRegSize));
    ASSERT_EQUAL_64(nzcv_expected, x8);
  }
}

TEST_SVE(sve_rdffr_rdffrs) {
  // clang-format off
  int active_lanes_inputs[] = {0, 1, 15, 26, 39, 47, kPRegMaxSize};
  int pg_inputs_0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  int pg_inputs_1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
  int pg_inputs_2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
  int pg_inputs_3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
  int pg_inputs_4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  // clang-format on

  for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
    RdffrHelper(config, active_lanes_inputs[i], pg_inputs_0);
    RdffrHelper(config, active_lanes_inputs[i], pg_inputs_1);
    RdffrHelper(config, active_lanes_inputs[i], pg_inputs_2);
    RdffrHelper(config, active_lanes_inputs[i], pg_inputs_3);
    RdffrHelper(config, active_lanes_inputs[i], pg_inputs_4);
  }
}

typedef void (MacroAssembler::*BrkpFn)(const PRegisterWithLaneSize& pd,
                                       const PRegisterZ& pg,
                                       const PRegisterWithLaneSize& pn,
                                       const PRegisterWithLaneSize& pm);

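// Summarising the architectural behaviour (checked against the expected
// values below): if the last active element of pn is true, the active lanes
// of pd are set up to and including (Brkpa) or up to but excluding (Brkpb)
// the first active true lane of pm, and all other lanes are cleared; if the
// last active element of pn is false, pd is set to all-false. The `s` forms
// additionally set NZCV.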
template <typename Tg, typename Tn, typename Td>
static void BrkpaBrkpbHelper(Test* config,
                             BrkpFn macro,
                             BrkpFn macro_set_flags,
                             const Tg& pg_inputs,
                             const Tn& pn_inputs,
                             const Tn& pm_inputs,
                             const Td& pd_expected) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  PRegister pg = p15;
  PRegister pn = p14;
  PRegister pm = p13;
  Initialise(&masm, pg.VnB(), pg_inputs);
  Initialise(&masm, pn.VnB(), pn_inputs);
  Initialise(&masm, pm.VnB(), pm_inputs);

  // Initialise NZCV to an impossible value, to check that we actually write it.
  __ Mov(x10, NZCVFlag);
  __ Msr(NZCV, x10);

  (masm.*macro_set_flags)(p0.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
  __ Mrs(x0, NZCV);

  (masm.*macro)(p1.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(pd_expected, p0.VnB());

    // Check that the flags were properly set.
    StatusFlags nzcv_expected =
        GetPredTestFlags(pd_expected,
                         pg_inputs,
                         core.GetSVELaneCount(kBRegSize));
    ASSERT_EQUAL_64(nzcv_expected, x0);
    ASSERT_EQUAL_SVE(p0.VnB(), p1.VnB());
  }
}

template <typename Tg, typename Tn, typename Td>
static void BrkpaHelper(Test* config,
                        const Tg& pg_inputs,
                        const Tn& pn_inputs,
                        const Tn& pm_inputs,
                        const Td& pd_expected) {
  BrkpaBrkpbHelper(config,
                   &MacroAssembler::Brkpa,
                   &MacroAssembler::Brkpas,
                   pg_inputs,
                   pn_inputs,
                   pm_inputs,
                   pd_expected);
}

template <typename Tg, typename Tn, typename Td>
static void BrkpbHelper(Test* config,
                        const Tg& pg_inputs,
                        const Tn& pn_inputs,
                        const Tn& pm_inputs,
                        const Td& pd_expected) {
  BrkpaBrkpbHelper(config,
                   &MacroAssembler::Brkpb,
                   &MacroAssembler::Brkpbs,
                   pg_inputs,
                   pn_inputs,
                   pm_inputs,
                   pd_expected);
}

TEST_SVE(sve_brkpb) {
  // clang-format off
  // The last active element of `pn` is `true` in all vector length configurations.
  //                                | boundary of 128-bits VL.
  //                                v
  int pg_1[] =      {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
  int pg_2[] =      {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
  int pg_3[] =      {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};

  //                 | highest-numbered lane                lowest-numbered lane |
  //                 v                                                           v
  int pn_1[] =      {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
  int pn_2[] =      {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
  int pn_3[] =      {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};

  int pm_1[] =      {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
  int pm_2[] =      {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  int pm_3[] =      {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};

  //                                                                    | first active
  //                                                                    v
  int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
  //                                            | first active
  //                                            v
  int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
  //                                                                    | first active
  //                                                                    v
  int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};

  BrkpbHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
  BrkpbHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
  BrkpbHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);

  //                                               | first active
  //                                               v
  int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
  //                                            | first active
  //                                            v
  int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
  //                                                                 | first active
  //                                                                 v
  int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
  BrkpbHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
  BrkpbHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
  BrkpbHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);

  //                                                                    | first active
  //                                                                    v
  int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
  //                                                                    | first active
  //                                                                    v
  int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
  //                                      | first active
  //                                      v
  int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
  BrkpbHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
  BrkpbHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
  BrkpbHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);

  // The last active element of `pn` is `false` in all vector length configurations.
13503   //                       | last active lane when VL > 128 bits.
13504   //                       v
13505   //                                   | last active lane when VL == 128 bits.
13506   //                                   v
13507   int pg_4[] =      {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13508   int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13509   BrkpbHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13510   BrkpbHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13511   BrkpbHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13512   // clang-format on
13513 }
13514 
13515 TEST_SVE(sve_brkpa) {
13516   // clang-format off
13517   // The last active element of `pn` is `true` in all vector length configurations.
13518   //                                | boundary of 128-bit VL.
13519   //                                v
13520   int pg_1[] =      {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13521   int pg_2[] =      {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13522   int pg_3[] =      {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13523 
13524   //                 | highest-numbered lane                lowest-numbered lane |
13525   //                 v                                                           v
13526   int pn_1[] =      {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13527   int pn_2[] =      {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13528   int pn_3[] =      {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
13529 
13530   int pm_1[] =      {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13531   int pm_2[] =      {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13532   int pm_3[] =      {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13533 
13534   //                                                                    | first active
13535   //                                                                    v
13536   int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13537   //                                            | first active
13538   //                                            v
13539   int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13540   //                                                                    | first active
13541   //                                                                    v
13542   int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13543 
13544   BrkpaHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
13545   BrkpaHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
13546   BrkpaHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
13547 
13548   //                                               | first active
13549   //                                               v
13550   int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13551   //                                            | first active
13552   //                                            v
13553   int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13554   //                                                                 | first active
13555   //                                                                 v
13556   int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
13557   BrkpaHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
13558   BrkpaHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
13559   BrkpaHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
13560 
13561   //                                                                    | first active
13562   //                                                                    v
13563   int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13564   //                                                                    | first active
13565   //                                                                    v
13566   int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13567   //                                      | first active
13568   //                                      v
13569   int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13570   BrkpaHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
13571   BrkpaHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
13572   BrkpaHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
13573 
13574   // The last active element of `pn` is `false` in all vector length configurations.
13575   //                       | last active lane when VL > 128 bits.
13576   //                       v
13577   //                                   | last active lane when VL == 128 bits.
13578   //                                   v
13579   int pg_4[] =      {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13580   int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13581   BrkpaHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13582   BrkpaHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13583   BrkpaHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13584   // clang-format on
13585 }
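
// A minimal scalar model of the BRKPA/BRKPB expectations above; this is an
// illustrative sketch, not called by the tests, and the names here are
// hypothetical. Lane 0 corresponds to the rightmost element of the arrays.
// Both forms produce an all-false result unless the last active lane of `pn`
// is true; otherwise active lanes are set until the first active true lane
// of `pm`, which BRKPA includes and BRKPB excludes.
static void BrkpaBrkpbRefModel(const int* pg,
                               const int* pn,
                               const int* pm,
                               int lanes,
                               bool is_brkpa,
                               int* pd_out) {
  // The break propagates in from `pn`: the partition starts unbroken only if
  // the last active lane of `pn` is true.
  bool unbroken = false;
  for (int i = 0; i < lanes; i++) {
    if (pg[i] != 0) unbroken = (pn[i] != 0);
  }
  for (int i = 0; i < lanes; i++) {
    if (pg[i] != 0) {
      if (is_brkpa) {
        pd_out[i] = unbroken ? 1 : 0;  // BRKPA includes the breaking lane.
        if (pm[i] != 0) unbroken = false;
      } else {
        if (pm[i] != 0) unbroken = false;  // BRKPB excludes it.
        pd_out[i] = unbroken ? 1 : 0;
      }
    } else {
      pd_out[i] = 0;  // Inactive lanes are zeroed.
    }
  }
}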
13586 
13587 TEST_SVE(sve_rbit) {
13588   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13589   START();
13590 
13591   uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13592   InsrHelper(&masm, z0.VnD(), inputs);
13593 
13594   __ Ptrue(p1.VnB());
13595   int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13596   Initialise(&masm, p2.VnB(), pred);
13597 
13598   __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
13599   __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
13600 
13601   __ Rbit(z1.VnB(), p1.Merging(), z0.VnB());
13602   __ Rbit(z2.VnH(), p1.Merging(), z0.VnH());
13603   __ Rbit(z3.VnS(), p1.Merging(), z0.VnS());
13604   __ Rbit(z4.VnD(), p1.Merging(), z0.VnD());
13605 
13606   __ Dup(z5.VnB(), 0x42);
13607   __ Rbit(z5.VnB(), p2.Merging(), z0.VnB());
13608   __ Dup(z6.VnB(), 0x42);
13609   __ Rbit(z6.VnS(), p2.Merging(), z0.VnS());
13610 
13611   END();
13612 
13613   if (CAN_RUN()) {
13614     RUN();
13615 
13616     ASSERT_EQUAL_SVE(inputs, z0.VnD());
13617 
13618     uint64_t expected_z1[] = {0x55555555aaaaaaaa, 0x5555aaaa55aa55aa};
13619     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13620     uint64_t expected_z2[] = {0x55555555aaaaaaaa, 0x5555aaaaaa55aa55};
13621     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13622     uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0xaaaa5555aa55aa55};
13623     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13624     uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0xaa55aa55aaaa5555};
13625     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13626     uint64_t expected_z5[] = {0x4255425542aa42aa, 0x4255424242aa42aa};
13627     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13628     uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0x42424242aa55aa55};
13629     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13630   }
13631 }
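
// Rbit reverses the bit order within each lane, so applying it twice is the
// identity; the z0 check above relies on exactly that. A scalar sketch for
// one B-sized lane (an illustrative helper, not used by the tests):
static uint8_t RbitRefModel(uint8_t x) {
  uint8_t result = 0;
  for (int i = 0; i < 8; i++) {
    // Bit i of the input becomes bit (7 - i) of the result.
    result = static_cast<uint8_t>((result << 1) | ((x >> i) & 1));
  }
  return result;
}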
13632 
13633 TEST_SVE(sve_rev_bhw) {
13634   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13635   START();
13636 
13637   uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13638   InsrHelper(&masm, z0.VnD(), inputs);
13639 
13640   __ Ptrue(p1.VnB());
13641   int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13642   Initialise(&masm, p2.VnB(), pred);
13643 
13644   __ Revb(z1.VnH(), p1.Merging(), z0.VnH());
13645   __ Revb(z2.VnS(), p1.Merging(), z0.VnS());
13646   __ Revb(z3.VnD(), p1.Merging(), z0.VnD());
13647   __ Revh(z4.VnS(), p1.Merging(), z0.VnS());
13648   __ Revh(z5.VnD(), p1.Merging(), z0.VnD());
13649   __ Revw(z6.VnD(), p1.Merging(), z0.VnD());
13650 
13651   __ Dup(z7.VnB(), 0x42);
13652   __ Revb(z7.VnH(), p2.Merging(), z0.VnH());
13653   __ Dup(z8.VnB(), 0x42);
13654   __ Revh(z8.VnS(), p2.Merging(), z0.VnS());
13655 
13656   END();
13657 
13658   if (CAN_RUN()) {
13659     RUN();
13660 
13661     uint64_t expected_z1[] = {0xaaaaaaaa55555555, 0xaaaa555555aa55aa};
13662     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13663     uint64_t expected_z2[] = {0xaaaaaaaa55555555, 0x5555aaaa55aa55aa};
13664     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13665     uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0x55aa55aa5555aaaa};
13666     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13667     uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0x5555aaaaaa55aa55};
13668     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13669     uint64_t expected_z5[] = {0x55555555aaaaaaaa, 0xaa55aa555555aaaa};
13670     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13671     uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0xaa55aa55aaaa5555};
13672     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13673     uint64_t expected_z7[] = {0xaaaaaaaa55555555, 0xaaaa424255aa55aa};
13674     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13675     uint64_t expected_z8[] = {0xaaaaaaaa55555555, 0x42424242aa55aa55};
13676     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13677   }
13678 }
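
// Revb, Revh and Revw reverse the order of byte, halfword or word chunks
// within each lane. A scalar sketch over one lane (illustrative only, not
// used by the tests; `chunk_bits` must be a power of two below `lane_bits`):
static uint64_t RevChunksRefModel(uint64_t lane, int lane_bits,
                                  int chunk_bits) {
  uint64_t chunk_mask = (UINT64_C(1) << chunk_bits) - 1;
  int chunks = lane_bits / chunk_bits;
  uint64_t result = 0;
  for (int i = 0; i < chunks; i++) {
    uint64_t chunk = (lane >> (i * chunk_bits)) & chunk_mask;
    result |= chunk << ((chunks - 1 - i) * chunk_bits);
  }
  return result;
}
// For example, RevChunksRefModel(0xaa55, 16, 8) gives 0x55aa, matching Revb
// on an H-sized lane.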
13679 
13680 TEST_SVE(sve_ftssel) {
13681   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13682   START();
13683 
13684   uint64_t in[] = {0x1111777766665555, 0xaaaabbbbccccdddd};
13685   uint64_t q[] = {0x0001000300000002, 0x0001000200000003};
13686   InsrHelper(&masm, z0.VnD(), in);
13687   InsrHelper(&masm, z1.VnD(), q);
13688 
13689   __ Ftssel(z2.VnH(), z0.VnH(), z1.VnH());
13690   __ Ftssel(z3.VnS(), z0.VnS(), z1.VnS());
13691   __ Ftssel(z4.VnD(), z0.VnD(), z1.VnD());
13692 
13693   END();
13694 
13695   if (CAN_RUN()) {
13696     RUN();
13697 
13698     uint64_t expected_z2[] = {0x3c00bc006666d555, 0x3c003bbbccccbc00};
13699     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13700     uint64_t expected_z3[] = {0xbf800000e6665555, 0x2aaabbbbbf800000};
13701     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13702     uint64_t expected_z4[] = {0x9111777766665555, 0xbff0000000000000};
13703     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13704   }
13705 }
13706 
13707 TEST_SVE(sve_fexpa) {
13708   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13709   START();
13710 
13711   uint64_t in0[] = {0x3ff0000000000000, 0x3ff0000000011001};
13712   uint64_t in1[] = {0x3ff000000002200f, 0xbff000000003301f};
13713   uint64_t in2[] = {0xbff000000004403f, 0x3ff0000000055040};
13714   uint64_t in3[] = {0x3f800000bf800001, 0x3f80000f3f80001f};
13715   uint64_t in4[] = {0x3f80002f3f82203f, 0xbf8000403f833041};
13716   uint64_t in5[] = {0x3c003c01bc00bc07, 0x3c08bc0f3c1fbc20};
13717   InsrHelper(&masm, z0.VnD(), in0);
13718   InsrHelper(&masm, z1.VnD(), in1);
13719   InsrHelper(&masm, z2.VnD(), in2);
13720   InsrHelper(&masm, z3.VnD(), in3);
13721   InsrHelper(&masm, z4.VnD(), in4);
13722   InsrHelper(&masm, z5.VnD(), in5);
13723 
13724   __ Fexpa(z6.VnD(), z0.VnD());
13725   __ Fexpa(z7.VnD(), z1.VnD());
13726   __ Fexpa(z8.VnD(), z2.VnD());
13727   __ Fexpa(z9.VnS(), z3.VnS());
13728   __ Fexpa(z10.VnS(), z4.VnS());
13729   __ Fexpa(z11.VnH(), z5.VnH());
13730 
13731   END();
13732 
13733   if (CAN_RUN()) {
13734     RUN();
13735     uint64_t expected_z6[] = {0x0000000000000000, 0x44002c9a3e778061};
13736     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13737     uint64_t expected_z7[] = {0x0802d285a6e4030b, 0x4c06623882552225};
13738     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13739     uint64_t expected_z8[] = {0x100fa7c1819e90d8, 0x5410000000000000};
13740     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13741     uint64_t expected_z9[] = {0x00000000000164d2, 0x0016942d003311c4};
13742     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13743     uint64_t expected_z10[] = {0x0054f35b407d3e0c, 0x00800000608164d2};
13744     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13745     uint64_t expected_z11[] = {0x00000016000000a8, 0x00c2018903d40400};
13746     ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13747   }
13748 }
13749 
13750 TEST_SVE(sve_rev_p) {
13751   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13752   START();
13753 
13754   Initialise(&masm,
13755              p0.VnB(),
13756              0xabcdabcdabcdabcd,
13757              0xabcdabcdabcdabcd,
13758              0xabcdabcdabcdabcd,
13759              0xabcdabcdabcdabcd);
13760 
13761   __ Rev(p1.VnB(), p0.VnB());
13762   __ Rev(p2.VnH(), p0.VnH());
13763   __ Rev(p3.VnS(), p0.VnS());
13764   __ Rev(p4.VnD(), p0.VnD());
13765 
13766   END();
13767 
13768   if (CAN_RUN()) {
13769     RUN();
13770 
13771     int p1_expected[] = {1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1};
13772     ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
13773     int p2_expected[] = {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0};
13774     ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13775     int p3_expected[] = {1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0};
13776     ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13777     int p4_expected[] = {1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1};
13778     ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13779   }
13780 }
13781 
13782 TEST_SVE(sve_trn_p_bh) {
13783   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13784   START();
13785 
13786   Initialise(&masm, p0.VnB(), 0xa5a55a5a);
13787   __ Pfalse(p1.VnB());
13788 
13789   __ Trn1(p2.VnB(), p0.VnB(), p0.VnB());
13790   __ Trn2(p3.VnB(), p0.VnB(), p0.VnB());
13791   __ Trn1(p4.VnB(), p1.VnB(), p0.VnB());
13792   __ Trn2(p5.VnB(), p1.VnB(), p0.VnB());
13793   __ Trn1(p6.VnB(), p0.VnB(), p1.VnB());
13794   __ Trn2(p7.VnB(), p0.VnB(), p1.VnB());
13795 
13796   __ Trn1(p8.VnH(), p0.VnH(), p0.VnH());
13797   __ Trn2(p9.VnH(), p0.VnH(), p0.VnH());
13798   __ Trn1(p10.VnH(), p1.VnH(), p0.VnH());
13799   __ Trn2(p11.VnH(), p1.VnH(), p0.VnH());
13800   __ Trn1(p12.VnH(), p0.VnH(), p1.VnH());
13801   __ Trn2(p13.VnH(), p0.VnH(), p1.VnH());
13802 
13803   END();
13804 
13805   if (CAN_RUN()) {
13806     RUN();
13807     int p2_expected[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
13808     int p3_expected[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
13809     ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13810     ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13811 
13812     int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13813     int p5_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13814     ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13815     ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13816 
13817     int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0};
13818     int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
13819     ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13820     ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13821 
13822     int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13823     int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13824     ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13825     ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13826 
13827     int p10_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13828     int p11_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13829     ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13830     ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13831 
13832     int p12_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13833     int p13_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13834     ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13835     ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13836   }
13837 }
13838 
13839 TEST_SVE(sve_trn_p_sd) {
13840   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13841   START();
13842 
13843   Initialise(&masm, p0.VnB(), 0x55a55aaa);
13844   __ Pfalse(p1.VnB());
13845 
13846   __ Trn1(p2.VnS(), p0.VnS(), p0.VnS());
13847   __ Trn2(p3.VnS(), p0.VnS(), p0.VnS());
13848   __ Trn1(p4.VnS(), p1.VnS(), p0.VnS());
13849   __ Trn2(p5.VnS(), p1.VnS(), p0.VnS());
13850   __ Trn1(p6.VnS(), p0.VnS(), p1.VnS());
13851   __ Trn2(p7.VnS(), p0.VnS(), p1.VnS());
13852 
13853   __ Trn1(p8.VnD(), p0.VnD(), p0.VnD());
13854   __ Trn2(p9.VnD(), p0.VnD(), p0.VnD());
13855   __ Trn1(p10.VnD(), p1.VnD(), p0.VnD());
13856   __ Trn2(p11.VnD(), p1.VnD(), p0.VnD());
13857   __ Trn1(p12.VnD(), p0.VnD(), p1.VnD());
13858   __ Trn2(p13.VnD(), p0.VnD(), p1.VnD());
13859 
13860   END();
13861 
13862   if (CAN_RUN()) {
13863     RUN();
13864     int p2_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13865     int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13866     ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13867     ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13868 
13869     int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13870     int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13871     ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13872     ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13873 
13874     int p6_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13875     int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13876     ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13877     ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13878 
13879     int p8_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13880     int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13881     ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13882     ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13883 
13884     int p10_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13885     int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13886     ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13887     ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13888 
13889     int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13890     int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13891     ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13892     ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13893   }
13894 }
13895 
13896 TEST_SVE(sve_zip_p_bh) {
13897   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13898   START();
13899 
13900   Initialise(&masm,
13901              p0.VnB(),
13902              0x5a5a5a5a5a5a5a5a,
13903              0x5a5a5a5a5a5a5a5a,
13904              0x5a5a5a5a5a5a5a5a,
13905              0x5a5a5a5a5a5a5a5a);
13906   __ Pfalse(p1.VnB());
13907 
13908   __ Zip1(p2.VnB(), p0.VnB(), p0.VnB());
13909   __ Zip2(p3.VnB(), p0.VnB(), p0.VnB());
13910   __ Zip1(p4.VnB(), p1.VnB(), p0.VnB());
13911   __ Zip2(p5.VnB(), p1.VnB(), p0.VnB());
13912   __ Zip1(p6.VnB(), p0.VnB(), p1.VnB());
13913   __ Zip2(p7.VnB(), p0.VnB(), p1.VnB());
13914 
13915   __ Zip1(p8.VnH(), p0.VnH(), p0.VnH());
13916   __ Zip2(p9.VnH(), p0.VnH(), p0.VnH());
13917   __ Zip1(p10.VnH(), p1.VnH(), p0.VnH());
13918   __ Zip2(p11.VnH(), p1.VnH(), p0.VnH());
13919   __ Zip1(p12.VnH(), p0.VnH(), p1.VnH());
13920   __ Zip2(p13.VnH(), p0.VnH(), p1.VnH());
13921 
13922   END();
13923 
13924   if (CAN_RUN()) {
13925     RUN();
13926     int p2_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13927     int p3_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13928     ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13929     ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13930 
13931     int p4_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13932     int p5_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13933     ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13934     ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13935 
13936     int p6_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13937     int p7_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13938     ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13939     ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13940 
13941     int p8_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13942     int p9_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13943     ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13944     ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13945 
13946     int p10_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13947     int p11_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13948     ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13949     ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13950 
13951     int p12_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13952     int p13_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13953     ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13954     ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13955   }
13956 }
13957 
13958 TEST_SVE(sve_zip_p_sd) {
13959   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13960   START();
13961 
13962   Initialise(&masm,
13963              p0.VnB(),
13964              0x5a5a5a5a5a5a5a5a,
13965              0x5a5a5a5a5a5a5a5a,
13966              0x5a5a5a5a5a5a5a5a,
13967              0x5a5a5a5a5a5a5a5a);
13968   __ Pfalse(p1.VnB());
13969 
13970   __ Zip1(p2.VnS(), p0.VnS(), p0.VnS());
13971   __ Zip2(p3.VnS(), p0.VnS(), p0.VnS());
13972   __ Zip1(p4.VnS(), p1.VnS(), p0.VnS());
13973   __ Zip2(p5.VnS(), p1.VnS(), p0.VnS());
13974   __ Zip1(p6.VnS(), p0.VnS(), p1.VnS());
13975   __ Zip2(p7.VnS(), p0.VnS(), p1.VnS());
13976 
13977   __ Zip1(p8.VnD(), p0.VnD(), p0.VnD());
13978   __ Zip2(p9.VnD(), p0.VnD(), p0.VnD());
13979   __ Zip1(p10.VnD(), p1.VnD(), p0.VnD());
13980   __ Zip2(p11.VnD(), p1.VnD(), p0.VnD());
13981   __ Zip1(p12.VnD(), p0.VnD(), p1.VnD());
13982   __ Zip2(p13.VnD(), p0.VnD(), p1.VnD());
13983 
13984   END();
13985 
13986   if (CAN_RUN()) {
13987     RUN();
13988     int p2_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13989     int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13990     ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13991     ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13992 
13993     int p4_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13994     int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13995     ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13996     ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13997 
13998     int p6_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13999     int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
14000     ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
14001     ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
14002 
14003     int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14004     int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14005     ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
14006     ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
14007 
14008     int p10_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14009     int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14010     ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
14011     ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
14012 
14013     int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14014     int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14015     ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
14016     ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
14017   }
14018 }
14019 
14020 TEST_SVE(sve_uzp_p) {
14021   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14022   START();
14023 
14024   Initialise(&masm,
14025              p0.VnB(),
14026              0xf0f0ff00ffff0000,
14027              0x4242424242424242,
14028              0x5a5a5a5a5a5a5a5a,
14029              0x0123456789abcdef);
14030   __ Rev(p1.VnB(), p0.VnB());
14031 
14032   __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
14033   __ Zip2(p3.VnB(), p0.VnB(), p1.VnB());
14034   __ Uzp1(p4.VnB(), p2.VnB(), p3.VnB());
14035   __ Uzp2(p5.VnB(), p2.VnB(), p3.VnB());
14036 
14037   __ Zip1(p2.VnH(), p0.VnH(), p1.VnH());
14038   __ Zip2(p3.VnH(), p0.VnH(), p1.VnH());
14039   __ Uzp1(p6.VnH(), p2.VnH(), p3.VnH());
14040   __ Uzp2(p7.VnH(), p2.VnH(), p3.VnH());
14041 
14042   __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14043   __ Zip2(p3.VnS(), p0.VnS(), p1.VnS());
14044   __ Uzp1(p8.VnS(), p2.VnS(), p3.VnS());
14045   __ Uzp2(p9.VnS(), p2.VnS(), p3.VnS());
14046 
14047   __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14048   __ Zip2(p3.VnD(), p0.VnD(), p1.VnD());
14049   __ Uzp1(p10.VnD(), p2.VnD(), p3.VnD());
14050   __ Uzp2(p11.VnD(), p2.VnD(), p3.VnD());
14051 
14052   END();
14053 
14054   if (CAN_RUN()) {
14055     RUN();
14056 
14057     ASSERT_EQUAL_SVE(p0, p4);
14058     ASSERT_EQUAL_SVE(p1, p5);
14059     ASSERT_EQUAL_SVE(p0, p6);
14060     ASSERT_EQUAL_SVE(p1, p7);
14061     ASSERT_EQUAL_SVE(p0, p8);
14062     ASSERT_EQUAL_SVE(p1, p9);
14063     ASSERT_EQUAL_SVE(p0, p10);
14064     ASSERT_EQUAL_SVE(p1, p11);
14065   }
14066 }
14067 
14068 TEST_SVE(sve_punpk) {
14069   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14070   START();
14071 
14072   auto get_64_bits_at = [](int byte_index) -> uint64_t {
14073     // Each 8-bit chunk has the value 0x50 + the byte index of the chunk.
14074     return 0x5756555453525150 + (0x0101010101010101 * byte_index);
14075   };
14076 
14077   Initialise(&masm,
14078              p0.VnB(),
14079              get_64_bits_at(24),
14080              get_64_bits_at(16),
14081              get_64_bits_at(8),
14082              get_64_bits_at(0));
14083   __ Punpklo(p1.VnH(), p0.VnB());
14084   __ Punpkhi(p2.VnH(), p0.VnB());
14085 
14086   END();
14087 
14088   if (CAN_RUN()) {
14089     RUN();
14090 
14091     int pl = config->sve_vl_in_bits() / kZRegBitsPerPRegBit;
14092     // For simplicity, just test the bottom 64 H-sized lanes.
14093     uint64_t p1_h_bits = get_64_bits_at(0);
14094     uint64_t p2_h_bits = get_64_bits_at(pl / (2 * 8));
14095     int p1_expected[64];
14096     int p2_expected[64];
14097     for (size_t i = 0; i < 64; i++) {
14098       p1_expected[63 - i] = (p1_h_bits >> i) & 1;
14099       p2_expected[63 - i] = (p2_h_bits >> i) & 1;
14100     }
14101     // Testing `VnH` ensures that odd-numbered B lanes are zero.
14102     ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
14103     ASSERT_EQUAL_SVE(p2_expected, p2.VnH());
14104   }
14105 }
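
// Punpklo/Punpkhi widen the low or high half of a predicate: source B-lane
// bit i becomes destination bit 2 * i, and the interleaving odd bits are
// zeroed, which is why the VnH check above works. A scalar sketch for the
// low half (illustrative only; bit i of `low_half` models source B lane i):
static uint64_t PunpkloRefModel(uint32_t low_half) {
  uint64_t result = 0;
  for (int i = 0; i < 32; i++) {
    result |= static_cast<uint64_t>((low_half >> i) & 1) << (2 * i);
  }
  return result;
}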
14106 
14107 typedef void (MacroAssembler::*BrkFn)(const PRegisterWithLaneSize& pd,
14108                                       const PRegister& pg,
14109                                       const PRegisterWithLaneSize& pn);
14110 
14111 typedef void (MacroAssembler::*BrksFn)(const PRegisterWithLaneSize& pd,
14112                                        const PRegisterZ& pg,
14113                                        const PRegisterWithLaneSize& pn);
14114 
14115 template <typename T, size_t N>
14116 static void BrkaBrkbHelper(Test* config,
14117                            BrkFn macro,
14118                            BrksFn macro_set_flags,
14119                            const T (&pd_inputs)[N],
14120                            const T (&pg_inputs)[N],
14121                            const T (&pn_inputs)[N],
14122                            const T (&pd_z_expected)[N]) {
14123   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14124   START();
14125 
14126   PRegister pg = p10;
14127   PRegister pn = p9;
14128   PRegister pd_z = p0;
14129   PRegister pd_z_s = p1;
14130   PRegister pd_m = p2;
14131   Initialise(&masm, pg.VnB(), pg_inputs);
14132   Initialise(&masm, pn.VnB(), pn_inputs);
14133   Initialise(&masm, pd_m.VnB(), pd_inputs);
14134 
14135   // Initialise NZCV to an impossible value, to check that we actually write it.
14136   __ Mov(x10, NZCVFlag);
14137   __ Msr(NZCV, x10);
14138 
14139   (masm.*macro)(pd_z.VnB(), pg.Zeroing(), pn.VnB());
14140   (masm.*macro_set_flags)(pd_z_s.VnB(), pg.Zeroing(), pn.VnB());
14141   __ Mrs(x0, NZCV);
14142 
14143   (masm.*macro)(pd_m.VnB(), pg.Merging(), pn.VnB());
14144 
14145   END();
14146 
14147   if (CAN_RUN()) {
14148     RUN();
14149 
14150     ASSERT_EQUAL_SVE(pd_z_expected, pd_z.VnB());
14151 
14152     // Check that the flags were properly set.
14153     StatusFlags nzcv_expected =
14154         GetPredTestFlags(pd_z_expected,
14155                          pg_inputs,
14156                          core.GetSVELaneCount(kBRegSize));
14157     ASSERT_EQUAL_64(nzcv_expected, x0);
14158     ASSERT_EQUAL_SVE(pd_z.VnB(), pd_z_s.VnB());
14159 
14160     T pd_m_expected[N];
14161     // Compute the expected `pd` result under merging predication.
14162     for (size_t i = 0; i < N; i++) {
14163       pd_m_expected[i] = pg_inputs[i] ? pd_z_expected[i] : pd_inputs[i];
14164     }
14165     ASSERT_EQUAL_SVE(pd_m_expected, pd_m.VnB());
14166   }
14167 }
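
// A minimal scalar model of the BRKA/BRKB expectations used below; this is
// an illustrative sketch, not called by the tests, and the names here are
// hypothetical. Lane 0 corresponds to the rightmost element of the test
// arrays. Active lanes are set until the first active true lane of `pn`,
// which BRKA includes and BRKB excludes.
static void BrkaBrkbRefModel(const int* pg,
                             const int* pn,
                             int lanes,
                             bool is_brka,
                             int* pd_out) {
  bool unbroken = true;
  for (int i = 0; i < lanes; i++) {
    if (pg[i] != 0) {
      if (is_brka) {
        pd_out[i] = unbroken ? 1 : 0;  // BRKA includes the breaking lane.
        if (pn[i] != 0) unbroken = false;
      } else {
        if (pn[i] != 0) unbroken = false;  // BRKB excludes it.
        pd_out[i] = unbroken ? 1 : 0;
      }
    } else {
      pd_out[i] = 0;  // Zeroing form; the merging form keeps the old `pd`.
    }
  }
}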
14168 
14169 template <typename T>
14170 static void BrkaHelper(Test* config,
14171                        const T& pd_inputs,
14172                        const T& pg_inputs,
14173                        const T& pn_inputs,
14174                        const T& pd_expected) {
14175   BrkaBrkbHelper(config,
14176                  &MacroAssembler::Brka,
14177                  &MacroAssembler::Brkas,
14178                  pd_inputs,
14179                  pg_inputs,
14180                  pn_inputs,
14181                  pd_expected);
14182 }
14183 
14184 TEST_SVE(sve_brka) {
14185   // clang-format off
14186   //                              | boundary of 128-bit VL.
14187   //                              v
14188   int pd[] =      {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14189 
14190   //               | highest-numbered lane                lowest-numbered lane |
14191   //               v                                                           v
14192   int pg_1[] =    {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14193   int pg_2[] =    {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14194 
14195   int pn_1[] =    {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14196   int pn_2[] =    {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14197   int pn_3[] =    {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14198 
14199   //                                                                  | first break
14200   //                                                                  v
14201   int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
14202   //                              | first break
14203   //                              v
14204   int exp_1_2[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14205   //                                                      | first break
14206   //                                                      v
14207   int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14208 
14209   BrkaHelper(config, pd, pg_1, pn_1, exp_1_1);
14210   BrkaHelper(config, pd, pg_1, pn_2, exp_1_2);
14211   BrkaHelper(config, pd, pg_1, pn_3, exp_1_3);
14212 
14213   //                                                               | first break
14214   //                                                               v
14215   int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
14216   //                                       | first break
14217   //                                       v
14218   int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14219   //                                                                           | first break
14220   //                                                                           v
14221   int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
14222   BrkaHelper(config, pd, pg_2, pn_1, exp_2_1);
14223   BrkaHelper(config, pd, pg_2, pn_2, exp_2_2);
14224   BrkaHelper(config, pd, pg_2, pn_3, exp_2_3);
14225 
14226   // With an all-inactive governing predicate, the destination is all-false.
14227   int pg_3[] =    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14228   int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14229   BrkaHelper(config, pd, pg_3, pn_1, exp_3_x);
14230   BrkaHelper(config, pd, pg_3, pn_2, exp_3_x);
14231   BrkaHelper(config, pd, pg_3, pn_3, exp_3_x);
14232   // clang-format on
14233 }
14234 
14235 template <typename T>
14236 static void BrkbHelper(Test* config,
14237                        const T& pd_inputs,
14238                        const T& pg_inputs,
14239                        const T& pn_inputs,
14240                        const T& pd_expected) {
14241   BrkaBrkbHelper(config,
14242                  &MacroAssembler::Brkb,
14243                  &MacroAssembler::Brkbs,
14244                  pd_inputs,
14245                  pg_inputs,
14246                  pn_inputs,
14247                  pd_expected);
14248 }
14249 
14250 TEST_SVE(sve_brkb) {
14251   // clang-format off
14252   //                              | boundary of 128-bit VL.
14253   //                              v
14254   int pd[] =      {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14255 
14256   //               | highest-numbered lane                lowest-numbered lane |
14257   //               v                                                           v
14258   int pg_1[] =    {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14259   int pg_2[] =    {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14260 
14261   int pn_1[] =    {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14262   int pn_2[] =    {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14263   int pn_3[] =    {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14264 
14265   //                                                                  | first break
14266   //                                                                  v
14267   int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
14268   //                              | first break
14269   //                              v
14270   int exp_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14271   //                                                      | first break
14272   //                                                      v
14273   int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
14274 
14275   BrkbHelper(config, pd, pg_1, pn_1, exp_1_1);
14276   BrkbHelper(config, pd, pg_1, pn_2, exp_1_2);
14277   BrkbHelper(config, pd, pg_1, pn_3, exp_1_3);
14278 
14279   //                                                               | first break
14280   //                                                               v
14281   int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
14282   //                                       | first break
14283   //                                       v
14284   int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14285   //                                                                           | first break
14286   //                                                                           v
14287   int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14288   BrkbHelper(config, pd, pg_2, pn_1, exp_2_1);
14289   BrkbHelper(config, pd, pg_2, pn_2, exp_2_2);
14290   BrkbHelper(config, pd, pg_2, pn_3, exp_2_3);
14291 
14292   // With an all-inactive governing predicate, the destination is all-false.
14293   int pg_3[] =    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14294   int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14295   BrkbHelper(config, pd, pg_3, pn_1, exp_3_x);
14296   BrkbHelper(config, pd, pg_3, pn_2, exp_3_x);
14297   BrkbHelper(config, pd, pg_3, pn_3, exp_3_x);
14298   // clang-format on
14299 }
14300 
14301 typedef void (MacroAssembler::*BrknFn)(const PRegisterWithLaneSize& pd,
14302                                        const PRegisterZ& pg,
14303                                        const PRegisterWithLaneSize& pn,
14304                                        const PRegisterWithLaneSize& pm);
14305 
14306 typedef void (MacroAssembler::*BrknsFn)(const PRegisterWithLaneSize& pd,
14307                                         const PRegisterZ& pg,
14308                                         const PRegisterWithLaneSize& pn,
14309                                         const PRegisterWithLaneSize& pm);
14310 
14311 enum BrknDstPredicateState { kAllFalse, kUnchanged };
14312 
14313 template <typename T, size_t N>
14314 static void BrknHelper(Test* config,
14315                        const T (&pd_inputs)[N],
14316                        const T (&pg_inputs)[N],
14317                        const T (&pn_inputs)[N],
14318                        const T (&pm_inputs)[N],
14319                        BrknDstPredicateState expected_pd_state) {
14320   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14321   START();
14322 
14323   PRegister pg = p10;
14324   PRegister pn = p9;
14325   PRegister pm = p8;
14326   PRegister pdm = p0;
14327   PRegister pd = p1;
14328   PRegister pd_s = p2;
14329   Initialise(&masm, pg.VnB(), pg_inputs);
14330   Initialise(&masm, pn.VnB(), pn_inputs);
14331   Initialise(&masm, pm.VnB(), pm_inputs);
14332   Initialise(&masm, pdm.VnB(), pm_inputs);
14333   Initialise(&masm, pd.VnB(), pd_inputs);
14334   Initialise(&masm, pd_s.VnB(), pd_inputs);
14335 
14336   // Initialise NZCV to an impossible value, to check that we actually write it.
14337   __ Mov(x10, NZCVFlag);
14338   __ Msr(NZCV, x10);
14339 
14340   __ Brkn(pdm.VnB(), pg.Zeroing(), pn.VnB(), pdm.VnB());
14341   // !pd.Aliases(pm).
14342   __ Brkn(pd.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
14343   __ Brkns(pd_s.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
14344   __ Mrs(x0, NZCV);
14345 
14346   END();
14347 
14348   if (CAN_RUN()) {
14349     RUN();
14350 
14351     T all_false[N] = {0};
14352     if (expected_pd_state == kAllFalse) {
14353       ASSERT_EQUAL_SVE(all_false, pd.VnB());
14354     } else {
14355       ASSERT_EQUAL_SVE(pm_inputs, pd.VnB());
14356     }
14357     ASSERT_EQUAL_SVE(pm_inputs, pm.VnB());
14358 
14359     T all_true[N];
14360     for (size_t i = 0; i < ArrayLength(all_true); i++) {
14361       all_true[i] = 1;
14362     }
14363 
14364     // Check that the flags were properly set.
14365     StatusFlags nzcv_expected =
14366         GetPredTestFlags((expected_pd_state == kAllFalse) ? all_false
14367                                                           : pm_inputs,
14368                          all_true,
14369                          core.GetSVELaneCount(kBRegSize));
14370     ASSERT_EQUAL_64(nzcv_expected, x0);
14371     ASSERT_EQUAL_SVE(pd.VnB(), pdm.VnB());
14372     ASSERT_EQUAL_SVE(pd.VnB(), pd_s.VnB());
14373   }
14374 }
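
// BRKN propagates the break state into the next partition: if the last
// active lane of `pn` (under `pg`) is true, `pd` takes the value of `pm`;
// otherwise `pd` becomes all-false. A scalar sketch of that decision
// (illustrative only, not called by the tests):
static bool BrknLeavesPmUnchanged(const int* pg, const int* pn, int lanes) {
  bool last_active_pn = false;
  for (int i = 0; i < lanes; i++) {
    if (pg[i] != 0) last_active_pn = (pn[i] != 0);
  }
  return last_active_pn;
}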
14375 
14376 TEST_SVE(sve_brkn) {
14377   int pd[] = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14378   int pm[] = {0, 1, 1, 1, 1, 0, 0, 1, 0, 1};
14379 
14380   int pg_1[] = {1, 1, 0, 0, 1, 0, 1, 1, 0, 0};
14381   int pg_2[] = {0, 0, 0, 1, 1, 1, 0, 0, 1, 1};
14382   int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14383 
14384   int pn_1[] = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0};
14385   int pn_2[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
14386   int pn_3[] = {0, 0, 0, 0, 1, 1, 0, 0, 1, 1};
14387 
14388   BrknHelper(config, pd, pg_1, pn_1, pm, kUnchanged);
14389   BrknHelper(config, pd, pg_1, pn_2, pm, kAllFalse);
14390   BrknHelper(config, pd, pg_1, pn_3, pm, kAllFalse);
14391 
14392   BrknHelper(config, pd, pg_2, pn_1, pm, kAllFalse);
14393   BrknHelper(config, pd, pg_2, pn_2, pm, kUnchanged);
14394   BrknHelper(config, pd, pg_2, pn_3, pm, kAllFalse);
14395 
14396   BrknHelper(config, pd, pg_3, pn_1, pm, kAllFalse);
14397   BrknHelper(config, pd, pg_3, pn_2, pm, kAllFalse);
14398   BrknHelper(config, pd, pg_3, pn_3, pm, kAllFalse);
14399 }
14400 
14401 TEST_SVE(sve_trn) {
14402   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14403   START();
14404 
14405   uint64_t in0[] = {0xffeeddccbbaa9988, 0x7766554433221100};
14406   uint64_t in1[] = {0xaa55aa55aa55aa55, 0x55aa55aa55aa55aa};
14407   InsrHelper(&masm, z0.VnD(), in0);
14408   InsrHelper(&masm, z1.VnD(), in1);
14409 
14410   __ Trn1(z2.VnB(), z0.VnB(), z1.VnB());
14411   __ Trn2(z3.VnB(), z0.VnB(), z1.VnB());
14412   __ Trn1(z4.VnH(), z0.VnH(), z1.VnH());
14413   __ Trn2(z5.VnH(), z0.VnH(), z1.VnH());
14414   __ Trn1(z6.VnS(), z0.VnS(), z1.VnS());
14415   __ Trn2(z7.VnS(), z0.VnS(), z1.VnS());
14416   __ Trn1(z8.VnD(), z0.VnD(), z1.VnD());
14417   __ Trn2(z9.VnD(), z0.VnD(), z1.VnD());
14418 
14419   END();
14420 
14421   if (CAN_RUN()) {
14422     RUN();
14423     uint64_t expected_z2[] = {0x55ee55cc55aa5588, 0xaa66aa44aa22aa00};
14424     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14425     uint64_t expected_z3[] = {0xaaffaaddaabbaa99, 0x5577555555335511};
14426     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14427     uint64_t expected_z4[] = {0xaa55ddccaa559988, 0x55aa554455aa1100};
14428     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14429     uint64_t expected_z5[] = {0xaa55ffeeaa55bbaa, 0x55aa776655aa3322};
14430     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14431     uint64_t expected_z6[] = {0xaa55aa55bbaa9988, 0x55aa55aa33221100};
14432     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14433     uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0x55aa55aa77665544};
14434     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14435     uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14436     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14437     uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14438     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14439   }
14440 }
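
// Trn1/Trn2 interleave alternating lanes of two vectors: result lanes
// (2i, 2i + 1) take lane 2i (Trn1) or lane 2i + 1 (Trn2) of each operand.
// A scalar sketch over lane arrays (illustrative only, not called by the
// tests):
static void TrnRefModel(const uint64_t* zn,
                        const uint64_t* zm,
                        int lanes,
                        bool is_trn1,
                        uint64_t* zd) {
  for (int i = 0; i < lanes / 2; i++) {
    int src = (2 * i) + (is_trn1 ? 0 : 1);
    zd[2 * i] = zn[src];
    zd[(2 * i) + 1] = zm[src];
  }
}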
14441 
14442 TEST_SVE(sve_zip_uzp) {
14443   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14444   START();
14445 
14446   __ Dup(z0.VnD(), 0xffeeddccbbaa9988);
14447   __ Insr(z0.VnD(), 0x7766554433221100);
14448   __ Dup(z1.VnD(), 0xaa55aa55aa55aa55);
14449   __ Insr(z1.VnD(), 0x55aa55aa55aa55aa);
14450 
14451   __ Zip1(z2.VnB(), z0.VnB(), z1.VnB());
14452   __ Zip2(z3.VnB(), z0.VnB(), z1.VnB());
14453   __ Zip1(z4.VnH(), z0.VnH(), z1.VnH());
14454   __ Zip2(z5.VnH(), z0.VnH(), z1.VnH());
14455   __ Zip1(z6.VnS(), z0.VnS(), z1.VnS());
14456   __ Zip2(z7.VnS(), z0.VnS(), z1.VnS());
14457   __ Zip1(z8.VnD(), z0.VnD(), z1.VnD());
14458   __ Zip2(z9.VnD(), z0.VnD(), z1.VnD());
14459 
14460   __ Uzp1(z10.VnB(), z2.VnB(), z3.VnB());
14461   __ Uzp2(z11.VnB(), z2.VnB(), z3.VnB());
14462   __ Uzp1(z12.VnH(), z4.VnH(), z5.VnH());
14463   __ Uzp2(z13.VnH(), z4.VnH(), z5.VnH());
14464   __ Uzp1(z14.VnS(), z6.VnS(), z7.VnS());
14465   __ Uzp2(z15.VnS(), z6.VnS(), z7.VnS());
14466   __ Uzp1(z16.VnD(), z8.VnD(), z9.VnD());
14467   __ Uzp2(z17.VnD(), z8.VnD(), z9.VnD());
14468 
14469   END();
14470 
14471   if (CAN_RUN()) {
14472     RUN();
14473     uint64_t expected_z2[] = {0x5577aa665555aa44, 0x5533aa225511aa00};
14474     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14475     uint64_t expected_z3[] = {0xaaff55eeaadd55cc, 0xaabb55aaaa995588};
14476     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14477     uint64_t expected_z4[] = {0x55aa776655aa5544, 0x55aa332255aa1100};
14478     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14479     uint64_t expected_z5[] = {0xaa55ffeeaa55ddcc, 0xaa55bbaaaa559988};
14480     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14481     uint64_t expected_z6[] = {0x55aa55aa77665544, 0x55aa55aa33221100};
14482     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14483     uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0xaa55aa55bbaa9988};
14484     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14485     uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14486     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14487     uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14488     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14489 
14490     // Check uzp is the opposite of zip.
14491     ASSERT_EQUAL_SVE(z0.VnD(), z10.VnD());
14492     ASSERT_EQUAL_SVE(z1.VnD(), z11.VnD());
14493     ASSERT_EQUAL_SVE(z0.VnD(), z12.VnD());
14494     ASSERT_EQUAL_SVE(z1.VnD(), z13.VnD());
14495     ASSERT_EQUAL_SVE(z0.VnD(), z14.VnD());
14496     ASSERT_EQUAL_SVE(z1.VnD(), z15.VnD());
14497     ASSERT_EQUAL_SVE(z0.VnD(), z16.VnD());
14498     ASSERT_EQUAL_SVE(z1.VnD(), z17.VnD());
14499   }
14500 }
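
// Zip1 interleaves the low halves of two vectors (Zip2 the high halves), and
// Uzp1/Uzp2 de-interleave even/odd lanes, so applying Uzp to Zip results
// recovers the original operands; that is what the checks above rely on.
// A scalar sketch of Zip1 over lane arrays (illustrative only, not called by
// the tests):
static void Zip1RefModel(const uint64_t* zn,
                         const uint64_t* zm,
                         int lanes,
                         uint64_t* zd) {
  for (int i = 0; i < lanes / 2; i++) {
    zd[2 * i] = zn[i];        // Even result lanes take the low half of zn.
    zd[(2 * i) + 1] = zm[i];  // Odd result lanes take the low half of zm.
  }
}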
14501 
14502 TEST_SVE(sve_fcadd) {
14503   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14504   START();
14505 
14506   __ Dup(z30.VnS(), 0);
14507 
14508   __ Ptrue(p0.VnB());
14509   __ Pfalse(p1.VnB());
14510   __ Zip1(p2.VnH(), p0.VnH(), p1.VnH());  // Real elements.
14511   __ Zip1(p3.VnH(), p1.VnH(), p0.VnH());  // Imaginary elements.
14512 
14513   __ Fdup(z0.VnH(), 10.0);  // 10i + 10
14514   __ Fdup(z1.VnH(), 5.0);   // 5i + 5
14515   __ Index(z7.VnH(), 1, 1);
14516   __ Scvtf(z7.VnH(), p0.Merging(), z7.VnH());  // Ai + B
14517 
14518   __ Sel(z2.VnH(), p3, z1.VnH(), z30.VnH());  // 5i + 0
14519   __ Sel(z3.VnH(), p2, z1.VnH(), z30.VnH());  // 0i + 5
14520   __ Sel(z7.VnH(), p3, z7.VnH(), z0.VnH());   // Ai + 10
14521   __ Mov(z8, z7);
14522   __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 2);
14523   __ Sel(z8.VnH(), p2, z8.VnH(), z30.VnH());  // 0i + A
14524 
14525   // (10i + 10) + rotate(5i + 0, 90)
14526   //   = (10i + 10) + (0i - 5)
14527   //   = 10i + 5
14528   __ Fcadd(z4.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90);
14529 
14530   // (10i + 5) + rotate(0i + 5, 270)
14531   //   = (10i + 5) + (-5i + 0)
14532   //   = 5i + 5
14533   __ Fcadd(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH(), 270);
14534 
14535   // The same calculation, but selecting real/imaginary using predication.
14536   __ Mov(z5, z0);
14537   __ Fcadd(z5.VnH(), p2.Merging(), z5.VnH(), z1.VnH(), 90);
14538   __ Fcadd(z5.VnH(), p3.Merging(), z5.VnH(), z1.VnH(), 270);
14539 
14540   // Reference calculation: (10i + 10) - (5i + 5)
14541   __ Fsub(z6.VnH(), z0.VnH(), z1.VnH());
14542 
14543   // Calculation using varying imaginary values.
14544   // (Ai + 10) + rotate(5i + 0, 90)
14545   //   = (Ai + 10) + (0i - 5)
14546   //   = Ai + 5
14547   __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z2.VnH(), 90);
14548 
14549   // (Ai + 5) + rotate(0i + A, 270)
14550   //   = (Ai + 5) + (-Ai + 0)
14551   //   = 5
14552   __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z8.VnH(), 270);
14553 
14554   // Repeated, but for wider elements.
14555   __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14556   __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14557   __ Fdup(z0.VnS(), 42.0);
14558   __ Fdup(z1.VnS(), 21.0);
14559   __ Index(z11.VnS(), 1, 1);
14560   __ Scvtf(z11.VnS(), p0.Merging(), z11.VnS());
14561   __ Sel(z2.VnS(), p3, z1.VnS(), z30.VnS());
14562   __ Sel(z29.VnS(), p2, z1.VnS(), z30.VnS());
14563   __ Sel(z11.VnS(), p3, z11.VnS(), z0.VnS());
14564   __ Mov(z12, z11);
14565   __ Ext(z12.VnB(), z12.VnB(), z12.VnB(), 4);
14566   __ Sel(z12.VnS(), p2, z12.VnS(), z30.VnS());
14567   __ Fcadd(z8.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90);
14568   __ Fcadd(z8.VnS(), p0.Merging(), z8.VnS(), z29.VnS(), 270);
14569   __ Mov(z9, z0);
14570   __ Fcadd(z9.VnS(), p2.Merging(), z9.VnS(), z1.VnS(), 90);
14571   __ Fcadd(z9.VnS(), p3.Merging(), z9.VnS(), z1.VnS(), 270);
14572   __ Fsub(z10.VnS(), z0.VnS(), z1.VnS());
14573   __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z2.VnS(), 90);
14574   __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z12.VnS(), 270);
14575 
14576   __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14577   __ Zip1(p3.VnD(), p1.VnD(), p0.VnD());
14578   __ Fdup(z0.VnD(), -42.0);
14579   __ Fdup(z1.VnD(), -21.0);
14580   __ Index(z15.VnD(), 1, 1);
14581   __ Scvtf(z15.VnD(), p0.Merging(), z15.VnD());
14582   __ Sel(z2.VnD(), p3, z1.VnD(), z30.VnD());
14583   __ Sel(z28.VnD(), p2, z1.VnD(), z30.VnD());
14584   __ Sel(z15.VnD(), p3, z15.VnD(), z0.VnD());
14585   __ Mov(z16, z15);
14586   __ Ext(z16.VnB(), z16.VnB(), z16.VnB(), 8);
14587   __ Sel(z16.VnD(), p2, z16.VnD(), z30.VnD());
14588   __ Fcadd(z12.VnD(), p0.Merging(), z0.VnD(), z2.VnD(), 90);
14589   __ Fcadd(z12.VnD(), p0.Merging(), z12.VnD(), z28.VnD(), 270);
14590   __ Mov(z13, z0);
14591   __ Fcadd(z13.VnD(), p2.Merging(), z13.VnD(), z1.VnD(), 90);
14592   __ Fcadd(z13.VnD(), p3.Merging(), z13.VnD(), z1.VnD(), 270);
14593   __ Fsub(z14.VnD(), z0.VnD(), z1.VnD());
14594   __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z2.VnD(), 90);
14595   __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z16.VnD(), 270);
14596   END();
14597 
14598   if (CAN_RUN()) {
14599     RUN();
14600     ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14601     ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14602     ASSERT_EQUAL_SVE(z3.VnH(), z7.VnH());
14603     ASSERT_EQUAL_SVE(z10.VnS(), z8.VnS());
14604     ASSERT_EQUAL_SVE(z10.VnS(), z9.VnS());
14605     ASSERT_EQUAL_SVE(z29.VnS(), z11.VnS());
14606     ASSERT_EQUAL_SVE(z14.VnD(), z12.VnD());
14607     ASSERT_EQUAL_SVE(z14.VnD(), z13.VnD());
14608     ASSERT_EQUAL_SVE(z28.VnS(), z15.VnS());
14609   }
14610 }
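
// FCADD adds the second operand rotated by 90 or 270 degrees in the complex
// plane: with lane pairs (real, imaginary), rotating (a + bi) by 90 gives
// (-b + ai) and by 270 gives (b - ai). A scalar sketch of one lane pair
// (illustrative only, not called by the tests):
static void FcaddRefModel(double* acc_real,
                          double* acc_imag,
                          double op_real,
                          double op_imag,
                          int rot) {
  if (rot == 90) {
    *acc_real -= op_imag;
    *acc_imag += op_real;
  } else {  // rot == 270
    *acc_real += op_imag;
    *acc_imag -= op_real;
  }
}
// For example, (10i + 10) + rotate(5i + 0, 90) gives 10i + 5, as in the
// first Fcadd above.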
14611 
14612 TEST_SVE(sve_fcmla_index) {
14613   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14614   START();
14615 
14616   __ Ptrue(p0.VnB());
14617 
14618   __ Fdup(z0.VnH(), 10.0);
14619   __ Fdup(z2.VnH(), 2.0);
14620   __ Zip1(z0.VnH(), z0.VnH(), z2.VnH());
14621 
14622   // Duplicate complex numbers across z2 segments. First segment has 1i+0,
14623   // second has 3i+2, etc.
14624   __ Index(z1.VnH(), 0, 1);
14625   __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14626   __ Zip1(z2.VnS(), z1.VnS(), z1.VnS());
14627   __ Zip1(z2.VnS(), z2.VnS(), z2.VnS());
14628 
14629   // Derive a vector from z2 where only the third element in each segment
14630   // contains a complex number, with other elements zero.
14631   __ Index(z3.VnS(), 0, 1);
14632   __ And(z3.VnS(), z3.VnS(), 3);
14633   __ Cmpeq(p2.VnS(), p0.Zeroing(), z3.VnS(), 2);
14634   __ Dup(z3.VnB(), 0);
14635   __ Sel(z3.VnS(), p2, z2.VnS(), z3.VnS());
14636 
14637   // Use indexed complex multiply on this vector, indexing the third element.
14638   __ Dup(z4.VnH(), 0);
14639   __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 0);
14640   __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 90);
14641 
14642   // Move the complex number to a different index by rotating the vector,
14643   // then repeat the calculation negated, using the new index.
14644   __ Ext(z3.VnH(), z3.VnH(), z3.VnH(), 4);
14645   __ Dup(z5.VnH(), 0);
14646   __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 180);
14647   __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 270);
14648   __ Fneg(z5.VnH(), p0.Merging(), z5.VnH());
14649 
14650   // Create a reference result from a vector complex multiply.
14651   __ Dup(z6.VnH(), 0);
14652   __ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 0);
14653   __ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 90);
14654 
14655   // Repeated, but for wider elements.
14656   __ Fdup(z0.VnS(), 42.0);
14657   __ Fdup(z2.VnS(), 24.0);
14658   __ Zip1(z0.VnS(), z0.VnS(), z2.VnS());
14659   __ Index(z1.VnS(), -42, 13);
14660   __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14661   __ Zip1(z2.VnD(), z1.VnD(), z1.VnD());
14662   __ Zip1(z2.VnD(), z2.VnD(), z2.VnD());
14663   __ Index(z3.VnD(), 0, 1);
14664   __ And(z3.VnD(), z3.VnD(), 1);
14665   __ Cmpeq(p2.VnD(), p0.Zeroing(), z3.VnD(), 1);
14666   __ Dup(z3.VnB(), 0);
14667   __ Sel(z3.VnD(), p2, z2.VnD(), z3.VnD());
14668   __ Dup(z7.VnS(), 0);
14669   __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 0);
14670   __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 90);
14671   __ Ext(z3.VnB(), z3.VnB(), z3.VnB(), 8);
14672   __ Dup(z8.VnS(), 0);
14673   __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 180);
14674   __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 270);
14675   __ Fneg(z8.VnS(), p0.Merging(), z8.VnS());
14676   __ Dup(z9.VnS(), 0);
14677   __ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 0);
14678   __ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 90);
14679   END();
14680 
14681   if (CAN_RUN()) {
14682     RUN();
14683     ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14684     ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14685     ASSERT_EQUAL_SVE(z9.VnS(), z7.VnS());
14686     ASSERT_EQUAL_SVE(z9.VnS(), z8.VnS());
14687   }
14688 }
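
// FCMLA accumulates one half of a complex multiply per rotation: issuing it
// with rotations 0 and 90 accumulates the full product zn * zm, while 180
// and 270 accumulate its negation. A scalar sketch of one lane pair
// (illustrative only, not called by the tests):
static void FcmlaRefModel(double* acc_real,
                          double* acc_imag,
                          double n_real,
                          double n_imag,
                          double m_real,
                          double m_imag,
                          int rot) {
  switch (rot) {
    case 0:
      *acc_real += n_real * m_real;
      *acc_imag += n_real * m_imag;
      break;
    case 90:
      *acc_real -= n_imag * m_imag;
      *acc_imag += n_imag * m_real;
      break;
    case 180:
      *acc_real -= n_real * m_real;
      *acc_imag -= n_real * m_imag;
      break;
    case 270:
      *acc_real += n_imag * m_imag;
      *acc_imag -= n_imag * m_real;
      break;
  }
}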
14689 
14690 TEST_SVE(sve_fcmla) {
14691   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14692   START();
14693 
14694   __ Ptrue(p0.VnB());
14695   __ Pfalse(p1.VnB());
14696   __ Zip1(p2.VnH(), p0.VnH(), p1.VnH());  // Real elements.
14697   __ Zip1(p3.VnH(), p1.VnH(), p0.VnH());  // Imaginary elements.
14698 
14699   __ Fdup(z0.VnH(), 10.0);
14700   __ Fdup(z2.VnH(), 2.0);
14701 
14702   // Create pairs of complex numbers, Ai + A. A is chosen to be non-zero:
14703   // Fneg maps +0.0 to -0.0, which would fail the later bitwise comparison.
14704   __ Index(z1.VnH(), -4, 3);
14705   __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14706   __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14707   __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14708 
14709   __ Sel(z3.VnH(), p2, z0.VnH(), z1.VnH());  // Ai + 10
14710   __ Sel(z4.VnH(), p2, z1.VnH(), z2.VnH());  // 2i + A
14711 
14712   __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());  // Even complex numbers.
14713   __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());  // Odd complex numbers.
14714 
14715   // Calculate (Ai + 10) * (2i + A) = (20 + A^2)i + 8A, using predication to
14716   // select only the complex numbers in odd-numbered element pairs. This leaves
14717   // results in elements 2/3, 6/7, etc. with zero in elements 0/1, 4/5, etc.
14718   //   ...      7      6   5   4      3      2   1   0     <-- element
14719   //   ... | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | 0 | 0 |   <-- value
14720   __ Dup(z5.VnH(), 0);
14721   __ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 0);
14722   __ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 90);
14723 
14724   // Move the odd results to the even result positions.
14725   //   ...   7   6      5      4   3   2      1      0     <-- element
14726   //   ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A |   <-- value
14727   __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 4);
14728 
14729   // Calculate -(Ai + 10) * (2i + A) = -(20 + A^2)i - 8A for the even complex
14730   // numbers.
14731   //   ...   7   6       5       4   3   2       1       0     <-- element
14732   //   ... | 0 | 0 | -20-A^2 | -8A | 0 | 0 | -20-A^2 | -8A |   <-- value
14733   __ Dup(z6.VnH(), 0);
14734   __ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 180);
14735   __ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 270);
14736 
14737   // Negate the even results. The results in z6 should now match the results
14738   // computed earlier in z5.
14739   //   ...   7   6      5      4   3   2      1      0     <-- element
14740   //   ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A |   <-- value
14741   __ Fneg(z6.VnH(), p2.Merging(), z6.VnH());
14742 
14743 
14744   // Similarly, but for wider elements.
14745   __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14746   __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14747   __ Index(z1.VnS(), -4, 3);
14748   __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14749   __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14750   __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14751   __ Fdup(z0.VnS(), 20.0);
14752   __ Fdup(z2.VnS(), 21.0);
14753   __ Sel(z3.VnS(), p2, z0.VnS(), z1.VnS());
14754   __ Sel(z4.VnS(), p2, z1.VnS(), z2.VnS());
14755   __ Punpklo(p2.VnH(), p2.VnB());
14756   __ Punpklo(p3.VnH(), p3.VnB());
14757   __ Dup(z7.VnS(), 0);
14758   __ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 0);
14759   __ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 90);
14760   __ Ext(z7.VnB(), z7.VnB(), z7.VnB(), 8);
14761   __ Dup(z8.VnS(), 0);
14762   __ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 180);
14763   __ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 270);
14764   __ Fneg(z8.VnS(), p2.Merging(), z8.VnS());
14765 
14766   // Double-precision, computed for even-numbered lanes only.
14767   __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14768   __ Index(z1.VnD(), -4, 3);
14769   __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14770   __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14771   __ Scvtf(z1.VnD(), p0.Merging(), z1.VnD());
14772   __ Fdup(z0.VnD(), 20.0);
14773   __ Fdup(z2.VnD(), 21.0);
14774   __ Sel(z3.VnD(), p2, z0.VnD(), z1.VnD());
14775   __ Sel(z4.VnD(), p2, z1.VnD(), z2.VnD());
14776   __ Punpklo(p2.VnH(), p2.VnB());
14777   __ Dup(z9.VnD(), 0);
14778   __ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 0);
14779   __ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 90);
14780   __ Dup(z10.VnD(), 0);
14781   __ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 180);
14782   __ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 270);
14783   __ Fneg(z10.VnD(), p2.Merging(), z10.VnD());
14784   END();
14785 
14786   if (CAN_RUN()) {
14787     RUN();
14788     ASSERT_EQUAL_SVE(z5.VnH(), z6.VnH());
14789     ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
14790     ASSERT_EQUAL_SVE(z9.VnD(), z10.VnD());
14791   }
14792 }
14793 
14794 // Create a pattern in dst where the value of each element in src is incremented
14795 // by the segment number. This allows varying a short input by a predictable
14796 // pattern for each segment.
14797 static void FPSegmentPatternHelper(MacroAssembler* masm,
14798                                    const ZRegister& dst,
14799                                    const PRegisterM& ptrue,
14800                                    const ZRegister& src) {
14801   VIXL_ASSERT(AreSameLaneSize(dst, src));
14802   UseScratchRegisterScope temps(masm);
14803   ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
14804   masm->Index(ztmp, 0, 1);
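  // Shifting each lane index right by log2(lanes per 128-bit segment) turns it
  // into the number of the segment that contains the lane.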
14805   masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
14806   masm->Scvtf(ztmp, ptrue, ztmp);
14807   masm->Fadd(dst, src, ztmp);
14808 }
14809 
14810 TEST_SVE(sve_fpmul_index) {
14811   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14812   START();
14813 
14814   uint64_t in0[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
14815   uint64_t in1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
14816 
14817   __ Ptrue(p0.VnB());
14818   // Repeat indexed vector across up to 2048-bit VL.
14819   for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i++) {
14820     InsrHelper(&masm, z25.VnD(), in0);
14821   }
14822   InsrHelper(&masm, z1.VnD(), in1);
14823 
14824   FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z25.VnH());
14825   __ Fmul(z2.VnH(), z1.VnH(), z0.VnH(), 0);
14826   __ Fmul(z3.VnH(), z1.VnH(), z0.VnH(), 1);
14827   __ Fmul(z4.VnH(), z1.VnH(), z0.VnH(), 4);
14828   __ Fmul(z5.VnH(), z1.VnH(), z0.VnH(), 7);
14829 
14830   __ Fmul(z6.VnS(), z1.VnS(), z0.VnS(), 0);
14831   __ Fmul(z7.VnS(), z1.VnS(), z0.VnS(), 1);
14832   __ Fmul(z8.VnS(), z1.VnS(), z0.VnS(), 2);
14833   __ Fmul(z9.VnS(), z1.VnS(), z0.VnS(), 3);
14834 
14835   __ Fmul(z10.VnD(), z1.VnD(), z0.VnD(), 0);
14836   __ Fmul(z11.VnD(), z1.VnD(), z0.VnD(), 1);
14837 
14838   // Compute the results using other instructions.
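  // Note that indexed Fmul reads its indexed element from within each 128-bit
  // segment, while indexed Dup replicates a single element across the whole
  // vector, so the segment pattern is re-applied to the duplicated operand to
  // make both computations match segment by segment.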
14839   __ Dup(z12.VnH(), z25.VnH(), 0);
14840   FPSegmentPatternHelper(&masm, z12.VnH(), p0.Merging(), z12.VnH());
14841   __ Fmul(z12.VnH(), z1.VnH(), z12.VnH());
14842   __ Dup(z13.VnH(), z25.VnH(), 1);
14843   FPSegmentPatternHelper(&masm, z13.VnH(), p0.Merging(), z13.VnH());
14844   __ Fmul(z13.VnH(), z1.VnH(), z13.VnH());
14845   __ Dup(z14.VnH(), z25.VnH(), 4);
14846   FPSegmentPatternHelper(&masm, z14.VnH(), p0.Merging(), z14.VnH());
14847   __ Fmul(z14.VnH(), z1.VnH(), z14.VnH());
14848   __ Dup(z15.VnH(), z25.VnH(), 7);
14849   FPSegmentPatternHelper(&masm, z15.VnH(), p0.Merging(), z15.VnH());
14850   __ Fmul(z15.VnH(), z1.VnH(), z15.VnH());
14851 
14852   __ Dup(z16.VnS(), z25.VnS(), 0);
14853   FPSegmentPatternHelper(&masm, z16.VnH(), p0.Merging(), z16.VnH());
14854   __ Fmul(z16.VnS(), z1.VnS(), z16.VnS());
14855   __ Dup(z17.VnS(), z25.VnS(), 1);
14856   FPSegmentPatternHelper(&masm, z17.VnH(), p0.Merging(), z17.VnH());
14857   __ Fmul(z17.VnS(), z1.VnS(), z17.VnS());
14858   __ Dup(z18.VnS(), z25.VnS(), 2);
14859   FPSegmentPatternHelper(&masm, z18.VnH(), p0.Merging(), z18.VnH());
14860   __ Fmul(z18.VnS(), z1.VnS(), z18.VnS());
14861   __ Dup(z19.VnS(), z25.VnS(), 3);
14862   FPSegmentPatternHelper(&masm, z19.VnH(), p0.Merging(), z19.VnH());
14863   __ Fmul(z19.VnS(), z1.VnS(), z19.VnS());
14864 
14865   __ Dup(z20.VnD(), z25.VnD(), 0);
14866   FPSegmentPatternHelper(&masm, z20.VnH(), p0.Merging(), z20.VnH());
14867   __ Fmul(z20.VnD(), z1.VnD(), z20.VnD());
14868   __ Dup(z21.VnD(), z25.VnD(), 1);
14869   FPSegmentPatternHelper(&masm, z21.VnH(), p0.Merging(), z21.VnH());
14870   __ Fmul(z21.VnD(), z1.VnD(), z21.VnD());
14871 
14872   END();
14873 
14874   if (CAN_RUN()) {
14875     RUN();
14876     ASSERT_EQUAL_SVE(z12.VnH(), z2.VnH());
14877     ASSERT_EQUAL_SVE(z13.VnH(), z3.VnH());
14878     ASSERT_EQUAL_SVE(z14.VnH(), z4.VnH());
14879     ASSERT_EQUAL_SVE(z15.VnH(), z5.VnH());
14880     ASSERT_EQUAL_SVE(z16.VnS(), z6.VnS());
14881     ASSERT_EQUAL_SVE(z17.VnS(), z7.VnS());
14882     ASSERT_EQUAL_SVE(z18.VnS(), z8.VnS());
14883     ASSERT_EQUAL_SVE(z19.VnS(), z9.VnS());
14884     ASSERT_EQUAL_SVE(z20.VnD(), z10.VnD());
14885     ASSERT_EQUAL_SVE(z21.VnD(), z11.VnD());
14886   }
14887 }
14888 
14889 TEST_SVE(sve_ftmad) {
14890   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14891   START();
14892 
14893   uint64_t in_h0[] = {0x7c027e01fc02fe01,
14894                       0x3c003c00bc00bc00,
14895                       0x3c003c00bc00bc00};
14896   uint64_t in_h1[] = {0xfe01fc027e017e01,
14897                       0x3c00bc003c00bc00,
14898                       0x3c00bc003c00bc00};
14899   uint64_t in_s0[] = {0x7f800002ffc00001,
14900                       0x3f8000003f800000,
14901                       0xbf800000bf800000};
14902   uint64_t in_s1[] = {0xffc00001ffc00001,
14903                       0x3f800000bf800000,
14904                       0x3f800000bf800000};
14905   uint64_t in_d0[] = {0x7ff8000000000001,
14906                       0x3ff0000000000000,
14907                       0xbff0000000000000};
14908   uint64_t in_d1[] = {0xfff0000000000002,
14909                       0xbff0000000000000,
14910                       0x3ff0000000000000};
14911   InsrHelper(&masm, z0.VnD(), in_h0);
14912   InsrHelper(&masm, z1.VnD(), in_h1);
14913   InsrHelper(&masm, z2.VnD(), in_s0);
14914   InsrHelper(&masm, z3.VnD(), in_s1);
14915   InsrHelper(&masm, z4.VnD(), in_d0);
14916   InsrHelper(&masm, z5.VnD(), in_d1);
14917 
14918   __ Mov(z6, z0);
14919   __ Ftmad(z6.VnH(), z6.VnH(), z1.VnH(), 0);
14920   __ Mov(z7, z0);
14921   __ Ftmad(z7.VnH(), z7.VnH(), z1.VnH(), 1);
14922   __ Mov(z8, z0);
14923   __ Ftmad(z8.VnH(), z8.VnH(), z1.VnH(), 2);
14924 
14925   __ Mov(z9, z2);
14926   __ Ftmad(z9.VnS(), z9.VnS(), z3.VnS(), 0);
14927   __ Mov(z10, z2);
14928   __ Ftmad(z10.VnS(), z10.VnS(), z3.VnS(), 3);
14929   __ Mov(z11, z2);
14930   __ Ftmad(z11.VnS(), z11.VnS(), z3.VnS(), 4);
14931 
14932   __ Mov(z12, z4);
14933   __ Ftmad(z12.VnD(), z12.VnD(), z5.VnD(), 0);
14934   __ Mov(z13, z4);
14935   __ Ftmad(z13.VnD(), z13.VnD(), z5.VnD(), 5);
14936   __ Mov(z14, z4);
14937   __ Ftmad(z14.VnD(), z14.VnD(), z5.VnD(), 7);
14938 
14939   END();
14940 
14941   if (CAN_RUN()) {
14942     RUN();
14943     uint64_t expected_z6[] = {0x7e027e02fe02fe01,
14944                               0x4000400000000000,
14945                               0x4000400000000000};
14946     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14947     uint64_t expected_z7[] = {0x7e027e02fe02fe01,
14948                               0x3aab3800bcabbe00,
14949                               0x3aab3800bcabbe00};
14950     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14951     uint64_t expected_z8[] = {0x7e027e02fe02fe01,
14952                               0x3c083c2abbefbbac,
14953                               0x3c083c2abbefbbac};
14954     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14955     uint64_t expected_z9[] = {0x7fc00002ffc00001,
14956                               0x4000000040000000,
14957                               0x0000000000000000};
14958     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14959     uint64_t expected_z10[] = {0x7fc00002ffc00001,
14960                                0x3f7ff2ff3f7fa4fc,
14961                                0xbf800680bf802d82};
14962     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
14963     uint64_t expected_z11[] = {0x7fc00002ffc00001,
14964                                0x3f8000173f8000cd,
14965                                0xbf7fffd2bf7ffe66};
14966     ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
14967     uint64_t expected_z12[] = {0x7ff8000000000002,
14968                                0x4000000000000000,
14969                                0x0000000000000000};
14970     ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
14971     uint64_t expected_z13[] = {0x7ff8000000000002,
14972                                0x3fefffff6c0d846c,
14973                                0xbff0000006b978ae};
14974     ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
14975     uint64_t expected_z14[] = {0x7ff8000000000002,
14976                                0x3feffffffffe708a,
14977                                0xbff0000000000000};
14978     ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
14979   }
14980 }
14981 
14982 static void BasicFPArithHelper(MacroAssembler* masm,
14983                                int lane_size_in_bits,
14984                                const uint64_t (&inputs)[2],
14985                                const uint64_t (&inputs_fmulx)[2],
14986                                const uint64_t (&inputs_nans)[2]) {
14987   int ls = lane_size_in_bits;
14988 
14989   for (int i = 0; i < 16; i++) {
14990     InsrHelper(masm, z0.VnD(), inputs);
14991   }
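  // Reverse the input to provide a second operand whose lanes mostly differ
  // from the corresponding lanes of z0.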
14992   ZRegister rvrs = z1.WithLaneSize(ls);
14993   masm->Rev(rvrs, z0.WithLaneSize(ls));
14994 
14995   int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
14996   Initialise(masm, p2.VnB(), pred);
14997   PRegisterM p2m = p2.Merging();
14998 
14999   masm->Mov(z2, z0);
15000   masm->Fadd(z2.WithLaneSize(ls),
15001              p2m,
15002              z2.WithLaneSize(ls),
15003              rvrs,
15004              FastNaNPropagation);
15005   masm->Mov(z3, z0);
15006   masm->Fsub(z3.WithLaneSize(ls), p2m, z3.WithLaneSize(ls), rvrs);
15007   masm->Mov(z4, z0);
15008   masm->Fsub(z4.WithLaneSize(ls), p2m, rvrs, z4.WithLaneSize(ls));
15009   masm->Mov(z5, z0);
15010   masm->Fabd(z5.WithLaneSize(ls),
15011              p2m,
15012              z5.WithLaneSize(ls),
15013              rvrs,
15014              FastNaNPropagation);
15015   masm->Mov(z6, z0);
15016   masm->Fmul(z6.WithLaneSize(ls),
15017              p2m,
15018              z6.WithLaneSize(ls),
15019              rvrs,
15020              FastNaNPropagation);
15021 
15022   for (int i = 0; i < 16; i++) {
15023     InsrHelper(masm, z7.VnD(), inputs_fmulx);
15024   }
15025   masm->Rev(z8.WithLaneSize(ls), z7.WithLaneSize(ls));
15026   masm->Fmulx(z7.WithLaneSize(ls),
15027               p2m,
15028               z7.WithLaneSize(ls),
15029               z8.WithLaneSize(ls),
15030               FastNaNPropagation);
15031 
15032   InsrHelper(masm, z8.VnD(), inputs_nans);
15033   masm->Mov(z9, z8);
15034   masm->Fminnm(z9.WithLaneSize(ls),
15035                p2m,
15036                z9.WithLaneSize(ls),
15037                rvrs,
15038                FastNaNPropagation);
15039   masm->Mov(z10, z8);
15040   masm->Fmaxnm(z10.WithLaneSize(ls),
15041                p2m,
15042                z10.WithLaneSize(ls),
15043                rvrs,
15044                FastNaNPropagation);
15045 }
15046 
15047 TEST_SVE(sve_fp_arith_pred_h) {
15048   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15049   START();
15050 
15051   uint64_t inputs[] = {0x4800470046004500, 0x4400420040003c00};
15052   uint64_t inputs_fmulx[] = {0x7c00fc007c00fc00, 0x0000800000008000};
15053   uint64_t inputs_nans[] = {0x7fffffff7fffffff, 0x7bfffbff7fbbfbff};
15054 
15055   BasicFPArithHelper(&masm, kHRegSize, inputs, inputs_fmulx, inputs_nans);
15056 
15057   END();
15058 
15059   if (CAN_RUN()) {
15060     RUN();
15061     uint64_t expected_z2[] = {0x4880488048804880, 0x4880420048804880};
15062     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15063     uint64_t expected_z3[] = {0x4700450042003c00, 0xbc004200c500c700};
15064     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15065     uint64_t expected_z4[] = {0xc700c500c200bc00, 0x3c00420045004700};
15066     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15067     uint64_t expected_z5[] = {0x4700450042003c00, 0x3c00420045004700};
15068     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15069     uint64_t expected_z6[] = {0x48004b004c804d00, 0x4d0042004b004800};
15070     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15071     uint64_t expected_z7[] = {0xc000c000c000c000, 0xc0008000c000c000};
15072     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15073     uint64_t expected_z9[] = {0x3c00400042004400, 0x4500fbff4700fbff};
15074     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15075     uint64_t expected_z10[] = {0x3c00400042004400, 0x7bfffbff47004800};
15076     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15077   }
15078 }
15079 
15080 TEST_SVE(sve_fp_arith_pred_s) {
15081   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15082   START();
15083 
15084   uint64_t inputs[] = {0x4080000040400000, 0x400000003f800000};
15085   uint64_t inputs_fmulx[] = {0x7f800000ff800000, 0x0000000080000000};
15086   uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x41000000c1000000};
15087 
15088   BasicFPArithHelper(&masm, kSRegSize, inputs, inputs_fmulx, inputs_nans);
15089 
15090   END();
15091 
15092   if (CAN_RUN()) {
15093     RUN();
15094     uint64_t expected_z2[] = {0x40a0000040a00000, 0x4000000040a00000};
15095     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15096     uint64_t expected_z3[] = {0x404000003f800000, 0x40000000c0400000};
15097     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15098     uint64_t expected_z4[] = {0xc0400000bf800000, 0x4000000040400000};
15099     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15100     uint64_t expected_z5[] = {0x404000003f800000, 0x4000000040400000};
15101     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15102     uint64_t expected_z6[] = {0x4080000040c00000, 0x4000000040800000};
15103     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15104     uint64_t expected_z7[] = {0xc0000000c0000000, 0x00000000c0000000};
15105     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15106     uint64_t expected_z9[] = {0x3f80000040000000, 0x41000000c1000000};
15107     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15108     uint64_t expected_z10[] = {0x3f80000040000000, 0x4100000040800000};
15109     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15110   }
15111 }
15112 
15113 TEST_SVE(sve_fp_arith_pred_d) {
15114   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15115   START();
15116 
15117   uint64_t inputs[] = {0x4000000000000000, 0x3ff0000000000000};
15118   uint64_t inputs_fmulx[] = {0x7ff0000000000000, 0x8000000000000000};
15119   uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x4100000000000000};
15120 
15121   BasicFPArithHelper(&masm, kDRegSize, inputs, inputs_fmulx, inputs_nans);
15122 
15123   END();
15124 
15125   if (CAN_RUN()) {
15126     RUN();
15127     uint64_t expected_z2[] = {0x4008000000000000, 0x4008000000000000};
15128     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15129     uint64_t expected_z3[] = {0x3ff0000000000000, 0xbff0000000000000};
15130     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15131     uint64_t expected_z4[] = {0xbff0000000000000, 0x3ff0000000000000};
15132     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15133     uint64_t expected_z5[] = {0x3ff0000000000000, 0x3ff0000000000000};
15134     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15135     uint64_t expected_z6[] = {0x4000000000000000, 0x4000000000000000};
15136     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15137     uint64_t expected_z7[] = {0xc000000000000000, 0xc000000000000000};
15138     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15139     uint64_t expected_z9[] = {0x3ff0000000000000, 0x4000000000000000};
15140     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15141     uint64_t expected_z10[] = {0x3ff0000000000000, 0x4100000000000000};
15142     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15143   }
15144 }
15145 
15146 TEST_SVE(sve_fp_arith_pred_imm) {
15147   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15148   START();
15149 
15150   int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
15151   Initialise(&masm, p0.VnB(), pred);
15152   PRegisterM p0m = p0.Merging();
15153   __ Ptrue(p1.VnB());
15154 
15155   __ Fdup(z0.VnD(), 0.0);
15156 
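  // 0.0 / 0.0 produces the default quiet NaN, which is used below to check the
  // NaN handling of Fminnm and Fmaxnm.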
15157   __ Mov(z1, z0);
15158   __ Fdiv(z1.VnH(), p1.Merging(), z1.VnH(), z1.VnH());
15159   __ Mov(z2, z0);
15160   __ Fadd(z2.VnH(), p0m, z2.VnH(), 0.5);
15161   __ Mov(z3, z2);
15162   __ Fsub(z3.VnH(), p0m, z3.VnH(), 1.0);
15163   __ Mov(z4, z3);
15164   __ Fsub(z4.VnH(), p0m, 1.0, z4.VnH());
15165   __ Mov(z5, z4);
15166   __ Fmul(z5.VnH(), p0m, z5.VnH(), 2.0);
15167   __ Mov(z6, z1);
15168   __ Fminnm(z6.VnH(), p0m, z6.VnH(), 0.0);
15169   __ Mov(z7, z1);
15170   __ Fmaxnm(z7.VnH(), p0m, z7.VnH(), 1.0);
15171   __ Mov(z8, z5);
15172   __ Fmin(z8.VnH(), p0m, z8.VnH(), 1.0);
15173   __ Mov(z9, z5);
15174   __ Fmax(z9.VnH(), p0m, z9.VnH(), 0.0);
15175 
15176   __ Mov(z11, z0);
15177   __ Fdiv(z11.VnS(), p1.Merging(), z11.VnS(), z11.VnS());
15178   __ Mov(z12, z0);
15179   __ Fadd(z12.VnS(), p0m, z12.VnS(), 0.5);
15180   __ Mov(z13, z12);
15181   __ Fsub(z13.VnS(), p0m, z13.VnS(), 1.0);
15182   __ Mov(z14, z13);
15183   __ Fsub(z14.VnS(), p0m, 1.0, z14.VnS());
15184   __ Mov(z15, z14);
15185   __ Fmul(z15.VnS(), p0m, z15.VnS(), 2.0);
15186   __ Mov(z16, z11);
15187   __ Fminnm(z16.VnS(), p0m, z16.VnS(), 0.0);
15188   __ Mov(z17, z11);
15189   __ Fmaxnm(z17.VnS(), p0m, z17.VnS(), 1.0);
15190   __ Mov(z18, z15);
15191   __ Fmin(z18.VnS(), p0m, z18.VnS(), 1.0);
15192   __ Mov(z19, z15);
15193   __ Fmax(z19.VnS(), p0m, z19.VnS(), 0.0);
15194 
15195   __ Mov(z21, z0);
15196   __ Fdiv(z21.VnD(), p1.Merging(), z21.VnD(), z21.VnD());
15197   __ Mov(z22, z0);
15198   __ Fadd(z22.VnD(), p0m, z22.VnD(), 0.5);
15199   __ Mov(z23, z22);
15200   __ Fsub(z23.VnD(), p0m, z23.VnD(), 1.0);
15201   __ Mov(z24, z23);
15202   __ Fsub(z24.VnD(), p0m, 1.0, z24.VnD());
15203   __ Mov(z25, z24);
15204   __ Fmul(z25.VnD(), p0m, z25.VnD(), 2.0);
15205   __ Mov(z26, z21);
15206   __ Fminnm(z26.VnD(), p0m, z26.VnD(), 0.0);
15207   __ Mov(z27, z21);
15208   __ Fmaxnm(z27.VnD(), p0m, z27.VnD(), 1.0);
15209   __ Mov(z28, z25);
15210   __ Fmin(z28.VnD(), p0m, z28.VnD(), 1.0);
15211   __ Mov(z29, z25);
15212   __ Fmax(z29.VnD(), p0m, z29.VnD(), 0.0);
15213 
15214   __ Index(z0.VnH(), -3, 1);
15215   __ Scvtf(z0.VnH(), p1.Merging(), z0.VnH());
15216   __ Fmax(z0.VnH(), p1.Merging(), z0.VnH(), 0.0);
15217   __ Index(z1.VnS(), -4, 2);
15218   __ Scvtf(z1.VnS(), p1.Merging(), z1.VnS());
15219   __ Fadd(z1.VnS(), p1.Merging(), z1.VnS(), 1.0);
15220 
15221   END();
15222 
15223   if (CAN_RUN()) {
15224     RUN();
15225     uint64_t expected_z2[] = {0x3800380038003800, 0x3800000038003800};
15226     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15227     uint64_t expected_z3[] = {0xb800b800b800b800, 0xb8000000b800b800};
15228     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15229     uint64_t expected_z4[] = {0x3e003e003e003e00, 0x3e0000003e003e00};
15230     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15231     uint64_t expected_z5[] = {0x4200420042004200, 0x4200000042004200};
15232     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15233     uint64_t expected_z6[] = {0x0000000000000000, 0x00007e0000000000};
15234     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15235     uint64_t expected_z7[] = {0x3c003c003c003c00, 0x3c007e003c003c00};
15236     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15237     uint64_t expected_z8[] = {0x3c003c003c003c00, 0x3c0000003c003c00};
15238     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15239     uint64_t expected_z9[] = {0x4200420042004200, 0x4200000042004200};
15240     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15241 
15242     uint64_t expected_z12[] = {0x3f0000003f000000, 0x000000003f000000};
15243     ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
15244     uint64_t expected_z13[] = {0xbf000000bf000000, 0x00000000bf000000};
15245     ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
15246     uint64_t expected_z14[] = {0x3fc000003fc00000, 0x000000003fc00000};
15247     ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
15248     uint64_t expected_z15[] = {0x4040000040400000, 0x0000000040400000};
15249     ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
15250     uint64_t expected_z16[] = {0x0000000000000000, 0x7fc0000000000000};
15251     ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
15252     uint64_t expected_z17[] = {0x3f8000003f800000, 0x7fc000003f800000};
15253     ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
15254     uint64_t expected_z18[] = {0x3f8000003f800000, 0x000000003f800000};
15255     ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
15256     uint64_t expected_z19[] = {0x4040000040400000, 0x0000000040400000};
15257     ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
15258 
15259     uint64_t expected_z22[] = {0x3fe0000000000000, 0x3fe0000000000000};
15260     ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
15261     uint64_t expected_z23[] = {0xbfe0000000000000, 0xbfe0000000000000};
15262     ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
15263     uint64_t expected_z24[] = {0x3ff8000000000000, 0x3ff8000000000000};
15264     ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
15265     uint64_t expected_z25[] = {0x4008000000000000, 0x4008000000000000};
15266     ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
15267     uint64_t expected_z26[] = {0x0000000000000000, 0x0000000000000000};
15268     ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
15269     uint64_t expected_z27[] = {0x3ff0000000000000, 0x3ff0000000000000};
15270     ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
15271     uint64_t expected_z28[] = {0x3ff0000000000000, 0x3ff0000000000000};
15272     ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
15273     uint64_t expected_z29[] = {0x4008000000000000, 0x4008000000000000};
15274     ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
15275     uint64_t expected_z0[] = {0x4400420040003c00, 0x0000000000000000};
15276     ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
15277     uint64_t expected_z1[] = {0x404000003f800000, 0xbf800000c0400000};
15278     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
15279   }
15280 }
15281 
15282 TEST_SVE(sve_fscale) {
15283   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15284   START();
15285 
15286   uint64_t inputs_h[] = {0x4800470046004500, 0x4400420040003c00};
15287   InsrHelper(&masm, z0.VnD(), inputs_h);
15288   uint64_t inputs_s[] = {0x4080000040400000, 0x400000003f800000};
15289   InsrHelper(&masm, z1.VnD(), inputs_s);
15290   uint64_t inputs_d[] = {0x40f0000000000000, 0x4000000000000000};
15291   InsrHelper(&masm, z2.VnD(), inputs_d);
15292 
15293   uint64_t scales[] = {0x00080002fff8fffe, 0x00100001fff0ffff};
15294   InsrHelper(&masm, z3.VnD(), scales);
15295 
15296   __ Ptrue(p0.VnB());
15297   int pred[] = {0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1};
15298   Initialise(&masm, p1.VnB(), pred);
15299 
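  // For each active lane, Fscale computes zd = zd * 2^zm, treating each
  // element of zm as a signed integer exponent.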
15300   __ Mov(z4, z0);
15301   __ Fscale(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH());
15302   __ Mov(z5, z0);
15303   __ Fscale(z5.VnH(), p1.Merging(), z5.VnH(), z3.VnH());
15304 
15305   __ Sunpklo(z3.VnS(), z3.VnH());
15306   __ Mov(z6, z1);
15307   __ Fscale(z6.VnS(), p0.Merging(), z6.VnS(), z3.VnS());
15308   __ Mov(z7, z1);
15309   __ Fscale(z7.VnS(), p1.Merging(), z7.VnS(), z3.VnS());
15310 
15311   __ Sunpklo(z3.VnD(), z3.VnS());
15312   __ Mov(z8, z2);
15313   __ Fscale(z8.VnD(), p0.Merging(), z8.VnD(), z3.VnD());
15314   __ Mov(z9, z2);
15315   __ Fscale(z9.VnD(), p1.Merging(), z9.VnD(), z3.VnD());
15316 
15317   // Test full double precision range scaling.
15318   __ Dup(z10.VnD(), 2045);
15319   __ Dup(z11.VnD(), 0x0010000000000000);  // 2^-1022
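  // Scaling the smallest normal double by 2^2045 gives 2^(-1022 + 2045) =
  // 2^1023 (0x7fe0000000000000), exercising the full exponent range.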
15320   __ Fscale(z11.VnD(), p0.Merging(), z11.VnD(), z10.VnD());
15321 
15322   END();
15323 
15324   if (CAN_RUN()) {
15325     RUN();
15326 
15327     uint64_t expected_z4[] = {0x68004f0026003d00, 0x7c00460002003800};
15328     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15329     uint64_t expected_z5[] = {0x68004f0026004500, 0x7c00420002003800};
15330     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15331 
15332     uint64_t expected_z6[] = {0x4880000040c00000, 0x380000003f000000};
15333     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15334     uint64_t expected_z7[] = {0x4880000040400000, 0x400000003f000000};
15335     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15336 
15337     uint64_t expected_z8[] = {0x3ff0000000000000, 0x3ff0000000000000};
15338     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15339     uint64_t expected_z9[] = {0x40f0000000000000, 0x3ff0000000000000};
15340     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15341 
15342     uint64_t expected_z11[] = {0x7fe0000000000000, 0x7fe0000000000000};
15343     ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
15344   }
15345 }
15346 
15347 typedef void (MacroAssembler::*FcvtFrintMFn)(const ZRegister& zd,
15348                                              const PRegisterM& pg,
15349                                              const ZRegister& zn);
15350 
15351 typedef void (MacroAssembler::*FcvtFrintZFn)(const ZRegister& zd,
15352                                              const PRegisterZ& pg,
15353                                              const ZRegister& zn);
15354 
15355 template <typename F, size_t N>
15356 static void TestFcvtFrintHelper(Test* config,
15357                                 FcvtFrintMFn macro_m,
15358                                 FcvtFrintZFn macro_z,
15359                                 int dst_type_size_in_bits,
15360                                 int src_type_size_in_bits,
15361                                 const F (&zn_inputs)[N],
15362                                 const int (&pg_inputs)[N],
15363                                 const uint64_t (&zd_expected_all_active)[N]) {
15364   VIXL_ASSERT(macro_m != NULL);
15365   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15366   START();
15367 
15368   // If the input and result types have a different size, the instruction
15369   // operates on elements of the largest specified type.
15371   int lane_size_in_bits =
15372       std::max(dst_type_size_in_bits, src_type_size_in_bits);
15373 
15374   ZRegister zd_all_active = z25;
15375   ZRegister zd_merging = z26;
15376   ZRegister zn = z27;
15377 
15378   uint64_t zn_rawbits[N];
15379   FPToRawbitsWithSize(zn_inputs, zn_rawbits, src_type_size_in_bits);
15380   InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_rawbits);
15381 
15382   PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15383   __ Ptrue(pg_all_active);
15384 
15385   // Test floating-point conversions with all lanes active.
15386   (masm.*macro_m)(zd_all_active.WithLaneSize(dst_type_size_in_bits),
15387                   pg_all_active.Merging(),
15388                   zn.WithLaneSize(src_type_size_in_bits));
15389 
15390   PRegisterWithLaneSize pg_merging = p1.WithLaneSize(lane_size_in_bits);
15391   Initialise(&masm, pg_merging, pg_inputs);
15392 
15393   __ Dup(zd_merging.VnD(), 0x0bad0bad0bad0bad);
15394 
15395   // Use the same `zn` inputs to test floating-point conversions, but with
15396   // some lanes set inactive.
15397   (masm.*macro_m)(zd_merging.WithLaneSize(dst_type_size_in_bits),
15398                   pg_merging.Merging(),
15399                   zn.WithLaneSize(src_type_size_in_bits));
15400 
15401   ZRegister zd_zeroing = z24;
15402   PRegisterWithLaneSize pg_zeroing = p1.WithLaneSize(lane_size_in_bits);
15403   Initialise(&masm, pg_zeroing, pg_inputs);
15404 
15405   if (macro_z != NULL) {
15406     __ Dup(zd_zeroing.VnD(), 0x0bad0bad0bad0bad);
15407     (masm.*macro_z)(zd_zeroing.WithLaneSize(dst_type_size_in_bits),
15408                     pg_zeroing.Zeroing(),
15409                     zn.WithLaneSize(src_type_size_in_bits));
15410   }
15411 
15412   END();
15413 
15414   if (CAN_RUN()) {
15415     RUN();
15416 
15417     ASSERT_EQUAL_SVE(zd_expected_all_active,
15418                      zd_all_active.WithLaneSize(lane_size_in_bits));
15419 
15420     uint64_t zd_expected_merging[N];
15421     for (unsigned i = 0; i < N; i++) {
15422       zd_expected_merging[i] =
15423           pg_inputs[i] ? zd_expected_all_active[i]
15424                        : 0x0bad0bad0bad0bad & GetUintMask(lane_size_in_bits);
15425     }
15426     ASSERT_EQUAL_SVE(zd_expected_merging,
15427                      zd_merging.WithLaneSize(lane_size_in_bits));
15428 
15429     if (macro_z != NULL) {
15430       uint64_t zd_expected_zeroing[N] = {0};
15431       for (unsigned i = 0; i < N; i++) {
15432         if (pg_inputs[i]) {
15433           zd_expected_zeroing[i] = zd_expected_all_active[i];
15434         }
15435       }
15436       ASSERT_EQUAL_SVE(zd_expected_zeroing,
15437                        zd_zeroing.WithLaneSize(lane_size_in_bits));
15438     }
15439   }
15440 }
15441 
15442 template <typename F, size_t N>
15443 static void TestFcvtzHelper(Test* config,
15444                             FcvtFrintMFn macro_m,
15445                             int dst_type_size_in_bits,
15446                             int src_type_size_in_bits,
15447                             const F (&zn_inputs)[N],
15448                             const int (&pg_inputs)[N],
15449                             const uint64_t (&zd_expected_all_active)[N]) {
15450   TestFcvtFrintHelper(config,
15451                       macro_m,
15452                       // Fcvt variants have no zeroing predication form.
15453                       NULL,
15454                       dst_type_size_in_bits,
15455                       src_type_size_in_bits,
15456                       zn_inputs,
15457                       pg_inputs,
15458                       zd_expected_all_active);
15459 }
15460 
15461 TEST_SVE(fcvtzs_fcvtzu_float16) {
15462   const double h_max_float16 = 0x7ff0;          // Largest float16 < INT16_MAX.
15463   const double h_min_float16 = -h_max_float16;  // Smallest float16 > INT16_MIN.
15464   const double largest_float16 = 0xffe0;        // 65504
15465   const double smallest_float16 = -largest_float16;
15466   const double h_max_int_add_one = 0x8000;
15467 
15468   double zn_inputs[] = {1.0,
15469                         1.1,
15470                         1.5,
15471                         -1.5,
15472                         h_max_float16,
15473                         h_min_float16,
15474                         largest_float16,
15475                         smallest_float16,
15476                         kFP64PositiveInfinity,
15477                         kFP64NegativeInfinity,
15478                         h_max_int_add_one};
15479 
15480   int pg_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1};
15481 
15482   uint64_t expected_fcvtzs_fp162h[] =
15483       {1, 1, 1, 0xffff, 0x7ff0, 0x8010, 0x7fff, 0x8000, 0x7fff, 0x8000, 0x7fff};
15484 
15485   uint64_t expected_fcvtzu_fp162h[] =
15486       {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffff, 0, 0x8000};
15487 
15488   // Float16 to 16-bit integers.
15489   TestFcvtzHelper(config,
15490                   &MacroAssembler::Fcvtzs,
15491                   kHRegSize,
15492                   kHRegSize,
15493                   zn_inputs,
15494                   pg_inputs,
15495                   expected_fcvtzs_fp162h);
15496 
15497   TestFcvtzHelper(config,
15498                   &MacroAssembler::Fcvtzu,
15499                   kHRegSize,
15500                   kHRegSize,
15501                   zn_inputs,
15502                   pg_inputs,
15503                   expected_fcvtzu_fp162h);
15504 
15505   uint64_t expected_fcvtzs_fp162w[] = {1,
15506                                        1,
15507                                        1,
15508                                        0xffffffff,
15509                                        0x7ff0,
15510                                        0xffff8010,
15511                                        0xffe0,
15512                                        0xffff0020,
15513                                        0x7fffffff,
15514                                        0x80000000,
15515                                        0x8000};
15516 
15517   uint64_t expected_fcvtzu_fp162w[] =
15518       {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffff, 0, 0x8000};
15519 
15520   // Float16 to 32-bit integers.
15521   TestFcvtzHelper(config,
15522                   &MacroAssembler::Fcvtzs,
15523                   kSRegSize,
15524                   kHRegSize,
15525                   zn_inputs,
15526                   pg_inputs,
15527                   expected_fcvtzs_fp162w);
15528 
15529   TestFcvtzHelper(config,
15530                   &MacroAssembler::Fcvtzu,
15531                   kSRegSize,
15532                   kHRegSize,
15533                   zn_inputs,
15534                   pg_inputs,
15535                   expected_fcvtzu_fp162w);
15536 
15537   uint64_t expected_fcvtzs_fp162x[] = {1,
15538                                        1,
15539                                        1,
15540                                        0xffffffffffffffff,
15541                                        0x7ff0,
15542                                        0xffffffffffff8010,
15543                                        0xffe0,
15544                                        0xffffffffffff0020,
15545                                        0x7fffffffffffffff,
15546                                        0x8000000000000000,
15547                                        0x8000};
15548 
15549   uint64_t expected_fcvtzu_fp162x[] =
15550       {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffffffffffff, 0, 0x8000};
15551 
15552   // Float16 to 64-bit integers.
15553   TestFcvtzHelper(config,
15554                   &MacroAssembler::Fcvtzs,
15555                   kDRegSize,
15556                   kHRegSize,
15557                   zn_inputs,
15558                   pg_inputs,
15559                   expected_fcvtzs_fp162x);
15560 
15561   TestFcvtzHelper(config,
15562                   &MacroAssembler::Fcvtzu,
15563                   kDRegSize,
15564                   kHRegSize,
15565                   zn_inputs,
15566                   pg_inputs,
15567                   expected_fcvtzu_fp162x);
15568 }
15569 
15570 TEST_SVE(fcvtzs_fcvtzu_float) {
15571   const double w_max_float = 0x7fffff80;          // Largest float < INT32_MAX.
15572   const double w_min_float = -w_max_float;        // Smallest float > INT32_MIN.
15573   const double x_max_float = 0x7fffff8000000000;  // Largest float < INT64_MAX.
15574   const double x_min_float = -x_max_float;        // Smallest float > INT64_MIN.
15575   const double w_max_int_add_one = 0x80000000;  // INT32_MAX + 1.
15576   const double x_max_int_add_one = 0x80000000'00000000;
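  // A float has a 24-bit significand, so w_max_float = (2^24 - 1) * 2^7 is the
  // largest int32_t value that a float can represent exactly below INT32_MAX.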
15577 
15578   double zn_inputs[] = {1.0,
15579                         1.1,
15580                         1.5,
15581                         -1.5,
15582                         w_max_float,
15583                         w_min_float,
15584                         x_max_float,
15585                         x_min_float,
15586                         kFP64PositiveInfinity,
15587                         kFP64NegativeInfinity,
15588                         w_max_int_add_one,
15589                         x_max_int_add_one};
15590 
15591   int pg_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1};
15592 
15593   uint64_t expected_fcvtzs_s2w[] = {1,
15594                                     1,
15595                                     1,
15596                                     0xffffffff,
15597                                     0x7fffff80,
15598                                     0x80000080,
15599                                     0x7fffffff,
15600                                     0x80000000,
15601                                     0x7fffffff,
15602                                     0x80000000,
15603                                     0x7fffffff,
15604                                     0x7fffffff};
15605 
15606   uint64_t expected_fcvtzu_s2w[] = {1,
15607                                     1,
15608                                     1,
15609                                     0,
15610                                     0x7fffff80,
15611                                     0,
15612                                     0xffffffff,
15613                                     0,
15614                                     0xffffffff,
15615                                     0,
15616                                     0x80000000,
15617                                     0xffffffff};
15618 
15619   // Float to 32-bit integers.
15620   TestFcvtzHelper(config,
15621                   &MacroAssembler::Fcvtzs,
15622                   kSRegSize,
15623                   kSRegSize,
15624                   zn_inputs,
15625                   pg_inputs,
15626                   expected_fcvtzs_s2w);
15627 
15628   TestFcvtzHelper(config,
15629                   &MacroAssembler::Fcvtzu,
15630                   kSRegSize,
15631                   kSRegSize,
15632                   zn_inputs,
15633                   pg_inputs,
15634                   expected_fcvtzu_s2w);
15635 
15636   uint64_t expected_fcvtzs_s2x[] = {1,
15637                                     1,
15638                                     1,
15639                                     0xffffffffffffffff,
15640                                     0x7fffff80,
15641                                     0xffffffff80000080,
15642                                     0x7fffff8000000000,
15643                                     0x8000008000000000,
15644                                     0x7fffffffffffffff,
15645                                     0x8000000000000000,
15646                                     0x80000000,
15647                                     0x7fffffffffffffff};
15648 
15649   uint64_t expected_fcvtzu_s2x[] = {1,
15650                                     1,
15651                                     1,
15652                                     0,
15653                                     0x7fffff80,
15654                                     0,
15655                                     0x7fffff8000000000,
15656                                     0,
15657                                     0xffffffffffffffff,
15658                                     0,
15659                                     0x80000000,
15660                                     0x8000000000000000};
15661 
15662   // Float to 64-bit integers.
15663   TestFcvtzHelper(config,
15664                   &MacroAssembler::Fcvtzs,
15665                   kDRegSize,
15666                   kSRegSize,
15667                   zn_inputs,
15668                   pg_inputs,
15669                   expected_fcvtzs_s2x);
15670 
15671   TestFcvtzHelper(config,
15672                   &MacroAssembler::Fcvtzu,
15673                   kDRegSize,
15674                   kSRegSize,
15675                   zn_inputs,
15676                   pg_inputs,
15677                   expected_fcvtzu_s2x);
15678 }
15679 
15680 TEST_SVE(fcvtzs_fcvtzu_double) {
15681   const double w_max_float = 0x7fffff80;          // Largest float < INT32_MAX.
15682   const double w_min_float = -w_max_float;        // Smallest float > INT32_MIN.
15683   const double x_max_float = 0x7fffff8000000000;  // Largest float < INT64_MAX.
15684   const double x_min_float = -x_max_float;        // Smallest float > INT64_MIN.
15685   const double w_max_double = kWMaxInt;       // Largest double == INT32_MAX.
15686   const double w_min_double = -w_max_double;  // Smallest double > INT32_MIN.
15687   const double x_max_double =
15688       0x7ffffffffffffc00;                     // Largest double < INT64_MAX.
15689   const double x_min_double = -x_max_double;  // Smallest double > INT64_MIN.
15690   const double w_max_int_sub_one = kWMaxInt - 1;
15691   const double w_min_int_add_one = kWMinInt + 1;
15692   const double w_max_int_add_one = 0x80000000;
15693   const double x_max_int_add_one = 0x80000000'00000000;
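  // Similarly, a double has a 53-bit significand, so x_max_double =
  // (2^53 - 1) * 2^10 = 0x7ffffffffffffc00 is the largest int64_t value that a
  // double can represent exactly.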
15694 
15695   double zn_inputs[] = {1.0,
15696                         1.1,
15697                         1.5,
15698                         -1.5,
15699                         w_max_float,
15700                         w_min_float,
15701                         x_max_float,
15702                         x_min_float,
15703                         w_max_double,
15704                         w_min_double,
15705                         x_max_double,
15706                         x_min_double,
15707                         kFP64PositiveInfinity,
15708                         kFP64NegativeInfinity,
15709                         w_max_int_sub_one,
15710                         w_min_int_add_one,
15711                         w_max_int_add_one,
15712                         x_max_int_add_one};
15713 
15714   int pg_inputs[] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0};
15715 
15716   uint64_t expected_fcvtzs_d2w[] = {1,
15717                                     1,
15718                                     1,
15719                                     0xffffffffffffffff,
15720                                     0x7fffff80,
15721                                     0xffffffff80000080,
15722                                     0x7fffffff,
15723                                     0xffffffff80000000,
15724                                     0x7fffffff,
15725                                     0xffffffff80000001,
15726                                     0x7fffffff,
15727                                     0xffffffff80000000,
15728                                     0x7fffffff,
15729                                     0xffffffff80000000,
15730                                     0x7ffffffe,
15731                                     0xffffffff80000001,
15732                                     0x7fffffff,
15733                                     0x7fffffff};
15734 
15735   uint64_t expected_fcvtzu_d2w[] = {1,
15736                                     1,
15737                                     1,
15738                                     0,
15739                                     0x7fffff80,
15740                                     0,
15741                                     0xffffffff,
15742                                     0,
15743                                     0x7fffffff,
15744                                     0,
15745                                     0xffffffff,
15746                                     0,
15747                                     0xffffffff,
15748                                     0,
15749                                     0x7ffffffe,
15750                                     0,
15751                                     0x80000000,
15752                                     0xffffffff};
15753 
15754   // Double to 32-bit integers.
15755   TestFcvtzHelper(config,
15756                   &MacroAssembler::Fcvtzs,
15757                   kSRegSize,
15758                   kDRegSize,
15759                   zn_inputs,
15760                   pg_inputs,
15761                   expected_fcvtzs_d2w);
15762 
15763   TestFcvtzHelper(config,
15764                   &MacroAssembler::Fcvtzu,
15765                   kSRegSize,
15766                   kDRegSize,
15767                   zn_inputs,
15768                   pg_inputs,
15769                   expected_fcvtzu_d2w);
15770 
15771   uint64_t expected_fcvtzs_d2x[] = {1,
15772                                     1,
15773                                     1,
15774                                     0xffffffffffffffff,
15775                                     0x7fffff80,
15776                                     0xffffffff80000080,
15777                                     0x7fffff8000000000,
15778                                     0x8000008000000000,
15779                                     0x7fffffff,
15780                                     0xffffffff80000001,
15781                                     0x7ffffffffffffc00,
15782                                     0x8000000000000400,
15783                                     0x7fffffffffffffff,
15784                                     0x8000000000000000,
15785                                     0x7ffffffe,
15786                                     0xffffffff80000001,
15787                                     0x80000000,
15788                                     0x7fffffffffffffff};
15789 
15790   uint64_t expected_fcvtzu_d2x[] = {1,
15791                                     1,
15792                                     1,
15793                                     0,
15794                                     0x7fffff80,
15795                                     0,
15796                                     0x7fffff8000000000,
15797                                     0,
15798                                     0x7fffffff,
15799                                     0,
15800                                     0x7ffffffffffffc00,
15801                                     0,
15802                                     0xffffffffffffffff,
15803                                     0,
15804                                     0x000000007ffffffe,
15805                                     0,
15806                                     0x80000000,
15807                                     0x8000000000000000};
15808 
15809   // Double to 64-bit integers.
15810   TestFcvtzHelper(config,
15811                   &MacroAssembler::Fcvtzs,
15812                   kDRegSize,
15813                   kDRegSize,
15814                   zn_inputs,
15815                   pg_inputs,
15816                   expected_fcvtzs_d2x);
15817 
15818   TestFcvtzHelper(config,
15819                   &MacroAssembler::Fcvtzu,
15820                   kDRegSize,
15821                   kDRegSize,
15822                   zn_inputs,
15823                   pg_inputs,
15824                   expected_fcvtzu_d2x);
15825 }
15826 
15827 template <typename F, size_t N>
15828 static void TestFrintHelper(Test* config,
15829                             FcvtFrintMFn macro_m,
15830                             FcvtFrintZFn macro_z,
15831                             int lane_size_in_bits,
15832                             const F (&zn_inputs)[N],
15833                             const int (&pg_inputs)[N],
15834                             const F (&zd_expected)[N]) {
15835   uint64_t zd_expected_rawbits[N];
15836   FPToRawbitsWithSize(zd_expected, zd_expected_rawbits, lane_size_in_bits);
15837   TestFcvtFrintHelper(config,
15838                       macro_m,
15839                       macro_z,
15840                       lane_size_in_bits,
15841                       lane_size_in_bits,
15842                       zn_inputs,
15843                       pg_inputs,
15844                       zd_expected_rawbits);
15845 }
15846 
15847 TEST_SVE(frint) {
15848   const double inf_pos = kFP64PositiveInfinity;
15849   const double inf_neg = kFP64NegativeInfinity;
15850 
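  // Each variant applies a different rounding mode:
  //   Frinta - to nearest, ties away from zero.
  //   Frinti - the current FPCR rounding mode.
  //   Frintm - toward minus infinity.
  //   Frintn - to nearest, ties to even.
  //   Frintp - toward plus infinity.
  //   Frintx - the current FPCR rounding mode, raising Inexact as required.
  //   Frintz - toward zero.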
15851   double zn_inputs[] =
15852       {1.1, 1.5, 1.9, 2.5, -1.5, -2.5, 0.0, -0.0, -0.2, inf_pos, inf_neg};
15853   double zd_expected_a[] =
15854       {1.0, 2.0, 2.0, 3.0, -2.0, -3.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15855   double zd_expected_i[] =
15856       {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15857   double zd_expected_m[] =
15858       {1.0, 1.0, 1.0, 2.0, -2.0, -3.0, 0.0, -0.0, -1.0, inf_pos, inf_neg};
15859   double zd_expected_n[] =
15860       {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15861   double zd_expected_p[] =
15862       {2.0, 2.0, 2.0, 3.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15863   double zd_expected_x[] =
15864       {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15865   double zd_expected_z[] =
15866       {1.0, 1.0, 1.0, 2.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15867 
15868   int pg_inputs[] = {0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0};
15869 
15870   struct TestDataSet {
15871     FcvtFrintMFn macro_m;  // merging form.
15872     FcvtFrintZFn macro_z;  // zeroing form.
15873     double (&expected)[11];
15874   };
15875 
15876   TestDataSet test_data[] =
15877       {{&MacroAssembler::Frinta, &MacroAssembler::Frinta, zd_expected_a},
15878        {&MacroAssembler::Frinti, &MacroAssembler::Frinti, zd_expected_i},
15879        {&MacroAssembler::Frintm, &MacroAssembler::Frintm, zd_expected_m},
15880        {&MacroAssembler::Frintn, &MacroAssembler::Frintn, zd_expected_n},
15881        {&MacroAssembler::Frintp, &MacroAssembler::Frintp, zd_expected_p},
15882        {&MacroAssembler::Frintx, &MacroAssembler::Frintx, zd_expected_x},
15883        {&MacroAssembler::Frintz, &MacroAssembler::Frintz, zd_expected_z}};
15884 
15885   unsigned lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
15886 
15887   for (size_t i = 0; i < sizeof(test_data) / sizeof(TestDataSet); i++) {
15888     for (size_t j = 0; j < ArrayLength(lane_sizes); j++) {
15889       TestFrintHelper(config,
15890                       test_data[i].macro_m,
15891                       test_data[i].macro_z,
15892                       lane_sizes[j],
15893                       zn_inputs,
15894                       pg_inputs,
15895                       test_data[i].expected);
15896     }
15897   }
15898 }
15899 
15900 struct CvtfTestDataSet {
15901   uint64_t int_value;
15902   uint64_t scvtf_result;
15903   uint64_t ucvtf_result;
15904 };
15905 
15906 template <size_t N>
15907 static void TestUScvtfHelper(Test* config,
15908                              int dst_type_size_in_bits,
15909                              int src_type_size_in_bits,
15910                              const int (&pg_inputs)[N],
15911                              const CvtfTestDataSet (&data_set)[N]) {
15912   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15913   START();
15914 
15915   // Unpack the data from the array of struct into individual arrays that can
15916   // simplify the testing.
15917   uint64_t zn_inputs[N];
15918   uint64_t expected_zd_scvtf_all_active[N];
15919   uint64_t expected_zd_ucvtf_all_active[N];
15920   for (size_t i = 0; i < N; i++) {
15921     zn_inputs[i] = data_set[i].int_value;
15922     expected_zd_scvtf_all_active[i] = data_set[i].scvtf_result;
15923     expected_zd_ucvtf_all_active[i] = data_set[i].ucvtf_result;
15924   }
15925 
15926   // If the input and result types have a different size, the instruction
15927   // operates on elements of the largest specified type.
15928   int lane_size_in_bits =
15929       std::max(dst_type_size_in_bits, src_type_size_in_bits);
15930 
15931   ZRegister zd_scvtf_all_active = z25;
15932   ZRegister zd_ucvtf_all_active = z26;
15933   ZRegister zn = z27;
15934   InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_inputs);
15935 
15936   PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15937   __ Ptrue(pg_all_active);
15938 
15939   // Test integer conversions with all lanes active.
15940   __ Scvtf(zd_scvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15941            pg_all_active.Merging(),
15942            zn.WithLaneSize(src_type_size_in_bits));
15943   __ Ucvtf(zd_ucvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15944            pg_all_active.Merging(),
15945            zn.WithLaneSize(src_type_size_in_bits));
15946 
15947   ZRegister zd_scvtf_merged = z23;
15948   ZRegister zd_ucvtf_merged = z24;
15949 
15950   PRegisterWithLaneSize pg_merged = p1.WithLaneSize(lane_size_in_bits);
15951   Initialise(&masm, pg_merged, pg_inputs);
15952 
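  // Fill the merging destinations with a signalling NaN payload so that lanes
  // left untouched by the conversion are easy to identify.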
15953   uint64_t snan;
15954   switch (lane_size_in_bits) {
15955     case kHRegSize:
15956       snan = 0x7c11;
15957       break;
15958     case kSRegSize:
15959       snan = 0x7f951111;
15960       break;
15961     case kDRegSize:
15962       snan = 0x7ff5555511111111;
15963       break;
15964   }
15965   __ Dup(zd_scvtf_merged.WithLaneSize(lane_size_in_bits), snan);
15966   __ Dup(zd_ucvtf_merged.WithLaneSize(lane_size_in_bits), snan);
15967 
15968   // Use the same `zn` inputs to test integer conversions, but with some lanes
15969   // set inactive.
15970   __ Scvtf(zd_scvtf_merged.WithLaneSize(dst_type_size_in_bits),
15971            pg_merged.Merging(),
15972            zn.WithLaneSize(src_type_size_in_bits));
15973   __ Ucvtf(zd_ucvtf_merged.WithLaneSize(dst_type_size_in_bits),
15974            pg_merged.Merging(),
15975            zn.WithLaneSize(src_type_size_in_bits));
15976 
15977   END();
15978 
15979   if (CAN_RUN()) {
15980     RUN();
15981 
15982     ASSERT_EQUAL_SVE(expected_zd_scvtf_all_active,
15983                      zd_scvtf_all_active.WithLaneSize(lane_size_in_bits));
15984     ASSERT_EQUAL_SVE(expected_zd_ucvtf_all_active,
15985                      zd_ucvtf_all_active.WithLaneSize(lane_size_in_bits));
15986 
15987     uint64_t expected_zd_scvtf_merged[N];
15988     for (size_t i = 0; i < N; i++) {
15989       expected_zd_scvtf_merged[i] =
15990           pg_inputs[i] ? expected_zd_scvtf_all_active[i] : snan;
15991     }
15992     ASSERT_EQUAL_SVE(expected_zd_scvtf_merged,
15993                      zd_scvtf_merged.WithLaneSize(lane_size_in_bits));
15994 
15995     uint64_t expected_zd_ucvtf_merged[N];
15996     for (size_t i = 0; i < N; i++) {
15997       expected_zd_ucvtf_merged[i] =
15998           pg_inputs[i] ? expected_zd_ucvtf_all_active[i] : snan;
15999     }
16000     ASSERT_EQUAL_SVE(expected_zd_ucvtf_merged,
16001                      zd_ucvtf_merged.WithLaneSize(lane_size_in_bits));
16002   }
16003 }
16004 
16005 TEST_SVE(scvtf_ucvtf_h_s_d_to_float16) {
16006   // clang-format off
16007   CvtfTestDataSet data_set_1[] = {
16008     // Simple conversions of positive numbers which require no rounding; the
16009     // results should not depend on the rounding mode, and ucvtf and scvtf should
16010     // produce the same result.
16011     {0x0000, 0x0000, 0x0000},
16012     {0x0001, 0x3c00, 0x3c00},
16013     {0x0010, 0x4c00, 0x4c00},
16014     {0x0080, 0x5800, 0x5800},
16015     {0x0400, 0x6400, 0x6400},
16016     // Conversions which require rounding.
16017     {0x4000, 0x7400, 0x7400},
16018     {0x4001, 0x7400, 0x7400},
16019     // Round up to produce a result that's too big for the input to represent.
16020     {0x7ff0, 0x77ff, 0x77ff},
16021     {0x7ff1, 0x77ff, 0x77ff},
16022     {0x7ffe, 0x7800, 0x7800},
16023     {0x7fff, 0x7800, 0x7800}};
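  // As a worked example from the table above, 0x0010 (16) converts to 0x4c00:
  // sign 0, exponent 4 + bias 15 = 0b10011, significand zero.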
16024   int pg_1[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
16025   TestUScvtfHelper(config, kHRegSize, kDRegSize, pg_1, data_set_1);
16026   TestUScvtfHelper(config, kHRegSize, kSRegSize, pg_1, data_set_1);
16027   TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_1, data_set_1);
16028 
16029   CvtfTestDataSet data_set_2[] = {
16030     // Test mantissa extremities.
16031     {0x0401, 0x6401, 0x6401},
16032     {0x4020, 0x7402, 0x7402},
16033     // The largest uint16_t that converts to a finite float16.
16034     {0xffef, 0xcc40, 0x7bff},
16035     // Values that would be negative if treated as an int16_t.
16036     {0xff00, 0xdc00, 0x7bf8},
16037     {0x8000, 0xf800, 0x7800},
16038     {0x8100, 0xf7f0, 0x7808},
16039     // Check for bit pattern reproduction.
16040     {0x0123, 0x5c8c, 0x5c8c},
16041     {0x0cde, 0x6a6f, 0x6a6f},
16042     // Simple conversions of negative int16_t values. These require no rounding,
16043     // and the results should not depend on the rounding mode.
16044     {0xf800, 0xe800, 0x7bc0},
16045     {0xfc00, 0xe400, 0x7be0},
16046     {0xc000, 0xf400, 0x7a00},
16047     // Check rounding of negative int16_t values.
16048     {0x8ffe, 0xf700, 0x7880},
16049     {0x8fff, 0xf700, 0x7880},
16050     {0xffee, 0xcc80, 0x7bff},
16051     {0xffef, 0xcc40, 0x7bff}};
16052   int pg_2[] = {1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1};
16053   // The `32-bit to float16` and `64-bit to float16` conversions of these
16054   // inputs have already been covered by the `16-bit to float16` `ucvtf` cases.
16055   TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_2, data_set_2);
16056   // clang-format on
16057 }
16058 
16059 TEST_SVE(scvtf_ucvtf_s_to_float) {
16060   // clang-format off
16061   int dst_lane_size = kSRegSize;
16062   int src_lane_size = kSRegSize;
16063 
16064   // Simple conversions of positive numbers which require no rounding; the
16065   // results should not depend on the rounding mode, and ucvtf and scvtf should
16066   // produce the same result.
16067   CvtfTestDataSet data_set_1[] = {
16068     {0x00000000, 0x00000000, 0x00000000},
16069     {0x00000001, 0x3f800000, 0x3f800000},
16070     {0x00004000, 0x46800000, 0x46800000},
16071     {0x00010000, 0x47800000, 0x47800000},
16072     {0x40000000, 0x4e800000, 0x4e800000}};
16073   int pg_1[] = {1, 0, 1, 0, 0};
16074   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16075 
16076   CvtfTestDataSet data_set_2[] = {
16077     // Test mantissa extremities.
16078     {0x00800001, 0x4b000001, 0x4b000001},
16079     {0x40400000, 0x4e808000, 0x4e808000},
16080     // The largest int32_t that fits in a float.
16081     {0x7fffff80, 0x4effffff, 0x4effffff},
16082     // Values that would be negative if treated as an int32_t.
16083     {0xffffffff, 0xbf800000, 0x4f800000},
16084     {0xffffff00, 0xc3800000, 0x4f7fffff},
16085     {0x80000000, 0xcf000000, 0x4f000000},
16086     {0x80000001, 0xcf000000, 0x4f000000},
16087     // Check for bit pattern reproduction.
16088     {0x089abcde, 0x4d09abce, 0x4d09abce},
16089     {0x12345678, 0x4d91a2b4, 0x4d91a2b4}};
16090   int pg_2[] = {1, 0, 1, 0, 1, 1, 1, 0, 0};
16091   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16092 
16093   // Simple conversions of negative int32_t values. These require no rounding,
16094   // and the results should not depend on the rounding mode.
16095   CvtfTestDataSet data_set_3[] = {
16096     {0xffffc000, 0xc6800000, 0x4f7fffc0},
16097     {0xffff0000, 0xc7800000, 0x4f7fff00},
16098     {0xc0000000, 0xce800000, 0x4f400000},
16099     // Conversions which require rounding.
16100     {0x72800000, 0x4ee50000, 0x4ee50000},
16101     {0x72800001, 0x4ee50000, 0x4ee50000},
16102     {0x73000000, 0x4ee60000, 0x4ee60000},
16103     // Check rounding of negative int32_t values.
16104     {0x80000140, 0xcefffffe, 0x4f000001},
16105     {0x80000141, 0xcefffffd, 0x4f000001},
16106     {0x80000180, 0xcefffffd, 0x4f000002},
16107     // Round up to produce a result that's too big for the input to represent.
16108     {0x7fffffc0, 0x4f000000, 0x4f000000},
16109     {0x7fffffff, 0x4f000000, 0x4f000000}};
16110   int pg_3[] = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0};
16111   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16112   // clang-format on
16113 }
16114 
16115 TEST_SVE(scvtf_ucvtf_d_to_float) {
16116   // clang-format off
16117   int dst_lane_size = kSRegSize;
16118   int src_lane_size = kDRegSize;
16119 
16120   // Simple conversions of positive numbers which require no rounding; the
16121   // results should not depend on the rounding mode, and ucvtf and scvtf should
16122   // produce the same result.
16123   CvtfTestDataSet data_set_1[] = {
16124     {0x0000000000000000, 0x00000000, 0x00000000},
16125     {0x0000000000000001, 0x3f800000, 0x3f800000},
16126     {0x0000000040000000, 0x4e800000, 0x4e800000},
16127     {0x0000000100000000, 0x4f800000, 0x4f800000},
16128     {0x4000000000000000, 0x5e800000, 0x5e800000}};
16129   int pg_1[] = {1, 1, 0, 1, 0};
16130   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16131 
16132   CvtfTestDataSet data_set_2[] = {
16133     // Test mantissa extremities.
16134     {0x0010000000000001, 0x59800000, 0x59800000},
16135     {0x4008000000000000, 0x5e801000, 0x5e801000},
16136     // The largest int32_t that fits in a float.
16137     {0x000000007fffff80, 0x4effffff, 0x4effffff},
16138     // Values that would be negative if treated as an int32_t.
16139     {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16140     {0x00000000ffffff00, 0x4f7fffff, 0x4f7fffff},
16141     {0x0000000080000000, 0x4f000000, 0x4f000000},
16142     {0x0000000080000100, 0x4f000001, 0x4f000001},
16143     // The largest int64_t that fits in a float.
16144     {0x7fffff8000000000, 0x5effffff, 0x5effffff},
16145     // Check for bit pattern reproduction.
16146     {0x0123456789abcde0, 0x5b91a2b4, 0x5b91a2b4},
16147     {0x0000000000876543, 0x4b076543, 0x4b076543}};
16148   int pg_2[] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1};
16149   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16150 
16151   CvtfTestDataSet data_set_3[] = {
16152     // Simple conversions of negative int64_t values. These require no rounding,
16153     // and the results should not depend on the rounding mode.
16154     {0xffffffffc0000000, 0xce800000, 0x5f800000},
16155     {0xffffffff00000000, 0xcf800000, 0x5f800000},
16156     {0xc000000000000000, 0xde800000, 0x5f400000},
16157     // Conversions which require rounding.
16158     {0x0000800002800000, 0x57000002, 0x57000002},
16159     {0x0000800002800001, 0x57000003, 0x57000003},
16160     {0x0000800003000000, 0x57000003, 0x57000003},
16161     // Check rounding of negative int64_t values.
16162     {0x8000014000000000, 0xdefffffe, 0x5f000001},
16163     {0x8000014000000001, 0xdefffffd, 0x5f000001},
16164     {0x8000018000000000, 0xdefffffd, 0x5f000002},
16165     // Round up to produce a result that's too big for the input to represent.
16166     {0x00000000ffffff80, 0x4f800000, 0x4f800000},
16167     {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16168     {0xffffff8000000000, 0xd3000000, 0x5f800000},
16169     {0xffffffffffffffff, 0xbf800000, 0x5f800000}};
16170   int pg_3[] = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1};
16171   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16172   // clang-format on
16173 }
16174 
16175 TEST_SVE(scvtf_ucvtf_d_to_double) {
16176   // clang-format off
16177   int dst_lane_size = kDRegSize;
16178   int src_lane_size = kDRegSize;
16179 
16180   // Simple conversions of positive numbers which require no rounding; the
16181   // results should not depend on the rounding mode, and ucvtf and scvtf should
16182   // produce the same result.
16183   CvtfTestDataSet data_set_1[] = {
16184     {0x0000000000000000, 0x0000000000000000, 0x0000000000000000},
16185     {0x0000000000000001, 0x3ff0000000000000, 0x3ff0000000000000},
16186     {0x0000000040000000, 0x41d0000000000000, 0x41d0000000000000},
16187     {0x0000000100000000, 0x41f0000000000000, 0x41f0000000000000},
16188     {0x4000000000000000, 0x43d0000000000000, 0x43d0000000000000}};
16189   int pg_1[] = {0, 1, 1, 0, 0};
16190   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16191 
16192   CvtfTestDataSet data_set_2[] = {
16193     // Test mantissa extremities.
16194     {0x0010000000000001, 0x4330000000000001, 0x4330000000000001},
16195     {0x4008000000000000, 0x43d0020000000000, 0x43d0020000000000},
16196     // The largest int32_t that fits in a double.
16197     {0x000000007fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16198     // Values that would be negative if treated as an int32_t.
16199     {0x00000000ffffffff, 0x41efffffffe00000, 0x41efffffffe00000},
16200     {0x0000000080000000, 0x41e0000000000000, 0x41e0000000000000},
16201     {0x0000000080000001, 0x41e0000000200000, 0x41e0000000200000},
16202     // The largest int64_t that fits in a double.
16203     {0x7ffffffffffffc00, 0x43dfffffffffffff, 0x43dfffffffffffff},
16204     // Check for bit pattern reproduction.
16205     {0x0123456789abcde0, 0x43723456789abcde, 0x43723456789abcde},
16206     {0x0000000012345678, 0x41b2345678000000, 0x41b2345678000000}};
16207   int pg_2[] = {1, 1, 1, 1, 1, 0, 0, 0, 0};
16208   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16209 
16210   CvtfTestDataSet data_set_3[] = {
16211     // Simple conversions of negative int64_t values. These require no rounding,
16212     // and the results should not depend on the rounding mode.
16213     {0xffffffffc0000000, 0xc1d0000000000000, 0x43effffffff80000},
16214     {0xffffffff00000000, 0xc1f0000000000000, 0x43efffffffe00000},
16215     {0xc000000000000000, 0xc3d0000000000000, 0x43e8000000000000},
16216     // Conversions which require rounding.
16217     {0x1000000000000280, 0x43b0000000000002, 0x43b0000000000002},
16218     {0x1000000000000281, 0x43b0000000000003, 0x43b0000000000003},
16219     {0x1000000000000300, 0x43b0000000000003, 0x43b0000000000003},
16220     // Check rounding of negative int64_t values.
16221     {0x8000000000000a00, 0xc3dffffffffffffe, 0x43e0000000000001},
16222     {0x8000000000000a01, 0xc3dffffffffffffd, 0x43e0000000000001},
16223     {0x8000000000000c00, 0xc3dffffffffffffd, 0x43e0000000000002},
16224     // Round up to produce a result that's too big for the input to represent.
16225     {0x7ffffffffffffe00, 0x43e0000000000000, 0x43e0000000000000},
16226     {0x7fffffffffffffff, 0x43e0000000000000, 0x43e0000000000000},
16227     {0xfffffffffffffc00, 0xc090000000000000, 0x43f0000000000000},
16228     {0xffffffffffffffff, 0xbff0000000000000, 0x43f0000000000000}};
16229   int pg_3[] = {1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0};
16230   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16231   // clang-format on
16232 }
16233 
16234 TEST_SVE(scvtf_ucvtf_s_to_double) {
16235   // clang-format off
16236   int dst_lane_size = kDRegSize;
16237   int src_lane_size = kSRegSize;
16238 
16239   // Simple conversions of positive numbers which require no rounding; the
16240   // results should not depend on the rounding mode, and ucvtf and scvtf should
16241   // produce the same result.
16242   CvtfTestDataSet data_set_1[] = {
16243     {0x00000000, 0x0000000000000000, 0x0000000000000000},
16244     {0x00000001, 0x3ff0000000000000, 0x3ff0000000000000},
16245     {0x00004000, 0x40d0000000000000, 0x40d0000000000000},
16246     {0x00010000, 0x40f0000000000000, 0x40f0000000000000},
16247     {0x40000000, 0x41d0000000000000, 0x41d0000000000000}};
16248   int pg_1[] = {1, 0, 0, 0, 1};
16249   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16250 
16251   CvtfTestDataSet data_set_2[] = {
16252     // Test mantissa extremities.
16253     {0x40000400, 0x41d0000100000000, 0x41d0000100000000},
16254     // The largest int32_t that fits in a double.
16255     {0x7fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16256     // Values that would be negative if treated as an int32_t.
16257     {0xffffffff, 0xbff0000000000000, 0x41efffffffe00000},
16258     {0x80000000, 0xc1e0000000000000, 0x41e0000000000000},
16259     {0x80000001, 0xc1dfffffffc00000, 0x41e0000000200000},
16260     // Check for bit pattern reproduction.
16261     {0x089abcde, 0x41a13579bc000000, 0x41a13579bc000000},
16262     {0x12345678, 0x41b2345678000000, 0x41b2345678000000},
16263     // Simple conversions of negative int32_t values. These require no rounding,
16264     // and the results should not depend on the rounding mode.
16265     {0xffffc000, 0xc0d0000000000000, 0x41effff800000000},
16266     {0xffff0000, 0xc0f0000000000000, 0x41efffe000000000},
16267     {0xc0000000, 0xc1d0000000000000, 0x41e8000000000000}};
16268   int pg_2[] = {1, 0, 1, 0, 0, 1, 1, 0, 1, 1};
16269   TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16270 
16271   // Note that the IEEE 754 double-precision format has a 52-bit fraction, so
16272   // all 32-bit integers are exactly representable in a double.
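  // For example, 0xffffffff converts exactly to 0x41efffffffe00000
  // (4294967295.0), as the ucvtf column of data_set_2 above shows.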
16273   // clang-format on
16274 }
16275 
16276 TEST_SVE(sve_fadda) {
16277   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
16278                           CPUFeatures::kFP,
16279                           CPUFeatures::kFPHalf);
16280   START();
16281 
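  // Fadda performs a strictly-ordered floating-point accumulation over the
  // active lanes, starting from the value of the scalar source operand.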
16282   __ Ptrue(p0.VnB());
16283   __ Pfalse(p1.VnB());
16284   __ Zip1(p1.VnH(), p0.VnH(), p1.VnH());
16285 
16286   __ Index(z0.VnS(), 3, 3);
16287   __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16288   __ Fmov(s2, 2.0);
16289   __ Fadda(s2, p0, s2, z0.VnS());
16290 
16291   __ Index(z0.VnD(), -7, -7);
16292   __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16293   __ Fmov(d3, 3.0);
16294   __ Fadda(d3, p0, d3, z0.VnD());
16295 
16296   __ Index(z0.VnH(), 1, 1);
16297   __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16298   __ Fmov(h4, 0);
16299   __ Fadda(h4, p1, h4, z0.VnH());
16300   END();
16301 
16302   if (CAN_RUN()) {
16303     RUN();
16304     // Sum of 1 .. n is (n + 1) * n / 2, i.e. n(n+1)/2. n is even for any
    // legal SVE vector length, so the integer division below is exact.
16305     int n = core.GetSVELaneCount(kSRegSize);
16306     ASSERT_EQUAL_FP32(2 + 3 * ((n + 1) * (n / 2)), s2);
16307 
16308     n /= 2;  // Half as many lanes.
16309     ASSERT_EQUAL_FP64(3 + -7 * ((n + 1) * (n / 2)), d3);
16310 
16311     // Sum of first n odd numbers is n^2.
16312     n = core.GetSVELaneCount(kHRegSize) / 2;  // Half are odd numbers.
16313     ASSERT_EQUAL_FP16(Float16(n * n), h4);
16314   }
16315 }
16316 
16317 TEST_SVE(sve_extract) {
16318   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16319   START();
16320 
16321   __ Index(z0.VnB(), 0, 1);
16322 
16323   __ Mov(z1, z0);
16324   __ Mov(z2, z0);
16325   __ Mov(z3, z0);
16326   __ Mov(z4, z0);
16327   __ Mov(z5, z0);
16328   __ Mov(z6, z0);
16329 
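  // An index greater than or equal to the vector length in bytes is treated
  // as zero, so the first source is returned unchanged; the expected values
  // below rely on this for the shorter vector lengths.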
16330   __ Ext(z1, z1, z0, 0);
16331   __ Ext(z2, z2, z0, 1);
16332   __ Ext(z3, z3, z0, 15);
16333   __ Ext(z4, z4, z0, 31);
16334   __ Ext(z5, z5, z0, 47);
16335   __ Ext(z6, z6, z0, 255);
16336 
16337   END();
16338 
16339   if (CAN_RUN()) {
16340     RUN();
16341 
16342     ASSERT_EQUAL_SVE(z1, z0);
16343 
16344     int lane_count = core.GetSVELaneCount(kBRegSize);
16345     if (lane_count == 16) {
16346       uint64_t z2_expected[] = {0x000f0e0d0c0b0a09, 0x0807060504030201};
16347       ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16348     } else {
16349       uint64_t z2_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
16350       ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16351     }
16352 
16353     if (lane_count == 16) {
16354       uint64_t z3_expected[] = {0x0e0d0c0b0a090807, 0x060504030201000f};
16355       ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16356     } else {
16357       uint64_t z3_expected[] = {0x1e1d1c1b1a191817, 0x161514131211100f};
16358       ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16359     }
16360 
16361     if (lane_count < 32) {
16362       ASSERT_EQUAL_SVE(z4, z0);
16363     } else if (lane_count == 32) {
16364       uint64_t z4_expected[] = {0x0e0d0c0b0a090807, 0x060504030201001f};
16365       ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16366     } else {
16367       uint64_t z4_expected[] = {0x2e2d2c2b2a292827, 0x262524232221201f};
16368       ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16369     }
16370 
16371     if (lane_count < 48) {
16372       ASSERT_EQUAL_SVE(z5, z0);
16373     } else if (lane_count == 48) {
16374       uint64_t z5_expected[] = {0x0e0d0c0b0a090807, 0x060504030201002f};
16375       ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16376     } else {
16377       uint64_t z5_expected[] = {0x3e3d3c3b3a393837, 0x363534333231302f};
16378       ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16379     }
16380 
16381     if (lane_count < 256) {
16382       ASSERT_EQUAL_SVE(z6, z0);
16383     } else {
16384       uint64_t z6_expected[] = {0x0e0d0c0b0a090807, 0x06050403020100ff};
16385       ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16386     }
16387   }
16388 }
16389 
16390 TEST_SVE(sve_fp_paired_across) {
16391   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16392 
16393   START();
16394 
16395   __ Ptrue(p0.VnB());
16396   __ Pfalse(p1.VnB());
16397   __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
16398   __ Zip1(p3.VnD(), p0.VnD(), p1.VnD());
16399   __ Zip1(p4.VnH(), p0.VnH(), p1.VnH());
16400 
16401   __ Index(z0.VnS(), 3, 3);
16402   __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16403   __ Faddv(s1, p0, z0.VnS());
16404   __ Fminv(s2, p2, z0.VnS());
16405   __ Fmaxv(s3, p2, z0.VnS());
16406 
16407   __ Index(z0.VnD(), -7, -7);
16408   __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16409   __ Faddv(d4, p0, z0.VnD());
16410   __ Fminv(d5, p3, z0.VnD());
16411   __ Fmaxv(d6, p3, z0.VnD());
16412 
16413   __ Index(z0.VnH(), 1, 1);
16414   __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16415   __ Faddv(h7, p4, z0.VnH());
16416   __ Fminv(h8, p4, z0.VnH());
16417   __ Fmaxv(h9, p4, z0.VnH());
16418 
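  // Build a vector of default NaNs (0.0 / 0.0), insert +42 and -42, then
  // rotate by one lane: +42 ends up in lane 0 (active in the even-lane
  // predicate) and -42 in the top, odd-numbered lane (inactive). Fmaxnmv and
  // Fminnmv ignore the quiet NaN lanes, so only the predicate determines
  // whether the -42 lane is seen. The same pattern repeats for S and D lanes.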
16419   __ Dup(z10.VnH(), 0);
16420   __ Fdiv(z10.VnH(), p0.Merging(), z10.VnH(), z10.VnH());
16421   __ Insr(z10.VnH(), 0x5140);
16422   __ Insr(z10.VnH(), 0xd140);
16423   __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 2);
16424   __ Fmaxnmv(h11, p0, z10.VnH());
16425   __ Fmaxnmv(h12, p4, z10.VnH());
16426   __ Fminnmv(h13, p0, z10.VnH());
16427   __ Fminnmv(h14, p4, z10.VnH());
16428 
16429   __ Dup(z10.VnS(), 0);
16430   __ Fdiv(z10.VnS(), p0.Merging(), z10.VnS(), z10.VnS());
16431   __ Insr(z10.VnS(), 0x42280000);
16432   __ Insr(z10.VnS(), 0xc2280000);
16433   __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 4);
16434   __ Fmaxnmv(s15, p0, z10.VnS());
16435   __ Fmaxnmv(s16, p2, z10.VnS());
16436   __ Fminnmv(s17, p0, z10.VnS());
16437   __ Fminnmv(s18, p2, z10.VnS());
16438 
16439   __ Dup(z10.VnD(), 0);
16440   __ Fdiv(z10.VnD(), p0.Merging(), z10.VnD(), z10.VnD());
16441   __ Insr(z10.VnD(), 0x4045000000000000);
16442   __ Insr(z10.VnD(), 0xc045000000000000);
16443   __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 8);
16444   __ Fmaxnmv(d19, p0, z10.VnD());
16445   __ Fmaxnmv(d20, p3, z10.VnD());
16446   __ Fminnmv(d21, p0, z10.VnD());
16447   __ Fminnmv(d22, p3, z10.VnD());
16448   END();
16449 
16450   if (CAN_RUN()) {
16451     RUN();
16452     // Sum of 1 .. n is (n + 1) * n / 2, i.e. n(n+1)/2. n is even for any
    // legal SVE vector length, so the integer division below is exact.
16453     int n = core.GetSVELaneCount(kSRegSize);
16454     ASSERT_EQUAL_FP32(3 * ((n + 1) * (n / 2)), s1);
16455     ASSERT_EQUAL_FP32(3, s2);
16456     ASSERT_EQUAL_FP32(3 * n - 3, s3);
16457 
16458     n /= 2;  // Half as many lanes.
16459     ASSERT_EQUAL_FP64(-7 * ((n + 1) * (n / 2)), d4);
16460     ASSERT_EQUAL_FP64(-7 * (n - 1), d5);
16461     ASSERT_EQUAL_FP64(-7, d6);
16462 
16463     // Sum of first n odd numbers is n^2.
16464     n = core.GetSVELaneCount(kHRegSize) / 2;  // Half are odd numbers.
16465     ASSERT_EQUAL_FP16(Float16(n * n), h7);
16466     ASSERT_EQUAL_FP16(Float16(1), h8);
16467 
16468     n = core.GetSVELaneCount(kHRegSize);
16469     ASSERT_EQUAL_FP16(Float16(n - 1), h9);
16470 
16471     ASSERT_EQUAL_FP16(Float16(42), h11);
16472     ASSERT_EQUAL_FP16(Float16(42), h12);
16473     ASSERT_EQUAL_FP16(Float16(-42), h13);
16474     ASSERT_EQUAL_FP16(Float16(42), h14);
16475     ASSERT_EQUAL_FP32(42, s15);
16476     ASSERT_EQUAL_FP32(42, s16);
16477     ASSERT_EQUAL_FP32(-42, s17);
16478     ASSERT_EQUAL_FP32(42, s18);
16479     ASSERT_EQUAL_FP64(42, d19);
16480     ASSERT_EQUAL_FP64(42, d20);
16481     ASSERT_EQUAL_FP64(-42, d21);
16482     ASSERT_EQUAL_FP64(42, d22);
16483   }
16484 }
16485 
16486 TEST_SVE(sve_frecpe_frsqrte) {
16487   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16488 
16489   START();
16490 
16491   __ Ptrue(p0.VnB());
16492 
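  // Fscale computes 1.0 * 2^lane_index, giving a vector of increasing powers
  // of two, and Insr then shifts a zero into lane 0. Frecpe and Frsqrte of
  // zero both return +infinity, as the expected values below show.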
16493   __ Index(z0.VnH(), 0, 1);
16494   __ Fdup(z1.VnH(), Float16(1));
16495   __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16496   __ Insr(z1.VnH(), 0);
16497   __ Frsqrte(z2.VnH(), z1.VnH());
16498   __ Frecpe(z1.VnH(), z1.VnH());
16499 
16500   __ Index(z0.VnS(), 0, 1);
16501   __ Fdup(z3.VnS(), Float16(1));
16502   __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16503   __ Insr(z3.VnS(), 0);
16504   __ Frsqrte(z4.VnS(), z3.VnS());
16505   __ Frecpe(z3.VnS(), z3.VnS());
16506 
16507   __ Index(z0.VnD(), 0, 1);
16508   __ Fdup(z5.VnD(), Float16(1));
16509   __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16510   __ Insr(z5.VnD(), 0);
16511   __ Frsqrte(z6.VnD(), z5.VnD());
16512   __ Frecpe(z5.VnD(), z5.VnD());
16513   END();
16514 
16515   if (CAN_RUN()) {
16516     RUN();
16517     uint64_t z1_expected[] = {0x23fc27fc2bfc2ffc, 0x33fc37fc3bfc7c00};
16518     ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16519     uint64_t z2_expected[] = {0x2ffc31a433fc35a4, 0x37fc39a43bfc7c00};
16520     ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16521 
16522     uint64_t z3_expected[] = {0x3e7f80003eff8000, 0x3f7f80007f800000};
16523     ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16524     uint64_t z4_expected[] = {0x3eff80003f348000, 0x3f7f80007f800000};
16525     ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16526 
16527     uint64_t z5_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16528     ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16529     uint64_t z6_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16530     ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16531   }
16532 }
16533 
16534 TEST_SVE(sve_frecps_frsqrts) {
16535   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16536 
16537   START();
16538   __ Ptrue(p0.VnB());
16539 
16540   __ Index(z0.VnH(), 0, -1);
16541   __ Fdup(z1.VnH(), Float16(1));
16542   __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16543   __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16544   __ Insr(z1.VnH(), 0);
16545   __ Frsqrts(z2.VnH(), z1.VnH(), z0.VnH());
16546   __ Frecps(z1.VnH(), z1.VnH(), z0.VnH());
16547 
16548   __ Index(z0.VnS(), 0, -1);
16549   __ Fdup(z3.VnS(), Float16(1));
16550   __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16551   __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16552   __ Insr(z3.VnS(), 0);
16553   __ Frsqrts(z4.VnS(), z3.VnS(), z0.VnS());
16554   __ Frecps(z3.VnS(), z3.VnS(), z0.VnS());
16555 
16556   __ Index(z0.VnD(), 0, -1);
16557   __ Fdup(z5.VnD(), Float16(1));
16558   __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16559   __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16560   __ Insr(z5.VnD(), 0);
16561   __ Frsqrts(z6.VnD(), z5.VnD(), z0.VnD());
16562   __ Frecps(z5.VnD(), z5.VnD(), z0.VnD());
16563   END();
16564 
16565   if (CAN_RUN()) {
16566     RUN();
16567     uint64_t z1_expected[] = {0x4038406040a04100, 0x4180420042004000};
16568     ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16569     uint64_t z2_expected[] = {0x3e383e603ea03f00, 0x3f80400040003e00};
16570     ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16571 
16572     uint64_t z3_expected[] = {0x4030000040400000, 0x4040000040000000};
16573     ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16574     uint64_t z4_expected[] = {0x3ff0000040000000, 0x400000003fc00000};
16575     ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16576 
16577     uint64_t z5_expected[] = {0x4008000000000000, 0x4000000000000000};
16578     ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16579     uint64_t z6_expected[] = {0x4000000000000000, 0x3ff8000000000000};
16580     ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16581   }
16582 }
16583 
16584 TEST_SVE(sve_ftsmul) {
16585   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16586 
16587   START();
16588   __ Ptrue(p0.VnB());
16589 
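  // Ftsmul squares each zn lane and copies the sign from bit 0 of the integer
  // value in the corresponding zm lane; NaN operands in zn produce the
  // default NaN, which z4, z6 and z8 (built from 0.0 / 0.0) check below.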
16590   __ Index(z0.VnH(), 0, 1);
16591   __ Rev(z1.VnH(), z0.VnH());
16592   __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16593   __ Dup(z2.VnH(), 0);
16594   __ Fdiv(z2.VnH(), p0.Merging(), z2.VnH(), z2.VnH());
16595   __ Ftsmul(z3.VnH(), z0.VnH(), z1.VnH());
16596   __ Ftsmul(z4.VnH(), z2.VnH(), z1.VnH());
16597 
16598   __ Index(z0.VnS(), -7, 1);
16599   __ Rev(z1.VnS(), z0.VnS());
16600   __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16601   __ Dup(z2.VnS(), 0);
16602   __ Fdiv(z2.VnS(), p0.Merging(), z2.VnS(), z2.VnS());
16603   __ Ftsmul(z5.VnS(), z0.VnS(), z1.VnS());
16604   __ Ftsmul(z6.VnS(), z2.VnS(), z1.VnS());
16605 
16606   __ Index(z0.VnD(), 2, -1);
16607   __ Rev(z1.VnD(), z0.VnD());
16608   __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16609   __ Dup(z2.VnD(), 0);
16610   __ Fdiv(z2.VnD(), p0.Merging(), z2.VnD(), z2.VnD());
16611   __ Ftsmul(z7.VnD(), z0.VnD(), z1.VnD());
16612   __ Ftsmul(z8.VnD(), z2.VnD(), z1.VnD());
16613   END();
16614 
16615   if (CAN_RUN()) {
16616     RUN();
16617     uint64_t z3_expected[] = {0x5220d0804e40cc00, 0x4880c4003c008000};
16618     ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16619     uint64_t z4_expected[] = {0x7e007e007e007e00, 0x7e007e007e007e00};
16620     ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16621 
16622     uint64_t z5_expected[] = {0xc180000041c80000, 0xc210000042440000};
16623     ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16624     uint64_t z6_expected[] = {0x7fc000007fc00000, 0x7fc000007fc00000};
16625     ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16626 
16627     uint64_t z7_expected[] = {0x3ff0000000000000, 0xc010000000000000};
16628     ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
16629     uint64_t z8_expected[] = {0x7ff8000000000000, 0x7ff8000000000000};
16630     ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
16631   }
16632 }
16633 
16634 typedef void (MacroAssembler::*FPMulAccFn)(
16635     const ZRegister& zd,
16636     const PRegisterM& pg,
16637     const ZRegister& za,
16638     const ZRegister& zn,
16639     const ZRegister& zm,
16640     FPMacroNaNPropagationOption nan_option);
16641 
16642 // `pg_inputs` is used internally to check that predication is applied
16643 // correctly. It does not affect the `result` argument: `result` holds the
16644 // expected results under an all-true predicate.
16645 template <typename T, size_t N>
16646 static void FPMulAccHelper(
16647     Test* config,
16648     FPMulAccFn macro,
16649     unsigned lane_size_in_bits,
16650     const int (&pg_inputs)[N],
16651     const T (&za_inputs)[N],
16652     const T (&zn_inputs)[N],
16653     const T (&zm_inputs)[N],
16654     const uint64_t (&result)[N],
16655     FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
16656   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16657   START();
16658 
16659   ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
16660   ZRegister za = z1.WithLaneSize(lane_size_in_bits);
16661   ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
16662   ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
16663 
16664   uint64_t za_rawbits[N];
16665   uint64_t zn_rawbits[N];
16666   uint64_t zm_rawbits[N];
16667 
16668   FPToRawbitsWithSize(za_inputs, za_rawbits, lane_size_in_bits);
16669   FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
16670   FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
16671 
16672   InsrHelper(&masm, za, za_rawbits);
16673   InsrHelper(&masm, zn, zn_rawbits);
16674   InsrHelper(&masm, zm, zm_rawbits);
16675 
16676   // Initialize `zd` with a signalling NaN.
16677   uint64_t sn = GetSignallingNan(lane_size_in_bits);
16678   __ Mov(x29, sn);
16679   __ Dup(zd, x29);
16680 
16681   Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
16682 
16683   // Depending on which registers are aliased, each macro selects between:
16684   //   Fmla  -> fmla,  fmad,  or movprfx + fmla
16685   //   Fmls  -> fmls,  fmsb,  or movprfx + fmls
16686   //   Fnmla -> fnmla, fnmad, or movprfx + fnmla
16687   //   Fnmls -> fnmls, fnmsb, or movprfx + fnmls
16688   ZRegister da_result = z10.WithLaneSize(lane_size_in_bits);
16689   ZRegister dn_result = z11.WithLaneSize(lane_size_in_bits);
16690   ZRegister dm_result = z12.WithLaneSize(lane_size_in_bits);
16691   ZRegister d_result = z13.WithLaneSize(lane_size_in_bits);
16692 
16693   __ Mov(da_result, za);
16694   (masm.*macro)(da_result, p0.Merging(), da_result, zn, zm, nan_option);
16695 
16696   __ Mov(dn_result, zn);
16697   (masm.*macro)(dn_result, p0.Merging(), za, dn_result, zm, nan_option);
16698 
16699   __ Mov(dm_result, zm);
16700   (masm.*macro)(dm_result, p0.Merging(), za, zn, dm_result, nan_option);
16701 
16702   __ Mov(d_result, zd);
16703   (masm.*macro)(d_result, p0.Merging(), za, zn, zm, nan_option);
16704 
16705   END();
16706 
16707   if (CAN_RUN()) {
16708     RUN();
16709 
16710     ASSERT_EQUAL_SVE(za_rawbits, za);
16711     ASSERT_EQUAL_SVE(zn_rawbits, zn);
16712     ASSERT_EQUAL_SVE(zm_rawbits, zm);
16713 
16714     uint64_t da_expected[N];
16715     uint64_t dn_expected[N];
16716     uint64_t dm_expected[N];
16717     uint64_t d_expected[N];
16718     for (size_t i = 0; i < N; i++) {
16719       da_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : za_rawbits[i];
16720       dn_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zn_rawbits[i];
16721       dm_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zm_rawbits[i];
16722       d_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : sn;
16723     }
16724 
16725     ASSERT_EQUAL_SVE(da_expected, da_result);
16726     ASSERT_EQUAL_SVE(dn_expected, dn_result);
16727     ASSERT_EQUAL_SVE(dm_expected, dm_result);
16728     ASSERT_EQUAL_SVE(d_expected, d_result);
16729   }
16730 }
16731 
16732 TEST_SVE(sve_fmla_fmad) {
16733   // fmla : zd = za + zn * zm
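  // e.g. lane 0: -39.0 + (-5.0 * 9.0) = -84.0.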
16734   double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16735   double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16736   double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16737   int pg_inputs[] = {1, 1, 0, 1};
16738 
16739   uint64_t fmla_result_h[] = {Float16ToRawbits(Float16(-84.0)),
16740                               Float16ToRawbits(Float16(101.0)),
16741                               Float16ToRawbits(Float16(33.0)),
16742                               Float16ToRawbits(Float16(42.0))};
16743 
16744   // `fmad` has been tested in the helper.
16745   FPMulAccHelper(config,
16746                  &MacroAssembler::Fmla,
16747                  kHRegSize,
16748                  pg_inputs,
16749                  za_inputs,
16750                  zn_inputs,
16751                  zm_inputs,
16752                  fmla_result_h);
16753 
16754   uint64_t fmla_result_s[] = {FloatToRawbits(-84.0f),
16755                               FloatToRawbits(101.0f),
16756                               FloatToRawbits(33.0f),
16757                               FloatToRawbits(42.0f)};
16758 
16759   FPMulAccHelper(config,
16760                  &MacroAssembler::Fmla,
16761                  kSRegSize,
16762                  pg_inputs,
16763                  za_inputs,
16764                  zn_inputs,
16765                  zm_inputs,
16766                  fmla_result_s);
16767 
16768   uint64_t fmla_result_d[] = {DoubleToRawbits(-84.0),
16769                               DoubleToRawbits(101.0),
16770                               DoubleToRawbits(33.0),
16771                               DoubleToRawbits(42.0)};
16772 
16773   FPMulAccHelper(config,
16774                  &MacroAssembler::Fmla,
16775                  kDRegSize,
16776                  pg_inputs,
16777                  za_inputs,
16778                  zn_inputs,
16779                  zm_inputs,
16780                  fmla_result_d);
16781 }
16782 
16783 TEST_SVE(sve_fmls_fmsb) {
16784   // fmls : zd = za - zn * zm
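  // e.g. lane 0: -39.0 - (-5.0 * 9.0) = 6.0.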
16785   double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16786   double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16787   double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16788   int pg_inputs[] = {1, 0, 1, 1};
16789 
16790   uint64_t fmls_result_h[] = {Float16ToRawbits(Float16(6.0)),
16791                               Float16ToRawbits(Float16(-99.0)),
16792                               Float16ToRawbits(Float16(-39.0)),
16793                               Float16ToRawbits(Float16(-38.0))};
16794 
16795   // `fmsb` has been tested in the helper.
16796   FPMulAccHelper(config,
16797                  &MacroAssembler::Fmls,
16798                  kHRegSize,
16799                  pg_inputs,
16800                  za_inputs,
16801                  zn_inputs,
16802                  zm_inputs,
16803                  fmls_result_h);
16804 
16805   uint64_t fmls_result_s[] = {FloatToRawbits(6.0f),
16806                               FloatToRawbits(-99.0f),
16807                               FloatToRawbits(-39.0f),
16808                               FloatToRawbits(-38.0f)};
16809 
16810   FPMulAccHelper(config,
16811                  &MacroAssembler::Fmls,
16812                  kSRegSize,
16813                  pg_inputs,
16814                  za_inputs,
16815                  zn_inputs,
16816                  zm_inputs,
16817                  fmls_result_s);
16818 
16819   uint64_t fmls_result_d[] = {DoubleToRawbits(6.0),
16820                               DoubleToRawbits(-99.0),
16821                               DoubleToRawbits(-39.0),
16822                               DoubleToRawbits(-38.0)};
16823 
16824   FPMulAccHelper(config,
16825                  &MacroAssembler::Fmls,
16826                  kDRegSize,
16827                  pg_inputs,
16828                  za_inputs,
16829                  zn_inputs,
16830                  zm_inputs,
16831                  fmls_result_d);
16832 }
16833 
16834 TEST_SVE(sve_fnmla_fnmad) {
16835   // fnmla : zd = -za - zn * zm
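  // e.g. lane 0: -(-39.0) - (-5.0 * 9.0) = 84.0.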
16836   double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16837   double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16838   double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16839   int pg_inputs[] = {0, 1, 1, 1};
16840 
16841   uint64_t fnmla_result_h[] = {Float16ToRawbits(Float16(84.0)),
16842                                Float16ToRawbits(Float16(-101.0)),
16843                                Float16ToRawbits(Float16(-33.0)),
16844                                Float16ToRawbits(Float16(-42.0))};
16845 
16846   // `fnmad` has been tested in the helper.
16847   FPMulAccHelper(config,
16848                  &MacroAssembler::Fnmla,
16849                  kHRegSize,
16850                  pg_inputs,
16851                  za_inputs,
16852                  zn_inputs,
16853                  zm_inputs,
16854                  fnmla_result_h);
16855 
16856   uint64_t fnmla_result_s[] = {FloatToRawbits(84.0f),
16857                                FloatToRawbits(-101.0f),
16858                                FloatToRawbits(-33.0f),
16859                                FloatToRawbits(-42.0f)};
16860 
16861   FPMulAccHelper(config,
16862                  &MacroAssembler::Fnmla,
16863                  kSRegSize,
16864                  pg_inputs,
16865                  za_inputs,
16866                  zn_inputs,
16867                  zm_inputs,
16868                  fnmla_result_s);
16869 
16870   uint64_t fnmla_result_d[] = {DoubleToRawbits(84.0),
16871                                DoubleToRawbits(-101.0),
16872                                DoubleToRawbits(-33.0),
16873                                DoubleToRawbits(-42.0)};
16874 
16875   FPMulAccHelper(config,
16876                  &MacroAssembler::Fnmla,
16877                  kDRegSize,
16878                  pg_inputs,
16879                  za_inputs,
16880                  zn_inputs,
16881                  zm_inputs,
16882                  fnmla_result_d);
16883 }
16884 
16885 TEST_SVE(sve_fnmls_fnmsb) {
16886   // fnmls : zd = -za + zn * zm
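  // e.g. lane 0: -(-39.0) + (-5.0 * 9.0) = -6.0.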
16887   double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16888   double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16889   double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16890   int pg_inputs[] = {1, 1, 1, 0};
16891 
16892   uint64_t fnmls_result_h[] = {Float16ToRawbits(Float16(-6.0)),
16893                                Float16ToRawbits(Float16(99.0)),
16894                                Float16ToRawbits(Float16(39.0)),
16895                                Float16ToRawbits(Float16(38.0))};
16896 
16897   // `fnmsb` has been tested in the helper.
16898   FPMulAccHelper(config,
16899                  &MacroAssembler::Fnmls,
16900                  kHRegSize,
16901                  pg_inputs,
16902                  za_inputs,
16903                  zn_inputs,
16904                  zm_inputs,
16905                  fnmls_result_h);
16906 
16907   uint64_t fnmls_result_s[] = {FloatToRawbits(-6.0f),
16908                                FloatToRawbits(99.0f),
16909                                FloatToRawbits(39.0f),
16910                                FloatToRawbits(38.0f)};
16911 
16912   FPMulAccHelper(config,
16913                  &MacroAssembler::Fnmls,
16914                  kSRegSize,
16915                  pg_inputs,
16916                  za_inputs,
16917                  zn_inputs,
16918                  zm_inputs,
16919                  fnmls_result_s);
16920 
16921   uint64_t fnmls_result_d[] = {DoubleToRawbits(-6.0),
16922                                DoubleToRawbits(99.0),
16923                                DoubleToRawbits(39.0),
16924                                DoubleToRawbits(38.0)};
16925 
16926   FPMulAccHelper(config,
16927                  &MacroAssembler::Fnmls,
16928                  kDRegSize,
16929                  pg_inputs,
16930                  za_inputs,
16931                  zn_inputs,
16932                  zm_inputs,
16933                  fnmls_result_d);
16934 }
16935 
16936 typedef void (MacroAssembler::*FPMulAccIdxFn)(const ZRegister& zd,
16937                                               const ZRegister& za,
16938                                               const ZRegister& zn,
16939                                               const ZRegister& zm,
16940                                               int index);
16941 
16942 template <typename T, size_t N>
16943 static void FPMulAccIdxHelper(Test* config,
16944                               FPMulAccFn macro,
16945                               FPMulAccIdxFn macro_idx,
16946                               const T (&za_inputs)[N],
16947                               const T (&zn_inputs)[N],
16948                               const T (&zm_inputs)[N]) {
16949   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16950   START();
16951 
16952   __ Ptrue(p0.VnB());
16953 
16954   // Repeat the zm inputs enough times to fill the maximum (2048-bit) VL.
16955   for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i += N) {
16956     InsrHelper(&masm, z30.VnD(), zm_inputs);
16957   }
16958 
16959   FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z30.VnH());
16960 
16961   InsrHelper(&masm, z1.VnD(), zn_inputs);
16962   InsrHelper(&masm, z2.VnD(), za_inputs);
16963 
16964   __ Mov(z3, z0);
16965   (masm.*macro_idx)(z3.VnH(), z2.VnH(), z1.VnH(), z3.VnH(), 0);  // zd == zm
16966   __ Mov(z4, z1);
16967   (masm.*macro_idx)(z4.VnH(), z2.VnH(), z4.VnH(), z0.VnH(), 1);  // zd == zn
16968   __ Mov(z5, z2);
16969   (masm.*macro_idx)(z5.VnH(), z5.VnH(), z1.VnH(), z0.VnH(), 4);  // zd == za
16970   (masm.*macro_idx)(z6.VnH(), z2.VnH(), z1.VnH(), z0.VnH(), 7);
16971 
16972   FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z30.VnS());
16973 
16974   __ Mov(z7, z0);
16975   (masm.*macro_idx)(z7.VnS(), z2.VnS(), z1.VnS(), z7.VnS(), 0);  // zd == zm
16976   __ Mov(z8, z1);
16977   (masm.*macro_idx)(z8.VnS(), z2.VnS(), z8.VnS(), z0.VnS(), 1);  // zd == zn
16978   __ Mov(z9, z2);
16979   (masm.*macro_idx)(z9.VnS(), z9.VnS(), z1.VnS(), z0.VnS(), 2);  // zd == za
16980   (masm.*macro_idx)(z10.VnS(), z2.VnS(), z1.VnS(), z0.VnS(), 3);
16981 
16982   FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
16983 
16984   __ Mov(z11, z0);
16985   (masm.*macro_idx)(z11.VnD(), z2.VnD(), z1.VnD(), z11.VnD(), 0);  // zd == zm
16986   __ Mov(z12, z1);
16987   (masm.*macro_idx)(z12.VnD(), z2.VnD(), z12.VnD(), z0.VnD(), 1);  // zd == zn
16988   __ Mov(z13, z2);
16989   (masm.*macro_idx)(z13.VnD(), z13.VnD(), z1.VnD(), z0.VnD(), 0);  // zd == za
16990   __ Mov(z14, z0);
16991   // zd == zn == zm
16992   (masm.*macro_idx)(z14.VnD(), z2.VnD(), z14.VnD(), z14.VnD(), 1);
16993 
16994   // The indexed forms of Fmla and Fmls never swap arguments, so pass strict
16995   // NaN propagation mode to ensure that the vector-form macros used below as
16996   // references never swap arguments either.
16997   FPMacroNaNPropagationOption option = StrictNaNPropagation;
16998   // Compute the results using other instructions.
16999   __ Dup(z0.VnH(), z30.VnH(), 0);
17000   FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17001   (masm.*macro)(z15.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17002   __ Dup(z0.VnH(), z30.VnH(), 1);
17003   FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17004   (masm.*macro)(z16.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17005   __ Dup(z0.VnH(), z30.VnH(), 4);
17006   FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17007   (masm.*macro)(z17.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17008   __ Dup(z0.VnH(), z30.VnH(), 7);
17009   FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17010   (masm.*macro)(z18.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17011 
17012   __ Dup(z0.VnS(), z30.VnS(), 0);
17013   FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17014   (masm.*macro)(z19.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17015   __ Dup(z0.VnS(), z30.VnS(), 1);
17016   FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17017   (masm.*macro)(z20.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17018   __ Dup(z0.VnS(), z30.VnS(), 2);
17019   FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17020   (masm.*macro)(z21.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17021   __ Dup(z0.VnS(), z30.VnS(), 3);
17022   FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17023   (masm.*macro)(z22.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17024 
17025   __ Dup(z0.VnD(), z30.VnD(), 0);
17026   FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
17027   (masm.*macro)(z23.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
17028   __ Dup(z0.VnD(), z30.VnD(), 1);
17029   FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
17030   (masm.*macro)(z24.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
17031   FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
17032   __ Dup(z29.VnD(), z30.VnD(), 1);
17033   FPSegmentPatternHelper(&masm, z29.VnD(), p0.Merging(), z29.VnD());
17034   (masm.*macro)(z25.VnD(), p0.Merging(), z2.VnD(), z0.VnD(), z29.VnD(), option);
17035 
17036   END();
17037 
17038   if (CAN_RUN()) {
17039     RUN();
17040 
17041     ASSERT_EQUAL_SVE(z15.VnH(), z3.VnH());
17042     ASSERT_EQUAL_SVE(z16.VnH(), z4.VnH());
17043     ASSERT_EQUAL_SVE(z17.VnH(), z5.VnH());
17044     ASSERT_EQUAL_SVE(z18.VnH(), z6.VnH());
17045 
17046     ASSERT_EQUAL_SVE(z19.VnS(), z7.VnS());
17047     ASSERT_EQUAL_SVE(z20.VnS(), z8.VnS());
17048     ASSERT_EQUAL_SVE(z21.VnS(), z9.VnS());
17049     ASSERT_EQUAL_SVE(z22.VnS(), z10.VnS());
17050 
17051     ASSERT_EQUAL_SVE(z23.VnD(), z11.VnD());
17052     ASSERT_EQUAL_SVE(z24.VnD(), z12.VnD());
17053     ASSERT_EQUAL_SVE(z11.VnD(), z13.VnD());
17054     ASSERT_EQUAL_SVE(z25.VnD(), z14.VnD());
17055   }
17056 }
17057 
17058 TEST_SVE(sve_fmla_fmls_index) {
17059   uint64_t zm_inputs_1[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
17060   uint64_t zn_inputs_1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
17061   uint64_t za_inputs_1[] = {0x3c004000bc00c000, 0x64006800e400e800};
17062 
17063   // Using the vector form of Fmla and Fmls to verify the indexed form.
17064   FPMulAccIdxHelper(config,
17065                     &MacroAssembler::Fmla,  // vector form
17066                     &MacroAssembler::Fmla,  // indexed form
17067                     za_inputs_1,
17068                     zn_inputs_1,
17069                     zm_inputs_1);
17070 
17071   FPMulAccIdxHelper(config,
17072                     &MacroAssembler::Fmls,  // vector form
17073                     &MacroAssembler::Fmls,  // indexed form
17074                     za_inputs_1,
17075                     zn_inputs_1,
17076                     zm_inputs_1);
17077 
17078   uint64_t zm_inputs_2[] = {0x7ff5555511111111,   // NaN
17079                             0xfff0000000000000};  // Infinity
17080   uint64_t zn_inputs_2[] = {0x7f9511117fc00000,   // NaN
17081                             0x7f800000ff800000};  // Infinity
17082   uint64_t za_inputs_2[] = {0x7c11000000007e00,   // NaN
17083                             0x000000007c00fc00};  // Infinity
17084   FPMulAccIdxHelper(config,
17085                     &MacroAssembler::Fmla,  // vector form
17086                     &MacroAssembler::Fmla,  // indexed form
17087                     za_inputs_2,
17088                     zn_inputs_2,
17089                     zm_inputs_2);
17090 
17091   FPMulAccIdxHelper(config,
17092                     &MacroAssembler::Fmls,  // vector form
17093                     &MacroAssembler::Fmls,  // indexed form
17094                     za_inputs_2,
17095                     zn_inputs_2,
17096                     zm_inputs_2);
17097 }
17098 
17099 // Execute a number of instructions which all use ProcessNaNs, and check that
17100 // they all propagate NaNs correctly.
17101 template <typename Ti, typename Td, size_t N>
17102 static void ProcessNaNsHelper(Test* config,
17103                               int lane_size_in_bits,
17104                               const Ti (&zn_inputs)[N],
17105                               const Ti (&zm_inputs)[N],
17106                               const Td (&zd_expected)[N],
17107                               FPMacroNaNPropagationOption nan_option) {
17108   ArithFn arith_unpredicated_macro[] = {&MacroAssembler::Fadd,
17109                                         &MacroAssembler::Fsub,
17110                                         &MacroAssembler::Fmul};
17111 
17112   for (size_t i = 0; i < ArrayLength(arith_unpredicated_macro); i++) {
17113     FPBinArithHelper(config,
17114                      arith_unpredicated_macro[i],
17115                      lane_size_in_bits,
17116                      zn_inputs,
17117                      zm_inputs,
17118                      zd_expected);
17119   }
17120 
17121   FPArithPredicatedFn arith_predicated_macro[] = {&MacroAssembler::Fmax,
17122                                                   &MacroAssembler::Fmin};
17123   int pg_inputs[N];
17124   // With an all-true predicate, this helper aims to compare with special
17125   // numbers.
17126   for (size_t i = 0; i < N; i++) {
17127     pg_inputs[i] = 1;
17128   }
17129 
17130   // fdivr propagates the quotient (Zm) preferentially, so we don't actually
17131   // need any special handling for StrictNaNPropagation.
17132   FPBinArithHelper(config,
17133                    NULL,
17134                    &MacroAssembler::Fdiv,
17135                    lane_size_in_bits,
17136                    // With an all-true predicate, the value in zd is
17137                    // irrelevant to the operations.
17138                    zn_inputs,
17139                    pg_inputs,
17140                    zn_inputs,
17141                    zm_inputs,
17142                    zd_expected);
17143 
17144   for (size_t i = 0; i < ArrayLength(arith_predicated_macro); i++) {
17145     FPBinArithHelper(config,
17146                      arith_predicated_macro[i],
17147                      NULL,
17148                      lane_size_in_bits,
17149                      // With an all-true predicate, the value in zd is
17150                      // irrelevant to the operations.
17151                      zn_inputs,
17152                      pg_inputs,
17153                      zn_inputs,
17154                      zm_inputs,
17155                      zd_expected,
17156                      nan_option);
17157   }
17158 }
17159 
17160 template <typename Ti, typename Td, size_t N>
17161 static void ProcessNaNsHelper3(Test* config,
17162                                int lane_size_in_bits,
17163                                const Ti (&za_inputs)[N],
17164                                const Ti (&zn_inputs)[N],
17165                                const Ti (&zm_inputs)[N],
17166                                const Td (&zd_expected_fmla)[N],
17167                                const Td (&zd_expected_fmls)[N],
17168                                const Td (&zd_expected_fnmla)[N],
17169                                const Td (&zd_expected_fnmls)[N],
17170                                FPMacroNaNPropagationOption nan_option) {
17171   int pg_inputs[N];
17172   // With an all-true predicate, this helper aims to compare with special
17173   // numbers.
17174   for (size_t i = 0; i < N; i++) {
17175     pg_inputs[i] = 1;
17176   }
17177 
17178   FPMulAccHelper(config,
17179                  &MacroAssembler::Fmla,
17180                  lane_size_in_bits,
17181                  pg_inputs,
17182                  za_inputs,
17183                  zn_inputs,
17184                  zm_inputs,
17185                  zd_expected_fmla,
17186                  nan_option);
17187 
17188   FPMulAccHelper(config,
17189                  &MacroAssembler::Fmls,
17190                  lane_size_in_bits,
17191                  pg_inputs,
17192                  za_inputs,
17193                  zn_inputs,
17194                  zm_inputs,
17195                  zd_expected_fmls,
17196                  nan_option);
17197 
17198   FPMulAccHelper(config,
17199                  &MacroAssembler::Fnmla,
17200                  lane_size_in_bits,
17201                  pg_inputs,
17202                  za_inputs,
17203                  zn_inputs,
17204                  zm_inputs,
17205                  zd_expected_fnmla,
17206                  nan_option);
17207 
17208   FPMulAccHelper(config,
17209                  &MacroAssembler::Fnmls,
17210                  lane_size_in_bits,
17211                  pg_inputs,
17212                  za_inputs,
17213                  zn_inputs,
17214                  zm_inputs,
17215                  zd_expected_fnmls,
17216                  nan_option);
17217 }
17218 
17219 TEST_SVE(sve_process_nans_double) {
17220   // Use non-standard NaNs to check that the payload bits are preserved.
17221   double sa = RawbitsToDouble(0x7ff5555511111111);
17222   double sn = RawbitsToDouble(0x7ff5555522222222);
17223   double sm = RawbitsToDouble(0x7ff5555533333333);
17224   double qa = RawbitsToDouble(0x7ffaaaaa11111111);
17225   double qn = RawbitsToDouble(0x7ffaaaaa22222222);
17226   double qm = RawbitsToDouble(0x7ffaaaaa33333333);
17227   VIXL_ASSERT(IsSignallingNaN(sa));
17228   VIXL_ASSERT(IsSignallingNaN(sn));
17229   VIXL_ASSERT(IsSignallingNaN(sm));
17230   VIXL_ASSERT(IsQuietNaN(qa));
17231   VIXL_ASSERT(IsQuietNaN(qn));
17232   VIXL_ASSERT(IsQuietNaN(qm));
17233 
17234   // The input NaNs after passing through ProcessNaN.
17235   uint64_t sa_proc = 0x7ffd555511111111;
17236   uint64_t sn_proc = 0x7ffd555522222222;
17237   uint64_t sm_proc = 0x7ffd555533333333;
17238   uint64_t qa_proc = DoubleToRawbits(qa);
17239   uint64_t qn_proc = DoubleToRawbits(qn);
17240   uint64_t qm_proc = DoubleToRawbits(qm);
17241   uint64_t sa_proc_n = sa_proc ^ kDSignMask;
17242   uint64_t sn_proc_n = sn_proc ^ kDSignMask;
17243   uint64_t qa_proc_n = qa_proc ^ kDSignMask;
17244   uint64_t qn_proc_n = qn_proc ^ kDSignMask;
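  // ProcessNaN quiets a signalling NaN by setting the top fraction bit (bit
  // 51 for doubles) and leaves quiet NaNs unchanged, hence 0x7ff5... becomes
  // 0x7ffd... above while the q*_proc values are unmodified.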
17245 
17246   // Quiet NaNs are propagated.
17247   double zn_inputs_1[] = {qn, 0.0, 0.0, qm, qn, qm};
17248   double zm_inputs_1[] = {0.0, qn, qm, 0.0, qm, qn};
17249   uint64_t zd_expected_1[] =
17250       {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17251 
17252   ProcessNaNsHelper(config,
17253                     kDRegSize,
17254                     zn_inputs_1,
17255                     zm_inputs_1,
17256                     zd_expected_1,
17257                     StrictNaNPropagation);
17258 
17259   // Signalling NaNs are propagated.
17260   double zn_inputs_2[] = {sn, 0.0, 0.0, sm, sn, sm};
17261   double zm_inputs_2[] = {0.0, sn, sm, 0.0, sm, sn};
17262   uint64_t zd_expected_2[] =
17263       {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17264   ProcessNaNsHelper(config,
17265                     kDRegSize,
17266                     zn_inputs_2,
17267                     zm_inputs_2,
17268                     zd_expected_2,
17269                     StrictNaNPropagation);
17270 
17271   // Signalling NaNs take precedence over quiet NaNs.
17272   double zn_inputs_3[] = {sn, qn, sn, sn, qn};
17273   double zm_inputs_3[] = {qm, sm, sm, qn, sn};
17274   uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17275   ProcessNaNsHelper(config,
17276                     kDRegSize,
17277                     zn_inputs_3,
17278                     zm_inputs_3,
17279                     zd_expected_3,
17280                     StrictNaNPropagation);
17281 
17282   double za_inputs_4[] = {qa, qa, 0.0, 0.0, qa, qa};
17283   double zn_inputs_4[] = {qn, 0.0, 0.0, qn, qn, qn};
17284   double zm_inputs_4[] = {0.0, qm, qm, qm, qm, 0.0};
17285 
17286   // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17287   // If `n` is propagated, its sign is inverted by fmls and fnmla.
17288   // If `m` is propagated, its sign is never inverted.
17289   uint64_t zd_expected_fmla_4[] =
17290       {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17291   uint64_t zd_expected_fmls_4[] =
17292       {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17293   uint64_t zd_expected_fnmla_4[] =
17294       {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17295   uint64_t zd_expected_fnmls_4[] =
17296       {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17297 
17298   ProcessNaNsHelper3(config,
17299                      kDRegSize,
17300                      za_inputs_4,
17301                      zn_inputs_4,
17302                      zm_inputs_4,
17303                      zd_expected_fmla_4,
17304                      zd_expected_fmls_4,
17305                      zd_expected_fnmla_4,
17306                      zd_expected_fnmls_4,
17307                      StrictNaNPropagation);
17308 
17309   // Signalling NaNs take precedence over quiet NaNs.
17310   double za_inputs_5[] = {qa, qa, sa, sa, sa};
17311   double zn_inputs_5[] = {qn, sn, sn, sn, qn};
17312   double zm_inputs_5[] = {sm, qm, sm, qa, sm};
17313   uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17314   uint64_t zd_expected_fmls_5[] = {sm_proc,
17315                                    sn_proc_n,
17316                                    sa_proc,
17317                                    sa_proc,
17318                                    sa_proc};
17319   uint64_t zd_expected_fnmla_5[] = {sm_proc,
17320                                     sn_proc_n,
17321                                     sa_proc_n,
17322                                     sa_proc_n,
17323                                     sa_proc_n};
17324   uint64_t zd_expected_fnmls_5[] = {sm_proc,
17325                                     sn_proc,
17326                                     sa_proc_n,
17327                                     sa_proc_n,
17328                                     sa_proc_n};
17329 
17330   ProcessNaNsHelper3(config,
17331                      kDRegSize,
17332                      za_inputs_5,
17333                      zn_inputs_5,
17334                      zm_inputs_5,
17335                      zd_expected_fmla_5,
17336                      zd_expected_fmls_5,
17337                      zd_expected_fnmla_5,
17338                      zd_expected_fnmls_5,
17339                      StrictNaNPropagation);
17340 
17341   const double inf = kFP64PositiveInfinity;
17342   const double inf_n = kFP64NegativeInfinity;
17343   uint64_t inf_proc = DoubleToRawbits(inf);
17344   uint64_t inf_proc_n = DoubleToRawbits(inf_n);
17345   uint64_t d_inf_proc = DoubleToRawbits(kFP64DefaultNaN);
17346 
17347   double za_inputs_6[] = {qa, qa, 0.0, -0.0, qa, sa};
17348   double zn_inputs_6[] = {inf, -0.0, -0.0, inf, inf_n, inf};
17349   double zm_inputs_6[] = {0.0, inf_n, inf, inf, inf, 0.0};
17350 
17351   // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17352   // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17353   // quiet_nan.
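  // For reference: 0.0 * inf is an IEEE-754 invalid operation, so the result
  // is the default NaN, whose raw encoding here is
  // DoubleToRawbits(kFP64DefaultNaN), i.e. 0x7ff8000000000000. An addend NaN
  // only survives when the product itself is well-defined.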
17354   uint64_t zd_expected_fmla_6[] =
17355       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17356   uint64_t zd_expected_fmls_6[] =
17357       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17358   uint64_t zd_expected_fnmla_6[] =
17359       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17360   uint64_t zd_expected_fnmls_6[] =
17361       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17362 
17363   ProcessNaNsHelper3(config,
17364                      kDRegSize,
17365                      za_inputs_6,
17366                      zn_inputs_6,
17367                      zm_inputs_6,
17368                      zd_expected_fmla_6,
17369                      zd_expected_fmls_6,
17370                      zd_expected_fnmla_6,
17371                      zd_expected_fnmls_6,
17372                      StrictNaNPropagation);
17373 }
17374 
17375 TEST_SVE(sve_process_nans_float) {
17376   // Use non-standard NaNs to check that the payload bits are preserved.
17377   float sa = RawbitsToFloat(0x7f951111);
17378   float sn = RawbitsToFloat(0x7f952222);
17379   float sm = RawbitsToFloat(0x7f953333);
17380   float qa = RawbitsToFloat(0x7fea1111);
17381   float qn = RawbitsToFloat(0x7fea2222);
17382   float qm = RawbitsToFloat(0x7fea3333);
17383   VIXL_ASSERT(IsSignallingNaN(sa));
17384   VIXL_ASSERT(IsSignallingNaN(sn));
17385   VIXL_ASSERT(IsSignallingNaN(sm));
17386   VIXL_ASSERT(IsQuietNaN(qa));
17387   VIXL_ASSERT(IsQuietNaN(qn));
17388   VIXL_ASSERT(IsQuietNaN(qm));
17389 
17390   // The input NaNs after passing through ProcessNaN.
17391   uint32_t sa_proc = 0x7fd51111;
17392   uint32_t sn_proc = 0x7fd52222;
17393   uint32_t sm_proc = 0x7fd53333;
17394   uint32_t qa_proc = FloatToRawbits(qa);
17395   uint32_t qn_proc = FloatToRawbits(qn);
17396   uint32_t qm_proc = FloatToRawbits(qm);
17397   uint32_t sa_proc_n = sa_proc ^ kSSignMask;
17398   uint32_t sn_proc_n = sn_proc ^ kSSignMask;
17399   uint32_t qa_proc_n = qa_proc ^ kSSignMask;
17400   uint32_t qn_proc_n = qn_proc ^ kSSignMask;
17401 
17402   // Quiet NaNs are propagated.
17403   float zn_inputs_1[] = {qn, 0.0f, 0.0f, qm, qn, qm};
17404   float zm_inputs_1[] = {0.0f, qn, qm, 0.0f, qm, qn};
17405   uint64_t zd_expected_1[] =
17406       {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17407 
17408   ProcessNaNsHelper(config,
17409                     kSRegSize,
17410                     zn_inputs_1,
17411                     zm_inputs_1,
17412                     zd_expected_1,
17413                     StrictNaNPropagation);
17414 
17415   // Signalling NaNs are propagated.
17416   float zn_inputs_2[] = {sn, 0.0f, 0.0f, sm, sn, sm};
17417   float zm_inputs_2[] = {0.0f, sn, sm, 0.0f, sm, sn};
17418   uint64_t zd_expected_2[] =
17419       {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17420   ProcessNaNsHelper(config,
17421                     kSRegSize,
17422                     zn_inputs_2,
17423                     zm_inputs_2,
17424                     zd_expected_2,
17425                     StrictNaNPropagation);
17426 
17427   // Signalling NaNs take precedence over quiet NaNs.
17428   float zn_inputs_3[] = {sn, qn, sn, sn, qn};
17429   float zm_inputs_3[] = {qm, sm, sm, qn, sn};
17430   uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17431   ProcessNaNsHelper(config,
17432                     kSRegSize,
17433                     zn_inputs_3,
17434                     zm_inputs_3,
17435                     zd_expected_3,
17436                     StrictNaNPropagation);
17437 
17438   float za_inputs_4[] = {qa, qa, 0.0f, 0.0f, qa, qa};
17439   float zn_inputs_4[] = {qn, 0.0f, 0.0f, qn, qn, qn};
17440   float zm_inputs_4[] = {0.0f, qm, qm, qm, qm, 0.0f};
17441 
17442   // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17443   // If `n` is propagated, its sign is inverted by fmls and fnmla.
17444   // If `m` is propagated, its sign is never inverted.
17445   uint64_t zd_expected_fmla_4[] =
17446       {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17447   uint64_t zd_expected_fmls_4[] =
17448       {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17449   uint64_t zd_expected_fnmla_4[] =
17450       {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17451   uint64_t zd_expected_fnmls_4[] =
17452       {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17453 
17454   ProcessNaNsHelper3(config,
17455                      kSRegSize,
17456                      za_inputs_4,
17457                      zn_inputs_4,
17458                      zm_inputs_4,
17459                      zd_expected_fmla_4,
17460                      zd_expected_fmls_4,
17461                      zd_expected_fnmla_4,
17462                      zd_expected_fnmls_4,
17463                      StrictNaNPropagation);
17464 
17465   // Signalling NaNs take precedence over quiet NaNs.
17466   float za_inputs_5[] = {qa, qa, sa, sa, sa};
17467   float zn_inputs_5[] = {qn, sn, sn, sn, qn};
17468   float zm_inputs_5[] = {sm, qm, sm, qa, sm};
17469   uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17470   uint64_t zd_expected_fmls_5[] = {sm_proc,
17471                                    sn_proc_n,
17472                                    sa_proc,
17473                                    sa_proc,
17474                                    sa_proc};
17475   uint64_t zd_expected_fnmla_5[] = {sm_proc,
17476                                     sn_proc_n,
17477                                     sa_proc_n,
17478                                     sa_proc_n,
17479                                     sa_proc_n};
17480   uint64_t zd_expected_fnmls_5[] = {sm_proc,
17481                                     sn_proc,
17482                                     sa_proc_n,
17483                                     sa_proc_n,
17484                                     sa_proc_n};
17485 
17486   ProcessNaNsHelper3(config,
17487                      kSRegSize,
17488                      za_inputs_5,
17489                      zn_inputs_5,
17490                      zm_inputs_5,
17491                      zd_expected_fmla_5,
17492                      zd_expected_fmls_5,
17493                      zd_expected_fnmla_5,
17494                      zd_expected_fnmls_5,
17495                      StrictNaNPropagation);
17496 
17497   const float inf = kFP32PositiveInfinity;
17498   const float inf_n = kFP32NegativeInfinity;
17499   uint32_t inf_proc = FloatToRawbits(inf);
17500   uint32_t inf_proc_n = FloatToRawbits(inf_n);
17501   uint32_t d_inf_proc = FloatToRawbits(kFP32DefaultNaN);
17502 
17503   float za_inputs_6[] = {qa, qa, 0.0f, 0.0f, qa, sa};
17504   float zn_inputs_6[] = {inf, 0.0f, 0.0f, inf, inf_n, inf};
17505   float zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
17506 
17507   // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17508   // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17509   // quiet_nan.
17510   uint64_t zd_expected_fmla_6[] =
17511       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17512   uint64_t zd_expected_fmls_6[] =
17513       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17514   uint64_t zd_expected_fnmla_6[] =
17515       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17516   uint64_t zd_expected_fnmls_6[] =
17517       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17518 
17519   ProcessNaNsHelper3(config,
17520                      kSRegSize,
17521                      za_inputs_6,
17522                      zn_inputs_6,
17523                      zm_inputs_6,
17524                      zd_expected_fmla_6,
17525                      zd_expected_fmls_6,
17526                      zd_expected_fnmla_6,
17527                      zd_expected_fnmls_6,
17528                      StrictNaNPropagation);
17529 }
17530 
17531 TEST_SVE(sve_process_nans_half) {
17532   // Use non-standard NaNs to check that the payload bits are preserved.
17533   Float16 sa(RawbitsToFloat16(0x7c11));
17534   Float16 sn(RawbitsToFloat16(0x7c22));
17535   Float16 sm(RawbitsToFloat16(0x7c33));
17536   Float16 qa(RawbitsToFloat16(0x7e44));
17537   Float16 qn(RawbitsToFloat16(0x7e55));
17538   Float16 qm(RawbitsToFloat16(0x7e66));
17539   VIXL_ASSERT(IsSignallingNaN(sa));
17540   VIXL_ASSERT(IsSignallingNaN(sn));
17541   VIXL_ASSERT(IsSignallingNaN(sm));
17542   VIXL_ASSERT(IsQuietNaN(qa));
17543   VIXL_ASSERT(IsQuietNaN(qn));
17544   VIXL_ASSERT(IsQuietNaN(qm));
17545 
17546   // The input NaNs after passing through ProcessNaN.
17547   uint16_t sa_proc = 0x7e11;
17548   uint16_t sn_proc = 0x7e22;
17549   uint16_t sm_proc = 0x7e33;
17550   uint16_t qa_proc = Float16ToRawbits(qa);
17551   uint16_t qn_proc = Float16ToRawbits(qn);
17552   uint16_t qm_proc = Float16ToRawbits(qm);
17553   uint16_t sa_proc_n = sa_proc ^ kHSignMask;
17554   uint16_t sn_proc_n = sn_proc ^ kHSignMask;
17555   uint16_t qa_proc_n = qa_proc ^ kHSignMask;
17556   uint16_t qn_proc_n = qn_proc ^ kHSignMask;
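  // For reference, quieting a half-precision signalling NaN just sets the
  // top fraction bit (bit 9), preserving the payload:
  //   0x7c11 | 0x0200 == 0x7e11.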
17557   Float16 zero(0.0);
17558 
17559   // Quiet NaNs are propagated.
17560   Float16 zn_inputs_1[] = {qn, zero, zero, qm, qn, qm};
17561   Float16 zm_inputs_1[] = {zero, qn, qm, zero, qm, qn};
17562   uint64_t zd_expected_1[] =
17563       {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17564 
17565   ProcessNaNsHelper(config,
17566                     kHRegSize,
17567                     zn_inputs_1,
17568                     zm_inputs_1,
17569                     zd_expected_1,
17570                     StrictNaNPropagation);
17571 
17572   // Signalling NaNs are propagated.
17573   Float16 zn_inputs_2[] = {sn, zero, zero, sm, sn, sm};
17574   Float16 zm_inputs_2[] = {zero, sn, sm, zero, sm, sn};
17575   uint64_t zd_expected_2[] =
17576       {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17577   ProcessNaNsHelper(config,
17578                     kHRegSize,
17579                     zn_inputs_2,
17580                     zm_inputs_2,
17581                     zd_expected_2,
17582                     StrictNaNPropagation);
17583 
17584   // Signalling NaNs take precedence over quiet NaNs.
17585   Float16 zn_inputs_3[] = {sn, qn, sn, sn, qn};
17586   Float16 zm_inputs_3[] = {qm, sm, sm, qn, sn};
17587   uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17588   ProcessNaNsHelper(config,
17589                     kHRegSize,
17590                     zn_inputs_3,
17591                     zm_inputs_3,
17592                     zd_expected_3,
17593                     StrictNaNPropagation);
17594 
17595   Float16 za_inputs_4[] = {qa, qa, zero, zero, qa, qa};
17596   Float16 zn_inputs_4[] = {qn, zero, zero, qn, qn, qn};
17597   Float16 zm_inputs_4[] = {zero, qm, qm, qm, qm, zero};
17598 
17599   // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17600   // If `n` is propagated, its sign is inverted by fmls and fnmla.
17601   // If `m` is propagated, its sign is never inverted.
17602   uint64_t zd_expected_fmla_4[] =
17603       {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17604   uint64_t zd_expected_fmls_4[] =
17605       {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17606   uint64_t zd_expected_fnmla_4[] =
17607       {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17608   uint64_t zd_expected_fnmls_4[] =
17609       {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17610 
17611   ProcessNaNsHelper3(config,
17612                      kHRegSize,
17613                      za_inputs_4,
17614                      zn_inputs_4,
17615                      zm_inputs_4,
17616                      zd_expected_fmla_4,
17617                      zd_expected_fmls_4,
17618                      zd_expected_fnmla_4,
17619                      zd_expected_fnmls_4,
17620                      StrictNaNPropagation);
17621 
17622   // Signalling NaNs take precedence over quiet NaNs.
17623   Float16 za_inputs_5[] = {qa, qa, sa, sa, sa};
17624   Float16 zn_inputs_5[] = {qn, sn, sn, sn, qn};
17625   Float16 zm_inputs_5[] = {sm, qm, sm, qa, sm};
17626   uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17627   uint64_t zd_expected_fmls_5[] = {sm_proc,
17628                                    sn_proc_n,
17629                                    sa_proc,
17630                                    sa_proc,
17631                                    sa_proc};
17632   uint64_t zd_expected_fnmla_5[] = {sm_proc,
17633                                     sn_proc_n,
17634                                     sa_proc_n,
17635                                     sa_proc_n,
17636                                     sa_proc_n};
17637   uint64_t zd_expected_fnmls_5[] = {sm_proc,
17638                                     sn_proc,
17639                                     sa_proc_n,
17640                                     sa_proc_n,
17641                                     sa_proc_n};
17642 
17643   ProcessNaNsHelper3(config,
17644                      kHRegSize,
17645                      za_inputs_5,
17646                      zn_inputs_5,
17647                      zm_inputs_5,
17648                      zd_expected_fmla_5,
17649                      zd_expected_fmls_5,
17650                      zd_expected_fnmla_5,
17651                      zd_expected_fnmls_5,
17652                      StrictNaNPropagation);
17653 
17654   const Float16 inf = kFP16PositiveInfinity;
17655   const Float16 inf_n = kFP16NegativeInfinity;
17656   uint64_t inf_proc = Float16ToRawbits(inf);
17657   uint64_t inf_proc_n = Float16ToRawbits(inf_n);
17658   uint64_t d_inf_proc = Float16ToRawbits(kFP16DefaultNaN);
17659 
17660   Float16 za_inputs_6[] = {qa, qa, zero, zero, qa, sa};
17661   Float16 zn_inputs_6[] = {inf, zero, zero, inf, inf_n, inf};
17662   Float16 zm_inputs_6[] = {zero, inf_n, inf, inf, inf, zero};
17663 
17664   // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17665   // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17666   // quiet_nan.
17667   uint64_t zd_expected_fmla_6[] =
17668       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17669   uint64_t zd_expected_fmls_6[] =
17670       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17671   uint64_t zd_expected_fnmla_6[] =
17672       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17673   uint64_t zd_expected_fnmls_6[] =
17674       {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17675 
17676   ProcessNaNsHelper3(config,
17677                      kHRegSize,
17678                      za_inputs_6,
17679                      zn_inputs_6,
17680                      zm_inputs_6,
17681                      zd_expected_fmla_6,
17682                      zd_expected_fmls_6,
17683                      zd_expected_fnmla_6,
17684                      zd_expected_fnmls_6,
17685                      StrictNaNPropagation);
17686 }
17687 
17688 typedef void (MacroAssembler::*FCmpFn)(const PRegisterWithLaneSize& pd,
17689                                        const PRegisterZ& pg,
17690                                        const ZRegister& zn,
17691                                        const ZRegister& zm);
17692 
17693 typedef void (MacroAssembler::*FCmpZeroFn)(const PRegisterWithLaneSize& pd,
17694                                            const PRegisterZ& pg,
17695                                            const ZRegister& zn,
17696                                            double zero);
17697 
17698 typedef void (MacroAssembler::*CmpFn)(const PRegisterWithLaneSize& pd,
17699                                       const PRegisterZ& pg,
17700                                       const ZRegister& zn,
17701                                       const ZRegister& zm);
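
// These member-function pointers let the helpers below select a comparison at
// run time. A call through one looks like:
//   FCmpFn fcmp = GetFpCompareFn(gt);
//   (masm.*fcmp)(pd, pg.Zeroing(), zn, zm);  // Emits Fcmgt.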
17702 
17703 static FCmpFn GetFpAbsCompareFn(Condition cond) {
17704   switch (cond) {
17705     case ge:
17706       return &MacroAssembler::Facge;
17707     case gt:
17708       return &MacroAssembler::Facgt;
17709     case le:
17710       return &MacroAssembler::Facle;
17711     case lt:
17712       return &MacroAssembler::Faclt;
17713     default:
17714       VIXL_UNIMPLEMENTED();
17715       return NULL;
17716   }
17717 }
17718 
17719 static FCmpFn GetFpCompareFn(Condition cond) {
17720   switch (cond) {
17721     case ge:
17722       return &MacroAssembler::Fcmge;
17723     case gt:
17724       return &MacroAssembler::Fcmgt;
17725     case le:
17726       return &MacroAssembler::Fcmle;
17727     case lt:
17728       return &MacroAssembler::Fcmlt;
17729     case eq:
17730       return &MacroAssembler::Fcmeq;
17731     case ne:
17732       return &MacroAssembler::Fcmne;
17733     case uo:
17734       return &MacroAssembler::Fcmuo;
17735     default:
17736       VIXL_UNIMPLEMENTED();
17737       return NULL;
17738   }
17739 }
17740 
17741 static FCmpZeroFn GetFpCompareZeroFn(Condition cond) {
17742   switch (cond) {
17743     case ge:
17744       return &MacroAssembler::Fcmge;
17745     case gt:
17746       return &MacroAssembler::Fcmgt;
17747     case le:
17748       return &MacroAssembler::Fcmle;
17749     case lt:
17750       return &MacroAssembler::Fcmlt;
17751     case eq:
17752       return &MacroAssembler::Fcmeq;
17753     case ne:
17754       return &MacroAssembler::Fcmne;
17755     default:
17756       VIXL_UNIMPLEMENTED();
17757       return NULL;
17758   }
17759 }
17760 
17761 static CmpFn GetIntCompareFn(Condition cond) {
17762   switch (cond) {
17763     case ge:
17764       return &MacroAssembler::Cmpge;
17765     case gt:
17766       return &MacroAssembler::Cmpgt;
17767     case le:
17768       return &MacroAssembler::Cmple;
17769     case lt:
17770       return &MacroAssembler::Cmplt;
17771     case eq:
17772       return &MacroAssembler::Cmpeq;
17773     case ne:
17774       return &MacroAssembler::Cmpne;
17775     default:
17776       VIXL_UNIMPLEMENTED();
17777       return NULL;
17778   }
17779 }
17780 
17781 template <size_t N>
17782 static void TestFpCompareHelper(Test* config,
17783                                 int lane_size_in_bits,
17784                                 Condition cond,
17785                                 const double (&zn_inputs)[N],
17786                                 const double (&zm_inputs)[N],
17787                                 const int (&pd_expected)[N],
17788                                 bool is_absolute = false) {
17789   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17790   START();
17791 
17792   ZRegister zt_int_1 = z1.WithLaneSize(lane_size_in_bits);
17793   ZRegister zt_int_2 = z2.WithLaneSize(lane_size_in_bits);
17794   ZRegister zt_int_3 = z3.WithLaneSize(lane_size_in_bits);
17795   ZRegister zt_fp_1 = z11.WithLaneSize(lane_size_in_bits);
17796   ZRegister zt_fp_2 = z12.WithLaneSize(lane_size_in_bits);
17797   ZRegister zt_fp_3 = z13.WithLaneSize(lane_size_in_bits);
17798   ZRegister fp_one = z31.WithLaneSize(lane_size_in_bits);
17799 
17800   PRegisterWithLaneSize pd_result_int_1 = p15.WithLaneSize(lane_size_in_bits);
17801   PRegisterWithLaneSize pd_result_fp_1 = p14.WithLaneSize(lane_size_in_bits);
17802   PRegisterWithLaneSize pd_result_int_2 = p13.WithLaneSize(lane_size_in_bits);
17803   PRegisterWithLaneSize pd_result_fp_2 = p12.WithLaneSize(lane_size_in_bits);
17804 
17805   FCmpFn fcmp = is_absolute ? GetFpAbsCompareFn(cond) : GetFpCompareFn(cond);
17806   __ Ptrue(p1.VnB());
17807 
17808   if (cond != uo) {
17809     int pg_inputs[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1};
17810     Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
17811 
17812     __ Fdup(fp_one, 0.1f);
17813 
17814     __ Index(zt_int_1, 3, 3);
17815     __ Scvtf(zt_fp_1, p0.Merging(), zt_int_1);
17816     __ Fadd(zt_fp_1, zt_fp_1, fp_one);
17817 
17818     __ Index(zt_int_2, 3, -10);
17819     __ Scvtf(zt_fp_2, p0.Merging(), zt_int_2);
17820     __ Fadd(zt_fp_2, zt_fp_2, fp_one);
17821 
17822     __ Index(zt_int_3, 3, 2);
17823     __ Scvtf(zt_fp_3, p0.Merging(), zt_int_3);
17824     __ Fadd(zt_fp_3, zt_fp_3, fp_one);
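    // Adding 0.1 makes the values fractional without changing their relative
    // ordering, so the integer compare results still predict the FP ones.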
17825 
17826 
17827     // There is no integer absolute-compare instruction, so use `Abs` with
17828     // `Cmp<cc>` to synthesize the expected result for `Fac<cc>`.
17829     if (is_absolute) {
17830       __ Abs(zt_int_2, p1.Merging(), zt_int_2);
17831     }
17832 
17833     CmpFn cmp = GetIntCompareFn(cond);
17834     (masm.*cmp)(pd_result_int_1, p0.Zeroing(), zt_int_1, zt_int_2);
17835     (masm.*fcmp)(pd_result_fp_1, p0.Zeroing(), zt_fp_1, zt_fp_2);
17836 
17837     (masm.*cmp)(pd_result_int_2, p0.Zeroing(), zt_int_1, zt_int_3);
17838     (masm.*fcmp)(pd_result_fp_2, p0.Zeroing(), zt_fp_1, zt_fp_3);
17839   }
17840 
17841   uint64_t zn_inputs_rawbits[N];
17842   uint64_t zm_inputs_rawbits[N];
17843   FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
17844   FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
17845 
17846   ZRegister zn_fp = z14.WithLaneSize(lane_size_in_bits);
17847   ZRegister zm_fp = z15.WithLaneSize(lane_size_in_bits);
17848   InsrHelper(&masm, zn_fp, zn_inputs_rawbits);
17849   InsrHelper(&masm, zm_fp, zm_inputs_rawbits);
17850 
17851   PRegisterWithLaneSize pd_result_fp_3 = p11.WithLaneSize(lane_size_in_bits);
17852   (masm.*fcmp)(pd_result_fp_3, p1.Zeroing(), zn_fp, zm_fp);
17853 
17854   END();
17855 
17856   if (CAN_RUN()) {
17857     RUN();
17858 
17859     if (cond != uo) {
17860       ASSERT_EQUAL_SVE(pd_result_int_1, pd_result_fp_1);
17861       ASSERT_EQUAL_SVE(pd_result_int_2, pd_result_fp_2);
17862     }
17863     ASSERT_EQUAL_SVE(pd_expected, pd_result_fp_3);
17864   }
17865 }
17866 
17867 TEST_SVE(sve_fp_compare_vectors) {
17868   double inf_p = kFP64PositiveInfinity;
17869   double inf_n = kFP64NegativeInfinity;
17870   double nan = kFP64DefaultNaN;
17871 
17872   // Normal floating point comparison has been tested in the helper.
17873   double zn[] = {0.0, inf_n, 1.0, inf_p, inf_p, nan, 0.0, nan};
17874   double zm[] = {-0.0, inf_n, inf_n, -2.0, inf_n, nan, nan, inf_p};
17875 
17876   int pd_fcm_gt[] = {0, 0, 1, 1, 1, 0, 0, 0};
17877   int pd_fcm_lt[] = {0, 0, 0, 0, 0, 0, 0, 0};
17878   int pd_fcm_ge[] = {1, 1, 1, 1, 1, 0, 0, 0};
17879   int pd_fcm_le[] = {1, 1, 0, 0, 0, 0, 0, 0};
17880   int pd_fcm_eq[] = {1, 1, 0, 0, 0, 0, 0, 0};
17881   int pd_fcm_ne[] = {0, 0, 1, 1, 1, 1, 1, 1};
17882   int pd_fcm_uo[] = {0, 0, 0, 0, 0, 1, 1, 1};
17883   int pd_fac_gt[] = {0, 0, 0, 1, 0, 0, 0, 0};
17884   int pd_fac_lt[] = {0, 0, 1, 0, 0, 0, 0, 0};
17885   int pd_fac_ge[] = {1, 1, 0, 1, 1, 0, 0, 0};
17886   int pd_fac_le[] = {1, 1, 1, 0, 1, 0, 0, 0};
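  // Worked example, lane 5 (nan vs nan): every ordered comparison with a NaN
  // operand is false, so gt/ge/lt/le/eq are 0 while ne and uo are 1. The
  // fac<cc> results follow the same rule after taking absolute values.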
17887 
17888   int lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
17889 
17890   for (size_t i = 0; i < ArrayLength(lane_sizes); i++) {
17891     int lane_size = lane_sizes[i];
17892     // Test floating-point compare vectors.
17893     TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fcm_gt);
17894     TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fcm_lt);
17895     TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fcm_ge);
17896     TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fcm_le);
17897     TestFpCompareHelper(config, lane_size, eq, zn, zm, pd_fcm_eq);
17898     TestFpCompareHelper(config, lane_size, ne, zn, zm, pd_fcm_ne);
17899     TestFpCompareHelper(config, lane_size, uo, zn, zm, pd_fcm_uo);
17900 
17901     // Test floating-point absolute compare vectors.
17902     TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fac_gt, true);
17903     TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fac_lt, true);
17904     TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fac_ge, true);
17905     TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fac_le, true);
17906   }
17907 }
17908 
17909 template <size_t N, typename T>
17910 static void TestFpCompareZeroHelper(Test* config,
17911                                     int lane_size_in_bits,
17912                                     Condition cond,
17913                                     const T (&zn_inputs)[N],
17914                                     const int (&pd_expected)[N]) {
17915   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17916   START();
17917 
17918   ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
17919   PRegisterWithLaneSize pd = p14.WithLaneSize(lane_size_in_bits);
17920 
17921   uint64_t zn_rawbits[N];
17922   FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
17923   InsrHelper(&masm, zn, zn_rawbits);
17924 
17925   __ Ptrue(p0.VnB());
17926   (masm.*GetFpCompareZeroFn(cond))(pd, p0.Zeroing(), zn, 0.0);
17927 
17928   END();
17929 
17930   if (CAN_RUN()) {
17931     RUN();
17932 
17933     ASSERT_EQUAL_SVE(pd_expected, pd);
17934   }
17935 }
17936 
17937 TEST_SVE(sve_fp_compare_vector_zero) {
17938   Float16 fp16_inf_p = kFP16PositiveInfinity;
17939   Float16 fp16_inf_n = kFP16NegativeInfinity;
17940   Float16 fp16_dn = kFP16DefaultNaN;
17941   Float16 fp16_sn = RawbitsToFloat16(0x7c22);
17942   Float16 fp16_qn = RawbitsToFloat16(0x7e55);
17943 
17944   float fp32_inf_p = kFP32PositiveInfinity;
17945   float fp32_inf_n = kFP32NegativeInfinity;
17946   float fp32_dn = kFP32DefaultNaN;
17947   float fp32_sn = RawbitsToFloat(0x7f952222);
17948   float fp32_qn = RawbitsToFloat(0x7fea2222);
17949 
17950   double fp64_inf_p = kFP64PositiveInfinity;
17951   double fp64_inf_n = kFP64NegativeInfinity;
17952   double fp64_dn = kFP64DefaultNaN;
17953   double fp64_sn = RawbitsToDouble(0x7ff5555511111111);
17954   double fp64_qn = RawbitsToDouble(0x7ffaaaaa11111111);
17955 
17956   // Normal floating point comparison has been tested in the non-zero form.
17957   Float16 zn_inputs_h[] = {Float16(0.0),
17958                            Float16(-0.0),
17959                            fp16_inf_p,
17960                            fp16_inf_n,
17961                            fp16_dn,
17962                            fp16_sn,
17963                            fp16_qn};
17964   float zn_inputs_s[] =
17965       {0.0, -0.0, fp32_inf_p, fp32_inf_n, fp32_dn, fp32_sn, fp32_qn};
17966   double zn_inputs_d[] =
17967       {0.0, -0.0, fp64_inf_p, fp64_inf_n, fp64_dn, fp64_sn, fp64_qn};
17968 
17969   int pd_expected_gt[] = {0, 0, 1, 0, 0, 0, 0};
17970   int pd_expected_lt[] = {0, 0, 0, 1, 0, 0, 0};
17971   int pd_expected_ge[] = {1, 1, 1, 0, 0, 0, 0};
17972   int pd_expected_le[] = {1, 1, 0, 1, 0, 0, 0};
17973   int pd_expected_eq[] = {1, 1, 0, 0, 0, 0, 0};
17974   int pd_expected_ne[] = {0, 0, 1, 1, 1, 1, 1};
17975 
17976   TestFpCompareZeroHelper(config, kDRegSize, gt, zn_inputs_d, pd_expected_gt);
17977   TestFpCompareZeroHelper(config, kDRegSize, lt, zn_inputs_d, pd_expected_lt);
17978   TestFpCompareZeroHelper(config, kDRegSize, ge, zn_inputs_d, pd_expected_ge);
17979   TestFpCompareZeroHelper(config, kDRegSize, le, zn_inputs_d, pd_expected_le);
17980   TestFpCompareZeroHelper(config, kDRegSize, eq, zn_inputs_d, pd_expected_eq);
17981   TestFpCompareZeroHelper(config, kDRegSize, ne, zn_inputs_d, pd_expected_ne);
17982 
17983   TestFpCompareZeroHelper(config, kSRegSize, gt, zn_inputs_s, pd_expected_gt);
17984   TestFpCompareZeroHelper(config, kSRegSize, lt, zn_inputs_s, pd_expected_lt);
17985   TestFpCompareZeroHelper(config, kSRegSize, ge, zn_inputs_s, pd_expected_ge);
17986   TestFpCompareZeroHelper(config, kSRegSize, le, zn_inputs_s, pd_expected_le);
17987   TestFpCompareZeroHelper(config, kSRegSize, eq, zn_inputs_s, pd_expected_eq);
17988   TestFpCompareZeroHelper(config, kSRegSize, ne, zn_inputs_s, pd_expected_ne);
17989 
17990   TestFpCompareZeroHelper(config, kHRegSize, gt, zn_inputs_h, pd_expected_gt);
17991   TestFpCompareZeroHelper(config, kHRegSize, lt, zn_inputs_h, pd_expected_lt);
17992   TestFpCompareZeroHelper(config, kHRegSize, ge, zn_inputs_h, pd_expected_ge);
17993   TestFpCompareZeroHelper(config, kHRegSize, le, zn_inputs_h, pd_expected_le);
17994   TestFpCompareZeroHelper(config, kHRegSize, eq, zn_inputs_h, pd_expected_eq);
17995   TestFpCompareZeroHelper(config, kHRegSize, ne, zn_inputs_h, pd_expected_ne);
17996 }
17997 
17998 typedef void (MacroAssembler::*FPUnaryMFn)(const ZRegister& zd,
17999                                            const PRegisterM& pg,
18000                                            const ZRegister& zn);
18001 
18002 typedef void (MacroAssembler::*FPUnaryZFn)(const ZRegister& zd,
18003                                            const PRegisterZ& pg,
18004                                            const ZRegister& zn);
18005 
18006 template <size_t N, size_t M>
18007 static void TestFPUnaryPredicatedHelper(Test* config,
18008                                         int src_size_in_bits,
18009                                         int dst_size_in_bits,
18010                                         uint64_t (&zn_inputs)[N],
18011                                         const uint64_t (&pg_inputs)[M],
18012                                         const uint64_t (&zd_expected)[N],
18013                                         FPUnaryMFn macro_m,
18014                                         FPUnaryZFn macro_z) {
18015   // The caller must provide a full-width predicate input.
18016   VIXL_ASSERT(M == (kPRegMaxSize / kDRegSize));
18017   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18018   START();
18019 
18020   int ds = dst_size_in_bits;
18021   int ss = src_size_in_bits;
18022   int ls = std::max(ss, ds);
18023 
18024   // When the destination type is larger than the source type, fill the high
18025   // parts with noise values, which should be ignored.
18026   if (ds > ss) {
18027     VIXL_ASSERT(ss < 64);
18028     uint64_t zn_inputs_mod[N];
18029     uint64_t sn = GetSignallingNan(ss);
18030     for (unsigned i = 0; i < N; i++) {
18031       zn_inputs_mod[i] = zn_inputs[i] | ((sn + i) << ss);
18032     }
18033     InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs_mod);
18034   } else {
18035     InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs);
18036   }
18037 
18038   // Make a copy so we can check that constructive operations preserve zn.
18039   __ Mov(z28, z29);
18040 
18041   // Run the operation on all lanes.
18042   __ Ptrue(p0.WithLaneSize(ls));
18043   (masm.*macro_m)(z27.WithLaneSize(ds), p0.Merging(), z28.WithLaneSize(ss));
18044 
18045   Initialise(&masm,
18046              p1.VnB(),
18047              pg_inputs[3],
18048              pg_inputs[2],
18049              pg_inputs[1],
18050              pg_inputs[0]);
18051 
18052   // Clear the irrelevant lanes.
18053   __ Index(z31.WithLaneSize(ls), 0, 1);
18054   __ Cmplt(p1.WithLaneSize(ls), p1.Zeroing(), z31.WithLaneSize(ls), N);
18055 
18056   // Check merging predication.
18057   __ Index(z11.WithLaneSize(ls), 42, 1);
18058   // Preserve the base value so we can derive the expected result.
18059   __ Mov(z21, z11);
18060   __ Mov(z9, z11);
18061   (masm.*macro_m)(z11.WithLaneSize(ds), p1.Merging(), z28.WithLaneSize(ss));
18062 
18063   // Generate expected values using explicit merging operations.
18064   InsrHelper(&masm, z25.WithLaneSize(ls), zd_expected);
18065   __ Mov(z21.WithLaneSize(ls), p1.Merging(), z25.WithLaneSize(ls));
18066 
18067   // Check zeroing predication.
18068   __ Index(z12.WithLaneSize(ds), 42, -1);
18069   (masm.*macro_z)(z12.WithLaneSize(ds), p1.Zeroing(), z28.WithLaneSize(ss));
18070 
18071   // Generate expected values using explicit zeroing operations.
18072   InsrHelper(&masm, z30.WithLaneSize(ls), zd_expected);
18073   // Emulate zeroing predication.
18074   __ Dup(z22.WithLaneSize(ls), 0);
18075   __ Mov(z22.WithLaneSize(ls), p1.Merging(), z30.WithLaneSize(ls));
18076 
18077   // Check an in-place update.
18078   __ Mov(z9.WithLaneSize(ls), p1.Merging(), z28.WithLaneSize(ls));
18079   (masm.*macro_m)(z9.WithLaneSize(ds), p1.Merging(), z9.WithLaneSize(ss));
18080 
18081   END();
18082 
18083   if (CAN_RUN()) {
18084     RUN();
18085 
18086     // Check all lanes.
18087     ASSERT_EQUAL_SVE(zd_expected, z27.WithLaneSize(ls));
18088 
18089     // Check that constructive operations preserve their inputs.
18090     ASSERT_EQUAL_SVE(z28, z29);
18091 
18092     // Check merging predication.
18093     ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z11.WithLaneSize(ls));
18094 
18095     // Check zeroing predication.
18096     ASSERT_EQUAL_SVE(z22.WithLaneSize(ls), z12.WithLaneSize(ls));
18097 
18098     // Check in-place operation where zd == zn.
18099     ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z9.WithLaneSize(ls));
18100   }
18101 }
18102 
18103 template <size_t N, typename T>
18104 static void TestFPUnaryPredicatedHelper(Test* config,
18105                                         int src_size_in_bits,
18106                                         int dst_size_in_bits,
18107                                         T (&zn_inputs)[N],
18108                                         const T (&zd_expected)[N],
18109                                         FPUnaryMFn macro_m,
18110                                         FPUnaryZFn macro_z) {
18111   uint64_t pg_inputs[] = {0xa55aa55aa55aa55a,
18112                           0xa55aa55aa55aa55a,
18113                           0xa55aa55aa55aa55a,
18114                           0xa55aa55aa55aa55a};
18115 
18116   TestFPUnaryPredicatedHelper(config,
18117                               src_size_in_bits,
18118                               dst_size_in_bits,
18119                               zn_inputs,
18120                               pg_inputs,
18121                               zd_expected,
18122                               macro_m,
18123                               macro_z);
18124 
18125   // The complement of the above predicate, to get full input coverage.
18126   uint64_t pg_c_inputs[] = {0x5aa55aa55aa55aa5,
18127                             0x5aa55aa55aa55aa5,
18128                             0x5aa55aa55aa55aa5,
18129                             0x5aa55aa55aa55aa5};
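  // Since 0xa55aa55aa55aa55a ^ 0x5aa55aa55aa55aa5 == 0xffffffffffffffff, the
  // two runs together exercise every predicate lane exactly once.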
18130 
18131   TestFPUnaryPredicatedHelper(config,
18132                               src_size_in_bits,
18133                               dst_size_in_bits,
18134                               zn_inputs,
18135                               pg_c_inputs,
18136                               zd_expected,
18137                               macro_m,
18138                               macro_z);
18139 }
18140 
18141 template <size_t N, typename T>
18142 static void TestFcvtHelper(Test* config,
18143                            int src_size_in_bits,
18144                            int dst_size_in_bits,
18145                            T (&zn_inputs)[N],
18146                            const T (&zd_expected)[N]) {
18147   TestFPUnaryPredicatedHelper(config,
18148                               src_size_in_bits,
18149                               dst_size_in_bits,
18150                               zn_inputs,
18151                               zd_expected,
18152                               &MacroAssembler::Fcvt,   // Merging form.
18153                               &MacroAssembler::Fcvt);  // Zeroing form.
18154 }
18155 
18156 TEST_SVE(sve_fcvt) {
18157   uint64_t h_vals[] = {0x7c00,
18158                        0xfc00,
18159                        0,
18160                        0x8000,
18161                        0x7bff,   // Max half precision.
18162                        0x0400,   // Min positive normal.
18163                        0x03ff,   // Max subnormal.
18164                        0x0001};  // Min positive subnormal.
18165 
18166   uint64_t s_vals[] = {0x7f800000,
18167                        0xff800000,
18168                        0,
18169                        0x80000000,
18170                        0x477fe000,
18171                        0x38800000,
18172                        0x387fc000,
18173                        0x33800000};
18174 
18175   uint64_t d_vals[] = {0x7ff0000000000000,
18176                        0xfff0000000000000,
18177                        0,
18178                        0x8000000000000000,
18179                        0x40effc0000000000,
18180                        0x3f10000000000000,
18181                        0x3f0ff80000000000,
18182                        0x3e70000000000000};
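  // Worked example, h_vals[4] -> s_vals[4]: 0x7bff has exponent field 30
  // (unbiased 15) and fraction 0x3ff; rebiasing gives exponent 15 + 127 = 142
  // (0x8e) and fraction 0x3ff << 13, i.e. 0x477fe000.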
18183 
18184   TestFcvtHelper(config, kHRegSize, kSRegSize, h_vals, s_vals);
18185   TestFcvtHelper(config, kSRegSize, kHRegSize, s_vals, h_vals);
18186   TestFcvtHelper(config, kSRegSize, kDRegSize, s_vals, d_vals);
18187   TestFcvtHelper(config, kDRegSize, kSRegSize, d_vals, s_vals);
18188   TestFcvtHelper(config, kHRegSize, kDRegSize, h_vals, d_vals);
18189   TestFcvtHelper(config, kDRegSize, kHRegSize, d_vals, h_vals);
18190 }
18191 
18192 TEST_SVE(sve_fcvt_nan) {
18193   uint64_t h_inputs[] = {0x7e55,   // Quiet NaN.
18194                          0x7c22};  // Signalling NaN.
18195 
18196   uint64_t h2s_expected[] = {0x7fcaa000, 0x7fc44000};
18197 
18198   uint64_t h2d_expected[] = {0x7ff9540000000000, 0x7ff8880000000000};
18199 
18200   uint64_t s_inputs[] = {0x7fc12345,   // Quiet NaN.
18201                          0x7f812345};  // Signalling NaN.
18202 
18203   uint64_t s2h_expected[] = {0x7e09, 0x7e09};
18204 
18205   uint64_t s2d_expected[] = {0x7ff82468a0000000, 0x7ff82468a0000000};
18206 
18207   uint64_t d_inputs[] = {0x7ffaaaaa22222222,   // Quiet NaN.
18208                          0x7ff5555511111111};  // Signalling NaN.
18209 
18210   uint64_t d2h_expected[] = {0x7eaa, 0x7f55};
18211 
18212   uint64_t d2s_expected[] = {0x7fd55551, 0x7feaaaa8};
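  // Worked example, h_inputs[1] -> h2s_expected[1]: the payload moves to the
  // top of the wider fraction and the NaN is quieted, so 0x7c22 becomes
  // 0x7f800000 | 0x400000 | (0x022 << 13), i.e. 0x7fc44000.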
18213 
18214   TestFcvtHelper(config, kHRegSize, kSRegSize, h_inputs, h2s_expected);
18215   TestFcvtHelper(config, kSRegSize, kHRegSize, s_inputs, s2h_expected);
18216   TestFcvtHelper(config, kHRegSize, kDRegSize, h_inputs, h2d_expected);
18217   TestFcvtHelper(config, kDRegSize, kHRegSize, d_inputs, d2h_expected);
18218   TestFcvtHelper(config, kSRegSize, kDRegSize, s_inputs, s2d_expected);
18219   TestFcvtHelper(config, kDRegSize, kSRegSize, d_inputs, d2s_expected);
18220 }
18221 
18222 template <size_t N, typename T>
18223 static void TestFrecpxHelper(Test* config,
18224                              int lane_size_in_bits,
18225                              T (&zn_inputs)[N],
18226                              const T (&zd_expected)[N]) {
18227   TestFPUnaryPredicatedHelper(config,
18228                               lane_size_in_bits,
18229                               lane_size_in_bits,
18230                               zn_inputs,
18231                               zd_expected,
18232                               &MacroAssembler::Frecpx,   // Merging form.
18233                               &MacroAssembler::Frecpx);  // Zeroing form.
18234 }
18235 
18236 TEST_SVE(sve_frecpx_h) {
18237   uint64_t zn_inputs[] = {Float16ToRawbits(kFP16PositiveInfinity),
18238                           Float16ToRawbits(kFP16NegativeInfinity),
18239                           Float16ToRawbits(Float16(0.0)),
18240                           Float16ToRawbits(Float16(-0.0)),
18241                           0x0001,   // Smallest positive subnormal number.
18242                           0x03ff,   // Largest subnormal number.
18243                           0x0400,   // Smallest positive normal number.
18244                           0x7bff,   // Largest normal number.
18245                           0x3bff,   // Largest number less than one.
18246                           0x3c01,   // Smallest number larger than one.
18247                           0x7c22,   // Signalling NaN.
18248                           0x7e55};  // Quiet NaN.
18249 
18250   uint64_t zd_expected[] = {0,
18251                             0x8000,
18252                             0x7800,
18253                             0xf800,
18254                             // Exponent of subnormal numbers are zero.
18255                             0x7800,
18256                             0x7800,
18257                             0x7800,
18258                             0x0400,
18259                             0x4400,
18260                             0x4000,
18261                             0x7e22,  // To quiet NaN.
18262                             0x7e55};
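  // Rough sketch of frecpx: the result keeps the input's sign, zeroes the
  // fraction, and bitwise-inverts the exponent field (a zero exponent is
  // treated as one). For example, 0x3bff has exponent field 14 (0b01110);
  // inverting gives 17 (0b10001), hence the expected 0x4400.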
18263 
18264   TestFrecpxHelper(config, kHRegSize, zn_inputs, zd_expected);
18265 }
18266 
18267 TEST_SVE(sve_frecpx_s) {
18268   uint64_t zn_inputs[] = {FloatToRawbits(kFP32PositiveInfinity),
18269                           FloatToRawbits(kFP32NegativeInfinity),
18270                           FloatToRawbits(65504),       // Max half precision.
18271                           FloatToRawbits(6.10352e-5),  // Min positive normal.
18272                           FloatToRawbits(6.09756e-5),  // Max subnormal.
18273                           FloatToRawbits(
18274                               5.96046e-8),       // Min positive subnormal.
18275                           FloatToRawbits(5e-9),  // Rounds to zero in FP16.
18276                           FloatToRawbits(-0.0),
18277                           FloatToRawbits(0.0),
18278                           0x7f952222,   // Signalling NaN.
18279                           0x7fea2222};  // Quiet NaN.
18280 
18281   uint64_t zd_expected[] = {0,           // 0.0
18282                             0x80000000,  // -0.0
18283                             0x38800000,  // 6.10352e-05
18284                             0x47000000,  // 32768
18285                             0x47800000,  // 65536
18286                             0x4c800000,  // 6.71089e+07
18287                             0x4e000000,  // 5.36871e+08
18288                             0xff000000,  // -1.70141e+38
18289                             0x7f000000,  // 1.70141e+38
18290                             0x7fd52222,
18291                             0x7fea2222};
18292 
18293   TestFrecpxHelper(config, kSRegSize, zn_inputs, zd_expected);
18294 }
18295 
18296 TEST_SVE(sve_frecpx_d) {
18297   uint64_t zn_inputs[] = {DoubleToRawbits(kFP64PositiveInfinity),
18298                           DoubleToRawbits(kFP64NegativeInfinity),
18299                           DoubleToRawbits(65504),       // Max half precision.
18300                           DoubleToRawbits(6.10352e-5),  // Min positive normal.
18301                           DoubleToRawbits(6.09756e-5),  // Max subnormal.
18302                           DoubleToRawbits(
18303                               5.96046e-8),        // Min positive subnormal.
18304                           DoubleToRawbits(5e-9),  // Not representable -> zero.
18305                           DoubleToRawbits(5e-9),  // Rounds to zero in FP16.
18306                           DoubleToRawbits(0.0),
18307                           0x7ff5555511111111,   // Signalling NaN.
18308                           0x7ffaaaaa11111111};  // Quiet NaN.
18309 
18310   uint64_t zd_expected[] = {0,                   // 0.0
18311                             0x8000000000000000,  // -0.0
18312                             0x3f10000000000000,  // 6.10352e-05
18313                             0x40e0000000000000,  // 32768
18314                             0x40f0000000000000,  // 65536
18315                             0x4190000000000000,  // 6.71089e+07
18316                             0x41c0000000000000,  // 5.36871e+08
18317                             0xffe0000000000000,  // -1.70141e+38
18318                             0x7fe0000000000000,  // 1.70141e+38
18319                             0x7ffd555511111111,
18320                             0x7ffaaaaa11111111};
18321 
18322   TestFrecpxHelper(config, kDRegSize, zn_inputs, zd_expected);
18323 }
18324 
18325 template <size_t N, typename T>
18326 static void TestFsqrtHelper(Test* config,
18327                             int lane_size_in_bits,
18328                             T (&zn_inputs)[N],
18329                             const T (&zd_expected)[N]) {
18330   TestFPUnaryPredicatedHelper(config,
18331                               lane_size_in_bits,
18332                               lane_size_in_bits,
18333                               zn_inputs,
18334                               zd_expected,
18335                               &MacroAssembler::Fsqrt,   // Merging form.
18336                               &MacroAssembler::Fsqrt);  // Zeroing form.
18337 }
18338 
18339 TEST_SVE(sve_fsqrt_h) {
18340   uint64_t zn_inputs[] =
18341       {Float16ToRawbits(Float16(0.0)),
18342        Float16ToRawbits(Float16(-0.0)),
18343        Float16ToRawbits(Float16(1.0)),
18344        Float16ToRawbits(Float16(65025.0)),
18345        Float16ToRawbits(kFP16PositiveInfinity),
18346        Float16ToRawbits(kFP16NegativeInfinity),
18347        Float16ToRawbits(Float16(6.10352e-5)),  // Min normal positive.
18348        Float16ToRawbits(Float16(65504.0)),     // Max normal positive float.
18349        Float16ToRawbits(Float16(6.09756e-5)),  // Max subnormal.
18350        Float16ToRawbits(Float16(5.96046e-8)),  // Min subnormal positive.
18351        0x7c22,                                 // Signalling NaN.
18352        0x7e55};                                // Quiet NaN.
18353 
18354   uint64_t zd_expected[] = {Float16ToRawbits(Float16(0.0)),
18355                             Float16ToRawbits(Float16(-0.0)),
18356                             Float16ToRawbits(Float16(1.0)),
18357                             Float16ToRawbits(Float16(255.0)),
18358                             Float16ToRawbits(kFP16PositiveInfinity),
18359                             Float16ToRawbits(kFP16DefaultNaN),
18360                             0x2000,
18361                             0x5bff,
18362                             0x1fff,
18363                             0x0c00,
18364                             0x7e22,  // To quiet NaN.
18365                             0x7e55};
18366 
18367   TestFsqrtHelper(config, kHRegSize, zn_inputs, zd_expected);
18368 }
18369 
18370 TEST_SVE(sve_fsqrt_s) {
18371   uint64_t zn_inputs[] = {FloatToRawbits(0.0f),
18372                           FloatToRawbits(-0.0f),
18373                           FloatToRawbits(1.0f),
18374                           FloatToRawbits(65536.0f),
18375                           FloatToRawbits(kFP32PositiveInfinity),
18376                           FloatToRawbits(kFP32NegativeInfinity),
18377                           0x00800000,   // Min normal positive, ~1.17e-38
18378                           0x7f7fffff,   // Max normal positive, ~3.40e+38
18379                           0x00000001,   // Min subnormal positive, ~1.40e-45
18380                           0x007fffff,   // Max subnormal, ~1.17e-38
18381                           0x7f951111,   // Signalling NaN.
18382                           0x7fea1111};  // Quiet NaN.
18383 
18384   uint64_t zd_expected[] = {FloatToRawbits(0.0f),
18385                             FloatToRawbits(-0.0f),
18386                             FloatToRawbits(1.0f),
18387                             FloatToRawbits(256.0f),
18388                             FloatToRawbits(kFP32PositiveInfinity),
18389                             FloatToRawbits(kFP32DefaultNaN),
18390                             0x20000000,  // ~1.08e-19
18391                             0x5f7fffff,  // ~1.84e+19
18392                             0x1a3504f3,  // ~3.74e-23
18393                             0x1fffffff,  // ~1.08e-19
18394                             0x7fd51111,  // To quiet NaN.
18395                             0x7fea1111};
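  // Worked example: for an even power of two, fsqrt simply halves the
  // unbiased exponent, so the minimum normal 0x00800000 (2^-126) maps to
  // 2^-63, whose raw encoding is (-63 + 127) << 23, i.e. 0x20000000.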
18396 
18397   TestFsqrtHelper(config, kSRegSize, zn_inputs, zd_expected);
18398 }
18399 
18400 TEST_SVE(sve_fsqrt_d) {
18401   uint64_t zn_inputs[] =
18402       {DoubleToRawbits(0.0),
18403        DoubleToRawbits(-0.0),
18404        DoubleToRawbits(1.0),
18405        DoubleToRawbits(65536.0),
18406        DoubleToRawbits(kFP64PositiveInfinity),
18407        DoubleToRawbits(kFP64NegativeInfinity),
18408        0x0010000000000000,  // Min normal positive, ~2.22e-308
18409        0x7fefffffffffffff,  // Max normal positive, ~1.79e+308
18410        0x0000000000000001,  // Min subnormal positive, 5e-324
18411        0x000fffffffffffff,  // Max subnormal, ~2.22e-308
18412        0x7ff5555511111111,
18413        0x7ffaaaaa11111111};
18414 
18415   uint64_t zd_expected[] = {DoubleToRawbits(0.0),
18416                             DoubleToRawbits(-0.0),
18417                             DoubleToRawbits(1.0),
18418                             DoubleToRawbits(256.0),
18419                             DoubleToRawbits(kFP64PositiveInfinity),
18420                             DoubleToRawbits(kFP64DefaultNaN),
18421                             0x2000000000000000,  // ~1.49e-154
18422                             0x5fefffffffffffff,  // ~1.34e+154
18423                             0x1e60000000000000,  // ~2.22e-162
18424                             0x1fffffffffffffff,  // ~1.49e-154
18425                             0x7ffd555511111111,  // To quiet NaN.
18426                             0x7ffaaaaa11111111};
18427 
18428   TestFsqrtHelper(config, kDRegSize, zn_inputs, zd_expected);
18429 }
18430 
18431 TEST_SVE(sve_adr) {
18432   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18433   START();
18434 
18435   __ Index(z0.VnD(), 0x10000000f0000000, 0x1000);
18436   __ Index(z1.VnD(), 1, 3);
18437   __ Index(z2.VnS(), -1, -1);
18438   __ Adr(z3.VnD(), SVEMemOperand(z0.VnD(), z1.VnD()));
18439   __ Adr(z4.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 1));
18440   __ Adr(z5.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 2));
18441   __ Adr(z6.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 3));
18442   __ Adr(z7.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW));
18443   __ Adr(z8.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 1));
18444   __ Adr(z9.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 2));
18445   __ Adr(z10.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 3));
18446   __ Adr(z11.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW));
18447   __ Adr(z12.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 1));
18448   __ Adr(z13.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 2));
18449   __ Adr(z14.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 3));
18450   __ Adr(z15.VnS(), SVEMemOperand(z0.VnS(), z2.VnS()));
18451   __ Adr(z16.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 1));
18452   __ Adr(z17.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 2));
18453   __ Adr(z18.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 3));
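  // Each adr lane computes base + (offset << shift), with UXTW/SXTW first
  // extending the low 32 bits of the offset. For example, the lowest lane of
  // z6 is 0x10000000f0000000 + (1 << 3), i.e. 0x10000000f0000008.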
18454 
18455   END();
18456 
18457   if (CAN_RUN()) {
18458     RUN();
18459     uint64_t expected_z3[] = {0x10000000f0001004, 0x10000000f0000001};
18460     uint64_t expected_z4[] = {0x10000000f0001008, 0x10000000f0000002};
18461     uint64_t expected_z5[] = {0x10000000f0001010, 0x10000000f0000004};
18462     uint64_t expected_z6[] = {0x10000000f0001020, 0x10000000f0000008};
18463     uint64_t expected_z7[] = {0x10000001f0000ffd, 0x10000001efffffff};
18464     uint64_t expected_z8[] = {0x10000002f0000ffa, 0x10000002effffffe};
18465     uint64_t expected_z9[] = {0x10000004f0000ff4, 0x10000004effffffc};
18466     uint64_t expected_z10[] = {0x10000008f0000fe8, 0x10000008effffff8};
18467     uint64_t expected_z11[] = {0x10000000f0000ffd, 0x10000000efffffff};
18468     uint64_t expected_z12[] = {0x10000000f0000ffa, 0x10000000effffffe};
18469     uint64_t expected_z13[] = {0x10000000f0000ff4, 0x10000000effffffc};
18470     uint64_t expected_z14[] = {0x10000000f0000fe8, 0x10000000effffff8};
18471     uint64_t expected_z15[] = {0x0ffffffcf0000ffd, 0x0ffffffeefffffff};
18472     uint64_t expected_z16[] = {0x0ffffff8f0000ffa, 0x0ffffffceffffffe};
18473     uint64_t expected_z17[] = {0x0ffffff0f0000ff4, 0x0ffffff8effffffc};
18474     uint64_t expected_z18[] = {0x0fffffe0f0000fe8, 0x0ffffff0effffff8};
18475 
18476     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
18477     ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
18478     ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
18479     ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
18480     ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
18481     ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
18482     ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
18483     ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
18484     ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
18485     ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
18486     ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
18487     ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
18488     ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
18489     ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
18490     ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
18491     ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
18492   }
18493 }
18494 
18495 // Test load-and-broadcast instructions by comparing their results with those
18496 // of a set of equivalent scalar loads.
18497 template <typename F>
18498 static void LoadBcastHelper(Test* config,
18499                             unsigned msize_in_bits,
18500                             unsigned esize_in_bits,
18501                             F sve_ld1,
18502                             bool is_signed) {
18503   VIXL_ASSERT((esize_in_bits == kBRegSize) || (esize_in_bits == kHRegSize) ||
18504               (esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
18505   static const unsigned kMaxLaneCount = kZRegMaxSize / kBRegSize;
18506 
18507   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18508   START();
18509 
18510   unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
18511   unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
18512   int vl = config->sve_vl_in_bytes();
18513 
18514   uint64_t offsets[kMaxLaneCount];
18515   uint64_t buffer_size = vl * 64;
18516   uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
18517   BufferFillingHelper(data,
18518                       buffer_size,
18519                       msize_in_bytes,
18520                       kMaxLaneCount,
18521                       offsets);
18522 
18523   for (unsigned i = 0; i < (kMaxLaneCount / 2); i++) {
18524     // Assign encodable offsets to the first half of the offset array so
18525     // that both encodable and unencodable offsets can be tested. Note that
18526     // the immediate offset field is six bits wide (see the example below).
18527     offsets[i] = (offsets[i] % (UINT64_C(1) << 6)) * msize_in_bytes;
18528   }
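  // As a concrete example of that range: for ld1rd (an msize of eight bytes),
  // the six-bit scaled immediate covers byte offsets 0 to 504 in steps of
  // eight.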
18529 
18530   ZRegister zn = z0.WithLaneSize(esize_in_bits);
18531   ZRegister zn_ref = z4.WithLaneSize(esize_in_bits);
18532 
18533   PRegisterZ pg = p0.Zeroing();
18534   Initialise(&masm,
18535              pg,
18536              0x9abcdef012345678,
18537              0xabcdef0123456789,
18538              0xf4f3f1f0fefdfcfa,
18539              0xf9f8f6f5f3f2f0ff);
18540 
18541   __ Mov(x2, data);
18542   uint64_t encodable_offset = offsets[0];
18543   // Simple check that the operation is correct for a single offset.
18544   (masm.*sve_ld1)(zn, pg, SVEMemOperand(x2, encodable_offset));
18545 
18546   // Generate a reference result using scalar loads.
18547   uint64_t address = data + encodable_offset;
18548   uint64_t duplicated_addresses[kMaxLaneCount];
18549   for (unsigned i = 0; i < kMaxLaneCount; i++) {
18550     duplicated_addresses[i] = address;
18551   }
18552 
18553   ScalarLoadHelper(&masm,
18554                    vl,
18555                    duplicated_addresses,
18556                    zn_ref,
18557                    pg,
18558                    esize_in_bits,
18559                    msize_in_bits,
18560                    is_signed);
18561 
18562   ZRegister zn_agg = z10.WithLaneSize(esize_in_bits);
18563   ZRegister zn_agg_ref = z11.WithLaneSize(esize_in_bits);
18564   ZRegister zn_temp = z12.WithLaneSize(esize_in_bits);
18565 
18566   __ Dup(zn_agg, 0);
18567   __ Dup(zn_agg_ref, 0);
18568 
18569   // Check that the operation is correct at different offsets.
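  // For each offset, extract the last active element of the broadcast load
  // with Lastb and aggregate it with Insr, building a reference vector the
  // same way from scalar loads.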
18570   for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
18571     (masm.*sve_ld1)(zn_temp, pg, SVEMemOperand(x2, offsets[i]));
18572     __ Lastb(x1, pg, zn_temp);
18573     __ Insr(zn_agg, x1);
18574 
18575     __ Mov(x3, data + offsets[i]);
18576     ScalarLoadHelper(&masm, x1, x3, msize_in_bits, is_signed);
18577     __ Insr(zn_agg_ref, x1);
18578   }
18579 
18580   END();
18581 
18582   if (CAN_RUN()) {
18583     RUN();
18584 
18585     ASSERT_EQUAL_SVE(zn_ref, zn);
18586     ASSERT_EQUAL_SVE(zn_agg_ref, zn_agg);
18587   }
18588 
18589   free(reinterpret_cast<void*>(data));
18590 }
18591 
18592 TEST_SVE(sve_ld1rb) {
18593   LoadBcastHelper(config, kBRegSize, kBRegSize, &MacroAssembler::Ld1rb, false);
18594   LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rb, false);
18595   LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rb, false);
18596   LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rb, false);
18597 }
18598 
18599 TEST_SVE(sve_ld1rh) {
18600   LoadBcastHelper(config, kHRegSize, kHRegSize, &MacroAssembler::Ld1rh, false);
18601   LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rh, false);
18602   LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rh, false);
18603 }
18604 
18605 TEST_SVE(sve_ld1rw) {
18606   LoadBcastHelper(config, kSRegSize, kSRegSize, &MacroAssembler::Ld1rw, false);
18607   LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rw, false);
18608 }
18609 
18610 TEST_SVE(sve_ld1rd) {
18611   LoadBcastHelper(config, kDRegSize, kDRegSize, &MacroAssembler::Ld1rd, false);
18612 }
18613 
18614 TEST_SVE(sve_ld1rsb) {
18615   LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rsb, true);
18616   LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rsb, true);
18617   LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rsb, true);
18618 }
18619 
18620 TEST_SVE(sve_ld1rsh) {
18621   LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rsh, true);
18622   LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rsh, true);
18623 }
18624 
18625 TEST_SVE(sve_ld1rsw) {
18626   LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rsw, true);
18627 }
18628 
18629 TEST_SVE(sve_prefetch_offset) {
18630   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18631 
18632   START();
18633 
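  // Prefetch instructions have no architecturally visible effect, so this
  // test can only check that each prefetch type and addressing mode
  // assembles and executes without fault; there are no result assertions.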
18634   __ Prfb(PLDL1KEEP, p5, SVEMemOperand(z30.VnS(), 0));
18635   __ Prfb(PLDL1STRM, p5, SVEMemOperand(x28, -11, SVE_MUL_VL));
18636   __ Prfb(PLDL2KEEP, p6, SVEMemOperand(x30, x29));
18637   __ Prfb(PLDL2STRM, p6, SVEMemOperand(x7, z12.VnS(), UXTW));
18638   __ Prfh(PSTL2KEEP, p6, SVEMemOperand(z0.VnS(), 28));
18639   __ Prfh(PSTL2STRM, p4, SVEMemOperand(x17, -3, SVE_MUL_VL));
18640   __ Prfh(PSTL3KEEP, p3, SVEMemOperand(x0, x0, LSL, 1));
18641   __ Prfh(PSTL3STRM, p4, SVEMemOperand(x20, z0.VnD(), LSL, 1));
18642   __ Prfw(PLDL1KEEP, p3, SVEMemOperand(z23.VnD(), 5));
18643   __ Prfw(PLDL1STRM, p1, SVEMemOperand(x4, 10, SVE_MUL_VL));
18644   __ Prfw(PLDL2KEEP, p2, SVEMemOperand(x22, x22, LSL, 2));
18645   __ Prfw(PLDL2STRM, p1, SVEMemOperand(x2, z6.VnS(), SXTW, 2));
18646   __ Prfd(PLDL3KEEP, p5, SVEMemOperand(z11.VnD(), 9));
18647   __ Prfd(PLDL3STRM, p3, SVEMemOperand(x0, -24, SVE_MUL_VL));
18648   __ Prfd(PSTL1KEEP, p7, SVEMemOperand(x5, x5, LSL, 3));
18649   __ Prfd(PSTL1STRM, p1, SVEMemOperand(x19, z18.VnS(), SXTW, 3));
18650 
18651   END();
18652   if (CAN_RUN()) {
18653     RUN();
18654   }
18655 }
18656 
18657 TEST_SVE(sve2_match_nmatch) {
18658   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18659 
18660   START();
18661 
18662   __ Ptrue(p0.VnB());
18663   __ Ptrue(p1.VnH());
18664   __ Ptrue(p2.VnS());
18665 
18666   // Vector to search is bytes 0 - 7, repeating every eight bytes.
18667   __ Index(z0.VnB(), 0, 1);
18668   __ Dup(z0.VnD(), z0.VnD(), 0);
18669 
18670   // Elements to find are (repeated) bytes 0 - 3 in the first segment, 4 - 7
18671   // in the second, 8 - 11 in the third, etc.
18672   __ Index(z1.VnB(), 0, 1);
18673   __ Lsr(z1.VnB(), z1.VnB(), 2);
18674 
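  // Match sets a predicate lane when the corresponding element of the first
  // source appears anywhere in the same 128-bit segment of the second source;
  // Nmatch computes the complement under the governing predicate.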
18675   __ Match(p3.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
18676   __ Match(p4.VnB(), p1.Zeroing(), z0.VnB(), z1.VnB());
18677   __ Nmatch(p0.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
18678 
18679   __ Uunpklo(z0.VnH(), z0.VnB());
18680   __ Uunpklo(z1.VnH(), z1.VnB());
18681 
18682   __ Match(p5.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
18683   __ Match(p6.VnH(), p2.Zeroing(), z0.VnH(), z1.VnH());
18684   __ Nmatch(p1.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
18685 
18686   END();
18687   if (CAN_RUN()) {
18688     RUN();
18689 
18690     int p3_exp[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
18691                     0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
18692     ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
18693     int p4_exp[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
18694                     0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
18695     ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
18696     int p0_exp[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
18697                     1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
18698     ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
18699 
18700     int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
18701                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1};
18702     ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
18703     int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
18704                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
18705     ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
18706     int p1_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
18707                     0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0};
18708     ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
18709   }
18710 }
18711 
18712 TEST_SVE(sve2_saba_uaba) {
18713   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18714 
18715   START();
18716 
18717   __ Index(z0.VnB(), 0, 1);
18718   __ Dup(z1.VnB(), 0xff);
18719   __ Dup(z2.VnB(), 1);
18720   __ Uaba(z2.VnB(), z2.VnB(), z0.VnB(), z1.VnB());
18721   __ Index(z0.VnB(), 0, -1);
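  // Uaba accumulates the unsigned absolute difference: z2[i] = 1 + |i - 0xff|
  // = 0x100 - i, which wraps to -i, so z2 should now equal Index(0, -1).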
18722 
18723   __ Index(z3.VnH(), 0, 1);
18724   __ Index(z4.VnH(), 1, 1);
18725   __ Uaba(z3.VnH(), z3.VnH(), z3.VnH(), z4.VnH());
18726 
18727   __ Index(z5.VnS(), 3, 6);
18728   __ Index(z6.VnS(), 5, 6);
18729   __ Uaba(z5.VnS(), z5.VnS(), z5.VnS(), z6.VnS());
18730 
18731   __ Index(z7.VnD(), 424, 12);
18732   __ Index(z8.VnD(), 4242, 12);
18733   __ Uaba(z7.VnD(), z7.VnD(), z7.VnD(), z8.VnD());
18734 
18735   __ Index(z9.VnH(), -1, -1);
18736   __ Dup(z10.VnB(), 0);
18737   __ Saba(z10.VnB(), z10.VnB(), z9.VnB(), z10.VnB());
18738   __ Index(z11.VnH(), 0x0101, 1);
18739 
18740   __ Index(z12.VnH(), 0, 1);
18741   __ Index(z13.VnH(), 0, -1);
18742   __ Saba(z13.VnH(), z13.VnH(), z12.VnH(), z13.VnH());
18743 
18744   __ Index(z14.VnS(), 0, 2);
18745   __ Index(z15.VnS(), 0, -2);
18746   __ Saba(z15.VnS(), z15.VnS(), z14.VnS(), z15.VnS());
18747 
18748   __ Index(z16.VnD(), 0, 42);
18749   __ Index(z17.VnD(), 0, -42);
18750   __ Saba(z17.VnD(), z17.VnD(), z16.VnD(), z17.VnD());
18751 
18752   END();
18753 
18754   if (CAN_RUN()) {
18755     RUN();
18756 
18757     ASSERT_EQUAL_SVE(z0, z2);
18758     ASSERT_EQUAL_SVE(z3, z4);
18759     ASSERT_EQUAL_SVE(z5, z6);
18760     ASSERT_EQUAL_SVE(z7, z8);
18761 
18762     ASSERT_EQUAL_SVE(z10, z11);
18763     ASSERT_EQUAL_SVE(z12, z13);
18764     ASSERT_EQUAL_SVE(z14, z15);
18765     ASSERT_EQUAL_SVE(z16, z17);
18766   }
18767 }
18768 
18769 TEST_SVE(sve2_integer_multiply_long_vector) {
18770   // The test only checks Sqdmull[b|t] and Pmull[b|t], since the other
18771   // instructions in the group operate on their elements in the same way.
18772   int32_t zn_inputs_s[] =
18773       {1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
18774 
18775   int32_t zm_inputs_s[] =
18776       {1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
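  // Sqdmull[b|t] doubles the product of the even (bottom) or odd (top) source
  // elements and widens, saturating to the destination element range:
  // e.g. 2 * -2 * 2 = -8, while 2 * INT32_MIN * INT32_MIN saturates to
  // INT64_MAX.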
18777   int64_t sqdmullb_vec_expected_d[] =
18778       {-8, -32, -72, -128, RawbitsToInt64(0x8000000100000000), INT64_MAX};
18779 
18780   uint64_t sqdmullt_vec_expected_d[] =
18781       {2, 18, 50, 98, 0x8000000100000000, 0x7ffffffe00000002};
18782 
18783   uint64_t pmullb_vec_expected_d[] = {0x00000001fffffffc,
18784                                       0x00000003fffffff0,
18785                                       0x000000020000001c,
18786                                       0x00000007ffffffc0,
18787                                       0x3fffffff80000000,
18788                                       0x4000000000000000};
18789 
18790   uint64_t pmullt_vec_expected_d[] = {0x05,
18791                                       0x11,
18792                                       0x15,
18793                                       0x3fffffff80000000,
18794                                       0x1555555555555555};
18795 
18796   uint64_t sqdmullb_idx_expected_d[] = {0xfffffffffffffff8,
18797                                         0xfffffffffffffff0,
18798                                         0xffffffffffffffb8,
18799                                         0xffffffffffffffa0,
18800                                         0x8000000100000000,
18801                                         INT64_MAX};
18802 
18803   uint64_t sqdmullt_idx_expected_d[] =
18804       {8,                    // 2 * zn[11] * zm[8] = 2 * 4 * 1
18805        24,                   // 2 * zn[9] * zm[8] = 2 * 4 * 3
18806        80,                   // 2 * zn[7] * zm[4] = 2 * 8 * 5
18807        112,                  // 2 * zn[5] * zm[4] = 2 * 8 * 7
18808        0x7fffffffffffffff,   // 2 * zn[3] * zm[0]
18809        0x8000000100000000};  // 2 * zn[1] * zm[0]
18810 
18811   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18812   START();
18813 
18814   InsrHelper(&masm, z31.VnS(), zn_inputs_s);
18815   InsrHelper(&masm, z30.VnS(), zm_inputs_s);
18816 
18817   __ Sqdmullb(z1.VnD(), z31.VnS(), z30.VnS());
18818   __ Sqdmullt(z2.VnD(), z31.VnS(), z30.VnS());
18819 
18820   __ Pmullb(z3.VnD(), z31.VnS(), z30.VnS());
18821   __ Pmullt(z4.VnD(), z31.VnS(), z30.VnS());
18822 
18823   __ Mov(z7, z30);
18824   __ Mov(z8, z31);
18825   __ Sqdmullb(z5.VnD(), z8.VnS(), z7.VnS(), 2);
18826   __ Sqdmullt(z6.VnD(), z8.VnS(), z7.VnS(), 0);
18827 
18828   END();
18829 
18830   if (CAN_RUN()) {
18831     RUN();
18832 
18833     ASSERT_EQUAL_SVE(sqdmullb_vec_expected_d, z1.VnD());
18834     ASSERT_EQUAL_SVE(sqdmullt_vec_expected_d, z2.VnD());
18835     ASSERT_EQUAL_SVE(pmullb_vec_expected_d, z3.VnD());
18836     ASSERT_EQUAL_SVE(pmullt_vec_expected_d, z4.VnD());
18837     ASSERT_EQUAL_SVE(sqdmullb_idx_expected_d, z5.VnD());
18838     ASSERT_EQUAL_SVE(sqdmullt_idx_expected_d, z6.VnD());
18839   }
18840 }
18841 
18842 TEST_SVE(sve2_integer_multiply_add_long_vector) {
18843   int32_t zn_inputs_s[] =
18844       {1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
18845 
18846   int32_t zm_inputs_s[] =
18847       {1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
18848 
18849   int64_t sqdmlalb_vec_expected_d[] =
18850       {-3, -28, -69, -126, RawbitsToInt64(0x8000000100000001), INT64_MAX};
18851 
18852   int64_t sqdmlalt_vec_expected_d[] = {-3,
18853                                        14,
18854                                        47,
18855                                        96,
18856                                        RawbitsToInt64(0x80000000ffffffff),
18857                                        static_cast<int64_t>(
18858                                            0x7ffffffe00000002)};
18859 
18860   int64_t sqdmlalb_idx_expected_d[] =
18861       {-11,   // za.d[5] + 2 * zn.s[10] * zm.s[8] = 5 + 2 * -2 * 4
18862        -28,   // za.d[4] + 2 * zn.s[8] * zm.s[8] = 4 + 2 * -4 * 4
18863        -93,   // za.d[3] + 2 * zn.s[6] * zm.s[4] = 3 + 2 * -6 * 8
18864        -126,  // za.d[2] + 2 * zn.s[4] * zm.s[4] = 2 + 2 * -8 * 8
18865        RawbitsToInt64(0x8000000100000001),
18866        INT64_MAX};
18867 
18868   int64_t sqdmlalt_idx_expected_d[] =
18869       {1,   // za.d[5] + 2 * zn.s[11] * zm.s[9] = -5 + 2 * 1 * 3
18870        14,  // za.d[4] + 2 * zn.s[9] * zm.s[9] = -4 + 2 * 3 * 3
18871        67,  // za.d[3] + 2 * zn.s[7] * zm.s[5] = -3 + 2 * 5 * 7
18872        96,  // za.d[2] + 2 * zn.s[5] * zm.s[5] = -2 + 2 * 7 * 7
18873        RawbitsToInt64(0x80000000ffffffff),
18874        static_cast<int64_t>(0x7ffffffe00000002)};
18875 
18876   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18877   START();
18878 
18879   InsrHelper(&masm, z0.VnS(), zn_inputs_s);
18880   InsrHelper(&masm, z1.VnS(), zm_inputs_s);
18881   __ Index(z2.VnD(), 0, 1);
18882   __ Index(z3.VnD(), 0, -1);
18883 
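  // Accumulate with Sqdmlal[b|t], then subtract the same doubled products
  // with Sqdmlsl[b|t]; the results should round-trip back to the original
  // accumulator values in z2 and z3.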
18884   __ Mov(z31, z2);
18885   __ Sqdmlalb(z31.VnD(), z31.VnD(), z0.VnS(), z1.VnS());
18886   __ Mov(z30, z3);
18887   __ Sqdmlalt(z30.VnD(), z30.VnD(), z0.VnS(), z1.VnS());
18888   __ Mov(z29, z31);
18889   __ Sqdmlslb(z29.VnD(), z29.VnD(), z0.VnS(), z1.VnS());
18890   __ Mov(z28, z30);
18891   __ Sqdmlslt(z28.VnD(), z28.VnD(), z0.VnS(), z1.VnS());
18892 
18893   __ Sqdmlalb(z27.VnD(), z2.VnD(), z0.VnS(), z1.VnS());
18894   __ Sqdmlalt(z26.VnD(), z3.VnD(), z0.VnS(), z1.VnS());
18895   __ Sqdmlslb(z25.VnD(), z27.VnD(), z0.VnS(), z1.VnS());
18896   __ Sqdmlslt(z24.VnD(), z26.VnD(), z0.VnS(), z1.VnS());
18897 
18898   __ Mov(z23, z2);
18899   __ Sqdmlalb(z23.VnD(), z23.VnD(), z0.VnS(), z1.VnS(), 0);
18900   __ Mov(z22, z3);
18901   __ Sqdmlalt(z22.VnD(), z22.VnD(), z0.VnS(), z1.VnS(), 1);
18902   __ Mov(z21, z23);
18903   __ Sqdmlslb(z21.VnD(), z21.VnD(), z0.VnS(), z1.VnS(), 0);
18904   __ Mov(z20, z22);
18905   __ Sqdmlslt(z20.VnD(), z20.VnD(), z0.VnS(), z1.VnS(), 1);
18906 
18907 
18908   END();
18909 
18910   if (CAN_RUN()) {
18911     RUN();
18912 
18913     ASSERT_EQUAL_SVE(sqdmlalb_vec_expected_d, z31.VnD());
18914     ASSERT_EQUAL_SVE(sqdmlalt_vec_expected_d, z30.VnD());
18915     ASSERT_EQUAL_SVE(z2, z29);
18916     ASSERT_EQUAL_SVE(z3, z28);
18917 
18918     ASSERT_EQUAL_SVE(z31, z27);
18919     ASSERT_EQUAL_SVE(z30, z26);
18920     ASSERT_EQUAL_SVE(z29, z25);
18921     ASSERT_EQUAL_SVE(z28, z24);
18922 
18923     ASSERT_EQUAL_SVE(sqdmlalb_idx_expected_d, z23.VnD());
18924     ASSERT_EQUAL_SVE(sqdmlalt_idx_expected_d, z22.VnD());
18925     ASSERT_EQUAL_SVE(z2, z21);
18926     ASSERT_EQUAL_SVE(z3, z20);
18927   }
18928 }
18929 
18930 TEST_SVE(sve2_ldnt1) {
18931   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18932   START();
18933 
18934   int data_size = kZRegMaxSizeInBytes * 4;
18935   uint8_t* data = new uint8_t[data_size];
18936   for (int i = 0; i < data_size; i++) {
18937     data[i] = i & 0xff;
18938   }
18939 
18940   // Set the base half-way through the buffer so we can use negative indices.
18941   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
18942   __ Index(z30.VnD(), x0, 1);
18943   __ Ptrue(p0.VnB());
18944   __ Punpklo(p1.VnH(), p0.VnB());
18945   __ Punpklo(p2.VnH(), p1.VnB());
18946   __ Punpklo(p3.VnH(), p2.VnB());
18947   __ Punpklo(p4.VnH(), p3.VnB());
18948 
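  // The SVE2 non-temporal gathers take a vector base plus a scalar offset,
  // while the classic gathers take a scalar base plus a vector index. With
  // the same values in z30 and x1, both forms address identical memory, so
  // each Ldnt1* result must match its Ld1* counterpart.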
18949   __ Mov(x1, 1);
18950   __ Ldnt1b(z0.VnD(), p1.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18951   __ Ld1b(z1.VnD(), p1.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18952 
18953   __ Mov(x1, -4);
18954   __ Ldnt1h(z2.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18955   __ Ld1h(z3.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18956 
18957   __ Mov(x1, 16);
18958   __ Ldnt1w(z4.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18959   __ Ld1w(z5.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18960 
18961   __ Mov(x1, -16);
18962   __ Ldnt1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18963   __ Ld1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18964 
18965   __ Mov(x1, 1);
18966   __ Ldnt1sb(z8.VnD(), p0.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18967   __ Ld1sb(z9.VnD(), p0.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18968 
18969   __ Mov(x1, -4);
18970   __ Ldnt1sh(z10.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18971   __ Ld1sh(z11.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18972 
18973   __ Mov(x1, 16);
18974   __ Ldnt1sw(z12.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18975   __ Ld1sw(z13.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18976 
18977   END();
18978 
18979   if (CAN_RUN()) {
18980     RUN();
18981     ASSERT_EQUAL_SVE(z0, z1);
18982     ASSERT_EQUAL_SVE(z2, z3);
18983     ASSERT_EQUAL_SVE(z4, z5);
18984     ASSERT_EQUAL_SVE(z6, z7);
18985     ASSERT_EQUAL_SVE(z8, z9);
18986     ASSERT_EQUAL_SVE(z10, z11);
18987     ASSERT_EQUAL_SVE(z12, z13);
18988   }
  delete[] data;
18989 }
18990 
18991 TEST_SVE(sve2_stnt1) {
18992   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18993   START();
18994 
18995   int data_size = kZRegMaxSizeInBytes * 4;
18996   uint8_t* data = new uint8_t[data_size];
18997 
18998   // Set the base half-way through the buffer so we can use negative indices.
18999   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
19000   __ Ptrue(p0.VnB());
19001   __ Punpklo(p1.VnH(), p0.VnB());
19002   __ Punpklo(p2.VnH(), p1.VnB());
19003   __ Punpklo(p3.VnH(), p2.VnB());
19004   __ Punpklo(p4.VnH(), p3.VnB());
19005   __ Dup(z0.VnB(), 0xaa);
19006   __ Dup(z1.VnB(), 0x55);
19007   __ Rdvl(x1, 1);
19008   __ Mov(x3, 0);
19009 
19010   // Put store addresses into z30, and a small offset in x4.
19011   __ Index(z30.VnD(), x0, 1);
19012   __ Mov(x4, 2);
19013 
19014   // Store an entire vector of 0xaa to the buffer, then a smaller scatter store
19015   // of 0x55 using Stnt1b.
19016   __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19017   __ Stnt1b(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19018 
19019   // Load the entire vector back from the buffer.
19020   __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19021 
19022   // Construct a predicate that reflects the number of bytes stored by Stnt1b,
19023   // based on the current VL, and use Sel to obtain a reference vector for
19024   // comparison.
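  // Stnt1b stores one byte per D-lane, i.e. VL/8 bytes; x1 holds VL in bytes,
  // so x2 = x1 >> 3 is the number of bytes written by the scatter.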
19025   __ Lsr(x2, x1, 3);
19026   __ Whilelo(p5.VnB(), x3, x2);
19027   __ Sel(z3.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19028 
19029   // Repeat for larger element sizes.
19030   __ Mov(x4, -4);
19031   __ Index(z30.VnD(), x0, 2);
19032   __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19033   __ Stnt1h(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19034   __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19035   __ Lsr(x2, x1, 2);
19036   __ Whilelo(p5.VnB(), x3, x2);
19037   __ Sel(z5.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19038 
19039   __ Mov(x4, 16);
19040   __ Index(z30.VnD(), x0, 4);
19041   __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19042   __ Stnt1w(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19043   __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19044   __ Lsr(x2, x1, 1);
19045   __ Whilelo(p5.VnB(), x3, x2);
19046   __ Sel(z7.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19047 
19048   __ Mov(x4, -16);
19049   __ Index(z30.VnD(), x0, 8);
19050   __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19051   __ Stnt1d(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19052   __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19053   __ Whilelo(p5.VnB(), x3, x1);
19054   __ Sel(z9.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19055   END();
19056 
19057   if (CAN_RUN()) {
19058     RUN();
19059     ASSERT_EQUAL_SVE(z2, z3);
19060     ASSERT_EQUAL_SVE(z4, z5);
19061     ASSERT_EQUAL_SVE(z6, z7);
19062     ASSERT_EQUAL_SVE(z8, z9);
19063   }
  delete[] data;
19064 }
19065 
19066 TEST_SVE(sve2_while_simple) {
19067   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19068 
19069   START();
19070   __ Mov(x0, 1);
19071   __ Mov(x1, 0);
19072   __ Mov(x2, 3);
19073 
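  // The SVE2 descending While instructions activate one lane per value
  // counting down from the first operand while it compares higher/greater
  // than (or equal to) the second: e.g. Whilehi(p2, 3, 0) activates three
  // lanes (for 3, 2 and 1), while Whilehs(p3, 3, 0) wraps around unsigned
  // zero and so produces an all-true predicate.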
19074   __ Whilehi(p0.VnB(), x0, x1);
19075   __ Whilehs(p1.VnB(), x0, x1);
19076   __ Whilehi(p2.VnB(), x2, x1);
19077   __ Whilehs(p3.VnB(), x2, x1);
19078   __ Whilehi(p4.VnB(), x2, x0);
19079   __ Whilehs(p5.VnB(), x2, x0);
19080 
19081   __ Whilegt(p6.VnB(), x0, x1);
19082   __ Whilege(p7.VnB(), x0, x1);
19083   __ Whilegt(p8.VnB(), x2, x1);
19084   __ Whilege(p9.VnB(), x2, x1);
19085   __ Whilegt(p10.VnB(), x2, x0);
19086   __ Whilege(p11.VnB(), x2, x0);
19087 
19088   __ Mov(x4, 0x80000000);
19089   __ Mov(x5, 0x80000001);
19090   __ Whilege(p12.VnB(), w5, w4);
19091   __ Whilegt(p13.VnB(), w5, w4);
19092 
19093   __ Mov(x6, 0x8000000000000000);
19094   __ Mov(x7, 0x8000000000000001);
19095   __ Whilege(p14.VnB(), x7, x6);
19096   __ Whilegt(p15.VnB(), x7, x6);
19097 
19098   for (int i = 0; i < 16; i++) {
19099     __ Rev(PRegister(i).VnB(), PRegister(i).VnB());
19100   }
19101 
19102   END();
19103 
19104   if (CAN_RUN()) {
19105     RUN();
19106     int p0_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19107     int p1_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19108     int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19109     int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19110     int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
19111     int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19112     int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19113     int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
19114     int p8_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19115     int p9_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
19116     int p10_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
19117     int p11_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19118     int p12_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19119     int p13_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19120     int p14_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19121     int p15_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19122 
19123     ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
19124     ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
19125     ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
19126     ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
19127     ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
19128     ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
19129     ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
19130     ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
19131     ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
19132     ASSERT_EQUAL_SVE(p9_exp, p9.VnB());
19133     ASSERT_EQUAL_SVE(p10_exp, p10.VnB());
19134     ASSERT_EQUAL_SVE(p11_exp, p11.VnB());
19135     ASSERT_EQUAL_SVE(p12_exp, p12.VnB());
19136     ASSERT_EQUAL_SVE(p13_exp, p13.VnB());
19137     ASSERT_EQUAL_SVE(p14_exp, p14.VnB());
19138     ASSERT_EQUAL_SVE(p15_exp, p15.VnB());
19139   }
19140 }
19141 
19142 TEST_SVE(sve2_whilerw_whilewr_simple) {
19143   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19144 
19145   START();
19146   __ Mov(x0, 0);
19147   __ Mov(x1, 1);
19148   __ Mov(x2, 3);
19149 
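  // Whilewr and Whilerw generate a predicate limiting the active lanes so
  // that simultaneous vector accesses to the two addresses cannot conflict:
  // equal pointers (or, for Whilewr, a store address not after the load
  // address) give an all-true predicate, while a small forward distance
  // restricts the active elements to that many bytes.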
19150   __ Whilerw(p0.VnB(), x0, x0);
19151   __ Whilerw(p1.VnB(), x0, x1);
19152   __ Whilerw(p2.VnB(), x1, x0);
19153 
19154   __ Whilewr(p3.VnB(), x0, x0);
19155   __ Whilewr(p4.VnB(), x0, x1);
19156   __ Whilewr(p5.VnB(), x1, x0);
19157 
19158   __ Whilewr(p6.VnH(), x1, x1);
19159   __ Whilewr(p7.VnH(), x1, x2);
19160   __ Whilewr(p8.VnH(), x2, x1);
19161 
19162   END();
19163 
19164   if (CAN_RUN()) {
19165     RUN();
19166     int p0_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19167     ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
19168     int p1_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19169     ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
19170     int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19171     ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
19172     int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19173     ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
19174     int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19175     ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
19176     int p5_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19177     ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
19178     int p6_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
19179     ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
19180     int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19181     ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
19182     int p8_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
19183     ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
19184   }
19185 }
19186 
19187 TEST_SVE(sve2_sqrdcmlah) {
19188   int32_t zn_inputs[] = {-1, -2, -3, -4, 1, 2, 3, 4};
19189   int32_t zm_inputs[] = {-1, -2, 3, 4, 1, 2, -3, -4};
19190   int32_t za_inputs[] = {1, 2, 3, 4, 5, 6, 7, 8};
19191   int32_t zd_000_expected[] =
19192       {1025, 2050, -6141, -8188, 1029, 2054, -6137, -8184};
19193   int32_t zd_090_expected[] =
19194       {1025, -510, -6141, 4612, 1029, -506, -6137, 4616};
19195   int32_t zd_180_expected[] =
19196       {-1023, -2046, 6147, 8196, -1019, -2042, 6151, 8200};
19197   int32_t zd_270_expected[] =
19198       {-1023, 514, 6147, -4604, -1019, 518, 6151, -4600};
19199   int32_t zd_0_270_expected[] =
19200       {2049, -1534, 6147, -4604, 2053, -1530, 6151, -4600};
19201   int32_t zd_3_090_expected[] =
19202       {1025, -510, 3075, -1532, 1029, -506, 3079, -1528};
19203 
19204   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19205   START();
19206 
19207   InsrHelper(&masm, z0.VnS(), zn_inputs);
19208   InsrHelper(&masm, z1.VnS(), zm_inputs);
19209   InsrHelper(&masm, z31.VnS(), za_inputs);
19210 
19211   // The operand values are small, so shift them left so that the products are
19212   // large enough to affect the high half of the result in the destination.
19213   int shift = 20;
19214   __ Lsl(z0.VnS(), z0.VnS(), shift);
19215   __ Lsl(z1.VnS(), z1.VnS(), shift);
19216 
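  // Sqrdcmlah treats each pair of lanes as a complex number (even lane real,
  // odd lane imaginary); the rotation selects which source parts are
  // multiplied and whether the products are added or subtracted before the
  // rounding, doubling, saturating accumulation.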
19217   __ Mov(z10, z31);
19218   __ Sqrdcmlah(z10.VnS(), z10.VnS(), z0.VnS(), z1.VnS(), 0);
19219 
19220   __ Mov(z11, z31);
19221   __ Sqrdcmlah(z11.VnS(), z11.VnS(), z0.VnS(), z1.VnS(), 90);
19222 
19223   __ Mov(z12, z31);
19224   __ Sqrdcmlah(z12.VnS(), z12.VnS(), z0.VnS(), z1.VnS(), 180);
19225 
19226   __ Mov(z13, z31);
19227   __ Sqrdcmlah(z13.VnS(), z13.VnS(), z0.VnS(), z1.VnS(), 270);
19228 
19229   __ Sqrdcmlah(z14.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 0);
19230   __ Sqrdcmlah(z15.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 90);
19231   __ Sqrdcmlah(z16.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 180);
19232   __ Sqrdcmlah(z17.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 270);
19233 
19234   __ Mov(z18, z31);
19235   __ Sqrdcmlah(z18.VnS(), z18.VnS(), z0.VnS(), z1.VnS(), 0, 270);
19236 
19237   __ Mov(z19, z31);
19238   __ Sqrdcmlah(z19.VnS(), z19.VnS(), z0.VnS(), z1.VnS(), 1, 90);
19239 
19240   END();
19241 
19242   if (CAN_RUN()) {
19243     RUN();
19244 
19245     ASSERT_EQUAL_SVE(zd_000_expected, z10.VnS());
19246     ASSERT_EQUAL_SVE(zd_090_expected, z11.VnS());
19247     ASSERT_EQUAL_SVE(zd_180_expected, z12.VnS());
19248     ASSERT_EQUAL_SVE(zd_270_expected, z13.VnS());
19249 
19250     ASSERT_EQUAL_SVE(z14, z10);
19251     ASSERT_EQUAL_SVE(z15, z11);
19252     ASSERT_EQUAL_SVE(z16, z12);
19253     ASSERT_EQUAL_SVE(z17, z13);
19254 
19255     ASSERT_EQUAL_SVE(zd_0_270_expected, z18.VnS());
19256     ASSERT_EQUAL_SVE(zd_3_090_expected, z19.VnS());
19257   }
19258 }
19259 
19260 TEST_SVE(sve2_sqrdmlah) {
19261   uint16_t zn_inputs_h[] = {0x7ffe, 0x7ffd, 0x7ffd, 0x7ffd, 0x8000,
19262                             0x7fff, 0x7ffe, 0x7ffe, 0x8001, 0x8000,
19263                             0x7ffd, 0x7ffd, 0x7ffd, 0x5555, 0x5555,
19264                             0x5555, 0x8000, 0x8000, 0xaaaa, 0x8001};
19265 
19266   uint16_t zm_inputs_h[] = {0x7ffd, 0x7fff, 0x7ffe, 0x7ffd, 0x8001,
19267                             0x7fff, 0x7fff, 0x7ffe, 0x8000, 0x8000,
19268                             0xaaaa, 0x0001, 0x0001, 0xaaaa, 0xaaaa,
19269                             0xcccc, 0x8000, 0x8000, 0x8000, 0x8001};
19270 
19271   uint16_t za_inputs_h[] = {0x1010, 0x1010, 0x1010, 0x1010, 0x1010,
19272                             0x1010, 0x1010, 0x1010, 0x8000, 0x8011,
19273                             0x8006, 0xff7d, 0xfeff, 0xaabc, 0xaabb,
19274                             0x9c72, 0x8000, 0x0000, 0x8000, 0xffff};
19275 
19276   uint16_t zd_expected_h[] = {0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
19277                               0x7fff, 0x7fff, 0x7fff, 0xffff, 0x0011,
19278                               0x8000, 0xff7e, 0xff00, 0x8000, 0x8000,
19279                               0x8000, 0x0000, 0x7fff, 0xd556, 0x7ffd};
19280 
19281   uint32_t zn_inputs_s[] = {0x04000000,
19282                             0x80000000,
19283                             0x04000000,
19284                             0x80000000,
19285                             0x80000000,
19286                             0x80000001,
19287                             0x7fffffff,
19288                             0x80000000,
19289                             0x7ffffffe,
19290                             0x7ffffffd,
19291                             0x7ffffffd,
19292                             0x7ffffffd};
19293 
19294   uint32_t zm_inputs_s[] = {0x00000020,
19295                             0x80000000,
19296                             0x00000010,
19297                             0x80000000,
19298                             0x7fffffff,
19299                             0x80000000,
19300                             0x80000000,
19301                             0x80000001,
19302                             0x7ffffffd,
19303                             0x7fffffff,
19304                             0x7ffffffe,
19305                             0x7ffffffd};
19306 
19307   uint32_t za_inputs_s[] = {0x00000000,
19308                             0x00000000,
19309                             0x00000020,
19310                             0x00108000,
19311                             0x00000000,
19312                             0x00000001,
19313                             0x00000000,
19314                             0x00000001,
19315                             0x10101010,
19316                             0x10101010,
19317                             0x10101010,
19318                             0x10101010};
19319 
19320   uint32_t zd_expected_s[] = {0x00000001,
19321                               0x7fffffff,
19322                               0x00000021,
19323                               0x7fffffff,
19324                               0x80000001,
19325                               0x7fffffff,
19326                               0x80000001,
19327                               0x7fffffff,
19328                               0x7fffffff,
19329                               0x7fffffff,
19330                               0x7fffffff,
19331                               0x7fffffff};
19332 
19333   uint64_t zn_inputs_d[] = {0x0400000000000000, 0x8000000000000000,
19334                             0x0400000000000000, 0x8000000000000000,
19335                             0x8000000000000000, 0x8000000000000001,
19336                             0x7fffffffffffffff, 0x8000000000000000,
19337                             0x7ffffffffffffffe, 0x7ffffffffffffffd,
19338                             0x7ffffffffffffffd, 0x7ffffffffffffffd,
19339                             0xf1299accc9186169, 0xd529d2675ee9da21,
19340                             0x1a10b5d60b92dcf9, 0xfb1d358e0e6455b1,
19341                             0x8eb7721078bdc589, 0x4171509750ded141,
19342                             0x8eb7721078bdc589, 0x4171509750ded141};
19343 
19344   uint64_t zm_inputs_d[] = {0x0000000000000020, 0x8000000000000000,
19345                             0x0000000000000010, 0x8000000000000000,
19346                             0x7fffffffffffffff, 0x8000000000000000,
19347                             0x8000000000000000, 0x8000000000000001,
19348                             0x7ffffffffffffffd, 0x7fffffffffffffff,
19349                             0x7ffffffffffffffe, 0x7ffffffffffffffd,
19350                             0x30b940efe73f180e, 0x3bc1ff1e52a99b66,
19351                             0x40de5c9793535a5e, 0x24752faf47bdddb6,
19352                             0x162663016b07e5ae, 0x1de34b56f3d22006,
19353                             0x8eb7721078bdc589, 0x4171509750ded141};
19354 
19355   uint64_t za_inputs_d[] = {0x0000000000000000, 0x0000000000000000,
19356                             0x0000000000000020, 0x0010108000000000,
19357                             0x0000000000000000, 0x0000000000000001,
19358                             0x0000000000000000, 0x0000000000000001,
19359                             0x1010101010101010, 0x1010101010101010,
19360                             0x1010101010101010, 0x1010101010101010,
19361                             0xb18253371b2c2c77, 0xa70de31e6645eaef,
19362                             0xda817198c0318487, 0x9fd9e6b8e04b42ff,
19363                             0xced1f6b7119ab197, 0x01ae051a85509b0f,
19364                             0x01a211e9352f7927, 0x7667b70a5b13749f};
19365 
19366   uint64_t zd_expected_d[] = {0x0000000000000001, 0x7fffffffffffffff,
19367                               0x0000000000000021, 0x7fffffffffffffff,
19368                               0x8000000000000001, 0x7fffffffffffffff,
19369                               0x8000000000000001, 0x7fffffffffffffff,
19370                               0x7fffffffffffffff, 0x7fffffffffffffff,
19371                               0x7fffffffffffffff, 0x7fffffffffffffff,
19372                               0xabdc73dea0d72a35, 0x930e3dc877301966,
19373                               0xe7b7145a059f8a9f, 0x9e75a4a9d10cf8af,
19374                               0xbb378528642d2581, 0x10f5e6d693ffddf3,
19375                               0x65e455a46adc091c, 0x7fffffffffffffff};
19376 
19377   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19378   START();
19379 
19380   InsrHelper(&masm, z0.VnH(), zn_inputs_h);
19381   InsrHelper(&masm, z1.VnH(), zm_inputs_h);
19382   InsrHelper(&masm, z2.VnH(), za_inputs_h);
19383 
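  // For N-bit elements, Sqrdmlah computes
  //   zd[i] = sat(((za[i] << N) + 2 * zn[i] * zm[i] + (1 << (N - 1))) >> N),
  // i.e. a rounding doubling multiply returning the saturated high half,
  // accumulated into za.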
19384   __ Sqrdmlah(z2.VnH(), z2.VnH(), z0.VnH(), z1.VnH());
19385 
19386   InsrHelper(&masm, z3.VnS(), zn_inputs_s);
19387   InsrHelper(&masm, z4.VnS(), zm_inputs_s);
19388   InsrHelper(&masm, z5.VnS(), za_inputs_s);
19389 
19390   __ Sqrdmlah(z5.VnS(), z5.VnS(), z3.VnS(), z4.VnS());
19391 
19392   InsrHelper(&masm, z6.VnD(), zn_inputs_d);
19393   InsrHelper(&masm, z7.VnD(), zm_inputs_d);
19394   InsrHelper(&masm, z8.VnD(), za_inputs_d);
19395 
19396   __ Sqrdmlah(z8.VnD(), z8.VnD(), z6.VnD(), z7.VnD());
19397 
19398   END();
19399 
19400   if (CAN_RUN()) {
19401     RUN();
19402     ASSERT_EQUAL_SVE(zd_expected_h, z2.VnH());
19403     ASSERT_EQUAL_SVE(zd_expected_s, z5.VnS());
19404     ASSERT_EQUAL_SVE(zd_expected_d, z8.VnD());
19405   }
19406 }
19407 
19408 TEST_SVE(sve2_cmla) {
19409   int32_t zn_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
19410   int32_t zm_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
19411   int32_t zda_inputs_s[] = {1, 2, 3, 4, 5, 6, 7, 8};
19412   int32_t zd_000_expected[] = {9, 18, 51, 68, 13, 22, 55, 72};
19413   int32_t zd_090_expected[] = {9, -2, 51, -32, 13, 2, 55, -28};
19414   int32_t zd_180_expected[] = {-7, -14, -45, -60, -3, -10, -41, -56};
19415   int32_t zd_270_expected[] = {-7, 6, -45, 40, -3, 10, -41, 44};
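  // With rotation 0, Cmla accumulates n_real * m_real into the real (even)
  // lane and n_real * m_imag into the imaginary (odd) lane: for the final
  // pair, real: 8 + 8 * 8 = 72 and imag: 7 + 8 * 6 = 55, matching
  // zd_000_expected.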
19416 
19417   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19418   START();
19419 
19420   InsrHelper(&masm, z31.VnS(), zn_inputs_s);
19421   InsrHelper(&masm, z30.VnS(), zm_inputs_s);
19422 
19423   InsrHelper(&masm, z0.VnS(), zda_inputs_s);
19424   __ Mov(z29, z0);
19425   __ Cmla(z0.VnS(), z0.VnS(), z31.VnS(), z30.VnS(), 0);
19426 
19427   InsrHelper(&masm, z1.VnS(), zda_inputs_s);
19428   __ Mov(z28, z1);
19429   __ Cmla(z1.VnS(), z1.VnS(), z31.VnS(), z30.VnS(), 90);
19430 
19431   InsrHelper(&masm, z2.VnS(), zda_inputs_s);
19432   __ Mov(z27, z2);
19433   __ Cmla(z2.VnS(), z2.VnS(), z31.VnS(), z30.VnS(), 180);
19434 
19435   InsrHelper(&masm, z3.VnS(), zda_inputs_s);
19436   __ Mov(z26, z3);
19437   __ Cmla(z3.VnS(), z3.VnS(), z31.VnS(), z30.VnS(), 270);
19438 
19439   __ Cmla(z4.VnS(), z29.VnS(), z31.VnS(), z30.VnS(), 0);
19440   __ Cmla(z5.VnS(), z28.VnS(), z31.VnS(), z30.VnS(), 90);
19441   __ Cmla(z6.VnS(), z27.VnS(), z31.VnS(), z30.VnS(), 180);
19442   __ Cmla(z7.VnS(), z26.VnS(), z31.VnS(), z30.VnS(), 270);
19443 
19444   END();
19445 
19446   if (CAN_RUN()) {
19447     RUN();
19448 
19449     ASSERT_EQUAL_SVE(zd_000_expected, z0.VnS());
19450     ASSERT_EQUAL_SVE(zd_090_expected, z1.VnS());
19451     ASSERT_EQUAL_SVE(zd_180_expected, z2.VnS());
19452     ASSERT_EQUAL_SVE(zd_270_expected, z3.VnS());
19453 
19454     ASSERT_EQUAL_SVE(z4, z0);
19455     ASSERT_EQUAL_SVE(z5, z1);
19456     ASSERT_EQUAL_SVE(z6, z2);
19457     ASSERT_EQUAL_SVE(z7, z3);
19458   }
19459 }
19460 
19461 TEST_SVE(sve2_integer_saturating_multiply_add_long) {
19462   int32_t zn_bottom_inputs[] =
19463       {-2, -4, -6, -8, INT32_MAX, INT32_MIN, INT32_MIN};
19464 
19465   int32_t zm_top_inputs[] = {1, 3, 5, 7, INT32_MAX, INT32_MAX, INT32_MIN};
19466 
19467   int64_t sqdmlalbt_expected[] = {2,
19468                                   -19,
19469                                   -56,
19470                                   -109,
19471                                   static_cast<int64_t>(0x7ffffffe00000004),
19472                                   RawbitsToInt64(0x8000000100000001),
19473                                   INT64_MAX};
19474 
19475   int64_t sqdmlslbt_expected[] = {-2,
19476                                   19,
19477                                   56,
19478                                   109,
19479                                   RawbitsToInt64(0x80000001fffffffc),
19480                                   static_cast<int64_t>(0x7ffffffeffffffff),
19481                                   RawbitsToInt64(0x8000000000000001)};
19482 
19483   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19484   START();
19485 
19486   InsrHelper(&masm, z31.VnS(), zn_bottom_inputs);
19487   InsrHelper(&masm, z30.VnS(), zm_top_inputs);
19488 
19489   __ Dup(z29.VnD(), 0);
19490   __ Zip1(z31.VnS(), z31.VnS(), z29.VnS());
19491   __ Zip1(z30.VnS(), z29.VnS(), z30.VnS());
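  // After zipping with zero, the even (bottom) lanes of z31 hold the zn
  // values and the odd (top) lanes of z30 hold the zm values. Sqdmlalbt
  // multiplies bottom elements by top elements, doubles, widens and
  // saturates, then accumulates into the destination.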
19492 
19493   // Initialise inputs for za.
19494   __ Index(z1.VnD(), 0, 1);
19495   __ Index(z2.VnD(), 0, -1);
19496 
19497   __ Sqdmlalbt(z1.VnD(), z1.VnD(), z31.VnS(), z30.VnS());
19498   __ Sqdmlslbt(z2.VnD(), z2.VnD(), z31.VnS(), z30.VnS());
19499 
19500   END();
19501 
19502   if (CAN_RUN()) {
19503     RUN();
19504 
19505     ASSERT_EQUAL_SVE(sqdmlalbt_expected, z1.VnD());
19506     ASSERT_EQUAL_SVE(sqdmlslbt_expected, z2.VnD());
19507   }
19508 }
19509 
19510 TEST_SVE(sve2_floating_point_multiply_add_long_vector) {
19511   uint16_t zn_inputs[] = {Float16ToRawbits(Float16(1000)),
19512                           Float16ToRawbits(Float16(2000)),
19513                           Float16ToRawbits(Float16(0.5)),
19514                           Float16ToRawbits(Float16(-0.5)),
19515                           Float16ToRawbits(Float16(14)),
19516                           Float16ToRawbits(Float16(-14)),
19517                           Float16ToRawbits(kFP16PositiveInfinity),
19518                           Float16ToRawbits(kFP16NegativeInfinity)};
19519 
19520   uint16_t zm_inputs[] = {Float16ToRawbits(Float16(10)),
19521                           Float16ToRawbits(Float16(-10)),
19522                           Float16ToRawbits(Float16(10)),
19523                           Float16ToRawbits(Float16(-10)),
19524                           Float16ToRawbits(Float16(10)),
19525                           Float16ToRawbits(Float16(-10)),
19526                           Float16ToRawbits(Float16(10)),
19527                           Float16ToRawbits(Float16(-10))};
19528 
19529   uint32_t za_inputs[] = {FloatToRawbits(1.0f),
19530                           FloatToRawbits(-1.0f),
19531                           FloatToRawbits(1.0f),
19532                           FloatToRawbits(-1.0f)};
19533 
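  // Fmlal[b|t] multiplies the even (bottom) or odd (top) half-precision
  // elements, widening to single precision before the accumulation; e.g. the
  // bottom product 2000 * -10 accumulated onto 1.0 gives -19999. The Fmlsl
  // variants subtract the product instead.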
19534   uint32_t fmlalb_zd_expected[] = {0xc69c3e00,  // -19999
19535                                    0x40800000,  // 4
19536                                    0x430d0000,  // 141
19537                                    FloatToRawbits(kFP32PositiveInfinity)};
19538 
19539   uint32_t fmlalt_zd_expected[] = {0x461c4400,  // 10001
19540                                    0x40800000,  // 4
19541                                    0x430d0000,  // 141
19542                                    FloatToRawbits(kFP32PositiveInfinity)};
19543 
19544   uint32_t fmlslb_zd_expected[] = {0x469c4200,  // 20001
19545                                    0xc0c00000,  // -6
19546                                    0xc30b0000,  // -139
19547                                    FloatToRawbits(kFP32NegativeInfinity)};
19548 
19549   uint32_t fmlslt_zd_expected[] = {0xc61c3c00,  // -9999
19550                                    0xc0c00000,  // -6
19551                                    0xc30b0000,  // -139
19552                                    FloatToRawbits(kFP32NegativeInfinity)};
19553 
19554   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19555   START();
19556 
19557   InsrHelper(&masm, z31.VnH(), zn_inputs);
19558   InsrHelper(&masm, z30.VnH(), zm_inputs);
19559   InsrHelper(&masm, z29.VnS(), za_inputs);
19560 
19561   __ Mov(z0, z29);
19562   __ Fmlalb(z0.VnS(), z0.VnS(), z31.VnH(), z30.VnH());
19563 
19564   __ Mov(z1, z29);
19565   __ Fmlalt(z1.VnS(), z1.VnS(), z31.VnH(), z30.VnH());
19566 
19567   __ Mov(z2, z29);
19568   __ Fmlslb(z2.VnS(), z2.VnS(), z31.VnH(), z30.VnH());
19569 
19570   __ Mov(z3, z29);
19571   __ Fmlslt(z3.VnS(), z3.VnS(), z31.VnH(), z30.VnH());
19572 
19573   __ Fmlalb(z4.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19574   __ Fmlalt(z5.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19575   __ Fmlslb(z6.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19576   __ Fmlslt(z7.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19577 
19578   END();
19579 
19580   if (CAN_RUN()) {
19581     RUN();
19582 
19583     ASSERT_EQUAL_SVE(fmlalb_zd_expected, z0.VnS());
19584     ASSERT_EQUAL_SVE(fmlalt_zd_expected, z1.VnS());
19585     ASSERT_EQUAL_SVE(fmlslb_zd_expected, z2.VnS());
19586     ASSERT_EQUAL_SVE(fmlslt_zd_expected, z3.VnS());
19587 
19588     ASSERT_EQUAL_SVE(z4, z0);
19589     ASSERT_EQUAL_SVE(z5, z1);
19590     ASSERT_EQUAL_SVE(z6, z2);
19591     ASSERT_EQUAL_SVE(z7, z3);
19592   }
19593 }
19594 
19595 TEST_SVE(sve2_flogb_simple) {
19596   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19597 
19598   START();
19599   __ Ptrue(p0.VnB());
19600   __ Index(z0.VnS(), -4, 1);
19601   __ Mov(z1.VnS(), 0);
19602   __ Mov(z2.VnD(), 0x000fffffffffffff);
19603   __ Mov(z3.VnD(), 0x0010000000000000);
19604   __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
19605   __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
19606   __ Fdiv(z1.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
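  // Flogb writes the signed base-2 exponent of each element as an integer:
  // +/-4.0 -> 2, zero and NaN -> INT_MIN, infinity -> INT_MAX. The largest
  // double subnormal has exponent -1023 (0x...fc01) and the smallest normal
  // double has exponent -1022 (0x...fc02).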
19607   __ Flogb(z0.VnS(), p0.Merging(), z0.VnS());
19608   __ Flogb(z1.VnS(), p0.Merging(), z1.VnS());
19609   __ Flogb(z2.VnD(), p0.Merging(), z2.VnD());
19610   __ Flogb(z3.VnD(), p0.Merging(), z3.VnD());
19611   END();
19612 
19613   if (CAN_RUN()) {
19614     RUN();
19615     uint64_t expected_z0[] = {0x0000000200000002,
19616                               0x0000000200000002,
19617                               0x0000000100000001,
19618                               0x0000000080000000,
19619                               0x0000000000000001,
19620                               0x0000000100000002};
19621     ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
19622 
19623     uint64_t expected_z1[] = {0x7fffffff7fffffff,
19624                               0x7fffffff7fffffff,
19625                               0x7fffffff7fffffff,
19626                               0x7fffffff80000000,
19627                               0x7fffffff7fffffff,
19628                               0x7fffffff7fffffff};
19629     ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
19630 
19631     uint64_t expected_z2[] = {0xfffffffffffffc01,
19632                               0xfffffffffffffc01,
19633                               0xfffffffffffffc01,
19634                               0xfffffffffffffc01};
19635     ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
19636 
19637     uint64_t expected_z3[] = {0xfffffffffffffc02,
19638                               0xfffffffffffffc02,
19639                               0xfffffffffffffc02,
19640                               0xfffffffffffffc02};
19641     ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
19642   }
19643 }
19644 
19645 TEST_SVE(neon_matmul) {
19646   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
19647                           CPUFeatures::kSVEI8MM,
19648                           CPUFeatures::kNEON,
19649                           CPUFeatures::kI8MM);
19650 
19651   // Test Neon integer matrix multiply against SVE.
19652   START();
19653   __ Movi(v0.V2D(), 0xffeeddccbbaa9988, 0x77665544332211);
19654   __ Movi(v1.V2D(), 0xaa5555aa55555555, 0x55aaaa55aaaaaa);
19655   __ Movi(v2.V2D(), 0, 0);
19656   __ Movi(v3.V2D(), 0, 0);
19657   __ Movi(v4.V2D(), 0, 0);
19658   __ Movi(v5.V2D(), 0, 0);
19659   __ Movi(v6.V2D(), 0, 0);
19660   __ Movi(v7.V2D(), 0, 0);
19661 
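  // Each *mmla instruction treats a 128-bit segment as a 2x2 matrix of 32-bit
  // accumulators and adds the product of a 2x8 and an 8x2 matrix of byte
  // elements (signed x signed, unsigned x unsigned, or unsigned x signed
  // respectively).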
19662   __ Smmla(v2.V4S(), v0.V16B(), v1.V16B());
19663   __ Smmla(z3.VnS(), z3.VnS(), z0.VnB(), z1.VnB());
19664   __ Ummla(v4.V4S(), v0.V16B(), v1.V16B());
19665   __ Ummla(z5.VnS(), z5.VnS(), z0.VnB(), z1.VnB());
19666   __ Usmmla(v6.V4S(), v0.V16B(), v1.V16B());
19667   __ Usmmla(z7.VnS(), z7.VnS(), z0.VnB(), z1.VnB());
19668   END();
19669 
19670   if (CAN_RUN()) {
19671     RUN();
19672 
19673     // The inputs as Z registers are zero beyond the least-significant 128 bits,
19674     // so the Neon and SVE results should be equal for any VL.
19675     ASSERT_EQUAL_SVE(z3, z2);
19676     ASSERT_EQUAL_SVE(z5, z4);
19677     ASSERT_EQUAL_SVE(z7, z6);
19678   }
19679 }
19680 
19681 TEST_SVE(sudot_usdot) {
19682   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
19683                           CPUFeatures::kSVE2,
19684                           CPUFeatures::kSVEI8MM);
19685 
19686   START();
19687   __ Ptrue(p0.VnB());
19688   __ Index(z0.VnS(), -424242, 77777);
19689   __ Index(z1.VnB(), 127, -1);
19690   __ Sqabs(z1.VnB(), p0.Merging(), z1.VnB());
19691   __ Index(z2.VnB(), 0, 1);
19692   __ Sqabs(z2.VnB(), p0.Merging(), z2.VnB());
19693   __ Index(z3.VnB(), -128, 1);
19694   __ Mov(z4.VnD(), 0);
19695 
19696   // Test Usdot against Udot/Sdot over the range of inputs where they should be
19697   // equal.
19698   __ Usdot(z5.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
19699   __ Udot(z6.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
19700   __ Usdot(z7.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
19701   __ Sdot(z8.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
19702 
19703   // Construct values which, when interpreted correctly as signed/unsigned,
19704   // should give a zero result for dot product.
19705   __ Mov(z10.VnS(), 0x8101ff40);  // [-127, 1, -1, 64] as signed bytes.
19706   __ Mov(z11.VnS(), 0x02fe8002);  // [2, 254, 128, 2] as unsigned bytes.
19707   __ Usdot(z12.VnS(), z4.VnS(), z11.VnB(), z10.VnB());
19708   __ Usdot(z13.VnS(), z4.VnS(), z10.VnB(), z11.VnB());
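  // Swapping the operands makes z10 the unsigned source: each lane sums
  // 64 * 2 + 255 * (-128) + 1 * (-2) + 129 * 2 = -32256 = 0xffff8200,
  // checked against z13_expected below.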
19709 
19710   // Construct a vector with duplicated values across segments. This allows
19711   // testing indexed dot product against the already tested variant.
19712   __ Mov(z14.VnS(), 1);
19713   __ Mul(z15.VnS(), z14.VnS(), z3.VnS(), 1);
19714 
19715   __ Usdot(z16.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
19716   __ Usdot(z17.VnS(), z0.VnS(), z3.VnB(), z15.VnB());
19717   __ Sudot(z18.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
19718   __ Usdot(z19.VnS(), z0.VnS(), z15.VnB(), z3.VnB());
19719   END();
19720 
19721   if (CAN_RUN()) {
19722     RUN();
19723     ASSERT_EQUAL_SVE(z6, z5);
19724     ASSERT_EQUAL_SVE(z8, z7);
19725     ASSERT_EQUAL_SVE(z4, z12);
19726 
19727     uint64_t z13_expected[] = {0xffff8200ffff8200, 0xffff8200ffff8200};
19728     ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
19729 
19730     ASSERT_EQUAL_SVE(z17, z16);
19731     ASSERT_EQUAL_SVE(z19, z18);
19732   }
19733 }
19734 
19735 // Manually constructed simulator test to avoid creating a VL128 variant.
19736 
19737 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
19738 void Testsve_fmatmul(Test* config) {
19739   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
19740 
19741   // Only double-precision matrix multiply is tested here. Single-precision is
19742   // tested in the simulator tests using a generated sequence. The (templated)
19743   // code used in the simulator for both cases is the same, which is why the
19744   // tests here don't need to be comprehensive.
19745   START();
19746   Label vl_too_short;
19747   __ Rdvl(x0, 1);
19748   __ Cmp(x0, 32);
19749   __ B(lt, &vl_too_short);  // Skip testing VL128.
19750 
19751   __ Fdup(z0.VnD(), 1.0);
19752   __ Fdup(z1.VnD(), 2.0);
19753   __ Mov(z2.VnD(), 0);
19754 
19755   // Build 2x2 identity matrix in z3.
19756   Label iden_loop;
19757   __ Lsr(x0, x0, 5);
19758   __ Bind(&iden_loop);
19759   __ Insr(z3.VnD(), d0);
19760   __ Insr(z3.VnD(), d2);
19761   __ Insr(z3.VnD(), d2);
19762   __ Insr(z3.VnD(), d0);
19763   __ Sub(x0, x0, 1);
19764   __ Cbnz(x0, &iden_loop);
19765 
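  // Fmmla multiplies 2x2 matrices of doubles held in each 256-bit segment:
  // zda + zn * zm. The first multiply gives 2.0 + 1.0*1.0 + 1.0*1.0 = 4.0
  // everywhere; multiplying by the identity matrix in z3 should then leave
  // each segment unchanged.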
19766   __ Fmmla(z1.VnD(), z1.VnD(), z0.VnD(), z0.VnD());
19767   __ Fmmla(z2.VnD(), z2.VnD(), z1.VnD(), z3.VnD());
19768 
19769   __ Ptrue(p0.VnB());
19770   __ Index(z4.VnD(), -8, 3);
19771   __ Scvtf(z4.VnD(), p0.Merging(), z4.VnD());
19772   __ Mov(z5.VnD(), 0);
19773   __ Fmmla(z4.VnD(), z4.VnD(), z4.VnD(), z4.VnD());
19774   __ Fmmla(z5.VnD(), z5.VnD(), z4.VnD(), z3.VnD());
19775 
19776   __ Bind(&vl_too_short);
19777   END();
19778 
19779   if (CAN_RUN()) {
19780     RUN();
19781 
19782     int vl = core.GetSVELaneCount(kBRegSize) * 8;
19783     if (vl >= 256) {
19784       ASSERT_EQUAL_SVE(z1, z2);
19785       ASSERT_EQUAL_SVE(z4, z5);
19786 
19787       switch (vl) {
19788         case 256:
19789         case 384: {
19790           // All results are 4.0 (2.0 plus two products of 1.0 * 1.0). Results
19791           // for elements beyond a VL that's a multiple of 256 bits should be zero.
19792           uint64_t z1_expected[] = {0x0000000000000000,
19793                                     0x0000000000000000,
19794                                     0x4010000000000000,
19795                                     0x4010000000000000,
19796                                     0x4010000000000000,
19797                                     0x4010000000000000};
19798           ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
19799 
19800           uint64_t z4_expected[] = {0x0000000000000000,
19801                                     0x0000000000000000,
19802                                     0x4018000000000000,   // 6.0
19803                                     0x4022000000000000,   // 9.0
19804                                     0x4018000000000000,   // 6.0
19805                                     0x4054400000000000};  // 81.0
19806           ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
19807           break;
19808         }
19809         case 2048: {
19810           uint64_t z1_expected[] =
19811               {0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19812                0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19813                0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19814                0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19815                0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19816                0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19817                0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19818                0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19819                0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19820                0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
19821                0x4010000000000000, 0x4010000000000000};
19822           ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
19823 
19824           uint64_t z4_expected[] = {
19825               0x40cb690000000000, 0x40c9728000000000, 0x40c9710000000000,
19826               0x40c79e8000000000, 0x40c41f0000000000, 0x40c2708000000000,
19827               0x40c26f0000000000, 0x40c0e48000000000, 0x40bbea0000000000,
19828               0x40b91d0000000000, 0x40b91a0000000000, 0x40b6950000000000,
19829               0x40b1d60000000000, 0x40af320000000000, 0x40af2c0000000000,
19830               0x40ab420000000000, 0x40a4040000000000, 0x40a0aa0000000000,
19831               0x40a0a40000000000, 0x409bb40000000000, 0x4091b80000000000,
19832               0x408a880000000000, 0x408a700000000000, 0x4083c80000000000,
19833               0x4071a00000000000, 0x4061a00000000000, 0x4061400000000000,
19834               0x4051400000000000, 0x4018000000000000, 0x4022000000000000,
19835               0x4018000000000000, 0x4054400000000000,
19836           };
19837           ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
19838           break;
19839         }
19840         default:
19841           printf("WARNING: Some tests skipped due to unexpected VL.\n");
19842           break;
19843       }
19844     }
19845   }
19846 }
19847 Test* test_sve_fmatmul_list[] =
19848     {Test::MakeSVETest(256, "AARCH64_ASM_sve_fmatmul_vl256", &Testsve_fmatmul),
19849      Test::MakeSVETest(384, "AARCH64_ASM_sve_fmatmul_vl384", &Testsve_fmatmul),
19850      Test::MakeSVETest(2048,
19851                        "AARCH64_ASM_sve_fmatmul_vl2048",
19852                        &Testsve_fmatmul)};
19853 
19854 void Testsve_ld1ro(Test* config) {
19855   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
19856   START();
19857 
19858   int data_size = (kQRegSizeInBytes + 128) * 4;
19859   uint8_t* data = new uint8_t[data_size];
19860   for (int i = 0; i < data_size; i++) {
19861     data[i] = i & 0xff;
19862   }
19863 
19864   // Set the base to just past half-way through the buffer so we can use
19865   // negative indices.
19866   __ Mov(x0, reinterpret_cast<uintptr_t>(&data[7 + data_size / 2]));
19867 
19868   __ Index(z0.VnB(), 0, 1);
19869   __ Ptrue(p0.VnB());
19870   __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
19871   __ Pfalse(p1.VnB());
19872   __ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
19873   __ Ptrue(p2.VnB());
19874 
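  // Ld1ro* loads a 256-bit (octaword) block from memory and replicates it to
  // every 256-bit segment of the vector, so all segments must be identical;
  // this is verified by the rotate/eor/orr sequence below.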
19875   __ Mov(x1, -32);
19876   __ Ld1rob(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, -32));
19877   __ Ld1rob(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
19878 
19879   __ Mov(x1, 64 / 2);
19880   __ Ld1roh(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, 64));
19881   __ Ld1roh(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
19882 
19883   __ Mov(x1, -96 / 4);
19884   __ Ld1row(z4.VnS(), p2.Zeroing(), SVEMemOperand(x0, -96));
19885   __ Ld1row(z5.VnS(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
19886 
19887   __ Mov(x1, 128 / 8);
19888   __ Ld1rod(z6.VnD(), p2.Zeroing(), SVEMemOperand(x0, 128));
19889   __ Ld1rod(z7.VnD(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
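
  // Each pair of loads above reads the same address twice: once with an
  // immediate offset and once with an equivalent scalar-plus-scalar form, so
  // the destination pairs should hold identical values (checked after RUN).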

  // Check that all 256-bit segments match by rotating the vector by one
  // segment, eoring, and orring across the vector.
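  // If every segment is identical, the rotated copy equals the original, the
  // EOR result is all-zero, and each ORV reduction leaves zero behind.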
  __ Dup(z11.VnQ(), z0.VnQ(), 2);
  __ Mov(z8, z0);
  __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
  __ Eor(z8.VnB(), z8.VnB(), z0.VnB());
  __ Orv(b9, p2, z8.VnB());

  __ Mov(z8, z2);
  __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
  __ Eor(z8.VnB(), z8.VnB(), z2.VnB());
  __ Orv(b8, p2, z8.VnB());
  __ Orr(z9, z9, z8);

  __ Mov(z8, z4);
  __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
  __ Eor(z8.VnB(), z8.VnB(), z4.VnB());
  __ Orv(b8, p2, z8.VnB());
  __ Orr(z9, z9, z8);

  __ Mov(z8, z6);
  __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
  __ Eor(z8.VnB(), z8.VnB(), z6.VnB());
  __ Orv(b8, p2, z8.VnB());
  __ Orr(z9, z9, z8);

  END();

  if (CAN_RUN()) {
    RUN();

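    // GetSVELaneCount(kBRegSize) gives the number of byte lanes, i.e. the
    // vector length in bytes, so multiplying by eight recovers it in bits.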
    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    if (vl >= 256) {
      ASSERT_EQUAL_SVE(z0, z1);
      ASSERT_EQUAL_SVE(z2, z3);
      ASSERT_EQUAL_SVE(z4, z5);
      ASSERT_EQUAL_SVE(z6, z7);

      switch (vl) {
        case 256:
        case 2048: {
          // Check the result of the rotate/eor sequence.
          uint64_t expected_z9[] = {0, 0};
          ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
          break;
        }
        case 384: {
          // For a non-multiple-of-256 VL, the top 128 bits must be zero,
          // which breaks the rotate/eor sequence. Check the results
          // explicitly.
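          // For example: the base is data + 7 + data_size / 2 = data + 295
          // (data_size is 576), so the -32 offset starts reading the byte
          // ramp at value (295 - 32) & 0xff = 7, and p1 keeps 0x07, 0x09,
          // 0x0b and 0x0d in byte lanes 0, 2, 4 and 6, which forms the low
          // doubleword of z0_expected below.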
          uint64_t z0_expected[] = {0x0000000000000000,
                                    0x0000000000000000,
                                    0x0000000000000000,
                                    0x0000000000000000,
                                    0x0000000000000000,
                                    0x000d000b00090007};
          uint64_t z2_expected[] = {0x0000000000000000,
                                    0x0000000000000000,
                                    0x868584838281807f,
                                    0x7e7d7c7b7a797877,
                                    0x767574737271706f,
                                    0x6e6d6c6b6a696867};
          uint64_t z4_expected[] = {0x0000000000000000,
                                    0x0000000000000000,
                                    0xe6e5e4e3e2e1e0df,
                                    0xdedddcdbdad9d8d7,
                                    0xd6d5d4d3d2d1d0cf,
                                    0xcecdcccbcac9c8c7};
          uint64_t z6_expected[] = {0x0000000000000000,
                                    0x0000000000000000,
                                    0xc6c5c4c3c2c1c0bf,
                                    0xbebdbcbbbab9b8b7,
                                    0xb6b5b4b3b2b1b0af,
                                    0xaeadacabaaa9a8a7};
          ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
          ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
          ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
          ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
          break;
        }
        default:
          printf("WARNING: Some tests skipped due to unexpected VL.\n");
          break;
      }
    }
  }

  // Free the input buffer; it must stay live until after RUN(), and was
  // previously leaked.
  delete[] data;
}
Test* test_sve_ld1ro_list[] =
    {Test::MakeSVETest(256, "AARCH64_ASM_sve_ld1ro_vl256", &Testsve_ld1ro),
     Test::MakeSVETest(384, "AARCH64_ASM_sve_ld1ro_vl384", &Testsve_ld1ro),
     Test::MakeSVETest(2048, "AARCH64_ASM_sve_ld1ro_vl2048", &Testsve_ld1ro)};
#endif

}  // namespace aarch64
}  // namespace vixl