xref: /aosp_15_r20/external/mesa3d/src/nouveau/mme/tests/mme_tu104_sim_hw_test.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2022 Collabora Ltd.
3  * SPDX-License-Identifier: MIT
4  */
5 #include "mme_runner.h"
6 #include "mme_tu104_sim.h"
7 
8 #include "nv_push_clc597.h"
9 
10 class mme_tu104_sim_test : public ::testing::Test, public mme_hw_runner {
11 public:
12    mme_tu104_sim_test();
13    ~mme_tu104_sim_test();
14 
15    void SetUp();
16    void test_macro(const mme_builder *b,
17                    const std::vector<uint32_t>& macro,
18                    const std::vector<uint32_t>& params);
19 };
20 
mme_tu104_sim_test()21 mme_tu104_sim_test::mme_tu104_sim_test() :
22    ::testing::Test(),
23    mme_hw_runner()
24 { }
25 
~mme_tu104_sim_test()26 mme_tu104_sim_test::~mme_tu104_sim_test()
27 { }
28 
29 void
SetUp()30 mme_tu104_sim_test::SetUp()
31 {
32    ASSERT_TRUE(set_up_hw(TURING_A, UINT16_MAX));
33 }
34 
35 void
test_macro(const mme_builder * b,const std::vector<uint32_t> & macro,const std::vector<uint32_t> & params)36 mme_tu104_sim_test::test_macro(const mme_builder *b,
37                                const std::vector<uint32_t>& macro,
38                                const std::vector<uint32_t>& params)
39 {
40    const uint32_t data_dwords = DATA_BO_SIZE / sizeof(uint32_t);
41 
42    std::vector<mme_tu104_inst> insts(macro.size() / 3);
43    mme_tu104_decode(&insts[0], &macro[0], macro.size() / 3);
44 
45    /* First, make a copy of the data and simulate the macro */
46    std::vector<uint32_t> sim_data(data, data + (DATA_BO_SIZE / 4));
47    mme_tu104_sim_mem sim_mem = {
48       .addr = data_addr,
49       .data = &sim_data[0],
50       .size = DATA_BO_SIZE,
51    };
52    mme_tu104_sim(insts.size(), &insts[0],
53                  params.size(), params.size() ? &params[0] : NULL,
54                  1, &sim_mem);
55 
56    /* Now run the macro on the GPU */
57    push_macro(0, macro);
58 
59    P_1INC(p, NVC597, CALL_MME_MACRO(0));
60    if (params.empty()) {
61       P_NVC597_CALL_MME_MACRO(p, 0, 0);
62    } else {
63       P_INLINE_ARRAY(p, &params[0], params.size());
64    }
65 
66    submit_push();
67 
68    /* Check the results */
69    for (uint32_t i = 0; i < data_dwords; i++)
70       ASSERT_EQ(data[i], sim_data[i]);
71 }
72 
73 static mme_tu104_reg
mme_value_as_reg(mme_value val)74 mme_value_as_reg(mme_value val)
75 {
76    assert(val.type == MME_VALUE_TYPE_REG);
77    return (mme_tu104_reg)(MME_TU104_REG_R0 + val.reg);
78 }
79 
/* Smallest end-to-end check: a macro that stores one immediate at the start
 * of the data buffer, run with no parameters.
 */
TEST_F(mme_tu104_sim_test, sanity)
{
   const uint32_t canary = 0xc0ffee01;

   mme_builder b;
   mme_builder_init(&b, devinfo);
   mme_store_imm_addr(&b, data_addr, mme_imm(canary));

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> no_params;

   test_macro(&b, macro, no_params);
}
94 
/* Uses LOAD0 and LOAD1 as both ALU sources and output emit operands within a
 * single instruction, then stores the two registers and the two scratch
 * states that were written.
 */
TEST_F(mme_tu104_sim_test, multi_param)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value from_load1 = mme_alloc_reg(&b);
   mme_value from_load0 = mme_alloc_reg(&b);

   mme_tu104_asm(&b, inst) {
      /* ALU 0 reads LOAD1 and ALU 1 reads LOAD0 (deliberately crossed). */
      inst.alu[0].dst = mme_value_as_reg(from_load1);
      inst.alu[0].src[0] = MME_TU104_REG_LOAD1;
      inst.alu[1].dst = mme_value_as_reg(from_load0);
      inst.alu[1].src[0] = MME_TU104_REG_LOAD0;

      /* NOTE(review): bit 12 is presumably a method increment of one --
       * confirm against the MME method encoding.
       */
      inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(12) >> 2);
      inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
      inst.out[0].emit = MME_TU104_OUT_OP_LOAD0;

      inst.imm[1] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(35) >> 2);
      inst.out[1].mthd = MME_TU104_OUT_OP_IMM1;
      inst.out[1].emit = MME_TU104_OUT_OP_LOAD1;
   }

   mme_value scratch12 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(12));
   mme_value scratch35 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(35));

   mme_store_imm_addr(&b, data_addr + 0, from_load1);
   mme_store_imm_addr(&b, data_addr + 4, from_load0);
   mme_store_imm_addr(&b, data_addr + 8, scratch12);
   mme_store_imm_addr(&b, data_addr + 12, scratch35);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = { 2581, 3048 };

   test_macro(&b, macro, params);
}
132 
/* Places an instruction with pred_mode TTTT that reads LOAD0 between two
 * ordinary mme_load() calls, and runs the macro with four parameter sets in
 * which the first two parameters are each either zero or non-zero.
 */
TEST_F(mme_tu104_sim_test, pred_param)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value guarded = mme_mov(&b, mme_imm(240));

   mme_tu104_asm(&b, inst) {
      inst.pred_mode = MME_TU104_PRED_TTTT;
      inst.alu[0].dst = mme_value_as_reg(guarded);
      inst.alu[0].src[0] = MME_TU104_REG_LOAD0;
   }

   mme_value last = mme_load(&b);

   mme_store_imm_addr(&b, data_addr + 0, first);
   mme_store_imm_addr(&b, data_addr + 4, guarded);
   mme_store_imm_addr(&b, data_addr + 8, last);

   const auto macro = mme_builder_finish_vec(&b);

   for (uint32_t combo = 0; combo < 4; combo++) {
      reset_push();

      /* Bit 0 of combo drives the first parameter, bit 1 the second. */
      const std::vector<uint32_t> params = {
         (combo & 1) * 2043,
         (combo & 2) * 523,
         2581,
         3048,
      };

      test_macro(&b, macro, params);
   }
}
167 
/* Emits imm[0] through output 0 using MME_TU104_OUT_OP_IMM0, as the third
 * dword of two SET_REPORT_SEMAPHORE_A sequences that target data_addr + 0
 * and data_addr + 4.
 */
TEST_F(mme_tu104_sim_test, out_imm0)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Starts a sequence at SET_REPORT_SEMAPHORE_A and emits the address. */
   const auto begin_report = [&b](auto addr) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(addr)));
      mme_emit(&b, mme_imm(low32(addr)));
   };

   begin_report(data_addr + 0);
   mme_tu104_asm(&b, inst) {
      inst.imm[0] = 0x1234;
      inst.out[0].emit = MME_TU104_OUT_OP_IMM0;
   }
   mme_emit(&b, mme_imm(0x10000000));

   begin_report(data_addr + 4);
   mme_tu104_asm(&b, inst) {
      inst.imm[0] = 0x8765;
      inst.out[0].emit = MME_TU104_OUT_OP_IMM0;
   }
   mme_emit(&b, mme_imm(0x10000000));

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> no_params;

   test_macro(&b, macro, no_params);
}
196 
/* Emits imm[1] through output 0 using MME_TU104_OUT_OP_IMM1, as the third
 * dword of two SET_REPORT_SEMAPHORE_A sequences that target data_addr + 0
 * and data_addr + 4.
 */
TEST_F(mme_tu104_sim_test, out_imm1)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Starts a sequence at SET_REPORT_SEMAPHORE_A and emits the address. */
   const auto begin_report = [&b](auto addr) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(addr)));
      mme_emit(&b, mme_imm(low32(addr)));
   };

   begin_report(data_addr + 0);
   mme_tu104_asm(&b, inst) {
      inst.imm[1] = 0x1234;
      inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
   }
   mme_emit(&b, mme_imm(0x10000000));

   begin_report(data_addr + 4);
   mme_tu104_asm(&b, inst) {
      inst.imm[1] = 0x8765;
      inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
   }
   mme_emit(&b, mme_imm(0x10000000));

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> no_params;

   test_macro(&b, macro, no_params);
}
225 
/* Emits imm[0] using MME_TU104_OUT_OP_IMMHIGH0, first through output 0 and
 * then through output 1, as the third dword of two SET_REPORT_SEMAPHORE_A
 * sequences that target data_addr + 0 and data_addr + 4.
 */
TEST_F(mme_tu104_sim_test, out_immhigh0)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Starts a sequence at SET_REPORT_SEMAPHORE_A and emits the address. */
   const auto begin_report = [&b](auto addr) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(addr)));
      mme_emit(&b, mme_imm(low32(addr)));
   };

   begin_report(data_addr + 0);
   mme_tu104_asm(&b, inst) {
      inst.imm[0] = 0x1234;
      inst.out[0].emit = MME_TU104_OUT_OP_IMMHIGH0;
   }
   mme_emit(&b, mme_imm(0x10000000));

   begin_report(data_addr + 4);
   mme_tu104_asm(&b, inst) {
      inst.imm[0] = 0x8765;
      /* The second sequence goes through output 1 rather than output 0. */
      inst.out[1].emit = MME_TU104_OUT_OP_IMMHIGH0;
   }
   mme_emit(&b, mme_imm(0x10000000));

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> no_params;

   test_macro(&b, macro, no_params);
}
254 
/* Emits imm[1] using MME_TU104_OUT_OP_IMMHIGH1, first through output 0 and
 * then through output 1, as the third dword of two SET_REPORT_SEMAPHORE_A
 * sequences that target data_addr + 0 and data_addr + 4.
 */
TEST_F(mme_tu104_sim_test, out_immhigh1)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Starts a sequence at SET_REPORT_SEMAPHORE_A and emits the address. */
   const auto begin_report = [&b](auto addr) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(addr)));
      mme_emit(&b, mme_imm(low32(addr)));
   };

   begin_report(data_addr + 0);
   mme_tu104_asm(&b, inst) {
      inst.imm[1] = 0x1234;
      inst.out[0].emit = MME_TU104_OUT_OP_IMMHIGH1;
   }
   mme_emit(&b, mme_imm(0x10000000));

   begin_report(data_addr + 4);
   mme_tu104_asm(&b, inst) {
      inst.imm[1] = 0x8765;
      /* The second sequence goes through output 1 rather than output 0. */
      inst.out[1].emit = MME_TU104_OUT_OP_IMMHIGH1;
   }
   mme_emit(&b, mme_imm(0x10000000));

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> no_params;

   test_macro(&b, macro, no_params);
}
283 
/* Emits with MME_TU104_OUT_OP_IMM32 while both imm[0] and imm[1] are set,
 * first through output 0 and then through output 1, as the third dword of
 * two SET_REPORT_SEMAPHORE_A sequences that target data_addr + 0 and
 * data_addr + 4.
 */
TEST_F(mme_tu104_sim_test, out_imm32)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Starts a sequence at SET_REPORT_SEMAPHORE_A and emits the address. */
   const auto begin_report = [&b](auto addr) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(addr)));
      mme_emit(&b, mme_imm(low32(addr)));
   };

   begin_report(data_addr + 0);
   mme_tu104_asm(&b, inst) {
      inst.imm[0] = 0x1234;
      inst.imm[1] = 0x7654;
      inst.out[0].emit = MME_TU104_OUT_OP_IMM32;
   }
   mme_emit(&b, mme_imm(0x10000000));

   begin_report(data_addr + 4);
   mme_tu104_asm(&b, inst) {
      inst.imm[0] = 0x1234;
      inst.imm[1] = 0x7654;
      /* The second sequence goes through output 1 rather than output 0. */
      inst.out[1].emit = MME_TU104_OUT_OP_IMM32;
   }
   mme_emit(&b, mme_imm(0x10000000));

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> no_params;

   test_macro(&b, macro, no_params);
}
314 
/* Feeds MME_TU104_REG_IMM32 to an ADD as its first source, with the canary
 * split across imm[0] (low 16 bits) and imm[1] (high 16 bits), and stores
 * the destination register.
 */
TEST_F(mme_tu104_sim_test, reg_imm32)
{
   const uint32_t canary = 0xc0ffee01;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value v = mme_alloc_reg(&b);

   mme_tu104_asm(&b, i) {
      i.alu[0].dst = mme_value_as_reg(v);
      i.alu[0].op = MME_TU104_ALU_OP_ADD;
      /* This statement used to end in ',' and only worked because the comma
       * operator chained it to the next assignment.
       */
      i.alu[0].src[0] = MME_TU104_REG_IMM32;
      i.imm[0] = (uint16_t)canary;
      i.imm[1] = (uint16_t)(canary >> 16);
   }

   mme_store_imm_addr(&b, data_addr, v);

   auto macro = mme_builder_finish_vec(&b);

   std::vector<uint32_t> params;
   test_macro(&b, macro, params);
}
339 
/* For each listed predicate mode, builds a macro with one predicated
 * instruction whose two ALUs overwrite pre-initialised registers with
 * immediates, and runs it with the predicate parameter first zero and then
 * non-zero.
 */
TEST_F(mme_tu104_sim_test, pred_alu)
{
   static const mme_tu104_pred pred_modes[] = {
      MME_TU104_PRED_UUUU,
      MME_TU104_PRED_TTTT,
      MME_TU104_PRED_FFFF,
      MME_TU104_PRED_TTUU,
      MME_TU104_PRED_FFUU,
      MME_TU104_PRED_TFUU,
      MME_TU104_PRED_TUUU,
      MME_TU104_PRED_FUUU,
      MME_TU104_PRED_UUTT,
      MME_TU104_PRED_UUTF,
      MME_TU104_PRED_UUTU,
      MME_TU104_PRED_UUFT,
      MME_TU104_PRED_UUFF,
      MME_TU104_PRED_UUFU,
      MME_TU104_PRED_UUUT,
      MME_TU104_PRED_UUUF,
   };

   for (uint32_t m = 0; m < ARRAY_SIZE(pred_modes); m++) {
      mme_builder b;
      mme_builder_init(&b, devinfo);

      mme_value pred = mme_load(&b);

      /* Initial values differ from the immediates written below, so the
       * stored result shows whether each ALU write happened.
       */
      mme_value dst0 = mme_mov(&b, mme_imm(m * 100 + 13));
      mme_value dst1 = mme_mov(&b, mme_imm(m * 100 + 62));

      mme_tu104_asm(&b, inst) {
         inst.pred = mme_value_as_reg(pred);
         inst.pred_mode = pred_modes[m];

         inst.alu[0].dst = mme_value_as_reg(dst0);
         inst.alu[0].src[0] = MME_TU104_REG_IMM;
         inst.imm[0] = m * 100 + 25;

         inst.alu[1].dst = mme_value_as_reg(dst1);
         inst.alu[1].src[0] = MME_TU104_REG_IMM;
         inst.imm[1] = m * 100 + 73;
      }

      /* Each mode gets its own pair of dwords in the data buffer. */
      mme_store_imm_addr(&b, data_addr + m * 8 + 0, dst0);
      mme_store_imm_addr(&b, data_addr + m * 8 + 4, dst1);

      const auto macro = mme_builder_finish_vec(&b);

      for (uint32_t run = 0; run < 2; run++) {
         reset_push();

         const std::vector<uint32_t> params = { run * 25894 };

         test_macro(&b, macro, params);
      }
   }
}
395 
TEST_F(mme_tu104_sim_test,pred_out)396 TEST_F(mme_tu104_sim_test, pred_out)
397 {
398    static const enum mme_tu104_pred preds[] = {
399       MME_TU104_PRED_UUUU,
400       MME_TU104_PRED_TTTT,
401       MME_TU104_PRED_FFFF,
402       MME_TU104_PRED_TTUU,
403       MME_TU104_PRED_FFUU,
404       MME_TU104_PRED_TFUU,
405       MME_TU104_PRED_TUUU,
406       MME_TU104_PRED_FUUU,
407       MME_TU104_PRED_UUTT,
408       MME_TU104_PRED_UUTF,
409       MME_TU104_PRED_UUTU,
410       MME_TU104_PRED_UUFT,
411       MME_TU104_PRED_UUFF,
412       MME_TU104_PRED_UUFU,
413       MME_TU104_PRED_UUUT,
414       MME_TU104_PRED_UUUF,
415    };
416 
417    for (uint32_t i = 0; i < ARRAY_SIZE(preds); i++) {
418       mme_builder b;
419       mme_builder_init(&b, devinfo);
420 
421       mme_value pred = mme_load(&b);
422 
423       mme_tu104_asm(&b, inst) {
424          inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 0) >> 2);
425          inst.imm[1] = i * 100 + 25;
426          inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
427          inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
428       }
429 
430       mme_tu104_asm(&b, inst) {
431          inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 1) >> 2);
432          inst.imm[1] = i * 100 + 75;
433          inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
434          inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
435       }
436 
437       mme_tu104_asm(&b, inst) {
438          inst.pred = mme_value_as_reg(pred);
439          inst.pred_mode = preds[i];
440          inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 0) >> 2);
441          inst.imm[1] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 1) >> 2);
442          inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
443          inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
444          inst.out[1].mthd = MME_TU104_OUT_OP_IMM1;
445          inst.out[1].emit = MME_TU104_OUT_OP_IMM0;
446       }
447 
448       mme_value v0 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 0));
449       mme_value v1 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 1));
450 
451       mme_store_imm_addr(&b, data_addr + i * 8 + 0, v0);
452       mme_store_imm_addr(&b, data_addr + i * 8 + 4, v1);
453 
454       auto macro = mme_builder_finish_vec(&b);
455 
456       for (uint32_t j = 0; j < 2; j++) {
457          reset_push();
458 
459          std::vector<uint32_t> params;
460          params.push_back(j * 25894);
461 
462          test_macro(&b, macro, params);
463       }
464    }
465 }
466 
/* mme_add() of two loaded parameters, stored at the start of the data
 * buffer.
 */
TEST_F(mme_tu104_sim_test, add)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_value total = mme_add(&b, lhs, rhs);
   mme_store_imm_addr(&b, data_addr, total);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = { 25, 138 };

   test_macro(&b, macro, params);
}
485 
/* mme_add() with an immediate in either operand position (and against
 * mme_zero()), using the immediates 1, 0xffffffff and 0xffff8000, run over
 * several values of the loaded parameter.
 */
TEST_F(mme_tu104_sim_test, add_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Elements are built in order; element k is stored at byte offset 4*k. */
   const mme_value sums[] = {
      mme_add(&b, x, mme_imm(0x00000001)),
      mme_add(&b, x, mme_imm(0xffffffff)),
      mme_add(&b, x, mme_imm(0xffff8000)),
      mme_add(&b, mme_imm(0x00000001), x),
      mme_add(&b, mme_imm(0xffffffff), x),
      mme_add(&b, mme_imm(0xffff8000), x),
      mme_add(&b, mme_zero(), mme_imm(0x00000001)),
      mme_add(&b, mme_zero(), mme_imm(0xffffffff)),
      mme_add(&b, mme_zero(), mme_imm(0xffff8000)),
   };

   for (uint32_t k = 0; k < ARRAY_SIZE(sums); k++)
      mme_store_imm_addr(&b, data_addr + k * 4, sums[k]);

   const auto macro = mme_builder_finish_vec(&b);

   const uint32_t inputs[] = {
      0x0000ffff,
      0x00008000,
      0x0001ffff,
      0xffffffff,
   };

   for (const uint32_t input : inputs) {
      reset_push();

      const std::vector<uint32_t> params(1, input);

      test_macro(&b, macro, params);
   }
}
531 
/* mme_add64() of two 64-bit values, each loaded as two parameters; both
 * halves of the result are stored.
 */
TEST_F(mme_tu104_sim_test, addc)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Braced initialisers evaluate left to right, so the loads consume the
    * parameters in order.
    */
   mme_value64 lhs = { mme_load(&b), mme_load(&b) };
   mme_value64 rhs = { mme_load(&b), mme_load(&b) };

   mme_value64 total = mme_add64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, total.lo);
   mme_store_imm_addr(&b, data_addr + 4, total.hi);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = {
      0x80008650, 0x596,
      0x8000a8f6, 0x836,
   };

   test_macro(&b, macro, params);
}
555 
TEST_F(mme_tu104_sim_test,addc_imm)556 TEST_F(mme_tu104_sim_test, addc_imm)
557 {
558    mme_builder b;
559    mme_builder_init(&b, devinfo);
560 
561    mme_value x_lo = mme_load(&b);
562    mme_value x_hi = mme_load(&b);
563 
564    mme_value v1_lo = mme_alloc_reg(&b);
565    mme_value v1_hi = mme_alloc_reg(&b);
566    mme_tu104_asm(&b, i) {
567       i.alu[0].dst = mme_value_as_reg(v1_lo);
568       i.alu[0].op = MME_TU104_ALU_OP_ADD;
569       i.alu[0].src[0] = mme_value_as_reg(x_lo);
570       i.alu[0].src[1] = MME_TU104_REG_IMM;
571       i.imm[0] = 0x0001;
572       i.alu[1].dst = mme_value_as_reg(v1_hi);
573       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
574       i.alu[1].src[0] = mme_value_as_reg(x_hi);
575       i.alu[1].src[1] = MME_TU104_REG_IMM;
576       i.imm[1] = 0x0000;
577    }
578 
579    mme_value v2_lo = mme_alloc_reg(&b);
580    mme_value v2_hi = mme_alloc_reg(&b);
581    mme_tu104_asm(&b, i) {
582       i.alu[0].dst = mme_value_as_reg(v2_lo);
583       i.alu[0].op = MME_TU104_ALU_OP_ADD;
584       i.alu[0].src[0] = mme_value_as_reg(x_lo);
585       i.alu[0].src[1] = MME_TU104_REG_IMM;
586       i.imm[0] = 0x0000;
587       i.alu[1].dst = mme_value_as_reg(v2_hi);
588       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
589       i.alu[1].src[0] = mme_value_as_reg(x_hi);
590       i.alu[1].src[1] = MME_TU104_REG_IMM;
591       i.imm[1] = 0x0001;
592    }
593 
594    mme_value v3_lo = mme_alloc_reg(&b);
595    mme_value v3_hi = mme_alloc_reg(&b);
596    mme_tu104_asm(&b, i) {
597       i.alu[0].dst = mme_value_as_reg(v3_lo);
598       i.alu[0].op = MME_TU104_ALU_OP_ADD;
599       i.alu[0].src[0] = mme_value_as_reg(x_lo);
600       i.alu[0].src[1] = MME_TU104_REG_IMM;
601       i.imm[0] = 0x0000;
602       i.alu[1].dst = mme_value_as_reg(v3_hi);
603       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
604       i.alu[1].src[0] = mme_value_as_reg(x_hi);
605       i.alu[1].src[1] = MME_TU104_REG_IMM;
606       i.imm[1] = 0xffff;
607    }
608 
609    mme_value v4_lo = mme_alloc_reg(&b);
610    mme_value v4_hi = mme_alloc_reg(&b);
611    mme_tu104_asm(&b, i) {
612       i.alu[0].dst = mme_value_as_reg(v4_lo);
613       i.alu[0].op = MME_TU104_ALU_OP_ADD;
614       i.alu[0].src[0] = mme_value_as_reg(x_lo);
615       i.alu[0].src[1] = MME_TU104_REG_IMM;
616       i.imm[0] = 0x0000;
617       i.alu[1].dst = mme_value_as_reg(v4_hi);
618       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
619       i.alu[1].src[0] = mme_value_as_reg(x_hi);
620       i.alu[1].src[1] = MME_TU104_REG_IMM;
621       i.imm[1] = 0x8000;
622    }
623 
624    mme_store_imm_addr(&b, data_addr + 0,  v1_lo);
625    mme_store_imm_addr(&b, data_addr + 4,  v1_hi);
626    mme_store_imm_addr(&b, data_addr + 8,  v2_lo);
627    mme_store_imm_addr(&b, data_addr + 12, v2_hi);
628    mme_store_imm_addr(&b, data_addr + 16, v3_lo);
629    mme_store_imm_addr(&b, data_addr + 20, v3_hi);
630    mme_store_imm_addr(&b, data_addr + 24, v4_lo);
631    mme_store_imm_addr(&b, data_addr + 28, v4_hi);
632 
633    auto macro = mme_builder_finish_vec(&b);
634 
635    uint64_t vals[] = {
636       0x0000ffffffffffffull,
637       0x0000ffffffff8000ull,
638       0x0000ffff00000000ull,
639       0x0000800000000000ull,
640       0x00008000ffffffffull,
641       0x0001ffff00000000ull,
642       0xffffffff00000000ull,
643       0xffffffffffffffffull,
644    };
645 
646    for (uint32_t i = 0; i < ARRAY_SIZE(vals); i++) {
647       reset_push();
648 
649       std::vector<uint32_t> params;
650       params.push_back(low32(vals[i]));
651       params.push_back(high32(vals[i]));
652 
653       test_macro(&b, macro, params);
654    }
655 }
656 
/* mme_sub() of two loaded parameters; the first is smaller than the second
 * (25 and 138).
 */
TEST_F(mme_tu104_sim_test, sub)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value minuend = mme_load(&b);
   mme_value subtrahend = mme_load(&b);
   mme_value difference = mme_sub(&b, minuend, subtrahend);
   mme_store_imm_addr(&b, data_addr, difference);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = { 25, 138 };

   test_macro(&b, macro, params);
}
675 
/* mme_sub64() of two 64-bit values, each loaded as two parameters; both
 * halves of the result are stored.
 */
TEST_F(mme_tu104_sim_test, subb)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Braced initialisers evaluate left to right, so the loads consume the
    * parameters in order.
    */
   mme_value64 lhs = { mme_load(&b), mme_load(&b) };
   mme_value64 rhs = { mme_load(&b), mme_load(&b) };

   mme_value64 difference = mme_sub64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, difference.lo);
   mme_store_imm_addr(&b, data_addr + 4, difference.hi);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = {
      0x80008650, 0x596,
      0x8000a8f6, 0x836,
   };

   test_macro(&b, macro, params);
}
699 
/* mme_mul() of two loaded parameters, stored at the start of the data
 * buffer.
 */
TEST_F(mme_tu104_sim_test, mul)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_value product = mme_mul(&b, lhs, rhs);
   mme_store_imm_addr(&b, data_addr, product);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = { 25, 138 };

   test_macro(&b, macro, params);
}
718 
/* mme_mul() with an immediate in either operand position, using the
 * immediates 1, 0xffffffff and 0xffff8000, run over positive and negative
 * values of the loaded parameter.
 */
TEST_F(mme_tu104_sim_test, mul_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Elements are built in order; element k is stored at byte offset 4*k. */
   const mme_value products[] = {
      mme_mul(&b, x, mme_imm(0x00000001)),
      mme_mul(&b, x, mme_imm(0xffffffff)),
      mme_mul(&b, x, mme_imm(0xffff8000)),
      mme_mul(&b, mme_imm(0x00000001), x),
      mme_mul(&b, mme_imm(0xffffffff), x),
      mme_mul(&b, mme_imm(0xffff8000), x),
   };

   for (uint32_t k = 0; k < ARRAY_SIZE(products); k++)
      mme_store_imm_addr(&b, data_addr + k * 4, products[k]);

   const auto macro = mme_builder_finish_vec(&b);

   const int32_t inputs[] = { 1, -5, -1, 5 };

   for (const int32_t input : inputs) {
      reset_push();

      /* push_back converts the signed input to uint32_t implicitly. */
      std::vector<uint32_t> params;
      params.push_back(input);

      test_macro(&b, macro, params);
   }
}
753 
/* mme_imul_32x32_64() of two loaded parameters; both halves of the 64-bit
 * result are stored.
 */
TEST_F(mme_tu104_sim_test, mul_mulh)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);

   mme_value64 product = mme_imul_32x32_64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, product.lo);
   mme_store_imm_addr(&b, data_addr + 4, product.hi);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = { 0x80008650, 0x596 };

   test_macro(&b, macro, params);
}
775 
776 static inline struct mme_value
mme_mulu(struct mme_builder * b,struct mme_value x,struct mme_value y)777 mme_mulu(struct mme_builder *b, struct mme_value x, struct mme_value y)
778 {
779    return mme_alu(b, MME_ALU_OP_MULU, x, y);
780 }
781 
/* mme_mulu() with an immediate in either operand position, using the
 * immediates 1, 0xffffffff and 0xffff8000, run over positive and negative
 * values of the loaded parameter.
 */
TEST_F(mme_tu104_sim_test, mulu_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Elements are built in order; element k is stored at byte offset 4*k. */
   const mme_value products[] = {
      mme_mulu(&b, x, mme_imm(0x00000001)),
      mme_mulu(&b, x, mme_imm(0xffffffff)),
      mme_mulu(&b, x, mme_imm(0xffff8000)),
      mme_mulu(&b, mme_imm(0x00000001), x),
      mme_mulu(&b, mme_imm(0xffffffff), x),
      mme_mulu(&b, mme_imm(0xffff8000), x),
   };

   for (uint32_t k = 0; k < ARRAY_SIZE(products); k++)
      mme_store_imm_addr(&b, data_addr + k * 4, products[k]);

   const auto macro = mme_builder_finish_vec(&b);

   const int32_t inputs[] = { 1, -5, -1, 5 };

   for (const int32_t input : inputs) {
      reset_push();

      /* push_back converts the signed input to uint32_t implicitly. */
      std::vector<uint32_t> params;
      params.push_back(input);

      test_macro(&b, macro, params);
   }
}
816 
/* mme_umul_32x32_64() of two loaded parameters; both halves of the 64-bit
 * result are stored.
 */
TEST_F(mme_tu104_sim_test, mulu_mulh)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);

   mme_value64 product = mme_umul_32x32_64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, product.lo);
   mme_store_imm_addr(&b, data_addr + 4, product.hi);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = { 0x80008650, 0x596 };

   test_macro(&b, macro, params);
}
838 
/* mme_clz() of a single loaded parameter, stored at the start of the data
 * buffer.
 */
TEST_F(mme_tu104_sim_test, clz)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value input = mme_load(&b);
   mme_value leading_zeros = mme_clz(&b, input);
   mme_store_imm_addr(&b, data_addr, leading_zeros);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = { 0x00406fe0 };

   test_macro(&b, macro, params);
}
854 
855 #define SHIFT_TEST(op)                                               \
856 TEST_F(mme_tu104_sim_test, op)                                       \
857 {                                                                    \
858    mme_builder b;                                                    \
859    mme_builder_init(&b, devinfo);                                 \
860                                                                      \
861    mme_value val = mme_load(&b);                                     \
862    mme_value shift1 = mme_load(&b);                                  \
863    mme_value shift2 = mme_load(&b);                                  \
864    mme_store_imm_addr(&b, data_addr + 0, mme_##op(&b, val, shift1)); \
865    mme_store_imm_addr(&b, data_addr + 4, mme_##op(&b, val, shift2)); \
866                                                                      \
867    auto macro = mme_builder_finish_vec(&b);                          \
868                                                                      \
869    std::vector<uint32_t> params;                                     \
870    params.push_back(0x0c406fe0);                                     \
871    params.push_back(5);                                              \
872    params.push_back(51);                                             \
873                                                                      \
874    test_macro(&b, macro, params);                                    \
875 }
876 
877 SHIFT_TEST(sll)
SHIFT_TEST(srl)878 SHIFT_TEST(srl)
879 SHIFT_TEST(sra)
880 
881 #undef SHIFT_TEST
882 
883 TEST_F(mme_tu104_sim_test, bfe)
884 {
885    const uint32_t canary = 0xc0ffee01;
886 
887    mme_builder b;
888    mme_builder_init(&b, devinfo);
889 
890    mme_value val = mme_load(&b);
891    mme_value pos = mme_load(&b);
892 
893    mme_store_imm_addr(&b, data_addr + 0, mme_bfe(&b, val, pos, 1), true);
894    mme_store_imm_addr(&b, data_addr + 4, mme_bfe(&b, val, pos, 2), true);
895    mme_store_imm_addr(&b, data_addr + 8, mme_bfe(&b, val, pos, 5), true);
896 
897    auto macro = mme_builder_finish_vec(&b);
898 
899    for (unsigned i = 0; i < 31; i++) {
900       std::vector<uint32_t> params;
901       params.push_back(canary);
902       params.push_back(i);
903 
904       test_macro(&b, macro, params);
905 
906       ASSERT_EQ(data[0], (canary >> i) & 0x1);
907       ASSERT_EQ(data[1], (canary >> i) & 0x3);
908       ASSERT_EQ(data[2], (canary >> i) & 0x1f);
909    }
910 }
911 
/* mme_not() of a single loaded parameter, stored at the start of the data
 * buffer.
 */
TEST_F(mme_tu104_sim_test, not)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value input = mme_load(&b);
   mme_value inverted = mme_not(&b, input);
   mme_store_imm_addr(&b, data_addr + 0, inverted);

   const auto macro = mme_builder_finish_vec(&b);
   const std::vector<uint32_t> params = { 0x0c406fe0 };

   test_macro(&b, macro, params);
}
928 
929 #define BITOP_TEST(op)                                               \
930 TEST_F(mme_tu104_sim_test, op)                                       \
931 {                                                                    \
932    mme_builder b;                                                    \
933    mme_builder_init(&b, devinfo);                                 \
934                                                                      \
935    mme_value x = mme_load(&b);                                       \
936    mme_value y = mme_load(&b);                                       \
937    mme_value v1 = mme_##op(&b, x, y);                                \
938    mme_value v2 = mme_##op(&b, x, mme_imm(0xffff8000));              \
939    mme_value v3 = mme_##op(&b, x, mme_imm(0xffffffff));              \
940    mme_store_imm_addr(&b, data_addr + 0, v1);                        \
941    mme_store_imm_addr(&b, data_addr + 4, v2);                        \
942    mme_store_imm_addr(&b, data_addr + 8, v3);                        \
943                                                                      \
944    auto macro = mme_builder_finish_vec(&b);                          \
945                                                                      \
946    std::vector<uint32_t> params;                                     \
947    params.push_back(0x0c406fe0);                                     \
948    params.push_back(0x00fff0c0);                                     \
949                                                                      \
950    test_macro(&b, macro, params);                                    \
951 }
952 
953 BITOP_TEST(and)
BITOP_TEST(and_not)954 BITOP_TEST(and_not)
955 BITOP_TEST(nand)
956 BITOP_TEST(or)
957 BITOP_TEST(xor)
958 
959 #undef BITOP_TEST
960 
961 TEST_F(mme_tu104_sim_test, merge)
962 {
963    mme_builder b;
964    mme_builder_init(&b, devinfo);
965 
966    mme_value x = mme_load(&b);
967    mme_value y = mme_load(&b);
968 
969    mme_value m1 = mme_merge(&b, x, y, 12, 12, 20);
970    mme_value m2 = mme_merge(&b, x, y, 12, 8,  20);
971    mme_value m3 = mme_merge(&b, x, y, 8,  12, 20);
972    mme_value m4 = mme_merge(&b, x, y, 12, 16, 8);
973    mme_value m5 = mme_merge(&b, x, y, 24, 12, 8);
974 
975    mme_store_imm_addr(&b, data_addr + 0,  m1);
976    mme_store_imm_addr(&b, data_addr + 4,  m2);
977    mme_store_imm_addr(&b, data_addr + 8,  m3);
978    mme_store_imm_addr(&b, data_addr + 12, m4);
979    mme_store_imm_addr(&b, data_addr + 16, m5);
980 
981    auto macro = mme_builder_finish_vec(&b);
982 
983    std::vector<uint32_t> params;
984    params.push_back(0x0c406fe0);
985    params.push_back(0x76543210u);
986 
987    test_macro(&b, macro, params);
988 }
989 
990 #define COMPARISON_TEST(op)                     \
991 TEST_F(mme_tu104_sim_test, op)                  \
992 {                                               \
993    mme_builder b;                               \
994    mme_builder_init(&b, devinfo);            \
995                                                 \
996    mme_value x = mme_load(&b);                  \
997    mme_value y = mme_load(&b);                  \
998    mme_value z = mme_load(&b);                  \
999    mme_value w = mme_load(&b);                  \
1000                                                 \
1001    mme_value v1 = mme_##op(&b, x, y);           \
1002    mme_value v2 = mme_##op(&b, y, x);           \
1003    mme_value v3 = mme_##op(&b, y, z);           \
1004    mme_value v4 = mme_##op(&b, z, y);           \
1005    mme_value v5 = mme_##op(&b, w, z);           \
1006    mme_value v6 = mme_##op(&b, z, w);           \
1007    mme_value v7 = mme_##op(&b, w, w);           \
1008                                                 \
1009    mme_store_imm_addr(&b, data_addr + 0,  v1);  \
1010    mme_store_imm_addr(&b, data_addr + 4,  v2);  \
1011    mme_store_imm_addr(&b, data_addr + 8,  v3);  \
1012    mme_store_imm_addr(&b, data_addr + 12, v4);  \
1013    mme_store_imm_addr(&b, data_addr + 16, v5);  \
1014    mme_store_imm_addr(&b, data_addr + 20, v6);  \
1015    mme_store_imm_addr(&b, data_addr + 24, v7);  \
1016                                                 \
1017    auto macro = mme_builder_finish_vec(&b);     \
1018                                                 \
1019    std::vector<uint32_t> params;                \
1020    params.push_back(-5);                        \
1021    params.push_back(-10);                       \
1022    params.push_back(5);                         \
1023    params.push_back(10);                        \
1024                                                 \
1025    test_macro(&b, macro, params);               \
1026 }
1027 
1028 COMPARISON_TEST(slt)
COMPARISON_TEST(sltu)1029 COMPARISON_TEST(sltu)
1030 COMPARISON_TEST(sle)
1031 COMPARISON_TEST(sleu)
1032 COMPARISON_TEST(seq)
1033 
1034 #undef COMPARISON_TEST
1035 
1036 static inline void
1037 mme_inc_whole_inst(mme_builder *b, mme_value val)
1038 {
1039    mme_tu104_asm(b, i) {
1040       i.alu[0].dst = mme_value_as_reg(val);
1041       i.alu[0].op = MME_TU104_ALU_OP_ADD;
1042       i.alu[0].src[0] = mme_value_as_reg(val);
1043       i.alu[0].src[1] = MME_TU104_REG_IMM;
1044       i.imm[0] = 1;
1045    }
1046 }
1047 
/* mme_loop() with a loaded trip count.
 *
 * The loop body adds `count` to an accumulator, so the accumulator ends up
 * as count * count.  A second value is incremented once after the loop.
 * The macro is run for several trip counts, including zero.
 */
TEST_F(mme_tu104_sim_test, loop)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value count = mme_load(&b);

   mme_value accum = mme_mov(&b, mme_zero());
   mme_value after = mme_mov(&b, mme_zero());

   mme_loop(&b, count) {
      mme_tu104_asm(&b, nop) { } /* noop */
      mme_add_to(&b, accum, accum, count);
   }
   mme_add_to(&b, after, after, mme_imm(1));

   /* Three empty instructions between the loop tail and the stores. */
   for (uint32_t n = 0; n < 3; n++) {
      mme_tu104_asm(&b, nop) { } /* noop */
   }

   mme_store_imm_addr(&b, data_addr + 0,  count);
   mme_store_imm_addr(&b, data_addr + 4,  accum);
   mme_store_imm_addr(&b, data_addr + 8,  after);

   const auto macro = mme_builder_finish_vec(&b);

   const uint32_t counts[] = {0, 1, 5, 9};

   for (const uint32_t trip_count : counts) {
      reset_push();

      const std::vector<uint32_t> params = { trip_count };

      test_macro(&b, macro, params);
      ASSERT_EQ(data[0], trip_count);
      ASSERT_EQ(data[1], trip_count * trip_count);
      ASSERT_EQ(data[2], 1);
   }
}
1087 
/* Hand-assembled JAL followed by ten single-instruction increments.
 *
 * The final assertion expects only five of the ten increments to have taken
 * effect.  The second value is never incremented; it is only stored.
 */
TEST_F(mme_tu104_sim_test, jal)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value jumped_over = mme_mov(&b, mme_zero());
   mme_value untouched = mme_mov(&b, mme_zero());

   mme_tu104_asm(&b, jump) {
      jump.alu[0].op = MME_TU104_ALU_OP_JAL;
      jump.imm[0] = (1 << 15) | 6;
   }

   for (uint32_t n = 0; n < 10; n++)
      mme_inc_whole_inst(&b, jumped_over);

   /* Second half of the test, currently disabled: */
//   mme_tu104_asm(&b, i) {
//      i.alu[0].op = MME_TU104_ALU_OP_JAL;
//      i.imm[0] = 6;
//   }
//
//   for (uint32_t j = 0; j < 10; j++)
//      mme_inc_whole_inst(&b, y);

   mme_store_imm_addr(&b, data_addr + 0, jumped_over);
   mme_store_imm_addr(&b, data_addr + 4, untouched);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
   ASSERT_EQ(data[0], 5);
}
1121 
/* Hand-assembled BEQ placed in front of ten single-instruction increments,
 * one per value.  All ten values are stored afterwards.  The test adds no
 * assertions of its own beyond what test_macro() does.
 */
TEST_F(mme_tu104_sim_test, bxx_fwd)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value vals[10];
   for (mme_value &v : vals)
      v = mme_mov(&b, mme_zero());

   mme_tu104_asm(&b, branch) {
      branch.alu[0].op = MME_TU104_ALU_OP_BEQ;
      branch.imm[0] = (1 << 15) | 6;
   }

   for (const mme_value &v : vals)
      mme_inc_whole_inst(&b, v);

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, vals[slot]);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
1147 
/* Hand-assembled control flow with a negative BEQ offset.
 *
 * Instruction layout after the fifteen zero-initialised values:
 *   JAL, ten increments (vals[0..9]), JAL, BEQ with offset -8,
 *   five increments (vals[10..14]), fifteen stores.
 * The assertions expect vals[0..2] to stay 0 and vals[3..14] to be 1.
 */
TEST_F(mme_tu104_sim_test, bxx_bwd)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value vals[15];
   for (mme_value &v : vals)
      v = mme_mov(&b, mme_zero());

   mme_tu104_asm(&b, first_jump) {
      first_jump.alu[0].op = MME_TU104_ALU_OP_JAL;
      first_jump.imm[0] = (1 << 15) | 12;
   }

   for (uint32_t n = 0; n < 10; n++)
      mme_inc_whole_inst(&b, vals[n]);

   mme_tu104_asm(&b, second_jump) {
      second_jump.alu[0].op = MME_TU104_ALU_OP_JAL;
      second_jump.imm[0] = (1 << 15) | 2;
   }

   mme_tu104_asm(&b, back_branch) {
      back_branch.alu[0].op = MME_TU104_ALU_OP_BEQ;
      back_branch.imm[0] = (1 << 15) | ((-8) & 0x1fff);
   }

   for (uint32_t n = 10; n < 15; n++)
      mme_inc_whole_inst(&b, vals[n]);

   for (uint32_t slot = 0; slot < 15; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, vals[slot]);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);

   for (uint32_t slot = 0; slot < 15; slot++) {
      if (slot < 3) {
         ASSERT_EQ(data[slot], 0);
      } else {
         ASSERT_EQ(data[slot], 1);
      }
   }
}
1190 
/* Hand-assembled BEQ with immediate (1 << 15) | 0x1000.
 *
 * Ten zeros are stored first.  The increments and stores built after the
 * branch must not become visible: every checked dword has to stay 0.
 */
TEST_F(mme_tu104_sim_test, bxx_exit)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value vals[10];
   for (mme_value &v : vals)
      v = mme_mov(&b, mme_zero());

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, mme_imm(0));

   mme_tu104_asm(&b, branch) {
      branch.alu[0].op = MME_TU104_ALU_OP_BEQ;
      branch.imm[0] = (1 << 15) | 0x1000;
   }

   /* those writes won't be visible */
   for (const mme_value &v : vals)
      mme_inc_whole_inst(&b, v);

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, vals[slot]);

   const std::vector<uint32_t> no_params;

   const auto macro = mme_builder_finish_vec(&b);
   test_macro(&b, macro, no_params);

   for (uint32_t slot = 0; slot < 10; slot++)
      ASSERT_EQ(data[slot], 0);
}
1224 
/* Unconditional mme_exit().
 *
 * Ten zeros are stored, then the macro exits.  The moves and stores built
 * after the exit must not become visible: every checked dword stays 0.
 */
TEST_F(mme_tu104_sim_test, mme_exit)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value vals[10];
   for (mme_value &v : vals)
      v = mme_mov(&b, mme_zero());

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, mme_imm(0));

   /* abort */
   mme_exit(&b);

   /* those writes won't be visible */
   for (uint32_t slot = 0; slot < 10; slot++)
      vals[slot] = mme_mov(&b, mme_imm(slot));

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, vals[slot]);

   const std::vector<uint32_t> no_params;

   const auto macro = mme_builder_finish_vec(&b);
   test_macro(&b, macro, no_params);

   for (uint32_t slot = 0; slot < 10; slot++)
      ASSERT_EQ(data[slot], 0);
}
1257 
/* Conditional mme_exit_if().
 *
 * A first exit on (0 == 1) must be a no-op.  The values are then set to
 * 0..9 and stored one by one, each store guarded by an exit on
 * (5 <= value).  Dwords 0..4 are expected to hold their index and dwords
 * 5..9 to keep the zero written at the start.
 */
TEST_F(mme_tu104_sim_test, mme_exit_if)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value vals[10];
   for (mme_value &v : vals)
      v = mme_mov(&b, mme_zero());

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, mme_imm(0));

   /* shouldn't do anything */
   mme_exit_if(&b, ieq, mme_zero(), mme_imm(1));

   for (uint32_t slot = 0; slot < 10; slot++)
      vals[slot] = mme_mov(&b, mme_imm(slot));

   for (uint32_t slot = 0; slot < 10; slot++) {
      /* abort on reaching 5 */
      mme_exit_if(&b, ile, mme_imm(5), vals[slot]);
      mme_store_imm_addr(&b, data_addr + slot * 4, vals[slot]);
   }

   const std::vector<uint32_t> no_params;

   const auto macro = mme_builder_finish_vec(&b);
   test_macro(&b, macro, no_params);

   for (uint32_t slot = 0; slot < 10; slot++) {
      const uint32_t expected = slot < 5 ? slot : 0;
      ASSERT_EQ(data[slot], expected);
   }
}
1291 
/* Host-side reference predicates, one per MME comparison suffix.
 *
 * The IF_TEST and WHILE_TEST macros call these as c_<op>(x, y) to compute
 * the expected outcome.  The "i" variants compare as signed 32-bit values
 * and the "u" variants as unsigned 32-bit values; callers rely on the
 * implicit conversion of their arguments to the parameter type.
 */
static bool c_ilt(int32_t x, int32_t y) { return x < y; }
static bool c_ult(uint32_t x, uint32_t y) { return x < y; }
static bool c_ile(int32_t x, int32_t y) { return x <= y; }
static bool c_ule(uint32_t x, uint32_t y) { return x <= y; }
static bool c_ieq(int32_t x, int32_t y) { return x == y; }
static bool c_ige(int32_t x, int32_t y) { return x >= y; }
static bool c_uge(uint32_t x, uint32_t y) { return x >= y; }
static bool c_igt(int32_t x, int32_t y) { return x > y; }
static bool c_ugt(uint32_t x, uint32_t y) { return x > y; }
static bool c_ine(int32_t x, int32_t y) { return x != y; }
1302 
1303 #define IF_TEST(op)                                                  \
1304 TEST_F(mme_tu104_sim_test, if_##op)                                  \
1305 {                                                                    \
1306    mme_builder b;                                                    \
1307    mme_builder_init(&b, devinfo);                                 \
1308                                                                      \
1309    mme_value x = mme_load(&b);                                       \
1310    mme_value y = mme_load(&b);                                       \
1311    mme_value i = mme_mov(&b, mme_zero());                            \
1312                                                                      \
1313    mme_start_if_##op(&b, x, y);                                      \
1314    {                                                                 \
1315       mme_add_to(&b, i, i, mme_imm(1));                              \
1316       mme_add_to(&b, i, i, mme_imm(1));                              \
1317    }                                                                 \
1318    mme_end_if(&b);                                                   \
1319    mme_add_to(&b, i, i, mme_imm(1));                                 \
1320    mme_add_to(&b, i, i, mme_imm(1));                                 \
1321    mme_add_to(&b, i, i, mme_imm(1));                                 \
1322                                                                      \
1323    mme_store_imm_addr(&b, data_addr + 0, i);                         \
1324                                                                      \
1325    auto macro = mme_builder_finish_vec(&b);                          \
1326                                                                      \
1327    uint32_t vals[] = {23, 56, (uint32_t)-5, (uint32_t)-10, 56, 14};  \
1328                                                                      \
1329    for (uint32_t i = 0; i < ARRAY_SIZE(vals) - 1; i++) {             \
1330       reset_push();                                                  \
1331                                                                      \
1332       std::vector<uint32_t> params;                                  \
1333       params.push_back(vals[i + 0]);                                 \
1334       params.push_back(vals[i + 1]);                                 \
1335                                                                      \
1336       test_macro(&b, macro, params);                                 \
1337                                                                      \
1338       ASSERT_EQ(data[0], c_##op(params[0], params[1]) ? 5 : 3);      \
1339    }                                                                 \
1340 }
1341 
1342 IF_TEST(ilt)
IF_TEST(ult)1343 IF_TEST(ult)
1344 IF_TEST(ile)
1345 IF_TEST(ule)
1346 IF_TEST(ieq)
1347 IF_TEST(ige)
1348 IF_TEST(uge)
1349 IF_TEST(igt)
1350 IF_TEST(ugt)
1351 IF_TEST(ine)
1352 
1353 #undef IF_TEST
1354 
1355 #define WHILE_TEST(op, start, step, bound)            \
1356 TEST_F(mme_tu104_sim_test, while_##op)                \
1357 {                                                     \
1358    mme_builder b;                                     \
1359    mme_builder_init(&b, devinfo);                  \
1360                                                       \
1361    mme_value x = mme_mov(&b, mme_zero());             \
1362    mme_value y = mme_mov(&b, mme_zero());             \
1363    mme_value z = mme_mov(&b, mme_imm(start));         \
1364    mme_value w = mme_mov(&b, mme_zero());             \
1365    mme_value v = mme_mov(&b, mme_zero());             \
1366                                                       \
1367    for (uint32_t j = 0; j < 5; j++)                   \
1368       mme_inc_whole_inst(&b, x);                      \
1369                                                       \
1370    mme_while(&b, op, z, mme_imm(bound)) {             \
1371       for (uint32_t j = 0; j < 5; j++)                \
1372          mme_inc_whole_inst(&b, y);                   \
1373                                                       \
1374       mme_add_to(&b, z, z, mme_imm(step));            \
1375                                                       \
1376       for (uint32_t j = 0; j < 5; j++)                \
1377          mme_inc_whole_inst(&b, w);                   \
1378    }                                                  \
1379                                                       \
1380    for (uint32_t j = 0; j < 5; j++)                   \
1381       mme_inc_whole_inst(&b, v);                      \
1382                                                       \
1383    mme_store_imm_addr(&b, data_addr + 0, x);          \
1384    mme_store_imm_addr(&b, data_addr + 4, y);          \
1385    mme_store_imm_addr(&b, data_addr + 8, z);          \
1386    mme_store_imm_addr(&b, data_addr + 12, w);         \
1387    mme_store_imm_addr(&b, data_addr + 16, v);         \
1388                                                       \
1389    auto macro = mme_builder_finish_vec(&b);           \
1390                                                       \
1391    uint32_t end = (uint32_t)(start), count = 0;       \
1392    while (c_##op(end, (bound))) {                     \
1393       end += (uint32_t)(step);                        \
1394       count++;                                        \
1395    }                                                  \
1396                                                       \
1397    std::vector<uint32_t> params;                      \
1398    test_macro(&b, macro, params);                     \
1399    ASSERT_EQ(data[0], 5);                             \
1400    ASSERT_EQ(data[1], 5 * count);                     \
1401    ASSERT_EQ(data[2], end);                           \
1402    ASSERT_EQ(data[3], 5 * count);                     \
1403    ASSERT_EQ(data[4], 5);                             \
1404 }
1405 
1406 WHILE_TEST(ilt, 0, 1, 7)
1407 WHILE_TEST(ult, 0, 1, 7)
1408 WHILE_TEST(ile, -10, 2, 0)
1409 WHILE_TEST(ule, 0, 1, 7)
1410 WHILE_TEST(ieq, 0, 5, 0)
1411 WHILE_TEST(ige, 5, -1, -5)
1412 WHILE_TEST(uge, 15, -2, 2)
1413 WHILE_TEST(igt, 7, -3, -10)
1414 WHILE_TEST(ugt, 1604, -30, 1000)
1415 WHILE_TEST(ine, 0, 1, 7)
1416 
1417 #undef WHILE_TEST
1418 
1419 TEST_F(mme_tu104_sim_test, nested_while)
1420 {
1421    mme_builder b;
1422    mme_builder_init(&b, devinfo);
1423 
1424    mme_value n = mme_load(&b);
1425    mme_value m = mme_load(&b);
1426 
1427    mme_value count = mme_mov(&b, mme_zero());
1428 
1429    mme_value i = mme_mov(&b, mme_zero());
1430    mme_value j = mme_mov(&b, mme_imm(0xffff));
1431    mme_while(&b, ine, i, n) {
1432       mme_mov_to(&b, j, mme_zero());
1433       mme_while(&b, ine, j, m) {
1434          mme_add_to(&b, count, count, mme_imm(1));
1435          mme_add_to(&b, j, j, mme_imm(1));
1436       }
1437 
1438       mme_add_to(&b, i, i, mme_imm(1));
1439    }
1440 
1441    mme_store_imm_addr(&b, data_addr + 0, i);
1442    mme_store_imm_addr(&b, data_addr + 4, j);
1443    mme_store_imm_addr(&b, data_addr + 8, count);
1444 
1445    auto macro = mme_builder_finish_vec(&b);
1446 
1447    std::vector<uint32_t> params;
1448    params.push_back(3);
1449    params.push_back(5);
1450 
1451    test_macro(&b, macro, params);
1452    ASSERT_EQ(data[0], 3);
1453    ASSERT_EQ(data[1], 5);
1454    ASSERT_EQ(data[2], 15);
1455 }
1456 
/* Disabled test (compiled out by the #if 0 below).
 *
 * It builds a macro that loads a bound into R5, counts R6 up towards it with
 * a backward BLE branch, and bumps R7 along the way before storing R7.
 *
 * NOTE(review): the body calls mme_alu(), mme_alu_imm(), mme_branch() and
 * mme_end(), none of which are used by the enabled tests in this section,
 * and it calls test_macro() with two arguments although the fixture declares
 * it with three.  It presumably targets an earlier builder interface and
 * would need porting before it can be re-enabled -- confirm against the
 * current builder headers.
 */
1457 #if 0
1458 TEST_F(mme_tu104_sim_test, do_ble)
1459 {
1460    mme_builder b;
1461    mme_builder_init(&b, devinfo);
1462 
1463    mme_alu(&b, R5, ADD, LOAD0, ZERO);
1464    mme_alu(&b, R6, ADD, ZERO, ZERO);
1465    mme_alu(&b, R7, ADD, ZERO, ZERO);
1466 
1467    mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
1468    mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
1469    mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
1470    mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
1471    mme_alu_imm(&b, R6, ADD, R6, IMM, 1);
1472    mme_branch(&b, BLE, R6, R5, -3, 2);
1473    mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
1474    mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
1475 
1476    mme_store_imm_addr(&b, data_addr + 0,  MME_TU104_REG_R7);
1477 
1478    mme_end(&b);
1479 
1480    uint32_t counts[] = {0, 1, 5, 9};
1481 
1482    for (uint32_t i = 0; i < ARRAY_SIZE(counts); i++) {
1483       reset_push();
1484 
1485       std::vector<uint32_t> params;
1486       params.push_back(counts[i]);
1487 
1488       test_macro(&b, params);
1489    }
1490 }
1491 #endif
1492 
/* mme_dwrite() / mme_dread() round trip.
 *
 * Two loaded parameters are written at indices 5 and 8, read back in the
 * opposite order, and the read-back values are stored to the data buffer.
 */
TEST_F(mme_tu104_sim_test, dread_dwrite)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_dwrite(&b, mme_imm(5), first);
   mme_dwrite(&b, mme_imm(8), second);

   /* Read index 8 before index 5, i.e. in reverse write order. */
   mme_value second_back = mme_dread(&b, mme_imm(8));
   mme_value first_back = mme_dread(&b, mme_imm(5));

   mme_store_imm_addr(&b, data_addr + 0, second_back);
   mme_store_imm_addr(&b, data_addr + 4, first_back);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { static_cast<uint32_t>(-10), 5 };

   test_macro(&b, macro, params);
}
1518 
/* mme_dwrite() followed by an MME_DMA_WRITE of the data RAM to memory.
 *
 * The macro writes two canaries at data-RAM indices 5 and 8.  The push
 * buffer then issues a 20-dword DMA write that starts at data-RAM index 3,
 * so the canaries are expected at output dwords 2 and 5 and zero is
 * expected everywhere else.  This test drives the GPU directly instead of
 * going through test_macro().
 */
TEST_F(mme_tu104_sim_test, dwrite_dma)
{
   const uint32_t canary5 = 0xc0ffee01;
   const uint32_t canary8 = canary5 & 0x00ffff00;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_dwrite(&b, mme_imm(5), first);
   mme_dwrite(&b, mme_imm(8), second);

   const auto macro = mme_builder_finish_vec(&b);

   push_macro(0, macro);

   P_1INC(p, NVC597, CALL_MME_MACRO(0));
   P_INLINE_DATA(p, canary5);
   P_INLINE_DATA(p, canary8);

   P_MTHD(p, NVC597, SET_MME_MEM_ADDRESS_A);
   P_NVC597_SET_MME_MEM_ADDRESS_A(p, high32(data_addr));
   P_NVC597_SET_MME_MEM_ADDRESS_B(p, low32(data_addr));
   /* Start 3 dwords into MME RAM */
   P_NVC597_SET_MME_DATA_RAM_ADDRESS(p, 3);
   P_IMMD(p, NVC597, MME_DMA_WRITE, 20);

   submit_push();

   for (uint32_t dw = 0; dw < 20; dw++) {
      /* Output dword N corresponds to data-RAM index N + 3. */
      uint32_t expected = 0;
      if (dw == 5 - 3)
         expected = canary5;
      else if (dw == 8 - 3)
         expected = canary8;

      ASSERT_EQ(data[dw], expected);
   }
}
1560 
/* Walks the index range [0, MME_TU104_DRAM_COUNT) through mme_dwrite() and
 * mme_dread() in chunks.
 *
 * For a chunk (start, count) the macro writes each index value at its own
 * index, then reads the same indices back and stores them to consecutive
 * dwords of the data buffer.  The host checks that dword j of every chunk
 * equals start + j.  This test drives the GPU directly instead of going
 * through test_macro().
 */
TEST_F(mme_tu104_sim_test, dram_limit)
{
   static const uint32_t chunk_size = 32;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value start = mme_load(&b);
   mme_value count = mme_load(&b);

   /* Fill pass: entry[k] = k for every k in the chunk. */
   mme_value wr_idx = mme_mov(&b, start);
   mme_loop(&b, count) {
      mme_dwrite(&b, wr_idx, wr_idx);
      mme_add_to(&b, wr_idx, wr_idx, mme_imm(1));
   }

   /* Read-back pass into the data buffer, one dword per entry. */
   mme_value rd_idx = mme_mov(&b, start);
   struct mme_value64 out_addr = mme_mov64(&b, mme_imm64(data_addr));

   mme_loop(&b, count) {
      mme_value entry = mme_dread(&b, rd_idx);
      mme_store(&b, out_addr, entry);
      mme_add_to(&b, rd_idx, rd_idx, mme_imm(1));
      mme_add64_to(&b, out_addr, out_addr, mme_imm64(4));
   }

   const auto macro = mme_builder_finish_vec(&b);

   for (uint32_t base = 0; base < MME_TU104_DRAM_COUNT; base += chunk_size) {
      reset_push();

      push_macro(0, macro);

      P_1INC(p, NVC597, CALL_MME_MACRO(0));
      P_INLINE_DATA(p, base);
      P_INLINE_DATA(p, chunk_size);

      submit_push();

      for (uint32_t off = 0; off < chunk_size; off++)
         ASSERT_EQ(data[off], base + off);
   }
}
1604 
/* MME_DMA_READ_FIFOED issued from inside a macro.
 *
 * The macro sets the data-RAM address to zero, points the MME memory
 * address at the data buffer, emits MME_DMA_READ_FIFOED with the value 2,
 * and issues a load barrier.  It then loads two values and stores them 256
 * bytes into the data buffer.  The host seeds the first 64 dwords of the
 * buffer with 1000 + index before running the macro.
 */
TEST_F(mme_tu104_sim_test, dma_read_fifoed)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_mthd(&b, NVC597_SET_MME_DATA_RAM_ADDRESS);
   mme_emit(&b, mme_zero());

   mme_mthd(&b, NVC597_SET_MME_MEM_ADDRESS_A);
   mme_emit(&b, mme_imm(high32(data_addr)));
   mme_emit(&b, mme_imm(low32(data_addr)));

   mme_mthd(&b, NVC597_MME_DMA_READ_FIFOED);
   mme_emit(&b, mme_imm(2));

   mme_tu104_load_barrier(&b);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_store_imm_addr(&b, data_addr + 256 + 0, first);
   mme_store_imm_addr(&b, data_addr + 256 + 4, second);

   const auto macro = mme_builder_finish_vec(&b);

   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);

   /* Recognisable source pattern for the DMA read. */
   for (uint32_t dw = 0; dw < 64; dw++)
      data[dw] = 1000 + dw;

   const std::vector<uint32_t> params = { 7 };

   test_macro(&b, macro, params);
}
1640 
/* Walks the index range [0, MME_TU104_SCRATCH_COUNT) of the
 * SET_MME_SHADOW_SCRATCH method array in chunks.
 *
 * For a chunk (start, count) the macro emits each index value to the
 * scratch method at that index, then reads the same indices back with
 * mme_state_arr() and stores them to consecutive dwords of the data
 * buffer.  The host checks that dword j of every chunk equals start + j.
 * This test drives the GPU directly instead of going through test_macro().
 */
TEST_F(mme_tu104_sim_test, scratch_limit)
{
   static const uint32_t chunk_size = 32;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value start = mme_load(&b);
   mme_value count = mme_load(&b);

   /* Fill pass: scratch[k] = k for every k in the chunk. */
   mme_value wr_idx = mme_mov(&b, start);
   mme_loop(&b, count) {
      mme_mthd_arr(&b, NVC597_SET_MME_SHADOW_SCRATCH(0), wr_idx);
      mme_emit(&b, wr_idx);
      mme_add_to(&b, wr_idx, wr_idx, mme_imm(1));
   }

   /* Read-back pass into the data buffer, one dword per entry. */
   mme_value rd_idx = mme_mov(&b, start);
   struct mme_value64 out_addr = mme_mov64(&b, mme_imm64(data_addr));

   mme_loop(&b, count) {
      mme_value entry =
         mme_state_arr(&b, NVC597_SET_MME_SHADOW_SCRATCH(0), rd_idx);
      mme_store(&b, out_addr, entry);
      mme_add_to(&b, rd_idx, rd_idx, mme_imm(1));
      mme_add64_to(&b, out_addr, out_addr, mme_imm64(4));
   }

   const auto macro = mme_builder_finish_vec(&b);

   for (uint32_t base = 0; base < MME_TU104_SCRATCH_COUNT; base += chunk_size) {
      reset_push();

      push_macro(0, macro);

      P_1INC(p, NVC597, CALL_MME_MACRO(0));
      P_INLINE_DATA(p, base);
      P_INLINE_DATA(p, chunk_size);

      submit_push();

      for (uint32_t off = 0; off < chunk_size; off++)
         ASSERT_EQ(data[off], base + off);
   }
}
1685