/*
 * xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/si_test_dma_perf.c
 * (revision 6104692788411f58d303aa86923a9ff6ecaded22)
 */
/*
 * Copyright 2024 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */
6 
#include "si_pipe.h"
#include "si_query.h"
#include "util/streaming-load-memcpy.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
10 
/* Smallest and largest operation size benchmarked by si_test_dma_perf, in bytes. */
#define MIN_SIZE   512
#define MAX_SIZE   (128 * 1024 * 1024)
/* The size is shifted left by this amount between columns (1 = doubling). */
#define SIZE_SHIFT 1
/* Iterations executed before the timer query is started. */
#define WARMUP_RUNS 16
/* Timed iterations; the elapsed time is divided by this count. */
#define NUM_RUNS   32
16 
/* Benchmark flavors of si_test_dma_perf. All fills come first; everything from
 * TEST_COPY_VRAM_VRAM onwards is treated as a copy by the benchmark loop.
 */
enum {
   TEST_FILL_VRAM,
   TEST_FILL_VRAM_12B,
   TEST_FILL_GTT,
   TEST_FILL_GTT_12B,
   TEST_COPY_VRAM_VRAM,
   TEST_COPY_VRAM_GTT,
   TEST_COPY_GTT_VRAM,
   NUM_TESTS,
};

/* Row labels printed in the "Test" column, indexed by the enum above. */
static const char *const test_strings[NUM_TESTS] = {
   [TEST_FILL_VRAM] = "fill->VRAM",
   [TEST_FILL_VRAM_12B] = "fill->VRAM 12B",
   [TEST_FILL_GTT] = "fill->GTT",
   [TEST_FILL_GTT_12B] = "fill->GTT 12B",
   [TEST_COPY_VRAM_VRAM] = "VRAM->VRAM",
   [TEST_COPY_VRAM_GTT] = "VRAM->GTT",
   [TEST_COPY_GTT_VRAM] = "GTT->VRAM",
};
37 
/* Ways of executing a fill or copy that si_test_dma_perf compares. */
enum {
   METHOD_DEFAULT,
   METHOD_CP_DMA,
   METHOD_COMPUTE_2DW,
   METHOD_COMPUTE_3DW,
   METHOD_COMPUTE_4DW,
   NUM_METHODS,
};

/* Labels printed in the "Method" column, indexed by the enum above. */
static const char *const method_strings[NUM_METHODS] = {
   [METHOD_DEFAULT] = "Default",
   [METHOD_CP_DMA] = "CP DMA",
   [METHOD_COMPUTE_2DW] = "CS 2dw",
   [METHOD_COMPUTE_3DW] = "CS 3dw",
   [METHOD_COMPUTE_4DW] = "CS 4dw",
};
54 
/* Alignment cases benchmarked by si_test_dma_perf. */
enum {
   ALIGN_MAX,
   ALIGN_256,
   ALIGN_128,
   ALIGN_64,
   ALIGN_4,
   ALIGN_2,
   ALIGN_1,
   ALIGN_SRC128,
   ALIGN_SRC64,
   ALIGN_SRC4,
   ALIGN_SRC2,
   ALIGN_SRC1,
   ALIGN_DST128,
   ALIGN_DST64,
   ALIGN_DST4,
   ALIGN_DST2,
   ALIGN_DST1,
   ALIGN_SRC4_DST2,
   ALIGN_SRC4_DST1,
   ALIGN_SRC2_DST4,
   ALIGN_SRC2_DST1,
   ALIGN_SRC1_DST4,
   ALIGN_SRC1_DST2,
   NUM_ALIGNMENTS,
};

/* One alignment case: its column label and the byte offsets applied to the
 * source and destination buffers (0 leaves the buffer start unchanged).
 */
struct align_info_t {
   const char *string;
   unsigned src_offset;
   unsigned dst_offset;
};

/* Alignment cases, indexed by the enum above. */
static const struct align_info_t align_info[NUM_ALIGNMENTS] = {
   [ALIGN_MAX]       = {.string = "both=max",    .src_offset = 0,   .dst_offset = 0},
   [ALIGN_256]       = {.string = "both=256",    .src_offset = 256, .dst_offset = 256},
   [ALIGN_128]       = {.string = "both=128",    .src_offset = 128, .dst_offset = 128},
   [ALIGN_64]        = {.string = "both=64",     .src_offset = 64,  .dst_offset = 64},
   [ALIGN_4]         = {.string = "both=4",      .src_offset = 4,   .dst_offset = 4},
   [ALIGN_2]         = {.string = "both=2",      .src_offset = 2,   .dst_offset = 2},
   [ALIGN_1]         = {.string = "both=1",      .src_offset = 1,   .dst_offset = 1},
   [ALIGN_SRC128]    = {.string = "src=128",     .src_offset = 128, .dst_offset = 0},
   [ALIGN_SRC64]     = {.string = "src=64",      .src_offset = 64,  .dst_offset = 0},
   [ALIGN_SRC4]      = {.string = "src=4",       .src_offset = 4,   .dst_offset = 0},
   [ALIGN_SRC2]      = {.string = "src=2",       .src_offset = 2,   .dst_offset = 0},
   [ALIGN_SRC1]      = {.string = "src=1",       .src_offset = 1,   .dst_offset = 0},
   [ALIGN_DST128]    = {.string = "dst=128",     .src_offset = 0,   .dst_offset = 128},
   [ALIGN_DST64]     = {.string = "dst=64",      .src_offset = 0,   .dst_offset = 64},
   [ALIGN_DST4]      = {.string = "dst=4",       .src_offset = 0,   .dst_offset = 4},
   [ALIGN_DST2]      = {.string = "dst=2",       .src_offset = 0,   .dst_offset = 2},
   [ALIGN_DST1]      = {.string = "dst=1",       .src_offset = 0,   .dst_offset = 1},
   [ALIGN_SRC4_DST2] = {.string = "src=4 dst=2", .src_offset = 4,   .dst_offset = 2},
   [ALIGN_SRC4_DST1] = {.string = "src=4 dst=1", .src_offset = 4,   .dst_offset = 1},
   [ALIGN_SRC2_DST4] = {.string = "src=2 dst=4", .src_offset = 2,   .dst_offset = 4},
   [ALIGN_SRC2_DST1] = {.string = "src=2 dst=1", .src_offset = 2,   .dst_offset = 1},
   [ALIGN_SRC1_DST4] = {.string = "src=1 dst=4", .src_offset = 1,   .dst_offset = 4},
   [ALIGN_SRC1_DST2] = {.string = "src=1 dst=2", .src_offset = 1,   .dst_offset = 2},
};
113 
si_test_dma_perf(struct si_screen * sscreen)114 void si_test_dma_perf(struct si_screen *sscreen)
115 {
116    struct pipe_screen *screen = &sscreen->b;
117    struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
118    struct si_context *sctx = (struct si_context *)ctx;
119 
120    sscreen->ws->cs_set_pstate(&sctx->gfx_cs, RADEON_CTX_PSTATE_PEAK);
121 
122    printf("Test          , Method , Alignment  ,");
123    for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
124       if (size >= 1024 * 1024)
125          printf("%6uMB,", size / (1024 * 1024));
126       else if (size >= 1024)
127          printf("%6uKB,", size / 1024);
128       else
129          printf(" %6uB,", size);
130    }
131    printf("\n");
132 
133    /* Run benchmarks. */
134    for (unsigned test_flavor = 0; test_flavor < NUM_TESTS; test_flavor++) {
135       bool is_copy = test_flavor >= TEST_COPY_VRAM_VRAM;
136 
137       if (test_flavor)
138          puts("");
139 
140       for (unsigned method = 0; method < NUM_METHODS; method++) {
141          for (unsigned align = 0; align < NUM_ALIGNMENTS; align++) {
142             unsigned dwords_per_thread, clear_value_size;
143             unsigned src_offset = align_info[align].src_offset;
144             unsigned dst_offset = align_info[align].dst_offset;
145 
146             /* offset > 0 && offset < 4 is the only case when the compute shader performs the same
147              * as offset=0 without any alignment optimizations, so shift the offset by 4 to get
148              * unaligned performance.
149              */
150             if (src_offset && src_offset < 4)
151                src_offset += 4;
152             if (dst_offset && dst_offset < 4)
153                dst_offset += 4;
154 
155             if (!is_copy && dst_offset != src_offset)
156                continue;
157 
158             if (test_flavor == TEST_FILL_VRAM_12B || test_flavor == TEST_FILL_GTT_12B) {
159                if ((method != METHOD_DEFAULT && method != METHOD_COMPUTE_3DW &&
160                     method != METHOD_COMPUTE_4DW) || dst_offset % 4)
161                   continue;
162 
163                dwords_per_thread = method == METHOD_COMPUTE_3DW ? 3 : 4;
164                clear_value_size = 12;
165             } else {
166                if (method == METHOD_COMPUTE_3DW)
167                   continue;
168 
169                dwords_per_thread = method == METHOD_COMPUTE_2DW ? 2 : 4;
170                clear_value_size = dst_offset % 4 ? 1 : 4;
171             }
172 
173             printf("%-14s, %-7s, %-11s,", test_strings[test_flavor], method_strings[method],
174                    align_info[align].string);
175 
176             for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
177                struct pipe_resource *dst, *src;
178                enum pipe_resource_usage dst_usage = PIPE_USAGE_DEFAULT;
179                enum pipe_resource_usage src_usage = PIPE_USAGE_DEFAULT;
180 
181                if (test_flavor == TEST_FILL_GTT || test_flavor == TEST_FILL_GTT_12B ||
182                    test_flavor == TEST_COPY_VRAM_GTT)
183                   dst_usage = PIPE_USAGE_STREAM;
184 
185                if (test_flavor == TEST_COPY_GTT_VRAM)
186                   src_usage = PIPE_USAGE_STREAM;
187 
188                /* Don't test large sizes with GTT because it's slow. */
189                if ((dst_usage == PIPE_USAGE_STREAM || src_usage == PIPE_USAGE_STREAM) &&
190                    size > 16 * 1024 * 1024) {
191                   printf("%8s,", "n/a");
192                   continue;
193                }
194 
195                dst = pipe_aligned_buffer_create(screen, 0, dst_usage, dst_offset + size, 256);
196                src = is_copy ? pipe_aligned_buffer_create(screen, 0, src_usage, src_offset + size, 256) : NULL;
197 
198                struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0);
199                bool success = true;
200 
201                /* Run tests. */
202                for (unsigned iter = 0; iter < WARMUP_RUNS + NUM_RUNS; iter++) {
203                   const uint32_t clear_value[4] = {0x12345678, 0x23456789, 0x34567890, 0x45678901};
204 
205                   if (iter == WARMUP_RUNS)
206                      ctx->begin_query(ctx, q);
207 
208                   if (method == METHOD_DEFAULT) {
209                      if (is_copy) {
210                         si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
211                         si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
212                         si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
213                      } else {
214                         sctx->b.clear_buffer(&sctx->b, dst, dst_offset, size, &clear_value,
215                                              clear_value_size);
216                      }
217                   } else if (method == METHOD_CP_DMA) {
218                      /* CP DMA */
219                      if (sscreen->info.cp_sdma_ge_use_system_memory_scope) {
220                         /* The CP DMA code doesn't implement this case. */
221                         success = false;
222                         continue;
223                      }
224 
225                      if (is_copy) {
226                         /* CP DMA copies are about as slow as PCIe on GFX6-8. */
227                         if (sctx->gfx_level <= GFX8 && size > 16 * 1024 * 1024) {
228                            success = false;
229                            continue;
230                         }
231 
232                         si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
233                         si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
234                         si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
235                      } else {
236                         /* CP DMA clears must be aligned to 4 bytes. */
237                         if (dst_offset % 4 || size % 4 ||
238                             /* CP DMA clears are so slow on GFX6-8 that we risk getting a GPU timeout. */
239                             (sctx->gfx_level <= GFX8 && size > 512 * 1024)) {
240                            success = false;
241                            continue;
242                         }
243 
244                         assert(clear_value_size == 4);
245                         si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
246                         si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, dst_offset, size,
247                                                clear_value[0]);
248                         si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
249                      }
250                   } else {
251                      /* Compute */
252                      si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
253                      success &=
254                         si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset,
255                                                      size, clear_value, clear_value_size,
256                                                      dwords_per_thread, false, false);
257                      si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
258                   }
259 
260                   sctx->barrier_flags |= SI_BARRIER_INV_L2;
261                }
262 
263                ctx->end_query(ctx, q);
264 
265                pipe_resource_reference(&dst, NULL);
266                pipe_resource_reference(&src, NULL);
267 
268                /* Get results. */
269                union pipe_query_result result;
270 
271                ctx->get_query_result(ctx, q, true, &result);
272                ctx->destroy_query(ctx, q);
273 
274                /* Navi10 and Vega10 sometimes incorrectly return elapsed time of 0 nanoseconds
275                 * for very small ops.
276                 */
277                if (success && result.u64) {
278                   double GB = 1024.0 * 1024.0 * 1024.0;
279                   double seconds = result.u64 / (double)NUM_RUNS / (1000.0 * 1000.0 * 1000.0);
280                   double GBps = (size / GB) / seconds * (test_flavor == TEST_COPY_VRAM_VRAM ? 2 : 1);
281                   printf("%8.2f,", GBps);
282                } else {
283                   printf("%8s,", "n/a");
284                }
285             }
286             puts("");
287          }
288       }
289    }
290 
291    ctx->destroy(ctx);
292    exit(0);
293 }
294 
295 void
si_test_mem_perf(struct si_screen * sscreen)296 si_test_mem_perf(struct si_screen *sscreen)
297 {
298    struct radeon_winsys *ws = sscreen->ws;
299    const size_t buffer_size = 16 * 1024 * 1024;
300    const enum radeon_bo_domain domains[] = { 0, RADEON_DOMAIN_VRAM, RADEON_DOMAIN_GTT };
301    const uint64_t flags[] = { 0, RADEON_FLAG_GTT_WC };
302    const int n_loops = 2;
303    char *title[] = { "Write To", "Read From", "Stream From" };
304    char *domain_str[] = { "RAM", "VRAM", "GTT" };
305 
306    for (int i = 0; i < 3; i++) {
307       printf("| %12s", title[i]);
308 
309       printf(" | Size (kB) | Flags |");
310       for (int l = 0; l < n_loops; l++)
311           printf(" Run %d (MB/s) |", l + 1);
312       printf("\n");
313 
314       printf("|--------------|-----------|-------|");
315       for (int l = 0; l < n_loops; l++)
316           printf("--------------|");
317       printf("\n");
318       for (int j = 0; j < ARRAY_SIZE(domains); j++) {
319          enum radeon_bo_domain domain = domains[j];
320          for (int k = 0; k < ARRAY_SIZE(flags); k++) {
321             if (k && domain != RADEON_DOMAIN_GTT)
322                continue;
323 
324             struct pb_buffer_lean *bo = NULL;
325             void *ptr = NULL;
326 
327             if (domains[j]) {
328                bo = ws->buffer_create(ws, buffer_size, 4096, domains[j],
329                                       RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_NO_SUBALLOC |
330                                       flags[k]);
331                if (!bo)
332                   continue;
333 
334                ptr = ws->buffer_map(ws, bo, NULL, RADEON_MAP_TEMPORARY | (i ? PIPE_MAP_READ : PIPE_MAP_WRITE));
335                if (!ptr) {
336                   radeon_bo_reference(ws, &bo, NULL);
337                   continue;
338                }
339             } else {
340                ptr = malloc(buffer_size);
341             }
342 
343             printf("| %12s |", domain_str[j]);
344 
345             printf("%10zu |", buffer_size / 1024);
346 
347             printf(" %5s |", domain == RADEON_DOMAIN_VRAM ? "(WC)" : (k == 0 ? "" : "WC "));
348 
349             int *cpu = calloc(1, buffer_size);
350             memset(cpu, 'c', buffer_size);
351             fflush(stdout);
352 
353             int64_t before, after;
354 
355             for (int loop = 0; loop < n_loops; loop++) {
356                before = os_time_get_nano();
357 
358                switch (i) {
359                case 0:
360                   memcpy(ptr, cpu, buffer_size);
361                   break;
362                case 1:
363                   memcpy(cpu, ptr, buffer_size);
364                   break;
365                case 2:
366                default:
367                   util_streaming_load_memcpy(cpu, ptr, buffer_size);
368                   break;
369                }
370 
371                after = os_time_get_nano();
372 
373                /* Pretend to do something with the result to make sure it's
374                 * not skipped.
375                 */
376                if (debug_get_num_option("AMD_DEBUG", 0) == 0x123)
377                    assert(memcmp(ptr, cpu, buffer_size));
378 
379                float dt = (after - before) / (1000000000.0);
380                float bandwidth = (buffer_size / (1024 * 1024)) / dt;
381 
382                printf("%13.3f |", bandwidth);
383             }
384             printf("\n");
385 
386             free(cpu);
387             if (bo) {
388                ws->buffer_unmap(ws, bo);
389                radeon_bo_reference(ws, &bo, NULL);
390             } else {
391                free(ptr);
392             }
393          }
394       }
395       printf("\n");
396    }
397 
398 
399    exit(0);
400 }
401 
/* ANSI terminal escape sequences used to colorize the buffer dumps of the
 * clear/copy tests.
 */
#define COLOR_RESET  "\033[0m"
#define COLOR_RED    "\033[1;31m"
#define COLOR_YELLOW "\033[1;33m"
#define COLOR_CYAN   "\033[1;36m"
406 
si_test_clear_buffer(struct si_screen * sscreen)407 void si_test_clear_buffer(struct si_screen *sscreen)
408 {
409    struct pipe_screen *screen = &sscreen->b;
410    struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
411    struct si_context *sctx = (struct si_context *)ctx;
412    unsigned buf_size = 32;
413    unsigned num_tests = 0, num_passes = 0;
414 
415    srand(0x9b47d95b);
416 
417    printf("dst, si,dw, %-*s, %-*s, %-*s, %-*s\n",
418           32, "clear value",
419           buf_size * 2, "init dst",
420           buf_size * 2, "expected dst",
421           buf_size * 2, "observed dst");
422    printf("off, ze,th\n");
423 
424    /* Generate an infinite number of random tests. */
425    while (1) {
426       struct pipe_resource *dst;
427 
428       dst = pipe_aligned_buffer_create(screen, 0, PIPE_USAGE_STAGING, buf_size, 256);
429 
430       unsigned clear_value_size = 1 << (rand() % 6);
431       if (clear_value_size == 32)
432          clear_value_size = 12; /* test only 1, 2, 4, 8, 16, and 12 */
433 
434       uint8_t *clear_value = (uint8_t *)malloc(buf_size);
435       uint8_t *init_dst_buffer = (uint8_t *)malloc(buf_size);
436       uint8_t *expected_dst_buffer = (uint8_t *)malloc(buf_size);
437       uint8_t *read_dst_buffer = (uint8_t *)malloc(buf_size);
438 
439       for (unsigned i = 0; i < buf_size; i++) {
440          clear_value[i] = rand();
441          init_dst_buffer[i] = rand();
442          expected_dst_buffer[i] = rand();
443       }
444 
445       pipe_buffer_write(ctx, dst, 0, buf_size, init_dst_buffer);
446 
447       unsigned op_size = (((rand() % buf_size) + 1) / clear_value_size) * clear_value_size;
448       if (!op_size)
449          op_size = clear_value_size;
450 
451       unsigned dst_offset = rand() % (buf_size - op_size + 1);
452       if (clear_value_size == 12)
453          dst_offset &= ~0x3;
454 
455       unsigned dwords_per_thread = 1 << (rand() % 3);
456       dwords_per_thread = MAX2(dwords_per_thread, DIV_ROUND_UP(clear_value_size, 4));
457 
458       memcpy(expected_dst_buffer, init_dst_buffer, buf_size);
459       for (unsigned i = 0; i < op_size; i++)
460          expected_dst_buffer[dst_offset + i] = clear_value[i % clear_value_size];
461 
462       printf(" %2u, %2u, %u, ", dst_offset, op_size, dwords_per_thread);
463 
464       /* Visualize the clear. */
465       for (unsigned i = 0; i < clear_value_size; i++)
466          printf("%02x", clear_value[i]);
467       for (unsigned i = clear_value_size; i < 16; i++)
468          printf("  ");
469 
470       printf("%s, %s", COLOR_RESET, COLOR_CYAN);
471       for (unsigned i = 0; i < buf_size; i++) {
472          printf("%s%02x",
473                 i < dst_offset || i >= dst_offset + op_size ? COLOR_CYAN : COLOR_RESET,
474                 init_dst_buffer[i]);
475       }
476       printf("%s, ", COLOR_RESET);
477       for (unsigned i = 0; i < buf_size; i++) {
478          printf("%s%02x",
479                 i >= dst_offset && i < dst_offset + op_size ? COLOR_YELLOW : COLOR_CYAN,
480                 expected_dst_buffer[i]);
481       }
482       printf("%s, ", COLOR_RESET);
483       fflush(stdout);
484 
485       si_barrier_before_simple_buffer_op(sctx, 0, dst, NULL);
486       bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, NULL, 0, op_size,
487                                                (uint32_t*)clear_value, clear_value_size,
488                                                dwords_per_thread, false, false);
489       si_barrier_after_simple_buffer_op(sctx, 0, dst, NULL);
490 
491       if (done) {
492          pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer);
493          bool success = !memcmp(read_dst_buffer, expected_dst_buffer, buf_size);
494 
495          num_tests++;
496          if (success)
497             num_passes++;
498 
499          for (unsigned i = 0; i < buf_size; i++) {
500             printf("%s%02x",
501                    read_dst_buffer[i] != expected_dst_buffer[i] ? COLOR_RED :
502                    i >= dst_offset && i < dst_offset + op_size ? COLOR_YELLOW : COLOR_CYAN,
503                    read_dst_buffer[i]);
504          }
505 
506          printf("%s, %s [%u/%u]\n", COLOR_RESET, success ? "pass" : "fail", num_passes, num_tests);
507       } else {
508          printf("%*s, skip [%u/%u]\n", buf_size * 2, "", num_passes, num_tests);
509       }
510 
511       free(clear_value);
512       free(init_dst_buffer);
513       free(expected_dst_buffer);
514       free(read_dst_buffer);
515       pipe_resource_reference(&dst, NULL);
516    }
517 
518    ctx->destroy(ctx);
519    exit(0);
520 }
521 
si_test_copy_buffer(struct si_screen * sscreen)522 void si_test_copy_buffer(struct si_screen *sscreen)
523 {
524    struct pipe_screen *screen = &sscreen->b;
525    struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
526    struct si_context *sctx = (struct si_context *)ctx;
527    unsigned buf_size = 32;
528    unsigned num_tests = 0, num_passes = 0;
529 
530    srand(0x9b47d95b);
531 
532    printf("src,dst, si,dw, %-*s, %-*s, %-*s, %-*s\n",
533           MIN2(buf_size, 32) * 2, "init src",
534           MIN2(buf_size, 32) * 2, "init dst",
535           MIN2(buf_size, 32) * 2, "expected dst",
536           MIN2(buf_size, 32) * 2, "observed dst");
537    printf("off,off, ze,th\n");
538 
539    /* Generate an infinite number of random tests. */
540    while (1) {
541       struct pipe_resource *dst, *src;
542 
543       dst = pipe_aligned_buffer_create(screen, 0, PIPE_USAGE_STAGING, buf_size, 256);
544       src = pipe_aligned_buffer_create(screen, 0, PIPE_USAGE_STAGING, buf_size, 256);
545 
546       uint8_t *init_src_buffer = (uint8_t *)malloc(buf_size);
547       uint8_t *init_dst_buffer = (uint8_t *)malloc(buf_size);
548       uint8_t *expected_dst_buffer = (uint8_t *)malloc(buf_size);
549       uint8_t *read_dst_buffer = (uint8_t *)malloc(buf_size);
550 
551       for (unsigned i = 0; i < buf_size; i++) {
552          init_src_buffer[i] = rand();
553          init_dst_buffer[i] = rand();
554       }
555 
556       pipe_buffer_write(ctx, src, 0, buf_size, init_src_buffer);
557       pipe_buffer_write(ctx, dst, 0, buf_size, init_dst_buffer);
558 
559       unsigned dst_offset = rand() % buf_size;
560       unsigned op_size = (rand() % (buf_size - dst_offset)) + 1;
561       unsigned src_offset = rand() % (buf_size - op_size + 1);
562       unsigned dwords_per_thread = 1 << (rand() % 3);
563 
564       memcpy(expected_dst_buffer, init_dst_buffer, buf_size);
565       memcpy(expected_dst_buffer + dst_offset, init_src_buffer + src_offset, op_size);
566 
567       printf(" %2u, %2u, %2u, %u, ", src_offset, dst_offset, op_size, dwords_per_thread);
568 
569       if (buf_size <= 32) {
570          /* Visualize the copy. */
571          for (unsigned i = 0; i < buf_size; i++) {
572             printf("%s%02x",
573                    i >= src_offset && i < src_offset + op_size ? COLOR_YELLOW : COLOR_RESET,
574                    init_src_buffer[i]);
575          }
576          printf("%s, %s", COLOR_RESET, COLOR_CYAN);
577          for (unsigned i = 0; i < buf_size; i++) {
578             printf("%s%02x",
579                    i < dst_offset || i >= dst_offset + op_size ? COLOR_CYAN : COLOR_RESET,
580                    init_dst_buffer[i]);
581          }
582          printf("%s, ", COLOR_RESET);
583          for (unsigned i = 0; i < buf_size; i++) {
584             printf("%s%02x",
585                    i >= dst_offset && i < dst_offset + op_size ? COLOR_YELLOW : COLOR_CYAN,
586                    expected_dst_buffer[i]);
587          }
588          printf("%s, ", COLOR_RESET);
589       }
590       fflush(stdout);
591 
592       si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
593       bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, op_size,
594                                                NULL, 0, dwords_per_thread, false, false);
595       si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
596 
597       if (done) {
598          pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer);
599          bool success = !memcmp(read_dst_buffer, expected_dst_buffer, buf_size);
600 
601          num_tests++;
602          if (success)
603             num_passes++;
604 
605          if (buf_size <= 32) {
606             for (unsigned i = 0; i < buf_size; i++) {
607                printf("%s%02x",
608                       read_dst_buffer[i] != expected_dst_buffer[i] ? COLOR_RED :
609                       i >= dst_offset && i < dst_offset + op_size ? COLOR_YELLOW : COLOR_CYAN,
610                       read_dst_buffer[i]);
611             }
612             printf("%s, ", COLOR_RESET);
613          }
614 
615          printf("%s [%u/%u]\n", success ? "pass" : "fail", num_passes, num_tests);
616       } else {
617          printf("%*s, skip [%u/%u]\n", buf_size * 2, "", num_passes, num_tests);
618       }
619 
620       free(init_src_buffer);
621       free(init_dst_buffer);
622       free(expected_dst_buffer);
623       free(read_dst_buffer);
624       pipe_resource_reference(&dst, NULL);
625       pipe_resource_reference(&src, NULL);
626    }
627 
628    ctx->destroy(ctx);
629    exit(0);
630 }
631