1 /*
2 * Copyright 2024 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "si_pipe.h"
8 #include "si_query.h"
9 #include "util/streaming-load-memcpy.h"
10
/* Sweep parameters for si_test_dma_perf. */

/* Iterations per data point: WARMUP_RUNS untimed iterations are followed by NUM_RUNS iterations
 * that are covered by the elapsed-time query.
 */
#define WARMUP_RUNS 16
#define NUM_RUNS 32

/* Operation sizes in bytes: start at MIN_SIZE and shift left by SIZE_SHIFT until MAX_SIZE. */
#define SIZE_SHIFT 1
#define MIN_SIZE 512
#define MAX_SIZE (128 * 1024 * 1024)
16
/* Benchmark scenarios. The fill scenarios must stay in front of the copy scenarios because
 * si_test_dma_perf treats every value >= TEST_COPY_VRAM_VRAM as a copy.
 */
enum {
   TEST_FILL_VRAM,
   TEST_FILL_VRAM_12B,
   TEST_FILL_GTT,
   TEST_FILL_GTT_12B,
   TEST_COPY_VRAM_VRAM,
   TEST_COPY_VRAM_GTT,
   TEST_COPY_GTT_VRAM,
   NUM_TESTS,
};

/* Row labels, indexed by the scenario enum above.
 *
 * The array is sized by NUM_TESTS so that a scenario added without a label yields a NULL entry
 * instead of making the table shorter than the enum.
 */
static const char *const test_strings[NUM_TESTS] = {
   [TEST_FILL_VRAM] = "fill->VRAM",
   [TEST_FILL_VRAM_12B] = "fill->VRAM 12B",
   [TEST_FILL_GTT] = "fill->GTT",
   [TEST_FILL_GTT_12B] = "fill->GTT 12B",
   [TEST_COPY_VRAM_VRAM] = "VRAM->VRAM",
   [TEST_COPY_VRAM_GTT] = "VRAM->GTT",
   [TEST_COPY_GTT_VRAM] = "GTT->VRAM",
};
37
/* Ways of executing a fill or a copy that si_test_dma_perf compares. */
enum {
   METHOD_DEFAULT,
   METHOD_CP_DMA,
   METHOD_COMPUTE_2DW,
   METHOD_COMPUTE_3DW,
   METHOD_COMPUTE_4DW,
   NUM_METHODS,
};

/* Row labels, indexed by the method enum above.
 *
 * The array is sized by NUM_METHODS so that a method added without a label yields a NULL entry
 * instead of making the table shorter than the enum.
 */
static const char *const method_strings[NUM_METHODS] = {
   [METHOD_DEFAULT] = "Default",
   [METHOD_CP_DMA] = "CP DMA",
   [METHOD_COMPUTE_2DW] = "CS 2dw",
   [METHOD_COMPUTE_3DW] = "CS 3dw",
   [METHOD_COMPUTE_4DW] = "CS 4dw",
};
54
/* Source/destination alignment combinations exercised by si_test_dma_perf. */
enum {
   ALIGN_MAX,
   ALIGN_256,
   ALIGN_128,
   ALIGN_64,
   ALIGN_4,
   ALIGN_2,
   ALIGN_1,
   ALIGN_SRC128,
   ALIGN_SRC64,
   ALIGN_SRC4,
   ALIGN_SRC2,
   ALIGN_SRC1,
   ALIGN_DST128,
   ALIGN_DST64,
   ALIGN_DST4,
   ALIGN_DST2,
   ALIGN_DST1,
   ALIGN_SRC4_DST2,
   ALIGN_SRC4_DST1,
   ALIGN_SRC2_DST4,
   ALIGN_SRC2_DST1,
   ALIGN_SRC1_DST4,
   ALIGN_SRC1_DST2,
   NUM_ALIGNMENTS,
};

/* One alignment combination: its row label and the byte offsets added to the start of the
 * source and destination buffers (0 leaves the buffer at its allocation alignment).
 */
struct align_info_t {
   const char *string;
   unsigned src_offset;
   unsigned dst_offset;
};

/* Alignment table, indexed by the alignment enum above.
 *
 * The array is sized by NUM_ALIGNMENTS so that an alignment added without an entry yields a
 * zeroed element instead of making the table shorter than the enum.
 */
static const struct align_info_t align_info[NUM_ALIGNMENTS] = {
   [ALIGN_MAX] = {"both=max", 0, 0},
   [ALIGN_256] = {"both=256", 256, 256},
   [ALIGN_128] = {"both=128", 128, 128},
   [ALIGN_64] = {"both=64", 64, 64},
   [ALIGN_4] = {"both=4", 4, 4},
   [ALIGN_2] = {"both=2", 2, 2},
   [ALIGN_1] = {"both=1", 1, 1},
   [ALIGN_SRC128] = {"src=128", 128, 0},
   [ALIGN_SRC64] = {"src=64", 64, 0},
   [ALIGN_SRC4] = {"src=4", 4, 0},
   [ALIGN_SRC2] = {"src=2", 2, 0},
   [ALIGN_SRC1] = {"src=1", 1, 0},
   [ALIGN_DST128] = {"dst=128", 0, 128},
   [ALIGN_DST64] = {"dst=64", 0, 64},
   [ALIGN_DST4] = {"dst=4", 0, 4},
   [ALIGN_DST2] = {"dst=2", 0, 2},
   [ALIGN_DST1] = {"dst=1", 0, 1},
   [ALIGN_SRC4_DST2] = {"src=4 dst=2", 4, 2},
   [ALIGN_SRC4_DST1] = {"src=4 dst=1", 4, 1},
   [ALIGN_SRC2_DST4] = {"src=2 dst=4", 2, 4},
   [ALIGN_SRC2_DST1] = {"src=2 dst=1", 2, 1},
   [ALIGN_SRC1_DST4] = {"src=1 dst=4", 1, 4},
   [ALIGN_SRC1_DST2] = {"src=1 dst=2", 1, 2},
};
113
si_test_dma_perf(struct si_screen * sscreen)114 void si_test_dma_perf(struct si_screen *sscreen)
115 {
116 struct pipe_screen *screen = &sscreen->b;
117 struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
118 struct si_context *sctx = (struct si_context *)ctx;
119
120 sscreen->ws->cs_set_pstate(&sctx->gfx_cs, RADEON_CTX_PSTATE_PEAK);
121
122 printf("Test , Method , Alignment ,");
123 for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
124 if (size >= 1024 * 1024)
125 printf("%6uMB,", size / (1024 * 1024));
126 else if (size >= 1024)
127 printf("%6uKB,", size / 1024);
128 else
129 printf(" %6uB,", size);
130 }
131 printf("\n");
132
133 /* Run benchmarks. */
134 for (unsigned test_flavor = 0; test_flavor < NUM_TESTS; test_flavor++) {
135 bool is_copy = test_flavor >= TEST_COPY_VRAM_VRAM;
136
137 if (test_flavor)
138 puts("");
139
140 for (unsigned method = 0; method < NUM_METHODS; method++) {
141 for (unsigned align = 0; align < NUM_ALIGNMENTS; align++) {
142 unsigned dwords_per_thread, clear_value_size;
143 unsigned src_offset = align_info[align].src_offset;
144 unsigned dst_offset = align_info[align].dst_offset;
145
146 /* offset > 0 && offset < 4 is the only case when the compute shader performs the same
147 * as offset=0 without any alignment optimizations, so shift the offset by 4 to get
148 * unaligned performance.
149 */
150 if (src_offset && src_offset < 4)
151 src_offset += 4;
152 if (dst_offset && dst_offset < 4)
153 dst_offset += 4;
154
155 if (!is_copy && dst_offset != src_offset)
156 continue;
157
158 if (test_flavor == TEST_FILL_VRAM_12B || test_flavor == TEST_FILL_GTT_12B) {
159 if ((method != METHOD_DEFAULT && method != METHOD_COMPUTE_3DW &&
160 method != METHOD_COMPUTE_4DW) || dst_offset % 4)
161 continue;
162
163 dwords_per_thread = method == METHOD_COMPUTE_3DW ? 3 : 4;
164 clear_value_size = 12;
165 } else {
166 if (method == METHOD_COMPUTE_3DW)
167 continue;
168
169 dwords_per_thread = method == METHOD_COMPUTE_2DW ? 2 : 4;
170 clear_value_size = dst_offset % 4 ? 1 : 4;
171 }
172
173 printf("%-14s, %-7s, %-11s,", test_strings[test_flavor], method_strings[method],
174 align_info[align].string);
175
176 for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
177 struct pipe_resource *dst, *src;
178 enum pipe_resource_usage dst_usage = PIPE_USAGE_DEFAULT;
179 enum pipe_resource_usage src_usage = PIPE_USAGE_DEFAULT;
180
181 if (test_flavor == TEST_FILL_GTT || test_flavor == TEST_FILL_GTT_12B ||
182 test_flavor == TEST_COPY_VRAM_GTT)
183 dst_usage = PIPE_USAGE_STREAM;
184
185 if (test_flavor == TEST_COPY_GTT_VRAM)
186 src_usage = PIPE_USAGE_STREAM;
187
188 /* Don't test large sizes with GTT because it's slow. */
189 if ((dst_usage == PIPE_USAGE_STREAM || src_usage == PIPE_USAGE_STREAM) &&
190 size > 16 * 1024 * 1024) {
191 printf("%8s,", "n/a");
192 continue;
193 }
194
195 dst = pipe_aligned_buffer_create(screen, 0, dst_usage, dst_offset + size, 256);
196 src = is_copy ? pipe_aligned_buffer_create(screen, 0, src_usage, src_offset + size, 256) : NULL;
197
198 struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0);
199 bool success = true;
200
201 /* Run tests. */
202 for (unsigned iter = 0; iter < WARMUP_RUNS + NUM_RUNS; iter++) {
203 const uint32_t clear_value[4] = {0x12345678, 0x23456789, 0x34567890, 0x45678901};
204
205 if (iter == WARMUP_RUNS)
206 ctx->begin_query(ctx, q);
207
208 if (method == METHOD_DEFAULT) {
209 if (is_copy) {
210 si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
211 si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
212 si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
213 } else {
214 sctx->b.clear_buffer(&sctx->b, dst, dst_offset, size, &clear_value,
215 clear_value_size);
216 }
217 } else if (method == METHOD_CP_DMA) {
218 /* CP DMA */
219 if (sscreen->info.cp_sdma_ge_use_system_memory_scope) {
220 /* The CP DMA code doesn't implement this case. */
221 success = false;
222 continue;
223 }
224
225 if (is_copy) {
226 /* CP DMA copies are about as slow as PCIe on GFX6-8. */
227 if (sctx->gfx_level <= GFX8 && size > 16 * 1024 * 1024) {
228 success = false;
229 continue;
230 }
231
232 si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
233 si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
234 si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
235 } else {
236 /* CP DMA clears must be aligned to 4 bytes. */
237 if (dst_offset % 4 || size % 4 ||
238 /* CP DMA clears are so slow on GFX6-8 that we risk getting a GPU timeout. */
239 (sctx->gfx_level <= GFX8 && size > 512 * 1024)) {
240 success = false;
241 continue;
242 }
243
244 assert(clear_value_size == 4);
245 si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
246 si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, dst_offset, size,
247 clear_value[0]);
248 si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
249 }
250 } else {
251 /* Compute */
252 si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
253 success &=
254 si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset,
255 size, clear_value, clear_value_size,
256 dwords_per_thread, false, false);
257 si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
258 }
259
260 sctx->barrier_flags |= SI_BARRIER_INV_L2;
261 }
262
263 ctx->end_query(ctx, q);
264
265 pipe_resource_reference(&dst, NULL);
266 pipe_resource_reference(&src, NULL);
267
268 /* Get results. */
269 union pipe_query_result result;
270
271 ctx->get_query_result(ctx, q, true, &result);
272 ctx->destroy_query(ctx, q);
273
274 /* Navi10 and Vega10 sometimes incorrectly return elapsed time of 0 nanoseconds
275 * for very small ops.
276 */
277 if (success && result.u64) {
278 double GB = 1024.0 * 1024.0 * 1024.0;
279 double seconds = result.u64 / (double)NUM_RUNS / (1000.0 * 1000.0 * 1000.0);
280 double GBps = (size / GB) / seconds * (test_flavor == TEST_COPY_VRAM_VRAM ? 2 : 1);
281 printf("%8.2f,", GBps);
282 } else {
283 printf("%8s,", "n/a");
284 }
285 }
286 puts("");
287 }
288 }
289 }
290
291 ctx->destroy(ctx);
292 exit(0);
293 }
294
295 void
si_test_mem_perf(struct si_screen * sscreen)296 si_test_mem_perf(struct si_screen *sscreen)
297 {
298 struct radeon_winsys *ws = sscreen->ws;
299 const size_t buffer_size = 16 * 1024 * 1024;
300 const enum radeon_bo_domain domains[] = { 0, RADEON_DOMAIN_VRAM, RADEON_DOMAIN_GTT };
301 const uint64_t flags[] = { 0, RADEON_FLAG_GTT_WC };
302 const int n_loops = 2;
303 char *title[] = { "Write To", "Read From", "Stream From" };
304 char *domain_str[] = { "RAM", "VRAM", "GTT" };
305
306 for (int i = 0; i < 3; i++) {
307 printf("| %12s", title[i]);
308
309 printf(" | Size (kB) | Flags |");
310 for (int l = 0; l < n_loops; l++)
311 printf(" Run %d (MB/s) |", l + 1);
312 printf("\n");
313
314 printf("|--------------|-----------|-------|");
315 for (int l = 0; l < n_loops; l++)
316 printf("--------------|");
317 printf("\n");
318 for (int j = 0; j < ARRAY_SIZE(domains); j++) {
319 enum radeon_bo_domain domain = domains[j];
320 for (int k = 0; k < ARRAY_SIZE(flags); k++) {
321 if (k && domain != RADEON_DOMAIN_GTT)
322 continue;
323
324 struct pb_buffer_lean *bo = NULL;
325 void *ptr = NULL;
326
327 if (domains[j]) {
328 bo = ws->buffer_create(ws, buffer_size, 4096, domains[j],
329 RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_NO_SUBALLOC |
330 flags[k]);
331 if (!bo)
332 continue;
333
334 ptr = ws->buffer_map(ws, bo, NULL, RADEON_MAP_TEMPORARY | (i ? PIPE_MAP_READ : PIPE_MAP_WRITE));
335 if (!ptr) {
336 radeon_bo_reference(ws, &bo, NULL);
337 continue;
338 }
339 } else {
340 ptr = malloc(buffer_size);
341 }
342
343 printf("| %12s |", domain_str[j]);
344
345 printf("%10zu |", buffer_size / 1024);
346
347 printf(" %5s |", domain == RADEON_DOMAIN_VRAM ? "(WC)" : (k == 0 ? "" : "WC "));
348
349 int *cpu = calloc(1, buffer_size);
350 memset(cpu, 'c', buffer_size);
351 fflush(stdout);
352
353 int64_t before, after;
354
355 for (int loop = 0; loop < n_loops; loop++) {
356 before = os_time_get_nano();
357
358 switch (i) {
359 case 0:
360 memcpy(ptr, cpu, buffer_size);
361 break;
362 case 1:
363 memcpy(cpu, ptr, buffer_size);
364 break;
365 case 2:
366 default:
367 util_streaming_load_memcpy(cpu, ptr, buffer_size);
368 break;
369 }
370
371 after = os_time_get_nano();
372
373 /* Pretend to do something with the result to make sure it's
374 * not skipped.
375 */
376 if (debug_get_num_option("AMD_DEBUG", 0) == 0x123)
377 assert(memcmp(ptr, cpu, buffer_size));
378
379 float dt = (after - before) / (1000000000.0);
380 float bandwidth = (buffer_size / (1024 * 1024)) / dt;
381
382 printf("%13.3f |", bandwidth);
383 }
384 printf("\n");
385
386 free(cpu);
387 if (bo) {
388 ws->buffer_unmap(ws, bo);
389 radeon_bo_reference(ws, &bo, NULL);
390 } else {
391 free(ptr);
392 }
393 }
394 }
395 printf("\n");
396 }
397
398
399 exit(0);
400 }
401
/* Terminal escape sequences used by the clear/copy tests to highlight buffer bytes:
 * yellow and cyan mark bytes inside and outside the tested range, red marks mismatches.
 */
#define COLOR_RED "\033[1;31m"
#define COLOR_YELLOW "\033[1;33m"
#define COLOR_CYAN "\033[1;36m"
/* Restores the terminal's default attributes. */
#define COLOR_RESET "\033[0m"
406
si_test_clear_buffer(struct si_screen * sscreen)407 void si_test_clear_buffer(struct si_screen *sscreen)
408 {
409 struct pipe_screen *screen = &sscreen->b;
410 struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
411 struct si_context *sctx = (struct si_context *)ctx;
412 unsigned buf_size = 32;
413 unsigned num_tests = 0, num_passes = 0;
414
415 srand(0x9b47d95b);
416
417 printf("dst, si,dw, %-*s, %-*s, %-*s, %-*s\n",
418 32, "clear value",
419 buf_size * 2, "init dst",
420 buf_size * 2, "expected dst",
421 buf_size * 2, "observed dst");
422 printf("off, ze,th\n");
423
424 /* Generate an infinite number of random tests. */
425 while (1) {
426 struct pipe_resource *dst;
427
428 dst = pipe_aligned_buffer_create(screen, 0, PIPE_USAGE_STAGING, buf_size, 256);
429
430 unsigned clear_value_size = 1 << (rand() % 6);
431 if (clear_value_size == 32)
432 clear_value_size = 12; /* test only 1, 2, 4, 8, 16, and 12 */
433
434 uint8_t *clear_value = (uint8_t *)malloc(buf_size);
435 uint8_t *init_dst_buffer = (uint8_t *)malloc(buf_size);
436 uint8_t *expected_dst_buffer = (uint8_t *)malloc(buf_size);
437 uint8_t *read_dst_buffer = (uint8_t *)malloc(buf_size);
438
439 for (unsigned i = 0; i < buf_size; i++) {
440 clear_value[i] = rand();
441 init_dst_buffer[i] = rand();
442 expected_dst_buffer[i] = rand();
443 }
444
445 pipe_buffer_write(ctx, dst, 0, buf_size, init_dst_buffer);
446
447 unsigned op_size = (((rand() % buf_size) + 1) / clear_value_size) * clear_value_size;
448 if (!op_size)
449 op_size = clear_value_size;
450
451 unsigned dst_offset = rand() % (buf_size - op_size + 1);
452 if (clear_value_size == 12)
453 dst_offset &= ~0x3;
454
455 unsigned dwords_per_thread = 1 << (rand() % 3);
456 dwords_per_thread = MAX2(dwords_per_thread, DIV_ROUND_UP(clear_value_size, 4));
457
458 memcpy(expected_dst_buffer, init_dst_buffer, buf_size);
459 for (unsigned i = 0; i < op_size; i++)
460 expected_dst_buffer[dst_offset + i] = clear_value[i % clear_value_size];
461
462 printf(" %2u, %2u, %u, ", dst_offset, op_size, dwords_per_thread);
463
464 /* Visualize the clear. */
465 for (unsigned i = 0; i < clear_value_size; i++)
466 printf("%02x", clear_value[i]);
467 for (unsigned i = clear_value_size; i < 16; i++)
468 printf(" ");
469
470 printf("%s, %s", COLOR_RESET, COLOR_CYAN);
471 for (unsigned i = 0; i < buf_size; i++) {
472 printf("%s%02x",
473 i < dst_offset || i >= dst_offset + op_size ? COLOR_CYAN : COLOR_RESET,
474 init_dst_buffer[i]);
475 }
476 printf("%s, ", COLOR_RESET);
477 for (unsigned i = 0; i < buf_size; i++) {
478 printf("%s%02x",
479 i >= dst_offset && i < dst_offset + op_size ? COLOR_YELLOW : COLOR_CYAN,
480 expected_dst_buffer[i]);
481 }
482 printf("%s, ", COLOR_RESET);
483 fflush(stdout);
484
485 si_barrier_before_simple_buffer_op(sctx, 0, dst, NULL);
486 bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, NULL, 0, op_size,
487 (uint32_t*)clear_value, clear_value_size,
488 dwords_per_thread, false, false);
489 si_barrier_after_simple_buffer_op(sctx, 0, dst, NULL);
490
491 if (done) {
492 pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer);
493 bool success = !memcmp(read_dst_buffer, expected_dst_buffer, buf_size);
494
495 num_tests++;
496 if (success)
497 num_passes++;
498
499 for (unsigned i = 0; i < buf_size; i++) {
500 printf("%s%02x",
501 read_dst_buffer[i] != expected_dst_buffer[i] ? COLOR_RED :
502 i >= dst_offset && i < dst_offset + op_size ? COLOR_YELLOW : COLOR_CYAN,
503 read_dst_buffer[i]);
504 }
505
506 printf("%s, %s [%u/%u]\n", COLOR_RESET, success ? "pass" : "fail", num_passes, num_tests);
507 } else {
508 printf("%*s, skip [%u/%u]\n", buf_size * 2, "", num_passes, num_tests);
509 }
510
511 free(clear_value);
512 free(init_dst_buffer);
513 free(expected_dst_buffer);
514 free(read_dst_buffer);
515 pipe_resource_reference(&dst, NULL);
516 }
517
518 ctx->destroy(ctx);
519 exit(0);
520 }
521
si_test_copy_buffer(struct si_screen * sscreen)522 void si_test_copy_buffer(struct si_screen *sscreen)
523 {
524 struct pipe_screen *screen = &sscreen->b;
525 struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
526 struct si_context *sctx = (struct si_context *)ctx;
527 unsigned buf_size = 32;
528 unsigned num_tests = 0, num_passes = 0;
529
530 srand(0x9b47d95b);
531
532 printf("src,dst, si,dw, %-*s, %-*s, %-*s, %-*s\n",
533 MIN2(buf_size, 32) * 2, "init src",
534 MIN2(buf_size, 32) * 2, "init dst",
535 MIN2(buf_size, 32) * 2, "expected dst",
536 MIN2(buf_size, 32) * 2, "observed dst");
537 printf("off,off, ze,th\n");
538
539 /* Generate an infinite number of random tests. */
540 while (1) {
541 struct pipe_resource *dst, *src;
542
543 dst = pipe_aligned_buffer_create(screen, 0, PIPE_USAGE_STAGING, buf_size, 256);
544 src = pipe_aligned_buffer_create(screen, 0, PIPE_USAGE_STAGING, buf_size, 256);
545
546 uint8_t *init_src_buffer = (uint8_t *)malloc(buf_size);
547 uint8_t *init_dst_buffer = (uint8_t *)malloc(buf_size);
548 uint8_t *expected_dst_buffer = (uint8_t *)malloc(buf_size);
549 uint8_t *read_dst_buffer = (uint8_t *)malloc(buf_size);
550
551 for (unsigned i = 0; i < buf_size; i++) {
552 init_src_buffer[i] = rand();
553 init_dst_buffer[i] = rand();
554 }
555
556 pipe_buffer_write(ctx, src, 0, buf_size, init_src_buffer);
557 pipe_buffer_write(ctx, dst, 0, buf_size, init_dst_buffer);
558
559 unsigned dst_offset = rand() % buf_size;
560 unsigned op_size = (rand() % (buf_size - dst_offset)) + 1;
561 unsigned src_offset = rand() % (buf_size - op_size + 1);
562 unsigned dwords_per_thread = 1 << (rand() % 3);
563
564 memcpy(expected_dst_buffer, init_dst_buffer, buf_size);
565 memcpy(expected_dst_buffer + dst_offset, init_src_buffer + src_offset, op_size);
566
567 printf(" %2u, %2u, %2u, %u, ", src_offset, dst_offset, op_size, dwords_per_thread);
568
569 if (buf_size <= 32) {
570 /* Visualize the copy. */
571 for (unsigned i = 0; i < buf_size; i++) {
572 printf("%s%02x",
573 i >= src_offset && i < src_offset + op_size ? COLOR_YELLOW : COLOR_RESET,
574 init_src_buffer[i]);
575 }
576 printf("%s, %s", COLOR_RESET, COLOR_CYAN);
577 for (unsigned i = 0; i < buf_size; i++) {
578 printf("%s%02x",
579 i < dst_offset || i >= dst_offset + op_size ? COLOR_CYAN : COLOR_RESET,
580 init_dst_buffer[i]);
581 }
582 printf("%s, ", COLOR_RESET);
583 for (unsigned i = 0; i < buf_size; i++) {
584 printf("%s%02x",
585 i >= dst_offset && i < dst_offset + op_size ? COLOR_YELLOW : COLOR_CYAN,
586 expected_dst_buffer[i]);
587 }
588 printf("%s, ", COLOR_RESET);
589 }
590 fflush(stdout);
591
592 si_barrier_before_simple_buffer_op(sctx, 0, dst, src);
593 bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, op_size,
594 NULL, 0, dwords_per_thread, false, false);
595 si_barrier_after_simple_buffer_op(sctx, 0, dst, src);
596
597 if (done) {
598 pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer);
599 bool success = !memcmp(read_dst_buffer, expected_dst_buffer, buf_size);
600
601 num_tests++;
602 if (success)
603 num_passes++;
604
605 if (buf_size <= 32) {
606 for (unsigned i = 0; i < buf_size; i++) {
607 printf("%s%02x",
608 read_dst_buffer[i] != expected_dst_buffer[i] ? COLOR_RED :
609 i >= dst_offset && i < dst_offset + op_size ? COLOR_YELLOW : COLOR_CYAN,
610 read_dst_buffer[i]);
611 }
612 printf("%s, ", COLOR_RESET);
613 }
614
615 printf("%s [%u/%u]\n", success ? "pass" : "fail", num_passes, num_tests);
616 } else {
617 printf("%*s, skip [%u/%u]\n", buf_size * 2, "", num_passes, num_tests);
618 }
619
620 free(init_src_buffer);
621 free(init_dst_buffer);
622 free(expected_dst_buffer);
623 free(read_dst_buffer);
624 pipe_resource_reference(&dst, NULL);
625 pipe_resource_reference(&src, NULL);
626 }
627
628 ctx->destroy(ctx);
629 exit(0);
630 }
631