1 /*
2 * Copyright 2021 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 */
23
24 #include <stdio.h>
25 #include <sys/types.h>
26 #include <sys/stat.h>
27 #include <fcntl.h>
28 #include <stdarg.h>
29 #include <string.h>
30 #include <errno.h>
31 #include <unistd.h>
32 #include <stdlib.h>
33 #include <inttypes.h>
34
35 #include "drm.h"
36 #include "xf86drmMode.h"
37 #include "xf86drm.h"
38 #include "amdgpu.h"
39 #include "amdgpu_drm.h"
40 #include "amdgpu_internal.h"
41
/* Size of the device array handed to drmGetDevices2(). */
#define MAX_CARDS_SUPPORTED 4
/* Capacity of the global buffer object and virtual address tables. */
#define NUM_BUFFER_OBJECTS 1024

/*
 * Header dword of an SDMA packet as emitted for non-SI families:
 * opcode in bits 0-7, sub-opcode in bits 8-15, extra field in bits 16-31.
 * The masks are unsigned so that no field is ever shifted into the sign
 * bit of an int.
 */
#define SDMA_PACKET(op, sub_op, e)	((((e) & 0xFFFFu) << 16) |	\
					 (((sub_op) & 0xFFu) << 8) |	\
					 (((op) & 0xFFu) << 0))

#define SDMA_OPCODE_COPY		1
#	define SDMA_COPY_SUB_OPCODE_LINEAR	0


/*
 * Header dword of an SDMA packet as emitted for AMDGPU_FAMILY_SI:
 * opcode in bits 28-31, single-bit fields b/t/s at bits 26/23/22 and a
 * 20-bit count in bits 0-19. Unsigned masks for the same reason as above.
 */
#define SDMA_PACKET_SI(op, b, t, s, cnt)	((((op) & 0xFu) << 28) |	\
						 (((b) & 0x1u) << 26) |		\
						 (((t) & 0x1u) << 23) |		\
						 (((s) & 0x1u) << 22) |		\
						 (((cnt) & 0xFFFFFu) << 0))
#define SDMA_OPCODE_COPY_SI	3
59
60
/**
 * Help string for command line parameters.
 *
 * Used as a printf-style format: it must contain exactly one "%s", which
 * receives the program name.
 */
static const char usage[] =
	"Usage: %s [-?h] [-b v|g|vg size] [-c from to size count]\n"
	"where:\n"
	"  b - Allocate a BO in VRAM, GTT or VRAM|GTT of size bytes.\n"
	"      This flag can be used multiple times. The first bo will\n"
	"      have id `1`, the second id `2`, ...\n"
	"  c - Copy size bytes from BO (from) to BO (to), count times\n"
	"  h - Display this help\n"
	"\n"
	"Sizes can be postfixed with k, m or g for kilo, mega and gigabyte scaling\n";

/** Specified options strings for getopt: -b and -c take an argument. */
static const char options[] = "?hb:c:";
76
77 /* Open AMD devices.
78 * Returns the fd of the first device it could open.
79 */
amdgpu_open_device(void)80 static int amdgpu_open_device(void)
81 {
82 drmDevicePtr devices[MAX_CARDS_SUPPORTED];
83 unsigned int i;
84 int drm_count;
85
86 drm_count = drmGetDevices2(0, devices, MAX_CARDS_SUPPORTED);
87 if (drm_count < 0) {
88 fprintf(stderr, "drmGetDevices2() returned an error %d\n",
89 drm_count);
90 return drm_count;
91 }
92
93 for (i = 0; i < drm_count; i++) {
94 drmVersionPtr version;
95 int fd;
96
97 /* If this is not PCI device, skip*/
98 if (devices[i]->bustype != DRM_BUS_PCI)
99 continue;
100
101 /* If this is not AMD GPU vender ID, skip*/
102 if (devices[i]->deviceinfo.pci->vendor_id != 0x1002)
103 continue;
104
105 if (!(devices[i]->available_nodes & 1 << DRM_NODE_RENDER))
106 continue;
107
108 fd = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR | O_CLOEXEC);
109
110 /* This node is not available. */
111 if (fd < 0) continue;
112
113 version = drmGetVersion(fd);
114 if (!version) {
115 fprintf(stderr,
116 "Warning: Cannot get version for %s."
117 "Error is %s\n",
118 devices[i]->nodes[DRM_NODE_RENDER],
119 strerror(errno));
120 close(fd);
121 continue;
122 }
123
124 if (strcmp(version->name, "amdgpu")) {
125 /* This is not AMDGPU driver, skip.*/
126 drmFreeVersion(version);
127 close(fd);
128 continue;
129 }
130
131 drmFreeVersion(version);
132 drmFreeDevices(devices, drm_count);
133 return fd;
134 }
135
136 return -1;
137 }
138
139 amdgpu_device_handle device_handle;
140 amdgpu_context_handle context_handle;
141
142 amdgpu_bo_handle resources[NUM_BUFFER_OBJECTS];
143 uint64_t virtual[NUM_BUFFER_OBJECTS];
144 unsigned int num_buffers;
145 uint32_t *pm4;
146
alloc_bo(uint32_t domain,uint64_t size)147 int alloc_bo(uint32_t domain, uint64_t size)
148 {
149 struct amdgpu_bo_alloc_request request = {};
150 amdgpu_bo_handle bo;
151 amdgpu_va_handle va;
152 uint64_t addr;
153 int r;
154
155 if (num_buffers >= NUM_BUFFER_OBJECTS)
156 return -ENOSPC;
157
158 request.alloc_size = size;
159 request.phys_alignment = 0;
160 request.preferred_heap = domain;
161 request.flags = 0;
162 r = amdgpu_bo_alloc(device_handle, &request, &bo);
163 if (r)
164 return r;
165
166 r = amdgpu_va_range_alloc(device_handle, amdgpu_gpu_va_range_general,
167 size, 0, 0, &addr, &va, 0);
168 if (r)
169 return r;
170
171 r = amdgpu_bo_va_op_raw(device_handle, bo, 0, size, addr,
172 AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
173 AMDGPU_VM_PAGE_EXECUTABLE, AMDGPU_VA_OP_MAP);
174 if (r)
175 return r;
176
177 resources[num_buffers] = bo;
178 virtual[num_buffers] = addr;
179 fprintf(stdout, "Allocated BO number %u at 0x%" PRIx64 ", domain 0x%x, size %" PRIu64 "\n",
180 num_buffers++, addr, domain, size);
181 return 0;
182 }
183
submit_ib(uint32_t from,uint32_t to,uint64_t size,uint32_t count)184 int submit_ib(uint32_t from, uint32_t to, uint64_t size, uint32_t count)
185 {
186 struct amdgpu_cs_request ibs_request;
187 struct amdgpu_cs_fence fence_status;
188 struct amdgpu_cs_ib_info ib_info;
189 uint64_t copied = size, delta;
190 struct timespec start, stop;
191
192 uint64_t src = virtual[from];
193 uint64_t dst = virtual[to];
194 uint32_t expired;
195 int i, r;
196
197 i = 0;
198 while (size) {
199 uint64_t bytes = size < 0x40000 ? size : 0x40000;
200
201 if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
202 pm4[i++] = SDMA_PACKET_SI(SDMA_OPCODE_COPY_SI, 0, 0, 0,
203 bytes);
204 pm4[i++] = 0xffffffff & dst;
205 pm4[i++] = 0xffffffff & src;
206 pm4[i++] = (0xffffffff00000000 & dst) >> 32;
207 pm4[i++] = (0xffffffff00000000 & src) >> 32;
208 } else {
209 pm4[i++] = SDMA_PACKET(SDMA_OPCODE_COPY,
210 SDMA_COPY_SUB_OPCODE_LINEAR,
211 0);
212 if ( device_handle->info.family_id >= AMDGPU_FAMILY_AI)
213 pm4[i++] = bytes - 1;
214 else
215 pm4[i++] = bytes;
216 pm4[i++] = 0;
217 pm4[i++] = 0xffffffff & src;
218 pm4[i++] = (0xffffffff00000000 & src) >> 32;
219 pm4[i++] = 0xffffffff & dst;
220 pm4[i++] = (0xffffffff00000000 & dst) >> 32;
221 }
222
223 size -= bytes;
224 src += bytes;
225 dst += bytes;
226 }
227
228 memset(&ib_info, 0, sizeof(ib_info));
229 ib_info.ib_mc_address = virtual[0];
230 ib_info.size = i;
231
232 memset(&ibs_request, 0, sizeof(ibs_request));
233 ibs_request.ip_type = AMDGPU_HW_IP_DMA;
234 ibs_request.ring = 0;
235 ibs_request.number_of_ibs = 1;
236 ibs_request.ibs = &ib_info;
237 ibs_request.fence_info.handle = NULL;
238
239 r = clock_gettime(CLOCK_MONOTONIC, &start);
240 if (r)
241 return errno;
242
243 r = amdgpu_bo_list_create(device_handle, num_buffers, resources, NULL,
244 &ibs_request.resources);
245 if (r)
246 return r;
247
248 for (i = 0; i < count; ++i) {
249 r = amdgpu_cs_submit(context_handle, 0, &ibs_request, 1);
250 if (r)
251 return r;
252 }
253
254 r = amdgpu_bo_list_destroy(ibs_request.resources);
255 if (r)
256 return r;
257
258 memset(&fence_status, 0, sizeof(fence_status));
259 fence_status.ip_type = ibs_request.ip_type;
260 fence_status.ip_instance = 0;
261 fence_status.ring = ibs_request.ring;
262 fence_status.context = context_handle;
263 fence_status.fence = ibs_request.seq_no;
264 r = amdgpu_cs_query_fence_status(&fence_status,
265 AMDGPU_TIMEOUT_INFINITE,
266 0, &expired);
267 if (r)
268 return r;
269
270 r = clock_gettime(CLOCK_MONOTONIC, &stop);
271 if (r)
272 return errno;
273
274 delta = stop.tv_nsec + stop.tv_sec * 1000000000UL;
275 delta -= start.tv_nsec + start.tv_sec * 1000000000UL;
276
277 fprintf(stdout, "Submitted %u IBs to copy from %u(%" PRIx64 ") to %u(%" PRIx64 ") %" PRIu64 " bytes took %" PRIu64 " usec\n",
278 count, from, virtual[from], to, virtual[to], copied, delta / 1000);
279 return 0;
280 }
281
/*
 * Take the next word of argv as the argument of the option being parsed.
 *
 * On success optarg points at that word and optind is advanced past it.
 * If argv is exhausted or the word starts with '-', msg is printed to
 * stderr and the process exits with EXIT_FAILURE.
 */
void next_arg(int argc, char **argv, const char *msg)
{
	if (optind < argc) {
		char *word = argv[optind];

		optind += 1;
		optarg = word;
		if (word[0] != '-')
			return;
	}

	fprintf(stderr, "%s\n", msg);
	exit(EXIT_FAILURE);
}
290
/*
 * Parse optarg as an unsigned size with an optional k/m/g suffix
 * (either case) that scales by 1024, 1024^2 or 1024^3.
 *
 * The number may be decimal, octal (leading 0) or hexadecimal (leading 0x).
 * Negative values, trailing garbage and results that do not fit into 64 bit
 * are rejected: an error is printed and the process exits with
 * EXIT_FAILURE.
 *
 * Returns the scaled size.
 */
uint64_t parse_size(void)
{
	unsigned long long value;
	uint64_t scale = 1;
	char *end;

	/* strtoull() silently negates "-N"; no valid size contains a '-'. */
	if (strchr(optarg, '-'))
		goto bad;

	errno = 0;
	value = strtoull(optarg, &end, 0);
	if (end == optarg || errno == ERANGE)
		goto bad;

	switch (*end) {
	case 'k':
	case 'K':
		scale = 1024ULL;
		end++;
		break;
	case 'm':
	case 'M':
		scale = 1024ULL * 1024;
		end++;
		break;
	case 'g':
	case 'G':
		scale = 1024ULL * 1024 * 1024;
		end++;
		break;
	default:
		break;
	}

	/* Anything after the optional suffix is a typo, not a size. */
	if (*end != '\0' || value > UINT64_MAX / scale)
		goto bad;

	return value * scale;

bad:
	fprintf(stderr, "Can't parse size arg: %s\n", optarg);
	exit(EXIT_FAILURE);
}
319
main(int argc,char ** argv)320 int main(int argc, char **argv)
321 {
322 uint32_t major_version, minor_version;
323 uint32_t domain, from, to, count;
324 uint64_t size;
325 int fd, r, c;
326
327 fd = amdgpu_open_device();
328 if (fd < 0) {
329 perror("Cannot open AMDGPU device");
330 exit(EXIT_FAILURE);
331 }
332
333 r = amdgpu_device_initialize(fd, &major_version, &minor_version, &device_handle);
334 if (r) {
335 fprintf(stderr, "amdgpu_device_initialize returned %d\n", r);
336 exit(EXIT_FAILURE);
337 }
338
339 r = amdgpu_cs_ctx_create(device_handle, &context_handle);
340 if (r) {
341 fprintf(stderr, "amdgpu_cs_ctx_create returned %d\n", r);
342 exit(EXIT_FAILURE);
343 }
344
345 if (argc == 1) {
346 fprintf(stderr, usage, argv[0]);
347 exit(EXIT_FAILURE);
348 }
349
350 r = alloc_bo(AMDGPU_GEM_DOMAIN_GTT, 2ULL * 1024 * 1024);
351 if (r) {
352 fprintf(stderr, "Buffer allocation failed with %d\n", r);
353 exit(EXIT_FAILURE);
354 }
355
356 r = amdgpu_bo_cpu_map(resources[0], (void **)&pm4);
357 if (r) {
358 fprintf(stderr, "Buffer mapping failed with %d\n", r);
359 exit(EXIT_FAILURE);
360 }
361
362 opterr = 0;
363 while ((c = getopt(argc, argv, options)) != -1) {
364 switch (c) {
365 case 'b':
366 if (!strcmp(optarg, "v"))
367 domain = AMDGPU_GEM_DOMAIN_VRAM;
368 else if (!strcmp(optarg, "g"))
369 domain = AMDGPU_GEM_DOMAIN_GTT;
370 else if (!strcmp(optarg, "vg"))
371 domain = AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT;
372 else {
373 fprintf(stderr, "Invalid domain: %s\n", optarg);
374 exit(EXIT_FAILURE);
375 }
376 next_arg(argc, argv, "Missing buffer size");
377 size = parse_size();
378 if (size < getpagesize()) {
379 fprintf(stderr, "Buffer size to small %" PRIu64 "\n", size);
380 exit(EXIT_FAILURE);
381 }
382 r = alloc_bo(domain, size);
383 if (r) {
384 fprintf(stderr, "Buffer allocation failed with %d\n", r);
385 exit(EXIT_FAILURE);
386 }
387 break;
388 case 'c':
389 if (sscanf(optarg, "%u", &from) != 1) {
390 fprintf(stderr, "Can't parse from buffer: %s\n", optarg);
391 exit(EXIT_FAILURE);
392 }
393 next_arg(argc, argv, "Missing to buffer");
394 if (sscanf(optarg, "%u", &to) != 1) {
395 fprintf(stderr, "Can't parse to buffer: %s\n", optarg);
396 exit(EXIT_FAILURE);
397 }
398 next_arg(argc, argv, "Missing size");
399 size = parse_size();
400 next_arg(argc, argv, "Missing count");
401 count = parse_size();
402 r = submit_ib(from, to, size, count);
403 if (r) {
404 fprintf(stderr, "IB submission failed with %d\n", r);
405 exit(EXIT_FAILURE);
406 }
407 break;
408 case '?':
409 case 'h':
410 fprintf(stderr, usage, argv[0]);
411 exit(EXIT_SUCCESS);
412 default:
413 fprintf(stderr, usage, argv[0]);
414 exit(EXIT_FAILURE);
415 }
416 }
417
418 return EXIT_SUCCESS;
419 }
420