/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Tiago Vignatti
 */

/** @file prime_mmap_coherency.c
 *
 * TODO: need to show the need for prime_sync_end().
 */

#include "igt.h"

IGT_TEST_DESCRIPTION("Test dma-buf mmap, mostly on !llc platforms, and provoke"
		" coherency bugs so we know for sure where we need the sync ioctls.");
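
/*
 * All tests below rely on two igt helpers.  The notes here are a rough
 * sketch of what they are expected to do (the real implementations live in
 * igt's ioctl wrapper library):
 *
 *   - prime_handle_to_fd_for_mmap(): exports a GEM handle as a dma-buf via
 *     DRM_IOCTL_PRIME_HANDLE_TO_FD, asking for DRM_RDWR so that the returned
 *     fd can be mmap()ed writable.  Kernels without DRM_RDWR support fail
 *     with EINVAL, which is why the tests igt_skip_on() that errno.
 *
 *   - prime_sync_start()/prime_sync_end(): bracket CPU access to a dma-buf
 *     mmap with DMA_BUF_IOCTL_SYNC, roughly:
 *
 *         struct dma_buf_sync sync = {
 *                 .flags = DMA_BUF_SYNC_START |
 *                          (write ? DMA_BUF_SYNC_RW : DMA_BUF_SYNC_READ),
 *         };
 *         ioctl(dma_buf_fd, DMA_BUF_IOCTL_SYNC, &sync);
 *         ... CPU access through the dma-buf mmap ...
 *         sync.flags = DMA_BUF_SYNC_END |
 *                      (write ? DMA_BUF_SYNC_RW : DMA_BUF_SYNC_READ);
 *         ioctl(dma_buf_fd, DMA_BUF_IOCTL_SYNC, &sync);
 *
 *     On !llc platforms these calls are what gives the kernel a chance to
 *     flush/invalidate CPU caches around the access.
 */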

int fd;
static drm_intel_bufmgr *bufmgr;
struct intel_batchbuffer *batch;
static int width = 1024, height = 1024;

/*
 * Exercises the need for a read flush:
 *   1. create a BO and fill it with '0's, in the GTT domain.
 *   2. read the BO through the dma-buf CPU mmap.
 *   3. write 0xc5 bytes into the BO, in the GTT domain.
 *   4. read again through the mapped dma-buf and count stale cache lines.
 */
static int test_read_flush(void)
{
	drm_intel_bo *bo_1;
	drm_intel_bo *bo_2;
	uint32_t *ptr_cpu;
	uint32_t *ptr_gtt;
	int dma_buf_fd, i;
	int stale = 0;

	bo_1 = drm_intel_bo_alloc(bufmgr, "BO 1", width * height * 4, 4096);

	/* STEP #1: put BO 1 in the GTT domain. We use the blitter to fill BO 1
	 * with zeros, so commands are submitted and BO 1 likely ends up in the
	 * GTT domain. */
	bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096);
	intel_copy_bo(batch, bo_1, bo_2, width * height);
	drm_intel_bo_unreference(bo_2);

	/* STEP #2: export BO 1 as a dma-buf; reading it below through the CPU
	 * mmap dirties the CPU caches. */
	dma_buf_fd = prime_handle_to_fd_for_mmap(fd, bo_1->handle);

	/* STEP #3: write 0xc5 into a scratch BO 2 through a GTT mmap; it is
	 * blitted into BO 1 further below. */
	bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096);
	ptr_gtt = gem_mmap__gtt(fd, bo_2->handle, width * height, PROT_READ | PROT_WRITE);
	gem_set_domain(fd, bo_2->handle,
		       I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
	memset(ptr_gtt, 0xc5, width * height);
	munmap(ptr_gtt, width * height);

	ptr_cpu = mmap(NULL, width * height, PROT_READ,
		       MAP_SHARED, dma_buf_fd, 0);
	igt_assert(ptr_cpu != MAP_FAILED);

	prime_sync_start(dma_buf_fd, false);
	for (i = 0; i < (width * height) / 4; i++)
		igt_assert_eq(ptr_cpu[i], 0);
	prime_sync_end(dma_buf_fd, false);

	intel_copy_bo(batch, bo_1, bo_2, width * height);
	drm_intel_bo_unreference(bo_2);

	/* STEP #4: read again using the CPU mmap. Doing #1 before #3 makes sure we
	 * don't do a full CPU cache flush in step #3 again, so all the stale
	 * cache lines from step #2 survive (mostly; a few will be evicted) until
	 * we try to read them again in step #4. This behavior could be fixed by
	 * flushing the CPU cache for reads right before accessing the CPU
	 * pointer. */
	prime_sync_start(dma_buf_fd, false);
	for (i = 0; i < (width * height) / 4; i++)
		if (ptr_cpu[i] != 0xc5c5c5c5)
			stale++;
	prime_sync_end(dma_buf_fd, false);

	drm_intel_bo_unreference(bo_1);
	munmap(ptr_cpu, width * height);

	close(dma_buf_fd);

	return stale;
}

/*
 * Exercises the need for a write flush:
 *   1. create BO 1 and fill it with '0's, in the GTT domain.
 *   2. write '1's into BO 1 using the dma-buf CPU mmap.
 *   3. copy BO 1 to a new BO 2, in the GTT domain.
 *   4. read BO 2 through its dma-buf CPU mmap and count stale cache lines.
 */
static int test_write_flush(void)
{
	drm_intel_bo *bo_1;
	drm_intel_bo *bo_2;
	uint32_t *ptr_cpu;
	uint32_t *ptr2_cpu;
	int dma_buf_fd, dma_buf2_fd, i;
	int stale = 0;

	bo_1 = drm_intel_bo_alloc(bufmgr, "BO 1", width * height * 4, 4096);

	/* STEP #1: Put BO 1 in the GTT domain. We use the blitter to fill BO 1
	 * with zeros, so commands are submitted and BO 1 likely ends up in the
	 * GTT domain. */
	bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096);
	intel_copy_bo(batch, bo_1, bo_2, width * height);
	drm_intel_bo_unreference(bo_2);

	/* STEP #2: Write '1's into BO 1 using the dma-buf CPU mmap. */
	dma_buf_fd = prime_handle_to_fd_for_mmap(fd, bo_1->handle);
	igt_skip_on(errno == EINVAL);

	ptr_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE,
		       MAP_SHARED, dma_buf_fd, 0);
	igt_assert(ptr_cpu != MAP_FAILED);

	/* This is the main point of this test: !llc hw requires a cache write
	 * flush right here (explained in step #4). */
	prime_sync_start(dma_buf_fd, true);
	memset(ptr_cpu, 0x11, width * height);
	prime_sync_end(dma_buf_fd, true);

	/* STEP #3: Copy BO 1 into BO 2, using blitter. */
	bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096);
	intel_copy_bo(batch, bo_2, bo_1, width * height);

	/* STEP #4: compare BO 2 against the written BO 1. On !llc hardware, without
	 * the cache flush before the write in step #2, some cache lines would not
	 * get flushed out and would still read back as 0. */
	dma_buf2_fd = prime_handle_to_fd_for_mmap(fd, bo_2->handle);
	igt_skip_on(errno == EINVAL);

	ptr2_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE,
		        MAP_SHARED, dma_buf2_fd, 0);
	igt_assert(ptr2_cpu != MAP_FAILED);

	prime_sync_start(dma_buf2_fd, false);

	for (i = 0; i < (width * height) / 4; i++)
		if (ptr2_cpu[i] != 0x11111111)
			stale++;

	prime_sync_end(dma_buf2_fd, false);

	drm_intel_bo_unreference(bo_1);
	drm_intel_bo_unreference(bo_2);
	munmap(ptr_cpu, width * height);
	munmap(ptr2_cpu, width * height);

	close(dma_buf2_fd);
	close(dma_buf_fd);

	return stale;
}

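/*
 * Helper for the interruptible stress below: open a fresh drm fd and bufmgr
 * (so each forked child gets its own), fill BO 1 with '1's and BO 2 with
 * '0's through their dma-buf CPU mmaps (each access bracketed by
 * prime_sync_start()/prime_sync_end()), blit BO 1 into BO 2, then verify
 * the copy through BO 2's CPU mmap.
 */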
static void blit_and_cmp(void)
{
	drm_intel_bo *bo_1;
	drm_intel_bo *bo_2;
	uint32_t *ptr_cpu;
	uint32_t *ptr2_cpu;
	int dma_buf_fd, dma_buf2_fd, i;
	int local_fd;
	drm_intel_bufmgr *local_bufmgr;
	struct intel_batchbuffer *local_batch;

	/* recreate process local variables */
	local_fd = drm_open_driver(DRIVER_INTEL);
	local_bufmgr = drm_intel_bufmgr_gem_init(local_fd, 4096);
	igt_assert(local_bufmgr);

	local_batch = intel_batchbuffer_alloc(local_bufmgr, intel_get_drm_devid(local_fd));
	igt_assert(local_batch);

	bo_1 = drm_intel_bo_alloc(local_bufmgr, "BO 1", width * height * 4, 4096);
	dma_buf_fd = prime_handle_to_fd_for_mmap(local_fd, bo_1->handle);
	igt_skip_on(errno == EINVAL);

	ptr_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE,
		       MAP_SHARED, dma_buf_fd, 0);
	igt_assert(ptr_cpu != MAP_FAILED);

	bo_2 = drm_intel_bo_alloc(local_bufmgr, "BO 2", width * height * 4, 4096);
	dma_buf2_fd = prime_handle_to_fd_for_mmap(local_fd, bo_2->handle);

	ptr2_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE,
			MAP_SHARED, dma_buf2_fd, 0);
	igt_assert(ptr2_cpu != MAP_FAILED);

	/* Fill up BO 1 with '1's and BO 2 with '0's */
	prime_sync_start(dma_buf_fd, true);
	memset(ptr_cpu, 0x11, width * height);
	prime_sync_end(dma_buf_fd, true);

	prime_sync_start(dma_buf2_fd, true);
	memset(ptr2_cpu, 0x00, width * height);
	prime_sync_end(dma_buf2_fd, true);

	/* Copy BO 1 into BO 2, using blitter. */
	intel_copy_bo(local_batch, bo_2, bo_1, width * height);
	usleep(0); /* let someone else claim the mutex */

	/* Compare BOs. If prime_sync_* were executed properly, the caches
	 * should be synced. */
	prime_sync_start(dma_buf2_fd, false);
	for (i = 0; i < (width * height) / 4; i++)
		igt_fail_on_f(ptr2_cpu[i] != 0x11111111, "Found 0x%08x at offset 0x%08x\n", ptr2_cpu[i], i);
	prime_sync_end(dma_buf2_fd, false);

	drm_intel_bo_unreference(bo_1);
	drm_intel_bo_unreference(bo_2);
	munmap(ptr_cpu, width * height);
	munmap(ptr2_cpu, width * height);

	close(dma_buf_fd);
	close(dma_buf2_fd);

	intel_batchbuffer_free(local_batch);
	drm_intel_bufmgr_destroy(local_bufmgr);
	close(local_fd);
}

/*
 * Constantly interrupt concurrent blits to stress out prime_sync_* and make
 * sure the resulting ioctl errors are handled accordingly.
 *
 * Note that in the failure case (e.g. if the ioctl is not retried after
 * returning an error) this test does not catch the problem with 100%
 * reliability.
 */
static void test_ioctl_errors(void)
{
	int ncpus = sysconf(_SC_NPROCESSORS_ONLN);

	/* Ensure we can do at least one child */
	intel_require_memory(2, width*height*4, CHECK_RAM);

	for (int num_children = 1; num_children <= 8 * ncpus; num_children <<= 1) {
		uint64_t required, total;

		igt_info("Spawning %d interruptible children\n", num_children);
		if (!__intel_check_memory(2*num_children,
					  width*height*4,
					  CHECK_RAM,
					  &required, &total)) {
			igt_debug("Estimated that we need %'lluMiB for test, but only have %'lluMiB\n",
				  (long long)(required >> 20),
				  (long long)(total >> 20));
			break;
		}

		igt_fork(child, num_children)
			igt_while_interruptible(true) blit_and_cmp();
		igt_waitchildren();
	}
}

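/*
 * Subtests: "read" and "write" hammer test_read_flush()/test_write_flush()
 * for 5 seconds each and fail on any stale cache line found; "ioctl-errors"
 * runs the interruptible blit_and_cmp() stress across many children.  With
 * the usual igt_main machinery the subtests should be individually
 * selectable, e.g. via --run-subtest read (an assumption about the standard
 * igt runner, not something this file sets up itself).
 */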
igt_main
{
	igt_fixture {
		fd = drm_open_driver(DRIVER_INTEL);
		igt_require_gem(fd);

		bufmgr = drm_intel_bufmgr_gem_init(fd, 4096);
		batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd));
	}

	/* Cache coherency and cache eviction are pretty much unpredictable, so
	 * reproducing the bugs boils down to trial and error to hit different
	 * scenarios. TODO: We may want to improve the tests a bit by picking
	 * random subranges. */
	igt_subtest("read") {
		igt_until_timeout(5) {
			int stale = test_read_flush();
			igt_fail_on_f(stale,
				      "num of stale cache lines %d\n", stale);
		}
	}

	igt_subtest("write") {
		igt_until_timeout(5) {
			int stale = test_write_flush();
			igt_fail_on_f(stale,
				      "num of stale cache lines %d\n", stale);
		}
	}

	igt_subtest("ioctl-errors") {
		igt_info("exercising concurrent blits to get ioctl errors\n");
		test_ioctl_errors();
	}

	igt_fixture {
		intel_batchbuffer_free(batch);
		drm_intel_bufmgr_destroy(bufmgr);

		close(fd);
	}
}