/*
 * Copyright 2010 Jerome Glisse <[email protected]>
 * Authors:
 *    Jerome Glisse
 * SPDX-License-Identifier: MIT
 */

#include "r600_pipe.h"
#include "evergreend.h"
#include "util/u_memory.h"
#include "util/u_math.h"

void evergreen_dma_copy_buffer(struct r600_context *rctx,
			       struct pipe_resource *dst,
			       struct pipe_resource *src,
			       uint64_t dst_offset,
			       uint64_t src_offset,
			       uint64_t size)
{
	struct radeon_cmdbuf *cs = &rctx->b.dma.cs;
	unsigned i, ncopy, csize, sub_cmd, shift;
	struct r600_resource *rdst = (struct r600_resource*)dst;
	struct r600_resource *rsrc = (struct r600_resource*)src;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&rdst->b.b, &rdst->valid_buffer_range, dst_offset,
		       dst_offset + size);

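	/* Turn the resource-relative offsets into GPU virtual addresses. */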
	dst_offset += rdst->gpu_address;
	src_offset += rsrc->gpu_address;

	/* see if we use dword or byte copy */
	if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
		size >>= 2;
		sub_cmd = EG_DMA_COPY_DWORD_ALIGNED;
		shift = 2;
	} else {
		sub_cmd = EG_DMA_COPY_BYTE_ALIGNED;
		shift = 0;
	}
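	/* Split the copy into chunks of at most EG_DMA_COPY_MAX_SIZE units
	 * (dwords or bytes, depending on sub_cmd). */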
	ncopy = (size / EG_DMA_COPY_MAX_SIZE) + !!(size % EG_DMA_COPY_MAX_SIZE);

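	/* Reserve command-stream space up front: each copy packet emitted
	 * below is 5 dwords (header, low dst/src addresses, high address bits). */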
	r600_need_dma_space(&rctx->b, ncopy * 5, rdst, rsrc);
	for (i = 0; i < ncopy; i++) {
		csize = size < EG_DMA_COPY_MAX_SIZE ? size : EG_DMA_COPY_MAX_SIZE;
		/* emit reloc before writing cs so that cs is always in consistent state */
		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ);
		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE);
		radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize));
		radeon_emit(cs, dst_offset & 0xffffffff);
		radeon_emit(cs, src_offset & 0xffffffff);
		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
		dst_offset += csize << shift;
		src_offset += csize << shift;
		size -= csize;
	}
}

/* The max number of bytes to copy per packet. BYTE_COUNT occupies bits [20:0]
 * of the CP_DMA command, so this stays just under the 2 MiB field limit. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)

void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
				   struct pipe_resource *dst, uint64_t offset,
				   unsigned size, uint32_t clear_value,
				   enum r600_coherency coher)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;

	assert(size);
	assert(rctx->screen->b.has_cp_dma);

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(dst, &r600_resource(dst)->valid_buffer_range, offset,
		       offset + size);

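	/* Turn the resource-relative offset into a GPU virtual address. */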
	offset += r600_resource(dst)->gpu_address;

	/* Flush the cache where the resource is bound. */
	rctx->b.flags |= r600_get_flush_flags(coher) |
			 R600_CONTEXT_WAIT_3D_IDLE;

	while (size) {
		unsigned sync = 0;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned reloc;

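		/* Reserve space for the CP_DMA packet and its relocation, plus an
		 * optional cache flush and the trailing PFP_SYNC_ME. */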
		r600_need_cs_space(rctx,
				   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
				   R600_MAX_PFP_SYNC_ME_DWORDS, false, 0);

		/* Flush the caches for the first copy only. */
		if (rctx->b.flags) {
			r600_flush_emit(rctx);
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count) {
			sync = PKT3_CP_DMA_CP_SYNC;
		}

		/* This must be done after r600_need_cs_space. */
		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						  (struct r600_resource*)dst, RADEON_USAGE_WRITE |
						  RADEON_PRIO_CP_DMA);

		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, clear_value);			/* DATA [31:0] */
		radeon_emit(cs, sync | PKT3_CP_DMA_SRC_SEL(2));	/* CP_SYNC [31] | SRC_SEL[30:29] */
		radeon_emit(cs, offset);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (offset >> 32) & 0xff);		/* DST_ADDR_HI [7:0] */
		radeon_emit(cs, byte_count);			/* COMMAND [29:22] | BYTE_COUNT [20:0] */

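		/* Emit the relocation for the destination buffer; the NOP packet
		 * carries the buffer-list index returned above. */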
		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, reloc);

		size -= byte_count;
		offset += byte_count;
	}

	/* CP DMA is executed in ME, but index buffers are read by PFP.
	 * This ensures that ME (CP DMA) is idle before PFP starts fetching
	 * indices. If we wanted to execute CP DMA in PFP, this packet
	 * should precede it.
	 */
	if (coher == R600_COHERENCY_SHADER)
		r600_emit_pfp_sync_me(rctx);
}