4 * Copyright (c) 2009,2020 Red Hat
6 * This work is licensed under the terms of the GNU General Public License
7 * (GNU GPL), version 2 or later.
10 #include "qemu/osdep.h"
11 #include "sysemu/block-backend.h"
12 #include "sysemu/dma.h"
13 #include "trace/trace-root.h"
14 #include "qemu/thread.h"
15 #include "qemu/main-loop.h"
16 #include "sysemu/cpu-timers.h"
17 #include "qemu/range.h"
19 /* #define DEBUG_IOMMU */
/*
 * Fill guest memory in address space @as, starting at @addr, with the
 * byte value @c.
 *
 * A dma_barrier() is issued first.  An on-stack buffer of FILLBUF_SIZE
 * bytes is pre-filled with @c and handed to address_space_write() in
 * pieces of at most FILLBUF_SIZE bytes, using @attrs for each write.
 * The result of every write is OR-ed into one MemTxResult.
 *
 * NOTE(review): the declaration of 'l', the loop that advances
 * addr/len, and the final return are not visible in this extract;
 * presumably the accumulated 'error' is returned - confirm.
 */
21 MemTxResult
dma_memory_set(AddressSpace
*as
, dma_addr_t addr
,
22 uint8_t c
, dma_addr_t len
, MemTxAttrs attrs
)
24 dma_barrier(as
, DMA_DIRECTION_FROM_DEVICE
);
/* Size in bytes of the staging buffer used for each write. */
26 #define FILLBUF_SIZE 512
27 uint8_t fillbuf
[FILLBUF_SIZE
];
29 MemTxResult error
= MEMTX_OK
;
/* Pre-fill the staging buffer once; it is reused for every write. */
31 memset(fillbuf
, c
, FILLBUF_SIZE
);
/* Clamp this piece to the smaller of the remaining length and the buffer. */
33 l
= len
< FILLBUF_SIZE
? len
: FILLBUF_SIZE
;
/* Accumulate (rather than overwrite) the transaction result. */
34 error
|= address_space_write(as
, addr
, attrs
, fillbuf
, l
);
/*
 * Initialise the scatter/gather list @qsg.
 *
 * Allocates storage for @alloc_hint ScatterGatherEntry elements with
 * g_malloc(), records that capacity in qsg->nalloc and takes a
 * reference on @dev via object_ref().
 *
 * NOTE(review): the parameter list continues past @alloc_hint and the
 * initialisation of the remaining qsg fields is not visible in this
 * extract.
 */
42 void qemu_sglist_init(QEMUSGList
*qsg
, DeviceState
*dev
, int alloc_hint
,
45 qsg
->sg
= g_malloc(alloc_hint
* sizeof(ScatterGatherEntry
));
47 qsg
->nalloc
= alloc_hint
;
/* Paired with the object_unref() in qemu_sglist_destroy(). */
51 object_ref(OBJECT(dev
));
/*
 * Store a new entry (@base, @len) in @qsg at index qsg->nsg.
 *
 * When the array is full (nsg == nalloc) the capacity is grown to
 * 2 * nalloc + 1 entries with g_realloc() before the entry is written.
 *
 * NOTE(review): the statements that advance qsg->nsg (and any running
 * size total) are not visible in this extract.
 */
54 void qemu_sglist_add(QEMUSGList
*qsg
, dma_addr_t base
, dma_addr_t len
)
56 if (qsg
->nsg
== qsg
->nalloc
) {
/* The "+ 1" lets the array grow even when nalloc is currently 0. */
57 qsg
->nalloc
= 2 * qsg
->nalloc
+ 1;
58 qsg
->sg
= g_realloc(qsg
->sg
, qsg
->nalloc
* sizeof(ScatterGatherEntry
));
60 qsg
->sg
[qsg
->nsg
].base
= base
;
61 qsg
->sg
[qsg
->nsg
].len
= len
;
/*
 * Tear down the scatter/gather list @qsg.
 *
 * Drops the reference on qsg->dev and then zeroes the whole structure,
 * so every field (including the entry pointer and counters) reads as 0
 * afterwards.
 *
 * NOTE(review): the release of the qsg->sg array is not visible in this
 * extract - confirm it is freed before the memset.
 */
66 void qemu_sglist_destroy(QEMUSGList
*qsg
)
68 object_unref(OBJECT(qsg
->dev
));
70 memset(qsg
, 0, sizeof(*qsg
));
/*
 * Member of DMAAIOCB (the rest of the structure is not visible in this
 * extract).  dma_blk_cb() adds it to the current entry's base and
 * subtracts it from the entry's len, i.e. it is the byte offset already
 * consumed within the scatter/gather entry at sg_cur_index.
 */
82 dma_addr_t sg_cur_byte
;
/*
 * Forward declaration; the definition appears further down.  The same
 * function is passed to dbs->io_func as its callback.
 */
89 static void dma_blk_cb(void *opaque
, int ret
);
/*
 * Bottom-half handler installed by dma_blk_cb() through aio_bh_new().
 * @opaque is the DMAAIOCB.
 *
 * Asserts that no I/O request is outstanding (dbs->acb is NULL) and
 * that the bottom half exists, then deletes the bottom half.
 *
 * NOTE(review): whatever follows the qemu_bh_delete() call is not
 * visible in this extract; presumably dbs->bh is cleared and the
 * transfer is restarted - confirm.
 */
91 static void reschedule_dma(void *opaque
)
93 DMAAIOCB
*dbs
= (DMAAIOCB
*)opaque
;
95 assert(!dbs
->acb
&& dbs
->bh
);
96 qemu_bh_delete(dbs
->bh
);
/*
 * Release every mapping currently recorded in dbs->iov.
 *
 * Each iovec element is passed to dma_memory_unmap() with its full
 * iov_len given both as the mapped length and as the final length
 * argument, then the iovec is reset so it can be refilled.
 */
101 static void dma_blk_unmap(DMAAIOCB
*dbs
)
105 for (i
= 0; i
< dbs
->iov
.niov
; ++i
) {
106 dma_memory_unmap(dbs
->sg
->as
, dbs
->iov
.iov
[i
].iov_base
,
107 dbs
->iov
.iov
[i
].iov_len
, dbs
->dir
,
108 dbs
->iov
.iov
[i
].iov_len
);
/* Empty the iovec (keeping the object) now that nothing is mapped. */
110 qemu_iovec_reset(&dbs
->iov
);
/*
 * Finish the DMA request @dbs with status @ret.
 *
 * Requires that neither an in-flight request (dbs->acb) nor a pending
 * bottom half (dbs->bh) exists.  If the caller supplied a completion
 * callback it is invoked with (opaque, ret); afterwards the iovec is
 * destroyed.
 *
 * NOTE(review): other cleanup steps (e.g. unmapping, dropping the AIOCB
 * reference) are not visible in this extract.
 */
113 static void dma_complete(DMAAIOCB
*dbs
, int ret
)
115 trace_dma_complete(dbs
, ret
, dbs
->common
.cb
);
117 assert(!dbs
->acb
&& !dbs
->bh
);
/* The completion callback is optional. */
119 if (dbs
->common
.cb
) {
120 dbs
->common
.cb(dbs
->common
.opaque
, ret
);
122 qemu_iovec_destroy(&dbs
->iov
);
/*
 * Core state machine of a scatter/gather block transfer.  @opaque is the
 * DMAAIOCB; @ret is the status of the previous step.  The function is
 * handed to dbs->io_func as the callback, so it runs again after each
 * submitted request.
 *
 * Visible steps, in order:
 *  - advance dbs->offset by the size of the iovec just processed;
 *  - if every scatter/gather entry has been consumed or @ret < 0,
 *    finish through dma_complete();
 *  - otherwise map as many remaining entries as possible with
 *    dma_memory_map() and append them to dbs->iov;
 *  - if nothing could be mapped, create a bottom half running
 *    reschedule_dma() and register it with cpu_register_map_client();
 *  - trim the iovec when its size is not a multiple of dbs->align;
 *  - submit the request through dbs->io_func with the AioContext held.
 *
 * NOTE(review): several statements (declarations of 'mem' and 'i',
 * returns, loop exits, closing braces) are not visible in this extract.
 */
126 static void dma_blk_cb(void *opaque
, int ret
)
128 DMAAIOCB
*dbs
= (DMAAIOCB
*)opaque
;
129 dma_addr_t cur_addr
, cur_len
;
132 trace_dma_blk_cb(dbs
, ret
);
/* Account for the bytes covered by the iovec of the previous round. */
135 dbs
->offset
+= dbs
->iov
.size
;
/* Done when all entries are consumed, or stop early on error. */
137 if (dbs
->sg_cur_index
== dbs
->sg
->nsg
|| ret
< 0) {
138 dma_complete(dbs
, ret
);
143 while (dbs
->sg_cur_index
< dbs
->sg
->nsg
) {
/* Resume inside the current entry at the bytes not yet transferred. */
144 cur_addr
= dbs
->sg
->sg
[dbs
->sg_cur_index
].base
+ dbs
->sg_cur_byte
;
145 cur_len
= dbs
->sg
->sg
[dbs
->sg_cur_index
].len
- dbs
->sg_cur_byte
;
/* cur_len is passed by pointer, so the mapping call may change it. */
146 mem
= dma_memory_map(dbs
->sg
->as
, cur_addr
, &cur_len
, dbs
->dir
,
147 MEMTXATTRS_UNSPECIFIED
);
149 * Make reads deterministic in icount mode. Windows sometimes issues
150 * disk read requests with overlapping SGs. It leads
151 * to non-determinism, because resulting buffer contents may be mixed
152 * from several sectors. This code splits all SGs into several
153 * groups. SGs in every group do not overlap.
155 if (mem
&& icount_enabled() && dbs
->dir
== DMA_DIRECTION_FROM_DEVICE
) {
/* Compare the new mapping against every element already queued. */
157 for (i
= 0 ; i
< dbs
->iov
.niov
; ++i
) {
158 if (ranges_overlap((intptr_t)dbs
->iov
.iov
[i
].iov_base
,
159 dbs
->iov
.iov
[i
].iov_len
, (intptr_t)mem
,
/* On overlap the fresh mapping is released again. */
161 dma_memory_unmap(dbs
->sg
->as
, mem
, cur_len
,
/* Queue the mapped region and advance within the current entry. */
170 qemu_iovec_add(&dbs
->iov
, mem
, cur_len
);
171 dbs
->sg_cur_byte
+= cur_len
;
/* Entry fully consumed: restart the byte offset for the next entry. */
172 if (dbs
->sg_cur_byte
== dbs
->sg
->sg
[dbs
->sg_cur_index
].len
) {
173 dbs
->sg_cur_byte
= 0;
/* Nothing mapped at all: arrange a retry via the map-client bottom half. */
178 if (dbs
->iov
.size
== 0) {
179 trace_dma_map_wait(dbs
);
180 dbs
->bh
= aio_bh_new(dbs
->ctx
, reschedule_dma
, dbs
);
181 cpu_register_map_client(dbs
->bh
);
/*
 * NOTE(review): the amount discarded from the tail is
 * QEMU_ALIGN_DOWN(size, align), not the remainder size % align.  If the
 * intent is to shrink the iovec to a multiple of align, verify that this
 * argument is correct.
 */
185 if (!QEMU_IS_ALIGNED(dbs
->iov
.size
, dbs
->align
)) {
186 qemu_iovec_discard_back(&dbs
->iov
,
187 QEMU_ALIGN_DOWN(dbs
->iov
.size
, dbs
->align
));
/* Submit with the AioContext held; completion re-enters dma_blk_cb. */
190 aio_context_acquire(dbs
->ctx
);
191 dbs
->acb
= dbs
->io_func(dbs
->offset
, &dbs
->iov
,
192 dma_blk_cb
, dbs
, dbs
->io_func_opaque
);
193 aio_context_release(dbs
->ctx
);
/*
 * .cancel_async hook of dma_aiocb_info.
 *
 * Recovers the DMAAIOCB from @acb and asserts that an in-flight request
 * (dbs->acb) and a pending bottom half (dbs->bh) are never both set.
 * The visible statements cover two situations: cancelling the in-flight
 * request with blk_aio_cancel_async(), and unregistering/deleting the
 * pending bottom half followed by invoking the caller's callback with
 * -ECANCELED.
 *
 * NOTE(review): the conditions that choose between these paths are not
 * visible in this extract.
 */
197 static void dma_aio_cancel(BlockAIOCB
*acb
)
199 DMAAIOCB
*dbs
= container_of(acb
, DMAAIOCB
, common
);
201 trace_dma_aio_cancel(dbs
);
203 assert(!(dbs
->acb
&& dbs
->bh
));
205 /* This will invoke dma_blk_cb. */
206 blk_aio_cancel_async(dbs
->acb
);
/* Undo the registration made when dma_blk_cb() could not map anything. */
211 cpu_unregister_map_client(dbs
->bh
);
212 qemu_bh_delete(dbs
->bh
);
/* Report the cancellation to the caller, if a callback was supplied. */
215 if (dbs
->common
.cb
) {
216 dbs
->common
.cb(dbs
->common
.opaque
, -ECANCELED
);
/*
 * .get_aio_context hook of dma_aiocb_info.  Recovers the enclosing
 * DMAAIOCB from @acb.
 *
 * NOTE(review): the return statement is not visible in this extract;
 * presumably the context stored in the DMAAIOCB is returned - confirm.
 */
220 static AioContext
*dma_get_aio_context(BlockAIOCB
*acb
)
222 DMAAIOCB
*dbs
= container_of(acb
, DMAAIOCB
, common
);
/*
 * AIOCB descriptor passed to qemu_aio_get() in dma_blk_io(): gives the
 * size of a DMAAIOCB and the cancel / AioContext-lookup hooks.
 */
227 static const AIOCBInfo dma_aiocb_info
= {
228 .aiocb_size
= sizeof(DMAAIOCB
),
229 .cancel_async
= dma_aio_cancel
,
230 .get_aio_context
= dma_get_aio_context
,
/*
 * Start a scatter/gather transfer described by @sg.
 *
 * @ctx:            AioContext for the request
 * @sg:             scatter/gather list to walk
 * @offset:         starting offset handed to @io_func
 * @align:          required alignment of each submitted iovec size
 * @io_func:        function that performs one I/O request on an iovec
 * @io_func_opaque: opaque pointer forwarded to @io_func
 * @cb, @opaque:    completion callback and its argument
 * @dir:            DMA direction
 *
 * Allocates a DMAAIOCB with qemu_aio_get(), records the starting offset
 * and I/O function, resets the scatter/gather cursor (index and byte
 * offset) to 0 and sizes the iovec for sg->nsg elements.
 *
 * NOTE(review): the remaining field assignments, the first call into
 * the transfer loop and the return statement are not visible in this
 * extract.
 */
233 BlockAIOCB
*dma_blk_io(AioContext
*ctx
,
234 QEMUSGList
*sg
, uint64_t offset
, uint32_t align
,
235 DMAIOFunc
*io_func
, void *io_func_opaque
,
236 BlockCompletionFunc
*cb
,
237 void *opaque
, DMADirection dir
)
239 DMAAIOCB
*dbs
= qemu_aio_get(&dma_aiocb_info
, NULL
, cb
, opaque
);
/* The last trace argument is a flag: non-zero for TO_DEVICE transfers. */
241 trace_dma_blk_io(dbs
, io_func_opaque
, offset
, (dir
== DMA_DIRECTION_TO_DEVICE
));
246 dbs
->offset
= offset
;
/* Start walking the scatter/gather list from its first byte. */
248 dbs
->sg_cur_index
= 0;
249 dbs
->sg_cur_byte
= 0;
251 dbs
->io_func
= io_func
;
252 dbs
->io_func_opaque
= io_func_opaque
;
/* One iovec slot per scatter/gather entry as the initial capacity. */
254 qemu_iovec_init(&dbs
->iov
, sg
->nsg
);
/*
 * I/O function used for reads: treats 'opaque' as a BlockBackend and
 * issues blk_aio_preadv() at @offset on @iov with flags 0, forwarding
 * @cb and @cb_opaque.  Returns the BlockAIOCB from that call.
 *
 * NOTE(review): the end of the parameter list (where 'opaque' is
 * declared) is not visible in this extract.
 */
261 BlockAIOCB
*dma_blk_read_io_func(int64_t offset
, QEMUIOVector
*iov
,
262 BlockCompletionFunc
*cb
, void *cb_opaque
,
265 BlockBackend
*blk
= opaque
;
266 return blk_aio_preadv(blk
, offset
, iov
, 0, cb
, cb_opaque
);
/*
 * Read from @blk into the guest memory described by @sg.
 *
 * Thin wrapper around dma_blk_io(): uses @blk's AioContext,
 * dma_blk_read_io_func with @blk as its opaque argument, and
 * DMA_DIRECTION_FROM_DEVICE.  @offset, @align, @cb and @opaque are
 * passed through unchanged.  Returns what dma_blk_io() returns.
 */
269 BlockAIOCB
*dma_blk_read(BlockBackend
*blk
,
270 QEMUSGList
*sg
, uint64_t offset
, uint32_t align
,
271 void (*cb
)(void *opaque
, int ret
), void *opaque
)
273 return dma_blk_io(blk_get_aio_context(blk
), sg
, offset
, align
,
274 dma_blk_read_io_func
, blk
, cb
, opaque
,
275 DMA_DIRECTION_FROM_DEVICE
);
/*
 * I/O function used for writes: treats 'opaque' as a BlockBackend and
 * issues blk_aio_pwritev() at @offset on @iov with flags 0, forwarding
 * @cb and @cb_opaque.  Returns the BlockAIOCB from that call.
 *
 * NOTE(review): the end of the parameter list (where 'opaque' is
 * declared) is not visible in this extract.
 */
279 BlockAIOCB
*dma_blk_write_io_func(int64_t offset
, QEMUIOVector
*iov
,
280 BlockCompletionFunc
*cb
, void *cb_opaque
,
283 BlockBackend
*blk
= opaque
;
284 return blk_aio_pwritev(blk
, offset
, iov
, 0, cb
, cb_opaque
);
/*
 * Write the guest memory described by @sg to @blk.
 *
 * Thin wrapper around dma_blk_io(): uses @blk's AioContext,
 * dma_blk_write_io_func with @blk as its opaque argument, and
 * DMA_DIRECTION_TO_DEVICE.  @offset, @align, @cb and @opaque are
 * passed through unchanged.  Returns what dma_blk_io() returns.
 */
287 BlockAIOCB
*dma_blk_write(BlockBackend
*blk
,
288 QEMUSGList
*sg
, uint64_t offset
, uint32_t align
,
289 void (*cb
)(void *opaque
, int ret
), void *opaque
)
291 return dma_blk_io(blk_get_aio_context(blk
), sg
, offset
, align
,
292 dma_blk_write_io_func
, blk
, cb
, opaque
,
293 DMA_DIRECTION_TO_DEVICE
);
/*
 * Shared implementation of dma_buf_read() and dma_buf_write(): moves
 * data between a host buffer and the guest regions listed in @sg, in
 * direction @dir.
 *
 * The request length is clamped to sg->size.  Entries are taken from
 * sg->sg[] in order; for each one at most entry.len bytes are
 * transferred with dma_memory_rw(), and the results are OR-ed into one
 * MemTxResult.  The value of 'xresidual' (initialised to sg->size) is
 * stored through @residual.
 *
 * NOTE(review): the end of the parameter list, the declarations of
 * 'ptr' and 'sg_cur_index', the loop header, the per-iteration updates
 * and the return are not visible in this extract.
 */
297 static MemTxResult
dma_buf_rw(void *buf
, dma_addr_t len
, dma_addr_t
*residual
,
298 QEMUSGList
*sg
, DMADirection dir
,
302 dma_addr_t xresidual
;
304 MemTxResult res
= MEMTX_OK
;
306 xresidual
= sg
->size
;
/* Never transfer more than the scatter/gather list describes. */
308 len
= MIN(len
, xresidual
);
/* Take the next entry by value and advance the cursor. */
310 ScatterGatherEntry entry
= sg
->sg
[sg_cur_index
++];
311 dma_addr_t xfer
= MIN(len
, entry
.len
);
312 res
|= dma_memory_rw(sg
->as
, entry
.base
, ptr
, xfer
, dir
, attrs
);
319 *residual
= xresidual
;
/*
 * Wrapper around dma_buf_rw() with DMA_DIRECTION_FROM_DEVICE.
 * @ptr, @len, @residual, @sg and @attrs are forwarded unchanged and the
 * MemTxResult of dma_buf_rw() is returned.
 */
324 MemTxResult
dma_buf_read(void *ptr
, dma_addr_t len
, dma_addr_t
*residual
,
325 QEMUSGList
*sg
, MemTxAttrs attrs
)
327 return dma_buf_rw(ptr
, len
, residual
, sg
, DMA_DIRECTION_FROM_DEVICE
, attrs
);
/*
 * Wrapper around dma_buf_rw() with DMA_DIRECTION_TO_DEVICE.
 * @ptr, @len, @residual, @sg and @attrs are forwarded unchanged and the
 * MemTxResult of dma_buf_rw() is returned.
 */
330 MemTxResult
dma_buf_write(void *ptr
, dma_addr_t len
, dma_addr_t
*residual
,
331 QEMUSGList
*sg
, MemTxAttrs attrs
)
333 return dma_buf_rw(ptr
, len
, residual
, sg
, DMA_DIRECTION_TO_DEVICE
, attrs
);
/*
 * Begin block accounting for a scatter/gather request: calls
 * block_acct_start() on @blk's statistics with @cookie, the total
 * sg->size as the byte count, and the accounting @type.
 */
336 void dma_acct_start(BlockBackend
*blk
, BlockAcctCookie
*cookie
,
337 QEMUSGList
*sg
, enum BlockAcctType type
)
339 block_acct_start(blk_get_stats(blk
), cookie
, sg
->size
, type
);
/*
 * Compute a mask of the form 2^n - 1 for a range beginning at @start.
 *
 * Two candidates are derived, each capped by max_mask (all ones, or
 * 2^max_addr_bits - 1 when @max_addr_bits != 64):
 *  - alignment_mask: (lowest set bit of @start) - 1, or max_mask when
 *    @start is 0;
 *  - size_mask: addr_mask = @end - @start.
 * If alignment_mask <= size_mask it is returned.  Otherwise the result
 * is derived from addr_mask + 1 using clz64().
 *
 * NOTE(review): addr_mask + 1 is used as the range size, so @end looks
 * like an inclusive bound - confirm.  The body of the
 * addr_mask == UINT64_MAX branch is not visible in this extract.
 */
342 uint64_t dma_aligned_pow2_mask(uint64_t start
, uint64_t end
, int max_addr_bits
)
344 uint64_t max_mask
= UINT64_MAX
, addr_mask
= end
- start
;
345 uint64_t alignment_mask
, size_mask
;
/* A shift by 64 would be undefined, so 64 bits keeps the all-ones default. */
347 if (max_addr_bits
!= 64) {
348 max_mask
= (1ULL << max_addr_bits
) - 1;
/* start & -start isolates the lowest set bit of start. */
351 alignment_mask
= start
? (start
& -start
) - 1 : max_mask
;
352 alignment_mask
= MIN(alignment_mask
, max_mask
);
353 size_mask
= MIN(addr_mask
, max_mask
);
355 if (alignment_mask
<= size_mask
) {
356 /* Increase the alignment of start */
357 return alignment_mask
;
359 /* Find the largest page mask from size */
/* Special case: addr_mask + 1 would wrap around to 0 here. */
360 if (addr_mask
== UINT64_MAX
) {
363 return (1ULL << (63 - clz64(addr_mask
+ 1))) - 1;