 * Copyright (c) 2018 Citrix Systems Inc.
 * (c) Gerd Hoffmann <kraxel@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; under version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qapi/error.h"

#include "hw/xen/xen_common.h"
#include "hw/block/xen_blkif.h"
#include "sysemu/block-backend.h"
#include "sysemu/iothread.h"
#include "xen-block.h"
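/*
 * Per-request state (struct ioreq, shown here abridged): each request
 * remembers which XenBlkDev it belongs to and is linked, via 'list',
 * onto exactly one of that device's inflight/finished/freelist queues.
 */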
    struct XenBlkDev *blkdev;
    QLIST_ENTRY(ioreq) list;
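/*
 * Per-device state (struct XenBlkDev, abridged): the event channel used to
 * signal the frontend, the grant references backing the shared ring, the
 * mapped back rings themselves, and the three request queues with counters
 * that bound how much work is kept in flight (max_requests is derived from
 * the ring size when the dataplane is started).
 */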
    XenEventChannel *event_channel;
    unsigned int *ring_ref;
    unsigned int nr_ring_ref;
    blkif_back_rings_t rings;

    QLIST_HEAD(inflight_head, ioreq) inflight;
    QLIST_HEAD(finished_head, ioreq) finished;
    QLIST_HEAD(freelist_head, ioreq) freelist;
    int requests_inflight;
    int requests_finished;
    unsigned int max_requests;
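/*
 * Request descriptors are recycled through the freelist rather than being
 * freed, so ioreq_reset() clears any state left over from the previous use
 * before a descriptor is handed out again.
 */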
static void ioreq_reset(struct ioreq *ioreq)
    memset(&ioreq->req, 0, sizeof(ioreq->req));

    ioreq->aio_inflight = 0;
    ioreq->aio_errors = 0;

    memset(&ioreq->list, 0, sizeof(ioreq->list));
    memset(&ioreq->acct, 0, sizeof(ioreq->acct));

    qemu_iovec_reset(&ioreq->v);
static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
    struct ioreq *ioreq = NULL;

    if (QLIST_EMPTY(&blkdev->freelist)) {
        if (blkdev->requests_total >= blkdev->max_requests) {
        /* allocate new struct */
        ioreq = g_malloc0(sizeof(*ioreq));
        ioreq->blkdev = blkdev;
        blkdev->requests_total++;
        qemu_iovec_init(&ioreq->v, 1);
        /* get one from freelist */
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);

    QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
    blkdev->requests_inflight++;
static void ioreq_finish(struct ioreq *ioreq)
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
    blkdev->requests_inflight--;
    blkdev->requests_finished++;
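/*
 * ioreq_finish() moves a request from the inflight queue to the finished
 * queue once its I/O is done; ioreq_release() then returns it to the
 * freelist, either after its response has been sent (finish == true) or
 * when it was rejected before any I/O was issued (finish == false).
 */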
static void ioreq_release(struct ioreq *ioreq, bool finish)
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    ioreq->blkdev = blkdev;
    QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
    if (finish) {
        blkdev->requests_finished--;
    } else {
        blkdev->requests_inflight--;
    }
/*
 * translate request into iovec + start offset
 * do sanity checks along the way
 */
static int ioreq_parse(struct ioreq *ioreq)
    struct XenBlkDev *blkdev = ioreq->blkdev;

    switch (ioreq->req.operation) {
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
    case BLKIF_OP_DISCARD:
        error_report("error: unknown operation (%d)", ioreq->req.operation);

    if (ioreq->req.operation != BLKIF_OP_READ &&
        blk_is_read_only(blkdev->blk)) {
        error_report("error: write req for ro device");

    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
    for (i = 0; i < ioreq->req.nr_segments; i++) {
        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
            error_report("error: nr_segments too big");
        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
            error_report("error: first > last sector");
        if (ioreq->req.seg[i].last_sect * blkdev->file_blk >= XC_PAGE_SIZE) {
            error_report("error: page crossing");

        len = (ioreq->req.seg[i].last_sect -
               ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;

    if (ioreq->start + ioreq->size > blkdev->file_size) {
        error_report("error: access beyond end of file");

    ioreq->status = BLKIF_RSP_ERROR;
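/*
 * Data moves between the guest and the local bounce buffer with grant
 * copies rather than by mapping the frontend's pages: for a read
 * (to_domain) the buffer is copied out to the granted pages, for a write
 * the granted pages are copied into the buffer, one segment at a time.
 */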
static int ioreq_grant_copy(struct ioreq *ioreq)
    struct XenBlkDev *blkdev = ioreq->blkdev;
    XenDevice *xendev = blkdev->xendev;
    XenDeviceGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int64_t file_blk = blkdev->file_blk;
    bool to_domain = (ioreq->req.operation == BLKIF_OP_READ);
    void *virt = ioreq->buf;
    Error *local_err = NULL;

    if (ioreq->req.nr_segments == 0) {

    count = ioreq->req.nr_segments;

        if (to_domain) {
            segs[i].dest.foreign.ref = ioreq->req.seg[i].gref;
            segs[i].dest.foreign.offset = ioreq->req.seg[i].first_sect *
                file_blk;
            segs[i].source.virt = virt;
        } else {
            segs[i].source.foreign.ref = ioreq->req.seg[i].gref;
            segs[i].source.foreign.offset = ioreq->req.seg[i].first_sect *
                file_blk;
            segs[i].dest.virt = virt;
        }
        segs[i].len = (ioreq->req.seg[i].last_sect -
                       ioreq->req.seg[i].first_sect + 1) * file_blk;

    xen_device_copy_grant_refs(xendev, to_domain, segs, count, &local_err);
        error_reportf_err(local_err, "failed to copy data: ");
static int ioreq_runio_qemu_aio(struct ioreq *ioreq);
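/*
 * Completion callback for every AIO issued on behalf of an ioreq.  The
 * aio_inflight counter tracks the outstanding sub-requests (an optional
 * presync flush, the data transfer, individual discard chunks); only when
 * it reaches zero is the response status computed, accounting finalised
 * and the bottom half rescheduled to send the response.
 */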
static void qemu_aio_complete(void *opaque, int ret)
    struct ioreq *ioreq = opaque;
    struct XenBlkDev *blkdev = ioreq->blkdev;

    aio_context_acquire(blkdev->ctx);

        error_report("%s I/O error",
                     ioreq->req.operation == BLKIF_OP_READ ?
                     "read" : "write");

    ioreq->aio_inflight--;
    if (ioreq->presync) {
        ioreq_runio_qemu_aio(ioreq);
    if (ioreq->aio_inflight > 0) {

    switch (ioreq->req.operation) {
        /* in case of failure ioreq->aio_errors is increased */
            ioreq_grant_copy(ioreq);
        qemu_vfree(ioreq->buf);
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
        qemu_vfree(ioreq->buf);

    ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;

    switch (ioreq->req.operation) {
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
        if (ioreq->status == BLKIF_RSP_OKAY) {
            block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
        } else {
            block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
        }
    case BLKIF_OP_DISCARD:

    qemu_bh_schedule(blkdev->bh);

    aio_context_release(blkdev->ctx);
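/*
 * A discard request may cover more sectors than the block layer accepts in
 * one operation, so after checking the range for overflow it is issued as
 * a series of blk_aio_pdiscard() calls of at most BDRV_REQUEST_MAX_SECTORS
 * worth of bytes each, bumping aio_inflight once per chunk.
 */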
static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
                              uint64_t nr_sectors)
    struct XenBlkDev *blkdev = ioreq->blkdev;
    uint64_t byte_remaining, limit;
    uint64_t sec_start = sector_number;
    uint64_t sec_count = nr_sectors;

    /* Wrap around, or overflowing byte limit? */
    if (sec_start + sec_count < sec_count ||
        sec_start + sec_count > INT64_MAX / blkdev->file_blk) {

    limit = BDRV_REQUEST_MAX_SECTORS * blkdev->file_blk;
    byte_offset = sec_start * blkdev->file_blk;
    byte_remaining = sec_count * blkdev->file_blk;

    do {
        byte_chunk = byte_remaining > limit ? limit : byte_remaining;
        ioreq->aio_inflight++;
        blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
                         qemu_aio_complete, ioreq);
        byte_remaining -= byte_chunk;
        byte_offset += byte_chunk;
    } while (byte_remaining > 0);
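/*
 * Turn a parsed ioreq into block-layer AIO: allocate a page-aligned bounce
 * buffer, grant-copy the guest data in for writes, optionally issue a
 * preceding flush (presync), and then start the read/write/flush/discard
 * proper with qemu_aio_complete() as the completion callback.
 */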
static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
    struct XenBlkDev *blkdev = ioreq->blkdev;

    ioreq->buf = qemu_memalign(XC_PAGE_SIZE, ioreq->size);
    if (ioreq->req.nr_segments &&
        (ioreq->req.operation == BLKIF_OP_WRITE ||
         ioreq->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
        ioreq_grant_copy(ioreq)) {
        qemu_vfree(ioreq->buf);

    ioreq->aio_inflight++;
    if (ioreq->presync) {
        blk_aio_flush(ioreq->blkdev->blk, qemu_aio_complete, ioreq);

    switch (ioreq->req.operation) {
        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size, BLOCK_ACCT_READ);
        ioreq->aio_inflight++;
        blk_aio_preadv(blkdev->blk, ioreq->start, &ioreq->v, 0,
                       qemu_aio_complete, ioreq);
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size,
                         ioreq->req.operation == BLKIF_OP_WRITE ?
                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
        ioreq->aio_inflight++;
        blk_aio_pwritev(blkdev->blk, ioreq->start, &ioreq->v, 0,
                        qemu_aio_complete, ioreq);
    case BLKIF_OP_DISCARD:
        struct blkif_request_discard *req = (void *)&ioreq->req;
        if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {

        /* unknown operation (shouldn't happen -- parse catches this) */

    qemu_aio_complete(ioreq, 0);

    ioreq->status = BLKIF_RSP_ERROR;
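/*
 * Build the response on the shared ring in whichever layout the frontend
 * negotiated (native, x86_32 or x86_64).  The return value, as used by the
 * callers below, indicates whether the frontend needs an event-channel
 * notification; requests already queued behind the response are noted so
 * that processing can continue without waiting for another event.
 */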
static int blk_send_response_one(struct ioreq *ioreq)
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int have_requests = 0;
    blkif_response_t *resp;

    /* Place on the response ring for the relevant domain. */
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        resp = (blkif_response_t *)RING_GET_RESPONSE(
            &blkdev->rings.native,
            blkdev->rings.native.rsp_prod_pvt);
    case BLKIF_PROTOCOL_X86_32:
        resp = (blkif_response_t *)RING_GET_RESPONSE(
            &blkdev->rings.x86_32_part,
            blkdev->rings.x86_32_part.rsp_prod_pvt);
    case BLKIF_PROTOCOL_X86_64:
        resp = (blkif_response_t *)RING_GET_RESPONSE(
            &blkdev->rings.x86_64_part,
            blkdev->rings.x86_64_part.rsp_prod_pvt);

    resp->id = ioreq->req.id;
    resp->operation = ioreq->req.operation;
    resp->status = ioreq->status;

    blkdev->rings.common.rsp_prod_pvt++;

    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
        /*
         * Tail check for pending requests. Allows frontend to avoid
         * notifications if requests are already in flight (lower
         * overheads and promotes batching).
         */
        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
/* walk finished list, send outstanding responses, free requests */
static void blk_send_response_all(struct XenBlkDev *blkdev)
    while (!QLIST_EMPTY(&blkdev->finished)) {
        ioreq = QLIST_FIRST(&blkdev->finished);
        send_notify += blk_send_response_one(ioreq);
        ioreq_release(ioreq, true);

        Error *local_err = NULL;

        xen_device_notify_event_channel(blkdev->xendev,
                                        blkdev->event_channel,
                                        &local_err);
            error_report_err(local_err);
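/*
 * Copy the next request off the shared ring into ioreq->req, converting
 * from the 32-bit or 64-bit x86 layout when necessary, so that parsing
 * never re-reads fields the frontend could still be modifying.
 */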
static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq,
                           RING_IDX rc)
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        memcpy(&ioreq->req, RING_GET_REQUEST(&blkdev->rings.native, rc),
               sizeof(ioreq->req));
    case BLKIF_PROTOCOL_X86_32:
        blkif_get_x86_32_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
    case BLKIF_PROTOCOL_X86_64:
        blkif_get_x86_64_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));

    /* Prevent the compiler from accessing the on-ring fields instead. */
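/*
 * Main request loop, run from the bottom half: flush any finished
 * responses, then consume requests between req_cons and req_prod, bailing
 * out if the consumer index looks corrupt.  Requests that fail parsing are
 * answered with an error response immediately; valid ones are handed to
 * ioreq_runio_qemu_aio().  If work is left over and there is still room
 * below max_requests, the bottom half is rescheduled.
 */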
static void blk_handle_requests(struct XenBlkDev *blkdev)
    blkdev->more_work = 0;

    rc = blkdev->rings.common.req_cons;
    rp = blkdev->rings.common.sring->req_prod;
    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */

    blk_send_response_all(blkdev);
        /* pull request from ring */
        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
        ioreq = ioreq_start(blkdev);
        blk_get_request(blkdev, ioreq, rc);
        blkdev->rings.common.req_cons = ++rc;

        if (ioreq_parse(ioreq) != 0) {
            switch (ioreq->req.operation) {
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_READ);
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_WRITE);
            case BLKIF_OP_FLUSH_DISKCACHE:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_FLUSH);

            if (blk_send_response_one(ioreq)) {
                Error *local_err = NULL;

                xen_device_notify_event_channel(blkdev->xendev,
                                                blkdev->event_channel,
                                                &local_err);
                    error_report_err(local_err);
            ioreq_release(ioreq, false);

        ioreq_runio_qemu_aio(ioreq);

    if (blkdev->more_work && blkdev->requests_inflight < blkdev->max_requests) {
        qemu_bh_schedule(blkdev->bh);
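/*
 * All ring processing happens in the bottom half, within the dataplane's
 * AioContext; the event-channel handler below does nothing but schedule
 * that bottom half.
 */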
static void blk_bh(void *opaque)
    struct XenBlkDev *blkdev = opaque;

    aio_context_acquire(blkdev->ctx);
    blk_handle_requests(blkdev);
    aio_context_release(blkdev->ctx);
static void blk_event(void *opaque)
    struct XenBlkDev *blkdev = opaque;

    qemu_bh_schedule(blkdev->bh);
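/*
 * When an IOThread is supplied the dataplane runs in that thread's
 * AioContext (and takes a reference on the IOThread); otherwise it falls
 * back to the main loop context.  The bottom half driving the ring is
 * created in whichever context was chosen.
 */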
struct XenBlkDev *xen_block_dataplane_create(XenDevice *xendev,
                                             BlockConf *conf,
                                             IOThread *iothread)
    struct XenBlkDev *blkdev = g_new0(struct XenBlkDev, 1);

    blkdev->xendev = xendev;
    blkdev->file_blk = conf->logical_block_size;
    blkdev->blk = conf->blk;
    blkdev->file_size = blk_getlength(blkdev->blk);

    QLIST_INIT(&blkdev->inflight);
    QLIST_INIT(&blkdev->finished);
    QLIST_INIT(&blkdev->freelist);

    if (iothread) {
        blkdev->iothread = iothread;
        object_ref(OBJECT(blkdev->iothread));
        blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
    } else {
        blkdev->ctx = qemu_get_aio_context();
    }

    blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
void xen_block_dataplane_destroy(struct XenBlkDev *blkdev)
    while (!QLIST_EMPTY(&blkdev->freelist)) {
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
        qemu_iovec_destroy(&ioreq->v);

    qemu_bh_delete(blkdev->bh);
    if (blkdev->iothread) {
        object_unref(OBJECT(blkdev->iothread));
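/*
 * Stopping is the reverse of starting: move the BlockBackend back to the
 * main loop's AioContext, unbind the event channel, unmap the shared ring
 * and release the ring-ref array.
 */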
void xen_block_dataplane_stop(struct XenBlkDev *blkdev)
    aio_context_acquire(blkdev->ctx);
    blk_set_aio_context(blkdev->blk, qemu_get_aio_context());
    aio_context_release(blkdev->ctx);

    xendev = blkdev->xendev;

    if (blkdev->event_channel) {
        Error *local_err = NULL;

        xen_device_unbind_event_channel(xendev, blkdev->event_channel,
                                        &local_err);
        blkdev->event_channel = NULL;
            error_report_err(local_err);

        Error *local_err = NULL;

        xen_device_unmap_grant_refs(xendev, blkdev->sring,
                                    blkdev->nr_ring_ref, &local_err);
        blkdev->sring = NULL;
            error_report_err(local_err);

    g_free(blkdev->ring_ref);
    blkdev->ring_ref = NULL;
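/*
 * Wire the backend up from the parameters negotiated with the frontend:
 * map the grant references that make up the shared ring, size max_requests
 * from the ring via __CONST_RING_SIZE(), bind the event channel to
 * blk_event(), and finally move the BlockBackend into the dataplane's
 * AioContext.  On failure the partially initialised state is torn down
 * with xen_block_dataplane_stop().
 */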
void xen_block_dataplane_start(struct XenBlkDev *blkdev,
                               const unsigned int ring_ref[],
                               unsigned int nr_ring_ref,
                               unsigned int event_channel,
                               unsigned int protocol,
                               Error **errp)
    XenDevice *xendev = blkdev->xendev;
    Error *local_err = NULL;
    unsigned int ring_size;

    blkdev->nr_ring_ref = nr_ring_ref;
    blkdev->ring_ref = g_new(unsigned int, nr_ring_ref);

    for (i = 0; i < nr_ring_ref; i++) {
        blkdev->ring_ref[i] = ring_ref[i];

    blkdev->protocol = protocol;

    ring_size = XC_PAGE_SIZE * blkdev->nr_ring_ref;
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        blkdev->max_requests = __CONST_RING_SIZE(blkif, ring_size);
    case BLKIF_PROTOCOL_X86_32:
        blkdev->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
    case BLKIF_PROTOCOL_X86_64:
        blkdev->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
        error_setg(errp, "unknown protocol %u", blkdev->protocol);
    xen_device_set_max_grant_refs(xendev, blkdev->nr_ring_ref,
                                  &local_err);
        error_propagate(errp, local_err);

    blkdev->sring = xen_device_map_grant_refs(xendev,
                                              blkdev->ring_ref,
                                              blkdev->nr_ring_ref,
                                              PROT_READ | PROT_WRITE,
                                              &local_err);
        error_propagate(errp, local_err);
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        blkif_sring_t *sring_native = blkdev->sring;

        BACK_RING_INIT(&blkdev->rings.native, sring_native, ring_size);
    case BLKIF_PROTOCOL_X86_32:
        blkif_x86_32_sring_t *sring_x86_32 = blkdev->sring;

        BACK_RING_INIT(&blkdev->rings.x86_32_part, sring_x86_32,
                       ring_size);
    case BLKIF_PROTOCOL_X86_64:
        blkif_x86_64_sring_t *sring_x86_64 = blkdev->sring;

        BACK_RING_INIT(&blkdev->rings.x86_64_part, sring_x86_64,
                       ring_size);

    blkdev->event_channel =
        xen_device_bind_event_channel(xendev, event_channel,
                                      blk_event, blkdev,
                                      &local_err);
        error_propagate(errp, local_err);
    aio_context_acquire(blkdev->ctx);
    blk_set_aio_context(blkdev->blk, blkdev->ctx);
    aio_context_release(blkdev->ctx);

    xen_block_dataplane_stop(blkdev);