/*
 * Copyright (c) 2018 Citrix Systems Inc.
 * (c) Gerd Hoffmann <kraxel@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; under version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

struct ioreq {
    blkif_request_t     req;
    int16_t             status;

    /* parsed request */
    off_t               start;
    QEMUIOVector        v;
    void                *buf;
    size_t              size;
    int                 presync;

    /* aio status */
    int                 aio_inflight;
    int                 aio_errors;

    struct XenBlkDev    *blkdev;
    QLIST_ENTRY(ioreq)   list;
    BlockAcctCookie     acct;
};

#define MAX_RING_PAGE_ORDER 4

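/*
 * The shared request ring may span multiple grant pages: with
 * MAX_RING_PAGE_ORDER 4, ring_ref[] below holds up to 1 << 4 = 16 grant
 * references, and nr_ring_ref records how many of them are in use.
 */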
struct XenBlkDev {
    struct XenLegacyDevice    xendev;  /* must be first */
    char                *params;
    char                *mode;
    char                *type;
    char                *dev;
    char                *devtype;
    const char          *fileproto;
    unsigned int        ring_ref[1 << MAX_RING_PAGE_ORDER];
    unsigned int        nr_ring_ref;
    int                 protocol;
    int64_t             file_blk;
    int64_t             file_size;
    blkif_back_rings_t  rings;
    int                 more_work;

    /* request lists */
    QLIST_HEAD(inflight_head, ioreq) inflight;
    QLIST_HEAD(finished_head, ioreq) finished;
    QLIST_HEAD(freelist_head, ioreq) freelist;
    int                 requests_total;
    int                 requests_inflight;
    int                 requests_finished;
    unsigned int        max_requests;

    gboolean            feature_discard;

    /* qemu block driver */
    BlockBackend        *blk;
    QEMUBH              *bh;

    IOThread            *iothread;
    AioContext          *ctx;
};

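/*
 * Each guest request is tracked by a struct ioreq.  An ioreq moves from the
 * freelist (or a fresh allocation) onto the inflight list when it is picked
 * up, onto the finished list once its AIO completes, and back onto the
 * freelist after the response has been pushed to the ring.  The ioreq_*
 * helpers below implement that life cycle.
 */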
static void ioreq_reset(struct ioreq *ioreq)
{
    memset(&ioreq->req, 0, sizeof(ioreq->req));
    ioreq->status = 0;
    ioreq->start = 0;
    ioreq->buf = NULL;
    ioreq->size = 0;
    ioreq->presync = 0;

    ioreq->aio_inflight = 0;
    ioreq->aio_errors = 0;

    ioreq->blkdev = NULL;
    memset(&ioreq->list, 0, sizeof(ioreq->list));
    memset(&ioreq->acct, 0, sizeof(ioreq->acct));

    qemu_iovec_reset(&ioreq->v);
}

static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq = NULL;

    if (QLIST_EMPTY(&blkdev->freelist)) {
        if (blkdev->requests_total >= blkdev->max_requests) {
            goto out;
        }
        /* allocate new struct */
        ioreq = g_malloc0(sizeof(*ioreq));
        ioreq->blkdev = blkdev;
        blkdev->requests_total++;
        qemu_iovec_init(&ioreq->v, 1);
    } else {
        /* get one from freelist */
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
    }
    QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
    blkdev->requests_inflight++;

out:
    return ioreq;
}

static void ioreq_finish(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
    blkdev->requests_inflight--;
    blkdev->requests_finished++;
}

static void ioreq_release(struct ioreq *ioreq, bool finish)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    ioreq_reset(ioreq);
    ioreq->blkdev = blkdev;
    QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
    if (finish) {
        blkdev->requests_finished--;
    } else {
        blkdev->requests_inflight--;
    }
}

/*
 * translate request into iovec + start offset
 * do sanity checks along the way
 */
static int ioreq_parse(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    struct XenLegacyDevice *xendev = &blkdev->xendev;
    size_t len;
    int i;

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        break;
    case BLKIF_OP_FLUSH_DISKCACHE:
        ioreq->presync = 1;
        if (!ioreq->req.nr_segments) {
            return 0;
        }
        /* fall through */
    case BLKIF_OP_WRITE:
        break;
    case BLKIF_OP_DISCARD:
        return 0;
    default:
        error_report("error: unknown operation (%d)", ioreq->req.operation);
        goto err;
    }

    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
        error_report("error: write req for ro device");
        goto err;
    }

    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
    for (i = 0; i < ioreq->req.nr_segments; i++) {
        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
            error_report("error: nr_segments too big");
            goto err;
        }
        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
            error_report("error: first > last sector");
            goto err;
        }
        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
            error_report("error: page crossing");
            goto err;
        }

        len = (ioreq->req.seg[i].last_sect -
               ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
        ioreq->size += len;
    }
    if (ioreq->start + ioreq->size > blkdev->file_size) {
        error_report("error: access beyond end of file");
        goto err;
    }
    return 0;

err:
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

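/*
 * Copy the data for this request between the guest's granted pages and
 * ioreq->buf using grant copy operations.  For reads the local buffer is the
 * source and the guest pages are the destination; for writes (and flushes
 * carrying data) it is the other way round, as selected by to_domain below.
 */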
static int ioreq_grant_copy(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    struct XenLegacyDevice *xendev = &blkdev->xendev;
    XenGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int i, count, rc;
    int64_t file_blk = blkdev->file_blk;
    bool to_domain = (ioreq->req.operation == BLKIF_OP_READ);
    void *virt = ioreq->buf;

    if (ioreq->req.nr_segments == 0) {
        return 0;
    }

    count = ioreq->req.nr_segments;

    for (i = 0; i < count; i++) {
        if (to_domain) {
            segs[i].dest.foreign.ref = ioreq->req.seg[i].gref;
            segs[i].dest.foreign.offset = ioreq->req.seg[i].first_sect *
                file_blk;
            segs[i].source.virt = virt;
        } else {
            segs[i].source.foreign.ref = ioreq->req.seg[i].gref;
            segs[i].source.foreign.offset = ioreq->req.seg[i].first_sect *
                file_blk;
            segs[i].dest.virt = virt;
        }
        segs[i].len = (ioreq->req.seg[i].last_sect
                       - ioreq->req.seg[i].first_sect + 1) * file_blk;
        virt += segs[i].len;
    }

    rc = xen_be_copy_grant_refs(xendev, to_domain, segs, count);
    if (rc) {
        error_report("failed to copy data %d", rc);
        ioreq->aio_errors++;
        return -1;
    }

    return rc;
}

static int ioreq_runio_qemu_aio(struct ioreq *ioreq);

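/*
 * AIO completion callback.  A single guest request may be backed by several
 * AIO operations; aio_inflight counts them.  A presync flush completes first
 * and resubmits the request via ioreq_runio_qemu_aio().  When the last AIO
 * operation completes, read data (if any) is grant-copied back to the guest,
 * the request is moved to the finished list and the bottom half is scheduled
 * so the response gets pushed onto the ring.
 */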
static void qemu_aio_complete(void *opaque, int ret)
{
    struct ioreq *ioreq = opaque;
    struct XenBlkDev *blkdev = ioreq->blkdev;
    struct XenLegacyDevice *xendev = &blkdev->xendev;

    aio_context_acquire(blkdev->ctx);

    if (ret != 0) {
        error_report("%s I/O error",
                     ioreq->req.operation == BLKIF_OP_READ ?
                     "read" : "write");
        ioreq->aio_errors++;
    }

    ioreq->aio_inflight--;
    if (ioreq->presync) {
        ioreq->presync = 0;
        ioreq_runio_qemu_aio(ioreq);
        goto done;
    }
    if (ioreq->aio_inflight > 0) {
        goto done;
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        /* in case of failure ioreq->aio_errors is increased */
        if (ret == 0) {
            ioreq_grant_copy(ioreq);
        }
        qemu_vfree(ioreq->buf);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }
        qemu_vfree(ioreq->buf);
        break;
    default:
        break;
    }

    ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
    ioreq_finish(ioreq);

    switch (ioreq->req.operation) {
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }
        /* fall through */
    case BLKIF_OP_READ:
        if (ioreq->status == BLKIF_RSP_OKAY) {
            block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
        } else {
            block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
        }
        break;
    case BLKIF_OP_DISCARD:
    default:
        break;
    }
    qemu_bh_schedule(blkdev->bh);

done:
    aio_context_release(blkdev->ctx);
}

static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
                              uint64_t nr_sectors)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int64_t byte_offset;
    int byte_chunk;
    uint64_t byte_remaining, limit;
    uint64_t sec_start = sector_number;
    uint64_t sec_count = nr_sectors;

    /* Wrap around, or overflowing byte limit? */
    if (sec_start + sec_count < sec_count ||
        sec_start + sec_count > INT64_MAX >> BDRV_SECTOR_BITS) {
        return false;
    }

    limit = BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS;
    byte_offset = sec_start << BDRV_SECTOR_BITS;
    byte_remaining = sec_count << BDRV_SECTOR_BITS;

    do {
        byte_chunk = byte_remaining > limit ? limit : byte_remaining;
        ioreq->aio_inflight++;
        blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
                         qemu_aio_complete, ioreq);
        byte_remaining -= byte_chunk;
        byte_offset += byte_chunk;
    } while (byte_remaining > 0);

    return true;
}

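/*
 * Submit a parsed request to the block layer.  A bounce buffer of
 * ioreq->size bytes is allocated; for writes and flushes the payload is
 * grant-copied from the guest before submission, while read data is copied
 * back to the guest in qemu_aio_complete().  Discards are split into chunks
 * the block layer can accept via blk_split_discard() above.
 */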
static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    ioreq->buf = qemu_memalign(XC_PAGE_SIZE, ioreq->size);
    if (ioreq->req.nr_segments &&
        (ioreq->req.operation == BLKIF_OP_WRITE ||
         ioreq->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
        ioreq_grant_copy(ioreq)) {
        qemu_vfree(ioreq->buf);
        goto err;
    }

    ioreq->aio_inflight++;
    if (ioreq->presync) {
        blk_aio_flush(ioreq->blkdev->blk, qemu_aio_complete, ioreq);
        return 0;
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size, BLOCK_ACCT_READ);
        ioreq->aio_inflight++;
        blk_aio_preadv(blkdev->blk, ioreq->start, &ioreq->v, 0,
                       qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }

        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size,
                         ioreq->req.operation == BLKIF_OP_WRITE ?
                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
        ioreq->aio_inflight++;
        blk_aio_pwritev(blkdev->blk, ioreq->start, &ioreq->v, 0,
                        qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_DISCARD:
    {
        struct blkif_request_discard *req = (void *)&ioreq->req;
        if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
            goto err;
        }
        break;
    }
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    qemu_aio_complete(ioreq, 0);

    return 0;

err:
    ioreq_finish(ioreq);
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

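/*
 * Build the response for one finished request directly in the shared ring,
 * using whichever ring layout (native, x86_32 or x86_64) the frontend
 * negotiated.  The return value says whether the frontend needs to be
 * notified; have_requests/more_work flag further work for the bottom half.
 */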
static int blk_send_response_one(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int send_notify = 0;
    int have_requests = 0;
    blkif_response_t *resp;

    /* Place on the response ring for the relevant domain. */
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        resp = (blkif_response_t *)RING_GET_RESPONSE(
            &blkdev->rings.native,
            blkdev->rings.native.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_32:
        resp = (blkif_response_t *)RING_GET_RESPONSE(
            &blkdev->rings.x86_32_part,
            blkdev->rings.x86_32_part.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_64:
        resp = (blkif_response_t *)RING_GET_RESPONSE(
            &blkdev->rings.x86_64_part,
            blkdev->rings.x86_64_part.rsp_prod_pvt);
        break;
    default:
        return 0;
    }

    resp->id = ioreq->req.id;
    resp->operation = ioreq->req.operation;
    resp->status = ioreq->status;

    blkdev->rings.common.rsp_prod_pvt++;

    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
        /*
         * Tail check for pending requests. Allows frontend to avoid
         * notifications if requests are already in flight (lower
         * overheads and promotes batching).
         */
        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
        have_requests = 1;
    }

    if (have_requests) {
        blkdev->more_work++;
    }
    return send_notify;
}

/* walk finished list, send outstanding responses, free requests */
static void blk_send_response_all(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq;
    int send_notify = 0;

    while (!QLIST_EMPTY(&blkdev->finished)) {
        ioreq = QLIST_FIRST(&blkdev->finished);
        send_notify += blk_send_response_one(ioreq);
        ioreq_release(ioreq, true);
    }
    if (send_notify) {
        xen_pv_send_notify(&blkdev->xendev);
    }
}

static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq,
                           RING_IDX rc)
{
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        memcpy(&ioreq->req, RING_GET_REQUEST(&blkdev->rings.native, rc),
               sizeof(ioreq->req));
        break;
    case BLKIF_PROTOCOL_X86_32:
        blkif_get_x86_32_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
        break;
    case BLKIF_PROTOCOL_X86_64:
        blkif_get_x86_64_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));
        break;
    }
    /* Prevent the compiler from accessing the on-ring fields instead. */
    barrier();
    return 0;
}

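/*
 * Main request loop, run from the bottom half: push out any pending
 * responses, then consume new requests from the ring up to req_prod, parse
 * each one and either submit it as AIO or fail it immediately with an error
 * response.
 */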
static void blk_handle_requests(struct XenBlkDev *blkdev)
{
    RING_IDX rc, rp;
    struct ioreq *ioreq;

    blkdev->more_work = 0;

    rc = blkdev->rings.common.req_cons;
    rp = blkdev->rings.common.sring->req_prod;
    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */

    blk_send_response_all(blkdev);
    while (rc != rp) {
        /* pull request from ring */
        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
            break;
        }
        ioreq = ioreq_start(blkdev);
        if (ioreq == NULL) {
            blkdev->more_work++;
            break;
        }
        blk_get_request(blkdev, ioreq, rc);
        blkdev->rings.common.req_cons = ++rc;

        /* parse them */
        if (ioreq_parse(ioreq) != 0) {

            switch (ioreq->req.operation) {
            case BLKIF_OP_READ:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_READ);
                break;
            case BLKIF_OP_WRITE:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_WRITE);
                break;
            case BLKIF_OP_FLUSH_DISKCACHE:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_FLUSH);
                break;
            default:
                break;
            }

            if (blk_send_response_one(ioreq)) {
                xen_pv_send_notify(&blkdev->xendev);
            }
            ioreq_release(ioreq, false);
            continue;
        }

        ioreq_runio_qemu_aio(ioreq);
    }

    if (blkdev->more_work && blkdev->requests_inflight < blkdev->max_requests) {
        qemu_bh_schedule(blkdev->bh);
    }
}

static void blk_bh(void *opaque)
{
    struct XenBlkDev *blkdev = opaque;

    aio_context_acquire(blkdev->ctx);
    blk_handle_requests(blkdev);
    aio_context_release(blkdev->ctx);
}

static void blk_alloc(struct XenLegacyDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    Error *err = NULL;

    trace_xen_disk_alloc(xendev->name);

    QLIST_INIT(&blkdev->inflight);
    QLIST_INIT(&blkdev->finished);
    QLIST_INIT(&blkdev->freelist);

    blkdev->iothread = iothread_create(xendev->name, &err);
    assert(!err);

    blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
    blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
}

static int blk_free(struct XenLegacyDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    struct ioreq *ioreq;

    trace_xen_disk_free(xendev->name);

    blk_disconnect(xendev);

    while (!QLIST_EMPTY(&blkdev->freelist)) {
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
        qemu_iovec_destroy(&ioreq->v);
        g_free(ioreq);
    }

    g_free(blkdev->params);
    g_free(blkdev->mode);
    g_free(blkdev->type);
    g_free(blkdev->dev);
    g_free(blkdev->devtype);
    qemu_bh_delete(blkdev->bh);
    iothread_destroy(blkdev->iothread);
    return 0;
}

static void blk_event(struct XenLegacyDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    qemu_bh_schedule(blkdev->bh);
}