hw/block/dataplane/xen-block.c
/*
 * Copyright (c) 2018 Citrix Systems Inc.
 * (c) Gerd Hoffmann <kraxel@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; under version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
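/*
 * Per-request state: the blkif request copied off the shared ring, the
 * parsed byte offset/iovec/bounce buffer, the number of AIO operations
 * still in flight, and the list the request currently sits on.
 */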
struct ioreq {
    blkif_request_t req;
    int16_t status;

    /* parsed request */
    off_t start;
    QEMUIOVector v;
    void *buf;
    size_t size;
    int presync;

    /* aio status */
    int aio_inflight;
    int aio_errors;

    struct XenBlkDev *blkdev;
    QLIST_ENTRY(ioreq) list;
    BlockAcctCookie acct;
};
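/* The shared ring may span multiple grant pages: up to 1 << 4 = 16. */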
#define MAX_RING_PAGE_ORDER 4
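/*
 * Per-device state: xenstore-provided configuration, the shared ring
 * mapping and protocol, the request lists and counters, and the QEMU
 * block backend plus the IOThread/AioContext the dataplane runs in.
 */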
struct XenBlkDev {
    struct XenLegacyDevice xendev; /* must be first */
    char *params;
    char *mode;
    char *type;
    char *dev;
    char *devtype;
    bool directiosafe;
    const char *fileproto;
    const char *filename;
    unsigned int ring_ref[1 << MAX_RING_PAGE_ORDER];
    unsigned int nr_ring_ref;
    void *sring;
    int64_t file_blk;
    int64_t file_size;
    int protocol;
    blkif_back_rings_t rings;
    int more_work;

    /* request lists */
    QLIST_HEAD(inflight_head, ioreq) inflight;
    QLIST_HEAD(finished_head, ioreq) finished;
    QLIST_HEAD(freelist_head, ioreq) freelist;
    int requests_total;
    int requests_inflight;
    int requests_finished;
    unsigned int max_requests;

    gboolean feature_discard;

    /* qemu block driver */
    DriveInfo *dinfo;
    BlockBackend *blk;
    QEMUBH *bh;

    IOThread *iothread;
    AioContext *ctx;
};
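/* Clear an ioreq for reuse; the iovec stays allocated and is only reset. */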
static void ioreq_reset(struct ioreq *ioreq)
{
    memset(&ioreq->req, 0, sizeof(ioreq->req));
    ioreq->status = 0;
    ioreq->start = 0;
    ioreq->buf = NULL;
    ioreq->size = 0;
    ioreq->presync = 0;

    ioreq->aio_inflight = 0;
    ioreq->aio_errors = 0;

    ioreq->blkdev = NULL;
    memset(&ioreq->list, 0, sizeof(ioreq->list));
    memset(&ioreq->acct, 0, sizeof(ioreq->acct));

    qemu_iovec_reset(&ioreq->v);
}
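/*
 * Get an ioreq from the freelist, or allocate a new one while the total
 * stays below max_requests; returns NULL once the limit is reached.
 * A successfully obtained ioreq is moved onto the inflight list.
 */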
static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq = NULL;

    if (QLIST_EMPTY(&blkdev->freelist)) {
        if (blkdev->requests_total >= blkdev->max_requests) {
            goto out;
        }
        /* allocate new struct */
        ioreq = g_malloc0(sizeof(*ioreq));
        ioreq->blkdev = blkdev;
        blkdev->requests_total++;
        qemu_iovec_init(&ioreq->v, 1);
    } else {
        /* get one from freelist */
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
    }
    QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
    blkdev->requests_inflight++;

out:
    return ioreq;
}
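/* Move a completed ioreq from the inflight list to the finished list. */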
static void ioreq_finish(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
    blkdev->requests_inflight--;
    blkdev->requests_finished++;
}
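/*
 * Return an ioreq to the freelist.  'finish' selects which counter to
 * drop: true for requests coming off the finished list, false for
 * requests that failed parsing and are still counted as inflight.
 */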
static void ioreq_release(struct ioreq *ioreq, bool finish)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    ioreq_reset(ioreq);
    ioreq->blkdev = blkdev;
    QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
    if (finish) {
        blkdev->requests_finished--;
    } else {
        blkdev->requests_inflight--;
    }
}
/*
 * translate request into iovec + start offset
 * do sanity checks along the way
 */
static int ioreq_parse(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    struct XenLegacyDevice *xendev = &blkdev->xendev;
    size_t len;
    int i;

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        break;
    case BLKIF_OP_FLUSH_DISKCACHE:
        ioreq->presync = 1;
        if (!ioreq->req.nr_segments) {
            return 0;
        }
        /* fall through */
    case BLKIF_OP_WRITE:
        break;
    case BLKIF_OP_DISCARD:
        return 0;
    default:
        error_report("error: unknown operation (%d)", ioreq->req.operation);
        goto err;
    }

    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
        error_report("error: write req for ro device");
        goto err;
    }

    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
    for (i = 0; i < ioreq->req.nr_segments; i++) {
        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
            error_report("error: nr_segments too big");
            goto err;
        }
        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
            error_report("error: first > last sector");
            goto err;
        }
        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
            error_report("error: page crossing");
            goto err;
        }

        len = (ioreq->req.seg[i].last_sect -
               ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
        ioreq->size += len;
    }
    if (ioreq->start + ioreq->size > blkdev->file_size) {
        error_report("error: access beyond end of file");
        goto err;
    }
    return 0;

err:
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}
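/*
 * Copy the request payload between the guest's granted pages and the
 * local bounce buffer (ioreq->buf) using grant-copy operations; the
 * direction depends on whether this is a read or a write/flush.
 */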
static int ioreq_grant_copy(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    struct XenLegacyDevice *xendev = &blkdev->xendev;
    XenGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int i, count, rc;
    int64_t file_blk = blkdev->file_blk;
    bool to_domain = (ioreq->req.operation == BLKIF_OP_READ);
    void *virt = ioreq->buf;

    if (ioreq->req.nr_segments == 0) {
        return 0;
    }

    count = ioreq->req.nr_segments;

    for (i = 0; i < count; i++) {
        if (to_domain) {
            segs[i].dest.foreign.ref = ioreq->req.seg[i].gref;
            segs[i].dest.foreign.offset = ioreq->req.seg[i].first_sect *
                file_blk;
            segs[i].source.virt = virt;
        } else {
            segs[i].source.foreign.ref = ioreq->req.seg[i].gref;
            segs[i].source.foreign.offset = ioreq->req.seg[i].first_sect *
                file_blk;
            segs[i].dest.virt = virt;
        }
        segs[i].len = (ioreq->req.seg[i].last_sect
                       - ioreq->req.seg[i].first_sect + 1) * file_blk;
        virt += segs[i].len;
    }

    rc = xen_be_copy_grant_refs(xendev, to_domain, segs, count);

    if (rc) {
        error_report("failed to copy data %d", rc);
        ioreq->aio_errors++;
        return -1;
    }

    return rc;
}

static int ioreq_runio_qemu_aio(struct ioreq *ioreq);
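/*
 * Completion callback for all AIO issued on behalf of an ioreq.  Runs in
 * the dataplane AioContext; once the last outstanding operation finishes
 * it copies read data back to the guest, sets the response status, moves
 * the ioreq to the finished list and kicks the bottom half.
 */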
static void qemu_aio_complete(void *opaque, int ret)
{
    struct ioreq *ioreq = opaque;
    struct XenBlkDev *blkdev = ioreq->blkdev;
    struct XenLegacyDevice *xendev = &blkdev->xendev;

    aio_context_acquire(blkdev->ctx);

    if (ret != 0) {
        error_report("%s I/O error",
                     ioreq->req.operation == BLKIF_OP_READ ?
                     "read" : "write");
        ioreq->aio_errors++;
    }

    ioreq->aio_inflight--;
    if (ioreq->presync) {
        ioreq->presync = 0;
        ioreq_runio_qemu_aio(ioreq);
        goto done;
    }
    if (ioreq->aio_inflight > 0) {
        goto done;
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        /* in case of failure ioreq->aio_errors is increased */
        if (ret == 0) {
            ioreq_grant_copy(ioreq);
        }
        qemu_vfree(ioreq->buf);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }
        qemu_vfree(ioreq->buf);
        break;
    default:
        break;
    }

    ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
    ioreq_finish(ioreq);

    switch (ioreq->req.operation) {
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }
        /* fall through */
    case BLKIF_OP_READ:
        if (ioreq->status == BLKIF_RSP_OKAY) {
            block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
        } else {
            block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
        }
        break;
    case BLKIF_OP_DISCARD:
    default:
        break;
    }
    qemu_bh_schedule(blkdev->bh);

done:
    aio_context_release(blkdev->ctx);
}
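/*
 * Issue a discard for the given sector range, splitting it into chunks
 * no larger than the block layer's per-request byte limit.  Returns
 * false if the range wraps around or exceeds the maximum byte offset.
 */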
static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
                              uint64_t nr_sectors)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int64_t byte_offset;
    int byte_chunk;
    uint64_t byte_remaining, limit;
    uint64_t sec_start = sector_number;
    uint64_t sec_count = nr_sectors;

    /* Wrap around, or overflowing byte limit? */
    if (sec_start + sec_count < sec_count ||
        sec_start + sec_count > INT64_MAX >> BDRV_SECTOR_BITS) {
        return false;
    }

    limit = BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS;
    byte_offset = sec_start << BDRV_SECTOR_BITS;
    byte_remaining = sec_count << BDRV_SECTOR_BITS;

    do {
        byte_chunk = byte_remaining > limit ? limit : byte_remaining;
        ioreq->aio_inflight++;
        blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
                         qemu_aio_complete, ioreq);
        byte_remaining -= byte_chunk;
        byte_offset += byte_chunk;
    } while (byte_remaining > 0);

    return true;
}
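/*
 * Issue the block-layer AIO for a parsed request: allocate the bounce
 * buffer, grant-copy write data in from the guest, then submit the
 * read/write/flush/discard.  An extra aio_inflight reference is held
 * across submission and dropped by the final qemu_aio_complete() call,
 * so the response is only sent once all submitted pieces have finished.
 * A presync flush is issued first; its completion re-enters this
 * function to submit the actual request.
 */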
static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    ioreq->buf = qemu_memalign(XC_PAGE_SIZE, ioreq->size);
    if (ioreq->req.nr_segments &&
        (ioreq->req.operation == BLKIF_OP_WRITE ||
         ioreq->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
        ioreq_grant_copy(ioreq)) {
        qemu_vfree(ioreq->buf);
        goto err;
    }

    ioreq->aio_inflight++;
    if (ioreq->presync) {
        blk_aio_flush(ioreq->blkdev->blk, qemu_aio_complete, ioreq);
        return 0;
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size, BLOCK_ACCT_READ);
        ioreq->aio_inflight++;
        blk_aio_preadv(blkdev->blk, ioreq->start, &ioreq->v, 0,
                       qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }

        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size,
                         ioreq->req.operation == BLKIF_OP_WRITE ?
                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
        ioreq->aio_inflight++;
        blk_aio_pwritev(blkdev->blk, ioreq->start, &ioreq->v, 0,
                        qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_DISCARD:
    {
        struct blkif_request_discard *req = (void *)&ioreq->req;
        if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
            goto err;
        }
        break;
    }
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    qemu_aio_complete(ioreq, 0);

    return 0;

err:
    ioreq_finish(ioreq);
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}
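/*
 * Put the response for one ioreq on the shared ring.  Returns nonzero if
 * the frontend needs to be notified; sets blkdev->more_work if further
 * requests are already pending on the ring.
 */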
static int blk_send_response_one(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int send_notify = 0;
    int have_requests = 0;
    blkif_response_t *resp;

    /* Place on the response ring for the relevant domain. */
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        resp = (blkif_response_t *)RING_GET_RESPONSE(
            &blkdev->rings.native,
            blkdev->rings.native.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_32:
        resp = (blkif_response_t *)RING_GET_RESPONSE(
            &blkdev->rings.x86_32_part,
            blkdev->rings.x86_32_part.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_64:
        resp = (blkif_response_t *)RING_GET_RESPONSE(
            &blkdev->rings.x86_64_part,
            blkdev->rings.x86_64_part.rsp_prod_pvt);
        break;
    default:
        return 0;
    }

    resp->id = ioreq->req.id;
    resp->operation = ioreq->req.operation;
    resp->status = ioreq->status;

    blkdev->rings.common.rsp_prod_pvt++;

    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
        /*
         * Tail check for pending requests. Allows frontend to avoid
         * notifications if requests are already in flight (lower
         * overheads and promotes batching).
         */
        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
        have_requests = 1;
    }

    if (have_requests) {
        blkdev->more_work++;
    }
    return send_notify;
}
/* walk finished list, send outstanding responses, free requests */
static void blk_send_response_all(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq;
    int send_notify = 0;

    while (!QLIST_EMPTY(&blkdev->finished)) {
        ioreq = QLIST_FIRST(&blkdev->finished);
        send_notify += blk_send_response_one(ioreq);
        ioreq_release(ioreq, true);
    }
    if (send_notify) {
        xen_pv_send_notify(&blkdev->xendev);
    }
}
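/*
 * Copy the request at ring index 'rc' into ioreq->req, converting from
 * the 32-bit or 64-bit x86 ring layout if the frontend uses one.
 */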
static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq,
                           RING_IDX rc)
{
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        memcpy(&ioreq->req, RING_GET_REQUEST(&blkdev->rings.native, rc),
               sizeof(ioreq->req));
        break;
    case BLKIF_PROTOCOL_X86_32:
        blkif_get_x86_32_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
        break;
    case BLKIF_PROTOCOL_X86_64:
        blkif_get_x86_64_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));
        break;
    }
    /* Prevent the compiler from accessing the on-ring fields instead. */
    barrier();
    return 0;
}
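/*
 * Main request loop: flush finished responses, then pull, parse and
 * submit new requests from the ring, stopping early when the ioreq pool
 * is exhausted.  Reschedules the bottom half if more work remains and
 * there is room for further inflight requests.
 */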
static void blk_handle_requests(struct XenBlkDev *blkdev)
{
    RING_IDX rc, rp;
    struct ioreq *ioreq;

    blkdev->more_work = 0;

    rc = blkdev->rings.common.req_cons;
    rp = blkdev->rings.common.sring->req_prod;
    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */

    blk_send_response_all(blkdev);
    while (rc != rp) {
        /* pull request from ring */
        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
            break;
        }
        ioreq = ioreq_start(blkdev);
        if (ioreq == NULL) {
            blkdev->more_work++;
            break;
        }
        blk_get_request(blkdev, ioreq, rc);
        blkdev->rings.common.req_cons = ++rc;

        /* parse them */
        if (ioreq_parse(ioreq) != 0) {
            switch (ioreq->req.operation) {
            case BLKIF_OP_READ:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_READ);
                break;
            case BLKIF_OP_WRITE:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_WRITE);
                break;
            case BLKIF_OP_FLUSH_DISKCACHE:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_FLUSH);
                /* fall through */
            default:
                break;
            }

            if (blk_send_response_one(ioreq)) {
                xen_pv_send_notify(&blkdev->xendev);
            }
            ioreq_release(ioreq, false);
            continue;
        }

        ioreq_runio_qemu_aio(ioreq);
    }

    if (blkdev->more_work && blkdev->requests_inflight < blkdev->max_requests) {
        qemu_bh_schedule(blkdev->bh);
    }
}
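/* Bottom half: runs the request loop in the dataplane AioContext. */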
static void blk_bh(void *opaque)
{
    struct XenBlkDev *blkdev = opaque;

    aio_context_acquire(blkdev->ctx);
    blk_handle_requests(blkdev);
    aio_context_release(blkdev->ctx);
}
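/*
 * Early backend init: set up the request lists and create the IOThread
 * whose AioContext the bottom half (and hence all I/O) will run in.
 */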
static void blk_alloc(struct XenLegacyDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    Error *err = NULL;

    trace_xen_disk_alloc(xendev->name);

    QLIST_INIT(&blkdev->inflight);
    QLIST_INIT(&blkdev->finished);
    QLIST_INIT(&blkdev->freelist);

    blkdev->iothread = iothread_create(xendev->name, &err);
    assert(!err);

    blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
    blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
}
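/*
 * Tear the backend down: disconnect from the frontend, free the pooled
 * ioreqs and the xenstore-derived strings, and destroy the bottom half
 * and IOThread created in blk_alloc().
 */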
static int blk_free(struct XenLegacyDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    struct ioreq *ioreq;

    trace_xen_disk_free(xendev->name);

    blk_disconnect(xendev);

    while (!QLIST_EMPTY(&blkdev->freelist)) {
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
        qemu_iovec_destroy(&ioreq->v);
        g_free(ioreq);
    }

    g_free(blkdev->params);
    g_free(blkdev->mode);
    g_free(blkdev->type);
    g_free(blkdev->dev);
    g_free(blkdev->devtype);
    qemu_bh_delete(blkdev->bh);
    iothread_destroy(blkdev->iothread);
    return 0;
}
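/* Event-channel notification from the frontend: kick the bottom half. */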
static void blk_event(struct XenLegacyDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    qemu_bh_schedule(blkdev->bh);
}