/*
 * xen paravirt block device backend
 *
 * (c) Gerd Hoffmann <kraxel@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; under version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/units.h"
#include <sys/ioctl.h>
#include <sys/uio.h>

#include "hw/hw.h"
#include "hw/xen/xen_backend.h"
#include "xen_blkif.h"
#include "sysemu/blockdev.h"
#include "sysemu/iothread.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
#include "trace.h"

/* ------------------------------------------------------------- */

#define BLOCK_SIZE  512
#define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)

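/*
 * One struct ioreq tracks a single request taken off the shared ring:
 * the parsed byte offset and length, the bounce buffer used for grant
 * copies, and the number of AIO operations still pending for it.
 */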
struct ioreq {
    blkif_request_t     req;
    int16_t             status;

    /* parsed request */
    off_t               start;
    QEMUIOVector        v;
    void                *buf;
    size_t              size;
    int                 presync;

    /* aio status */
    int                 aio_inflight;
    int                 aio_errors;

    struct XenBlkDev    *blkdev;
    QLIST_ENTRY(ioreq)  list;
    BlockAcctCookie     acct;
};

#define MAX_RING_PAGE_ORDER 4

struct XenBlkDev {
    struct XenDevice    xendev;  /* must be first */
    char                *params;
    char                *mode;
    char                *type;
    char                *dev;
    char                *devtype;
    bool                directiosafe;
    const char          *fileproto;
    const char          *filename;
    unsigned int        ring_ref[1 << MAX_RING_PAGE_ORDER];
    unsigned int        nr_ring_ref;
    void                *sring;
    int64_t             file_blk;
    int64_t             file_size;
    int                 protocol;
    blkif_back_rings_t  rings;
    int                 more_work;

    /* request lists */
    QLIST_HEAD(, ioreq) inflight;
    QLIST_HEAD(, ioreq) finished;
    QLIST_HEAD(, ioreq) freelist;
    int                 requests_total;
    int                 requests_inflight;
    int                 requests_finished;
    unsigned int        max_requests;

    gboolean            feature_discard;

    /* qemu block driver */
    DriveInfo           *dinfo;
    BlockBackend        *blk;
    QEMUBH              *bh;

    IOThread            *iothread;
    AioContext          *ctx;
};

/* ------------------------------------------------------------- */

static void ioreq_reset(struct ioreq *ioreq)
{
    memset(&ioreq->req, 0, sizeof(ioreq->req));
    ioreq->status = 0;
    ioreq->start = 0;
    ioreq->buf = NULL;
    ioreq->size = 0;
    ioreq->presync = 0;

    ioreq->aio_inflight = 0;
    ioreq->aio_errors = 0;

    ioreq->blkdev = NULL;
    memset(&ioreq->list, 0, sizeof(ioreq->list));
    memset(&ioreq->acct, 0, sizeof(ioreq->acct));

    qemu_iovec_reset(&ioreq->v);
}

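/*
 * Get a free ioreq: reuse one from the freelist if possible, otherwise
 * allocate a new one as long as fewer than max_requests structs exist.
 * Returns NULL when the limit is reached and the freelist is empty.
 */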
static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq = NULL;

    if (QLIST_EMPTY(&blkdev->freelist)) {
        if (blkdev->requests_total >= blkdev->max_requests) {
            goto out;
        }
        /* allocate new struct */
        ioreq = g_malloc0(sizeof(*ioreq));
        ioreq->blkdev = blkdev;
        blkdev->requests_total++;
        qemu_iovec_init(&ioreq->v, 1);
    } else {
        /* get one from freelist */
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
    }
    QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
    blkdev->requests_inflight++;

out:
    return ioreq;
}

static void ioreq_finish(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
    blkdev->requests_inflight--;
    blkdev->requests_finished++;
}

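/*
 * Return an ioreq to the freelist; 'finish' says whether it comes from
 * the finished list (response already sent) or straight from inflight
 * (e.g. a request that failed to parse).
 */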
static void ioreq_release(struct ioreq *ioreq, bool finish)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    ioreq_reset(ioreq);
    ioreq->blkdev = blkdev;
    QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
    if (finish) {
        blkdev->requests_finished--;
    } else {
        blkdev->requests_inflight--;
    }
}

/*
 * translate request into iovec + start offset
 * do sanity checks along the way
 */
static int ioreq_parse(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    struct XenDevice *xendev = &blkdev->xendev;
    size_t len;
    int i;

    xen_pv_printf(xendev, 3,
                  "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
                  ioreq->req.operation, ioreq->req.nr_segments,
                  ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        break;
    case BLKIF_OP_FLUSH_DISKCACHE:
        ioreq->presync = 1;
        if (!ioreq->req.nr_segments) {
            return 0;
        }
        /* fall through */
    case BLKIF_OP_WRITE:
        break;
    case BLKIF_OP_DISCARD:
        return 0;
    default:
        xen_pv_printf(xendev, 0, "error: unknown operation (%d)\n",
                      ioreq->req.operation);
        goto err;
    };

    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
        xen_pv_printf(xendev, 0, "error: write req for ro device\n");
        goto err;
    }

    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
    for (i = 0; i < ioreq->req.nr_segments; i++) {
        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
            xen_pv_printf(xendev, 0, "error: nr_segments too big\n");
            goto err;
        }
        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
            xen_pv_printf(xendev, 0, "error: first > last sector\n");
            goto err;
        }
        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
            xen_pv_printf(xendev, 0, "error: page crossing\n");
            goto err;
        }

        len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
        ioreq->size += len;
    }
    if (ioreq->start + ioreq->size > blkdev->file_size) {
        xen_pv_printf(xendev, 0, "error: access beyond end of file\n");
        goto err;
    }
    return 0;

err:
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

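/*
 * Copy the data for this request between the guest's grant references
 * and the local bounce buffer: for reads the buffer is copied out to
 * the guest, for writes/flushes the guest data is copied in.
 */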
static int ioreq_grant_copy(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    struct XenDevice *xendev = &blkdev->xendev;
    XenGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int i, count, rc;
    int64_t file_blk = blkdev->file_blk;
    bool to_domain = (ioreq->req.operation == BLKIF_OP_READ);
    void *virt = ioreq->buf;

    if (ioreq->req.nr_segments == 0) {
        return 0;
    }

    count = ioreq->req.nr_segments;

    for (i = 0; i < count; i++) {
        if (to_domain) {
            segs[i].dest.foreign.ref = ioreq->req.seg[i].gref;
            segs[i].dest.foreign.offset = ioreq->req.seg[i].first_sect * file_blk;
            segs[i].source.virt = virt;
        } else {
            segs[i].source.foreign.ref = ioreq->req.seg[i].gref;
            segs[i].source.foreign.offset = ioreq->req.seg[i].first_sect * file_blk;
            segs[i].dest.virt = virt;
        }
        segs[i].len = (ioreq->req.seg[i].last_sect
                       - ioreq->req.seg[i].first_sect + 1) * file_blk;
        virt += segs[i].len;
    }

    rc = xen_be_copy_grant_refs(xendev, to_domain, segs, count);

    if (rc) {
        xen_pv_printf(xendev, 0,
                      "failed to copy data %d\n", rc);
        ioreq->aio_errors++;
        return -1;
    }

    return rc;
}

static int ioreq_runio_qemu_aio(struct ioreq *ioreq);

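/*
 * Completion callback for all AIO submitted by ioreq_runio_qemu_aio().
 * Once the last operation in flight for the ioreq has finished, read
 * data is copied back to the guest via grant copy, accounting is
 * updated and the bottom half is scheduled to send the response.
 */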
static void qemu_aio_complete(void *opaque, int ret)
{
    struct ioreq *ioreq = opaque;
    struct XenBlkDev *blkdev = ioreq->blkdev;
    struct XenDevice *xendev = &blkdev->xendev;

    aio_context_acquire(blkdev->ctx);

    if (ret != 0) {
        xen_pv_printf(xendev, 0, "%s I/O error\n",
                      ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
        ioreq->aio_errors++;
    }

    ioreq->aio_inflight--;
    if (ioreq->presync) {
        ioreq->presync = 0;
        ioreq_runio_qemu_aio(ioreq);
        goto done;
    }
    if (ioreq->aio_inflight > 0) {
        goto done;
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        /* in case of failure ioreq->aio_errors is increased */
        if (ret == 0) {
            ioreq_grant_copy(ioreq);
        }
        qemu_vfree(ioreq->buf);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }
        qemu_vfree(ioreq->buf);
        break;
    default:
        break;
    }

    ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
    ioreq_finish(ioreq);

    switch (ioreq->req.operation) {
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }
    case BLKIF_OP_READ:
        if (ioreq->status == BLKIF_RSP_OKAY) {
            block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
        } else {
            block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
        }
        break;
    case BLKIF_OP_DISCARD:
    default:
        break;
    }
    qemu_bh_schedule(blkdev->bh);

done:
    aio_context_release(blkdev->ctx);
}

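/*
 * A frontend discard may cover more sectors than QEMU accepts in one
 * request, so split it into chunks of at most BDRV_REQUEST_MAX_SECTORS
 * and submit one blk_aio_pdiscard() per chunk.
 */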
static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
                              uint64_t nr_sectors)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int64_t byte_offset;
    int byte_chunk;
    uint64_t byte_remaining, limit;
    uint64_t sec_start = sector_number;
    uint64_t sec_count = nr_sectors;

    /* Wrap around, or overflowing byte limit? */
    if (sec_start + sec_count < sec_count ||
        sec_start + sec_count > INT64_MAX >> BDRV_SECTOR_BITS) {
        return false;
    }

    limit = BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS;
    byte_offset = sec_start << BDRV_SECTOR_BITS;
    byte_remaining = sec_count << BDRV_SECTOR_BITS;

    do {
        byte_chunk = byte_remaining > limit ? limit : byte_remaining;
        ioreq->aio_inflight++;
        blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
                         qemu_aio_complete, ioreq);
        byte_remaining -= byte_chunk;
        byte_offset += byte_chunk;
    } while (byte_remaining > 0);

    return true;
}

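/*
 * Submit the parsed request to the block layer: allocate a bounce
 * buffer, copy in guest data for writes, then issue the matching
 * asynchronous read/write/flush/discard with qemu_aio_complete() as
 * the completion callback.
 */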
static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    ioreq->buf = qemu_memalign(XC_PAGE_SIZE, ioreq->size);
    if (ioreq->req.nr_segments &&
        (ioreq->req.operation == BLKIF_OP_WRITE ||
         ioreq->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
        ioreq_grant_copy(ioreq)) {
        qemu_vfree(ioreq->buf);
        goto err;
    }

    ioreq->aio_inflight++;
    if (ioreq->presync) {
        blk_aio_flush(ioreq->blkdev->blk, qemu_aio_complete, ioreq);
        return 0;
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size, BLOCK_ACCT_READ);
        ioreq->aio_inflight++;
        blk_aio_preadv(blkdev->blk, ioreq->start, &ioreq->v, 0,
                       qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }

        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size,
                         ioreq->req.operation == BLKIF_OP_WRITE ?
                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
        ioreq->aio_inflight++;
        blk_aio_pwritev(blkdev->blk, ioreq->start, &ioreq->v, 0,
                        qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_DISCARD:
    {
        struct blkif_request_discard *req = (void *)&ioreq->req;
        if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
            goto err;
        }
        break;
    }
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    qemu_aio_complete(ioreq, 0);

    return 0;

err:
    ioreq_finish(ioreq);
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

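/*
 * Build the response for one finished ioreq on the ring matching the
 * negotiated protocol. Returns nonzero if the frontend needs to be
 * notified via the event channel.
 */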
static int blk_send_response_one(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int send_notify = 0;
    int have_requests = 0;
    blkif_response_t *resp;

    /* Place on the response ring for the relevant domain. */
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        resp = (blkif_response_t *) RING_GET_RESPONSE(&blkdev->rings.native,
                                 blkdev->rings.native.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_32:
        resp = (blkif_response_t *) RING_GET_RESPONSE(&blkdev->rings.x86_32_part,
                                 blkdev->rings.x86_32_part.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_64:
        resp = (blkif_response_t *) RING_GET_RESPONSE(&blkdev->rings.x86_64_part,
                                 blkdev->rings.x86_64_part.rsp_prod_pvt);
        break;
    default:
        return 0;
    }

    resp->id = ioreq->req.id;
    resp->operation = ioreq->req.operation;
    resp->status = ioreq->status;

    blkdev->rings.common.rsp_prod_pvt++;

    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
        /*
         * Tail check for pending requests. Allows frontend to avoid
         * notifications if requests are already in flight (lower
         * overheads and promotes batching).
         */
        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
        have_requests = 1;
    }

    if (have_requests) {
        blkdev->more_work++;
    }
    return send_notify;
}

/* walk finished list, send outstanding responses, free requests */
static void blk_send_response_all(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq;
    int send_notify = 0;

    while (!QLIST_EMPTY(&blkdev->finished)) {
        ioreq = QLIST_FIRST(&blkdev->finished);
        send_notify += blk_send_response_one(ioreq);
        ioreq_release(ioreq, true);
    }
    if (send_notify) {
        xen_pv_send_notify(&blkdev->xendev);
    }
}

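/*
 * Fetch request 'rc' from the shared ring, converting from the 32- or
 * 64-bit x86 layout to the native one where necessary.
 */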
static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq, RING_IDX rc)
{
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        memcpy(&ioreq->req, RING_GET_REQUEST(&blkdev->rings.native, rc),
               sizeof(ioreq->req));
        break;
    case BLKIF_PROTOCOL_X86_32:
        blkif_get_x86_32_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
        break;
    case BLKIF_PROTOCOL_X86_64:
        blkif_get_x86_64_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));
        break;
    }
    /* Prevent the compiler from accessing the on-ring fields instead. */
    barrier();
    return 0;
}

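/*
 * Main request loop: flush out any finished responses, then pull new
 * requests off the ring until it is empty or no free ioreq is left,
 * parsing and submitting each one. Re-schedules itself via the bottom
 * half if work remains.
 */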
static void blk_handle_requests(struct XenBlkDev *blkdev)
{
    RING_IDX rc, rp;
    struct ioreq *ioreq;

    blkdev->more_work = 0;

    rc = blkdev->rings.common.req_cons;
    rp = blkdev->rings.common.sring->req_prod;
    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */

    blk_send_response_all(blkdev);
    while (rc != rp) {
        /* pull request from ring */
        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
            break;
        }
        ioreq = ioreq_start(blkdev);
        if (ioreq == NULL) {
            blkdev->more_work++;
            break;
        }
        blk_get_request(blkdev, ioreq, rc);
        blkdev->rings.common.req_cons = ++rc;

        /* parse them */
        if (ioreq_parse(ioreq) != 0) {

            switch (ioreq->req.operation) {
            case BLKIF_OP_READ:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_READ);
                break;
            case BLKIF_OP_WRITE:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_WRITE);
                break;
            case BLKIF_OP_FLUSH_DISKCACHE:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_FLUSH);
            default:
                break;
            };

            if (blk_send_response_one(ioreq)) {
                xen_pv_send_notify(&blkdev->xendev);
            }
            ioreq_release(ioreq, false);
            continue;
        }

        ioreq_runio_qemu_aio(ioreq);
    }

    if (blkdev->more_work && blkdev->requests_inflight < blkdev->max_requests) {
        qemu_bh_schedule(blkdev->bh);
    }
}

/* ------------------------------------------------------------- */

static void blk_bh(void *opaque)
{
    struct XenBlkDev *blkdev = opaque;

    aio_context_acquire(blkdev->ctx);
    blk_handle_requests(blkdev);
    aio_context_release(blkdev->ctx);
}

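/*
 * Called when the backend device is created: set up the request lists
 * and a dedicated IOThread whose AioContext handles this device's I/O
 * and bottom half.
 */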
static void blk_alloc(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    Error *err = NULL;

    trace_xen_disk_alloc(xendev->name);

    QLIST_INIT(&blkdev->inflight);
    QLIST_INIT(&blkdev->finished);
    QLIST_INIT(&blkdev->freelist);

    blkdev->iothread = iothread_create(xendev->name, &err);
    assert(!err);

    blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
    blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
}

static void blk_parse_discard(struct XenBlkDev *blkdev)
{
    struct XenDevice *xendev = &blkdev->xendev;
    int enable;

    blkdev->feature_discard = true;

    if (xenstore_read_be_int(xendev, "discard-enable", &enable) == 0) {
        blkdev->feature_discard = !!enable;
    }

    if (blkdev->feature_discard) {
        xenstore_write_be_int(xendev, "feature-discard", 1);
    }
}

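/*
 * First stage of initialisation: read the backend configuration
 * (params, mode, type, dev, device-type, direct-io-safe) from
 * xenstore and publish the features/info the frontend needs before
 * it connects.
 */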
static int blk_init(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    int info = 0;
    char *directiosafe = NULL;

    trace_xen_disk_init(xendev->name);

    /* read xenstore entries */
    if (blkdev->params == NULL) {
        char *h = NULL;
        blkdev->params = xenstore_read_be_str(xendev, "params");
        if (blkdev->params != NULL) {
            h = strchr(blkdev->params, ':');
        }
        if (h != NULL) {
            blkdev->fileproto = blkdev->params;
            blkdev->filename = h + 1;
            *h = 0;
        } else {
            blkdev->fileproto = "<unset>";
            blkdev->filename = blkdev->params;
        }
    }
    if (!strcmp("aio", blkdev->fileproto)) {
        blkdev->fileproto = "raw";
    }
    if (!strcmp("vhd", blkdev->fileproto)) {
        blkdev->fileproto = "vpc";
    }
    if (blkdev->mode == NULL) {
        blkdev->mode = xenstore_read_be_str(xendev, "mode");
    }
    if (blkdev->type == NULL) {
        blkdev->type = xenstore_read_be_str(xendev, "type");
    }
    if (blkdev->dev == NULL) {
        blkdev->dev = xenstore_read_be_str(xendev, "dev");
    }
    if (blkdev->devtype == NULL) {
        blkdev->devtype = xenstore_read_be_str(xendev, "device-type");
    }
    directiosafe = xenstore_read_be_str(xendev, "direct-io-safe");
    blkdev->directiosafe = (directiosafe && atoi(directiosafe));

    /* do we have all we need? */
    if (blkdev->params == NULL ||
        blkdev->mode == NULL ||
        blkdev->type == NULL ||
        blkdev->dev == NULL) {
        goto out_error;
    }

    /* read-only ? */
    if (strcmp(blkdev->mode, "w")) {
        info |= VDISK_READONLY;
    }

    /* cdrom ? */
    if (blkdev->devtype && !strcmp(blkdev->devtype, "cdrom")) {
        info |= VDISK_CDROM;
    }

    blkdev->file_blk = BLOCK_SIZE;

    /* fill info
     * blk_connect supplies sector-size and sectors
     */
    xenstore_write_be_int(xendev, "feature-flush-cache", 1);
    xenstore_write_be_int(xendev, "info", info);

    xenstore_write_be_int(xendev, "max-ring-page-order",
                          MAX_RING_PAGE_ORDER);

    blk_parse_discard(blkdev);

    g_free(directiosafe);
    return 0;

out_error:
    g_free(blkdev->params);
    blkdev->params = NULL;
    g_free(blkdev->mode);
    blkdev->mode = NULL;
    g_free(blkdev->type);
    blkdev->type = NULL;
    g_free(blkdev->dev);
    blkdev->dev = NULL;
    g_free(blkdev->devtype);
    blkdev->devtype = NULL;
    g_free(directiosafe);
    blkdev->directiosafe = false;
    return -1;
}

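/*
 * Second stage: open (or look up) the BlockBackend, read the ring
 * references and event channel from the frontend, map the shared
 * ring(s) and bind the event channel so request processing can start.
 */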
static int blk_connect(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    int index, qflags;
    bool readonly = true;
    bool writethrough = true;
    int order, ring_ref;
    unsigned int ring_size, max_grants;
    unsigned int i;

    trace_xen_disk_connect(xendev->name);

    /* read-only ? */
    if (blkdev->directiosafe) {
        qflags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO;
    } else {
        qflags = 0;
        writethrough = false;
    }
    if (strcmp(blkdev->mode, "w") == 0) {
        qflags |= BDRV_O_RDWR;
        readonly = false;
    }
    if (blkdev->feature_discard) {
        qflags |= BDRV_O_UNMAP;
    }

    /* init qemu block driver */
    index = (xendev->dev - 202 * 256) / 16;
    blkdev->dinfo = drive_get(IF_XEN, 0, index);
    if (!blkdev->dinfo) {
        Error *local_err = NULL;
        QDict *options = NULL;

        if (strcmp(blkdev->fileproto, "<unset>")) {
            options = qdict_new();
            qdict_put_str(options, "driver", blkdev->fileproto);
        }

        /* setup via xenbus -> create new block driver instance */
        xen_pv_printf(xendev, 2, "create new bdrv (xenbus setup)\n");
        blkdev->blk = blk_new_open(blkdev->filename, NULL, options,
                                   qflags, &local_err);
        if (!blkdev->blk) {
            xen_pv_printf(xendev, 0, "error: %s\n",
                          error_get_pretty(local_err));
            error_free(local_err);
            return -1;
        }
        blk_set_enable_write_cache(blkdev->blk, !writethrough);
    } else {
        /* setup via qemu cmdline -> already setup for us */
        xen_pv_printf(xendev, 2,
                      "get configured bdrv (cmdline setup)\n");
        blkdev->blk = blk_by_legacy_dinfo(blkdev->dinfo);
        if (blk_is_read_only(blkdev->blk) && !readonly) {
            xen_pv_printf(xendev, 0, "Unexpected read-only drive");
            blkdev->blk = NULL;
            return -1;
        }
        /* blkdev->blk was not created by us, so take a reference here
         * and blk_unref() unconditionally on disconnect */
        blk_ref(blkdev->blk);
    }
    blk_attach_dev_legacy(blkdev->blk, blkdev);
    blkdev->file_size = blk_getlength(blkdev->blk);
    if (blkdev->file_size < 0) {
        BlockDriverState *bs = blk_bs(blkdev->blk);
        const char *drv_name = bs ? bdrv_get_format_name(bs) : NULL;
        xen_pv_printf(xendev, 1, "blk_getlength: %d (%s) | drv %s\n",
                      (int)blkdev->file_size, strerror(-blkdev->file_size),
                      drv_name ?: "-");
        blkdev->file_size = 0;
    }

    xen_pv_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
                  " size %" PRId64 " (%" PRId64 " MB)\n",
                  blkdev->type, blkdev->fileproto, blkdev->filename,
                  blkdev->file_size, blkdev->file_size / MiB);

    /* Fill in the sector size and number of sectors */
    xenstore_write_be_int(xendev, "sector-size", blkdev->file_blk);
    xenstore_write_be_int64(xendev, "sectors",
                            blkdev->file_size / blkdev->file_blk);

    if (xenstore_read_fe_int(xendev, "ring-page-order",
                             &order) == -1) {
        blkdev->nr_ring_ref = 1;

        if (xenstore_read_fe_int(xendev, "ring-ref",
                                 &ring_ref) == -1) {
            return -1;
        }
        blkdev->ring_ref[0] = ring_ref;

    } else if (order >= 0 && order <= MAX_RING_PAGE_ORDER) {
        blkdev->nr_ring_ref = 1 << order;

        for (i = 0; i < blkdev->nr_ring_ref; i++) {
            char *key;

            key = g_strdup_printf("ring-ref%u", i);
            if (!key) {
                return -1;
            }

            if (xenstore_read_fe_int(xendev, key,
                                     &ring_ref) == -1) {
                g_free(key);
                return -1;
            }
            blkdev->ring_ref[i] = ring_ref;

            g_free(key);
        }
    } else {
        xen_pv_printf(xendev, 0, "invalid ring-page-order: %d\n",
                      order);
        return -1;
    }

    if (xenstore_read_fe_int(xendev, "event-channel",
                             &xendev->remote_port) == -1) {
        return -1;
    }

    if (!xendev->protocol) {
        blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
    } else if (strcmp(xendev->protocol, XEN_IO_PROTO_ABI_NATIVE) == 0) {
        blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
    } else if (strcmp(xendev->protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
        blkdev->protocol = BLKIF_PROTOCOL_X86_32;
    } else if (strcmp(xendev->protocol, XEN_IO_PROTO_ABI_X86_64) == 0) {
        blkdev->protocol = BLKIF_PROTOCOL_X86_64;
    } else {
        blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
    }

    ring_size = XC_PAGE_SIZE * blkdev->nr_ring_ref;
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
    {
        blkdev->max_requests = __CONST_RING_SIZE(blkif, ring_size);
        break;
    }
    case BLKIF_PROTOCOL_X86_32:
    {
        blkdev->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
        break;
    }
    case BLKIF_PROTOCOL_X86_64:
    {
        blkdev->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
        break;
    }
    default:
        return -1;
    }

    /* Add on the number needed for the ring pages */
    max_grants = blkdev->nr_ring_ref;

    xen_be_set_max_grant_refs(xendev, max_grants);
    blkdev->sring = xen_be_map_grant_refs(xendev, blkdev->ring_ref,
                                          blkdev->nr_ring_ref,
                                          PROT_READ | PROT_WRITE);
    if (!blkdev->sring) {
        return -1;
    }

    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
    {
        blkif_sring_t *sring_native = blkdev->sring;
        BACK_RING_INIT(&blkdev->rings.native, sring_native, ring_size);
        break;
    }
    case BLKIF_PROTOCOL_X86_32:
    {
        blkif_x86_32_sring_t *sring_x86_32 = blkdev->sring;

        BACK_RING_INIT(&blkdev->rings.x86_32_part, sring_x86_32, ring_size);
        break;
    }
    case BLKIF_PROTOCOL_X86_64:
    {
        blkif_x86_64_sring_t *sring_x86_64 = blkdev->sring;

        BACK_RING_INIT(&blkdev->rings.x86_64_part, sring_x86_64, ring_size);
        break;
    }
    }

    blk_set_aio_context(blkdev->blk, blkdev->ctx);

    xen_be_bind_evtchn(xendev);

    xen_pv_printf(xendev, 1, "ok: proto %s, nr-ring-ref %u, "
                  "remote port %d, local port %d\n",
                  xendev->protocol, blkdev->nr_ring_ref,
                  xendev->remote_port, xendev->local_port);
    return 0;
}

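/*
 * Tear down the connection: move the BlockBackend back to the main
 * AioContext, release it, unbind the event channel and unmap the
 * shared ring.
 */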
static void blk_disconnect(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    trace_xen_disk_disconnect(xendev->name);

    aio_context_acquire(blkdev->ctx);

    if (blkdev->blk) {
        blk_set_aio_context(blkdev->blk, qemu_get_aio_context());
        blk_detach_dev(blkdev->blk, blkdev);
        blk_unref(blkdev->blk);
        blkdev->blk = NULL;
    }
    xen_pv_unbind_evtchn(xendev);

    aio_context_release(blkdev->ctx);

    if (blkdev->sring) {
        xen_be_unmap_grant_refs(xendev, blkdev->sring,
                                blkdev->nr_ring_ref);
        blkdev->sring = NULL;
    }
}

static int blk_free(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    struct ioreq *ioreq;

    trace_xen_disk_free(xendev->name);

    blk_disconnect(xendev);

    while (!QLIST_EMPTY(&blkdev->freelist)) {
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
        qemu_iovec_destroy(&ioreq->v);
        g_free(ioreq);
    }

    g_free(blkdev->params);
    g_free(blkdev->mode);
    g_free(blkdev->type);
    g_free(blkdev->dev);
    g_free(blkdev->devtype);
    qemu_bh_delete(blkdev->bh);
    iothread_destroy(blkdev->iothread);
    return 0;
}

static void blk_event(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    qemu_bh_schedule(blkdev->bh);
}

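/*
 * Operations table used by the generic Xen backend code (this backend
 * is registered elsewhere under the "qdisk" name); DEVOPS_FLAG_NEED_GNTDEV
 * indicates the backend needs access to the grant table device.
 */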
struct XenDevOps xen_blkdev_ops = {
    .flags      = DEVOPS_FLAG_NEED_GNTDEV,
    .size       = sizeof(struct XenBlkDev),
    .alloc      = blk_alloc,
    .init       = blk_init,
    .initialise = blk_connect,
    .disconnect = blk_disconnect,
    .event      = blk_event,
    .free       = blk_free,
};