hw/dataplane/virtio-blk.c

   1 /*
   2  * Dedicated thread for virtio-blk I/O processing
   3  *
   4  * Copyright 2012 IBM, Corp.
   5  * Copyright 2012 Red Hat, Inc. and/or its affiliates
   6  *
   7  * Authors:
   8  *   Stefan Hajnoczi <stefanha@redhat.com>
   9  *
  10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  11  * See the COPYING file in the top-level directory.
  12  *
  13  */
  14
  15 #include "trace.h"
  16 #include "qemu/iov.h"
  17 #include "event-poll.h"
  18 #include "qemu/thread.h"
  19 #include "vring.h"
  20 #include "ioq.h"
  21 #include "migration/migration.h"
  22 #include "hw/virtio-blk.h"
  23 #include "hw/dataplane/virtio-blk.h"
  24
  25 enum {
  26     SEG_MAX = 126,                  /* maximum number of I/O segments */
  27     VRING_MAX = SEG_MAX + 2,        /* maximum number of vring descriptors */
  28     REQ_MAX = VRING_MAX,            /* maximum number of requests in the vring,
  29                                      * is VRING_MAX / 2 with traditional and
  30                                      * VRING_MAX with indirect descriptors */
  31 };
  32
  33 typedef struct {
  34     struct iocb iocb;               /* Linux AIO control block */
  35     QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
  36     unsigned int head;              /* vring descriptor index */
  37 } VirtIOBlockRequest;
  38
  39 struct VirtIOBlockDataPlane {
  40     bool started;
  41     QEMUBH *start_bh;
  42     QemuThread thread;
  43
  44     VirtIOBlkConf *blk;
  45     int fd;                         /* image file descriptor */
  46
  47     VirtIODevice *vdev;
  48     Vring vring;                    /* virtqueue vring */
  49     EventNotifier *guest_notifier;  /* irq */
  50
  51     EventPoll event_poll;           /* event poller */
  52     EventHandler io_handler;        /* Linux AIO completion handler */
  53     EventHandler notify_handler;    /* virtqueue notify handler */
  54
  55     IOQueue ioqueue;                /* Linux AIO queue (should really be per
  56                                        dataplane thread) */
  57     VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the
  58                                              queue */
  59
  60     unsigned int num_reqs;
  61
  62     Error *migration_blocker;
  63 };
  64
  65 /* Raise an interrupt to signal guest, if necessary */
  66 static void notify_guest(VirtIOBlockDataPlane *s)
  67 {
  68     if (!vring_should_notify(s->vdev, &s->vring)) {
  69         return;
  70     }
  71
  72     event_notifier_set(s->guest_notifier);
  73 }
  74
  75 static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
  76 {
  77     VirtIOBlockDataPlane *s = opaque;
  78     VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
  79     struct virtio_blk_inhdr hdr;
  80     int len;
  81
  82     if (likely(ret >= 0)) {
  83         hdr.status = VIRTIO_BLK_S_OK;
  84         len = ret;
  85     } else {
  86         hdr.status = VIRTIO_BLK_S_IOERR;
  87         len = 0;
  88     }
  89
  90     trace_virtio_blk_data_plane_complete_request(s, req->head, ret);
  91
  92     qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr));
  93     qemu_iovec_destroy(req->inhdr);
  94     g_slice_free(QEMUIOVector, req->inhdr);
  95
  96     /* According to the virtio specification len should be the number of bytes
  97      * written to, but for virtio-blk it seems to be the number of bytes
  98      * transferred plus the status bytes.
  99      */
 100     vring_push(&s->vring, req->head, len + sizeof(hdr));
 101
 102     s->num_reqs--;
 103 }
 104
 105 static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
 106                                    QEMUIOVector *inhdr, unsigned char status)
 107 {
 108     struct virtio_blk_inhdr hdr = {
 109         .status = status,
 110     };
 111
 112     qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr));
 113     qemu_iovec_destroy(inhdr);
 114     g_slice_free(QEMUIOVector, inhdr);
 115
 116     vring_push(&s->vring, head, sizeof(hdr));
 117     notify_guest(s);
 118 }
 119
 120 /* Get disk serial number */
 121 static void do_get_id_cmd(VirtIOBlockDataPlane *s,
 122                           struct iovec *iov, unsigned int iov_cnt,
 123                           unsigned int head, QEMUIOVector *inhdr)
 124 {
 125     char id[VIRTIO_BLK_ID_BYTES];
 126
 127     /* Serial number not NUL-terminated when shorter than buffer */
 128     strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
 129     iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
 130     complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
 131 }
 132
 133 static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
 134                        struct iovec *iov, unsigned int iov_cnt,
 135                        long long offset, unsigned int head,
 136                        QEMUIOVector *inhdr)
 137 {
 138     struct iocb *iocb;
 139
 140     iocb = ioq_rdwr(&s->ioqueue, read, iov, iov_cnt, offset);
 141
 142     /* Fill in virtio block metadata needed for completion */
 143     VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
 144     req->head = head;
 145     req->inhdr = inhdr;
 146     return 0;
 147 }
 148
 149 static int process_request(IOQueue *ioq, struct iovec iov[],
 150                            unsigned int out_num, unsigned int in_num,
 151                            unsigned int head)
 152 {
 153     VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
 154     struct iovec *in_iov = &iov[out_num];
 155     struct virtio_blk_outhdr outhdr;
 156     QEMUIOVector *inhdr;
 157     size_t in_size;
 158
 159     /* Copy in outhdr */
 160     if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
 161                             sizeof(outhdr)) != sizeof(outhdr))) {
 162         error_report("virtio-blk request outhdr too short");
 163         return -EFAULT;
 164     }
 165     iov_discard_front(&iov, &out_num, sizeof(outhdr));
 166
 167     /* Grab inhdr for later */
 168     in_size = iov_size(in_iov, in_num);
 169     if (in_size < sizeof(struct virtio_blk_inhdr)) {
 170         error_report("virtio_blk request inhdr too short");
 171         return -EFAULT;
 172     }
 173     inhdr = g_slice_new(QEMUIOVector);
 174     qemu_iovec_init(inhdr, 1);
 175     qemu_iovec_concat_iov(inhdr, in_iov, in_num,
 176             in_size - sizeof(struct virtio_blk_inhdr),
 177             sizeof(struct virtio_blk_inhdr));
 178     iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
 179
 180     /* TODO Linux sets the barrier bit even when not advertised! */
 181     outhdr.type &= ~VIRTIO_BLK_T_BARRIER;
 182
 183     switch (outhdr.type) {
 184     case VIRTIO_BLK_T_IN:
 185         do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr);
 186         return 0;
 187
 188     case VIRTIO_BLK_T_OUT:
 189         do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr);
 190         return 0;
 191
 192     case VIRTIO_BLK_T_SCSI_CMD:
 193         /* TODO support SCSI commands */
 194         complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
 195         return 0;
 196
 197     case VIRTIO_BLK_T_FLUSH:
 198         /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
 199         if (qemu_fdatasync(s->fd) < 0) {
 200             complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR);
 201         } else {
 202             complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
 203         }
 204         return 0;
 205
 206     case VIRTIO_BLK_T_GET_ID:
 207         do_get_id_cmd(s, in_iov, in_num, head, inhdr);
 208         return 0;
 209
 210     default:
 211         error_report("virtio-blk unsupported request type %#x", outhdr.type);
 212         qemu_iovec_destroy(inhdr);
 213         g_slice_free(QEMUIOVector, inhdr);
 214         return -EFAULT;
 215     }
 216 }
 217
 218 static void handle_notify(EventHandler *handler)
 219 {
 220     VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
 221                                            notify_handler);
 222
 223     /* There is one array of iovecs into which all new requests are extracted
 224      * from the vring.  Requests are read from the vring and the translated
 225      * descriptors are written to the iovecs array.  The iovecs do not have to
 226      * persist across handle_notify() calls because the kernel copies the
 227      * iovecs on io_submit().
 228      *
 229      * Handling io_submit() EAGAIN may require storing the requests across
 230      * handle_notify() calls until the kernel has sufficient resources to
 231      * accept more I/O.  This is not implemented yet.
 232      */
 233     struct iovec iovec[VRING_MAX];
 234     struct iovec *end = &iovec[VRING_MAX];
 235     struct iovec *iov = iovec;
 236
 237     /* When a request is read from the vring, the index of the first descriptor
 238      * (aka head) is returned so that the completed request can be pushed onto
 239      * the vring later.
 240      *
 241      * The number of hypervisor read-only iovecs is out_num.  The number of
 242      * hypervisor write-only iovecs is in_num.
 243      */
 244     int head;
 245     unsigned int out_num = 0, in_num = 0;
 246     unsigned int num_queued;
 247
 248     for (;;) {
 249         /* Disable guest->host notifies to avoid unnecessary vmexits */
 250         vring_disable_notification(s->vdev, &s->vring);
 251
 252         for (;;) {
 253             head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
 254             if (head < 0) {
 255                 break; /* no more requests */
 256             }
 257
 258             trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
 259                                                         head);
 260
 261             if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
 262                 vring_set_broken(&s->vring);
 263                 break;
 264             }
 265             iov += out_num + in_num;
 266         }
 267
 268         if (likely(head == -EAGAIN)) { /* vring emptied */
 269             /* Re-enable guest->host notifies and stop processing the vring.
 270              * But if the guest has snuck in more descriptors, keep processing.
 271              */
 272             if (vring_enable_notification(s->vdev, &s->vring)) {
 273                 break;
 274             }
 275         } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */
 276             /* Since there are no iovecs[] left, stop processing for now.  Do
 277              * not re-enable guest->host notifies since the I/O completion
 278              * handler knows to check for more vring descriptors anyway.
 279              */
 280             break;
 281         }
 282     }
 283
 284     num_queued = ioq_num_queued(&s->ioqueue);
 285     if (num_queued > 0) {
 286         s->num_reqs += num_queued;
 287
 288         int rc = ioq_submit(&s->ioqueue);
 289         if (unlikely(rc < 0)) {
 290             fprintf(stderr, "ioq_submit failed %d\n", rc);
 291             exit(1);
 292         }
 293     }
 294 }
 295
 296 static void handle_io(EventHandler *handler)
 297 {
 298     VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
 299                                            io_handler);
 300
 301     if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) {
 302         notify_guest(s);
 303     }
 304
 305     /* If there were more requests than iovecs, the vring will not be empty yet
 306      * so check again.  There should now be enough resources to process more
 307      * requests.
 308      */
 309     if (unlikely(vring_more_avail(&s->vring))) {
 310         handle_notify(&s->notify_handler);
 311     }
 312 }
 313
 314 static void *data_plane_thread(void *opaque)
 315 {
 316     VirtIOBlockDataPlane *s = opaque;
 317
 318     do {
 319         event_poll(&s->event_poll);
 320     } while (s->started || s->num_reqs > 0);
 321     return NULL;
 322 }
 323
 324 static void start_data_plane_bh(void *opaque)
 325 {
 326     VirtIOBlockDataPlane *s = opaque;
 327
 328     qemu_bh_delete(s->start_bh);
 329     s->start_bh = NULL;
 330     qemu_thread_create(&s->thread, data_plane_thread,
 331                        s, QEMU_THREAD_JOINABLE);
 332 }
 333
 334 bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
 335                                   VirtIOBlockDataPlane **dataplane)
 336 {
 337     VirtIOBlockDataPlane *s;
 338     int fd;
 339
 340     *dataplane = NULL;
 341
 342     if (!blk->data_plane) {
 343         return true;
 344     }
 345
 346     if (blk->scsi) {
 347         error_report("device is incompatible with x-data-plane, use scsi=off");
 348         return false;
 349     }
 350
 351     if (blk->config_wce) {
 352         error_report("device is incompatible with x-data-plane, "
 353                      "use config-wce=off");
 354         return false;
 355     }
 356
 357     fd = raw_get_aio_fd(blk->conf.bs);
 358     if (fd < 0) {
 359         error_report("drive is incompatible with x-data-plane, "
 360                      "use format=raw,cache=none,aio=native");
 361         return false;
 362     }
 363
 364     s = g_new0(VirtIOBlockDataPlane, 1);
 365     s->vdev = vdev;
 366     s->fd = fd;
 367     s->blk = blk;
 368
 369     /* Prevent block operations that conflict with data plane thread */
 370     bdrv_set_in_use(blk->conf.bs, 1);
 371
 372     error_setg(&s->migration_blocker,
 373             "x-data-plane does not support migration");
 374     migrate_add_blocker(s->migration_blocker);
 375
 376     *dataplane = s;
 377     return true;
 378 }
 379
 380 void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
 381 {
 382     if (!s) {
 383         return;
 384     }
 385
 386     virtio_blk_data_plane_stop(s);
 387     migrate_del_blocker(s->migration_blocker);
 388     error_free(s->migration_blocker);
 389     bdrv_set_in_use(s->blk->conf.bs, 0);
 390     g_free(s);
 391 }
 392
 393 void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
 394 {
 395     VirtQueue *vq;
 396     int i;
 397
 398     if (s->started) {
 399         return;
 400     }
 401
 402     vq = virtio_get_queue(s->vdev, 0);
 403     if (!vring_setup(&s->vring, s->vdev, 0)) {
 404         return;
 405     }
 406
 407     event_poll_init(&s->event_poll);
 408
 409     /* Set up guest notifier (irq) */
 410     if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque,
 411                                               true) != 0) {
 412         fprintf(stderr, "virtio-blk failed to set guest notifier, "
 413                 "ensure -enable-kvm is set\n");
 414         exit(1);
 415     }
 416     s->guest_notifier = virtio_queue_get_guest_notifier(vq);
 417
 418     /* Set up virtqueue notify */
 419     if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
 420                                             0, true) != 0) {
 421         fprintf(stderr, "virtio-blk failed to set host notifier\n");
 422         exit(1);
 423     }
 424     event_poll_add(&s->event_poll, &s->notify_handler,
 425                    virtio_queue_get_host_notifier(vq),
 426                    handle_notify);
 427
 428     /* Set up ioqueue */
 429     ioq_init(&s->ioqueue, s->fd, REQ_MAX);
 430     for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
 431         ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
 432     }
 433     event_poll_add(&s->event_poll, &s->io_handler,
 434                    ioq_get_notifier(&s->ioqueue), handle_io);
 435
 436     s->started = true;
 437     trace_virtio_blk_data_plane_start(s);
 438
 439     /* Kick right away to begin processing requests already in vring */
 440     event_notifier_set(virtio_queue_get_host_notifier(vq));
 441
 442     /* Spawn thread in BH so it inherits iothread cpusets */
 443     s->start_bh = qemu_bh_new(start_data_plane_bh, s);
 444     qemu_bh_schedule(s->start_bh);
 445 }
 446
 447 void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
 448 {
 449     if (!s->started) {
 450         return;
 451     }
 452     s->started = false;
 453     trace_virtio_blk_data_plane_stop(s);
 454
 455     /* Stop thread or cancel pending thread creation BH */
 456     if (s->start_bh) {
 457         qemu_bh_delete(s->start_bh);
 458         s->start_bh = NULL;
 459     } else {
 460         event_poll_notify(&s->event_poll);
 461         qemu_thread_join(&s->thread);
 462     }
 463
 464     ioq_cleanup(&s->ioqueue);
 465
 466     s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false);
 467
 468     event_poll_cleanup(&s->event_poll);
 469
 470     /* Clean up guest notifier (irq) */
 471     s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, false);
 472
 473     vring_teardown(&s->vring);
 474 }