1 /*
2 * xen paravirt block device backend
3 *
4 * (c) Gerd Hoffmann <kraxel@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; under version 2 of the License.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, see <http://www.gnu.org/licenses/>.
17 *
18 * Contributions after 2012-01-13 are licensed under the terms of the
19 * GNU GPL, version 2 or (at your option) any later version.
20 */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <stdarg.h>
25 #include <string.h>
26 #include <unistd.h>
27 #include <inttypes.h>
28 #include <time.h>
29 #include <fcntl.h>
30 #include <errno.h>
31 #include <sys/ioctl.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <sys/mman.h>
35 #include <sys/uio.h>
36
37 #include "hw/hw.h"
38 #include "hw/xen/xen_backend.h"
39 #include "xen_blkif.h"
40 #include "sysemu/blockdev.h"
41 #include "sysemu/block-backend.h"
42 #include "qapi/qmp/qdict.h"
43 #include "qapi/qmp/qstring.h"
44
45 /* ------------------------------------------------------------- */
46
47 static int batch_maps = 0;
48
49 static int max_requests = 32;
50
51 /* ------------------------------------------------------------- */
52
53 #define BLOCK_SIZE 512
54 #define IOCB_COUNT (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
55
56 struct PersistentGrant {
57 void *page;
58 struct XenBlkDev *blkdev;
59 };
60
61 typedef struct PersistentGrant PersistentGrant;
62
63 struct PersistentRegion {
64 void *addr;
65 int num;
66 };
67
68 typedef struct PersistentRegion PersistentRegion;
69
70 struct ioreq {
71 blkif_request_t req;
72 int16_t status;
73
74 /* parsed request */
75 off_t start;
76 QEMUIOVector v;
77 int presync;
78 int postsync;
79 uint8_t mapped;
80
81 /* grant mapping */
82 uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
83 uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
84 int prot;
85 void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
86 void *pages;
87 int num_unmap;
88
89 /* aio status */
90 int aio_inflight;
91 int aio_errors;
92
93 struct XenBlkDev *blkdev;
94 QLIST_ENTRY(ioreq) list;
95 BlockAcctCookie acct;
96 };
97
98 struct XenBlkDev {
99 struct XenDevice xendev; /* must be first */
100 char *params;
101 char *mode;
102 char *type;
103 char *dev;
104 char *devtype;
105 bool directiosafe;
106 const char *fileproto;
107 const char *filename;
108 int ring_ref;
109 void *sring;
110 int64_t file_blk;
111 int64_t file_size;
112 int protocol;
113 blkif_back_rings_t rings;
114 int more_work;
115 int cnt_map;
116
117 /* request lists */
118 QLIST_HEAD(inflight_head, ioreq) inflight;
119 QLIST_HEAD(finished_head, ioreq) finished;
120 QLIST_HEAD(freelist_head, ioreq) freelist;
121 int requests_total;
122 int requests_inflight;
123 int requests_finished;
124
125 /* Persistent grants extension */
126 gboolean feature_discard;
127 gboolean feature_persistent;
128 GTree *persistent_gnts;
129 GSList *persistent_regions;
130 unsigned int persistent_gnt_count;
131 unsigned int max_grants;
132
133 /* qemu block driver */
134 DriveInfo *dinfo;
135 BlockBackend *blk;
136 QEMUBH *bh;
137 };
138
139 /* ------------------------------------------------------------- */
140
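/* Return an ioreq to a pristine state so it can be reused from the freelist;
 * the iovec stays allocated and is merely reset. */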
141 static void ioreq_reset(struct ioreq *ioreq)
142 {
143 memset(&ioreq->req, 0, sizeof(ioreq->req));
144 ioreq->status = 0;
145 ioreq->start = 0;
146 ioreq->presync = 0;
147 ioreq->postsync = 0;
148 ioreq->mapped = 0;
149
150 memset(ioreq->domids, 0, sizeof(ioreq->domids));
151 memset(ioreq->refs, 0, sizeof(ioreq->refs));
152 ioreq->prot = 0;
153 memset(ioreq->page, 0, sizeof(ioreq->page));
154 ioreq->pages = NULL;
155
156 ioreq->aio_inflight = 0;
157 ioreq->aio_errors = 0;
158
159 ioreq->blkdev = NULL;
160 memset(&ioreq->list, 0, sizeof(ioreq->list));
161 memset(&ioreq->acct, 0, sizeof(ioreq->acct));
162
163 qemu_iovec_reset(&ioreq->v);
164 }
165
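/* Key comparison for the persistent_gnts tree; keys are grant references
 * stored via GUINT_TO_POINTER. */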
166 static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
167 {
168 uint ua = GPOINTER_TO_UINT(a);
169 uint ub = GPOINTER_TO_UINT(b);
170 return (ua > ub) - (ua < ub);
171 }
172
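/* GDestroyNotify for persistent_gnts entries when batch mapping is disabled:
 * unmap the single granted page and free the PersistentGrant. */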
173 static void destroy_grant(gpointer pgnt)
174 {
175 PersistentGrant *grant = pgnt;
176 XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
177
178 if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
179 xen_be_printf(&grant->blkdev->xendev, 0,
180 "xc_gnttab_munmap failed: %s\n",
181 strerror(errno));
182 }
183 grant->blkdev->persistent_gnt_count--;
184 xen_be_printf(&grant->blkdev->xendev, 3,
185 "unmapped grant %p\n", grant->page);
186 g_free(grant);
187 }
188
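/* Unmap one batch-mapped region of persistently granted pages; called for
 * each entry of persistent_regions on disconnect. */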
189 static void remove_persistent_region(gpointer data, gpointer dev)
190 {
191 PersistentRegion *region = data;
192 struct XenBlkDev *blkdev = dev;
193 XenGnttab gnt = blkdev->xendev.gnttabdev;
194
195 if (xc_gnttab_munmap(gnt, region->addr, region->num) != 0) {
196 xen_be_printf(&blkdev->xendev, 0,
197 "xc_gnttab_munmap region %p failed: %s\n",
198 region->addr, strerror(errno));
199 }
200 xen_be_printf(&blkdev->xendev, 3,
201 "unmapped grant region %p with %d pages\n",
202 region->addr, region->num);
203 g_free(region);
204 }
205
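/* Get a free ioreq from the freelist, allocating a new one while still below
 * max_requests; returns NULL when no request slot is available. */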
206 static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
207 {
208 struct ioreq *ioreq = NULL;
209
210 if (QLIST_EMPTY(&blkdev->freelist)) {
211 if (blkdev->requests_total >= max_requests) {
212 goto out;
213 }
214 /* allocate new struct */
215 ioreq = g_malloc0(sizeof(*ioreq));
216 ioreq->blkdev = blkdev;
217 blkdev->requests_total++;
218 qemu_iovec_init(&ioreq->v, BLKIF_MAX_SEGMENTS_PER_REQUEST);
219 } else {
220 /* get one from freelist */
221 ioreq = QLIST_FIRST(&blkdev->freelist);
222 QLIST_REMOVE(ioreq, list);
223 }
224 QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
225 blkdev->requests_inflight++;
226
227 out:
228 return ioreq;
229 }
230
231 static void ioreq_finish(struct ioreq *ioreq)
232 {
233 struct XenBlkDev *blkdev = ioreq->blkdev;
234
235 QLIST_REMOVE(ioreq, list);
236 QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
237 blkdev->requests_inflight--;
238 blkdev->requests_finished++;
239 }
240
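/* Reset a request and put it back on the freelist; 'finish' tells whether it
 * is currently accounted as finished or as inflight. */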
241 static void ioreq_release(struct ioreq *ioreq, bool finish)
242 {
243 struct XenBlkDev *blkdev = ioreq->blkdev;
244
245 QLIST_REMOVE(ioreq, list);
246 ioreq_reset(ioreq);
247 ioreq->blkdev = blkdev;
248 QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
249 if (finish) {
250 blkdev->requests_finished--;
251 } else {
252 blkdev->requests_inflight--;
253 }
254 }
255
256 /*
257 * translate request into iovec + start offset
258 * do sanity checks along the way
259 */
260 static int ioreq_parse(struct ioreq *ioreq)
261 {
262 struct XenBlkDev *blkdev = ioreq->blkdev;
263 uintptr_t mem;
264 size_t len;
265 int i;
266
267 xen_be_printf(&blkdev->xendev, 3,
268 "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
269 ioreq->req.operation, ioreq->req.nr_segments,
270 ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
271 switch (ioreq->req.operation) {
272 case BLKIF_OP_READ:
273 ioreq->prot = PROT_WRITE; /* to memory */
274 break;
275 case BLKIF_OP_FLUSH_DISKCACHE:
276 ioreq->presync = 1;
277 if (!ioreq->req.nr_segments) {
278 return 0;
279 }
280 /* fall through */
281 case BLKIF_OP_WRITE:
282 ioreq->prot = PROT_READ; /* from memory */
283 break;
284 case BLKIF_OP_DISCARD:
285 return 0;
286 default:
287 xen_be_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
288 ioreq->req.operation);
289 goto err;
290 };
291
292 if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
293 xen_be_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
294 goto err;
295 }
296
297 ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
298 for (i = 0; i < ioreq->req.nr_segments; i++) {
299 if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
300 xen_be_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
301 goto err;
302 }
303 if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
304 xen_be_printf(&blkdev->xendev, 0, "error: first > last sector\n");
305 goto err;
306 }
307 if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
308 xen_be_printf(&blkdev->xendev, 0, "error: page crossing\n");
309 goto err;
310 }
311
312 ioreq->domids[i] = blkdev->xendev.dom;
313 ioreq->refs[i] = ioreq->req.seg[i].gref;
314
315 mem = ioreq->req.seg[i].first_sect * blkdev->file_blk;
316 len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
317 qemu_iovec_add(&ioreq->v, (void*)mem, len);
318 }
319 if (ioreq->start + ioreq->v.size > blkdev->file_size) {
320 xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
321 goto err;
322 }
323 return 0;
324
325 err:
326 ioreq->status = BLKIF_RSP_ERROR;
327 return -1;
328 }
329
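/* Tear down the grant mappings set up by ioreq_map; pages promoted to
 * persistent grants are excluded via num_unmap and stay mapped. */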
330 static void ioreq_unmap(struct ioreq *ioreq)
331 {
332 XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
333 int i;
334
335 if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
336 return;
337 }
338 if (batch_maps) {
339 if (!ioreq->pages) {
340 return;
341 }
342 if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
343 xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
344 strerror(errno));
345 }
346 ioreq->blkdev->cnt_map -= ioreq->num_unmap;
347 ioreq->pages = NULL;
348 } else {
349 for (i = 0; i < ioreq->num_unmap; i++) {
350 if (!ioreq->page[i]) {
351 continue;
352 }
353 if (xc_gnttab_munmap(gnt, ioreq->page[i], 1) != 0) {
354 xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
355 strerror(errno));
356 }
357 ioreq->blkdev->cnt_map--;
358 ioreq->page[i] = NULL;
359 }
360 }
361 ioreq->mapped = 0;
362 }
363
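/* Map the grant references of this request, reusing persistent grants where
 * possible and optionally promoting new mappings to persistent ones. */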
364 static int ioreq_map(struct ioreq *ioreq)
365 {
366 XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
367 uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
368 uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
369 void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
370 int i, j, new_maps = 0;
371 PersistentGrant *grant;
372 PersistentRegion *region;
373 /* The domids and refs arrays contain the information needed to map
374 * the grants required to fulfill this request.
375 *
376 * After mapping the needed grants, the page array will contain the
377 * memory address of each granted page in the order specified in ioreq
378 * (regardless of whether it is a persistent grant or not).
379 */
380
381 if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
382 return 0;
383 }
384 if (ioreq->blkdev->feature_persistent) {
385 for (i = 0; i < ioreq->v.niov; i++) {
386 grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
387 GUINT_TO_POINTER(ioreq->refs[i]));
388
389 if (grant != NULL) {
390 page[i] = grant->page;
391 xen_be_printf(&ioreq->blkdev->xendev, 3,
392 "using persistent-grant %" PRIu32 "\n",
393 ioreq->refs[i]);
394 } else {
395 /* Add the grant to the list of grants that
396 * should be mapped
397 */
398 domids[new_maps] = ioreq->domids[i];
399 refs[new_maps] = ioreq->refs[i];
400 page[i] = NULL;
401 new_maps++;
402 }
403 }
404 /* Set the protection to RW, since grants may be reused later
405 * with a different protection than the one needed for this request
406 */
407 ioreq->prot = PROT_WRITE | PROT_READ;
408 } else {
409 /* All grants in the request should be mapped */
410 memcpy(refs, ioreq->refs, sizeof(refs));
411 memcpy(domids, ioreq->domids, sizeof(domids));
412 memset(page, 0, sizeof(page));
413 new_maps = ioreq->v.niov;
414 }
415
416 if (batch_maps && new_maps) {
417 ioreq->pages = xc_gnttab_map_grant_refs
418 (gnt, new_maps, domids, refs, ioreq->prot);
419 if (ioreq->pages == NULL) {
420 xen_be_printf(&ioreq->blkdev->xendev, 0,
421 "can't map %d grant refs (%s, %d maps)\n",
422 new_maps, strerror(errno), ioreq->blkdev->cnt_map);
423 return -1;
424 }
425 for (i = 0, j = 0; i < ioreq->v.niov; i++) {
426 if (page[i] == NULL) {
427 page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
428 }
429 }
430 ioreq->blkdev->cnt_map += new_maps;
431 } else if (new_maps) {
432 for (i = 0; i < new_maps; i++) {
433 ioreq->page[i] = xc_gnttab_map_grant_ref
434 (gnt, domids[i], refs[i], ioreq->prot);
435 if (ioreq->page[i] == NULL) {
436 xen_be_printf(&ioreq->blkdev->xendev, 0,
437 "can't map grant ref %d (%s, %d maps)\n",
438 refs[i], strerror(errno), ioreq->blkdev->cnt_map);
439 ioreq->mapped = 1;
440 ioreq_unmap(ioreq);
441 return -1;
442 }
443 ioreq->blkdev->cnt_map++;
444 }
445 for (i = 0, j = 0; i < ioreq->v.niov; i++) {
446 if (page[i] == NULL) {
447 page[i] = ioreq->page[j++];
448 }
449 }
450 }
451 if (ioreq->blkdev->feature_persistent && new_maps != 0 &&
452 (!batch_maps || (ioreq->blkdev->persistent_gnt_count + new_maps <=
453 ioreq->blkdev->max_grants))) {
454 /*
455 * If we are using persistent grants and batch mappings, only
456 * add the new maps to the list of persistent grants if the whole
457 * area can be persistently mapped.
458 */
459 if (batch_maps) {
460 region = g_malloc0(sizeof(*region));
461 region->addr = ioreq->pages;
462 region->num = new_maps;
463 ioreq->blkdev->persistent_regions = g_slist_append(
464 ioreq->blkdev->persistent_regions,
465 region);
466 }
467 while ((ioreq->blkdev->persistent_gnt_count < ioreq->blkdev->max_grants)
468 && new_maps) {
469 /* Go through the list of newly mapped grants and add as many
470 * as possible to the list of persistently mapped grants.
471 *
472 * Since we start at the end of ioreq->page(s), we only need
473 * to decrease new_maps to prevent these granted pages from
474 * being unmapped in ioreq_unmap.
475 */
476 grant = g_malloc0(sizeof(*grant));
477 new_maps--;
478 if (batch_maps) {
479 grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
480 } else {
481 grant->page = ioreq->page[new_maps];
482 }
483 grant->blkdev = ioreq->blkdev;
484 xen_be_printf(&ioreq->blkdev->xendev, 3,
485 "adding grant %" PRIu32 " page: %p\n",
486 refs[new_maps], grant->page);
487 g_tree_insert(ioreq->blkdev->persistent_gnts,
488 GUINT_TO_POINTER(refs[new_maps]),
489 grant);
490 ioreq->blkdev->persistent_gnt_count++;
491 }
492 assert(!batch_maps || new_maps == 0);
493 }
494 for (i = 0; i < ioreq->v.niov; i++) {
495 ioreq->v.iov[i].iov_base += (uintptr_t)page[i];
496 }
497 ioreq->mapped = 1;
498 ioreq->num_unmap = new_maps;
499 return 0;
500 }
501
502 static int ioreq_runio_qemu_aio(struct ioreq *ioreq);
503
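/* Completion callback for all aio issued by ioreq_runio_qemu_aio: handle the
 * pre/post flush phases and, once nothing is left in flight, record the
 * status and schedule the bottom half to send the response. */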
504 static void qemu_aio_complete(void *opaque, int ret)
505 {
506 struct ioreq *ioreq = opaque;
507
508 if (ret != 0) {
509 xen_be_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
510 ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
511 ioreq->aio_errors++;
512 }
513
514 ioreq->aio_inflight--;
515 if (ioreq->presync) {
516 ioreq->presync = 0;
517 ioreq_runio_qemu_aio(ioreq);
518 return;
519 }
520 if (ioreq->aio_inflight > 0) {
521 return;
522 }
523 if (ioreq->postsync) {
524 ioreq->postsync = 0;
525 ioreq->aio_inflight++;
526 blk_aio_flush(ioreq->blkdev->blk, qemu_aio_complete, ioreq);
527 return;
528 }
529
530 ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
531 ioreq_unmap(ioreq);
532 ioreq_finish(ioreq);
533 switch (ioreq->req.operation) {
534 case BLKIF_OP_WRITE:
535 case BLKIF_OP_FLUSH_DISKCACHE:
536 if (!ioreq->req.nr_segments) {
537 break;
538 }
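/* fall through */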
539 case BLKIF_OP_READ:
540 block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
541 break;
542 case BLKIF_OP_DISCARD:
543 default:
544 break;
545 }
546 qemu_bh_schedule(ioreq->blkdev->bh);
547 }
548
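/* Map the request's grants and submit it to the block layer as aio. */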
549 static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
550 {
551 struct XenBlkDev *blkdev = ioreq->blkdev;
552
553 if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) {
554 goto err_no_map;
555 }
556
557 ioreq->aio_inflight++;
558 if (ioreq->presync) {
559 blk_aio_flush(ioreq->blkdev->blk, qemu_aio_complete, ioreq);
560 return 0;
561 }
562
563 switch (ioreq->req.operation) {
564 case BLKIF_OP_READ:
565 block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
566 ioreq->v.size, BLOCK_ACCT_READ);
567 ioreq->aio_inflight++;
568 blk_aio_readv(blkdev->blk, ioreq->start / BLOCK_SIZE,
569 &ioreq->v, ioreq->v.size / BLOCK_SIZE,
570 qemu_aio_complete, ioreq);
571 break;
572 case BLKIF_OP_WRITE:
573 case BLKIF_OP_FLUSH_DISKCACHE:
574 if (!ioreq->req.nr_segments) {
575 break;
576 }
577
578 block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
579 ioreq->v.size, BLOCK_ACCT_WRITE);
580 ioreq->aio_inflight++;
581 blk_aio_writev(blkdev->blk, ioreq->start / BLOCK_SIZE,
582 &ioreq->v, ioreq->v.size / BLOCK_SIZE,
583 qemu_aio_complete, ioreq);
584 break;
585 case BLKIF_OP_DISCARD:
586 {
587 struct blkif_request_discard *discard_req = (void *)&ioreq->req;
588 ioreq->aio_inflight++;
589 blk_aio_discard(blkdev->blk,
590 discard_req->sector_number, discard_req->nr_sectors,
591 qemu_aio_complete, ioreq);
592 break;
593 }
594 default:
595 /* unknown operation (shouldn't happen -- parse catches this) */
596 goto err;
597 }
598
599 qemu_aio_complete(ioreq, 0);
600
601 return 0;
602
603 err:
604 ioreq_unmap(ioreq);
605 err_no_map:
606 ioreq_finish(ioreq);
607 ioreq->status = BLKIF_RSP_ERROR;
608 return -1;
609 }
610
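/* Place the response for one finished request on the shared ring; returns
 * nonzero if the frontend should be notified. */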
611 static int blk_send_response_one(struct ioreq *ioreq)
612 {
613 struct XenBlkDev *blkdev = ioreq->blkdev;
614 int send_notify = 0;
615 int have_requests = 0;
616 blkif_response_t resp;
617 void *dst;
618
619 resp.id = ioreq->req.id;
620 resp.operation = ioreq->req.operation;
621 resp.status = ioreq->status;
622
623 /* Place on the response ring for the relevant domain. */
624 switch (blkdev->protocol) {
625 case BLKIF_PROTOCOL_NATIVE:
626 dst = RING_GET_RESPONSE(&blkdev->rings.native, blkdev->rings.native.rsp_prod_pvt);
627 break;
628 case BLKIF_PROTOCOL_X86_32:
629 dst = RING_GET_RESPONSE(&blkdev->rings.x86_32_part,
630 blkdev->rings.x86_32_part.rsp_prod_pvt);
631 break;
632 case BLKIF_PROTOCOL_X86_64:
633 dst = RING_GET_RESPONSE(&blkdev->rings.x86_64_part,
634 blkdev->rings.x86_64_part.rsp_prod_pvt);
635 break;
636 default:
637 dst = NULL;
638 return 0;
639 }
640 memcpy(dst, &resp, sizeof(resp));
641 blkdev->rings.common.rsp_prod_pvt++;
642
643 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
644 if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
645 /*
646 * Tail check for pending requests. Allows frontend to avoid
647 * notifications if requests are already in flight (lower
648 * overheads and promotes batching).
649 */
650 RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
651 } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
652 have_requests = 1;
653 }
654
655 if (have_requests) {
656 blkdev->more_work++;
657 }
658 return send_notify;
659 }
660
661 /* walk finished list, send outstanding responses, free requests */
662 static void blk_send_response_all(struct XenBlkDev *blkdev)
663 {
664 struct ioreq *ioreq;
665 int send_notify = 0;
666
667 while (!QLIST_EMPTY(&blkdev->finished)) {
668 ioreq = QLIST_FIRST(&blkdev->finished);
669 send_notify += blk_send_response_one(ioreq);
670 ioreq_release(ioreq, true);
671 }
672 if (send_notify) {
673 xen_be_send_notify(&blkdev->xendev);
674 }
675 }
676
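/* Copy the request at ring index 'rc' out of the shared ring, converting
 * from the x86 32-bit or 64-bit ABI layout if needed. */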
677 static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq, RING_IDX rc)
678 {
679 switch (blkdev->protocol) {
680 case BLKIF_PROTOCOL_NATIVE:
681 memcpy(&ioreq->req, RING_GET_REQUEST(&blkdev->rings.native, rc),
682 sizeof(ioreq->req));
683 break;
684 case BLKIF_PROTOCOL_X86_32:
685 blkif_get_x86_32_req(&ioreq->req,
686 RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
687 break;
688 case BLKIF_PROTOCOL_X86_64:
689 blkif_get_x86_64_req(&ioreq->req,
690 RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));
691 break;
692 }
693 return 0;
694 }
695
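/* Bottom-half worker: send pending responses, then pull new requests from
 * the ring and start them, rescheduling itself while work remains. */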
696 static void blk_handle_requests(struct XenBlkDev *blkdev)
697 {
698 RING_IDX rc, rp;
699 struct ioreq *ioreq;
700
701 blkdev->more_work = 0;
702
703 rc = blkdev->rings.common.req_cons;
704 rp = blkdev->rings.common.sring->req_prod;
705 xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
706
707 blk_send_response_all(blkdev);
708 while (rc != rp) {
709 /* pull request from ring */
710 if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
711 break;
712 }
713 ioreq = ioreq_start(blkdev);
714 if (ioreq == NULL) {
715 blkdev->more_work++;
716 break;
717 }
718 blk_get_request(blkdev, ioreq, rc);
719 blkdev->rings.common.req_cons = ++rc;
720
721 /* parse them */
722 if (ioreq_parse(ioreq) != 0) {
723 if (blk_send_response_one(ioreq)) {
724 xen_be_send_notify(&blkdev->xendev);
725 }
726 ioreq_release(ioreq, false);
727 continue;
728 }
729
730 ioreq_runio_qemu_aio(ioreq);
731 }
732
733 if (blkdev->more_work && blkdev->requests_inflight < max_requests) {
734 qemu_bh_schedule(blkdev->bh);
735 }
736 }
737
738 /* ------------------------------------------------------------- */
739
740 static void blk_bh(void *opaque)
741 {
742 struct XenBlkDev *blkdev = opaque;
743 blk_handle_requests(blkdev);
744 }
745
746 /*
747 * We need to account for the grant allocations requiring contiguous
748 * chunks; the worst case number would be
749 * max_req * max_seg + (max_req - 1) * (max_seg - 1) + 1,
750 * but in order to keep things simple just use
751 * 2 * max_req * max_seg.
752 */
753 #define MAX_GRANTS(max_req, max_seg) (2 * (max_req) * (max_seg))
754
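/* .alloc callback: set up the request lists and the bottom half, enable
 * batch grant mappings unless running in Xen emulation mode, and set the
 * grant-table mapping limit. */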
755 static void blk_alloc(struct XenDevice *xendev)
756 {
757 struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
758
759 QLIST_INIT(&blkdev->inflight);
760 QLIST_INIT(&blkdev->finished);
761 QLIST_INIT(&blkdev->freelist);
762 blkdev->bh = qemu_bh_new(blk_bh, blkdev);
763 if (xen_mode != XEN_EMULATE) {
764 batch_maps = 1;
765 }
766 if (xc_gnttab_set_max_grants(xendev->gnttabdev,
767 MAX_GRANTS(max_requests, BLKIF_MAX_SEGMENTS_PER_REQUEST)) < 0) {
768 xen_be_printf(xendev, 0, "xc_gnttab_set_max_grants failed: %s\n",
769 strerror(errno));
770 }
771 }
772
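/* Advertise discard support unless it was disabled via the backend's
 * "discard-enable" xenstore node. */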
773 static void blk_parse_discard(struct XenBlkDev *blkdev)
774 {
775 int enable;
776
777 blkdev->feature_discard = true;
778
779 if (xenstore_read_be_int(&blkdev->xendev, "discard-enable", &enable) == 0) {
780 blkdev->feature_discard = !!enable;
781 }
782
783 if (blkdev->feature_discard) {
784 xenstore_write_be_int(&blkdev->xendev, "feature-discard", 1);
785 }
786 }
787
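/* .init callback: read the backend configuration from xenstore and
 * advertise the supported features (flush, persistent grants, discard). */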
788 static int blk_init(struct XenDevice *xendev)
789 {
790 struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
791 int info = 0;
792 char *directiosafe = NULL;
793
794 /* read xenstore entries */
795 if (blkdev->params == NULL) {
796 char *h = NULL;
797 blkdev->params = xenstore_read_be_str(&blkdev->xendev, "params");
798 if (blkdev->params != NULL) {
799 h = strchr(blkdev->params, ':');
800 }
801 if (h != NULL) {
802 blkdev->fileproto = blkdev->params;
803 blkdev->filename = h+1;
804 *h = 0;
805 } else {
806 blkdev->fileproto = "<unset>";
807 blkdev->filename = blkdev->params;
808 }
809 }
810 if (!strcmp("aio", blkdev->fileproto)) {
811 blkdev->fileproto = "raw";
812 }
813 if (blkdev->mode == NULL) {
814 blkdev->mode = xenstore_read_be_str(&blkdev->xendev, "mode");
815 }
816 if (blkdev->type == NULL) {
817 blkdev->type = xenstore_read_be_str(&blkdev->xendev, "type");
818 }
819 if (blkdev->dev == NULL) {
820 blkdev->dev = xenstore_read_be_str(&blkdev->xendev, "dev");
821 }
822 if (blkdev->devtype == NULL) {
823 blkdev->devtype = xenstore_read_be_str(&blkdev->xendev, "device-type");
824 }
825 directiosafe = xenstore_read_be_str(&blkdev->xendev, "direct-io-safe");
826 blkdev->directiosafe = (directiosafe && atoi(directiosafe));
827
828 /* do we have all we need? */
829 if (blkdev->params == NULL ||
830 blkdev->mode == NULL ||
831 blkdev->type == NULL ||
832 blkdev->dev == NULL) {
833 goto out_error;
834 }
835
836 /* read-only ? */
837 if (strcmp(blkdev->mode, "w")) {
838 info |= VDISK_READONLY;
839 }
840
841 /* cdrom ? */
842 if (blkdev->devtype && !strcmp(blkdev->devtype, "cdrom")) {
843 info |= VDISK_CDROM;
844 }
845
846 blkdev->file_blk = BLOCK_SIZE;
847
848 /* fill info
849 * blk_connect supplies sector-size and sectors
850 */
851 xenstore_write_be_int(&blkdev->xendev, "feature-flush-cache", 1);
852 xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
853 xenstore_write_be_int(&blkdev->xendev, "info", info);
854
855 blk_parse_discard(blkdev);
856
857 g_free(directiosafe);
858 return 0;
859
860 out_error:
861 g_free(blkdev->params);
862 blkdev->params = NULL;
863 g_free(blkdev->mode);
864 blkdev->mode = NULL;
865 g_free(blkdev->type);
866 blkdev->type = NULL;
867 g_free(blkdev->dev);
868 blkdev->dev = NULL;
869 g_free(blkdev->devtype);
870 blkdev->devtype = NULL;
871 g_free(directiosafe);
872 blkdev->directiosafe = false;
873 return -1;
874 }
875
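/* .initialise callback: open (or reuse) the block backend, publish the disk
 * geometry, map the shared ring and bind the frontend's event channel. */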
876 static int blk_connect(struct XenDevice *xendev)
877 {
878 struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
879 int pers, index, qflags;
880 bool readonly = true;
881
882 /* work out the block-backend open flags */
883 if (blkdev->directiosafe) {
884 qflags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO;
885 } else {
886 qflags = BDRV_O_CACHE_WB;
887 }
888 if (strcmp(blkdev->mode, "w") == 0) {
889 qflags |= BDRV_O_RDWR;
890 readonly = false;
891 }
892 if (blkdev->feature_discard) {
893 qflags |= BDRV_O_UNMAP;
894 }
895
896 /* init qemu block driver */
897 index = (blkdev->xendev.dev - 202 * 256) / 16;
898 blkdev->dinfo = drive_get(IF_XEN, 0, index);
899 if (!blkdev->dinfo) {
900 Error *local_err = NULL;
901 QDict *options = NULL;
902
903 if (strcmp(blkdev->fileproto, "<unset>")) {
904 options = qdict_new();
905 qdict_put(options, "driver", qstring_from_str(blkdev->fileproto));
906 }
907
908 /* setup via xenbus -> create new block driver instance */
909 xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
910 blkdev->blk = blk_new_open(blkdev->dev, blkdev->filename, NULL, options,
911 qflags, &local_err);
912 if (!blkdev->blk) {
913 xen_be_printf(&blkdev->xendev, 0, "error: %s\n",
914 error_get_pretty(local_err));
915 error_free(local_err);
916 return -1;
917 }
918 } else {
919 /* setup via qemu cmdline -> already setup for us */
920 xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n");
921 blkdev->blk = blk_by_legacy_dinfo(blkdev->dinfo);
922 if (blk_is_read_only(blkdev->blk) && !readonly) {
923 xen_be_printf(&blkdev->xendev, 0, "Unexpected read-only drive");
924 blkdev->blk = NULL;
925 return -1;
926 }
927 /* blkdev->blk was not created by us; take a reference
928 * so we can blk_unref() unconditionally */
929 blk_ref(blkdev->blk);
930 }
931 blk_attach_dev_nofail(blkdev->blk, blkdev);
932 blkdev->file_size = blk_getlength(blkdev->blk);
933 if (blkdev->file_size < 0) {
934 xen_be_printf(&blkdev->xendev, 1, "blk_getlength: %d (%s) | drv %s\n",
935 (int)blkdev->file_size, strerror(-blkdev->file_size),
936 bdrv_get_format_name(blk_bs(blkdev->blk)) ?: "-");
937 blkdev->file_size = 0;
938 }
939
940 xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
941 " size %" PRId64 " (%" PRId64 " MB)\n",
942 blkdev->type, blkdev->fileproto, blkdev->filename,
943 blkdev->file_size, blkdev->file_size >> 20);
944
945 /* Fill in the sector size and the number of sectors */
946 xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk);
947 xenstore_write_be_int64(&blkdev->xendev, "sectors",
948 blkdev->file_size / blkdev->file_blk);
949
950 if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
951 return -1;
952 }
953 if (xenstore_read_fe_int(&blkdev->xendev, "event-channel",
954 &blkdev->xendev.remote_port) == -1) {
955 return -1;
956 }
957 if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
958 blkdev->feature_persistent = FALSE;
959 } else {
960 blkdev->feature_persistent = !!pers;
961 }
962
963 blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
964 if (blkdev->xendev.protocol) {
965 if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
966 blkdev->protocol = BLKIF_PROTOCOL_X86_32;
967 }
968 if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_64) == 0) {
969 blkdev->protocol = BLKIF_PROTOCOL_X86_64;
970 }
971 }
972
973 blkdev->sring = xc_gnttab_map_grant_ref(blkdev->xendev.gnttabdev,
974 blkdev->xendev.dom,
975 blkdev->ring_ref,
976 PROT_READ | PROT_WRITE);
977 if (!blkdev->sring) {
978 return -1;
979 }
980 blkdev->cnt_map++;
981
982 switch (blkdev->protocol) {
983 case BLKIF_PROTOCOL_NATIVE:
984 {
985 blkif_sring_t *sring_native = blkdev->sring;
986 BACK_RING_INIT(&blkdev->rings.native, sring_native, XC_PAGE_SIZE);
987 break;
988 }
989 case BLKIF_PROTOCOL_X86_32:
990 {
991 blkif_x86_32_sring_t *sring_x86_32 = blkdev->sring;
992
993 BACK_RING_INIT(&blkdev->rings.x86_32_part, sring_x86_32, XC_PAGE_SIZE);
994 break;
995 }
996 case BLKIF_PROTOCOL_X86_64:
997 {
998 blkif_x86_64_sring_t *sring_x86_64 = blkdev->sring;
999
1000 BACK_RING_INIT(&blkdev->rings.x86_64_part, sring_x86_64, XC_PAGE_SIZE);
1001 break;
1002 }
1003 }
1004
1005 if (blkdev->feature_persistent) {
1006 /* Init persistent grants */
1007 blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
1008 blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
1009 NULL, NULL,
1010 batch_maps ?
1011 (GDestroyNotify)g_free :
1012 (GDestroyNotify)destroy_grant);
1013 blkdev->persistent_regions = NULL;
1014 blkdev->persistent_gnt_count = 0;
1015 }
1016
1017 xen_be_bind_evtchn(&blkdev->xendev);
1018
1019 xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
1020 "remote port %d, local port %d\n",
1021 blkdev->xendev.protocol, blkdev->ring_ref,
1022 blkdev->xendev.remote_port, blkdev->xendev.local_port);
1023 return 0;
1024 }
1025
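/* Detach and release the block backend, unbind the event channel, unmap the
 * shared ring and unmap any persistent grants so the frontend can free them. */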
1026 static void blk_disconnect(struct XenDevice *xendev)
1027 {
1028 struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
1029
1030 if (blkdev->blk) {
1031 blk_detach_dev(blkdev->blk, blkdev);
1032 blk_unref(blkdev->blk);
1033 blkdev->blk = NULL;
1034 }
1035 xen_be_unbind_evtchn(&blkdev->xendev);
1036
1037 if (blkdev->sring) {
1038 xc_gnttab_munmap(blkdev->xendev.gnttabdev, blkdev->sring, 1);
1039 blkdev->cnt_map--;
1040 blkdev->sring = NULL;
1041 }
1042
1043 /*
1044 * Unmap persistent grants before switching to the closed state
1045 * so the frontend can free them.
1046 *
1047 * In the !batch_maps case g_tree_destroy will take care of unmapping
1048 * the grant, but in the batch_maps case we need to iterate over every
1049 * region in persistent_regions and unmap it.
1050 */
1051 if (blkdev->feature_persistent) {
1052 g_tree_destroy(blkdev->persistent_gnts);
1053 assert(batch_maps || blkdev->persistent_gnt_count == 0);
1054 if (batch_maps) {
1055 blkdev->persistent_gnt_count = 0;
1056 g_slist_foreach(blkdev->persistent_regions,
1057 (GFunc)remove_persistent_region, blkdev);
1058 g_slist_free(blkdev->persistent_regions);
1059 }
1060 blkdev->feature_persistent = false;
1061 }
1062 }
1063
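/* Final teardown: disconnect if still connected, then free the cached ioreq
 * structures and the per-device state. */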
1064 static int blk_free(struct XenDevice *xendev)
1065 {
1066 struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
1067 struct ioreq *ioreq;
1068
1069 if (blkdev->blk || blkdev->sring) {
1070 blk_disconnect(xendev);
1071 }
1072
1073 while (!QLIST_EMPTY(&blkdev->freelist)) {
1074 ioreq = QLIST_FIRST(&blkdev->freelist);
1075 QLIST_REMOVE(ioreq, list);
1076 qemu_iovec_destroy(&ioreq->v);
1077 g_free(ioreq);
1078 }
1079
1080 g_free(blkdev->params);
1081 g_free(blkdev->mode);
1082 g_free(blkdev->type);
1083 g_free(blkdev->dev);
1084 g_free(blkdev->devtype);
1085 qemu_bh_delete(blkdev->bh);
1086 return 0;
1087 }
1088
1089 static void blk_event(struct XenDevice *xendev)
1090 {
1091 struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
1092
1093 qemu_bh_schedule(blkdev->bh);
1094 }
1095
1096 struct XenDevOps xen_blkdev_ops = {
1097 .size = sizeof(struct XenBlkDev),
1098 .flags = DEVOPS_FLAG_NEED_GNTDEV,
1099 .alloc = blk_alloc,
1100 .init = blk_init,
1101 .initialise = blk_connect,
1102 .disconnect = blk_disconnect,
1103 .event = blk_event,
1104 .free = blk_free,
1105 };