drivers/block/rbd.c

   1 /*
   2    rbd.c -- Export ceph rados objects as a Linux block device
   3
   4
   5    based on drivers/block/osdblk.c:
   6
   7    Copyright 2009 Red Hat, Inc.
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; see the file COPYING.  If not, write to
  20    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  21
  22
  23
  24    For usage instructions, please refer to:
  25
  26                  Documentation/ABI/testing/sysfs-bus-rbd
  27
  28  */
  29
  30 #include <linux/ceph/libceph.h>
  31 #include <linux/ceph/osd_client.h>
  32 #include <linux/ceph/mon_client.h>
  33 #include <linux/ceph/decode.h>
  34 #include <linux/parser.h>
  35
  36 #include <linux/kernel.h>
  37 #include <linux/device.h>
  38 #include <linux/module.h>
  39 #include <linux/fs.h>
  40 #include <linux/blkdev.h>
  41
  42 #include "rbd_types.h"
  43
  44 #define RBD_DEBUG       /* Activate rbd_assert() calls */
  45
  46 /*
  47  * The basic unit of block I/O is a sector.  It is interpreted in a
  48  * number of contexts in Linux (blk, bio, genhd), but the default is
  49  * universally 512 bytes.  These symbols are just slightly more
  50  * meaningful than the bare numbers they represent.
  51  */
  52 #define SECTOR_SHIFT    9
  53 #define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)
  54
  55 #define RBD_DRV_NAME "rbd"
  56 #define RBD_DRV_NAME_LONG "rbd (rados block device)"
  57
  58 #define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */
  59
  60 #define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
  61 #define RBD_MAX_SNAP_NAME_LEN   \
  62                         (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
  63
  64 #define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */
  65
  66 #define RBD_SNAP_HEAD_NAME      "-"
  67
  68 /* This allows a single page to hold an image name sent by OSD */
  69 #define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
  70 #define RBD_IMAGE_ID_LEN_MAX    64
  71
  72 #define RBD_OBJ_PREFIX_LEN_MAX  64
  73
  74 /* Feature bits */
  75
  76 #define RBD_FEATURE_LAYERING    (1<<0)
  77 #define RBD_FEATURE_STRIPINGV2  (1<<1)
  78 #define RBD_FEATURES_ALL \
  79             (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
  80
  81 /* Features supported by this (client software) implementation. */
  82
  83 #define RBD_FEATURES_SUPPORTED  (0)
  84
  85 /*
  86  * An RBD device name will be "rbd#", where the "rbd" comes from
  87  * RBD_DRV_NAME above, and # is a unique integer identifier.
  88  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
  89  * enough to hold all possible device names.
  90  */
  91 #define DEV_NAME_LEN            32
  92 #define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)
  93
  94 /*
  95  * block device image metadata (in-memory version)
  96  */
  97 struct rbd_image_header {
  98         /* These four fields never change for a given rbd image */
  99         char *object_prefix;
 100         u64 features;
 101         __u8 obj_order;
 102         __u8 crypt_type;
 103         __u8 comp_type;
 104
 105         /* The remaining fields need to be updated occasionally */
 106         u64 image_size;
 107         struct ceph_snap_context *snapc;
 108         char *snap_names;
 109         u64 *snap_sizes;
 110
 111         u64 obj_version;
 112 };
 113
 114 /*
 115  * An rbd image specification.
 116  *
 117  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 118  * identify an image.  Each rbd_dev structure includes a pointer to
 119  * an rbd_spec structure that encapsulates this identity.
 120  *
 121  * Each of the id's in an rbd_spec has an associated name.  For a
 122  * user-mapped image, the names are supplied and the id's associated
 123  * with them are looked up.  For a layered image, a parent image is
 124  * defined by the tuple, and the names are looked up.
 125  *
 126  * An rbd_dev structure contains a parent_spec pointer which is
 127  * non-null if the image it represents is a child in a layered
 128  * image.  This pointer will refer to the rbd_spec structure used
 129  * by the parent rbd_dev for its own identity (i.e., the structure
 130  * is shared between the parent and child).
 131  *
 132  * Since these structures are populated once, during the discovery
 133  * phase of image construction, they are effectively immutable so
 134  * we make no effort to synchronize access to them.
 135  *
 136  * Note that code herein does not assume the image name is known (it
 137  * could be a null pointer).
 138  */
 139 struct rbd_spec {
 140         u64             pool_id;
 141         char            *pool_name;
 142
 143         char            *image_id;
 144         char            *image_name;
 145
 146         u64             snap_id;
 147         char            *snap_name;
 148
 149         struct kref     kref;
 150 };
 151
 152 /*
 153  * an instance of the client.  multiple devices may share an rbd client.
 154  */
 155 struct rbd_client {
 156         struct ceph_client      *client;
 157         struct kref             kref;
 158         struct list_head        node;
 159 };
 160
 161 struct rbd_img_request;
 162 typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
 163
 164 #define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */
 165
 166 struct rbd_obj_request;
 167 typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
 168
 169 enum obj_request_type {
 170         OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
 171 };
 172
 173 enum obj_req_flags {
 174         OBJ_REQ_DONE,           /* completion flag: not done = 0, done = 1 */
 175         OBJ_REQ_IMG_DATA,       /* object usage: standalone = 0, image = 1 */
 176 };
 177
 178 struct rbd_obj_request {
 179         const char              *object_name;
 180         u64                     offset;         /* object start byte */
 181         u64                     length;         /* bytes from offset */
 182         unsigned long           flags;
 183
 184         struct rbd_img_request  *img_request;
 185         u64                     img_offset;     /* image relative offset */
 186         struct list_head        links;          /* img_request->obj_requests */
 187         u32                     which;          /* posn image request list */
 188
 189         enum obj_request_type   type;
 190         union {
 191                 struct bio      *bio_list;
 192                 struct {
 193                         struct page     **pages;
 194                         u32             page_count;
 195                 };
 196         };
 197
 198         struct ceph_osd_request *osd_req;
 199
 200         u64                     xferred;        /* bytes transferred */
 201         u64                     version;
 202         int                     result;
 203
 204         rbd_obj_callback_t      callback;
 205         struct completion       completion;
 206
 207         struct kref             kref;
 208 };
 209
 210 enum img_req_flags {
 211         IMG_REQ_WRITE,          /* I/O direction: read = 0, write = 1 */
 212         IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
 213         IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
 214 };
 215
 216 struct rbd_img_request {
 217         struct rbd_device       *rbd_dev;
 218         u64                     offset; /* starting image byte offset */
 219         u64                     length; /* byte count from offset */
 220         unsigned long           flags;
 221         union {
 222                 u64                     snap_id;        /* for reads */
 223                 struct ceph_snap_context *snapc;        /* for writes */
 224         };
 225         union {
 226                 struct request          *rq;            /* block request */
 227                 struct rbd_obj_request  *obj_request;   /* obj req initiator */
 228         };
 229         spinlock_t              completion_lock;/* protects next_completion */
 230         u32                     next_completion;
 231         rbd_img_callback_t      callback;
 232         u64                     xferred;/* aggregate bytes transferred */
 233         int                     result; /* first nonzero obj_request result */
 234
 235         u32                     obj_request_count;
 236         struct list_head        obj_requests;   /* rbd_obj_request structs */
 237
 238         struct kref             kref;
 239 };
 240
 241 #define for_each_obj_request(ireq, oreq) \
 242         list_for_each_entry(oreq, &(ireq)->obj_requests, links)
 243 #define for_each_obj_request_from(ireq, oreq) \
 244         list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
 245 #define for_each_obj_request_safe(ireq, oreq, n) \
 246         list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
 247
 248 struct rbd_snap {
 249         struct  device          dev;
 250         const char              *name;
 251         u64                     size;
 252         struct list_head        node;
 253         u64                     id;
 254         u64                     features;
 255 };
 256
 257 struct rbd_mapping {
 258         u64                     size;
 259         u64                     features;
 260         bool                    read_only;
 261 };
 262
 263 /*
 264  * a single device
 265  */
 266 struct rbd_device {
 267         int                     dev_id;         /* blkdev unique id */
 268
 269         int                     major;          /* blkdev assigned major */
 270         struct gendisk          *disk;          /* blkdev's gendisk and rq */
 271
 272         u32                     image_format;   /* Either 1 or 2 */
 273         struct rbd_client       *rbd_client;
 274
 275         char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 276
 277         spinlock_t              lock;           /* queue, flags, open_count */
 278
 279         struct rbd_image_header header;
 280         unsigned long           flags;          /* possibly lock protected */
 281         struct rbd_spec         *spec;
 282
 283         char                    *header_name;
 284
 285         struct ceph_file_layout layout;
 286
 287         struct ceph_osd_event   *watch_event;
 288         struct rbd_obj_request  *watch_request;
 289
 290         struct rbd_spec         *parent_spec;
 291         u64                     parent_overlap;
 292         struct rbd_device       *parent;
 293
 294         /* protects updating the header */
 295         struct rw_semaphore     header_rwsem;
 296
 297         struct rbd_mapping      mapping;
 298
 299         struct list_head        node;
 300
 301         /* list of snapshots */
 302         struct list_head        snaps;
 303
 304         /* sysfs related */
 305         struct device           dev;
 306         unsigned long           open_count;     /* protected by lock */
 307 };
 308
 309 /*
 310  * Flag bits for rbd_dev->flags.  If atomicity is required,
 311  * rbd_dev->lock is used to protect access.
 312  *
 313  * Currently, only the "removing" flag (which is coupled with the
 314  * "open_count" field) requires atomic access.
 315  */
 316 enum rbd_dev_flags {
 317         RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
 318         RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
 319 };
 320
 321 static DEFINE_MUTEX(ctl_mutex);   /* Serialize open/close/setup/teardown */
 322
 323 static LIST_HEAD(rbd_dev_list);    /* devices */
 324 static DEFINE_SPINLOCK(rbd_dev_list_lock);
 325
 326 static LIST_HEAD(rbd_client_list);              /* clients */
 327 static DEFINE_SPINLOCK(rbd_client_list_lock);
 328
 329 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
 330 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
 331
 332 static void rbd_dev_release(struct device *dev);
 333 static void rbd_remove_snap_dev(struct rbd_snap *snap);
 334
 335 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 336                        size_t count);
 337 static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
 338                           size_t count);
 339 static int rbd_dev_probe(struct rbd_device *rbd_dev);
 340
 341 static struct bus_attribute rbd_bus_attrs[] = {
 342         __ATTR(add, S_IWUSR, NULL, rbd_add),
 343         __ATTR(remove, S_IWUSR, NULL, rbd_remove),
 344         __ATTR_NULL
 345 };
 346
 347 static struct bus_type rbd_bus_type = {
 348         .name           = "rbd",
 349         .bus_attrs      = rbd_bus_attrs,
 350 };
 351
 352 static void rbd_root_dev_release(struct device *dev)
 353 {
 354 }
 355
 356 static struct device rbd_root_dev = {
 357         .init_name =    "rbd",
 358         .release =      rbd_root_dev_release,
 359 };
 360
 361 static __printf(2, 3)
 362 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 363 {
 364         struct va_format vaf;
 365         va_list args;
 366
 367         va_start(args, fmt);
 368         vaf.fmt = fmt;
 369         vaf.va = &args;
 370
 371         if (!rbd_dev)
 372                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 373         else if (rbd_dev->disk)
 374                 printk(KERN_WARNING "%s: %s: %pV\n",
 375                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 376         else if (rbd_dev->spec && rbd_dev->spec->image_name)
 377                 printk(KERN_WARNING "%s: image %s: %pV\n",
 378                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 379         else if (rbd_dev->spec && rbd_dev->spec->image_id)
 380                 printk(KERN_WARNING "%s: id %s: %pV\n",
 381                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 382         else    /* punt */
 383                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 384                         RBD_DRV_NAME, rbd_dev, &vaf);
 385         va_end(args);
 386 }
 387
 388 #ifdef RBD_DEBUG
 389 #define rbd_assert(expr)                                                \
 390                 if (unlikely(!(expr))) {                                \
 391                         printk(KERN_ERR "\nAssertion failure in %s() "  \
 392                                                 "at line %d:\n\n"       \
 393                                         "\trbd_assert(%s);\n\n",        \
 394                                         __func__, __LINE__, #expr);     \
 395                         BUG();                                          \
 396                 }
 397 #else /* !RBD_DEBUG */
 398 #  define rbd_assert(expr)      ((void) 0)
 399 #endif /* !RBD_DEBUG */
 400
 401 static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
 402
 403 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
 404 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 405
 406 static int rbd_open(struct block_device *bdev, fmode_t mode)
 407 {
 408         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 409         bool removing = false;
 410
 411         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 412                 return -EROFS;
 413
 414         spin_lock_irq(&rbd_dev->lock);
 415         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 416                 removing = true;
 417         else
 418                 rbd_dev->open_count++;
 419         spin_unlock_irq(&rbd_dev->lock);
 420         if (removing)
 421                 return -ENOENT;
 422
 423         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 424         (void) get_device(&rbd_dev->dev);
 425         set_device_ro(bdev, rbd_dev->mapping.read_only);
 426         mutex_unlock(&ctl_mutex);
 427
 428         return 0;
 429 }
 430
 431 static int rbd_release(struct gendisk *disk, fmode_t mode)
 432 {
 433         struct rbd_device *rbd_dev = disk->private_data;
 434         unsigned long open_count_before;
 435
 436         spin_lock_irq(&rbd_dev->lock);
 437         open_count_before = rbd_dev->open_count--;
 438         spin_unlock_irq(&rbd_dev->lock);
 439         rbd_assert(open_count_before > 0);
 440
 441         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 442         put_device(&rbd_dev->dev);
 443         mutex_unlock(&ctl_mutex);
 444
 445         return 0;
 446 }
 447
 448 static const struct block_device_operations rbd_bd_ops = {
 449         .owner                  = THIS_MODULE,
 450         .open                   = rbd_open,
 451         .release                = rbd_release,
 452 };
 453
 454 /*
 455  * Initialize an rbd client instance.
 456  * We own *ceph_opts.
 457  */
 458 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 459 {
 460         struct rbd_client *rbdc;
 461         int ret = -ENOMEM;
 462
 463         dout("%s:\n", __func__);
 464         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 465         if (!rbdc)
 466                 goto out_opt;
 467
 468         kref_init(&rbdc->kref);
 469         INIT_LIST_HEAD(&rbdc->node);
 470
 471         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 472
 473         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 474         if (IS_ERR(rbdc->client))
 475                 goto out_mutex;
 476         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 477
 478         ret = ceph_open_session(rbdc->client);
 479         if (ret < 0)
 480                 goto out_err;
 481
 482         spin_lock(&rbd_client_list_lock);
 483         list_add_tail(&rbdc->node, &rbd_client_list);
 484         spin_unlock(&rbd_client_list_lock);
 485
 486         mutex_unlock(&ctl_mutex);
 487         dout("%s: rbdc %p\n", __func__, rbdc);
 488
 489         return rbdc;
 490
 491 out_err:
 492         ceph_destroy_client(rbdc->client);
 493 out_mutex:
 494         mutex_unlock(&ctl_mutex);
 495         kfree(rbdc);
 496 out_opt:
 497         if (ceph_opts)
 498                 ceph_destroy_options(ceph_opts);
 499         dout("%s: error %d\n", __func__, ret);
 500
 501         return ERR_PTR(ret);
 502 }
 503
 504 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 505 {
 506         kref_get(&rbdc->kref);
 507
 508         return rbdc;
 509 }
 510
 511 /*
 512  * Find a ceph client with specific addr and configuration.  If
 513  * found, bump its reference count.
 514  */
 515 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 516 {
 517         struct rbd_client *client_node;
 518         bool found = false;
 519
 520         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 521                 return NULL;
 522
 523         spin_lock(&rbd_client_list_lock);
 524         list_for_each_entry(client_node, &rbd_client_list, node) {
 525                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
 526                         __rbd_get_client(client_node);
 527
 528                         found = true;
 529                         break;
 530                 }
 531         }
 532         spin_unlock(&rbd_client_list_lock);
 533
 534         return found ? client_node : NULL;
 535 }
 536
 537 /*
 538  * mount options
 539  */
 540 enum {
 541         Opt_last_int,
 542         /* int args above */
 543         Opt_last_string,
 544         /* string args above */
 545         Opt_read_only,
 546         Opt_read_write,
 547         /* Boolean args above */
 548         Opt_last_bool,
 549 };
 550
 551 static match_table_t rbd_opts_tokens = {
 552         /* int args above */
 553         /* string args above */
 554         {Opt_read_only, "read_only"},
 555         {Opt_read_only, "ro"},          /* Alternate spelling */
 556         {Opt_read_write, "read_write"},
 557         {Opt_read_write, "rw"},         /* Alternate spelling */
 558         /* Boolean args above */
 559         {-1, NULL}
 560 };
 561
 562 struct rbd_options {
 563         bool    read_only;
 564 };
 565
 566 #define RBD_READ_ONLY_DEFAULT   false
 567
 568 static int parse_rbd_opts_token(char *c, void *private)
 569 {
 570         struct rbd_options *rbd_opts = private;
 571         substring_t argstr[MAX_OPT_ARGS];
 572         int token, intval, ret;
 573
 574         token = match_token(c, rbd_opts_tokens, argstr);
 575         if (token < 0)
 576                 return -EINVAL;
 577
 578         if (token < Opt_last_int) {
 579                 ret = match_int(&argstr[0], &intval);
 580                 if (ret < 0) {
 581                         pr_err("bad mount option arg (not int) "
 582                                "at '%s'\n", c);
 583                         return ret;
 584                 }
 585                 dout("got int token %d val %d\n", token, intval);
 586         } else if (token > Opt_last_int && token < Opt_last_string) {
 587                 dout("got string token %d val %s\n", token,
 588                      argstr[0].from);
 589         } else if (token > Opt_last_string && token < Opt_last_bool) {
 590                 dout("got Boolean token %d\n", token);
 591         } else {
 592                 dout("got token %d\n", token);
 593         }
 594
 595         switch (token) {
 596         case Opt_read_only:
 597                 rbd_opts->read_only = true;
 598                 break;
 599         case Opt_read_write:
 600                 rbd_opts->read_only = false;
 601                 break;
 602         default:
 603                 rbd_assert(false);
 604                 break;
 605         }
 606         return 0;
 607 }
 608
 609 /*
 610  * Get a ceph client with specific addr and configuration, if one does
 611  * not exist create it.
 612  */
 613 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 614 {
 615         struct rbd_client *rbdc;
 616
 617         rbdc = rbd_client_find(ceph_opts);
 618         if (rbdc)       /* using an existing client */
 619                 ceph_destroy_options(ceph_opts);
 620         else
 621                 rbdc = rbd_client_create(ceph_opts);
 622
 623         return rbdc;
 624 }
 625
 626 /*
 627  * Destroy ceph client
 628  *
 629  * Caller must hold rbd_client_list_lock.
 630  */
 631 static void rbd_client_release(struct kref *kref)
 632 {
 633         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 634
 635         dout("%s: rbdc %p\n", __func__, rbdc);
 636         spin_lock(&rbd_client_list_lock);
 637         list_del(&rbdc->node);
 638         spin_unlock(&rbd_client_list_lock);
 639
 640         ceph_destroy_client(rbdc->client);
 641         kfree(rbdc);
 642 }
 643
 644 /*
 645  * Drop reference to ceph client node. If it's not referenced anymore, release
 646  * it.
 647  */
 648 static void rbd_put_client(struct rbd_client *rbdc)
 649 {
 650         if (rbdc)
 651                 kref_put(&rbdc->kref, rbd_client_release);
 652 }
 653
 654 static bool rbd_image_format_valid(u32 image_format)
 655 {
 656         return image_format == 1 || image_format == 2;
 657 }
 658
 659 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 660 {
 661         size_t size;
 662         u32 snap_count;
 663
 664         /* The header has to start with the magic rbd header text */
 665         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 666                 return false;
 667
 668         /* The bio layer requires at least sector-sized I/O */
 669
 670         if (ondisk->options.order < SECTOR_SHIFT)
 671                 return false;
 672
 673         /* If we use u64 in a few spots we may be able to loosen this */
 674
 675         if (ondisk->options.order > 8 * sizeof (int) - 1)
 676                 return false;
 677
 678         /*
 679          * The size of a snapshot header has to fit in a size_t, and
 680          * that limits the number of snapshots.
 681          */
 682         snap_count = le32_to_cpu(ondisk->snap_count);
 683         size = SIZE_MAX - sizeof (struct ceph_snap_context);
 684         if (snap_count > size / sizeof (__le64))
 685                 return false;
 686
 687         /*
 688          * Not only that, but the size of the entire the snapshot
 689          * header must also be representable in a size_t.
 690          */
 691         size -= snap_count * sizeof (__le64);
 692         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
 693                 return false;
 694
 695         return true;
 696 }
 697
 698 /*
 699  * Create a new header structure, translate header format from the on-disk
 700  * header.
 701  */
 702 static int rbd_header_from_disk(struct rbd_image_header *header,
 703                                  struct rbd_image_header_ondisk *ondisk)
 704 {
 705         u32 snap_count;
 706         size_t len;
 707         size_t size;
 708         u32 i;
 709
 710         memset(header, 0, sizeof (*header));
 711
 712         snap_count = le32_to_cpu(ondisk->snap_count);
 713
 714         len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
 715         header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
 716         if (!header->object_prefix)
 717                 return -ENOMEM;
 718         memcpy(header->object_prefix, ondisk->object_prefix, len);
 719         header->object_prefix[len] = '\0';
 720
 721         if (snap_count) {
 722                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 723
 724                 /* Save a copy of the snapshot names */
 725
 726                 if (snap_names_len > (u64) SIZE_MAX)
 727                         return -EIO;
 728                 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
 729                 if (!header->snap_names)
 730                         goto out_err;
 731                 /*
 732                  * Note that rbd_dev_v1_header_read() guarantees
 733                  * the ondisk buffer we're working with has
 734                  * snap_names_len bytes beyond the end of the
 735                  * snapshot id array, this memcpy() is safe.
 736                  */
 737                 memcpy(header->snap_names, &ondisk->snaps[snap_count],
 738                         snap_names_len);
 739
 740                 /* Record each snapshot's size */
 741
 742                 size = snap_count * sizeof (*header->snap_sizes);
 743                 header->snap_sizes = kmalloc(size, GFP_KERNEL);
 744                 if (!header->snap_sizes)
 745                         goto out_err;
 746                 for (i = 0; i < snap_count; i++)
 747                         header->snap_sizes[i] =
 748                                 le64_to_cpu(ondisk->snaps[i].image_size);
 749         } else {
 750                 WARN_ON(ondisk->snap_names_len);
 751                 header->snap_names = NULL;
 752                 header->snap_sizes = NULL;
 753         }
 754
 755         header->features = 0;   /* No features support in v1 images */
 756         header->obj_order = ondisk->options.order;
 757         header->crypt_type = ondisk->options.crypt_type;
 758         header->comp_type = ondisk->options.comp_type;
 759
 760         /* Allocate and fill in the snapshot context */
 761
 762         header->image_size = le64_to_cpu(ondisk->image_size);
 763         size = sizeof (struct ceph_snap_context);
 764         size += snap_count * sizeof (header->snapc->snaps[0]);
 765         header->snapc = kzalloc(size, GFP_KERNEL);
 766         if (!header->snapc)
 767                 goto out_err;
 768
 769         atomic_set(&header->snapc->nref, 1);
 770         header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 771         header->snapc->num_snaps = snap_count;
 772         for (i = 0; i < snap_count; i++)
 773                 header->snapc->snaps[i] =
 774                         le64_to_cpu(ondisk->snaps[i].id);
 775
 776         return 0;
 777
 778 out_err:
 779         kfree(header->snap_sizes);
 780         header->snap_sizes = NULL;
 781         kfree(header->snap_names);
 782         header->snap_names = NULL;
 783         kfree(header->object_prefix);
 784         header->object_prefix = NULL;
 785
 786         return -ENOMEM;
 787 }
 788
 789 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 790 {
 791         struct rbd_snap *snap;
 792
 793         if (snap_id == CEPH_NOSNAP)
 794                 return RBD_SNAP_HEAD_NAME;
 795
 796         list_for_each_entry(snap, &rbd_dev->snaps, node)
 797                 if (snap_id == snap->id)
 798                         return snap->name;
 799
 800         return NULL;
 801 }
 802
 803 static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 804 {
 805
 806         struct rbd_snap *snap;
 807
 808         list_for_each_entry(snap, &rbd_dev->snaps, node) {
 809                 if (!strcmp(snap_name, snap->name)) {
 810                         rbd_dev->spec->snap_id = snap->id;
 811                         rbd_dev->mapping.size = snap->size;
 812                         rbd_dev->mapping.features = snap->features;
 813
 814                         return 0;
 815                 }
 816         }
 817
 818         return -ENOENT;
 819 }
 820
 821 static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 822 {
 823         int ret;
 824
 825         if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
 826                     sizeof (RBD_SNAP_HEAD_NAME))) {
 827                 rbd_dev->spec->snap_id = CEPH_NOSNAP;
 828                 rbd_dev->mapping.size = rbd_dev->header.image_size;
 829                 rbd_dev->mapping.features = rbd_dev->header.features;
 830                 ret = 0;
 831         } else {
 832                 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
 833                 if (ret < 0)
 834                         goto done;
 835                 rbd_dev->mapping.read_only = true;
 836         }
 837         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 838
 839 done:
 840         return ret;
 841 }
 842
 843 static void rbd_header_free(struct rbd_image_header *header)
 844 {
 845         kfree(header->object_prefix);
 846         header->object_prefix = NULL;
 847         kfree(header->snap_sizes);
 848         header->snap_sizes = NULL;
 849         kfree(header->snap_names);
 850         header->snap_names = NULL;
 851         ceph_put_snap_context(header->snapc);
 852         header->snapc = NULL;
 853 }
 854
 855 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 856 {
 857         char *name;
 858         u64 segment;
 859         int ret;
 860
 861         name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
 862         if (!name)
 863                 return NULL;
 864         segment = offset >> rbd_dev->header.obj_order;
 865         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
 866                         rbd_dev->header.object_prefix, segment);
 867         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
 868                 pr_err("error formatting segment name for #%llu (%d)\n",
 869                         segment, ret);
 870                 kfree(name);
 871                 name = NULL;
 872         }
 873
 874         return name;
 875 }
 876
 877 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 878 {
 879         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 880
 881         return offset & (segment_size - 1);
 882 }
 883
 884 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
 885                                 u64 offset, u64 length)
 886 {
 887         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 888
 889         offset &= segment_size - 1;
 890
 891         rbd_assert(length <= U64_MAX - offset);
 892         if (offset + length > segment_size)
 893                 length = segment_size - offset;
 894
 895         return length;
 896 }
 897
 898 /*
 899  * returns the size of an object in the image
 900  */
 901 static u64 rbd_obj_bytes(struct rbd_image_header *header)
 902 {
 903         return 1 << header->obj_order;
 904 }
 905
 906 /*
 907  * bio helpers
 908  */
 909
 910 static void bio_chain_put(struct bio *chain)
 911 {
 912         struct bio *tmp;
 913
 914         while (chain) {
 915                 tmp = chain;
 916                 chain = chain->bi_next;
 917                 bio_put(tmp);
 918         }
 919 }
 920
 921 /*
 922  * zeros a bio chain, starting at specific offset
 923  */
 924 static void zero_bio_chain(struct bio *chain, int start_ofs)
 925 {
 926         struct bio_vec *bv;
 927         unsigned long flags;
 928         void *buf;
 929         int i;
 930         int pos = 0;
 931
 932         while (chain) {
 933                 bio_for_each_segment(bv, chain, i) {
 934                         if (pos + bv->bv_len > start_ofs) {
 935                                 int remainder = max(start_ofs - pos, 0);
 936                                 buf = bvec_kmap_irq(bv, &flags);
 937                                 memset(buf + remainder, 0,
 938                                        bv->bv_len - remainder);
 939                                 bvec_kunmap_irq(buf, &flags);
 940                         }
 941                         pos += bv->bv_len;
 942                 }
 943
 944                 chain = chain->bi_next;
 945         }
 946 }
 947
 948 /*
 949  * Clone a portion of a bio, starting at the given byte offset
 950  * and continuing for the number of bytes indicated.
 951  */
 952 static struct bio *bio_clone_range(struct bio *bio_src,
 953                                         unsigned int offset,
 954                                         unsigned int len,
 955                                         gfp_t gfpmask)
 956 {
 957         struct bio_vec *bv;
 958         unsigned int resid;
 959         unsigned short idx;
 960         unsigned int voff;
 961         unsigned short end_idx;
 962         unsigned short vcnt;
 963         struct bio *bio;
 964
 965         /* Handle the easy case for the caller */
 966
 967         if (!offset && len == bio_src->bi_size)
 968                 return bio_clone(bio_src, gfpmask);
 969
 970         if (WARN_ON_ONCE(!len))
 971                 return NULL;
 972         if (WARN_ON_ONCE(len > bio_src->bi_size))
 973                 return NULL;
 974         if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
 975                 return NULL;
 976
 977         /* Find first affected segment... */
 978
 979         resid = offset;
 980         __bio_for_each_segment(bv, bio_src, idx, 0) {
 981                 if (resid < bv->bv_len)
 982                         break;
 983                 resid -= bv->bv_len;
 984         }
 985         voff = resid;
 986
 987         /* ...and the last affected segment */
 988
 989         resid += len;
 990         __bio_for_each_segment(bv, bio_src, end_idx, idx) {
 991                 if (resid <= bv->bv_len)
 992                         break;
 993                 resid -= bv->bv_len;
 994         }
 995         vcnt = end_idx - idx + 1;
 996
 997         /* Build the clone */
 998
 999         bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1000         if (!bio)
1001                 return NULL;    /* ENOMEM */
1002
1003         bio->bi_bdev = bio_src->bi_bdev;
1004         bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1005         bio->bi_rw = bio_src->bi_rw;
1006         bio->bi_flags |= 1 << BIO_CLONED;
1007
1008         /*
1009          * Copy over our part of the bio_vec, then update the first
1010          * and last (or only) entries.
1011          */
1012         memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1013                         vcnt * sizeof (struct bio_vec));
1014         bio->bi_io_vec[0].bv_offset += voff;
1015         if (vcnt > 1) {
1016                 bio->bi_io_vec[0].bv_len -= voff;
1017                 bio->bi_io_vec[vcnt - 1].bv_len = resid;
1018         } else {
1019                 bio->bi_io_vec[0].bv_len = len;
1020         }
1021
1022         bio->bi_vcnt = vcnt;
1023         bio->bi_size = len;
1024         bio->bi_idx = 0;
1025
1026         return bio;
1027 }
1028
1029 /*
1030  * Clone a portion of a bio chain, starting at the given byte offset
1031  * into the first bio in the source chain and continuing for the
1032  * number of bytes indicated.  The result is another bio chain of
1033  * exactly the given length, or a null pointer on error.
1034  *
1035  * The bio_src and offset parameters are both in-out.  On entry they
1036  * refer to the first source bio and the offset into that bio where
1037  * the start of data to be cloned is located.
1038  *
1039  * On return, bio_src is updated to refer to the bio in the source
1040  * chain that contains first un-cloned byte, and *offset will
1041  * contain the offset of that byte within that bio.
1042  */
1043 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1044                                         unsigned int *offset,
1045                                         unsigned int len,
1046                                         gfp_t gfpmask)
1047 {
1048         struct bio *bi = *bio_src;
1049         unsigned int off = *offset;
1050         struct bio *chain = NULL;
1051         struct bio **end;
1052
1053         /* Build up a chain of clone bios up to the limit */
1054
1055         if (!bi || off >= bi->bi_size || !len)
1056                 return NULL;            /* Nothing to clone */
1057
1058         end = &chain;
1059         while (len) {
1060                 unsigned int bi_size;
1061                 struct bio *bio;
1062
1063                 if (!bi) {
1064                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1065                         goto out_err;   /* EINVAL; ran out of bio's */
1066                 }
1067                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1068                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1069                 if (!bio)
1070                         goto out_err;   /* ENOMEM */
1071
1072                 *end = bio;
1073                 end = &bio->bi_next;
1074
1075                 off += bi_size;
1076                 if (off == bi->bi_size) {
1077                         bi = bi->bi_next;
1078                         off = 0;
1079                 }
1080                 len -= bi_size;
1081         }
1082         *bio_src = bi;
1083         *offset = off;
1084
1085         return chain;
1086 out_err:
1087         bio_chain_put(chain);
1088
1089         return NULL;
1090 }
1091
1092 /*
1093  * The default/initial value for all object request flags is 0.  For
1094  * each flag, once its value is set to 1 it is never reset to 0
1095  * again.
1096  */
1097 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1098 {
1099         if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1100                 struct rbd_img_request *img_request = obj_request->img_request;
1101                 struct rbd_device *rbd_dev;
1102
1103                 rbd_dev = img_request ? img_request->rbd_dev : NULL;
1104                 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1105                         obj_request);
1106         }
1107 }
1108
1109 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1110 {
1111         smp_mb();
1112         return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1113 }
1114
1115 static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1116 {
1117         if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
1118                 struct rbd_img_request *img_request = obj_request->img_request;
1119                 struct rbd_device *rbd_dev;
1120
1121                 rbd_dev = img_request ? img_request->rbd_dev : NULL;
1122                 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1123                         obj_request);
1124         }
1125 }
1126
1127 static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1128 {
1129         smp_mb();
1130         return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1131 }
1132
1133 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1134 {
1135         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1136                 atomic_read(&obj_request->kref.refcount));
1137         kref_get(&obj_request->kref);
1138 }
1139
1140 static void rbd_obj_request_destroy(struct kref *kref);
1141 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1142 {
1143         rbd_assert(obj_request != NULL);
1144         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1145                 atomic_read(&obj_request->kref.refcount));
1146         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1147 }
1148
1149 static void rbd_img_request_get(struct rbd_img_request *img_request)
1150 {
1151         dout("%s: img %p (was %d)\n", __func__, img_request,
1152                 atomic_read(&img_request->kref.refcount));
1153         kref_get(&img_request->kref);
1154 }
1155
1156 static void rbd_img_request_destroy(struct kref *kref);
1157 static void rbd_img_request_put(struct rbd_img_request *img_request)
1158 {
1159         rbd_assert(img_request != NULL);
1160         dout("%s: img %p (was %d)\n", __func__, img_request,
1161                 atomic_read(&img_request->kref.refcount));
1162         kref_put(&img_request->kref, rbd_img_request_destroy);
1163 }
1164
1165 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1166                                         struct rbd_obj_request *obj_request)
1167 {
1168         rbd_assert(obj_request->img_request == NULL);
1169
1170         rbd_obj_request_get(obj_request);
1171         obj_request->img_request = img_request;
1172         obj_request->which = img_request->obj_request_count;
1173         rbd_assert(!obj_request_img_data_test(obj_request));
1174         obj_request_img_data_set(obj_request);
1175         rbd_assert(obj_request->which != BAD_WHICH);
1176         img_request->obj_request_count++;
1177         list_add_tail(&obj_request->links, &img_request->obj_requests);
1178         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1179                 obj_request->which);
1180 }
1181
1182 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1183                                         struct rbd_obj_request *obj_request)
1184 {
1185         rbd_assert(obj_request->which != BAD_WHICH);
1186
1187         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1188                 obj_request->which);
1189         list_del(&obj_request->links);
1190         rbd_assert(img_request->obj_request_count > 0);
1191         img_request->obj_request_count--;
1192         rbd_assert(obj_request->which == img_request->obj_request_count);
1193         obj_request->which = BAD_WHICH;
1194         rbd_assert(obj_request_img_data_test(obj_request));
1195         rbd_assert(obj_request->img_request == img_request);
1196         obj_request->img_request = NULL;
1197         obj_request->callback = NULL;
1198         rbd_obj_request_put(obj_request);
1199 }
1200
1201 static bool obj_request_type_valid(enum obj_request_type type)
1202 {
1203         switch (type) {
1204         case OBJ_REQUEST_NODATA:
1205         case OBJ_REQUEST_BIO:
1206         case OBJ_REQUEST_PAGES:
1207                 return true;
1208         default:
1209                 return false;
1210         }
1211 }
1212
1213 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1214                                 struct rbd_obj_request *obj_request)
1215 {
1216         dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1217
1218         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1219 }
1220
1221 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1222 {
1223
1224         dout("%s: img %p\n", __func__, img_request);
1225
1226         /*
1227          * If no error occurred, compute the aggregate transfer
1228          * count for the image request.  We could instead use
1229          * atomic64_cmpxchg() to update it as each object request
1230          * completes; not clear which way is better off hand.
1231          */
1232         if (!img_request->result) {
1233                 struct rbd_obj_request *obj_request;
1234                 u64 xferred = 0;
1235
1236                 for_each_obj_request(img_request, obj_request)
1237                         xferred += obj_request->xferred;
1238                 img_request->xferred = xferred;
1239         }
1240
1241         if (img_request->callback)
1242                 img_request->callback(img_request);
1243         else
1244                 rbd_img_request_put(img_request);
1245 }
1246
1247 /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1248
1249 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1250 {
1251         dout("%s: obj %p\n", __func__, obj_request);
1252
1253         return wait_for_completion_interruptible(&obj_request->completion);
1254 }
1255
1256 /*
1257  * The default/initial value for all image request flags is 0.  Each
1258  * is conditionally set to 1 at image request initialization time
1259  * and currently never change thereafter.
1260  */
1261 static void img_request_write_set(struct rbd_img_request *img_request)
1262 {
1263         set_bit(IMG_REQ_WRITE, &img_request->flags);
1264         smp_mb();
1265 }
1266
1267 static bool img_request_write_test(struct rbd_img_request *img_request)
1268 {
1269         smp_mb();
1270         return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1271 }
1272
1273 static void img_request_child_set(struct rbd_img_request *img_request)
1274 {
1275         set_bit(IMG_REQ_CHILD, &img_request->flags);
1276         smp_mb();
1277 }
1278
1279 static bool img_request_child_test(struct rbd_img_request *img_request)
1280 {
1281         smp_mb();
1282         return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1283 }
1284
1285 static void img_request_layered_set(struct rbd_img_request *img_request)
1286 {
1287         set_bit(IMG_REQ_LAYERED, &img_request->flags);
1288         smp_mb();
1289 }
1290
1291 static bool img_request_layered_test(struct rbd_img_request *img_request)
1292 {
1293         smp_mb();
1294         return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1295 }
1296
1297 static void
1298 rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1299 {
1300         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1301                 obj_request, obj_request->img_request, obj_request->result,
1302                 obj_request->xferred, obj_request->length);
1303         /*
1304          * ENOENT means a hole in the image.  We zero-fill the
1305          * entire length of the request.  A short read also implies
1306          * zero-fill to the end of the request.  Either way we
1307          * update the xferred count to indicate the whole request
1308          * was satisfied.
1309          */
1310         BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
1311         if (obj_request->result == -ENOENT) {
1312                 zero_bio_chain(obj_request->bio_list, 0);
1313                 obj_request->result = 0;
1314                 obj_request->xferred = obj_request->length;
1315         } else if (obj_request->xferred < obj_request->length &&
1316                         !obj_request->result) {
1317                 zero_bio_chain(obj_request->bio_list, obj_request->xferred);
1318                 obj_request->xferred = obj_request->length;
1319         }
1320         obj_request_done_set(obj_request);
1321 }
1322
1323 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1324 {
1325         dout("%s: obj %p cb %p\n", __func__, obj_request,
1326                 obj_request->callback);
1327         if (obj_request->callback)
1328                 obj_request->callback(obj_request);
1329         else
1330                 complete_all(&obj_request->completion);
1331 }
1332
/*
 * Completion handling for osd ops that need no post-processing:
 * the object request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1338
1339 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1340 {
1341         struct rbd_img_request *img_request = obj_request->img_request;
1342         bool layered = img_request && img_request_layered_test(img_request);
1343
1344         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1345                 obj_request, img_request, obj_request->result,
1346                 obj_request->xferred, obj_request->length);
1347         if (layered && obj_request->result == -ENOENT)
1348                 rbd_img_parent_read(obj_request);
1349         else if (img_request)
1350                 rbd_img_obj_request_read_callback(obj_request);
1351         else
1352                 obj_request_done_set(obj_request);
1353 }
1354
1355 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1356 {
1357         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1358                 obj_request->result, obj_request->length);
1359         /*
1360          * There is no such thing as a successful short write.  Set
1361          * it to our originally-requested length.
1362          */
1363         obj_request->xferred = obj_request->length;
1364         obj_request_done_set(obj_request);
1365 }
1366
/*
 * Completion handling for an osd stat.  A plain stat needs nothing
 * beyond marking the request done.  More will be done here when a
 * stat is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1376
1377 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1378                                 struct ceph_msg *msg)
1379 {
1380         struct rbd_obj_request *obj_request = osd_req->r_priv;
1381         u16 opcode;
1382
1383         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1384         rbd_assert(osd_req == obj_request->osd_req);
1385         rbd_assert(obj_request_img_data_test(obj_request) ^
1386                                 !obj_request->img_request);
1387         rbd_assert(obj_request_img_data_test(obj_request) ^
1388                                 (obj_request->which == BAD_WHICH));
1389
1390         if (osd_req->r_result < 0)
1391                 obj_request->result = osd_req->r_result;
1392         obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1393
1394         WARN_ON(osd_req->r_num_ops != 1);       /* For now */
1395
1396         /*
1397          * We support a 64-bit length, but ultimately it has to be
1398          * passed to blk_end_request(), which takes an unsigned int.
1399          */
1400         obj_request->xferred = osd_req->r_reply_op_len[0];
1401         rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1402         opcode = osd_req->r_ops[0].op;
1403         switch (opcode) {
1404         case CEPH_OSD_OP_READ:
1405                 rbd_osd_read_callback(obj_request);
1406                 break;
1407         case CEPH_OSD_OP_WRITE:
1408                 rbd_osd_write_callback(obj_request);
1409                 break;
1410         case CEPH_OSD_OP_STAT:
1411                 rbd_osd_stat_callback(obj_request);
1412                 break;
1413         case CEPH_OSD_OP_CALL:
1414         case CEPH_OSD_OP_NOTIFY_ACK:
1415         case CEPH_OSD_OP_WATCH:
1416                 rbd_osd_trivial_callback(obj_request);
1417                 break;
1418         default:
1419                 rbd_warn(NULL, "%s: unsupported op %hu\n",
1420                         obj_request->object_name, (unsigned short) opcode);
1421                 break;
1422         }
1423
1424         if (obj_request_done_test(obj_request))
1425                 rbd_obj_request_complete(obj_request);
1426 }
1427
1428 static void rbd_osd_req_format(struct rbd_obj_request *obj_request,
1429                                         bool write_request)
1430 {
1431         struct rbd_img_request *img_request = obj_request->img_request;
1432         struct ceph_osd_request *osd_req = obj_request->osd_req;
1433         struct ceph_snap_context *snapc = NULL;
1434         u64 snap_id = CEPH_NOSNAP;
1435         struct timespec *mtime = NULL;
1436         struct timespec now;
1437
1438         rbd_assert(osd_req != NULL);
1439
1440         if (write_request) {
1441                 now = CURRENT_TIME;
1442                 mtime = &now;
1443                 if (img_request)
1444                         snapc = img_request->snapc;
1445         } else if (img_request) {
1446                 snap_id = img_request->snap_id;
1447         }
1448         ceph_osdc_build_request(osd_req, obj_request->offset,
1449                         snapc, snap_id, mtime);
1450 }
1451
1452 static struct ceph_osd_request *rbd_osd_req_create(
1453                                         struct rbd_device *rbd_dev,
1454                                         bool write_request,
1455                                         struct rbd_obj_request *obj_request)
1456 {
1457         struct ceph_snap_context *snapc = NULL;
1458         struct ceph_osd_client *osdc;
1459         struct ceph_osd_request *osd_req;
1460
1461         if (obj_request_img_data_test(obj_request)) {
1462                 struct rbd_img_request *img_request = obj_request->img_request;
1463
1464                 rbd_assert(write_request ==
1465                                 img_request_write_test(img_request));
1466                 if (write_request)
1467                         snapc = img_request->snapc;
1468         }
1469
1470         /* Allocate and initialize the request, for the single op */
1471
1472         osdc = &rbd_dev->rbd_client->client->osdc;
1473         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1474         if (!osd_req)
1475                 return NULL;    /* ENOMEM */
1476
1477         if (write_request)
1478                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1479         else
1480                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1481
1482         osd_req->r_callback = rbd_osd_req_callback;
1483         osd_req->r_priv = obj_request;
1484
1485         osd_req->r_oid_len = strlen(obj_request->object_name);
1486         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1487         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1488
1489         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1490
1491         return osd_req;
1492 }
1493
/*
 * Counterpart of rbd_osd_req_create(): drop a reference on the osd
 * request via ceph_osdc_put_request().
 */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1498
1499 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1500
1501 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1502                                                 u64 offset, u64 length,
1503                                                 enum obj_request_type type)
1504 {
1505         struct rbd_obj_request *obj_request;
1506         size_t size;
1507         char *name;
1508
1509         rbd_assert(obj_request_type_valid(type));
1510
1511         size = strlen(object_name) + 1;
1512         obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1513         if (!obj_request)
1514                 return NULL;
1515
1516         name = (char *)(obj_request + 1);
1517         obj_request->object_name = memcpy(name, object_name, size);
1518         obj_request->offset = offset;
1519         obj_request->length = length;
1520         obj_request->flags = 0;
1521         obj_request->which = BAD_WHICH;
1522         obj_request->type = type;
1523         INIT_LIST_HEAD(&obj_request->links);
1524         init_completion(&obj_request->completion);
1525         kref_init(&obj_request->kref);
1526
1527         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1528                 offset, length, (int)type, obj_request);
1529
1530         return obj_request;
1531 }
1532
1533 static void rbd_obj_request_destroy(struct kref *kref)
1534 {
1535         struct rbd_obj_request *obj_request;
1536
1537         obj_request = container_of(kref, struct rbd_obj_request, kref);
1538
1539         dout("%s: obj %p\n", __func__, obj_request);
1540
1541         rbd_assert(obj_request->img_request == NULL);
1542         rbd_assert(obj_request->which == BAD_WHICH);
1543
1544         if (obj_request->osd_req)
1545                 rbd_osd_req_destroy(obj_request->osd_req);
1546
1547         rbd_assert(obj_request_type_valid(obj_request->type));
1548         switch (obj_request->type) {
1549         case OBJ_REQUEST_NODATA:
1550                 break;          /* Nothing to do */
1551         case OBJ_REQUEST_BIO:
1552                 if (obj_request->bio_list)
1553                         bio_chain_put(obj_request->bio_list);
1554                 break;
1555         case OBJ_REQUEST_PAGES:
1556                 if (obj_request->pages)
1557                         ceph_release_page_vector(obj_request->pages,
1558                                                 obj_request->page_count);
1559                 break;
1560         }
1561
1562         kfree(obj_request);
1563 }
1564
1565 /*
1566  * Caller is responsible for filling in the list of object requests
1567  * that comprises the image request, and the Linux request pointer
1568  * (if there is one).
1569  */
1570 static struct rbd_img_request *rbd_img_request_create(
1571                                         struct rbd_device *rbd_dev,
1572                                         u64 offset, u64 length,
1573                                         bool write_request,
1574                                         bool child_request)
1575 {
1576         struct rbd_img_request *img_request;
1577         struct ceph_snap_context *snapc = NULL;
1578
1579         img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1580         if (!img_request)
1581                 return NULL;
1582
1583         if (write_request) {
1584                 down_read(&rbd_dev->header_rwsem);
1585                 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1586                 up_read(&rbd_dev->header_rwsem);
1587                 if (WARN_ON(!snapc)) {
1588                         kfree(img_request);
1589                         return NULL;    /* Shouldn't happen */
1590                 }
1591
1592         }
1593
1594         img_request->rq = NULL;
1595         img_request->rbd_dev = rbd_dev;
1596         img_request->offset = offset;
1597         img_request->length = length;
1598         img_request->flags = 0;
1599         if (write_request) {
1600                 img_request_write_set(img_request);
1601                 img_request->snapc = snapc;
1602         } else {
1603                 img_request->snap_id = rbd_dev->spec->snap_id;
1604         }
1605         if (child_request)
1606                 img_request_child_set(img_request);
1607         if (rbd_dev->parent_spec)
1608                 img_request_layered_set(img_request);
1609         spin_lock_init(&img_request->completion_lock);
1610         img_request->next_completion = 0;
1611         img_request->callback = NULL;
1612         img_request->result = 0;
1613         img_request->obj_request_count = 0;
1614         INIT_LIST_HEAD(&img_request->obj_requests);
1615         kref_init(&img_request->kref);
1616
1617         rbd_img_request_get(img_request);       /* Avoid a warning */
1618         rbd_img_request_put(img_request);       /* TEMPORARY */
1619
1620         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1621                 write_request ? "write" : "read", offset, length,
1622                 img_request);
1623
1624         return img_request;
1625 }
1626
1627 static void rbd_img_request_destroy(struct kref *kref)
1628 {
1629         struct rbd_img_request *img_request;
1630         struct rbd_obj_request *obj_request;
1631         struct rbd_obj_request *next_obj_request;
1632
1633         img_request = container_of(kref, struct rbd_img_request, kref);
1634
1635         dout("%s: img %p\n", __func__, img_request);
1636
1637         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1638                 rbd_img_obj_request_del(img_request, obj_request);
1639         rbd_assert(img_request->obj_request_count == 0);
1640
1641         if (img_request_write_test(img_request))
1642                 ceph_put_snap_context(img_request->snapc);
1643
1644         if (img_request_child_test(img_request))
1645                 rbd_obj_request_put(img_request->obj_request);
1646
1647         kfree(img_request);
1648 }
1649
1650 static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1651 {
1652         struct rbd_img_request *img_request;
1653         unsigned int xferred;
1654         int result;
1655         bool more;
1656
1657         rbd_assert(obj_request_img_data_test(obj_request));
1658         img_request = obj_request->img_request;
1659
1660         rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1661         xferred = (unsigned int)obj_request->xferred;
1662         result = obj_request->result;
1663         if (result) {
1664                 struct rbd_device *rbd_dev = img_request->rbd_dev;
1665
1666                 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1667                         img_request_write_test(img_request) ? "write" : "read",
1668                         obj_request->length, obj_request->img_offset,
1669                         obj_request->offset);
1670                 rbd_warn(rbd_dev, "  result %d xferred %x\n",
1671                         result, xferred);
1672                 if (!img_request->result)
1673                         img_request->result = result;
1674         }
1675
1676         if (img_request_child_test(img_request)) {
1677                 rbd_assert(img_request->obj_request != NULL);
1678                 more = obj_request->which < img_request->obj_request_count - 1;
1679         } else {
1680                 rbd_assert(img_request->rq != NULL);
1681                 more = blk_end_request(img_request->rq, result, xferred);
1682         }
1683
1684         return more;
1685 }
1686
1687 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1688 {
1689         struct rbd_img_request *img_request;
1690         u32 which = obj_request->which;
1691         bool more = true;
1692
1693         rbd_assert(obj_request_img_data_test(obj_request));
1694         img_request = obj_request->img_request;
1695
1696         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1697         rbd_assert(img_request != NULL);
1698         rbd_assert(img_request->obj_request_count > 0);
1699         rbd_assert(which != BAD_WHICH);
1700         rbd_assert(which < img_request->obj_request_count);
1701         rbd_assert(which >= img_request->next_completion);
1702
1703         spin_lock_irq(&img_request->completion_lock);
1704         if (which != img_request->next_completion)
1705                 goto out;
1706
1707         for_each_obj_request_from(img_request, obj_request) {
1708                 rbd_assert(more);
1709                 rbd_assert(which < img_request->obj_request_count);
1710
1711                 if (!obj_request_done_test(obj_request))
1712                         break;
1713                 more = rbd_img_obj_end_request(obj_request);
1714                 which++;
1715         }
1716
1717         rbd_assert(more ^ (which == img_request->obj_request_count));
1718         img_request->next_completion = which;
1719 out:
1720         spin_unlock_irq(&img_request->completion_lock);
1721
1722         if (!more)
1723                 rbd_img_request_complete(img_request);
1724 }
1725
1726 static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1727                                         struct bio *bio_list)
1728 {
1729         struct rbd_device *rbd_dev = img_request->rbd_dev;
1730         struct rbd_obj_request *obj_request = NULL;
1731         struct rbd_obj_request *next_obj_request;
1732         bool write_request = img_request_write_test(img_request);
1733         unsigned int bio_offset;
1734         u64 img_offset;
1735         u64 resid;
1736         u16 opcode;
1737
1738         dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
1739
1740         opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
1741         bio_offset = 0;
1742         img_offset = img_request->offset;
1743         rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1744         resid = img_request->length;
1745         rbd_assert(resid > 0);
1746         while (resid) {
1747                 struct ceph_osd_request *osd_req;
1748                 const char *object_name;
1749                 unsigned int clone_size;
1750                 u64 offset;
1751                 u64 length;
1752
1753                 object_name = rbd_segment_name(rbd_dev, img_offset);
1754                 if (!object_name)
1755                         goto out_unwind;
1756                 offset = rbd_segment_offset(rbd_dev, img_offset);
1757                 length = rbd_segment_length(rbd_dev, img_offset, resid);
1758                 obj_request = rbd_obj_request_create(object_name,
1759                                                 offset, length,
1760                                                 OBJ_REQUEST_BIO);
1761                 kfree(object_name);     /* object request has its own copy */
1762                 if (!obj_request)
1763                         goto out_unwind;
1764
1765                 rbd_assert(length <= (u64) UINT_MAX);
1766                 clone_size = (unsigned int) length;
1767                 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1768                                                 &bio_offset, clone_size,
1769                                                 GFP_ATOMIC);
1770                 if (!obj_request->bio_list)
1771                         goto out_partial;
1772
1773                 osd_req = rbd_osd_req_create(rbd_dev, write_request,
1774                                                 obj_request);
1775                 if (!osd_req)
1776                         goto out_partial;
1777                 obj_request->osd_req = osd_req;
1778                 obj_request->callback = rbd_img_obj_callback;
1779
1780                 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
1781                                                 0, 0);
1782                 osd_req_op_extent_osd_data_bio(osd_req, 0,
1783                                 obj_request->bio_list, obj_request->length);
1784                 rbd_osd_req_format(obj_request, write_request);
1785
1786                 obj_request->img_offset = img_offset;
1787                 rbd_img_obj_request_add(img_request, obj_request);
1788
1789                 img_offset += length;
1790                 resid -= length;
1791         }
1792
1793         return 0;
1794
1795 out_partial:
1796         rbd_obj_request_put(obj_request);
1797 out_unwind:
1798         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1799                 rbd_obj_request_put(obj_request);
1800
1801         return -ENOMEM;
1802 }
1803
1804 static int rbd_img_request_submit(struct rbd_img_request *img_request)
1805 {
1806         struct rbd_device *rbd_dev = img_request->rbd_dev;
1807         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1808         struct rbd_obj_request *obj_request;
1809         struct rbd_obj_request *next_obj_request;
1810
1811         dout("%s: img %p\n", __func__, img_request);
1812         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
1813                 int ret;
1814
1815                 ret = rbd_obj_request_submit(osdc, obj_request);
1816                 if (ret)
1817                         return ret;
1818                 /*
1819                  * The image request has its own reference to each
1820                  * of its object requests, so we can safely drop the
1821                  * initial one here.
1822                  */
1823                 rbd_obj_request_put(obj_request);
1824         }
1825
1826         return 0;
1827 }
1828
1829 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
1830 {
1831         struct rbd_obj_request *obj_request;
1832
1833         rbd_assert(img_request_child_test(img_request));
1834
1835         obj_request = img_request->obj_request;
1836         rbd_assert(obj_request != NULL);
1837         obj_request->result = img_request->result;
1838         obj_request->xferred = img_request->xferred;
1839
1840         rbd_img_obj_request_read_callback(obj_request);
1841         rbd_obj_request_complete(obj_request);
1842 }
1843
1844 static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
1845 {
1846         struct rbd_device *rbd_dev;
1847         struct rbd_img_request *img_request;
1848         int result;
1849
1850         rbd_assert(obj_request_img_data_test(obj_request));
1851         rbd_assert(obj_request->img_request != NULL);
1852         rbd_assert(obj_request->result == (s32) -ENOENT);
1853         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
1854
1855         rbd_dev = obj_request->img_request->rbd_dev;
1856         rbd_assert(rbd_dev->parent != NULL);
1857         /* rbd_read_finish(obj_request, obj_request->length); */
1858         img_request = rbd_img_request_create(rbd_dev->parent,
1859                                                 obj_request->img_offset,
1860                                                 obj_request->length,
1861                                                 false, true);
1862         result = -ENOMEM;
1863         if (!img_request)
1864                 goto out_err;
1865
1866         rbd_obj_request_get(obj_request);
1867         img_request->obj_request = obj_request;
1868
1869         result = rbd_img_request_fill_bio(img_request, obj_request->bio_list);
1870         if (result)
1871                 goto out_err;
1872
1873         img_request->callback = rbd_img_parent_read_callback;
1874         result = rbd_img_request_submit(img_request);
1875         if (result)
1876                 goto out_err;
1877
1878         return;
1879 out_err:
1880         if (img_request)
1881                 rbd_img_request_put(img_request);
1882         obj_request->result = result;
1883         obj_request->xferred = 0;
1884         obj_request_done_set(obj_request);
1885 }
1886
1887 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1888                                    u64 ver, u64 notify_id)
1889 {
1890         struct rbd_obj_request *obj_request;
1891         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1892         int ret;
1893
1894         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1895                                                         OBJ_REQUEST_NODATA);
1896         if (!obj_request)
1897                 return -ENOMEM;
1898
1899         ret = -ENOMEM;
1900         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1901         if (!obj_request->osd_req)
1902                 goto out;
1903         obj_request->callback = rbd_obj_request_put;
1904
1905         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
1906                                         notify_id, ver, 0);
1907         rbd_osd_req_format(obj_request, false);
1908
1909         ret = rbd_obj_request_submit(osdc, obj_request);
1910 out:
1911         if (ret)
1912                 rbd_obj_request_put(obj_request);
1913
1914         return ret;
1915 }
1916
1917 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1918 {
1919         struct rbd_device *rbd_dev = (struct rbd_device *)data;
1920         u64 hver;
1921         int rc;
1922
1923         if (!rbd_dev)
1924                 return;
1925
1926         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1927                 rbd_dev->header_name, (unsigned long long) notify_id,
1928                 (unsigned int) opcode);
1929         rc = rbd_dev_refresh(rbd_dev, &hver);
1930         if (rc)
1931                 rbd_warn(rbd_dev, "got notification but failed to "
1932                            " update snaps: %d\n", rc);
1933
1934         rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1935 }
1936
1937 /*
1938  * Request sync osd watch/unwatch.  The value of "start" determines
1939  * whether a watch request is being initiated or torn down.
1940  */
1941 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1942 {
1943         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1944         struct rbd_obj_request *obj_request;
1945         int ret;
1946
1947         rbd_assert(start ^ !!rbd_dev->watch_event);
1948         rbd_assert(start ^ !!rbd_dev->watch_request);
1949
1950         if (start) {
1951                 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
1952                                                 &rbd_dev->watch_event);
1953                 if (ret < 0)
1954                         return ret;
1955                 rbd_assert(rbd_dev->watch_event != NULL);
1956         }
1957
1958         ret = -ENOMEM;
1959         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1960                                                         OBJ_REQUEST_NODATA);
1961         if (!obj_request)
1962                 goto out_cancel;
1963
1964         obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
1965         if (!obj_request->osd_req)
1966                 goto out_cancel;
1967
1968         if (start)
1969                 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
1970         else
1971                 ceph_osdc_unregister_linger_request(osdc,
1972                                         rbd_dev->watch_request->osd_req);
1973
1974         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
1975                                 rbd_dev->watch_event->cookie,
1976                                 rbd_dev->header.obj_version, start);
1977         rbd_osd_req_format(obj_request, true);
1978
1979         ret = rbd_obj_request_submit(osdc, obj_request);
1980         if (ret)
1981                 goto out_cancel;
1982         ret = rbd_obj_request_wait(obj_request);
1983         if (ret)
1984                 goto out_cancel;
1985         ret = obj_request->result;
1986         if (ret)
1987                 goto out_cancel;
1988
1989         /*
1990          * A watch request is set to linger, so the underlying osd
1991          * request won't go away until we unregister it.  We retain
1992          * a pointer to the object request during that time (in
1993          * rbd_dev->watch_request), so we'll keep a reference to
1994          * it.  We'll drop that reference (below) after we've
1995          * unregistered it.
1996          */
1997         if (start) {
1998                 rbd_dev->watch_request = obj_request;
1999
2000                 return 0;
2001         }
2002
2003         /* We have successfully torn down the watch request */
2004
2005         rbd_obj_request_put(rbd_dev->watch_request);
2006         rbd_dev->watch_request = NULL;
2007 out_cancel:
2008         /* Cancel the event if we're tearing down, or on error */
2009         ceph_osdc_cancel_event(rbd_dev->watch_event);
2010         rbd_dev->watch_event = NULL;
2011         if (obj_request)
2012                 rbd_obj_request_put(obj_request);
2013
2014         return ret;
2015 }
2016
2017 /*
2018  * Synchronous osd object method call
2019  */
2020 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2021                              const char *object_name,
2022                              const char *class_name,
2023                              const char *method_name,
2024                              const char *outbound,
2025                              size_t outbound_size,
2026                              char *inbound,
2027                              size_t inbound_size,
2028                              u64 *version)
2029 {
2030         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2031         struct rbd_obj_request *obj_request;
2032         struct page **pages;
2033         u32 page_count;
2034         int ret;
2035
2036         /*
2037          * Method calls are ultimately read operations.  The result
2038          * should placed into the inbound buffer provided.  They
2039          * also supply outbound data--parameters for the object
2040          * method.  Currently if this is present it will be a
2041          * snapshot id.
2042          */
2043         page_count = (u32) calc_pages_for(0, inbound_size);
2044         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2045         if (IS_ERR(pages))
2046                 return PTR_ERR(pages);
2047
2048         ret = -ENOMEM;
2049         obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
2050                                                         OBJ_REQUEST_PAGES);
2051         if (!obj_request)
2052                 goto out;
2053
2054         obj_request->pages = pages;
2055         obj_request->page_count = page_count;
2056
2057         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2058         if (!obj_request->osd_req)
2059                 goto out;
2060
2061         osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
2062                                         class_name, method_name);
2063         if (outbound_size) {
2064                 struct ceph_pagelist *pagelist;
2065
2066                 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2067                 if (!pagelist)
2068                         goto out;
2069
2070                 ceph_pagelist_init(pagelist);
2071                 ceph_pagelist_append(pagelist, outbound, outbound_size);
2072                 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2073                                                 pagelist);
2074         }
2075         osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2076                                         obj_request->pages, inbound_size,
2077                                         0, false, false);
2078         rbd_osd_req_format(obj_request, false);
2079
2080         ret = rbd_obj_request_submit(osdc, obj_request);
2081         if (ret)
2082                 goto out;
2083         ret = rbd_obj_request_wait(obj_request);
2084         if (ret)
2085                 goto out;
2086
2087         ret = obj_request->result;
2088         if (ret < 0)
2089                 goto out;
2090         ret = 0;
2091         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
2092         if (version)
2093                 *version = obj_request->version;
2094 out:
2095         if (obj_request)
2096                 rbd_obj_request_put(obj_request);
2097         else
2098                 ceph_release_page_vector(pages, page_count);
2099
2100         return ret;
2101 }
2102
2103 static void rbd_request_fn(struct request_queue *q)
2104                 __releases(q->queue_lock) __acquires(q->queue_lock)
2105 {
2106         struct rbd_device *rbd_dev = q->queuedata;
2107         bool read_only = rbd_dev->mapping.read_only;
2108         struct request *rq;
2109         int result;
2110
2111         while ((rq = blk_fetch_request(q))) {
2112                 bool write_request = rq_data_dir(rq) == WRITE;
2113                 struct rbd_img_request *img_request;
2114                 u64 offset;
2115                 u64 length;
2116
2117                 /* Ignore any non-FS requests that filter through. */
2118
2119                 if (rq->cmd_type != REQ_TYPE_FS) {
2120                         dout("%s: non-fs request type %d\n", __func__,
2121                                 (int) rq->cmd_type);
2122                         __blk_end_request_all(rq, 0);
2123                         continue;
2124                 }
2125
2126                 /* Ignore/skip any zero-length requests */
2127
2128                 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2129                 length = (u64) blk_rq_bytes(rq);
2130
2131                 if (!length) {
2132                         dout("%s: zero-length request\n", __func__);
2133                         __blk_end_request_all(rq, 0);
2134                         continue;
2135                 }
2136
2137                 spin_unlock_irq(q->queue_lock);
2138
2139                 /* Disallow writes to a read-only device */
2140
2141                 if (write_request) {
2142                         result = -EROFS;
2143                         if (read_only)
2144                                 goto end_request;
2145                         rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2146                 }
2147
2148                 /*
2149                  * Quit early if the mapped snapshot no longer
2150                  * exists.  It's still possible the snapshot will
2151                  * have disappeared by the time our request arrives
2152                  * at the osd, but there's no sense in sending it if
2153                  * we already know.
2154                  */
2155                 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2156                         dout("request for non-existent snapshot");
2157                         rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2158                         result = -ENXIO;
2159                         goto end_request;
2160                 }
2161
2162                 result = -EINVAL;
2163                 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2164                         goto end_request;       /* Shouldn't happen */
2165
2166                 result = -ENOMEM;
2167                 img_request = rbd_img_request_create(rbd_dev, offset, length,
2168                                                         write_request, false);
2169                 if (!img_request)
2170                         goto end_request;
2171
2172                 img_request->rq = rq;
2173
2174                 result = rbd_img_request_fill_bio(img_request, rq->bio);
2175                 if (!result)
2176                         result = rbd_img_request_submit(img_request);
2177                 if (result)
2178                         rbd_img_request_put(img_request);
2179 end_request:
2180                 spin_lock_irq(q->queue_lock);
2181                 if (result < 0) {
2182                         rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2183                                 write_request ? "write" : "read",
2184                                 length, offset, result);
2185
2186                         __blk_end_request_all(rq, result);
2187                 }
2188         }
2189 }
2190
2191 /*
2192  * a queue callback. Makes sure that we don't create a bio that spans across
2193  * multiple osd objects. One exception would be with a single page bios,
2194  * which we handle later at bio_chain_clone_range()
2195  */
2196 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2197                           struct bio_vec *bvec)
2198 {
2199         struct rbd_device *rbd_dev = q->queuedata;
2200         sector_t sector_offset;
2201         sector_t sectors_per_obj;
2202         sector_t obj_sector_offset;
2203         int ret;
2204
2205         /*
2206          * Find how far into its rbd object the partition-relative
2207          * bio start sector is to offset relative to the enclosing
2208          * device.
2209          */
2210         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2211         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2212         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2213
2214         /*
2215          * Compute the number of bytes from that offset to the end
2216          * of the object.  Account for what's already used by the bio.
2217          */
2218         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2219         if (ret > bmd->bi_size)
2220                 ret -= bmd->bi_size;
2221         else
2222                 ret = 0;
2223
2224         /*
2225          * Don't send back more than was asked for.  And if the bio
2226          * was empty, let the whole thing through because:  "Note
2227          * that a block device *must* allow a single page to be
2228          * added to an empty bio."
2229          */
2230         rbd_assert(bvec->bv_len <= PAGE_SIZE);
2231         if (ret > (int) bvec->bv_len || !bmd->bi_size)
2232                 ret = (int) bvec->bv_len;
2233
2234         return ret;
2235 }
2236
2237 static void rbd_free_disk(struct rbd_device *rbd_dev)
2238 {
2239         struct gendisk *disk = rbd_dev->disk;
2240
2241         if (!disk)
2242                 return;
2243
2244         if (disk->flags & GENHD_FL_UP)
2245                 del_gendisk(disk);
2246         if (disk->queue)
2247                 blk_cleanup_queue(disk->queue);
2248         put_disk(disk);
2249 }
2250
2251 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2252                                 const char *object_name,
2253                                 u64 offset, u64 length,
2254                                 char *buf, u64 *version)
2255
2256 {
2257         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2258         struct rbd_obj_request *obj_request;
2259         struct page **pages = NULL;
2260         u32 page_count;
2261         size_t size;
2262         int ret;
2263
2264         page_count = (u32) calc_pages_for(offset, length);
2265         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2266         if (IS_ERR(pages))
2267                 ret = PTR_ERR(pages);
2268
2269         ret = -ENOMEM;
2270         obj_request = rbd_obj_request_create(object_name, offset, length,
2271                                                         OBJ_REQUEST_PAGES);
2272         if (!obj_request)
2273                 goto out;
2274
2275         obj_request->pages = pages;
2276         obj_request->page_count = page_count;
2277
2278         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2279         if (!obj_request->osd_req)
2280                 goto out;
2281
2282         osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2283                                         offset, length, 0, 0);
2284         osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2285                                         obj_request->pages,
2286                                         obj_request->length,
2287                                         obj_request->offset & ~PAGE_MASK,
2288                                         false, false);
2289         rbd_osd_req_format(obj_request, false);
2290
2291         ret = rbd_obj_request_submit(osdc, obj_request);
2292         if (ret)
2293                 goto out;
2294         ret = rbd_obj_request_wait(obj_request);
2295         if (ret)
2296                 goto out;
2297
2298         ret = obj_request->result;
2299         if (ret < 0)
2300                 goto out;
2301
2302         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2303         size = (size_t) obj_request->xferred;
2304         ceph_copy_from_page_vector(pages, buf, 0, size);
2305         rbd_assert(size <= (size_t) INT_MAX);
2306         ret = (int) size;
2307         if (version)
2308                 *version = obj_request->version;
2309 out:
2310         if (obj_request)
2311                 rbd_obj_request_put(obj_request);
2312         else
2313                 ceph_release_page_vector(pages, page_count);
2314
2315         return ret;
2316 }
2317
2318 /*
2319  * Read the complete header for the given rbd device.
2320  *
2321  * Returns a pointer to a dynamically-allocated buffer containing
2322  * the complete and validated header.  Caller can pass the address
2323  * of a variable that will be filled in with the version of the
2324  * header object at the time it was read.
2325  *
2326  * Returns a pointer-coded errno if a failure occurs.
2327  */
2328 static struct rbd_image_header_ondisk *
2329 rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2330 {
2331         struct rbd_image_header_ondisk *ondisk = NULL;
2332         u32 snap_count = 0;
2333         u64 names_size = 0;
2334         u32 want_count;
2335         int ret;
2336
2337         /*
2338          * The complete header will include an array of its 64-bit
2339          * snapshot ids, followed by the names of those snapshots as
2340          * a contiguous block of NUL-terminated strings.  Note that
2341          * the number of snapshots could change by the time we read
2342          * it in, in which case we re-read it.
2343          */
2344         do {
2345                 size_t size;
2346
2347                 kfree(ondisk);
2348
2349                 size = sizeof (*ondisk);
2350                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2351                 size += names_size;
2352                 ondisk = kmalloc(size, GFP_KERNEL);
2353                 if (!ondisk)
2354                         return ERR_PTR(-ENOMEM);
2355
2356                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
2357                                        0, size,
2358                                        (char *) ondisk, version);
2359                 if (ret < 0)
2360                         goto out_err;
2361                 if (WARN_ON((size_t) ret < size)) {
2362                         ret = -ENXIO;
2363                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2364                                 size, ret);
2365                         goto out_err;
2366                 }
2367                 if (!rbd_dev_ondisk_valid(ondisk)) {
2368                         ret = -ENXIO;
2369                         rbd_warn(rbd_dev, "invalid header");
2370                         goto out_err;
2371                 }
2372
2373                 names_size = le64_to_cpu(ondisk->snap_names_len);
2374                 want_count = snap_count;
2375                 snap_count = le32_to_cpu(ondisk->snap_count);
2376         } while (snap_count != want_count);
2377
2378         return ondisk;
2379
2380 out_err:
2381         kfree(ondisk);
2382
2383         return ERR_PTR(ret);
2384 }
2385
2386 /*
2387  * reload the ondisk the header
2388  */
2389 static int rbd_read_header(struct rbd_device *rbd_dev,
2390                            struct rbd_image_header *header)
2391 {
2392         struct rbd_image_header_ondisk *ondisk;
2393         u64 ver = 0;
2394         int ret;
2395
2396         ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2397         if (IS_ERR(ondisk))
2398                 return PTR_ERR(ondisk);
2399         ret = rbd_header_from_disk(header, ondisk);
2400         if (ret >= 0)
2401                 header->obj_version = ver;
2402         kfree(ondisk);
2403
2404         return ret;
2405 }
2406
2407 static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2408 {
2409         struct rbd_snap *snap;
2410         struct rbd_snap *next;
2411
2412         list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
2413                 rbd_remove_snap_dev(snap);
2414 }
2415
2416 static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2417 {
2418         sector_t size;
2419
2420         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
2421                 return;
2422
2423         size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2424         dout("setting size to %llu sectors", (unsigned long long) size);
2425         rbd_dev->mapping.size = (u64) size;
2426         set_capacity(rbd_dev->disk, size);
2427 }
2428
2429 /*
2430  * only read the first part of the ondisk header, without the snaps info
2431  */
2432 static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2433 {
2434         int ret;
2435         struct rbd_image_header h;
2436
2437         ret = rbd_read_header(rbd_dev, &h);
2438         if (ret < 0)
2439                 return ret;
2440
2441         down_write(&rbd_dev->header_rwsem);
2442
2443         /* Update image size, and check for resize of mapped image */
2444         rbd_dev->header.image_size = h.image_size;
2445         rbd_update_mapping_size(rbd_dev);
2446
2447         /* rbd_dev->header.object_prefix shouldn't change */
2448         kfree(rbd_dev->header.snap_sizes);
2449         kfree(rbd_dev->header.snap_names);
2450         /* osd requests may still refer to snapc */
2451         ceph_put_snap_context(rbd_dev->header.snapc);
2452
2453         if (hver)
2454                 *hver = h.obj_version;
2455         rbd_dev->header.obj_version = h.obj_version;
2456         rbd_dev->header.image_size = h.image_size;
2457         rbd_dev->header.snapc = h.snapc;
2458         rbd_dev->header.snap_names = h.snap_names;
2459         rbd_dev->header.snap_sizes = h.snap_sizes;
2460         /* Free the extra copy of the object prefix */
2461         WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2462         kfree(h.object_prefix);
2463
2464         ret = rbd_dev_snaps_update(rbd_dev);
2465         if (!ret)
2466                 ret = rbd_dev_snaps_register(rbd_dev);
2467
2468         up_write(&rbd_dev->header_rwsem);
2469
2470         return ret;
2471 }
2472
2473 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
2474 {
2475         int ret;
2476
2477         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
2478         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2479         if (rbd_dev->image_format == 1)
2480                 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2481         else
2482                 ret = rbd_dev_v2_refresh(rbd_dev, hver);
2483         mutex_unlock(&ctl_mutex);
2484
2485         return ret;
2486 }
2487
2488 static int rbd_init_disk(struct rbd_device *rbd_dev)
2489 {
2490         struct gendisk *disk;
2491         struct request_queue *q;
2492         u64 segment_size;
2493
2494         /* create gendisk info */
2495         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2496         if (!disk)
2497                 return -ENOMEM;
2498
2499         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2500                  rbd_dev->dev_id);
2501         disk->major = rbd_dev->major;
2502         disk->first_minor = 0;
2503         disk->fops = &rbd_bd_ops;
2504         disk->private_data = rbd_dev;
2505
2506         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2507         if (!q)
2508                 goto out_disk;
2509
2510         /* We use the default size, but let's be explicit about it. */
2511         blk_queue_physical_block_size(q, SECTOR_SIZE);
2512
2513         /* set io sizes to object size */
2514         segment_size = rbd_obj_bytes(&rbd_dev->header);
2515         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2516         blk_queue_max_segment_size(q, segment_size);
2517         blk_queue_io_min(q, segment_size);
2518         blk_queue_io_opt(q, segment_size);
2519
2520         blk_queue_merge_bvec(q, rbd_merge_bvec);
2521         disk->queue = q;
2522
2523         q->queuedata = rbd_dev;
2524
2525         rbd_dev->disk = disk;
2526
2527         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2528
2529         return 0;
2530 out_disk:
2531         put_disk(disk);
2532
2533         return -ENOMEM;
2534 }
2535
2536 /*
2537   sysfs
2538 */
2539
2540 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2541 {
2542         return container_of(dev, struct rbd_device, dev);
2543 }
2544
2545 static ssize_t rbd_size_show(struct device *dev,
2546                              struct device_attribute *attr, char *buf)
2547 {
2548         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2549         sector_t size;
2550
2551         down_read(&rbd_dev->header_rwsem);
2552         size = get_capacity(rbd_dev->disk);
2553         up_read(&rbd_dev->header_rwsem);
2554
2555         return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2556 }
2557
2558 /*
2559  * Note this shows the features for whatever's mapped, which is not
2560  * necessarily the base image.
2561  */
2562 static ssize_t rbd_features_show(struct device *dev,
2563                              struct device_attribute *attr, char *buf)
2564 {
2565         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2566
2567         return sprintf(buf, "0x%016llx\n",
2568                         (unsigned long long) rbd_dev->mapping.features);
2569 }
2570
2571 static ssize_t rbd_major_show(struct device *dev,
2572                               struct device_attribute *attr, char *buf)
2573 {
2574         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2575
2576         return sprintf(buf, "%d\n", rbd_dev->major);
2577 }
2578
2579 static ssize_t rbd_client_id_show(struct device *dev,
2580                                   struct device_attribute *attr, char *buf)
2581 {
2582         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2583
2584         return sprintf(buf, "client%lld\n",
2585                         ceph_client_id(rbd_dev->rbd_client->client));
2586 }
2587
2588 static ssize_t rbd_pool_show(struct device *dev,
2589                              struct device_attribute *attr, char *buf)
2590 {
2591         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2592
2593         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2594 }
2595
2596 static ssize_t rbd_pool_id_show(struct device *dev,
2597                              struct device_attribute *attr, char *buf)
2598 {
2599         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2600
2601         return sprintf(buf, "%llu\n",
2602                 (unsigned long long) rbd_dev->spec->pool_id);
2603 }
2604
2605 static ssize_t rbd_name_show(struct device *dev,
2606                              struct device_attribute *attr, char *buf)
2607 {
2608         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2609
2610         if (rbd_dev->spec->image_name)
2611                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2612
2613         return sprintf(buf, "(unknown)\n");
2614 }
2615
2616 static ssize_t rbd_image_id_show(struct device *dev,
2617                              struct device_attribute *attr, char *buf)
2618 {
2619         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2620
2621         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2622 }
2623
2624 /*
2625  * Shows the name of the currently-mapped snapshot (or
2626  * RBD_SNAP_HEAD_NAME for the base image).
2627  */
2628 static ssize_t rbd_snap_show(struct device *dev,
2629                              struct device_attribute *attr,
2630                              char *buf)
2631 {
2632         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2633
2634         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2635 }
2636
2637 /*
2638  * For an rbd v2 image, shows the pool id, image id, and snapshot id
2639  * for the parent image.  If there is no parent, simply shows
2640  * "(no parent image)".
2641  */
2642 static ssize_t rbd_parent_show(struct device *dev,
2643                              struct device_attribute *attr,
2644                              char *buf)
2645 {
2646         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2647         struct rbd_spec *spec = rbd_dev->parent_spec;
2648         int count;
2649         char *bufp = buf;
2650
2651         if (!spec)
2652                 return sprintf(buf, "(no parent image)\n");
2653
2654         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2655                         (unsigned long long) spec->pool_id, spec->pool_name);
2656         if (count < 0)
2657                 return count;
2658         bufp += count;
2659
2660         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2661                         spec->image_name ? spec->image_name : "(unknown)");
2662         if (count < 0)
2663                 return count;
2664         bufp += count;
2665
2666         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2667                         (unsigned long long) spec->snap_id, spec->snap_name);
2668         if (count < 0)
2669                 return count;
2670         bufp += count;
2671
2672         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2673         if (count < 0)
2674                 return count;
2675         bufp += count;
2676
2677         return (ssize_t) (bufp - buf);
2678 }
2679
2680 static ssize_t rbd_image_refresh(struct device *dev,
2681                                  struct device_attribute *attr,
2682                                  const char *buf,
2683                                  size_t size)
2684 {
2685         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2686         int ret;
2687
2688         ret = rbd_dev_refresh(rbd_dev, NULL);
2689
2690         return ret < 0 ? ret : size;
2691 }
2692
/*
 * Per-device sysfs attributes.  All are read-only (S_IRUGO) with a
 * show routine only, except "refresh", which is owner-writable
 * (S_IWUSR) and has a store routine only.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
2704
/* NULL-terminated list of the attributes exposed for each rbd device */
static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
        &dev_attr_features.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
        &dev_attr_name.attr,
        &dev_attr_image_id.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_parent.attr,
        &dev_attr_refresh.attr,
        NULL
};
2719
/* Unnamed attribute group wrapping rbd_attrs */
static struct attribute_group rbd_attr_group = {
        .attrs = rbd_attrs,
};
2723
/* NULL-terminated list of attribute groups used by rbd_device_type */
static const struct attribute_group *rbd_attr_groups[] = {
        &rbd_attr_group,
        NULL
};
2728
/*
 * Release callback for rbd_device_type.  It does nothing; this
 * function frees no memory for the device.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
2732
/* Device type for rbd devices: name, sysfs attribute groups, release */
static struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_sysfs_dev_release,
};
2738
2739
2740 /*
2741   sysfs - snapshots
2742 */
2743
2744 static ssize_t rbd_snap_size_show(struct device *dev,
2745                                   struct device_attribute *attr,
2746                                   char *buf)
2747 {
2748         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2749
2750         return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2751 }
2752
2753 static ssize_t rbd_snap_id_show(struct device *dev,
2754                                 struct device_attribute *attr,
2755                                 char *buf)
2756 {
2757         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2758
2759         return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2760 }
2761
2762 static ssize_t rbd_snap_features_show(struct device *dev,
2763                                 struct device_attribute *attr,
2764                                 char *buf)
2765 {
2766         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2767
2768         return sprintf(buf, "0x%016llx\n",
2769                         (unsigned long long) snap->features);
2770 }
2771
/* Per-snapshot sysfs attributes; all are read-only (S_IRUGO) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2775
/* NULL-terminated list of the attributes exposed for each snapshot */
static struct attribute *rbd_snap_attrs[] = {
        &dev_attr_snap_size.attr,
        &dev_attr_snap_id.attr,
        &dev_attr_snap_features.attr,
        NULL,
};
2782
/* Unnamed attribute group wrapping rbd_snap_attrs */
static struct attribute_group rbd_snap_attr_group = {
        .attrs = rbd_snap_attrs,
};
2786
2787 static void rbd_snap_dev_release(struct device *dev)
2788 {
2789         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2790         kfree(snap->name);
2791         kfree(snap);
2792 }
2793
/* NULL-terminated list of attribute groups used by rbd_snap_device_type */
static const struct attribute_group *rbd_snap_attr_groups[] = {
        &rbd_snap_attr_group,
        NULL
};
2798
/* Device type for snapshot devices: sysfs attribute groups and release */
static struct device_type rbd_snap_device_type = {
        .groups         = rbd_snap_attr_groups,
        .release        = rbd_snap_dev_release,
};
2803
2804 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2805 {
2806         kref_get(&spec->kref);
2807
2808         return spec;
2809 }
2810
2811 static void rbd_spec_free(struct kref *kref);
2812 static void rbd_spec_put(struct rbd_spec *spec)
2813 {
2814         if (spec)
2815                 kref_put(&spec->kref, rbd_spec_free);
2816 }
2817
2818 static struct rbd_spec *rbd_spec_alloc(void)
2819 {
2820         struct rbd_spec *spec;
2821
2822         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2823         if (!spec)
2824                 return NULL;
2825         kref_init(&spec->kref);
2826
2827         return spec;
2828 }
2829
2830 static void rbd_spec_free(struct kref *kref)
2831 {
2832         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2833
2834         kfree(spec->pool_name);
2835         kfree(spec->image_id);
2836         kfree(spec->image_name);
2837         kfree(spec->snap_name);
2838         kfree(spec);
2839 }
2840
2841 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2842                                 struct rbd_spec *spec)
2843 {
2844         struct rbd_device *rbd_dev;
2845
2846         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2847         if (!rbd_dev)
2848                 return NULL;
2849
2850         spin_lock_init(&rbd_dev->lock);
2851         rbd_dev->flags = 0;
2852         INIT_LIST_HEAD(&rbd_dev->node);
2853         INIT_LIST_HEAD(&rbd_dev->snaps);
2854         init_rwsem(&rbd_dev->header_rwsem);
2855
2856         rbd_dev->spec = spec;
2857         rbd_dev->rbd_client = rbdc;
2858
2859         /* Initialize the layout used for all rbd requests */
2860
2861         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2862         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2863         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2864         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2865
2866         return rbd_dev;
2867 }
2868
/*
 * Tear down an rbd_device: drop the reference to its parent spec
 * (rbd_spec_put() tolerates a null pointer), free the header object
 * name, release the client via rbd_put_client(), drop the reference
 * to its own spec, and finally free the structure itself.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
        rbd_spec_put(rbd_dev->parent_spec);
        kfree(rbd_dev->header_name);
        rbd_put_client(rbd_dev->rbd_client);
        rbd_spec_put(rbd_dev->spec);
        kfree(rbd_dev);
}
2877
/*
 * Report whether a snapshot's device has been registered.  The
 * answer is based on whether the device's type has been set to
 * rbd_snap_device_type (which rbd_register_snap_dev() does); the
 * assertion checks that this agrees with device_is_registered().
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
        bool ret = snap->dev.type == &rbd_snap_device_type;
        bool reg = device_is_registered(&snap->dev);

        /* Holds exactly when ret and reg have the same value */
        rbd_assert(!ret ^ reg);

        return ret;
}
2887
2888 static void rbd_remove_snap_dev(struct rbd_snap *snap)
2889 {
2890         list_del(&snap->node);
2891         if (device_is_registered(&snap->dev))
2892                 device_unregister(&snap->dev);
2893 }
2894
2895 static int rbd_register_snap_dev(struct rbd_snap *snap,
2896                                   struct device *parent)
2897 {
2898         struct device *dev = &snap->dev;
2899         int ret;
2900
2901         dev->type = &rbd_snap_device_type;
2902         dev->parent = parent;
2903         dev->release = rbd_snap_dev_release;
2904         dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2905         dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2906
2907         ret = device_register(dev);
2908
2909         return ret;
2910 }
2911
2912 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2913                                                 const char *snap_name,
2914                                                 u64 snap_id, u64 snap_size,
2915                                                 u64 snap_features)
2916 {
2917         struct rbd_snap *snap;
2918         int ret;
2919
2920         snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2921         if (!snap)
2922                 return ERR_PTR(-ENOMEM);
2923
2924         ret = -ENOMEM;
2925         snap->name = kstrdup(snap_name, GFP_KERNEL);
2926         if (!snap->name)
2927                 goto err;
2928
2929         snap->id = snap_id;
2930         snap->size = snap_size;
2931         snap->features = snap_features;
2932
2933         return snap;
2934
2935 err:
2936         kfree(snap->name);
2937         kfree(snap);
2938
2939         return ERR_PTR(ret);
2940 }
2941
2942 static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2943                 u64 *snap_size, u64 *snap_features)
2944 {
2945         char *snap_name;
2946
2947         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2948
2949         *snap_size = rbd_dev->header.snap_sizes[which];
2950         *snap_features = 0;     /* No features for v1 */
2951
2952         /* Skip over names until we find the one we are looking for */
2953
2954         snap_name = rbd_dev->header.snap_names;
2955         while (which--)
2956                 snap_name += strlen(snap_name) + 1;
2957
2958         return snap_name;
2959 }
2960
2961 /*
2962  * Get the size and object order for an image snapshot, or if
2963  * snap_id is CEPH_NOSNAP, gets this information for the base
2964  * image.
2965  */
2966 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2967                                 u8 *order, u64 *snap_size)
2968 {
2969         __le64 snapid = cpu_to_le64(snap_id);
2970         int ret;
2971         struct {
2972                 u8 order;
2973                 __le64 size;
2974         } __attribute__ ((packed)) size_buf = { 0 };
2975
2976         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2977                                 "rbd", "get_size",
2978                                 (char *) &snapid, sizeof (snapid),
2979                                 (char *) &size_buf, sizeof (size_buf), NULL);
2980         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2981         if (ret < 0)
2982                 return ret;
2983
2984         *order = size_buf.order;
2985         *snap_size = le64_to_cpu(size_buf.size);
2986
2987         dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
2988                 (unsigned long long) snap_id, (unsigned int) *order,
2989                 (unsigned long long) *snap_size);
2990
2991         return 0;
2992 }
2993
2994 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2995 {
2996         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2997                                         &rbd_dev->header.obj_order,
2998                                         &rbd_dev->header.image_size);
2999 }
3000
3001 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3002 {
3003         void *reply_buf;
3004         int ret;
3005         void *p;
3006
3007         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3008         if (!reply_buf)
3009                 return -ENOMEM;
3010
3011         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3012                                 "rbd", "get_object_prefix",
3013                                 NULL, 0,
3014                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
3015         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3016         if (ret < 0)
3017                 goto out;
3018
3019         p = reply_buf;
3020         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
3021                                                 p + RBD_OBJ_PREFIX_LEN_MAX,
3022                                                 NULL, GFP_NOIO);
3023
3024         if (IS_ERR(rbd_dev->header.object_prefix)) {
3025                 ret = PTR_ERR(rbd_dev->header.object_prefix);
3026                 rbd_dev->header.object_prefix = NULL;
3027         } else {
3028                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
3029         }
3030
3031 out:
3032         kfree(reply_buf);
3033
3034         return ret;
3035 }
3036
3037 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3038                 u64 *snap_features)
3039 {
3040         __le64 snapid = cpu_to_le64(snap_id);
3041         struct {
3042                 __le64 features;
3043                 __le64 incompat;
3044         } features_buf = { 0 };
3045         u64 incompat;
3046         int ret;
3047
3048         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3049                                 "rbd", "get_features",
3050                                 (char *) &snapid, sizeof (snapid),
3051                                 (char *) &features_buf, sizeof (features_buf),
3052                                 NULL);
3053         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3054         if (ret < 0)
3055                 return ret;
3056
3057         incompat = le64_to_cpu(features_buf.incompat);
3058         if (incompat & ~RBD_FEATURES_SUPPORTED)
3059                 return -ENXIO;
3060
3061         *snap_features = le64_to_cpu(features_buf.features);
3062
3063         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3064                 (unsigned long long) snap_id,
3065                 (unsigned long long) *snap_features,
3066                 (unsigned long long) le64_to_cpu(features_buf.incompat));
3067
3068         return 0;
3069 }
3070
3071 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3072 {
3073         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3074                                                 &rbd_dev->header.features);
3075 }
3076
3077 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3078 {
3079         struct rbd_spec *parent_spec;
3080         size_t size;
3081         void *reply_buf = NULL;
3082         __le64 snapid;
3083         void *p;
3084         void *end;
3085         char *image_id;
3086         u64 overlap;
3087         int ret;
3088
3089         parent_spec = rbd_spec_alloc();
3090         if (!parent_spec)
3091                 return -ENOMEM;
3092
3093         size = sizeof (__le64) +                                /* pool_id */
3094                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
3095                 sizeof (__le64) +                               /* snap_id */
3096                 sizeof (__le64);                                /* overlap */
3097         reply_buf = kmalloc(size, GFP_KERNEL);
3098         if (!reply_buf) {
3099                 ret = -ENOMEM;
3100                 goto out_err;
3101         }
3102
3103         snapid = cpu_to_le64(CEPH_NOSNAP);
3104         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3105                                 "rbd", "get_parent",
3106                                 (char *) &snapid, sizeof (snapid),
3107                                 (char *) reply_buf, size, NULL);
3108         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3109         if (ret < 0)
3110                 goto out_err;
3111
3112         ret = -ERANGE;
3113         p = reply_buf;
3114         end = (char *) reply_buf + size;
3115         ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3116         if (parent_spec->pool_id == CEPH_NOPOOL)
3117                 goto out;       /* No parent?  No problem. */
3118
3119         /* The ceph file layout needs to fit pool id in 32 bits */
3120
3121         ret = -EIO;
3122         if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
3123                 goto out;
3124
3125         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3126         if (IS_ERR(image_id)) {
3127                 ret = PTR_ERR(image_id);
3128                 goto out_err;
3129         }
3130         parent_spec->image_id = image_id;
3131         ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3132         ceph_decode_64_safe(&p, end, overlap, out_err);
3133
3134         rbd_dev->parent_overlap = overlap;
3135         rbd_dev->parent_spec = parent_spec;
3136         parent_spec = NULL;     /* rbd_dev now owns this */
3137 out:
3138         ret = 0;
3139 out_err:
3140         kfree(reply_buf);
3141         rbd_spec_put(parent_spec);
3142
3143         return ret;
3144 }
3145
3146 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3147 {
3148         size_t image_id_size;
3149         char *image_id;
3150         void *p;
3151         void *end;
3152         size_t size;
3153         void *reply_buf = NULL;
3154         size_t len = 0;
3155         char *image_name = NULL;
3156         int ret;
3157
3158         rbd_assert(!rbd_dev->spec->image_name);
3159
3160         len = strlen(rbd_dev->spec->image_id);
3161         image_id_size = sizeof (__le32) + len;
3162         image_id = kmalloc(image_id_size, GFP_KERNEL);
3163         if (!image_id)
3164                 return NULL;
3165
3166         p = image_id;
3167         end = (char *) image_id + image_id_size;
3168         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
3169
3170         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3171         reply_buf = kmalloc(size, GFP_KERNEL);
3172         if (!reply_buf)
3173                 goto out;
3174
3175         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
3176                                 "rbd", "dir_get_name",
3177                                 image_id, image_id_size,
3178                                 (char *) reply_buf, size, NULL);
3179         if (ret < 0)
3180                 goto out;
3181         p = reply_buf;
3182         end = (char *) reply_buf + size;
3183         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3184         if (IS_ERR(image_name))
3185                 image_name = NULL;
3186         else
3187                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3188 out:
3189         kfree(reply_buf);
3190         kfree(image_id);
3191
3192         return image_name;
3193 }
3194
3195 /*
3196  * When a parent image gets probed, we only have the pool, image,
3197  * and snapshot ids but not the names of any of them.  This call
3198  * is made later to fill in those names.  It has to be done after
3199  * rbd_dev_snaps_update() has completed because some of the
3200  * information (in particular, snapshot name) is not available
3201  * until then.
3202  */
3203 static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3204 {
3205         struct ceph_osd_client *osdc;
3206         const char *name;
3207         void *reply_buf = NULL;
3208         int ret;
3209
3210         if (rbd_dev->spec->pool_name)
3211                 return 0;       /* Already have the names */
3212
3213         /* Look up the pool name */
3214
3215         osdc = &rbd_dev->rbd_client->client->osdc;
3216         name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3217         if (!name) {
3218                 rbd_warn(rbd_dev, "there is no pool with id %llu",
3219                         rbd_dev->spec->pool_id);        /* Really a BUG() */
3220                 return -EIO;
3221         }
3222
3223         rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3224         if (!rbd_dev->spec->pool_name)
3225                 return -ENOMEM;
3226
3227         /* Fetch the image name; tolerate failure here */
3228
3229         name = rbd_dev_image_name(rbd_dev);
3230         if (name)
3231                 rbd_dev->spec->image_name = (char *) name;
3232         else
3233                 rbd_warn(rbd_dev, "unable to get image name");
3234
3235         /* Look up the snapshot name. */
3236
3237         name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3238         if (!name) {
3239                 rbd_warn(rbd_dev, "no snapshot with id %llu",
3240                         rbd_dev->spec->snap_id);        /* Really a BUG() */
3241                 ret = -EIO;
3242                 goto out_err;
3243         }
3244         rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3245         if(!rbd_dev->spec->snap_name)
3246                 goto out_err;
3247
3248         return 0;
3249 out_err:
3250         kfree(reply_buf);
3251         kfree(rbd_dev->spec->pool_name);
3252         rbd_dev->spec->pool_name = NULL;
3253
3254         return ret;
3255 }
3256
3257 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
3258 {
3259         size_t size;
3260         int ret;
3261         void *reply_buf;
3262         void *p;
3263         void *end;
3264         u64 seq;
3265         u32 snap_count;
3266         struct ceph_snap_context *snapc;
3267         u32 i;
3268
3269         /*
3270          * We'll need room for the seq value (maximum snapshot id),
3271          * snapshot count, and array of that many snapshot ids.
3272          * For now we have a fixed upper limit on the number we're
3273          * prepared to receive.
3274          */
3275         size = sizeof (__le64) + sizeof (__le32) +
3276                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
3277         reply_buf = kzalloc(size, GFP_KERNEL);
3278         if (!reply_buf)
3279                 return -ENOMEM;
3280
3281         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3282                                 "rbd", "get_snapcontext",
3283                                 NULL, 0,
3284                                 reply_buf, size, ver);
3285         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3286         if (ret < 0)
3287                 goto out;
3288
3289         ret = -ERANGE;
3290         p = reply_buf;
3291         end = (char *) reply_buf + size;
3292         ceph_decode_64_safe(&p, end, seq, out);
3293         ceph_decode_32_safe(&p, end, snap_count, out);
3294
3295         /*
3296          * Make sure the reported number of snapshot ids wouldn't go
3297          * beyond the end of our buffer.  But before checking that,
3298          * make sure the computed size of the snapshot context we
3299          * allocate is representable in a size_t.
3300          */
3301         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3302                                  / sizeof (u64)) {
3303                 ret = -EINVAL;
3304                 goto out;
3305         }
3306         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3307                 goto out;
3308
3309         size = sizeof (struct ceph_snap_context) +
3310                                 snap_count * sizeof (snapc->snaps[0]);
3311         snapc = kmalloc(size, GFP_KERNEL);
3312         if (!snapc) {
3313                 ret = -ENOMEM;
3314                 goto out;
3315         }
3316
3317         atomic_set(&snapc->nref, 1);
3318         snapc->seq = seq;
3319         snapc->num_snaps = snap_count;
3320         for (i = 0; i < snap_count; i++)
3321                 snapc->snaps[i] = ceph_decode_64(&p);
3322
3323         rbd_dev->header.snapc = snapc;
3324
3325         dout("  snap context seq = %llu, snap_count = %u\n",
3326                 (unsigned long long) seq, (unsigned int) snap_count);
3327
3328 out:
3329         kfree(reply_buf);
3330
3331         return 0;
3332 }
3333
3334 static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3335 {
3336         size_t size;
3337         void *reply_buf;
3338         __le64 snap_id;
3339         int ret;
3340         void *p;
3341         void *end;
3342         char *snap_name;
3343
3344         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3345         reply_buf = kmalloc(size, GFP_KERNEL);
3346         if (!reply_buf)
3347                 return ERR_PTR(-ENOMEM);
3348
3349         snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
3350         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3351                                 "rbd", "get_snapshot_name",
3352                                 (char *) &snap_id, sizeof (snap_id),
3353                                 reply_buf, size, NULL);
3354         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3355         if (ret < 0)
3356                 goto out;
3357
3358         p = reply_buf;
3359         end = (char *) reply_buf + size;
3360         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3361         if (IS_ERR(snap_name)) {
3362                 ret = PTR_ERR(snap_name);
3363                 goto out;
3364         } else {
3365                 dout("  snap_id 0x%016llx snap_name = %s\n",
3366                         (unsigned long long) le64_to_cpu(snap_id), snap_name);
3367         }
3368         kfree(reply_buf);
3369
3370         return snap_name;
3371 out:
3372         kfree(reply_buf);
3373
3374         return ERR_PTR(ret);
3375 }
3376
3377 static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3378                 u64 *snap_size, u64 *snap_features)
3379 {
3380         u64 snap_id;
3381         u8 order;
3382         int ret;
3383
3384         snap_id = rbd_dev->header.snapc->snaps[which];
3385         ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3386         if (ret)
3387                 return ERR_PTR(ret);
3388         ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3389         if (ret)
3390                 return ERR_PTR(ret);
3391
3392         return rbd_dev_v2_snap_name(rbd_dev, which);
3393 }
3394
3395 static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3396                 u64 *snap_size, u64 *snap_features)
3397 {
3398         if (rbd_dev->image_format == 1)
3399                 return rbd_dev_v1_snap_info(rbd_dev, which,
3400                                         snap_size, snap_features);
3401         if (rbd_dev->image_format == 2)
3402                 return rbd_dev_v2_snap_info(rbd_dev, which,
3403                                         snap_size, snap_features);
3404         return ERR_PTR(-EINVAL);
3405 }
3406
3407 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3408 {
3409         int ret;
3410         __u8 obj_order;
3411
3412         down_write(&rbd_dev->header_rwsem);
3413
3414         /* Grab old order first, to see if it changes */
3415
3416         obj_order = rbd_dev->header.obj_order,
3417         ret = rbd_dev_v2_image_size(rbd_dev);
3418         if (ret)
3419                 goto out;
3420         if (rbd_dev->header.obj_order != obj_order) {
3421                 ret = -EIO;
3422                 goto out;
3423         }
3424         rbd_update_mapping_size(rbd_dev);
3425
3426         ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3427         dout("rbd_dev_v2_snap_context returned %d\n", ret);
3428         if (ret)
3429                 goto out;
3430         ret = rbd_dev_snaps_update(rbd_dev);
3431         dout("rbd_dev_snaps_update returned %d\n", ret);
3432         if (ret)
3433                 goto out;
3434         ret = rbd_dev_snaps_register(rbd_dev);
3435         dout("rbd_dev_snaps_register returned %d\n", ret);
3436 out:
3437         up_write(&rbd_dev->header_rwsem);
3438
3439         return ret;
3440 }
3441
3442 /*
3443  * Scan the rbd device's current snapshot list and compare it to the
3444  * newly-received snapshot context.  Remove any existing snapshots
3445  * not present in the new snapshot context.  Add a new snapshot for
3446  * any snaphots in the snapshot context not in the current list.
3447  * And verify there are no changes to snapshots we already know
3448  * about.
3449  *
3450  * Assumes the snapshots in the snapshot context are sorted by
3451  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
3452  * are also maintained in that order.)
3453  */
3454 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3455 {
3456         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3457         const u32 snap_count = snapc->num_snaps;
3458         struct list_head *head = &rbd_dev->snaps;
3459         struct list_head *links = head->next;
3460         u32 index = 0;
3461
3462         dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
3463         while (index < snap_count || links != head) {
3464                 u64 snap_id;
3465                 struct rbd_snap *snap;
3466                 char *snap_name;
3467                 u64 snap_size = 0;
3468                 u64 snap_features = 0;
3469
3470                 snap_id = index < snap_count ? snapc->snaps[index]
3471                                              : CEPH_NOSNAP;
3472                 snap = links != head ? list_entry(links, struct rbd_snap, node)
3473                                      : NULL;
3474                 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3475
3476                 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3477                         struct list_head *next = links->next;
3478
3479                         /*
3480                          * A previously-existing snapshot is not in
3481                          * the new snap context.
3482                          *
3483                          * If the now missing snapshot is the one the
3484                          * image is mapped to, clear its exists flag
3485                          * so we can avoid sending any more requests
3486                          * to it.
3487                          */
3488                         if (rbd_dev->spec->snap_id == snap->id)
3489                                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3490                         rbd_remove_snap_dev(snap);
3491                         dout("%ssnap id %llu has been removed\n",
3492                                 rbd_dev->spec->snap_id == snap->id ?
3493                                                         "mapped " : "",
3494                                 (unsigned long long) snap->id);
3495
3496                         /* Done with this list entry; advance */
3497
3498                         links = next;
3499                         continue;
3500                 }
3501
3502                 snap_name = rbd_dev_snap_info(rbd_dev, index,
3503                                         &snap_size, &snap_features);
3504                 if (IS_ERR(snap_name))
3505                         return PTR_ERR(snap_name);
3506
3507                 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3508                         (unsigned long long) snap_id);
3509                 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3510                         struct rbd_snap *new_snap;
3511
3512                         /* We haven't seen this snapshot before */
3513
3514                         new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3515                                         snap_id, snap_size, snap_features);
3516                         if (IS_ERR(new_snap)) {
3517                                 int err = PTR_ERR(new_snap);
3518
3519                                 dout("  failed to add dev, error %d\n", err);
3520
3521                                 return err;
3522                         }
3523
3524                         /* New goes before existing, or at end of list */
3525
3526                         dout("  added dev%s\n", snap ? "" : " at end\n");
3527                         if (snap)
3528                                 list_add_tail(&new_snap->node, &snap->node);
3529                         else
3530                                 list_add_tail(&new_snap->node, head);
3531                 } else {
3532                         /* Already have this one */
3533
3534                         dout("  already present\n");
3535
3536                         rbd_assert(snap->size == snap_size);
3537                         rbd_assert(!strcmp(snap->name, snap_name));
3538                         rbd_assert(snap->features == snap_features);
3539
3540                         /* Done with this list entry; advance */
3541
3542                         links = links->next;
3543                 }
3544
3545                 /* Advance to the next entry in the snapshot context */
3546
3547                 index++;
3548         }
3549         dout("%s: done\n", __func__);
3550
3551         return 0;
3552 }
3553
3554 /*
3555  * Scan the list of snapshots and register the devices for any that
3556  * have not already been registered.
3557  */
3558 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3559 {
3560         struct rbd_snap *snap;
3561         int ret = 0;
3562
3563         dout("%s:\n", __func__);
3564         if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3565                 return -EIO;
3566
3567         list_for_each_entry(snap, &rbd_dev->snaps, node) {
3568                 if (!rbd_snap_registered(snap)) {
3569                         ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3570                         if (ret < 0)
3571                                 break;
3572                 }
3573         }
3574         dout("%s: returning %d\n", __func__, ret);
3575
3576         return ret;
3577 }
3578
3579 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3580 {
3581         struct device *dev;
3582         int ret;
3583
3584         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3585
3586         dev = &rbd_dev->dev;
3587         dev->bus = &rbd_bus_type;
3588         dev->type = &rbd_device_type;
3589         dev->parent = &rbd_root_dev;
3590         dev->release = rbd_dev_release;
3591         dev_set_name(dev, "%d", rbd_dev->dev_id);
3592         ret = device_register(dev);
3593
3594         mutex_unlock(&ctl_mutex);
3595
3596         return ret;
3597 }
3598
3599 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3600 {
3601         device_unregister(&rbd_dev->dev);
3602 }
3603
3604 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
3605
3606 /*
3607  * Get a unique rbd identifier for the given new rbd_dev, and add
3608  * the rbd_dev to the global list.  The minimum rbd id is 1.
3609  */
3610 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3611 {
3612         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3613
3614         spin_lock(&rbd_dev_list_lock);
3615         list_add_tail(&rbd_dev->node, &rbd_dev_list);
3616         spin_unlock(&rbd_dev_list_lock);
3617         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3618                 (unsigned long long) rbd_dev->dev_id);
3619 }
3620
3621 /*
3622  * Remove an rbd_dev from the global list, and record that its
3623  * identifier is no longer in use.
3624  */
3625 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
3626 {
3627         struct list_head *tmp;
3628         int rbd_id = rbd_dev->dev_id;
3629         int max_id;
3630
3631         rbd_assert(rbd_id > 0);
3632
3633         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3634                 (unsigned long long) rbd_dev->dev_id);
3635         spin_lock(&rbd_dev_list_lock);
3636         list_del_init(&rbd_dev->node);
3637
3638         /*
3639          * If the id being "put" is not the current maximum, there
3640          * is nothing special we need to do.
3641          */
3642         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3643                 spin_unlock(&rbd_dev_list_lock);
3644                 return;
3645         }
3646
3647         /*
3648          * We need to update the current maximum id.  Search the
3649          * list to find out what it is.  We're more likely to find
3650          * the maximum at the end, so search the list backward.
3651          */
3652         max_id = 0;
3653         list_for_each_prev(tmp, &rbd_dev_list) {
3654                 struct rbd_device *rbd_dev;
3655
3656                 rbd_dev = list_entry(tmp, struct rbd_device, node);
3657                 if (rbd_dev->dev_id > max_id)
3658                         max_id = rbd_dev->dev_id;
3659         }
3660         spin_unlock(&rbd_dev_list_lock);
3661
3662         /*
3663          * The max id could have been updated by rbd_dev_id_get(), in
3664          * which case it now accurately reflects the new maximum.
3665          * Be careful not to overwrite the maximum value in that
3666          * case.
3667          */
3668         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3669         dout("  max dev id has been reset\n");
3670 }
3671
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the run of non-space characters
 * that starts there.  The string at *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
        /*
         * The characters for which isspace() is nonzero in the
         * "C" and "POSIX" locales.
         */
        static const char whitespace[] = " \f\n\r\t\v";
        const char *start = *buf;

        start += strspn(start, whitespace);     /* skip to token start */
        *buf = start;

        return strcspn(start, whitespace);      /* length of the token */
}
3690
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer followed by a terminating '\0'.  The
 * string at *buf must be '\0'-terminated on entry.
 *
 * Returns the token's length, excluding the '\0'.  A return of 0
 * means no token was found; a return >= token_size means the token
 * did not fit and nothing was copied.
 *
 * In every case *buf is left pointing just past the token found,
 * including when the token was too large to be copied.
 */
static inline size_t copy_token(const char **buf,
                                char *token,
                                size_t token_size)
{
        const size_t len = next_token(buf);
        const char *start = *buf;

        *buf = start + len;
        if (len >= token_size)
                return len;

        memcpy(token, start, len);
        token[len] = '\0';

        return len;
}
3720
3721 /*
3722  * Finds the next token in *buf, dynamically allocates a buffer big
3723  * enough to hold a copy of it, and copies the token into the new
3724  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3725  * that a duplicate buffer is created even for a zero-length token.
3726  *
3727  * Returns a pointer to the newly-allocated duplicate, or a null
3728  * pointer if memory for the duplicate was not available.  If
3729  * the lenp argument is a non-null pointer, the length of the token
3730  * (not including the '\0') is returned in *lenp.
3731  *
3732  * If successful, the *buf pointer will be updated to point beyond
3733  * the end of the found token.
3734  *
3735  * Note: uses GFP_KERNEL for allocation.
3736  */
3737 static inline char *dup_token(const char **buf, size_t *lenp)
3738 {
3739         char *dup;
3740         size_t len;
3741
3742         len = next_token(buf);
3743         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3744         if (!dup)
3745                 return NULL;
3746         *(dup + len) = '\0';
3747         *buf += len;
3748
3749         if (lenp)
3750                 *lenp = len;
3751
3752         return dup;
3753 }
3754
3755 /*
3756  * Parse the options provided for an "rbd add" (i.e., rbd image
3757  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3758  * and the data written is passed here via a NUL-terminated buffer.
3759  * Returns 0 if successful or an error code otherwise.
3760  *
3761  * The information extracted from these options is recorded in
3762  * the other parameters which return dynamically-allocated
3763  * structures:
3764  *  ceph_opts
3765  *      The address of a pointer that will refer to a ceph options
3766  *      structure.  Caller must release the returned pointer using
3767  *      ceph_destroy_options() when it is no longer needed.
3768  *  rbd_opts
3769  *      Address of an rbd options pointer.  Fully initialized by
3770  *      this function; caller must release with kfree().
3771  *  spec
3772  *      Address of an rbd image specification pointer.  Fully
3773  *      initialized by this function based on parsed options.
3774  *      Caller must release with rbd_spec_put().
3775  *
3776  * The options passed take this form:
3777  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3778  * where:
3779  *  <mon_addrs>
3780  *      A comma-separated list of one or more monitor addresses.
3781  *      A monitor address is an ip address, optionally followed
3782  *      by a port number (separated by a colon).
3783  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3784  *  <options>
3785  *      A comma-separated list of ceph and/or rbd options.
3786  *  <pool_name>
3787  *      The name of the rados pool containing the rbd image.
3788  *  <image_name>
3789  *      The name of the image in that pool to map.
3790  *  <snap_id>
3791  *      An optional snapshot id.  If provided, the mapping will
3792  *      present data from the image at the time that snapshot was
3793  *      created.  The image head is used if no snapshot id is
3794  *      provided.  Snapshot mappings are always read-only.
3795  */
3796 static int rbd_add_parse_args(const char *buf,
3797                                 struct ceph_options **ceph_opts,
3798                                 struct rbd_options **opts,
3799                                 struct rbd_spec **rbd_spec)
3800 {
3801         size_t len;
3802         char *options;
3803         const char *mon_addrs;
3804         size_t mon_addrs_size;
3805         struct rbd_spec *spec = NULL;
3806         struct rbd_options *rbd_opts = NULL;
3807         struct ceph_options *copts;
3808         int ret;
3809
3810         /* The first four tokens are required */
3811
3812         len = next_token(&buf);
3813         if (!len) {
3814                 rbd_warn(NULL, "no monitor address(es) provided");
3815                 return -EINVAL;
3816         }
3817         mon_addrs = buf;
3818         mon_addrs_size = len + 1;
3819         buf += len;
3820
3821         ret = -EINVAL;
3822         options = dup_token(&buf, NULL);
3823         if (!options)
3824                 return -ENOMEM;
3825         if (!*options) {
3826                 rbd_warn(NULL, "no options provided");
3827                 goto out_err;
3828         }
3829
3830         spec = rbd_spec_alloc();
3831         if (!spec)
3832                 goto out_mem;
3833
3834         spec->pool_name = dup_token(&buf, NULL);
3835         if (!spec->pool_name)
3836                 goto out_mem;
3837         if (!*spec->pool_name) {
3838                 rbd_warn(NULL, "no pool name provided");
3839                 goto out_err;
3840         }
3841
3842         spec->image_name = dup_token(&buf, NULL);
3843         if (!spec->image_name)
3844                 goto out_mem;
3845         if (!*spec->image_name) {
3846                 rbd_warn(NULL, "no image name provided");
3847                 goto out_err;
3848         }
3849
3850         /*
3851          * Snapshot name is optional; default is to use "-"
3852          * (indicating the head/no snapshot).
3853          */
3854         len = next_token(&buf);
3855         if (!len) {
3856                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3857                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3858         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
3859                 ret = -ENAMETOOLONG;
3860                 goto out_err;
3861         }
3862         spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3863         if (!spec->snap_name)
3864                 goto out_mem;
3865         *(spec->snap_name + len) = '\0';
3866
3867         /* Initialize all rbd options to the defaults */
3868
3869         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3870         if (!rbd_opts)
3871                 goto out_mem;
3872
3873         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3874
3875         copts = ceph_parse_options(options, mon_addrs,
3876                                         mon_addrs + mon_addrs_size - 1,
3877                                         parse_rbd_opts_token, rbd_opts);
3878         if (IS_ERR(copts)) {
3879                 ret = PTR_ERR(copts);
3880                 goto out_err;
3881         }
3882         kfree(options);
3883
3884         *ceph_opts = copts;
3885         *opts = rbd_opts;
3886         *rbd_spec = spec;
3887
3888         return 0;
3889 out_mem:
3890         ret = -ENOMEM;
3891 out_err:
3892         kfree(rbd_opts);
3893         rbd_spec_put(spec);
3894         kfree(options);
3895
3896         return ret;
3897 }
3898
3899 /*
3900  * An rbd format 2 image has a unique identifier, distinct from the
3901  * name given to it by the user.  Internally, that identifier is
3902  * what's used to specify the names of objects related to the image.
3903  *
3904  * A special "rbd id" object is used to map an rbd image name to its
3905  * id.  If that object doesn't exist, then there is no v2 rbd image
3906  * with the supplied name.
3907  *
3908  * This function will record the given rbd_dev's image_id field if
3909  * it can be determined, and in that case will return 0.  If any
3910  * errors occur a negative errno will be returned and the rbd_dev's
3911  * image_id field will be unchanged (and should be NULL).
3912  */
3913 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3914 {
3915         int ret;
3916         size_t size;
3917         char *object_name;
3918         void *response;
3919         void *p;
3920
3921         /* If we already have it we don't need to look it up */
3922
3923         if (rbd_dev->spec->image_id)
3924                 return 0;
3925
3926         /*
3927          * When probing a parent image, the image id is already
3928          * known (and the image name likely is not).  There's no
3929          * need to fetch the image id again in this case.
3930          */
3931         if (rbd_dev->spec->image_id)
3932                 return 0;
3933
3934         /*
3935          * First, see if the format 2 image id file exists, and if
3936          * so, get the image's persistent id from it.
3937          */
3938         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3939         object_name = kmalloc(size, GFP_NOIO);
3940         if (!object_name)
3941                 return -ENOMEM;
3942         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3943         dout("rbd id object name is %s\n", object_name);
3944
3945         /* Response will be an encoded string, which includes a length */
3946
3947         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3948         response = kzalloc(size, GFP_NOIO);
3949         if (!response) {
3950                 ret = -ENOMEM;
3951                 goto out;
3952         }
3953
3954         ret = rbd_obj_method_sync(rbd_dev, object_name,
3955                                 "rbd", "get_id",
3956                                 NULL, 0,
3957                                 response, RBD_IMAGE_ID_LEN_MAX, NULL);
3958         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3959         if (ret < 0)
3960                 goto out;
3961
3962         p = response;
3963         rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3964                                                 p + RBD_IMAGE_ID_LEN_MAX,
3965                                                 NULL, GFP_NOIO);
3966         if (IS_ERR(rbd_dev->spec->image_id)) {
3967                 ret = PTR_ERR(rbd_dev->spec->image_id);
3968                 rbd_dev->spec->image_id = NULL;
3969         } else {
3970                 dout("image_id is %s\n", rbd_dev->spec->image_id);
3971         }
3972 out:
3973         kfree(response);
3974         kfree(object_name);
3975
3976         return ret;
3977 }
3978
3979 static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3980 {
3981         int ret;
3982         size_t size;
3983
3984         /* Version 1 images have no id; empty string is used */
3985
3986         rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3987         if (!rbd_dev->spec->image_id)
3988                 return -ENOMEM;
3989
3990         /* Record the header object name for this rbd image. */
3991
3992         size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3993         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3994         if (!rbd_dev->header_name) {
3995                 ret = -ENOMEM;
3996                 goto out_err;
3997         }
3998         sprintf(rbd_dev->header_name, "%s%s",
3999                 rbd_dev->spec->image_name, RBD_SUFFIX);
4000
4001         /* Populate rbd image metadata */
4002
4003         ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4004         if (ret < 0)
4005                 goto out_err;
4006
4007         /* Version 1 images have no parent (no layering) */
4008
4009         rbd_dev->parent_spec = NULL;
4010         rbd_dev->parent_overlap = 0;
4011
4012         rbd_dev->image_format = 1;
4013
4014         dout("discovered version 1 image, header name is %s\n",
4015                 rbd_dev->header_name);
4016
4017         return 0;
4018
4019 out_err:
4020         kfree(rbd_dev->header_name);
4021         rbd_dev->header_name = NULL;
4022         kfree(rbd_dev->spec->image_id);
4023         rbd_dev->spec->image_id = NULL;
4024
4025         return ret;
4026 }
4027
4028 static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4029 {
4030         size_t size;
4031         int ret;
4032         u64 ver = 0;
4033
4034         /*
4035          * Image id was filled in by the caller.  Record the header
4036          * object name for this rbd image.
4037          */
4038         size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
4039         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4040         if (!rbd_dev->header_name)
4041                 return -ENOMEM;
4042         sprintf(rbd_dev->header_name, "%s%s",
4043                         RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
4044
4045         /* Get the size and object order for the image */
4046
4047         ret = rbd_dev_v2_image_size(rbd_dev);
4048         if (ret < 0)
4049                 goto out_err;
4050
4051         /* Get the object prefix (a.k.a. block_name) for the image */
4052
4053         ret = rbd_dev_v2_object_prefix(rbd_dev);
4054         if (ret < 0)
4055                 goto out_err;
4056
4057         /* Get the and check features for the image */
4058
4059         ret = rbd_dev_v2_features(rbd_dev);
4060         if (ret < 0)
4061                 goto out_err;
4062
4063         /* If the image supports layering, get the parent info */
4064
4065         if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
4066                 ret = rbd_dev_v2_parent_info(rbd_dev);
4067                 if (ret < 0)
4068                         goto out_err;
4069         }
4070
4071         /* crypto and compression type aren't (yet) supported for v2 images */
4072
4073         rbd_dev->header.crypt_type = 0;
4074         rbd_dev->header.comp_type = 0;
4075
4076         /* Get the snapshot context, plus the header version */
4077
4078         ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
4079         if (ret)
4080                 goto out_err;
4081         rbd_dev->header.obj_version = ver;
4082
4083         rbd_dev->image_format = 2;
4084
4085         dout("discovered version 2 image, header name is %s\n",
4086                 rbd_dev->header_name);
4087
4088         return 0;
4089 out_err:
4090         rbd_dev->parent_overlap = 0;
4091         rbd_spec_put(rbd_dev->parent_spec);
4092         rbd_dev->parent_spec = NULL;
4093         kfree(rbd_dev->header_name);
4094         rbd_dev->header_name = NULL;
4095         kfree(rbd_dev->header.object_prefix);
4096         rbd_dev->header.object_prefix = NULL;
4097
4098         return ret;
4099 }
4100
4101 static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
4102 {
4103         struct rbd_device *parent = NULL;
4104         struct rbd_spec *parent_spec = NULL;
4105         struct rbd_client *rbdc = NULL;
4106         int ret;
4107
4108         /* no need to lock here, as rbd_dev is not registered yet */
4109         ret = rbd_dev_snaps_update(rbd_dev);
4110         if (ret)
4111                 return ret;
4112
4113         ret = rbd_dev_probe_update_spec(rbd_dev);
4114         if (ret)
4115                 goto err_out_snaps;
4116
4117         ret = rbd_dev_set_mapping(rbd_dev);
4118         if (ret)
4119                 goto err_out_snaps;
4120
4121         /* generate unique id: find highest unique id, add one */
4122         rbd_dev_id_get(rbd_dev);
4123
4124         /* Fill in the device name, now that we have its id. */
4125         BUILD_BUG_ON(DEV_NAME_LEN
4126                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4127         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4128
4129         /* Get our block major device number. */
4130
4131         ret = register_blkdev(0, rbd_dev->name);
4132         if (ret < 0)
4133                 goto err_out_id;
4134         rbd_dev->major = ret;
4135
4136         /* Set up the blkdev mapping. */
4137
4138         ret = rbd_init_disk(rbd_dev);
4139         if (ret)
4140                 goto err_out_blkdev;
4141
4142         ret = rbd_bus_add_dev(rbd_dev);
4143         if (ret)
4144                 goto err_out_disk;
4145
4146         /*
4147          * At this point cleanup in the event of an error is the job
4148          * of the sysfs code (initiated by rbd_bus_del_dev()).
4149          */
4150         /* Probe the parent if there is one */
4151
4152         if (rbd_dev->parent_spec) {
4153                 /*
4154                  * We need to pass a reference to the client and the
4155                  * parent spec when creating the parent rbd_dev.
4156                  * Images related by parent/child relationships
4157                  * always share both.
4158                  */
4159                 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4160                 rbdc = __rbd_get_client(rbd_dev->rbd_client);
4161
4162                 parent = rbd_dev_create(rbdc, parent_spec);
4163                 if (!parent) {
4164                         ret = -ENOMEM;
4165                         goto err_out_spec;
4166                 }
4167                 rbdc = NULL;            /* parent now owns reference */
4168                 parent_spec = NULL;     /* parent now owns reference */
4169                 ret = rbd_dev_probe(parent);
4170                 if (ret < 0)
4171                         goto err_out_parent;
4172                 rbd_dev->parent = parent;
4173         }
4174
4175         down_write(&rbd_dev->header_rwsem);
4176         ret = rbd_dev_snaps_register(rbd_dev);
4177         up_write(&rbd_dev->header_rwsem);
4178         if (ret)
4179                 goto err_out_bus;
4180
4181         ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4182         if (ret)
4183                 goto err_out_bus;
4184
4185         /* Everything's ready.  Announce the disk to the world. */
4186
4187         add_disk(rbd_dev->disk);
4188
4189         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4190                 (unsigned long long) rbd_dev->mapping.size);
4191
4192         return ret;
4193
4194 err_out_parent:
4195         rbd_dev_destroy(parent);
4196 err_out_spec:
4197         rbd_spec_put(parent_spec);
4198         rbd_put_client(rbdc);
4199 err_out_bus:
4200         /* this will also clean up rest of rbd_dev stuff */
4201
4202         rbd_bus_del_dev(rbd_dev);
4203
4204         return ret;
4205 err_out_disk:
4206         rbd_free_disk(rbd_dev);
4207 err_out_blkdev:
4208         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4209 err_out_id:
4210         rbd_dev_id_put(rbd_dev);
4211 err_out_snaps:
4212         rbd_remove_all_snaps(rbd_dev);
4213
4214         return ret;
4215 }
4216
4217 /*
4218  * Probe for the existence of the header object for the given rbd
4219  * device.  For format 2 images this includes determining the image
4220  * id.
4221  */
4222 static int rbd_dev_probe(struct rbd_device *rbd_dev)
4223 {
4224         int ret;
4225
4226         /*
4227          * Get the id from the image id object.  If it's not a
4228          * format 2 image, we'll get ENOENT back, and we'll assume
4229          * it's a format 1 image.
4230          */
4231         ret = rbd_dev_image_id(rbd_dev);
4232         if (ret)
4233                 ret = rbd_dev_v1_probe(rbd_dev);
4234         else
4235                 ret = rbd_dev_v2_probe(rbd_dev);
4236         if (ret) {
4237                 dout("probe failed, returning %d\n", ret);
4238
4239                 return ret;
4240         }
4241
4242         ret = rbd_dev_probe_finish(rbd_dev);
4243         if (ret)
4244                 rbd_header_free(&rbd_dev->header);
4245
4246         return ret;
4247 }
4248
4249 static ssize_t rbd_add(struct bus_type *bus,
4250                        const char *buf,
4251                        size_t count)
4252 {
4253         struct rbd_device *rbd_dev = NULL;
4254         struct ceph_options *ceph_opts = NULL;
4255         struct rbd_options *rbd_opts = NULL;
4256         struct rbd_spec *spec = NULL;
4257         struct rbd_client *rbdc;
4258         struct ceph_osd_client *osdc;
4259         int rc = -ENOMEM;
4260
4261         if (!try_module_get(THIS_MODULE))
4262                 return -ENODEV;
4263
4264         /* parse add command */
4265         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4266         if (rc < 0)
4267                 goto err_out_module;
4268
4269         rbdc = rbd_get_client(ceph_opts);
4270         if (IS_ERR(rbdc)) {
4271                 rc = PTR_ERR(rbdc);
4272                 goto err_out_args;
4273         }
4274         ceph_opts = NULL;       /* rbd_dev client now owns this */
4275
4276         /* pick the pool */
4277         osdc = &rbdc->client->osdc;
4278         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4279         if (rc < 0)
4280                 goto err_out_client;
4281         spec->pool_id = (u64) rc;
4282
4283         /* The ceph file layout needs to fit pool id in 32 bits */
4284
4285         if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4286                 rc = -EIO;
4287                 goto err_out_client;
4288         }
4289
4290         rbd_dev = rbd_dev_create(rbdc, spec);
4291         if (!rbd_dev)
4292                 goto err_out_client;
4293         rbdc = NULL;            /* rbd_dev now owns this */
4294         spec = NULL;            /* rbd_dev now owns this */
4295
4296         rbd_dev->mapping.read_only = rbd_opts->read_only;
4297         kfree(rbd_opts);
4298         rbd_opts = NULL;        /* done with this */
4299
4300         rc = rbd_dev_probe(rbd_dev);
4301         if (rc < 0)
4302                 goto err_out_rbd_dev;
4303
4304         return count;
4305 err_out_rbd_dev:
4306         rbd_dev_destroy(rbd_dev);
4307 err_out_client:
4308         rbd_put_client(rbdc);
4309 err_out_args:
4310         if (ceph_opts)
4311                 ceph_destroy_options(ceph_opts);
4312         kfree(rbd_opts);
4313         rbd_spec_put(spec);
4314 err_out_module:
4315         module_put(THIS_MODULE);
4316
4317         dout("Error adding device %s\n", buf);
4318
4319         return (ssize_t) rc;
4320 }
4321
4322 static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4323 {
4324         struct list_head *tmp;
4325         struct rbd_device *rbd_dev;
4326
4327         spin_lock(&rbd_dev_list_lock);
4328         list_for_each(tmp, &rbd_dev_list) {
4329                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4330                 if (rbd_dev->dev_id == dev_id) {
4331                         spin_unlock(&rbd_dev_list_lock);
4332                         return rbd_dev;
4333                 }
4334         }
4335         spin_unlock(&rbd_dev_list_lock);
4336         return NULL;
4337 }
4338
4339 static void rbd_dev_release(struct device *dev)
4340 {
4341         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4342
4343         if (rbd_dev->watch_event)
4344                 rbd_dev_header_watch_sync(rbd_dev, 0);
4345
4346         /* clean up and free blkdev */
4347         rbd_free_disk(rbd_dev);
4348         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4349
4350         /* release allocated disk header fields */
4351         rbd_header_free(&rbd_dev->header);
4352
4353         /* done with the id, and with the rbd_dev */
4354         rbd_dev_id_put(rbd_dev);
4355         rbd_assert(rbd_dev->rbd_client != NULL);
4356         rbd_dev_destroy(rbd_dev);
4357
4358         /* release module ref */
4359         module_put(THIS_MODULE);
4360 }
4361
/*
 * Remove a single rbd device: first all of its snapshots, then the
 * device itself from the bus.  Parent images are not touched here;
 * rbd_remove() deals with those.
 */
static void __rbd_remove(struct rbd_device *rbd_dev)
{
	/* Snapshots go first, the device after them */
	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);
}
4367
4368 static ssize_t rbd_remove(struct bus_type *bus,
4369                           const char *buf,
4370                           size_t count)
4371 {
4372         struct rbd_device *rbd_dev = NULL;
4373         int target_id, rc;
4374         unsigned long ul;
4375         int ret = count;
4376
4377         rc = strict_strtoul(buf, 10, &ul);
4378         if (rc)
4379                 return rc;
4380
4381         /* convert to int; abort if we lost anything in the conversion */
4382         target_id = (int) ul;
4383         if (target_id != ul)
4384                 return -EINVAL;
4385
4386         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4387
4388         rbd_dev = __rbd_get_dev(target_id);
4389         if (!rbd_dev) {
4390                 ret = -ENOENT;
4391                 goto done;
4392         }
4393
4394         spin_lock_irq(&rbd_dev->lock);
4395         if (rbd_dev->open_count)
4396                 ret = -EBUSY;
4397         else
4398                 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4399         spin_unlock_irq(&rbd_dev->lock);
4400         if (ret < 0)
4401                 goto done;
4402
4403         while (rbd_dev->parent_spec) {
4404                 struct rbd_device *first = rbd_dev;
4405                 struct rbd_device *second = first->parent;
4406                 struct rbd_device *third;
4407
4408                 /*
4409                  * Follow to the parent with no grandparent and
4410                  * remove it.
4411                  */
4412                 while (second && (third = second->parent)) {
4413                         first = second;
4414                         second = third;
4415                 }
4416                 __rbd_remove(second);
4417                 rbd_spec_put(first->parent_spec);
4418                 first->parent_spec = NULL;
4419                 first->parent_overlap = 0;
4420                 first->parent = NULL;
4421         }
4422         __rbd_remove(rbd_dev);
4423
4424 done:
4425         mutex_unlock(&ctl_mutex);
4426
4427         return ret;
4428 }
4429
4430 /*
4431  * create control files in sysfs
4432  * /sys/bus/rbd/...
4433  */
4434 static int rbd_sysfs_init(void)
4435 {
4436         int ret;
4437
4438         ret = device_register(&rbd_root_dev);
4439         if (ret < 0)
4440                 return ret;
4441
4442         ret = bus_register(&rbd_bus_type);
4443         if (ret < 0)
4444                 device_unregister(&rbd_root_dev);
4445
4446         return ret;
4447 }
4448
4449 static void rbd_sysfs_cleanup(void)
4450 {
4451         bus_unregister(&rbd_bus_type);
4452         device_unregister(&rbd_root_dev);
4453 }
4454
4455 static int __init rbd_init(void)
4456 {
4457         int rc;
4458
4459         if (!libceph_compatible(NULL)) {
4460                 rbd_warn(NULL, "libceph incompatibility (quitting)");
4461
4462                 return -EINVAL;
4463         }
4464         rc = rbd_sysfs_init();
4465         if (rc)
4466                 return rc;
4467         pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4468         return 0;
4469 }
4470
4471 static void __exit rbd_exit(void)
4472 {
4473         rbd_sysfs_cleanup();
4474 }
4475
/* Module load and unload entry points */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");