/*
   rbd.c -- Export ceph rados objects as a Linux block device

   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

   For usage instructions, please refer to:

		Documentation/ABI/testing/sysfs-bus-rbd

 */
#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>

#include <linux/blkdev.h>

#include "rbd_types.h"
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have these defined elsewhere */

#define	U8_MAX	((u8)	(~0U))
#define	U16_MAX	((u16)	(~0U))
#define	U32_MAX	((u32)	(~0U))
#define	U64_MAX	((u64)	(~0ULL))
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64
#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)
/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
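/*
 * Illustration (not part of the driver): MAX_INT_FORMAT_WIDTH above is a
 * conservative bound on the characters needed to format an int.  Each
 * byte contributes log10(256) ~= 2.41 decimal digits; (5 * sizeof (int))
 * / 2 rounds that up to 2.5 digits per byte, and the +1 covers a leading
 * '-'.  A standalone userspace sketch checking the bound for 32-bit int:
 */
#if 0
#include <limits.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[32];
	size_t width = (5 * sizeof (int)) / 2 + 1;	/* 11 for 4-byte int */

	/* INT_MIN has the longest rendering: "-2147483648", 11 chars */
	snprintf(buf, sizeof (buf), "%d", INT_MIN);
	printf("longest int is %zu chars, bound is %zu\n",
	       strlen(buf), width);
	return 0;
}
#endif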
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 obj_version;
};
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;

	u64		snap_id;
	char		*snap_name;

	struct kref	kref;
};
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */

	struct rbd_img_request	*img_request;
	struct list_head	links;		/* img_request->obj_requests */
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	s32			result;
	atomic_t		done;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};
struct rbd_img_request {
	struct request		*rq;
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	bool			write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64		snap_id;		/* for reads */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &ireq->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &ireq->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
struct rbd_snap {
	struct device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
	u64			features;
};

struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
/*
 * Initialize an rbd client instance.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);

	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);

	return ERR_PTR(ret);
}
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);
		break;
	}

	return 0;
}
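/*
 * Illustration (not part of the driver): the Opt_last_* sentinels above
 * let a match_token() result be classified by numeric range instead of
 * by enumerating every token.  A standalone userspace sketch of the
 * same idiom (classify() is a hypothetical helper, not driver code):
 */
#if 0
#include <stdio.h>

enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static const char *classify(int token)
{
	if (token < Opt_last_int)
		return "int";
	if (token > Opt_last_int && token < Opt_last_string)
		return "string";
	if (token > Opt_last_string && token < Opt_last_bool)
		return "Boolean";
	return "other";
}

int main(void)
{
	/* prints "Boolean": read_only sits between the two sentinels */
	printf("read_only is a %s token\n", classify(Opt_read_only));
	return 0;
}
#endif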
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}
/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}
static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
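/*
 * Illustration (not part of the driver): the two checks above guard the
 * size_t arithmetic used later when the snapshot context is allocated.
 * A standalone userspace sketch of the same bounds; the numbers and the
 * ctx_size stand-in are made up for the example:
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	size_t ctx_size = 100;	/* stand-in for sizeof (struct ceph_snap_context) */
	size_t size = SIZE_MAX - ctx_size;	/* room left for snapshot ids */
	uint32_t snap_count = 16;
	uint64_t snap_names_len = 256;	/* bytes of NUL-terminated names */

	if (snap_count > size / sizeof (uint64_t)) {
		printf("too many snapshot ids\n");
		return 1;
	}
	size -= snap_count * sizeof (uint64_t);
	if ((uint64_t) size < snap_names_len) {
		printf("snapshot names don't fit in a size_t\n");
		return 1;
	}
	printf("a header with %u snapshots fits\n", snap_count);
	return 0;
}
#endif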
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t len;
	size_t size;
	u32 i;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* Save a copy of the snapshot names */

		if (snap_names_len > (u64) SIZE_MAX)
			return -EIO;
		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;
		/*
		 * Note that rbd_dev_v1_header_read() guarantees
		 * the ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, this memcpy() is safe.
		 */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			snap_names_len);

		/* Record each snapshot's size */

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
		for (i = 0; i < snap_count; i++)
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
	} else {
		WARN_ON(ondisk->snap_names_len);
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->features = 0;	/* No features support in v1 images */
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	/* Allocate and fill in the snapshot context */

	header->image_size = le64_to_cpu(ondisk->image_size);
	size = sizeof (struct ceph_snap_context);
	size += snap_count * sizeof (header->snapc->snaps[0]);
	header->snapc = kzalloc(size, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	for (i = 0; i < snap_count; i++)
		header->snapc->snaps[i] =
			le64_to_cpu(ondisk->snaps[i].id);

	return 0;

out_err:
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}
static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{
	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret = 0;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);

done:
	return ret;
}
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1 << header->obj_order;
}
static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}
/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
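/*
 * Illustration (not part of the driver): the in-out (*bio_src, *offset)
 * protocol above, sketched in userspace with a simplified chain.  The
 * struct fake_bio type is a hypothetical stand-in, not the kernel's
 * struct bio:
 */
#if 0
#include <stdio.h>

struct fake_bio {
	unsigned int size;
	struct fake_bio *next;
};

int main(void)
{
	struct fake_bio c = { 2048, NULL };
	struct fake_bio b = { 4096, &c };
	struct fake_bio a = { 4096, &b };
	struct fake_bio *bi = &a;
	unsigned int off = 1024;	/* start 1024 bytes into "a" */
	unsigned int len = 5120;	/* clone 5 KiB spanning a and b */

	while (len) {
		unsigned int chunk = bi->size - off < len ?
						bi->size - off : len;
		printf("clone %u bytes at offset %u\n", chunk, off);
		off += chunk;
		if (off == bi->size) {	/* consumed this bio entirely */
			bi = bi->next;
			off = 0;
		}
		len -= chunk;
	}
	/* on return: bi points at "b", off == 2048, the first un-cloned byte */
	printf("resume in next bio at offset %u\n", off);
	return 0;
}
#endif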
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
	struct ceph_osd_req_op *op;
	va_list args;
	size_t size;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;
	op->op = opcode;
	va_start(args, opcode);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		/* rbd_osd_req_op_create(READ, offset, length) */
		/* rbd_osd_req_op_create(WRITE, offset, length) */
		op->extent.offset = va_arg(args, u64);
		op->extent.length = va_arg(args, u64);
		if (opcode == CEPH_OSD_OP_WRITE)
			op->payload_len = op->extent.length;
		break;
	case CEPH_OSD_OP_CALL:
		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
		op->cls.class_name = va_arg(args, char *);
		size = strlen(op->cls.class_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.class_len = size;
		op->payload_len = size;

		op->cls.method_name = va_arg(args, char *);
		size = strlen(op->cls.method_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.method_len = size;
		op->payload_len += size;

		op->cls.indata = va_arg(args, void *);
		size = va_arg(args, size_t);
		rbd_assert(size <= (size_t) U32_MAX);
		op->cls.indata_len = (u32) size;
		op->payload_len += size;
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
		op->watch.cookie = va_arg(args, u64);
		op->watch.ver = va_arg(args, u64);
		op->watch.ver = cpu_to_le64(op->watch.ver);
		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
			op->watch.flag = (u8) 1;
		break;
	default:
		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
		kfree(op);
		op = NULL;
		break;
	}
	va_end(args);

	return op;
}

static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}

static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}

/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	return wait_for_completion_interruptible(&obj_request->completion);
}

static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	atomic_set(&obj_request->done, 1);
}

static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	u64 xferred;

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	xferred = le64_to_cpu(op->extent.length);
	rbd_assert(xferred < (u64) UINT_MAX);
	if (obj_request->result == (s32) -ENOENT) {
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
	} else if (xferred < obj_request->length && !obj_request->result) {
		zero_bio_chain(obj_request->bio_list, xferred);
		xferred = obj_request->length;
	}
	obj_request->xferred = xferred;
	atomic_set(&obj_request->done, 1);
}

static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	obj_request->xferred = le64_to_cpu(op->extent.length);
	atomic_set(&obj_request->done, 1);
}
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct ceph_osd_reply_head *reply_head;
	struct ceph_osd_op *op;
	u32 num_ops;
	u16 opcode;

	rbd_assert(osd_req == obj_request->osd_req);
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
	reply_head = msg->front.iov_base;
	obj_request->result = (s32) le32_to_cpu(reply_head->result);
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	num_ops = le32_to_cpu(reply_head->num_ops);
	WARN_ON(num_ops != 1);	/* For now */

	op = &reply_head->ops[0];
	opcode = le16_to_cpu(op->op);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request, op);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (atomic_read(&obj_request->done))
		rbd_obj_request_complete(obj_request);
}
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}

static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
/* object_name is assumed to be a non-null pointer and NUL-terminated */

static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
	if (!obj_request)
		return NULL;

	name = (char *)(obj_request + 1);
	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->which = BAD_WHICH;
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	atomic_set(&obj_request->done, 0);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}
/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->write_request = write_request;
	if (write_request)
		img_request->snapc = snapc;
	else
		img_request->snap_id = rbd_dev->spec->snap_id;
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request->write_request)
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	unsigned int bio_offset;
	u64 image_offset;
	u64 resid;
	u16 opcode;

	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
					      : CEPH_OSD_OP_READ;
	bio_offset = 0;
	image_offset = img_request->offset;
	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;
	while (resid) {
		const char *object_name;
		unsigned int clone_size;
		struct ceph_osd_req_op *op;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, image_offset);
		if (!object_name)
			goto out_unwind;
		offset = rbd_segment_offset(rbd_dev, image_offset);
		length = rbd_segment_length(rbd_dev, image_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length,
						OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		/*
		 * Build up the op to use in building the osd
		 * request.  Note that the contents of the op are
		 * copied by rbd_osd_req_create().
		 */
		op = rbd_osd_req_op_create(opcode, offset, length);
		if (!op)
			goto out_partial;
		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
						img_request->write_request,
						obj_request, op);
		rbd_osd_req_op_destroy(op);
		if (!obj_request->osd_req)
			goto out_partial;
		/* status and version are initially zero-filled */

		rbd_img_obj_request_add(img_request, obj_request);

		image_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!atomic_read(&obj_request->done))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;

	for_each_obj_request(img_request, obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver, u64 notify_id)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	struct ceph_osd_client *osdc;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		return -ENOMEM;

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	obj_request->callback = rbd_obj_request_put;
	ret = rbd_obj_request_submit(osdc, obj_request);
out:
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		rbd_warn(rbd_dev, "got notification but failed to "
			   "update snaps: %d\n", rc);

	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
}
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	int ret;

	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	if (!op)
		goto out_cancel;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
							obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it.  We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
/*
 * Synchronous osd object method call
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *version)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc;
	struct ceph_osd_req_op *op;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations but they
	 * don't involve object data (so no offset or length).
	 * The result should be placed into the inbound buffer
	 * provided.  They also supply outbound data--parameters for
	 * the object method.  Currently if this is present it will
	 * be a snapshot id.
	 */
	page_count = (u32) calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, 0,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	obj_request->pages = pages;
	obj_request->page_count = page_count;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	ret = ceph_copy_from_page_vector(pages, inbound, 0,
					obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
static void rbd_request_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/*
		 * Quit early if the mapped snapshot no longer
		 * exists.  It's still possible the snapshot will
		 * have disappeared by the time our request arrives
		 * at the osd, but there's no sense in sending it if
		 * we already know.
		 */
		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "obj_request %s result %d\n",
				write_request ? "write" : "read", result);
			__blk_end_request_all(rq, result);
		}
	}
}
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
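/*
 * Illustration (not part of the driver): the byte accounting above with
 * concrete made-up numbers.  Assume 4 MiB objects (order 22) and a bio
 * that starts 8 KiB before an object boundary with 4 KiB already queued:
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long sectors_per_obj = 1ULL << (22 - 9);	/* 8192 */
	unsigned long long sector_offset = 8192 - 16;	/* 8 KiB from end */
	unsigned long long obj_sector_offset =
				sector_offset & (sectors_per_obj - 1);
	int bi_size = 4096;	/* bytes already in the bio */
	int ret;

	/* bytes to the object boundary: 16 sectors -> 8192 bytes */
	ret = (int) (sectors_per_obj - obj_sector_offset) << 9;
	if (ret > bi_size)
		ret -= bi_size;	/* 4096 bytes may still be merged */
	else
		ret = 0;
	printf("room left in object: %d bytes\n", ret);
	return 0;
}
#endif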
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
				const char *object_name,
				u64 offset, u64 length,
				char *buf, u64 *version)
{
	struct ceph_osd_req_op *op;
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc;
	struct page **pages = NULL;
	u32 page_count;
	int ret;

	page_count = (u32) calc_pages_for(offset, length);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, offset, length,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	obj_request->pages = pages;
	obj_request->page_count = page_count;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
					0, size,
					(char *) ondisk, version);
		if (ret < 0)
			goto out_err;
		if (WARN_ON((size_t) ret < size)) {
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			ret = -ENXIO;
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out_err;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
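/*
 * Illustration (not part of the driver): the re-read loop above converges
 * because each pass allocates for the snapshot count seen on the previous
 * pass, and only exits when two consecutive reads agree.  A standalone
 * userspace sketch with a snapshot count that changes once (the
 * read_snap_count() helper is hypothetical):
 */
#if 0
#include <stdio.h>

static unsigned int read_snap_count(void)
{
	static int calls;

	return calls++ ? 3 : 2;	/* count changes between the first reads */
}

int main(void)
{
	unsigned int snap_count = 0;
	unsigned int want_count;

	do {
		want_count = snap_count;	/* what we allocated for */
		snap_count = read_snap_count();	/* what the header says */
		printf("allocated for %u, header has %u\n",
		       want_count, snap_count);
	} while (snap_count != want_count);
	return 0;
}
#endif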
/*
 * reload the ondisk header
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	kfree(ondisk);

	return ret;
}
static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		rbd_remove_snap_dev(snap);
}
static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		return;

	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
	dout("setting size to %llu sectors", (unsigned long long) size);
	rbd_dev->mapping.size = (u64) size;
	set_capacity(rbd_dev->disk, size);
}
/*
 * only read the first part of the ondisk header, without the snaps info
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	if (rbd_dev->image_format == 1)
		ret = rbd_dev_v1_refresh(rbd_dev, hver);
	else
		ret = rbd_dev_v2_refresh(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) rbd_dev->mapping.features);
}
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
			(unsigned long long) rbd_dev->spec->pool_id);
}
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}
/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	return (ssize_t) (bufp - buf);
}
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev, NULL);

	return ret < 0 ? ret : size;
}
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
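/*
 * These attributes surface under /sys/bus/rbd/devices/<id>/.  A small
 * user-space sketch of reading one of them (illustrative only; the
 * device index 0 is an assumption, and this is not part of the driver):
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[64];
	FILE *f = fopen("/sys/bus/rbd/devices/0/size", "r");

	if (f && fgets(line, sizeof(line), f))
		printf("mapped size in bytes: %s", line);
	if (f)
		fclose(f);

	return 0;
}
#endif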
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}

static ssize_t rbd_snap_features_show(struct device *dev,
				      struct device_attribute *attr, char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) snap->features);
}
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);	/* forward declaration */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
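/*
 * Sketch of the rbd_spec reference-count lifecycle (not built):
 * rbd_spec_alloc() returns a spec holding one reference; each
 * additional holder takes its own with rbd_spec_get(), and the final
 * rbd_spec_put() drops the count to zero and invokes rbd_spec_free().
 */
#if 0
static void rbd_spec_lifecycle_example(void)
{
	struct rbd_spec *spec = rbd_spec_alloc();	/* refcount == 1 */

	rbd_spec_get(spec);				/* refcount == 2 */
	rbd_spec_put(spec);				/* refcount == 1 */
	rbd_spec_put(spec);				/* freed via rbd_spec_free() */
}
#endif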
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}

static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
static int rbd_register_snap_dev(struct rbd_snap *snap,
				 struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
	dout("%s: registering device for snapshot %s\n", __func__, snap->name);

	ret = device_register(dev);

	return ret;
}
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   const char *snap_name,
					   u64 snap_id, u64 snap_size,
					   u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
				  u64 *snap_size, u64 *snap_features)
{
	char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	*snap_size = rbd_dev->header.snap_sizes[which];
	*snap_features = 0;	/* No features for v1 */

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return snap_name;
}
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_obj_method_sync() can return positive */

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.features);
}
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out_err;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
/*
 * When a parent image gets probed, we only have the pool, image,
 * and snapshot ids but not the names of any of them.  This call
 * is made later to fill in those names.  It has to be done after
 * rbd_dev_snaps_update() has completed because some of the
 * information (in particular, snapshot name) is not available
 * until then.
 */
static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc;
	const char *name;
	void *reply_buf = NULL;
	int ret;

	if (rbd_dev->spec->pool_name)
		return 0;	/* Already have the names */

	/* Look up the pool name */

	osdc = &rbd_dev->rbd_client->client->osdc;
	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
	if (!name) {
		rbd_warn(rbd_dev, "there is no pool with id %llu",
			rbd_dev->spec->pool_id);	/* Really a BUG() */
		return -EIO;
	}

	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
	if (!rbd_dev->spec->pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	name = rbd_dev_image_name(rbd_dev);
	if (name)
		rbd_dev->spec->image_name = (char *) name;
	else
		rbd_warn(rbd_dev, "unable to get image name");

	/* Look up the snapshot name. */

	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
	if (!name) {
		rbd_warn(rbd_dev, "no snapshot with id %llu",
			rbd_dev->spec->snap_id);	/* Really a BUG() */
		ret = -EIO;
		goto out_err;
	}
	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
	if (!rbd_dev->spec->snap_name) {
		ret = -ENOMEM;
		goto out_err;
	}

	return 0;
out_err:
	kfree(reply_buf);
	kfree(rbd_dev->spec->pool_name);
	rbd_dev->spec->pool_name = NULL;

	return ret;
}
static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapcontext",
				NULL, 0,
				reply_buf, size, ver);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				/ sizeof (u64))
		goto out;
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;

	size = sizeof (struct ceph_snap_context) +
				snap_count * sizeof (snapc->snaps[0]);
	snapc = kmalloc(size, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}

	atomic_set(&snapc->nref, 1);
	snapc->seq = seq;
	snapc->num_snaps = snap_count;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	rbd_dev->header.snapc = snapc;
	ret = 0;

	dout("  snap context seq = %llu, snap_count = %u\n",
		(unsigned long long) seq, (unsigned int) snap_count);
out:
	kfree(reply_buf);

	return ret;
}
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	}

	dout("  snap_id 0x%016llx snap_name = %s\n",
		(unsigned long long) le64_to_cpu(snap_id), snap_name);
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
				  u64 *snap_size, u64 *snap_features)
{
	u64 snap_id;
	u8 order;
	int ret;

	snap_id = rbd_dev->header.snapc->snaps[which];
	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
	if (ret)
		return ERR_PTR(ret);
	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
	if (ret)
		return ERR_PTR(ret);

	return rbd_dev_v2_snap_name(rbd_dev, which);
}
static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
			       u64 *snap_size, u64 *snap_features)
{
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_info(rbd_dev, which,
					snap_size, snap_features);
	if (rbd_dev->image_format == 2)
		return rbd_dev_v2_snap_info(rbd_dev, which,
					snap_size, snap_features);
	return ERR_PTR(-EINVAL);
}
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	__u8 obj_order;

	down_write(&rbd_dev->header_rwsem);

	/* Grab old order first, to see if it changes */

	obj_order = rbd_dev->header.obj_order;
	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		goto out;
	if (rbd_dev->header.obj_order != obj_order) {
		ret = -EIO;
		goto out;
	}
	rbd_update_mapping_size(rbd_dev);

	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
	dout("rbd_dev_v2_snap_context returned %d\n", ret);
	if (ret)
		goto out;
	ret = rbd_dev_snaps_update(rbd_dev);
	dout("rbd_dev_snaps_update returned %d\n", ret);
	if (ret)
		goto out;
	ret = rbd_dev_snaps_register(rbd_dev);
	dout("rbd_dev_snaps_register returned %d\n", ret);
out:
	up_write(&rbd_dev->header_rwsem);

	return ret;
}
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/*
			 * A previously-existing snapshot is not in
			 * the new snap context.
			 *
			 * If the now missing snapshot is the one the
			 * image is mapped to, clear its exists flag
			 * so we can avoid sending any more requests
			 * to it.
			 */
			if (rbd_dev->spec->snap_id == snap->id)
				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout("  failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout("  added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout("  already present\n");

			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
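/*
 * Worked example of the merge above (a sketch, not built): suppose the
 * device list holds snapshot ids 12, 7, 3 (highest first) and the new
 * snapshot context reports 12, 9, 3.  The walk verifies 12 in place
 * (present in both), then sees 7 < 9 and inserts a new device for 9
 * ahead of 7, then sees 7 > 3 and removes 7 (absent from the context),
 * and finally verifies 3 in place.  Both cursors advance in step, so
 * the pass is linear in the combined length of the two sequences.
 */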
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}

static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
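/*
 * Example (a sketch): since the counter starts at 0 and the minimum id
 * is 1, the first three mapped images receive ids 1, 2 and 3 from
 * atomic64_inc_return(), which rbd_dev_probe_finish() later turns into
 * the device names rbd1, rbd2 and rbd3.
 */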
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout("  max dev id has been reset\n");
}
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
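/*
 * Example (a sketch, not built): given buf -> "  pool  image", a call
 * to next_token(&buf) leaves buf pointing at "pool  image" and returns
 * 4, the length of "pool".  The token itself is not consumed; callers
 * advance past it, as copy_token() and dup_token() below do.
 */
#if 0
static void next_token_example(void)
{
	const char *buf = "  pool  image";
	size_t len = next_token(&buf);	/* len == 4, buf -> "pool  image" */
}
#endif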
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len;

	len = next_token(buf);
	if (len < token_size) {
		memcpy(token, *buf, len);
		*(token + len) = '\0';
	}
	*buf += len;

	return len;
}
/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}
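/*
 * Example (a sketch, not built): pulling the first two tokens out of
 * an "rbd add" style buffer with dup_token().  The address and option
 * strings are placeholders.
 */
#if 0
static void dup_token_example(void)
{
	const char *buf = "1.2.3.4:6789 name=admin rbd myimage";
	char *mon = dup_token(&buf, NULL);	/* "1.2.3.4:6789" */
	char *opts = dup_token(&buf, NULL);	/* "name=admin" */

	kfree(mon);
	kfree(opts);
}
#endif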
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *	The address of a pointer that will refer to a ceph options
 *	structure.  Caller must release the returned pointer using
 *	ceph_destroy_options() when it is no longer needed.
 *  opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  rbd_spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *	A comma-separated list of one or more monitor addresses.
 *	A monitor address is an ip address, optionally followed
 *	by a port number (separated by a colon).
 *	  I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *	A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *	The name of the rados pool containing the rbd image.
 *  <image_name>
 *	The name of the image in that pool to map.
 *  <snap_id>
 *	An optional snapshot id.  If provided, the mapping will
 *	present data from the image at the time that snapshot was
 *	created.  The image head is used if no snapshot id is
 *	provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
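/*
 * Example "add" request (a sketch; the monitor address, key and names
 * are placeholders): mapping the image "myimage" from pool "rbd" at
 * its head revision is done by writing the option string described
 * above to the bus control file, e.g. from a shell:
 *
 *	echo "1.2.3.4:6789 name=admin,secret=<key> rbd myimage -" \
 *		> /sys/bus/rbd/add
 *
 * The trailing "-" is RBD_SNAP_HEAD_NAME and may be omitted; see
 * Documentation/ABI/testing/sysfs-bus-rbd for the full interface.
 */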
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_obj_method_sync() can return positive */

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
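/*
 * Naming sketch (not built; assumes RBD_ID_PREFIX from rbd_types.h is
 * "rbd_id." and uses a placeholder image name): for a format 2 image
 * named "myimage", the id object queried above is "rbd_id.myimage",
 * and its "get_id" class method returns the persistent image id.
 */
#if 0
static void rbd_id_object_name_example(void)
{
	char object_name[32];

	snprintf(object_name, sizeof (object_name), "%s%s",
		RBD_ID_PREFIX, "myimage");	/* "rbd_id.myimage" */
}
#endif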
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
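/*
 * Header object naming, by example (a sketch; the prefix and suffix
 * values come from rbd_types.h and the names are placeholders): a
 * format 1 image "myimage" keeps its metadata in "myimage.rbd"
 * (RBD_SUFFIX), while a format 2 image with id "101a6b8b4567" keeps
 * it in "rbd_header.101a6b8b4567" (RBD_HEADER_PREFIX).
 */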
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret < 0) {
		dout("probe failed, returning %d\n", ret);

		return ret;
	}

	ret = rbd_dev_probe_finish(rbd_dev);
	if (ret)
		rbd_header_free(&rbd_dev->header);

	return ret;
}
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
{
	struct list_head *tmp;
	struct rbd_device *rbd_dev;

	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			spin_unlock(&rbd_dev_list_lock);
			return rbd_dev;
		}
	}
	spin_unlock(&rbd_dev_list_lock);

	return NULL;
}
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	spin_lock(&rbd_dev->lock);
	if (rbd_dev->open_count)
		ret = -EBUSY;
	else
		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	spin_unlock(&rbd_dev->lock);
	if (ret < 0)
		goto done;

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
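/*
 * Example "remove" request (a sketch; the device id 1 is a
 * placeholder): unmapping a device is done by writing its id to the
 * bus control file, e.g. from a shell:
 *
 *	echo 1 > /sys/bus/rbd/remove
 *
 * The write fails with -EBUSY while the block device is still open.
 */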
/*
 * create control files in sysfs
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");

		return -EINVAL;
	}
	rc = rbd_sysfs_init();
	if (rc)
		return rc;
	pr_info("loaded " RBD_DRV_NAME_LONG "\n");

	return 0;
}

void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");