]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blobdiff - drivers/block/rbd.c
rbd: implement feature checks
[mirror_ubuntu-artful-kernel.git] / drivers / block / rbd.c
index b8956131950ce95c41d31c082665162105a32f4d..0f260a6e97c44addbf8120d131e4afb1fb3cabb9 100644 (file)
 #define RBD_MINORS_PER_MAJOR   256             /* max minors per blkdev */
 
 #define RBD_MAX_SNAP_NAME_LEN  32
+#define RBD_MAX_SNAP_COUNT     510     /* allows max snapc to fit in 4KB */
 #define RBD_MAX_OPT_LEN                1024
 
 #define RBD_SNAP_HEAD_NAME     "-"
 
+#define RBD_IMAGE_ID_LEN_MAX   64
+#define RBD_OBJ_PREFIX_LEN_MAX 64
+
+/* Feature bits */
+
+#define RBD_FEATURE_LAYERING      1
+
+/*
+ * Features supported by this (client software) implementation.
+ *
+ * The mask is currently empty, so _rbd_dev_v2_snap_features() fails
+ * with -ENOTSUPP for any image or snapshot that reports an
+ * incompatible feature bit, RBD_FEATURE_LAYERING included.
+ */
+
+#define RBD_FEATURES_ALL          (0)
+
 /*
  * An RBD device name will be "rbd#", where the "rbd" comes from
  * RBD_DRV_NAME above, and # is a unique integer identifier.
@@ -83,6 +95,7 @@
 struct rbd_image_header {
        /* These four fields never change for a given rbd image */
        char *object_prefix;
+       u64 features;
        __u8 obj_order;
        __u8 crypt_type;
        __u8 comp_type;
@@ -146,12 +159,14 @@ struct rbd_snap {
        u64                     size;
        struct list_head        node;
        u64                     id;
+       u64                     features;
 };
 
 struct rbd_mapping {
        char                    *snap_name;
        u64                     snap_id;
        u64                     size;
+       u64                     features;
        bool                    snap_exists;
        bool                    read_only;
 };
@@ -165,6 +180,7 @@ struct rbd_device {
        int                     major;          /* blkdev assigned major */
        struct gendisk          *disk;          /* blkdev's gendisk and rq */
 
+       u32                     image_format;   /* Either 1 or 2 */
        struct rbd_options      rbd_opts;
        struct rbd_client       *rbd_client;
 
@@ -173,6 +189,8 @@ struct rbd_device {
        spinlock_t              lock;           /* queue lock */
 
        struct rbd_image_header header;
+       char                    *image_id;
+       size_t                  image_id_len;
        char                    *image_name;
        size_t                  image_name_len;
        char                    *header_name;
@@ -208,10 +226,6 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
 
 static void rbd_dev_release(struct device *dev);
-static ssize_t rbd_snap_add(struct device *dev,
-                           struct device_attribute *attr,
-                           const char *buf,
-                           size_t count);
 static void __rbd_remove_snap_dev(struct rbd_snap *snap);
 
 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
@@ -262,7 +276,8 @@ static void rbd_put_dev(struct rbd_device *rbd_dev)
        put_device(&rbd_dev->dev);
 }
 
-static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
+static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
+static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
@@ -504,6 +519,11 @@ static void rbd_coll_release(struct kref *kref)
        kfree(coll);
 }
 
+/* Report whether image_format is one of the two formats handled here (1 or 2). */
+static bool rbd_image_format_valid(u32 image_format)
+{
+       switch (image_format) {
+       case 1:
+       case 2:
+               return true;
+       default:
+               return false;
+       }
+}
+
 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 {
        size_t size;
@@ -590,6 +610,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
                header->snap_sizes = NULL;
        }
 
+       header->features = 0;   /* No features support in v1 images */
        header->obj_order = ondisk->options.order;
        header->crypt_type = ondisk->options.crypt_type;
        header->comp_type = ondisk->options.comp_type;
@@ -632,6 +653,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
                if (!strcmp(snap_name, snap->name)) {
                        rbd_dev->mapping.snap_id = snap->id;
                        rbd_dev->mapping.size = snap->size;
+                       rbd_dev->mapping.features = snap->features;
 
                        return 0;
                }
@@ -648,6 +670,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
                    sizeof (RBD_SNAP_HEAD_NAME))) {
                rbd_dev->mapping.snap_id = CEPH_NOSNAP;
                rbd_dev->mapping.size = rbd_dev->header.image_size;
+               rbd_dev->mapping.features = rbd_dev->header.features;
                rbd_dev->mapping.snap_exists = false;
                rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
                ret = 0;
@@ -1006,8 +1029,9 @@ static int rbd_do_request(struct request *rq,
        layout->fl_stripe_count = cpu_to_le32(1);
        layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
        layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
-       ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
-                               req, ops);
+       ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
+                                  req, ops);
+       rbd_assert(ret == 0);
 
        ceph_osdc_build_request(req, ofs, &len,
                                ops,
@@ -1289,7 +1313,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
        dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
                rbd_dev->header_name, (unsigned long long) notify_id,
                (unsigned int) opcode);
-       rc = rbd_refresh_header(rbd_dev, &hver);
+       rc = rbd_dev_refresh(rbd_dev, &hver);
        if (rc)
                pr_warning(RBD_DRV_NAME "%d got notification but failed to "
                           " update snaps: %d\n", rbd_dev->major, rc);
@@ -1371,71 +1395,6 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
        return ret;
 }
 
-struct rbd_notify_info {
-       struct rbd_device *rbd_dev;
-};
-
-static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
-{
-       struct rbd_device *rbd_dev = (struct rbd_device *)data;
-       if (!rbd_dev)
-               return;
-
-       dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
-                       rbd_dev->header_name, (unsigned long long) notify_id,
-                       (unsigned int) opcode);
-}
-
-/*
- * Request sync osd notify
- */
-static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
-{
-       struct ceph_osd_req_op *ops;
-       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-       struct ceph_osd_event *event;
-       struct rbd_notify_info info;
-       int payload_len = sizeof(u32) + sizeof(u32);
-       int ret;
-
-       ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
-       if (!ops)
-               return -ENOMEM;
-
-       info.rbd_dev = rbd_dev;
-
-       ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
-                                    (void *)&info, &event);
-       if (ret < 0)
-               goto fail;
-
-       ops[0].watch.ver = 1;
-       ops[0].watch.flag = 1;
-       ops[0].watch.cookie = event->cookie;
-       ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
-       ops[0].watch.timeout = 12;
-
-       ret = rbd_req_sync_op(rbd_dev, NULL,
-                              CEPH_NOSNAP,
-                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                              ops,
-                              rbd_dev->header_name,
-                              0, 0, NULL, NULL, NULL);
-       if (ret < 0)
-               goto fail_event;
-
-       ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
-       dout("ceph_osdc_wait_event returned %d\n", ret);
-       rbd_destroy_ops(ops);
-       return 0;
-
-fail_event:
-       ceph_osdc_cancel_event(event);
-fail:
-       rbd_destroy_ops(ops);
-       return ret;
-}
-
 /*
  * Synchronous osd object method call
  */
@@ -1757,52 +1716,6 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
        return ret;
 }
 
-/*
- * create a snapshot
- */
-static int rbd_header_add_snap(struct rbd_device *rbd_dev,
-                              const char *snap_name,
-                              gfp_t gfp_flags)
-{
-       int name_len = strlen(snap_name);
-       u64 new_snapid;
-       int ret;
-       void *data, *p, *e;
-       struct ceph_mon_client *monc;
-
-       /* we should create a snapshot only if we're pointing at the head */
-       if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
-               return -EINVAL;
-
-       monc = &rbd_dev->rbd_client->client->monc;
-       ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
-       dout("created snapid=%llu\n", (unsigned long long) new_snapid);
-       if (ret < 0)
-               return ret;
-
-       data = kmalloc(name_len + 16, gfp_flags);
-       if (!data)
-               return -ENOMEM;
-
-       p = data;
-       e = data + name_len + 16;
-
-       ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
-       ceph_encode_64_safe(&p, e, new_snapid, bad);
-
-       ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
-                               "rbd", "snap_add",
-                               data, (size_t) (p - data), NULL, 0,
-                               CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                               NULL);
-
-       kfree(data);
-
-       return ret < 0 ? ret : 0;
-bad:
-       return -ERANGE;
-}
-
 static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 {
        struct rbd_snap *snap;
@@ -1812,10 +1725,23 @@ static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
                __rbd_remove_snap_dev(snap);
 }
 
+/*
+ * Propagate the base image size recorded in rbd_dev->header to the
+ * mapping and to the block device capacity.
+ *
+ * Nothing is done when a snapshot is mapped (mapping.snap_id is not
+ * CEPH_NOSNAP).
+ *
+ * mapping.size is kept in bytes, matching what rbd_dev_set_mapping()
+ * and snap_by_name() store there; only the value handed to
+ * set_capacity() is converted to sectors.
+ */
+static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
+{
+       sector_t size;
+
+       if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
+               return;
+
+       rbd_dev->mapping.size = rbd_dev->header.image_size;
+       size = (sector_t) rbd_dev->mapping.size / SECTOR_SIZE;
+       dout("setting size to %llu sectors", (unsigned long long) size);
+       set_capacity(rbd_dev->disk, size);
+}
+
 /*
  * only read the first part of the ondisk header, without the snaps info
  */
-static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
 {
        int ret;
        struct rbd_image_header h;
@@ -1826,17 +1752,9 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
 
        down_write(&rbd_dev->header_rwsem);
 
-       /* resized? */
-       if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
-               sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
-
-               if (size != (sector_t) rbd_dev->mapping.size) {
-                       dout("setting size to %llu sectors",
-                               (unsigned long long) size);
-                       rbd_dev->mapping.size = (u64) size;
-                       set_capacity(rbd_dev->disk, size);
-               }
-       }
+       /* Update image size, and check for resize of mapped image */
+       rbd_dev->header.image_size = h.image_size;
+       rbd_update_mapping_size(rbd_dev);
 
        /* rbd_dev->header.object_prefix shouldn't change */
        kfree(rbd_dev->header.snap_sizes);
@@ -1864,12 +1782,16 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
        return ret;
 }
 
-static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
 {
        int ret;
 
+       rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-       ret = __rbd_refresh_header(rbd_dev, hver);
+       if (rbd_dev->image_format == 1)
+               ret = rbd_dev_v1_refresh(rbd_dev, hver);
+       else
+               ret = rbd_dev_v2_refresh(rbd_dev, hver);
        mutex_unlock(&ctl_mutex);
 
        return ret;
@@ -1946,6 +1868,19 @@ static ssize_t rbd_size_show(struct device *dev,
        return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
 }
 
+/*
+ * sysfs show routine for the "features" attribute.  The value
+ * reported is the feature mask of the current mapping, which may be
+ * a snapshot rather than the base image.
+ */
+static ssize_t rbd_features_show(struct device *dev,
+                            struct device_attribute *attr, char *buf)
+{
+       struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+       unsigned long long features = rbd_dev->mapping.features;
+
+       return sprintf(buf, "0x%016llx\n", features);
+}
+
 static ssize_t rbd_major_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
 {
@@ -1987,6 +1922,18 @@ static ssize_t rbd_name_show(struct device *dev,
        return sprintf(buf, "%s\n", rbd_dev->image_name);
 }
 
+/* sysfs show routine for the "image_id" attribute: prints rbd_dev->image_id. */
+static ssize_t rbd_image_id_show(struct device *dev,
+                            struct device_attribute *attr, char *buf)
+{
+       struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+       const char *image_id = rbd_dev->image_id;
+
+       return sprintf(buf, "%s\n", image_id);
+}
+
+/*
+ * Shows the name of the currently-mapped snapshot (or
+ * RBD_SNAP_HEAD_NAME for the base image).
+ */
 static ssize_t rbd_snap_show(struct device *dev,
                             struct device_attribute *attr,
                             char *buf)
@@ -2004,31 +1951,33 @@ static ssize_t rbd_image_refresh(struct device *dev,
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        int ret;
 
-       ret = rbd_refresh_header(rbd_dev, NULL);
+       ret = rbd_dev_refresh(rbd_dev, NULL);
 
        return ret < 0 ? ret : size;
 }
 
 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
+static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
+static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
-static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
 
 static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
+       &dev_attr_features.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
        &dev_attr_name.attr,
+       &dev_attr_image_id.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_refresh.attr,
-       &dev_attr_create_snap.attr,
        NULL
 };
 
@@ -2074,12 +2023,24 @@ static ssize_t rbd_snap_id_show(struct device *dev,
        return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
 }
 
+/* sysfs show routine for a snapshot's "snap_features" attribute. */
+static ssize_t rbd_snap_features_show(struct device *dev,
+                               struct device_attribute *attr,
+                               char *buf)
+{
+       struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
+       unsigned long long features = snap->features;
+
+       return sprintf(buf, "0x%016llx\n", features);
+}
+
 static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
 static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
+static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
 
 static struct attribute *rbd_snap_attrs[] = {
        &dev_attr_snap_size.attr,
        &dev_attr_snap_id.attr,
+       &dev_attr_snap_features.attr,
        NULL,
 };
 
@@ -2139,7 +2100,9 @@ static int rbd_register_snap_dev(struct rbd_snap *snap,
 }
 
 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
-                                             int i, const char *name)
+                                               const char *snap_name,
+                                               u64 snap_id, u64 snap_size,
+                                               u64 snap_features)
 {
        struct rbd_snap *snap;
        int ret;
@@ -2149,12 +2112,13 @@ static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
                return ERR_PTR(-ENOMEM);
 
        ret = -ENOMEM;
-       snap->name = kstrdup(name, GFP_KERNEL);
+       snap->name = kstrdup(snap_name, GFP_KERNEL);
        if (!snap->name)
                goto err;
 
-       snap->size = rbd_dev->header.snap_sizes[i];
-       snap->id = rbd_dev->header.snapc->snaps[i];
+       snap->id = snap_id;
+       snap->size = snap_size;
+       snap->features = snap_features;
 
        return snap;
 
@@ -2165,6 +2129,333 @@ err:
        return ERR_PTR(ret);
 }
 
+/*
+ * Look up snapshot number "which" in a format 1 header.  Stores the
+ * snapshot's size in *snap_size, zero in *snap_features, and returns
+ * a pointer to its name inside rbd_dev->header.snap_names.
+ */
+static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
+               u64 *snap_size, u64 *snap_features)
+{
+       char *name;
+       u32 skipped;
+
+       rbd_assert(which < rbd_dev->header.snapc->num_snaps);
+
+       /* Format 1 images carry no feature bits */
+       *snap_features = 0;
+       *snap_size = rbd_dev->header.snap_sizes[which];
+
+       /* Names are stored back to back, each NUL-terminated; step over "which" of them */
+       name = rbd_dev->header.snap_names;
+       for (skipped = 0; skipped < which; skipped++)
+               name += strlen(name) + 1;
+
+       return name;
+}
+
+/*
+ * Call the "rbd" class "get_size" method on the header object for
+ * the given snapshot id (CEPH_NOSNAP selects the base image) and
+ * hand back the object order and size found in the reply.
+ *
+ * Returns 0 on success, or the negative value returned by
+ * rbd_req_sync_exec().
+ */
+static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+                               u8 *order, u64 *snap_size)
+{
+       /* Reply layout: one byte of order followed by a little-endian size */
+       struct {
+               u8 order;
+               __le64 size;
+       } __attribute__ ((packed)) reply = { 0 };
+       __le64 snapid_le = cpu_to_le64(snap_id);
+       int rc;
+
+       rc = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+                               "rbd", "get_size",
+                               (char *) &snapid_le, sizeof (snapid_le),
+                               (char *) &reply, sizeof (reply),
+                               CEPH_OSD_FLAG_READ, NULL);
+       dout("%s: rbd_req_sync_exec returned %d\n", __func__, rc);
+       if (rc < 0)
+               return rc;
+
+       *snap_size = le64_to_cpu(reply.size);
+       *order = reply.order;
+
+       dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
+               (unsigned long long) snap_id, (unsigned int) *order,
+               (unsigned long long) *snap_size);
+
+       return 0;
+}
+
+/* Fetch the base image's object order and size straight into rbd_dev->header. */
+static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
+{
+       struct rbd_image_header *header = &rbd_dev->header;
+
+       return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
+                                       &header->obj_order,
+                                       &header->image_size);
+}
+
+/*
+ * Call the "rbd" class "get_object_prefix" method on the header
+ * object and record the decoded string in
+ * rbd_dev->header.object_prefix.
+ *
+ * Returns 0 on success.  On failure a negative errno is returned
+ * and header.object_prefix is left NULL if string extraction failed.
+ */
+static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
+{
+       void *reply_buf;
+       char *prefix;
+       int ret;
+       void *p;
+       void *end;
+
+       reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
+       if (!reply_buf)
+               return -ENOMEM;
+
+       ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+                               "rbd", "get_object_prefix",
+                               NULL, 0,
+                               reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
+                               CEPH_OSD_FLAG_READ, NULL);
+       dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+       if (ret < 0)
+               goto out;
+
+       /*
+        * NOTE(review): rbd_req_sync_exec() is assumed to be able to
+        * return a positive value on success; normalize it so callers
+        * that test "if (ret)" do not see success as failure.
+        */
+       ret = 0;
+
+       p = reply_buf;
+       end = (char *) reply_buf + RBD_OBJ_PREFIX_LEN_MAX;
+       prefix = ceph_extract_encoded_string(&p, end, NULL, GFP_NOIO);
+       if (IS_ERR(prefix)) {
+               ret = PTR_ERR(prefix);
+               rbd_dev->header.object_prefix = NULL;
+       } else {
+               rbd_dev->header.object_prefix = prefix;
+               dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
+       }
+
+out:
+       kfree(reply_buf);
+
+       return ret;
+}
+
+/*
+ * Call the "rbd" class "get_features" method on the header object
+ * for the given snapshot id (CEPH_NOSNAP selects the base image).
+ *
+ * Returns -ENOTSUPP, leaving *snap_features untouched, when the
+ * reply's incompat mask has a bit outside RBD_FEATURES_ALL.
+ * Otherwise stores the feature mask in *snap_features and returns 0.
+ * A negative value from rbd_req_sync_exec() is passed through.
+ */
+static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
+               u64 *snap_features)
+{
+       struct {
+               __le64 features;
+               __le64 incompat;
+       } reply = { 0 };
+       __le64 snapid_le = cpu_to_le64(snap_id);
+       u64 incompat;
+       u64 unsupported;
+       int rc;
+
+       rc = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+                               "rbd", "get_features",
+                               (char *) &snapid_le, sizeof (snapid_le),
+                               (char *) &reply, sizeof (reply),
+                               CEPH_OSD_FLAG_READ, NULL);
+       dout("%s: rbd_req_sync_exec returned %d\n", __func__, rc);
+       if (rc < 0)
+               return rc;
+
+       /* Refuse anything that needs a feature this client lacks */
+       incompat = le64_to_cpu(reply.incompat);
+       unsupported = incompat & ~RBD_FEATURES_ALL;
+       if (unsupported)
+               return -ENOTSUPP;
+
+       *snap_features = le64_to_cpu(reply.features);
+
+       dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
+               (unsigned long long) snap_id,
+               (unsigned long long) *snap_features,
+               (unsigned long long) incompat);
+
+       return 0;
+}
+
+/* Fetch the base image's feature mask into rbd_dev->header.features. */
+static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
+{
+       u64 *features = &rbd_dev->header.features;
+
+       return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, features);
+}
+
+/*
+ * Call the "rbd" class "get_snapcontext" method on the header
+ * object, build a ceph_snap_context from the reply and store it in
+ * rbd_dev->header.snapc.  "ver" is passed through to
+ * rbd_req_sync_exec().
+ *
+ * Returns 0 on success.  On failure header.snapc is left untouched
+ * and a negative errno is returned: the rbd_req_sync_exec() error,
+ * -ERANGE if the reply is too short for what it claims to hold,
+ * -EINVAL if the snapshot count cannot be represented, or -ENOMEM.
+ *
+ * NOTE(review): any snapshot context previously held in
+ * header.snapc is overwritten here without being released; confirm
+ * whether the caller is expected to drop that reference.
+ */
+static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
+{
+       size_t size;
+       int ret;
+       void *reply_buf;
+       void *p;
+       void *end;
+       u64 seq;
+       u32 snap_count;
+       struct ceph_snap_context *snapc;
+       u32 i;
+
+       /*
+        * We'll need room for the seq value (maximum snapshot id),
+        * snapshot count, and array of that many snapshot ids.
+        * For now we have a fixed upper limit on the number we're
+        * prepared to receive.
+        */
+       size = sizeof (__le64) + sizeof (__le32) +
+                       RBD_MAX_SNAP_COUNT * sizeof (__le64);
+       reply_buf = kzalloc(size, GFP_KERNEL);
+       if (!reply_buf)
+               return -ENOMEM;
+
+       ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+                               "rbd", "get_snapcontext",
+                               NULL, 0,
+                               reply_buf, size,
+                               CEPH_OSD_FLAG_READ, ver);
+       dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+       if (ret < 0)
+               goto out;
+
+       /* The decode macros jump to "out" on a short buffer */
+       ret = -ERANGE;
+       p = reply_buf;
+       end = (char *) reply_buf + size;
+       ceph_decode_64_safe(&p, end, seq, out);
+       ceph_decode_32_safe(&p, end, snap_count, out);
+
+       /*
+        * Make sure the reported number of snapshot ids wouldn't go
+        * beyond the end of our buffer.  But before checking that,
+        * make sure the computed size of the snapshot context we
+        * allocate is representable in a size_t.
+        */
+       if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
+                                / sizeof (u64)) {
+               ret = -EINVAL;
+               goto out;
+       }
+       if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
+               goto out;
+
+       size = sizeof (struct ceph_snap_context) +
+                               snap_count * sizeof (snapc->snaps[0]);
+       snapc = kmalloc(size, GFP_KERNEL);
+       if (!snapc) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       atomic_set(&snapc->nref, 1);
+       snapc->seq = seq;
+       snapc->num_snaps = snap_count;
+       for (i = 0; i < snap_count; i++)
+               snapc->snaps[i] = ceph_decode_64(&p);
+
+       rbd_dev->header.snapc = snapc;
+       ret = 0;
+
+       dout("  snap context seq = %llu, snap_count = %u\n",
+               (unsigned long long) seq, (unsigned int) snap_count);
+
+out:
+       kfree(reply_buf);
+
+       /* Report the real outcome; errors must not be masked as success */
+       return ret;
+}
+
+/*
+ * Call the "rbd" class "get_snapshot_name" method on the header
+ * object for entry "which" of the current snapshot context.
+ *
+ * Returns the string produced by ceph_extract_encoded_string() on
+ * success, or an ERR_PTR()-encoded negative errno on failure.
+ */
+static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
+{
+       const size_t reply_size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
+       size_t name_len = 0;
+       __le64 snapid_le;
+       void *reply_buf;
+       void *cursor;
+       char *name;
+       int rc;
+
+       /* The reply is an encoded string: a length followed by the bytes */
+       reply_buf = kmalloc(reply_size, GFP_KERNEL);
+       if (!reply_buf)
+               return ERR_PTR(-ENOMEM);
+
+       snapid_le = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
+       rc = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+                               "rbd", "get_snapshot_name",
+                               (char *) &snapid_le, sizeof (snapid_le),
+                               reply_buf, reply_size,
+                               CEPH_OSD_FLAG_READ, NULL);
+       dout("%s: rbd_req_sync_exec returned %d\n", __func__, rc);
+       if (rc < 0) {
+               name = ERR_PTR(rc);
+               goto done;
+       }
+
+       cursor = reply_buf;
+       name = ceph_extract_encoded_string(&cursor,
+                               (char *) reply_buf + reply_size,
+                               &name_len, GFP_KERNEL);
+       if (!IS_ERR(name))
+               dout("  snap_id 0x%016llx snap_name = %s\n",
+                       (unsigned long long) le64_to_cpu(snapid_le), name);
+done:
+       /* Success and failure share one exit; name already holds the result */
+       kfree(reply_buf);
+
+       return name;
+}
+
+/*
+ * Gather size, features and name for entry "which" of the current
+ * snapshot context of a format 2 image.
+ *
+ * Returns the name from rbd_dev_v2_snap_name() on success, with
+ * *snap_size and *snap_features filled in, or an ERR_PTR()-encoded
+ * error from whichever query failed first.
+ */
+static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
+               u64 *snap_size, u64 *snap_features)
+{
+       u64 snap_id;    /* CPU byte order; the helpers convert to __le64 */
+       u8 order;
+       int ret;
+
+       snap_id = rbd_dev->header.snapc->snaps[which];
+       ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
+       if (ret)
+               return ERR_PTR(ret);
+       ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
+       if (ret)
+               return ERR_PTR(ret);
+
+       return rbd_dev_v2_snap_name(rbd_dev, which);
+}
+
+/*
+ * Dispatch to the format-specific snapshot lookup according to
+ * rbd_dev->image_format.  Any format other than 1 or 2 yields
+ * ERR_PTR(-EINVAL).
+ */
+static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
+               u64 *snap_size, u64 *snap_features)
+{
+       switch (rbd_dev->image_format) {
+       case 1:
+               return rbd_dev_v1_snap_info(rbd_dev, which,
+                                       snap_size, snap_features);
+       case 2:
+               return rbd_dev_v2_snap_info(rbd_dev, which,
+                                       snap_size, snap_features);
+       default:
+               return ERR_PTR(-EINVAL);
+       }
+}
+
+/*
+ * Refresh the in-memory state of a format 2 image while holding
+ * header_rwsem for writing: re-read the image size, update the
+ * mapping size, fetch a new snapshot context, then update and
+ * register the snapshot list.  "hver" is passed through to
+ * rbd_dev_v2_snap_context().
+ *
+ * Returns 0 on success, -EIO if the object order reported by the
+ * server differs from the one previously recorded, or the error
+ * from the first step that failed.
+ */
+static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
+{
+       int ret;
+       __u8 obj_order;
+
+       down_write(&rbd_dev->header_rwsem);
+
+       /* Grab old order first, to see if it changes */
+
+       obj_order = rbd_dev->header.obj_order;
+       ret = rbd_dev_v2_image_size(rbd_dev);
+       if (ret)
+               goto out;
+       if (rbd_dev->header.obj_order != obj_order) {
+               ret = -EIO;
+               goto out;
+       }
+       rbd_update_mapping_size(rbd_dev);
+
+       ret = rbd_dev_v2_snap_context(rbd_dev, hver);
+       dout("rbd_dev_v2_snap_context returned %d\n", ret);
+       if (ret)
+               goto out;
+       ret = rbd_dev_snaps_update(rbd_dev);
+       dout("rbd_dev_snaps_update returned %d\n", ret);
+       if (ret)
+               goto out;
+       ret = rbd_dev_snaps_register(rbd_dev);
+       dout("rbd_dev_snaps_register returned %d\n", ret);
+out:
+       up_write(&rbd_dev->header_rwsem);
+
+       return ret;
+}
+
 /*
  * Scan the rbd device's current snapshot list and compare it to the
  * newly-received snapshot context.  Remove any existing snapshots
@@ -2181,7 +2472,6 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 {
        struct ceph_snap_context *snapc = rbd_dev->header.snapc;
        const u32 snap_count = snapc->num_snaps;
-       char *snap_name = rbd_dev->header.snap_names;
        struct list_head *head = &rbd_dev->snaps;
        struct list_head *links = head->next;
        u32 index = 0;
@@ -2190,6 +2480,9 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
        while (index < snap_count || links != head) {
                u64 snap_id;
                struct rbd_snap *snap;
+               char *snap_name;
+               u64 snap_size = 0;
+               u64 snap_features = 0;
 
                snap_id = index < snap_count ? snapc->snaps[index]
                                             : CEPH_NOSNAP;
@@ -2216,6 +2509,11 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
                        continue;
                }
 
+               snap_name = rbd_dev_snap_info(rbd_dev, index,
+                                       &snap_size, &snap_features);
+               if (IS_ERR(snap_name))
+                       return PTR_ERR(snap_name);
+
                dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
                        (unsigned long long) snap_id);
                if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
@@ -2223,8 +2521,8 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 
                        /* We haven't seen this snapshot before */
 
-                       new_snap = __rbd_add_snap_dev(rbd_dev, index,
-                                                       snap_name);
+                       new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
+                                       snap_id, snap_size, snap_features);
                        if (IS_ERR(new_snap)) {
                                int err = PTR_ERR(new_snap);
 
@@ -2245,9 +2543,9 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 
                        dout("  already present\n");
 
-                       rbd_assert(snap->size ==
-                                       rbd_dev->header.snap_sizes[index]);
+                       rbd_assert(snap->size == snap_size);
                        rbd_assert(!strcmp(snap->name, snap_name));
+                       rbd_assert(snap->features == snap_features);
 
                        /* Done with this list entry; advance */
 
@@ -2257,7 +2555,6 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
                /* Advance to the next entry in the snapshot context */
 
                index++;
-               snap_name += strlen(snap_name) + 1;
        }
        dout("%s: done\n", __func__);
 
@@ -2321,7 +2618,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
        do {
                ret = rbd_req_sync_watch(rbd_dev);
                if (ret == -ERANGE) {
-                       rc = rbd_refresh_header(rbd_dev, NULL);
+                       rc = rbd_dev_refresh(rbd_dev, NULL);
                        if (rc < 0)
                                return rc;
                }
@@ -2553,6 +2850,205 @@ out_err:
        return err_ptr;
 }
 
+/*
+ * An rbd format 2 image has a unique identifier, distinct from the
+ * name given to it by the user.  Internally, that identifier is
+ * what's used to specify the names of objects related to the image.
+ *
+ * A special "rbd id" object is used to map an rbd image name to its
+ * id.  If that object doesn't exist, then there is no v2 rbd image
+ * with the supplied name.
+ *
+ * This function will record the given rbd_dev's image_id field if
+ * it can be determined, and in that case will return 0.  If any
+ * errors occur a negative errno will be returned and the rbd_dev's
+ * image_id field will be unchanged (and should be NULL).
+ */
+static int rbd_dev_image_id(struct rbd_device *rbd_dev)
+{
+       int ret;
+       size_t size;
+       char *object_name;
+       void *response;
+       void *p;
+
+       /*
+        * First, see if the format 2 image id file exists, and if
+        * so, get the image's persistent id from it.
+        */
+       size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
+       object_name = kmalloc(size, GFP_NOIO);
+       if (!object_name)
+               return -ENOMEM;
+       sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
+       dout("rbd id object name is %s\n", object_name);
+
+       /* Response is an encoded string: a __le32 length, then the id */
+
+       size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
+       response = kzalloc(size, GFP_NOIO);
+       if (!response) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = rbd_req_sync_exec(rbd_dev, object_name,
+                               "rbd", "get_id",
+                               NULL, 0,
+                               response, size,
+                               CEPH_OSD_FLAG_READ, NULL);
+       dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+       if (ret < 0)
+               goto out;
+       ret = 0;        /* callers test for nonzero; hide any positive result */
+       p = response;
+       rbd_dev->image_id = ceph_extract_encoded_string(&p,
+                                               p + size,
+                                               &rbd_dev->image_id_len,
+                                               GFP_NOIO);
+       if (IS_ERR(rbd_dev->image_id)) {
+               ret = PTR_ERR(rbd_dev->image_id);
+               rbd_dev->image_id = NULL;
+       } else {
+               dout("image_id is %s\n", rbd_dev->image_id);
+       }
+out:
+       kfree(response);
+       kfree(object_name);
+
+       return ret;
+}
+
+/* Probe rbd_dev as a format 1 image; returns 0, or negative on failure. */
+static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
+{
+       size_t name_size;
+       int ret = -ENOMEM;      /* the two allocations below fail this way */
+
+       /* There is no id for a format 1 image; record an empty string */
+
+       rbd_dev->image_id = kstrdup("", GFP_KERNEL);
+       if (!rbd_dev->image_id)
+               return ret;
+       rbd_dev->image_id_len = 0;
+
+       /* The header object is named by the image name plus RBD_SUFFIX */
+
+       name_size = sizeof (RBD_SUFFIX) + rbd_dev->image_name_len;
+       rbd_dev->header_name = kmalloc(name_size, GFP_KERNEL);
+       if (!rbd_dev->header_name)
+               goto out_err;
+       sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
+
+       /* Populate the in-core image header via rbd_read_header() */
+
+       ret = rbd_read_header(rbd_dev, &rbd_dev->header);
+       if (ret < 0)
+               goto out_err;
+
+       rbd_dev->image_format = 1;
+       dout("discovered version 1 image, header name is %s\n",
+               rbd_dev->header_name);
+
+       return 0;
+
+out_err:
+       /* Undo both allocations; kfree() accepts the still-NULL case */
+       kfree(rbd_dev->header_name);
+       rbd_dev->header_name = NULL;
+       kfree(rbd_dev->image_id);
+       rbd_dev->image_id = NULL;
+
+       return ret;
+}
+
+/* Probe rbd_dev as a format 2 image whose image_id is already recorded. */
+static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
+{
+       u64 version = 0;
+       size_t name_size;
+       int ret;
+
+       /* The header object is named RBD_HEADER_PREFIX plus the image id */
+
+       name_size = rbd_dev->image_id_len + sizeof (RBD_HEADER_PREFIX);
+       rbd_dev->header_name = kmalloc(name_size, GFP_KERNEL);
+       if (!rbd_dev->header_name)
+               return -ENOMEM;
+       sprintf(rbd_dev->header_name, "%s%s",
+                       RBD_HEADER_PREFIX, rbd_dev->image_id);
+
+       /* Image size and object order */
+       ret = rbd_dev_v2_image_size(rbd_dev);
+       if (ret < 0)
+               goto out_err;
+
+       /* Object prefix (a.k.a. block_name) */
+       ret = rbd_dev_v2_object_prefix(rbd_dev);
+       if (ret < 0)
+               goto out_err;
+
+       /* Get and check the features for the image */
+
+       ret = rbd_dev_v2_features(rbd_dev);
+       if (ret < 0)
+               goto out_err;
+
+       /* crypto and compression type aren't (yet) supported for v2 images */
+
+       rbd_dev->header.comp_type = 0;
+       rbd_dev->header.crypt_type = 0;
+
+       /* Snapshot context, plus the header version; any nonzero fails */
+
+       ret = rbd_dev_v2_snap_context(rbd_dev, &version);
+       if (ret != 0)
+               goto out_err;
+       rbd_dev->header.obj_version = version;
+       rbd_dev->image_format = 2;
+
+       dout("discovered version 2 image, header name is %s\n",
+               rbd_dev->header_name);
+
+       /*
+        * NOTE(review): failure is reported even though every step
+        * succeeded, and nothing recorded above is released here.
+        */
+       return -ENOTSUPP;
+out_err:
+       kfree(rbd_dev->header_name);
+       rbd_dev->header_name = NULL;
+       kfree(rbd_dev->header.object_prefix);
+       rbd_dev->header.object_prefix = NULL;
+
+       return ret;
+}
+
+/*
+ * Work out which image format rbd_dev refers to and run the
+ * matching probe routine.  The format 2 image id lookup is
+ * attempted first.  The value returned is whatever the chosen
+ * format-specific probe returned.
+ */
+static int rbd_dev_probe(struct rbd_device *rbd_dev)
+{
+       int ret;
+
+       /*
+        * A format 2 image yields 0 from the id lookup.  Anything
+        * else (ENOENT is the expected case) is taken to mean this
+        * is a format 1 image.
+        */
+       if (rbd_dev_image_id(rbd_dev) == 0)
+               ret = rbd_dev_v2_probe(rbd_dev);
+       else
+               ret = rbd_dev_v1_probe(rbd_dev);
+       if (ret != 0)
+               dout("probe failed, returning %d\n", ret);
+
+       return ret;
+}
+
 static ssize_t rbd_add(struct bus_type *bus,
                       const char *buf,
                       size_t count)
@@ -2600,19 +3096,8 @@ static ssize_t rbd_add(struct bus_type *bus,
                goto err_out_client;
        rbd_dev->pool_id = rc;
 
-       /* Create the name of the header object */
-
-       rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
-                                               + sizeof (RBD_SUFFIX),
-                                       GFP_KERNEL);
-       if (!rbd_dev->header_name)
-               goto err_out_client;
-       sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
-
-       /* Get information about the image being mapped */
-
-       rc = rbd_read_header(rbd_dev, &rbd_dev->header);
-       if (rc)
+       rc = rbd_dev_probe(rbd_dev);
+       if (rc < 0)
                goto err_out_client;
 
        /* no need to lock here, as rbd_dev is not registered yet */
@@ -2691,6 +3176,7 @@ err_out_header:
 err_out_client:
        kfree(rbd_dev->header_name);
        rbd_put_client(rbd_dev);
+       kfree(rbd_dev->image_id);
 err_out_args:
        kfree(rbd_dev->mapping.snap_name);
        kfree(rbd_dev->image_name);
@@ -2746,6 +3232,7 @@ static void rbd_dev_release(struct device *dev)
 
        /* done with the id, and with the rbd_dev */
        kfree(rbd_dev->mapping.snap_name);
+       kfree(rbd_dev->image_id);
        kfree(rbd_dev->header_name);
        kfree(rbd_dev->pool_name);
        kfree(rbd_dev->image_name);
@@ -2791,47 +3278,6 @@ done:
        return ret;
 }
 
-static ssize_t rbd_snap_add(struct device *dev,
-                           struct device_attribute *attr,
-                           const char *buf,
-                           size_t count)
-{
-       struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
-       int ret;
-       char *name = kmalloc(count + 1, GFP_KERNEL);
-       if (!name)
-               return -ENOMEM;
-
-       snprintf(name, count, "%s", buf);
-
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-       ret = rbd_header_add_snap(rbd_dev,
-                                 name, GFP_KERNEL);
-       if (ret < 0)
-               goto err_unlock;
-
-       ret = __rbd_refresh_header(rbd_dev, NULL);
-       if (ret < 0)
-               goto err_unlock;
-
-       /* shouldn't hold ctl_mutex when notifying.. notify might
-          trigger a watch callback that would need to get that mutex */
-       mutex_unlock(&ctl_mutex);
-
-       /* make a best effort, don't error if failed */
-       rbd_req_sync_notify(rbd_dev);
-
-       ret = count;
-       kfree(name);
-       return ret;
-
-err_unlock:
-       mutex_unlock(&ctl_mutex);
-       kfree(name);
-       return ret;
-}
-
 /*
  * create control files in sysfs
  * /sys/bus/rbd/...