]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - drivers/block/rbd.c
rbd: define and use rbd_warn()
[mirror_ubuntu-bionic-kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
d4b125e9
AE
64#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
35d489f9 68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
69#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
9e15b77d
AE
73/* This allows a single page to hold an image name sent by OSD */
74#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 75#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 76
1e130199 77#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 78
d889140c
AE
79/* Feature bits */
80
81#define RBD_FEATURE_LAYERING 1
82
83/* Features supported by this (client software) implementation. */
84
85#define RBD_FEATURES_ALL (0)
86
81a89793
AE
87/*
88 * An RBD device name will be "rbd#", where the "rbd" comes from
89 * RBD_DRV_NAME above, and # is a unique integer identifier.
90 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
91 * enough to hold all possible device names.
92 */
602adf40 93#define DEV_NAME_LEN 32
81a89793 94#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 95
cc0538b6 96#define RBD_READ_ONLY_DEFAULT false
59c2be1e 97
602adf40
YS
98/*
99 * block device image metadata (in-memory version)
100 */
101struct rbd_image_header {
f84344f3 102 /* These four fields never change for a given rbd image */
849b4260 103 char *object_prefix;
34b13184 104 u64 features;
602adf40
YS
105 __u8 obj_order;
106 __u8 crypt_type;
107 __u8 comp_type;
602adf40 108
f84344f3
AE
109 /* The remaining fields need to be updated occasionally */
110 u64 image_size;
111 struct ceph_snap_context *snapc;
602adf40
YS
112 char *snap_names;
113 u64 *snap_sizes;
59c2be1e
YS
114
115 u64 obj_version;
116};
117
0d7dbfce
AE
118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
c66c6e0c
AE
122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
0d7dbfce
AE
142 */
143struct rbd_spec {
144 u64 pool_id;
145 char *pool_name;
146
147 char *image_id;
0d7dbfce 148 char *image_name;
0d7dbfce
AE
149
150 u64 snap_id;
151 char *snap_name;
152
153 struct kref kref;
154};
155
59c2be1e 156struct rbd_options {
cc0538b6 157 bool read_only;
602adf40
YS
158};
159
160/*
f0f8cef5 161 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
162 */
163struct rbd_client {
164 struct ceph_client *client;
165 struct kref kref;
166 struct list_head node;
167};
168
169/*
f0f8cef5 170 * a request completion status
602adf40 171 */
1fec7093
YS
172struct rbd_req_status {
173 int done;
174 int rc;
175 u64 bytes;
176};
177
178/*
179 * a collection of requests
180 */
181struct rbd_req_coll {
182 int total;
183 int num_done;
184 struct kref kref;
185 struct rbd_req_status status[0];
602adf40
YS
186};
187
f0f8cef5
AE
188/*
189 * a single io request
190 */
191struct rbd_request {
192 struct request *rq; /* blk layer request */
193 struct bio *bio; /* cloned bio */
194 struct page **pages; /* list of used pages */
195 u64 len;
196 int coll_index;
197 struct rbd_req_coll *coll;
198};
199
dfc5606d
YS
200struct rbd_snap {
201 struct device dev;
202 const char *name;
3591538f 203 u64 size;
dfc5606d
YS
204 struct list_head node;
205 u64 id;
34b13184 206 u64 features;
dfc5606d
YS
207};
208
f84344f3 209struct rbd_mapping {
99c1f08f 210 u64 size;
34b13184 211 u64 features;
f84344f3
AE
212 bool read_only;
213};
214
602adf40
YS
215/*
216 * a single device
217 */
218struct rbd_device {
de71a297 219 int dev_id; /* blkdev unique id */
602adf40
YS
220
221 int major; /* blkdev assigned major */
222 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 223
a30b71b9 224 u32 image_format; /* Either 1 or 2 */
602adf40
YS
225 struct rbd_client *rbd_client;
226
227 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
228
229 spinlock_t lock; /* queue lock */
230
231 struct rbd_image_header header;
daba5fdb 232 bool exists;
0d7dbfce 233 struct rbd_spec *spec;
602adf40 234
0d7dbfce 235 char *header_name;
971f839a 236
59c2be1e
YS
237 struct ceph_osd_event *watch_event;
238 struct ceph_osd_request *watch_request;
239
86b00e0d
AE
240 struct rbd_spec *parent_spec;
241 u64 parent_overlap;
242
c666601a
JD
243 /* protects updating the header */
244 struct rw_semaphore header_rwsem;
f84344f3
AE
245
246 struct rbd_mapping mapping;
602adf40
YS
247
248 struct list_head node;
dfc5606d
YS
249
250 /* list of snapshots */
251 struct list_head snaps;
252
253 /* sysfs related */
254 struct device dev;
42382b70 255 unsigned long open_count;
dfc5606d
YS
256};
257
602adf40 258static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 259
602adf40 260static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
261static DEFINE_SPINLOCK(rbd_dev_list_lock);
262
432b8587
AE
263static LIST_HEAD(rbd_client_list); /* clients */
264static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 265
304f6808
AE
266static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
267static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
268
dfc5606d 269static void rbd_dev_release(struct device *dev);
41f38c2b 270static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 271
f0f8cef5
AE
272static ssize_t rbd_add(struct bus_type *bus, const char *buf,
273 size_t count);
274static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
275 size_t count);
276
277static struct bus_attribute rbd_bus_attrs[] = {
278 __ATTR(add, S_IWUSR, NULL, rbd_add),
279 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
280 __ATTR_NULL
281};
282
283static struct bus_type rbd_bus_type = {
284 .name = "rbd",
285 .bus_attrs = rbd_bus_attrs,
286};
287
288static void rbd_root_dev_release(struct device *dev)
289{
290}
291
292static struct device rbd_root_dev = {
293 .init_name = "rbd",
294 .release = rbd_root_dev_release,
295};
296
06ecc6cb
AE
297static __printf(2, 3)
298void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
299{
300 struct va_format vaf;
301 va_list args;
302
303 va_start(args, fmt);
304 vaf.fmt = fmt;
305 vaf.va = &args;
306
307 if (!rbd_dev)
308 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
309 else if (rbd_dev->disk)
310 printk(KERN_WARNING "%s: %s: %pV\n",
311 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
312 else if (rbd_dev->spec && rbd_dev->spec->image_name)
313 printk(KERN_WARNING "%s: image %s: %pV\n",
314 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
315 else if (rbd_dev->spec && rbd_dev->spec->image_id)
316 printk(KERN_WARNING "%s: id %s: %pV\n",
317 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
318 else /* punt */
319 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
320 RBD_DRV_NAME, rbd_dev, &vaf);
321 va_end(args);
322}
323
aafb230e
AE
324#ifdef RBD_DEBUG
325#define rbd_assert(expr) \
326 if (unlikely(!(expr))) { \
327 printk(KERN_ERR "\nAssertion failure in %s() " \
328 "at line %d:\n\n" \
329 "\trbd_assert(%s);\n\n", \
330 __func__, __LINE__, #expr); \
331 BUG(); \
332 }
333#else /* !RBD_DEBUG */
334# define rbd_assert(expr) ((void) 0)
335#endif /* !RBD_DEBUG */
dfc5606d 336
117973fb
AE
337static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
338static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 339
602adf40
YS
340static int rbd_open(struct block_device *bdev, fmode_t mode)
341{
f0f8cef5 342 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 343
f84344f3 344 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
345 return -EROFS;
346
42382b70 347 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
c3e946ce 348 (void) get_device(&rbd_dev->dev);
f84344f3 349 set_device_ro(bdev, rbd_dev->mapping.read_only);
42382b70
AE
350 rbd_dev->open_count++;
351 mutex_unlock(&ctl_mutex);
340c7a2b 352
602adf40
YS
353 return 0;
354}
355
dfc5606d
YS
356static int rbd_release(struct gendisk *disk, fmode_t mode)
357{
358 struct rbd_device *rbd_dev = disk->private_data;
359
42382b70
AE
360 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
361 rbd_assert(rbd_dev->open_count > 0);
362 rbd_dev->open_count--;
c3e946ce 363 put_device(&rbd_dev->dev);
42382b70 364 mutex_unlock(&ctl_mutex);
dfc5606d
YS
365
366 return 0;
367}
368
602adf40
YS
369static const struct block_device_operations rbd_bd_ops = {
370 .owner = THIS_MODULE,
371 .open = rbd_open,
dfc5606d 372 .release = rbd_release,
602adf40
YS
373};
374
375/*
376 * Initialize an rbd client instance.
43ae4701 377 * We own *ceph_opts.
602adf40 378 */
f8c38929 379static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
380{
381 struct rbd_client *rbdc;
382 int ret = -ENOMEM;
383
384 dout("rbd_client_create\n");
385 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
386 if (!rbdc)
387 goto out_opt;
388
389 kref_init(&rbdc->kref);
390 INIT_LIST_HEAD(&rbdc->node);
391
bc534d86
AE
392 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
393
43ae4701 394 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 395 if (IS_ERR(rbdc->client))
bc534d86 396 goto out_mutex;
43ae4701 397 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
398
399 ret = ceph_open_session(rbdc->client);
400 if (ret < 0)
401 goto out_err;
402
432b8587 403 spin_lock(&rbd_client_list_lock);
602adf40 404 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 405 spin_unlock(&rbd_client_list_lock);
602adf40 406
bc534d86
AE
407 mutex_unlock(&ctl_mutex);
408
602adf40
YS
409 dout("rbd_client_create created %p\n", rbdc);
410 return rbdc;
411
412out_err:
413 ceph_destroy_client(rbdc->client);
bc534d86
AE
414out_mutex:
415 mutex_unlock(&ctl_mutex);
602adf40
YS
416 kfree(rbdc);
417out_opt:
43ae4701
AE
418 if (ceph_opts)
419 ceph_destroy_options(ceph_opts);
28f259b7 420 return ERR_PTR(ret);
602adf40
YS
421}
422
423/*
1f7ba331
AE
424 * Find a ceph client with specific addr and configuration. If
425 * found, bump its reference count.
602adf40 426 */
1f7ba331 427static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
428{
429 struct rbd_client *client_node;
1f7ba331 430 bool found = false;
602adf40 431
43ae4701 432 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
433 return NULL;
434
1f7ba331
AE
435 spin_lock(&rbd_client_list_lock);
436 list_for_each_entry(client_node, &rbd_client_list, node) {
437 if (!ceph_compare_options(ceph_opts, client_node->client)) {
438 kref_get(&client_node->kref);
439 found = true;
440 break;
441 }
442 }
443 spin_unlock(&rbd_client_list_lock);
444
445 return found ? client_node : NULL;
602adf40
YS
446}
447
59c2be1e
YS
448/*
449 * mount options
450 */
451enum {
59c2be1e
YS
452 Opt_last_int,
453 /* int args above */
454 Opt_last_string,
455 /* string args above */
cc0538b6
AE
456 Opt_read_only,
457 Opt_read_write,
458 /* Boolean args above */
459 Opt_last_bool,
59c2be1e
YS
460};
461
43ae4701 462static match_table_t rbd_opts_tokens = {
59c2be1e
YS
463 /* int args above */
464 /* string args above */
be466c1c 465 {Opt_read_only, "read_only"},
cc0538b6
AE
466 {Opt_read_only, "ro"}, /* Alternate spelling */
467 {Opt_read_write, "read_write"},
468 {Opt_read_write, "rw"}, /* Alternate spelling */
469 /* Boolean args above */
59c2be1e
YS
470 {-1, NULL}
471};
472
473static int parse_rbd_opts_token(char *c, void *private)
474{
43ae4701 475 struct rbd_options *rbd_opts = private;
59c2be1e
YS
476 substring_t argstr[MAX_OPT_ARGS];
477 int token, intval, ret;
478
43ae4701 479 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
480 if (token < 0)
481 return -EINVAL;
482
483 if (token < Opt_last_int) {
484 ret = match_int(&argstr[0], &intval);
485 if (ret < 0) {
486 pr_err("bad mount option arg (not int) "
487 "at '%s'\n", c);
488 return ret;
489 }
490 dout("got int token %d val %d\n", token, intval);
491 } else if (token > Opt_last_int && token < Opt_last_string) {
492 dout("got string token %d val %s\n", token,
493 argstr[0].from);
cc0538b6
AE
494 } else if (token > Opt_last_string && token < Opt_last_bool) {
495 dout("got Boolean token %d\n", token);
59c2be1e
YS
496 } else {
497 dout("got token %d\n", token);
498 }
499
500 switch (token) {
cc0538b6
AE
501 case Opt_read_only:
502 rbd_opts->read_only = true;
503 break;
504 case Opt_read_write:
505 rbd_opts->read_only = false;
506 break;
59c2be1e 507 default:
aafb230e
AE
508 rbd_assert(false);
509 break;
59c2be1e
YS
510 }
511 return 0;
512}
513
602adf40
YS
514/*
515 * Get a ceph client with specific addr and configuration, if one does
516 * not exist create it.
517 */
9d3997fd 518static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
602adf40 519{
f8c38929 520 struct rbd_client *rbdc;
59c2be1e 521
1f7ba331 522 rbdc = rbd_client_find(ceph_opts);
9d3997fd 523 if (rbdc) /* using an existing client */
43ae4701 524 ceph_destroy_options(ceph_opts);
9d3997fd 525 else
f8c38929 526 rbdc = rbd_client_create(ceph_opts);
602adf40 527
9d3997fd 528 return rbdc;
602adf40
YS
529}
530
531/*
532 * Destroy ceph client
d23a4b3f 533 *
432b8587 534 * Caller must hold rbd_client_list_lock.
602adf40
YS
535 */
536static void rbd_client_release(struct kref *kref)
537{
538 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
539
540 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 541 spin_lock(&rbd_client_list_lock);
602adf40 542 list_del(&rbdc->node);
cd9d9f5d 543 spin_unlock(&rbd_client_list_lock);
602adf40
YS
544
545 ceph_destroy_client(rbdc->client);
546 kfree(rbdc);
547}
548
549/*
550 * Drop reference to ceph client node. If it's not referenced anymore, release
551 * it.
552 */
9d3997fd 553static void rbd_put_client(struct rbd_client *rbdc)
602adf40 554{
c53d5893
AE
555 if (rbdc)
556 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
557}
558
1fec7093
YS
559/*
560 * Destroy requests collection
561 */
562static void rbd_coll_release(struct kref *kref)
563{
564 struct rbd_req_coll *coll =
565 container_of(kref, struct rbd_req_coll, kref);
566
567 dout("rbd_coll_release %p\n", coll);
568 kfree(coll);
569}
602adf40 570
a30b71b9
AE
571static bool rbd_image_format_valid(u32 image_format)
572{
573 return image_format == 1 || image_format == 2;
574}
575
8e94af8e
AE
576static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
577{
103a150f
AE
578 size_t size;
579 u32 snap_count;
580
581 /* The header has to start with the magic rbd header text */
582 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
583 return false;
584
db2388b6
AE
585 /* The bio layer requires at least sector-sized I/O */
586
587 if (ondisk->options.order < SECTOR_SHIFT)
588 return false;
589
590 /* If we use u64 in a few spots we may be able to loosen this */
591
592 if (ondisk->options.order > 8 * sizeof (int) - 1)
593 return false;
594
103a150f
AE
595 /*
596 * The size of a snapshot header has to fit in a size_t, and
597 * that limits the number of snapshots.
598 */
599 snap_count = le32_to_cpu(ondisk->snap_count);
600 size = SIZE_MAX - sizeof (struct ceph_snap_context);
601 if (snap_count > size / sizeof (__le64))
602 return false;
603
604 /*
605 * Not only that, but the size of the entire the snapshot
606 * header must also be representable in a size_t.
607 */
608 size -= snap_count * sizeof (__le64);
609 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
610 return false;
611
612 return true;
8e94af8e
AE
613}
614
602adf40
YS
615/*
616 * Create a new header structure, translate header format from the on-disk
617 * header.
618 */
619static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 620 struct rbd_image_header_ondisk *ondisk)
602adf40 621{
ccece235 622 u32 snap_count;
58c17b0e 623 size_t len;
d2bb24e5 624 size_t size;
621901d6 625 u32 i;
602adf40 626
6a52325f
AE
627 memset(header, 0, sizeof (*header));
628
103a150f
AE
629 snap_count = le32_to_cpu(ondisk->snap_count);
630
58c17b0e
AE
631 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
632 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 633 if (!header->object_prefix)
602adf40 634 return -ENOMEM;
58c17b0e
AE
635 memcpy(header->object_prefix, ondisk->object_prefix, len);
636 header->object_prefix[len] = '\0';
00f1f36f 637
602adf40 638 if (snap_count) {
f785cc1d
AE
639 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
640
621901d6
AE
641 /* Save a copy of the snapshot names */
642
f785cc1d
AE
643 if (snap_names_len > (u64) SIZE_MAX)
644 return -EIO;
645 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 646 if (!header->snap_names)
6a52325f 647 goto out_err;
f785cc1d
AE
648 /*
649 * Note that rbd_dev_v1_header_read() guarantees
650 * the ondisk buffer we're working with has
651 * snap_names_len bytes beyond the end of the
652 * snapshot id array, this memcpy() is safe.
653 */
654 memcpy(header->snap_names, &ondisk->snaps[snap_count],
655 snap_names_len);
6a52325f 656
621901d6
AE
657 /* Record each snapshot's size */
658
d2bb24e5
AE
659 size = snap_count * sizeof (*header->snap_sizes);
660 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 661 if (!header->snap_sizes)
6a52325f 662 goto out_err;
621901d6
AE
663 for (i = 0; i < snap_count; i++)
664 header->snap_sizes[i] =
665 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 666 } else {
ccece235 667 WARN_ON(ondisk->snap_names_len);
602adf40
YS
668 header->snap_names = NULL;
669 header->snap_sizes = NULL;
670 }
849b4260 671
34b13184 672 header->features = 0; /* No features support in v1 images */
602adf40
YS
673 header->obj_order = ondisk->options.order;
674 header->crypt_type = ondisk->options.crypt_type;
675 header->comp_type = ondisk->options.comp_type;
6a52325f 676
621901d6
AE
677 /* Allocate and fill in the snapshot context */
678
f84344f3 679 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
680 size = sizeof (struct ceph_snap_context);
681 size += snap_count * sizeof (header->snapc->snaps[0]);
682 header->snapc = kzalloc(size, GFP_KERNEL);
683 if (!header->snapc)
684 goto out_err;
602adf40
YS
685
686 atomic_set(&header->snapc->nref, 1);
505cbb9b 687 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 688 header->snapc->num_snaps = snap_count;
621901d6
AE
689 for (i = 0; i < snap_count; i++)
690 header->snapc->snaps[i] =
691 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
692
693 return 0;
694
6a52325f 695out_err:
849b4260 696 kfree(header->snap_sizes);
ccece235 697 header->snap_sizes = NULL;
602adf40 698 kfree(header->snap_names);
ccece235 699 header->snap_names = NULL;
6a52325f
AE
700 kfree(header->object_prefix);
701 header->object_prefix = NULL;
ccece235 702
00f1f36f 703 return -ENOMEM;
602adf40
YS
704}
705
9e15b77d
AE
706static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
707{
708 struct rbd_snap *snap;
709
710 if (snap_id == CEPH_NOSNAP)
711 return RBD_SNAP_HEAD_NAME;
712
713 list_for_each_entry(snap, &rbd_dev->snaps, node)
714 if (snap_id == snap->id)
715 return snap->name;
716
717 return NULL;
718}
719
8836b995 720static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 721{
602adf40 722
e86924a8 723 struct rbd_snap *snap;
602adf40 724
e86924a8
AE
725 list_for_each_entry(snap, &rbd_dev->snaps, node) {
726 if (!strcmp(snap_name, snap->name)) {
0d7dbfce 727 rbd_dev->spec->snap_id = snap->id;
e86924a8 728 rbd_dev->mapping.size = snap->size;
34b13184 729 rbd_dev->mapping.features = snap->features;
602adf40 730
e86924a8 731 return 0;
00f1f36f 732 }
00f1f36f 733 }
e86924a8 734
00f1f36f 735 return -ENOENT;
602adf40
YS
736}
737
819d52bf 738static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 739{
78dc447d 740 int ret;
602adf40 741
0d7dbfce 742 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 743 sizeof (RBD_SNAP_HEAD_NAME))) {
0d7dbfce 744 rbd_dev->spec->snap_id = CEPH_NOSNAP;
99c1f08f 745 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 746 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 747 ret = 0;
602adf40 748 } else {
0d7dbfce 749 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
602adf40
YS
750 if (ret < 0)
751 goto done;
f84344f3 752 rbd_dev->mapping.read_only = true;
602adf40 753 }
daba5fdb 754 rbd_dev->exists = true;
602adf40 755done:
602adf40
YS
756 return ret;
757}
758
759static void rbd_header_free(struct rbd_image_header *header)
760{
849b4260 761 kfree(header->object_prefix);
d78fd7ae 762 header->object_prefix = NULL;
602adf40 763 kfree(header->snap_sizes);
d78fd7ae 764 header->snap_sizes = NULL;
849b4260 765 kfree(header->snap_names);
d78fd7ae 766 header->snap_names = NULL;
d1d25646 767 ceph_put_snap_context(header->snapc);
d78fd7ae 768 header->snapc = NULL;
602adf40
YS
769}
770
65ccfe21 771static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 772{
65ccfe21
AE
773 char *name;
774 u64 segment;
775 int ret;
602adf40 776
2fd82b9e 777 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
65ccfe21
AE
778 if (!name)
779 return NULL;
780 segment = offset >> rbd_dev->header.obj_order;
2fd82b9e 781 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
65ccfe21 782 rbd_dev->header.object_prefix, segment);
2fd82b9e 783 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
65ccfe21
AE
784 pr_err("error formatting segment name for #%llu (%d)\n",
785 segment, ret);
786 kfree(name);
787 name = NULL;
788 }
602adf40 789
65ccfe21
AE
790 return name;
791}
602adf40 792
65ccfe21
AE
793static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
794{
795 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 796
65ccfe21
AE
797 return offset & (segment_size - 1);
798}
799
800static u64 rbd_segment_length(struct rbd_device *rbd_dev,
801 u64 offset, u64 length)
802{
803 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
804
805 offset &= segment_size - 1;
806
aafb230e 807 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
808 if (offset + length > segment_size)
809 length = segment_size - offset;
810
811 return length;
602adf40
YS
812}
813
1fec7093
YS
814static int rbd_get_num_segments(struct rbd_image_header *header,
815 u64 ofs, u64 len)
816{
df111be6
AE
817 u64 start_seg;
818 u64 end_seg;
819
820 if (!len)
821 return 0;
822 if (len - 1 > U64_MAX - ofs)
823 return -ERANGE;
824
825 start_seg = ofs >> header->obj_order;
826 end_seg = (ofs + len - 1) >> header->obj_order;
827
1fec7093
YS
828 return end_seg - start_seg + 1;
829}
830
029bcbd8
JD
831/*
832 * returns the size of an object in the image
833 */
834static u64 rbd_obj_bytes(struct rbd_image_header *header)
835{
836 return 1 << header->obj_order;
837}
838
602adf40
YS
839/*
840 * bio helpers
841 */
842
843static void bio_chain_put(struct bio *chain)
844{
845 struct bio *tmp;
846
847 while (chain) {
848 tmp = chain;
849 chain = chain->bi_next;
850 bio_put(tmp);
851 }
852}
853
854/*
855 * zeros a bio chain, starting at specific offset
856 */
857static void zero_bio_chain(struct bio *chain, int start_ofs)
858{
859 struct bio_vec *bv;
860 unsigned long flags;
861 void *buf;
862 int i;
863 int pos = 0;
864
865 while (chain) {
866 bio_for_each_segment(bv, chain, i) {
867 if (pos + bv->bv_len > start_ofs) {
868 int remainder = max(start_ofs - pos, 0);
869 buf = bvec_kmap_irq(bv, &flags);
870 memset(buf + remainder, 0,
871 bv->bv_len - remainder);
85b5aaa6 872 bvec_kunmap_irq(buf, &flags);
602adf40
YS
873 }
874 pos += bv->bv_len;
875 }
876
877 chain = chain->bi_next;
878 }
879}
880
881/*
f7760dad
AE
882 * Clone a portion of a bio, starting at the given byte offset
883 * and continuing for the number of bytes indicated.
602adf40 884 */
f7760dad
AE
885static struct bio *bio_clone_range(struct bio *bio_src,
886 unsigned int offset,
887 unsigned int len,
888 gfp_t gfpmask)
602adf40 889{
f7760dad
AE
890 struct bio_vec *bv;
891 unsigned int resid;
892 unsigned short idx;
893 unsigned int voff;
894 unsigned short end_idx;
895 unsigned short vcnt;
896 struct bio *bio;
897
898 /* Handle the easy case for the caller */
899
900 if (!offset && len == bio_src->bi_size)
901 return bio_clone(bio_src, gfpmask);
902
903 if (WARN_ON_ONCE(!len))
904 return NULL;
905 if (WARN_ON_ONCE(len > bio_src->bi_size))
906 return NULL;
907 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
908 return NULL;
909
910 /* Find first affected segment... */
911
912 resid = offset;
913 __bio_for_each_segment(bv, bio_src, idx, 0) {
914 if (resid < bv->bv_len)
915 break;
916 resid -= bv->bv_len;
602adf40 917 }
f7760dad 918 voff = resid;
602adf40 919
f7760dad 920 /* ...and the last affected segment */
602adf40 921
f7760dad
AE
922 resid += len;
923 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
924 if (resid <= bv->bv_len)
925 break;
926 resid -= bv->bv_len;
927 }
928 vcnt = end_idx - idx + 1;
929
930 /* Build the clone */
931
932 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
933 if (!bio)
934 return NULL; /* ENOMEM */
602adf40 935
f7760dad
AE
936 bio->bi_bdev = bio_src->bi_bdev;
937 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
938 bio->bi_rw = bio_src->bi_rw;
939 bio->bi_flags |= 1 << BIO_CLONED;
940
941 /*
942 * Copy over our part of the bio_vec, then update the first
943 * and last (or only) entries.
944 */
945 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
946 vcnt * sizeof (struct bio_vec));
947 bio->bi_io_vec[0].bv_offset += voff;
948 if (vcnt > 1) {
949 bio->bi_io_vec[0].bv_len -= voff;
950 bio->bi_io_vec[vcnt - 1].bv_len = resid;
951 } else {
952 bio->bi_io_vec[0].bv_len = len;
602adf40
YS
953 }
954
f7760dad
AE
955 bio->bi_vcnt = vcnt;
956 bio->bi_size = len;
957 bio->bi_idx = 0;
958
959 return bio;
960}
961
962/*
963 * Clone a portion of a bio chain, starting at the given byte offset
964 * into the first bio in the source chain and continuing for the
965 * number of bytes indicated. The result is another bio chain of
966 * exactly the given length, or a null pointer on error.
967 *
968 * The bio_src and offset parameters are both in-out. On entry they
969 * refer to the first source bio and the offset into that bio where
970 * the start of data to be cloned is located.
971 *
972 * On return, bio_src is updated to refer to the bio in the source
973 * chain that contains first un-cloned byte, and *offset will
974 * contain the offset of that byte within that bio.
975 */
976static struct bio *bio_chain_clone_range(struct bio **bio_src,
977 unsigned int *offset,
978 unsigned int len,
979 gfp_t gfpmask)
980{
981 struct bio *bi = *bio_src;
982 unsigned int off = *offset;
983 struct bio *chain = NULL;
984 struct bio **end;
985
986 /* Build up a chain of clone bios up to the limit */
987
988 if (!bi || off >= bi->bi_size || !len)
989 return NULL; /* Nothing to clone */
602adf40 990
f7760dad
AE
991 end = &chain;
992 while (len) {
993 unsigned int bi_size;
994 struct bio *bio;
995
996 if (!bi)
997 goto out_err; /* EINVAL; ran out of bio's */
998 bi_size = min_t(unsigned int, bi->bi_size - off, len);
999 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1000 if (!bio)
1001 goto out_err; /* ENOMEM */
1002
1003 *end = bio;
1004 end = &bio->bi_next;
602adf40 1005
f7760dad
AE
1006 off += bi_size;
1007 if (off == bi->bi_size) {
1008 bi = bi->bi_next;
1009 off = 0;
1010 }
1011 len -= bi_size;
1012 }
1013 *bio_src = bi;
1014 *offset = off;
1015
1016 return chain;
1017out_err:
1018 bio_chain_put(chain);
602adf40 1019
602adf40
YS
1020 return NULL;
1021}
1022
1023/*
1024 * helpers for osd request op vectors.
1025 */
57cfc106
AE
1026static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
1027 int opcode, u32 payload_len)
602adf40 1028{
57cfc106
AE
1029 struct ceph_osd_req_op *ops;
1030
1031 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
1032 if (!ops)
1033 return NULL;
1034
1035 ops[0].op = opcode;
1036
602adf40
YS
1037 /*
1038 * op extent offset and length will be set later on
1039 * in calc_raw_layout()
1040 */
57cfc106
AE
1041 ops[0].payload_len = payload_len;
1042
1043 return ops;
602adf40
YS
1044}
1045
1046static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
1047{
1048 kfree(ops);
1049}
1050
1fec7093
YS
/*
 * Record completion of sub-request `index` of a request collection
 * with the given status and byte count, then complete, in order, any
 * contiguous run of finished sub-requests starting at num_done.
 * With no collection the whole block request is completed directly.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock protects coll->status[] and ordered completion */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* find the contiguous run of completed sub-requests */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		/* __blk_end_request(): queue_lock is already held */
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1088
/* Complete a single rbd_request via its collection bookkeeping. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1094
602adf40
YS
1095/*
1096 * Send ceph osd request
1097 */
1098static int rbd_do_request(struct request *rq,
0ce1a794 1099 struct rbd_device *rbd_dev,
602adf40
YS
1100 struct ceph_snap_context *snapc,
1101 u64 snapid,
aded07ea 1102 const char *object_name, u64 ofs, u64 len,
602adf40
YS
1103 struct bio *bio,
1104 struct page **pages,
1105 int num_pages,
1106 int flags,
1107 struct ceph_osd_req_op *ops,
1fec7093
YS
1108 struct rbd_req_coll *coll,
1109 int coll_index,
602adf40 1110 void (*rbd_cb)(struct ceph_osd_request *req,
59c2be1e
YS
1111 struct ceph_msg *msg),
1112 struct ceph_osd_request **linger_req,
1113 u64 *ver)
602adf40
YS
1114{
1115 struct ceph_osd_request *req;
1116 struct ceph_file_layout *layout;
1117 int ret;
1118 u64 bno;
1119 struct timespec mtime = CURRENT_TIME;
1120 struct rbd_request *req_data;
1121 struct ceph_osd_request_head *reqhead;
1dbb4399 1122 struct ceph_osd_client *osdc;
602adf40 1123
602adf40 1124 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
1fec7093
YS
1125 if (!req_data) {
1126 if (coll)
1127 rbd_coll_end_req_index(rq, coll, coll_index,
1128 -ENOMEM, len);
1129 return -ENOMEM;
1130 }
1131
1132 if (coll) {
1133 req_data->coll = coll;
1134 req_data->coll_index = coll_index;
1135 }
602adf40 1136
f7760dad
AE
1137 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1138 object_name, (unsigned long long) ofs,
1139 (unsigned long long) len, coll, coll_index);
602adf40 1140
0ce1a794 1141 osdc = &rbd_dev->rbd_client->client->osdc;
1dbb4399
AE
1142 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1143 false, GFP_NOIO, pages, bio);
4ad12621 1144 if (!req) {
4ad12621 1145 ret = -ENOMEM;
602adf40
YS
1146 goto done_pages;
1147 }
1148
1149 req->r_callback = rbd_cb;
1150
1151 req_data->rq = rq;
1152 req_data->bio = bio;
1153 req_data->pages = pages;
1154 req_data->len = len;
1155
1156 req->r_priv = req_data;
1157
1158 reqhead = req->r_request->front.iov_base;
1159 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1160
aded07ea 1161 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
602adf40
YS
1162 req->r_oid_len = strlen(req->r_oid);
1163
1164 layout = &req->r_file_layout;
1165 memset(layout, 0, sizeof(*layout));
1166 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1167 layout->fl_stripe_count = cpu_to_le32(1);
1168 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
0d7dbfce 1169 layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
6cae3717
SW
1170 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1171 req, ops);
1172 rbd_assert(ret == 0);
602adf40
YS
1173
1174 ceph_osdc_build_request(req, ofs, &len,
1175 ops,
1176 snapc,
1177 &mtime,
1178 req->r_oid, req->r_oid_len);
602adf40 1179
59c2be1e 1180 if (linger_req) {
1dbb4399 1181 ceph_osdc_set_request_linger(osdc, req);
59c2be1e
YS
1182 *linger_req = req;
1183 }
1184
1dbb4399 1185 ret = ceph_osdc_start_request(osdc, req, false);
602adf40
YS
1186 if (ret < 0)
1187 goto done_err;
1188
1189 if (!rbd_cb) {
1dbb4399 1190 ret = ceph_osdc_wait_request(osdc, req);
59c2be1e
YS
1191 if (ver)
1192 *ver = le64_to_cpu(req->r_reassert_version.version);
bd919d45
AE
1193 dout("reassert_ver=%llu\n",
1194 (unsigned long long)
1195 le64_to_cpu(req->r_reassert_version.version));
602adf40
YS
1196 ceph_osdc_put_request(req);
1197 }
1198 return ret;
1199
1200done_err:
1201 bio_chain_put(req_data->bio);
1202 ceph_osdc_put_request(req);
1203done_pages:
1fec7093 1204 rbd_coll_end_req(req_data, ret, len);
602adf40 1205 kfree(req_data);
602adf40
YS
1206 return ret;
1207}
1208
1209/*
1210 * Ceph osd op callback
1211 */
/*
 * Completion callback for asynchronous osd requests issued by
 * rbd_do_op().  Decodes the reply, papers over missing objects and
 * short reads by zero-filling the bio chain, completes the request's
 * collection slot, and releases the per-request state.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a nonexistent object: behave as all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the unread tail */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1248
59c2be1e
YS
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1253
602adf40
YS
1254/*
1255 * Do a synchronous ceph osd operation
1256 */
0ce1a794 1257static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1258 struct ceph_snap_context *snapc,
1259 u64 snapid,
602adf40 1260 int flags,
913d2fdc 1261 struct ceph_osd_req_op *ops,
aded07ea 1262 const char *object_name,
f8d4de6e
AE
1263 u64 ofs, u64 inbound_size,
1264 char *inbound,
59c2be1e
YS
1265 struct ceph_osd_request **linger_req,
1266 u64 *ver)
602adf40
YS
1267{
1268 int ret;
1269 struct page **pages;
1270 int num_pages;
913d2fdc 1271
aafb230e 1272 rbd_assert(ops != NULL);
602adf40 1273
f8d4de6e 1274 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1275 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1276 if (IS_ERR(pages))
1277 return PTR_ERR(pages);
602adf40 1278
0ce1a794 1279 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1280 object_name, ofs, inbound_size, NULL,
602adf40
YS
1281 pages, num_pages,
1282 flags,
1283 ops,
1fec7093 1284 NULL, 0,
59c2be1e
YS
1285 NULL,
1286 linger_req, ver);
602adf40 1287 if (ret < 0)
913d2fdc 1288 goto done;
602adf40 1289
f8d4de6e
AE
1290 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1291 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1292
602adf40
YS
1293done:
1294 ceph_release_page_vector(pages, num_pages);
1295 return ret;
1296}
1297
1298/*
1299 * Do an asynchronous ceph osd operation
1300 */
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues a read or write (chosen from the block request's data
 * direction) for the single object segment containing offset ofs.
 * The caller guarantees [ofs, ofs+len) does not cross a segment
 * boundary.  Completion is delivered through rbd_req_cb() into the
 * coll/coll_index slot.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		snapc = NULL;	/* reads carry a snap id, not a snap context */
		snapid = rbd_dev->spec->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1362
602adf40
YS
1363/*
1364 * Request sync osd read
1365 */
0ce1a794 1366static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1367 u64 snapid,
aded07ea 1368 const char *object_name,
602adf40 1369 u64 ofs, u64 len,
59c2be1e
YS
1370 char *buf,
1371 u64 *ver)
602adf40 1372{
913d2fdc
AE
1373 struct ceph_osd_req_op *ops;
1374 int ret;
1375
1376 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1377 if (!ops)
1378 return -ENOMEM;
1379
1380 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1381 snapid,
602adf40 1382 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1383 ops, object_name, ofs, len, buf, NULL, ver);
1384 rbd_destroy_ops(ops);
1385
1386 return ret;
602adf40
YS
1387}
1388
/*
 * Acknowledge a watch notification received on the header object
 */
0ce1a794 1392static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1393 u64 ver,
7f0a24d8 1394 u64 notify_id)
59c2be1e
YS
1395{
1396 struct ceph_osd_req_op *ops;
11f77002
SW
1397 int ret;
1398
57cfc106
AE
1399 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1400 if (!ops)
1401 return -ENOMEM;
59c2be1e 1402
a71b891b 1403 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1404 ops[0].watch.cookie = notify_id;
1405 ops[0].watch.flag = 0;
1406
0ce1a794 1407 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1408 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1409 NULL, 0,
59c2be1e
YS
1410 CEPH_OSD_FLAG_READ,
1411 ops,
1fec7093 1412 NULL, 0,
59c2be1e
YS
1413 rbd_simple_req_cb, 0, NULL);
1414
1415 rbd_destroy_ops(ops);
1416 return ret;
1417}
1418
1419static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1420{
0ce1a794 1421 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1422 u64 hver;
13143d2d
SW
1423 int rc;
1424
0ce1a794 1425 if (!rbd_dev)
59c2be1e
YS
1426 return;
1427
bd919d45
AE
1428 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1429 rbd_dev->header_name, (unsigned long long) notify_id,
1430 (unsigned int) opcode);
117973fb 1431 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1432 if (rc)
06ecc6cb
AE
1433 rbd_warn(rbd_dev, "got notification but failed to "
1434 " update snaps: %d\n", rc);
59c2be1e 1435
7f0a24d8 1436 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1437}
1438
1439/*
1440 * Request sync osd watch
1441 */
0e6f322d 1442static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1443{
1444 struct ceph_osd_req_op *ops;
0ce1a794 1445 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1446 int ret;
59c2be1e 1447
57cfc106
AE
1448 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1449 if (!ops)
1450 return -ENOMEM;
59c2be1e
YS
1451
1452 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1453 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1454 if (ret < 0)
1455 goto fail;
1456
0e6f322d 1457 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1458 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1459 ops[0].watch.flag = 1;
1460
0ce1a794 1461 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1462 CEPH_NOSNAP,
59c2be1e
YS
1463 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1464 ops,
0e6f322d
AE
1465 rbd_dev->header_name,
1466 0, 0, NULL,
0ce1a794 1467 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1468
1469 if (ret < 0)
1470 goto fail_event;
1471
1472 rbd_destroy_ops(ops);
1473 return 0;
1474
1475fail_event:
0ce1a794
AE
1476 ceph_osdc_cancel_event(rbd_dev->watch_event);
1477 rbd_dev->watch_event = NULL;
59c2be1e
YS
1478fail:
1479 rbd_destroy_ops(ops);
1480 return ret;
1481}
1482
79e3057c
YS
1483/*
1484 * Request sync osd unwatch
1485 */
070c633f 1486static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1487{
1488 struct ceph_osd_req_op *ops;
57cfc106 1489 int ret;
79e3057c 1490
57cfc106
AE
1491 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1492 if (!ops)
1493 return -ENOMEM;
79e3057c
YS
1494
1495 ops[0].watch.ver = 0;
0ce1a794 1496 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1497 ops[0].watch.flag = 0;
1498
0ce1a794 1499 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1500 CEPH_NOSNAP,
79e3057c
YS
1501 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1502 ops,
070c633f
AE
1503 rbd_dev->header_name,
1504 0, 0, NULL, NULL, NULL);
1505
79e3057c
YS
1506
1507 rbd_destroy_ops(ops);
0ce1a794
AE
1508 ceph_osdc_cancel_event(rbd_dev->watch_event);
1509 rbd_dev->watch_event = NULL;
79e3057c
YS
1510 return ret;
1511}
1512
602adf40 1513/*
3cb4a687 1514 * Synchronous osd object method call
602adf40 1515 */
0ce1a794 1516static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1517 const char *object_name,
1518 const char *class_name,
1519 const char *method_name,
3cb4a687
AE
1520 const char *outbound,
1521 size_t outbound_size,
f8d4de6e
AE
1522 char *inbound,
1523 size_t inbound_size,
3cb4a687 1524 int flags,
59c2be1e 1525 u64 *ver)
602adf40
YS
1526{
1527 struct ceph_osd_req_op *ops;
aded07ea
AE
1528 int class_name_len = strlen(class_name);
1529 int method_name_len = strlen(method_name);
3cb4a687 1530 int payload_size;
57cfc106
AE
1531 int ret;
1532
3cb4a687
AE
1533 /*
1534 * Any input parameters required by the method we're calling
1535 * will be sent along with the class and method names as
1536 * part of the message payload. That data and its size are
1537 * supplied via the indata and indata_len fields (named from
1538 * the perspective of the server side) in the OSD request
1539 * operation.
1540 */
1541 payload_size = class_name_len + method_name_len + outbound_size;
1542 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1543 if (!ops)
1544 return -ENOMEM;
602adf40 1545
aded07ea
AE
1546 ops[0].cls.class_name = class_name;
1547 ops[0].cls.class_len = (__u8) class_name_len;
1548 ops[0].cls.method_name = method_name;
1549 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1550 ops[0].cls.argc = 0;
3cb4a687
AE
1551 ops[0].cls.indata = outbound;
1552 ops[0].cls.indata_len = outbound_size;
602adf40 1553
0ce1a794 1554 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1555 CEPH_NOSNAP,
3cb4a687 1556 flags, ops,
f8d4de6e
AE
1557 object_name, 0, inbound_size, inbound,
1558 NULL, ver);
602adf40
YS
1559
1560 rbd_destroy_ops(ops);
1561
1562 dout("cls_exec returned %d\n", ret);
1563 return ret;
1564}
1565
1fec7093
YS
1566static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1567{
1568 struct rbd_req_coll *coll =
1569 kzalloc(sizeof(struct rbd_req_coll) +
1570 sizeof(struct rbd_req_status) * num_reqs,
1571 GFP_ATOMIC);
1572
1573 if (!coll)
1574 return NULL;
1575 coll->total = num_reqs;
1576 kref_init(&coll->kref);
1577 return coll;
1578}
1579
602adf40
YS
1580/*
1581 * block device queue callback
1582 */
/*
 * block device queue callback
 *
 * Pulls requests off the queue, splits each one on rbd object
 * (segment) boundaries, and issues one asynchronous osd op per
 * segment using a cloned slice of the request's bio chain.  A
 * rbd_req_coll ties the per-segment completions back to the block
 * request.  Entered with q->queue_lock held; the lock is dropped
 * while osd requests are built and re-taken before completing a
 * request or looping.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* a mapped snapshot may have been deleted underneath us */
		if (!rbd_dev->exists) {
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* hold a snapc reference so writes see a stable snap context */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		bio_offset = 0;
		do {
			/* limit each op to the end of its object */
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* drop the allocation's reference; ops hold their own */
		kref_put(&coll->kref, rbd_coll_release);

		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1689
1690/*
1691 * a queue callback. Makes sure that we don't create a bio that spans across
1692 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1693 * which we handle later at bio_chain_clone_range()
602adf40
YS
1694 */
/*
 * Returns how many bytes of bvec may be merged into the bio described
 * by bmd without the bio crossing an rbd object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1735
1736static void rbd_free_disk(struct rbd_device *rbd_dev)
1737{
1738 struct gendisk *disk = rbd_dev->disk;
1739
1740 if (!disk)
1741 return;
1742
602adf40
YS
1743 if (disk->flags & GENHD_FL_UP)
1744 del_gendisk(disk);
1745 if (disk->queue)
1746 blk_cleanup_queue(disk->queue);
1747 put_disk(disk);
1748}
1749
1750/*
4156d998
AE
1751 * Read the complete header for the given rbd device.
1752 *
1753 * Returns a pointer to a dynamically-allocated buffer containing
1754 * the complete and validated header. Caller can pass the address
1755 * of a variable that will be filled in with the version of the
1756 * header object at the time it was read.
1757 *
1758 * Returns a pointer-coded errno if a failure occurs.
602adf40 1759 */
4156d998
AE
1760static struct rbd_image_header_ondisk *
1761rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1762{
4156d998 1763 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1764 u32 snap_count = 0;
4156d998
AE
1765 u64 names_size = 0;
1766 u32 want_count;
1767 int ret;
602adf40 1768
00f1f36f 1769 /*
4156d998
AE
1770 * The complete header will include an array of its 64-bit
1771 * snapshot ids, followed by the names of those snapshots as
1772 * a contiguous block of NUL-terminated strings. Note that
1773 * the number of snapshots could change by the time we read
1774 * it in, in which case we re-read it.
00f1f36f 1775 */
4156d998
AE
1776 do {
1777 size_t size;
1778
1779 kfree(ondisk);
1780
1781 size = sizeof (*ondisk);
1782 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1783 size += names_size;
1784 ondisk = kmalloc(size, GFP_KERNEL);
1785 if (!ondisk)
1786 return ERR_PTR(-ENOMEM);
1787
1788 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1789 rbd_dev->header_name,
4156d998
AE
1790 0, size,
1791 (char *) ondisk, version);
1792
1793 if (ret < 0)
1794 goto out_err;
1795 if (WARN_ON((size_t) ret < size)) {
1796 ret = -ENXIO;
06ecc6cb
AE
1797 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
1798 size, ret);
4156d998
AE
1799 goto out_err;
1800 }
1801 if (!rbd_dev_ondisk_valid(ondisk)) {
1802 ret = -ENXIO;
06ecc6cb 1803 rbd_warn(rbd_dev, "invalid header");
4156d998 1804 goto out_err;
81e759fb 1805 }
602adf40 1806
4156d998
AE
1807 names_size = le64_to_cpu(ondisk->snap_names_len);
1808 want_count = snap_count;
1809 snap_count = le32_to_cpu(ondisk->snap_count);
1810 } while (snap_count != want_count);
00f1f36f 1811
4156d998 1812 return ondisk;
00f1f36f 1813
4156d998
AE
1814out_err:
1815 kfree(ondisk);
1816
1817 return ERR_PTR(ret);
1818}
1819
/*
 * Re-read the on-disk header and convert it to the in-memory form.
 */
1823static int rbd_read_header(struct rbd_device *rbd_dev,
1824 struct rbd_image_header *header)
1825{
1826 struct rbd_image_header_ondisk *ondisk;
1827 u64 ver = 0;
1828 int ret;
602adf40 1829
4156d998
AE
1830 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1831 if (IS_ERR(ondisk))
1832 return PTR_ERR(ondisk);
1833 ret = rbd_header_from_disk(header, ondisk);
1834 if (ret >= 0)
1835 header->obj_version = ver;
1836 kfree(ondisk);
1837
1838 return ret;
602adf40
YS
1839}
1840
41f38c2b 1841static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1842{
1843 struct rbd_snap *snap;
a0593290 1844 struct rbd_snap *next;
dfc5606d 1845
a0593290 1846 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1847 rbd_remove_snap_dev(snap);
dfc5606d
YS
1848}
1849
9478554a
AE
1850static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1851{
1852 sector_t size;
1853
0d7dbfce 1854 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
1855 return;
1856
1857 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1858 dout("setting size to %llu sectors", (unsigned long long) size);
1859 rbd_dev->mapping.size = (u64) size;
1860 set_capacity(rbd_dev->disk, size);
1861}
1862
602adf40
YS
1863/*
1864 * only read the first part of the ondisk header, without the snaps info
1865 */
117973fb 1866static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1867{
1868 int ret;
1869 struct rbd_image_header h;
602adf40
YS
1870
1871 ret = rbd_read_header(rbd_dev, &h);
1872 if (ret < 0)
1873 return ret;
1874
a51aa0c0
JD
1875 down_write(&rbd_dev->header_rwsem);
1876
9478554a
AE
1877 /* Update image size, and check for resize of mapped image */
1878 rbd_dev->header.image_size = h.image_size;
1879 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1880
849b4260 1881 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1882 kfree(rbd_dev->header.snap_sizes);
849b4260 1883 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1884 /* osd requests may still refer to snapc */
1885 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1886
b813623a
AE
1887 if (hver)
1888 *hver = h.obj_version;
a71b891b 1889 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1890 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1891 rbd_dev->header.snapc = h.snapc;
1892 rbd_dev->header.snap_names = h.snap_names;
1893 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1894 /* Free the extra copy of the object prefix */
1895 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1896 kfree(h.object_prefix);
1897
304f6808
AE
1898 ret = rbd_dev_snaps_update(rbd_dev);
1899 if (!ret)
1900 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1901
c666601a 1902 up_write(&rbd_dev->header_rwsem);
602adf40 1903
dfc5606d 1904 return ret;
602adf40
YS
1905}
1906
117973fb 1907static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1908{
1909 int ret;
1910
117973fb 1911 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1912 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1913 if (rbd_dev->image_format == 1)
1914 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1915 else
1916 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1917 mutex_unlock(&ctl_mutex);
1918
1919 return ret;
1920}
1921
602adf40
YS
/*
 * Allocate and set up the gendisk and request queue for an rbd
 * device.  Queue I/O limits are aligned to the rbd object size so
 * requests split cleanly on object boundaries.  Returns 0 or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios from spanning object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1970
dfc5606d
YS
1971/*
1972 sysfs
1973*/
1974
593a9e7b
AE
/* Map a sysfs struct device back to its enclosing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1979
dfc5606d
YS
/* Show the mapped size in bytes; capacity is read under header_rwsem. */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1992
34b13184
AE
1993/*
1994 * Note this shows the features for whatever's mapped, which is not
1995 * necessarily the base image.
1996 */
1997static ssize_t rbd_features_show(struct device *dev,
1998 struct device_attribute *attr, char *buf)
1999{
2000 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2001
2002 return sprintf(buf, "0x%016llx\n",
2003 (unsigned long long) rbd_dev->mapping.features);
2004}
2005
dfc5606d
YS
2006static ssize_t rbd_major_show(struct device *dev,
2007 struct device_attribute *attr, char *buf)
2008{
593a9e7b 2009 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 2010
dfc5606d
YS
2011 return sprintf(buf, "%d\n", rbd_dev->major);
2012}
2013
2014static ssize_t rbd_client_id_show(struct device *dev,
2015 struct device_attribute *attr, char *buf)
602adf40 2016{
593a9e7b 2017 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2018
1dbb4399
AE
2019 return sprintf(buf, "client%lld\n",
2020 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
2021}
2022
dfc5606d
YS
2023static ssize_t rbd_pool_show(struct device *dev,
2024 struct device_attribute *attr, char *buf)
602adf40 2025{
593a9e7b 2026 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2027
0d7dbfce 2028 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
2029}
2030
9bb2f334
AE
2031static ssize_t rbd_pool_id_show(struct device *dev,
2032 struct device_attribute *attr, char *buf)
2033{
2034 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2035
0d7dbfce
AE
2036 return sprintf(buf, "%llu\n",
2037 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2038}
2039
dfc5606d
YS
2040static ssize_t rbd_name_show(struct device *dev,
2041 struct device_attribute *attr, char *buf)
2042{
593a9e7b 2043 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2044
a92ffdf8
AE
2045 if (rbd_dev->spec->image_name)
2046 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2047
2048 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2049}
2050
589d30e0
AE
2051static ssize_t rbd_image_id_show(struct device *dev,
2052 struct device_attribute *attr, char *buf)
2053{
2054 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2055
0d7dbfce 2056 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2057}
2058
34b13184
AE
2059/*
2060 * Shows the name of the currently-mapped snapshot (or
2061 * RBD_SNAP_HEAD_NAME for the base image).
2062 */
dfc5606d
YS
2063static ssize_t rbd_snap_show(struct device *dev,
2064 struct device_attribute *attr,
2065 char *buf)
2066{
593a9e7b 2067 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2068
0d7dbfce 2069 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2070}
2071
86b00e0d
AE
2072/*
2073 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2074 * for the parent image. If there is no parent, simply shows
2075 * "(no parent image)".
2076 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;	/* advances as each "key value" line is appended */

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	/* an unknown parent image name is reported as "(unknown)" */
	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	return (ssize_t) (bufp - buf);
}
2114
dfc5606d
YS
2115static ssize_t rbd_image_refresh(struct device *dev,
2116 struct device_attribute *attr,
2117 const char *buf,
2118 size_t size)
2119{
593a9e7b 2120 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2121 int ret;
602adf40 2122
117973fb 2123 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2124
2125 return ret < 0 ? ret : size;
dfc5606d 2126}
602adf40 2127
dfc5606d 2128static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2129static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2130static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2131static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2132static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2133static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2134static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2135static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2136static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2137static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2138static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2139
2140static struct attribute *rbd_attrs[] = {
2141 &dev_attr_size.attr,
34b13184 2142 &dev_attr_features.attr,
dfc5606d
YS
2143 &dev_attr_major.attr,
2144 &dev_attr_client_id.attr,
2145 &dev_attr_pool.attr,
9bb2f334 2146 &dev_attr_pool_id.attr,
dfc5606d 2147 &dev_attr_name.attr,
589d30e0 2148 &dev_attr_image_id.attr,
dfc5606d 2149 &dev_attr_current_snap.attr,
86b00e0d 2150 &dev_attr_parent.attr,
dfc5606d 2151 &dev_attr_refresh.attr,
dfc5606d
YS
2152 NULL
2153};
2154
2155static struct attribute_group rbd_attr_group = {
2156 .attrs = rbd_attrs,
2157};
2158
2159static const struct attribute_group *rbd_attr_groups[] = {
2160 &rbd_attr_group,
2161 NULL
2162};
2163
2164static void rbd_sysfs_dev_release(struct device *dev)
2165{
2166}
2167
2168static struct device_type rbd_device_type = {
2169 .name = "rbd",
2170 .groups = rbd_attr_groups,
2171 .release = rbd_sysfs_dev_release,
2172};
2173
2174
/*
  sysfs - snapshots
*/

/* Show the size (in bytes) of one snapshot (sysfs "snap_size"). */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
2187
2188static ssize_t rbd_snap_id_show(struct device *dev,
2189 struct device_attribute *attr,
2190 char *buf)
2191{
2192 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2193
3591538f 2194 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2195}
2196
34b13184
AE
/* Show the feature bit mask of one snapshot (sysfs "snap_features"). */
static ssize_t rbd_snap_features_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) snap->features);
}
2206
dfc5606d
YS
/* Per-snapshot sysfs attributes (all read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device-model release callback: the rbd_snap (and its name, which
 * was kstrdup()'d in __rbd_add_snap_dev()) is freed when the last
 * reference to the embedded struct device goes away.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2238
8b8fb99c
AE
/* Take an additional reference on an image spec; returns its argument. */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}
2245
static void rbd_spec_free(struct kref *kref);
/* Drop a reference on an image spec; NULL is allowed and is a no-op. */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
2252
2253static struct rbd_spec *rbd_spec_alloc(void)
2254{
2255 struct rbd_spec *spec;
2256
2257 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2258 if (!spec)
2259 return NULL;
2260 kref_init(&spec->kref);
2261
2262 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2263
2264 return spec;
2265}
2266
/*
 * kref release callback for an image spec: frees the owned name
 * strings (each may be NULL) and then the spec itself.
 */
static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
2277
c53d5893
AE
2278struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2279 struct rbd_spec *spec)
2280{
2281 struct rbd_device *rbd_dev;
2282
2283 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2284 if (!rbd_dev)
2285 return NULL;
2286
2287 spin_lock_init(&rbd_dev->lock);
2288 INIT_LIST_HEAD(&rbd_dev->node);
2289 INIT_LIST_HEAD(&rbd_dev->snaps);
2290 init_rwsem(&rbd_dev->header_rwsem);
2291
2292 rbd_dev->spec = spec;
2293 rbd_dev->rbd_client = rbdc;
2294
2295 return rbd_dev;
2296}
2297
/*
 * Tear down an rbd_device: drop the spec references (parent spec may
 * be NULL; rbd_spec_put() handles that), release the client, and free
 * the header object name and the device itself.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2306
304f6808
AE
/*
 * A snapshot is considered registered once its device type has been
 * set (done in rbd_register_snap_dev()).  The assertion checks that
 * this agrees with the device core's own registration state.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}
2316
41f38c2b 2317static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2318{
2319 list_del(&snap->node);
304f6808
AE
2320 if (device_is_registered(&snap->dev))
2321 device_unregister(&snap->dev);
dfc5606d
YS
2322}
2323
14e7085d 2324static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2325 struct device *parent)
2326{
2327 struct device *dev = &snap->dev;
2328 int ret;
2329
2330 dev->type = &rbd_snap_device_type;
2331 dev->parent = parent;
2332 dev->release = rbd_snap_dev_release;
d4b125e9 2333 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2334 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2335
dfc5606d
YS
2336 ret = device_register(dev);
2337
2338 return ret;
2339}
2340
4e891e0a 2341static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2342 const char *snap_name,
34b13184
AE
2343 u64 snap_id, u64 snap_size,
2344 u64 snap_features)
dfc5606d 2345{
4e891e0a 2346 struct rbd_snap *snap;
dfc5606d 2347 int ret;
4e891e0a
AE
2348
2349 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2350 if (!snap)
4e891e0a
AE
2351 return ERR_PTR(-ENOMEM);
2352
2353 ret = -ENOMEM;
c8d18425 2354 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2355 if (!snap->name)
2356 goto err;
2357
c8d18425
AE
2358 snap->id = snap_id;
2359 snap->size = snap_size;
34b13184 2360 snap->features = snap_features;
4e891e0a
AE
2361
2362 return snap;
2363
dfc5606d
YS
2364err:
2365 kfree(snap->name);
2366 kfree(snap);
4e891e0a
AE
2367
2368 return ERR_PTR(ret);
dfc5606d
YS
2369}
2370
cd892126
AE
2371static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2372 u64 *snap_size, u64 *snap_features)
2373{
2374 char *snap_name;
2375
2376 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2377
2378 *snap_size = rbd_dev->header.snap_sizes[which];
2379 *snap_features = 0; /* No features for v1 */
2380
2381 /* Skip over names until we find the one we are looking for */
2382
2383 snap_name = rbd_dev->header.snap_names;
2384 while (which--)
2385 snap_name += strlen(snap_name) + 1;
2386
2387 return snap_name;
2388}
2389
9d475de5
AE
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" class method reply */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2423
/* Fetch size and object order of the base image into the header. */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2430
1e130199
AE
/*
 * Fetch the object name prefix for a format 2 image and record it in
 * rbd_dev->header.object_prefix (an allocated string the header owns).
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	p = reply_buf;
	/* ceph_extract_encoded_string() returns a new allocation or ERR_PTR */
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2468
b1b5402a
AE
/*
 * Get the feature bits for an image snapshot (or for the base image
 * when snap_id is CEPH_NOSNAP).  Fails with -ENXIO if the image uses
 * any incompatible feature this driver does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" class method reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image requiring features we don't support */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2502
/* Fetch the base image's feature bits into the header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2508
86b00e0d
AE
/*
 * Query the "get_parent" class method for a format 2 image and, if a
 * parent exists, record its spec (pool id, image id, snap id) and the
 * overlap in rbd_dev.  A pool id of CEPH_NOPOOL means no parent, which
 * is not an error.  On success rbd_dev takes ownership of parent_spec.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Maximum encoded size of the reply we expect */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	/* -ERANGE is what the ceph_decode_*_safe macros bail out with */
	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op once ownership transferred */

	return ret;
}
2572
9e15b77d
AE
/*
 * Look up the image name for this device's image id by asking the
 * rbd directory object ("dir_get_name").  Returns a newly-allocated
 * name string the caller must free, or NULL on any failure (callers
 * tolerate a missing name).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the encoded (length-prefixed) image id request buffer */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure is reported as NULL */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2622
2623/*
2624 * When a parent image gets probed, we only have the pool, image,
2625 * and snapshot ids but not the names of any of them. This call
2626 * is made later to fill in those names. It has to be done after
2627 * rbd_dev_snaps_update() has completed because some of the
2628 * information (in particular, snapshot name) is not available
2629 * until then.
2630 */
2631static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2632{
2633 struct ceph_osd_client *osdc;
2634 const char *name;
2635 void *reply_buf = NULL;
2636 int ret;
2637
2638 if (rbd_dev->spec->pool_name)
2639 return 0; /* Already have the names */
2640
2641 /* Look up the pool name */
2642
2643 osdc = &rbd_dev->rbd_client->client->osdc;
2644 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2645 if (!name)
2646 return -EIO; /* pool id too large (>= 2^31) */
2647
2648 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2649 if (!rbd_dev->spec->pool_name)
2650 return -ENOMEM;
2651
2652 /* Fetch the image name; tolerate failure here */
2653
2654 name = rbd_dev_image_name(rbd_dev);
69e7a02f 2655 if (name)
9e15b77d 2656 rbd_dev->spec->image_name = (char *) name;
69e7a02f 2657 else
06ecc6cb 2658 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d
AE
2659
2660 /* Look up the snapshot name. */
2661
2662 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2663 if (!name) {
2664 ret = -EIO;
2665 goto out_err;
2666 }
2667 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2668 if(!rbd_dev->spec->snap_name)
2669 goto out_err;
2670
2671 return 0;
2672out_err:
2673 kfree(reply_buf);
2674 kfree(rbd_dev->spec->pool_name);
2675 rbd_dev->spec->pool_name = NULL;
2676
2677 return ret;
2678}
2679
6e14b1a6 2680static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2681{
2682 size_t size;
2683 int ret;
2684 void *reply_buf;
2685 void *p;
2686 void *end;
2687 u64 seq;
2688 u32 snap_count;
2689 struct ceph_snap_context *snapc;
2690 u32 i;
2691
2692 /*
2693 * We'll need room for the seq value (maximum snapshot id),
2694 * snapshot count, and array of that many snapshot ids.
2695 * For now we have a fixed upper limit on the number we're
2696 * prepared to receive.
2697 */
2698 size = sizeof (__le64) + sizeof (__le32) +
2699 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2700 reply_buf = kzalloc(size, GFP_KERNEL);
2701 if (!reply_buf)
2702 return -ENOMEM;
2703
2704 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2705 "rbd", "get_snapcontext",
2706 NULL, 0,
2707 reply_buf, size,
6e14b1a6 2708 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2709 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2710 if (ret < 0)
2711 goto out;
2712
2713 ret = -ERANGE;
2714 p = reply_buf;
2715 end = (char *) reply_buf + size;
2716 ceph_decode_64_safe(&p, end, seq, out);
2717 ceph_decode_32_safe(&p, end, snap_count, out);
2718
2719 /*
2720 * Make sure the reported number of snapshot ids wouldn't go
2721 * beyond the end of our buffer. But before checking that,
2722 * make sure the computed size of the snapshot context we
2723 * allocate is representable in a size_t.
2724 */
2725 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2726 / sizeof (u64)) {
2727 ret = -EINVAL;
2728 goto out;
2729 }
2730 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2731 goto out;
2732
2733 size = sizeof (struct ceph_snap_context) +
2734 snap_count * sizeof (snapc->snaps[0]);
2735 snapc = kmalloc(size, GFP_KERNEL);
2736 if (!snapc) {
2737 ret = -ENOMEM;
2738 goto out;
2739 }
2740
2741 atomic_set(&snapc->nref, 1);
2742 snapc->seq = seq;
2743 snapc->num_snaps = snap_count;
2744 for (i = 0; i < snap_count; i++)
2745 snapc->snaps[i] = ceph_decode_64(&p);
2746
2747 rbd_dev->header.snapc = snapc;
2748
2749 dout(" snap context seq = %llu, snap_count = %u\n",
2750 (unsigned long long) seq, (unsigned int) snap_count);
2751
2752out:
2753 kfree(reply_buf);
2754
2755 return 0;
2756}
2757
b8b1e2db
AE
/*
 * Fetch the name of snapshot "which" (an index into the snapshot
 * context) via the "get_snapshot_name" class method.  Returns a
 * newly-allocated string the caller must free, or an ERR_PTR.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout("  snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2801
2802static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2803 u64 *snap_size, u64 *snap_features)
2804{
2805 __le64 snap_id;
2806 u8 order;
2807 int ret;
2808
2809 snap_id = rbd_dev->header.snapc->snaps[which];
2810 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2811 if (ret)
2812 return ERR_PTR(ret);
2813 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2814 if (ret)
2815 return ERR_PTR(ret);
2816
2817 return rbd_dev_v2_snap_name(rbd_dev, which);
2818}
2819
2820static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2821 u64 *snap_size, u64 *snap_features)
2822{
2823 if (rbd_dev->image_format == 1)
2824 return rbd_dev_v1_snap_info(rbd_dev, which,
2825 snap_size, snap_features);
2826 if (rbd_dev->image_format == 2)
2827 return rbd_dev_v2_snap_info(rbd_dev, which,
2828 snap_size, snap_features);
2829 return ERR_PTR(-EINVAL);
2830}
2831
117973fb
AE
2832static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2833{
2834 int ret;
2835 __u8 obj_order;
2836
2837 down_write(&rbd_dev->header_rwsem);
2838
2839 /* Grab old order first, to see if it changes */
2840
2841 obj_order = rbd_dev->header.obj_order,
2842 ret = rbd_dev_v2_image_size(rbd_dev);
2843 if (ret)
2844 goto out;
2845 if (rbd_dev->header.obj_order != obj_order) {
2846 ret = -EIO;
2847 goto out;
2848 }
2849 rbd_update_mapping_size(rbd_dev);
2850
2851 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2852 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2853 if (ret)
2854 goto out;
2855 ret = rbd_dev_snaps_update(rbd_dev);
2856 dout("rbd_dev_snaps_update returned %d\n", ret);
2857 if (ret)
2858 goto out;
2859 ret = rbd_dev_snaps_register(rbd_dev);
2860 dout("rbd_dev_snaps_register returned %d\n", ret);
2861out:
2862 up_write(&rbd_dev->header_rwsem);
2863
2864 return ret;
2865}
2866
dfc5606d 2867/*
35938150
AE
2868 * Scan the rbd device's current snapshot list and compare it to the
2869 * newly-received snapshot context. Remove any existing snapshots
2870 * not present in the new snapshot context. Add a new snapshot for
2871 * any snaphots in the snapshot context not in the current list.
2872 * And verify there are no changes to snapshots we already know
2873 * about.
2874 *
2875 * Assumes the snapshots in the snapshot context are sorted by
2876 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2877 * are also maintained in that order.)
dfc5606d 2878 */
304f6808 2879static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2880{
35938150
AE
2881 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2882 const u32 snap_count = snapc->num_snaps;
35938150
AE
2883 struct list_head *head = &rbd_dev->snaps;
2884 struct list_head *links = head->next;
2885 u32 index = 0;
dfc5606d 2886
9fcbb800 2887 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2888 while (index < snap_count || links != head) {
2889 u64 snap_id;
2890 struct rbd_snap *snap;
cd892126
AE
2891 char *snap_name;
2892 u64 snap_size = 0;
2893 u64 snap_features = 0;
dfc5606d 2894
35938150
AE
2895 snap_id = index < snap_count ? snapc->snaps[index]
2896 : CEPH_NOSNAP;
2897 snap = links != head ? list_entry(links, struct rbd_snap, node)
2898 : NULL;
aafb230e 2899 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2900
35938150
AE
2901 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2902 struct list_head *next = links->next;
dfc5606d 2903
35938150 2904 /* Existing snapshot not in the new snap context */
dfc5606d 2905
0d7dbfce 2906 if (rbd_dev->spec->snap_id == snap->id)
daba5fdb 2907 rbd_dev->exists = false;
41f38c2b 2908 rbd_remove_snap_dev(snap);
9fcbb800 2909 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
2910 rbd_dev->spec->snap_id == snap->id ?
2911 "mapped " : "",
9fcbb800 2912 (unsigned long long) snap->id);
35938150
AE
2913
2914 /* Done with this list entry; advance */
2915
2916 links = next;
dfc5606d
YS
2917 continue;
2918 }
35938150 2919
b8b1e2db
AE
2920 snap_name = rbd_dev_snap_info(rbd_dev, index,
2921 &snap_size, &snap_features);
cd892126
AE
2922 if (IS_ERR(snap_name))
2923 return PTR_ERR(snap_name);
2924
9fcbb800
AE
2925 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2926 (unsigned long long) snap_id);
35938150
AE
2927 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2928 struct rbd_snap *new_snap;
2929
2930 /* We haven't seen this snapshot before */
2931
c8d18425 2932 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2933 snap_id, snap_size, snap_features);
9fcbb800
AE
2934 if (IS_ERR(new_snap)) {
2935 int err = PTR_ERR(new_snap);
2936
2937 dout(" failed to add dev, error %d\n", err);
2938
2939 return err;
2940 }
35938150
AE
2941
2942 /* New goes before existing, or at end of list */
2943
9fcbb800 2944 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2945 if (snap)
2946 list_add_tail(&new_snap->node, &snap->node);
2947 else
523f3258 2948 list_add_tail(&new_snap->node, head);
35938150
AE
2949 } else {
2950 /* Already have this one */
2951
9fcbb800
AE
2952 dout(" already present\n");
2953
cd892126 2954 rbd_assert(snap->size == snap_size);
aafb230e 2955 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2956 rbd_assert(snap->features == snap_features);
35938150
AE
2957
2958 /* Done with this list entry; advance */
2959
2960 links = links->next;
dfc5606d 2961 }
35938150
AE
2962
2963 /* Advance to the next entry in the snapshot context */
2964
2965 index++;
dfc5606d 2966 }
9fcbb800 2967 dout("%s: done\n", __func__);
dfc5606d
YS
2968
2969 return 0;
2970}
2971
304f6808
AE
2972/*
2973 * Scan the list of snapshots and register the devices for any that
2974 * have not already been registered.
2975 */
2976static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2977{
2978 struct rbd_snap *snap;
2979 int ret = 0;
2980
2981 dout("%s called\n", __func__);
86ff77bb
AE
2982 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2983 return -EIO;
304f6808
AE
2984
2985 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2986 if (!rbd_snap_registered(snap)) {
2987 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2988 if (ret < 0)
2989 break;
2990 }
2991 }
2992 dout("%s: returning %d\n", __func__, ret);
2993
2994 return ret;
2995}
2996
dfc5606d
YS
/*
 * Register the rbd device with the driver core (under rbd_root_dev on
 * the rbd bus), creating its sysfs attributes.  Serialized against
 * other control operations by ctl_mutex.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
3016
dfc5606d
YS
/* Unregister the rbd device; rbd_dev_release() runs on last put. */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3021
59c2be1e
YS
/*
 * Establish the header-object watch.  -ERANGE indicates our cached
 * header version is stale, so refresh and retry until the watch
 * request succeeds or fails with a different error.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_dev_refresh(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
3037
e2839308 3038static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3039
3040/*
499afd5b
AE
3041 * Get a unique rbd identifier for the given new rbd_dev, and add
3042 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 3043 */
e2839308 3044static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 3045{
e2839308 3046 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
3047
3048 spin_lock(&rbd_dev_list_lock);
3049 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3050 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
3051 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3052 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 3053}
b7f23c36 3054
1ddbe94e 3055/*
499afd5b
AE
3056 * Remove an rbd_dev from the global list, and record that its
3057 * identifier is no longer in use.
1ddbe94e 3058 */
e2839308 3059static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 3060{
d184f6bf 3061 struct list_head *tmp;
de71a297 3062 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
3063 int max_id;
3064
aafb230e 3065 rbd_assert(rbd_id > 0);
499afd5b 3066
e2839308
AE
3067 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3068 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
3069 spin_lock(&rbd_dev_list_lock);
3070 list_del_init(&rbd_dev->node);
d184f6bf
AE
3071
3072 /*
3073 * If the id being "put" is not the current maximum, there
3074 * is nothing special we need to do.
3075 */
e2839308 3076 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
3077 spin_unlock(&rbd_dev_list_lock);
3078 return;
3079 }
3080
3081 /*
3082 * We need to update the current maximum id. Search the
3083 * list to find out what it is. We're more likely to find
3084 * the maximum at the end, so search the list backward.
3085 */
3086 max_id = 0;
3087 list_for_each_prev(tmp, &rbd_dev_list) {
3088 struct rbd_device *rbd_dev;
3089
3090 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
3091 if (rbd_dev->dev_id > max_id)
3092 max_id = rbd_dev->dev_id;
d184f6bf 3093 }
499afd5b 3094 spin_unlock(&rbd_dev_list_lock);
b7f23c36 3095
1ddbe94e 3096 /*
e2839308 3097 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
3098 * which case it now accurately reflects the new maximum.
3099 * Be careful not to overwrite the maximum value in that
3100 * case.
1ddbe94e 3101 */
e2839308
AE
3102 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3103 dout(" max dev id has been reset\n");
b7f23c36
AE
3104}
3105
e28fff26
AE
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in the "C" locale. */
	const char *spaces = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* Find start of token */
	*buf = p;

	return strcspn(p, spaces);	/* Return token length */
}
3124
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* Characters for which isspace() is nonzero in the "C" locale. */
	const char *spaces = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, spaces);	/* skip leading white space */
	len = strcspn(*buf, spaces);	/* token length */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3154
ea3352f4
AE
3155/*
3156 * Finds the next token in *buf, dynamically allocates a buffer big
3157 * enough to hold a copy of it, and copies the token into the new
3158 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3159 * that a duplicate buffer is created even for a zero-length token.
3160 *
3161 * Returns a pointer to the newly-allocated duplicate, or a null
3162 * pointer if memory for the duplicate was not available. If
3163 * the lenp argument is a non-null pointer, the length of the token
3164 * (not including the '\0') is returned in *lenp.
3165 *
3166 * If successful, the *buf pointer will be updated to point beyond
3167 * the end of the found token.
3168 *
3169 * Note: uses GFP_KERNEL for allocation.
3170 */
3171static inline char *dup_token(const char **buf, size_t *lenp)
3172{
3173 char *dup;
3174 size_t len;
3175
3176 len = next_token(buf);
4caf35f9 3177 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
3178 if (!dup)
3179 return NULL;
ea3352f4
AE
3180 *(dup + len) = '\0';
3181 *buf += len;
3182
3183 if (lenp)
3184 *lenp = len;
3185
3186 return dup;
3187}
3188
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;	/* Missing monitor address(es) */
	/*
	 * The monitor address list is not copied; it is passed to
	 * ceph_parse_options() below as a (start, end) pair.  The
	 * size includes one extra byte so the end pointer lands on
	 * the separator following the token.
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options)
		goto out_err;	/* Missing options */

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name)
		goto out_err;	/* Missing pool name */

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Copy len + 1 bytes, then overwrite the last with '\0' */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * ceph_parse_options() consumes ceph options from the
	 * options string and hands rbd-specific tokens to
	 * parse_rbd_opts_token(), which fills in rbd_opts.
	 */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: transfer ownership of all three results to caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/* kfree()/rbd_spec_put() tolerate NULL, so no guards needed */
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3324
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof includes the '\0', so this covers prefix + name + NUL */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Synchronously invoke the "get_id" class method on the id object */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a new allocation */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;	/* leave field unchanged */
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3401
a30b71b9
AE
3402static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3403{
3404 int ret;
3405 size_t size;
3406
3407 /* Version 1 images have no id; empty string is used */
3408
0d7dbfce
AE
3409 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3410 if (!rbd_dev->spec->image_id)
a30b71b9 3411 return -ENOMEM;
a30b71b9
AE
3412
3413 /* Record the header object name for this rbd image. */
3414
69e7a02f 3415 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
a30b71b9
AE
3416 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3417 if (!rbd_dev->header_name) {
3418 ret = -ENOMEM;
3419 goto out_err;
3420 }
0d7dbfce
AE
3421 sprintf(rbd_dev->header_name, "%s%s",
3422 rbd_dev->spec->image_name, RBD_SUFFIX);
a30b71b9
AE
3423
3424 /* Populate rbd image metadata */
3425
3426 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3427 if (ret < 0)
3428 goto out_err;
86b00e0d
AE
3429
3430 /* Version 1 images have no parent (no layering) */
3431
3432 rbd_dev->parent_spec = NULL;
3433 rbd_dev->parent_overlap = 0;
3434
a30b71b9
AE
3435 rbd_dev->image_format = 1;
3436
3437 dout("discovered version 1 image, header name is %s\n",
3438 rbd_dev->header_name);
3439
3440 return 0;
3441
3442out_err:
3443 kfree(rbd_dev->header_name);
3444 rbd_dev->header_name = NULL;
0d7dbfce
AE
3445 kfree(rbd_dev->spec->image_id);
3446 rbd_dev->spec->image_id = NULL;
a30b71b9
AE
3447
3448 return ret;
3449}
3450
/*
 * Finish probing an rbd image that is format 2.  The image id has
 * already been recorded by the caller.  Fetches the image's size,
 * object prefix, features, parent info (if layered), and snapshot
 * context, then marks the device as a format 2 image.  Returns 0
 * on success or a negative errno; on failure everything set up
 * here is torn down again.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Release, in reverse order, everything acquired above */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3523
/*
 * Complete initialization of a successfully probed rbd device:
 * build its snapshot list and mapping, allocate a device id and a
 * block major number, set up the gendisk, register with sysfs, and
 * finally announce the disk.  Returns 0 on success or a negative
 * errno, undoing all work done here on failure.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;	/* register_blkdev(0, ...) returns the major */

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3605
a30b71b9
AE
3606/*
3607 * Probe for the existence of the header object for the given rbd
3608 * device. For format 2 images this includes determining the image
3609 * id.
3610 */
3611static int rbd_dev_probe(struct rbd_device *rbd_dev)
3612{
3613 int ret;
3614
3615 /*
3616 * Get the id from the image id object. If it's not a
3617 * format 2 image, we'll get ENOENT back, and we'll assume
3618 * it's a format 1 image.
3619 */
3620 ret = rbd_dev_image_id(rbd_dev);
3621 if (ret)
3622 ret = rbd_dev_v1_probe(rbd_dev);
3623 else
3624 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 3625 if (ret) {
a30b71b9
AE
3626 dout("probe failed, returning %d\n", ret);
3627
83a06263
AE
3628 return ret;
3629 }
3630
3631 ret = rbd_dev_probe_finish(rbd_dev);
3632 if (ret)
3633 rbd_header_free(&rbd_dev->header);
3634
a30b71b9
AE
3635 return ret;
3636}
3637
/*
 * Handle a write to /sys/bus/rbd/add: parse the mapping request,
 * connect to the cluster, create the rbd_device, and probe/activate
 * the image.  Returns count on success or a negative errno.
 *
 * Ownership note: ceph_opts, rbdc, spec and rbd_opts are each set
 * to NULL once responsibility for freeing them passes to another
 * object, so the shared error path below frees only what this
 * function still owns.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the mapping */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;	/* non-negative rc is the pool id */

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3703
de71a297 3704static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3705{
3706 struct list_head *tmp;
3707 struct rbd_device *rbd_dev;
3708
e124a82f 3709 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3710 list_for_each(tmp, &rbd_dev_list) {
3711 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3712 if (rbd_dev->dev_id == dev_id) {
e124a82f 3713 spin_unlock(&rbd_dev_list_lock);
602adf40 3714 return rbd_dev;
e124a82f 3715 }
602adf40 3716 }
e124a82f 3717 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3718 return NULL;
3719}
3720
/*
 * Device release callback, run by the driver core once the last
 * reference to the rbd device is dropped (teardown is initiated by
 * rbd_bus_del_dev()).  Cancels the header watch, frees the block
 * device resources, and destroys the rbd_device itself.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop watching the header object before tearing anything down */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);


	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref (taken in rbd_add()) */
	module_put(THIS_MODULE);
}
3750
dfc5606d
YS
3751static ssize_t rbd_remove(struct bus_type *bus,
3752 const char *buf,
3753 size_t count)
602adf40
YS
3754{
3755 struct rbd_device *rbd_dev = NULL;
3756 int target_id, rc;
3757 unsigned long ul;
3758 int ret = count;
3759
3760 rc = strict_strtoul(buf, 10, &ul);
3761 if (rc)
3762 return rc;
3763
3764 /* convert to int; abort if we lost anything in the conversion */
3765 target_id = (int) ul;
3766 if (target_id != ul)
3767 return -EINVAL;
3768
3769 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3770
3771 rbd_dev = __rbd_get_dev(target_id);
3772 if (!rbd_dev) {
3773 ret = -ENOENT;
3774 goto done;
42382b70
AE
3775 }
3776
3777 if (rbd_dev->open_count) {
3778 ret = -EBUSY;
3779 goto done;
602adf40
YS
3780 }
3781
41f38c2b 3782 rbd_remove_all_snaps(rbd_dev);
dfc5606d 3783 rbd_bus_del_dev(rbd_dev);
602adf40
YS
3784
3785done:
3786 mutex_unlock(&ctl_mutex);
aafb230e 3787
602adf40
YS
3788 return ret;
3789}
3790
602adf40
YS
3791/*
3792 * create control files in sysfs
dfc5606d 3793 * /sys/bus/rbd/...
602adf40
YS
3794 */
3795static int rbd_sysfs_init(void)
3796{
dfc5606d 3797 int ret;
602adf40 3798
fed4c143 3799 ret = device_register(&rbd_root_dev);
21079786 3800 if (ret < 0)
dfc5606d 3801 return ret;
602adf40 3802
fed4c143
AE
3803 ret = bus_register(&rbd_bus_type);
3804 if (ret < 0)
3805 device_unregister(&rbd_root_dev);
602adf40 3806
602adf40
YS
3807 return ret;
3808}
3809
/* Tear down the sysfs control files, reversing rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3815
3816int __init rbd_init(void)
3817{
3818 int rc;
3819
3820 rc = rbd_sysfs_init();
3821 if (rc)
3822 return rc;
f0f8cef5 3823 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3824 return 0;
3825}
3826
/* Module exit: remove the sysfs control files. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3831
3832module_init(rbd_init);
3833module_exit(rbd_exit);
3834
3835MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3836MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3837MODULE_DESCRIPTION("rados block device");
3838
3839/* following authorship retained from original osdblk.c */
3840MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3841
3842MODULE_LICENSE("GPL");