]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - drivers/block/rbd.c
rbd: get image features for a v2 image
[mirror_ubuntu-bionic-kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
602adf40
YS
64#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_OPT_LEN 1024
66
67#define RBD_SNAP_HEAD_NAME "-"
68
1e130199
AE
69#define RBD_IMAGE_ID_LEN_MAX 64
70#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 71
81a89793
AE
72/*
73 * An RBD device name will be "rbd#", where the "rbd" comes from
74 * RBD_DRV_NAME above, and # is a unique integer identifier.
75 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
76 * enough to hold all possible device names.
77 */
602adf40 78#define DEV_NAME_LEN 32
81a89793 79#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 80
cc0538b6 81#define RBD_READ_ONLY_DEFAULT false
59c2be1e 82
602adf40
YS
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for on-disk data object names */
	u64 features;		/* feature bits; always 0 for v1 images */
	__u8 obj_order;		/* object size is (1 << obj_order) bytes */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* mapped image size, in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* packed, NUL-terminated snapshot names */
	u64 *snap_sizes;	/* per-snapshot image sizes */

	u64 obj_version;	/* header object version last read */
};
102
struct rbd_options {
	bool	read_only;	/* map the image read-only? */
};

/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* sharing refcount */
	struct list_head	node;	/* entry in rbd_client_list */
};

/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request completed */
	int rc;		/* completion result */
	u64 bytes;	/* bytes transferred */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;		/* number of sub-requests */
	int			num_done;	/* completed (reported) so far */
	struct kref		kref;
	struct rbd_req_status	status[0];	/* one slot per sub-request */
};

/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;
	int			coll_index;	/* index into coll->status[] */
	struct rbd_req_coll	*coll;
};

struct rbd_snap {
	struct device		dev;	/* sysfs representation */
	const char		*name;
	u64			size;
	struct list_head	node;	/* entry in rbd_dev->snaps */
	u64			id;
	u64			features;
};

/* Describes how the image is currently mapped */
struct rbd_mapping {
	char                    *snap_name;	/* RBD_SNAP_HEAD_NAME or snap */
	u64                     snap_id;	/* CEPH_NOSNAP for the head */
	u64                     size;
	u64                     features;
	bool                    snap_exists;	/* mapped snapshot still exists? */
	bool                    read_only;
};
164
602adf40
YS
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_options	rbd_opts;
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	char			*image_id;	/* v2 image id (NULL for v1) */
	size_t			image_id_len;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;	/* name of the header object */
	char			*pool_name;
	int			pool_id;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
207
602adf40 208static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 209
602adf40 210static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
211static DEFINE_SPINLOCK(rbd_dev_list_lock);
212
432b8587
AE
213static LIST_HEAD(rbd_client_list); /* clients */
214static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 215
304f6808
AE
216static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
217static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
218
dfc5606d 219static void rbd_dev_release(struct device *dev);
14e7085d 220static void __rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 221
f0f8cef5
AE
/* Forward declarations for the sysfs bus control files (defined later) */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* /sys/bus/rbd/{add,remove}: write-only (root) device control files */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/*
 * rbd_root_dev is statically allocated, so there is nothing to free;
 * the driver core just requires a non-NULL release callback.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device of all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
246
aafb230e
AE
#ifdef RBD_DEBUG
/*
 * rbd_assert(): report the failed expression and BUG() when it is
 * false.  Compiles to a no-op unless RBD_DEBUG is defined.
 */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 259
dfc5606d
YS
/* Take a reference on the rbd device via its embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
602adf40 269
1fe5e993 270static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 271
602adf40
YS
/*
 * Block device open: refuse writable opens of a read-only mapping,
 * pin the device, and propagate the ro flag to the block layer.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	/* Hold a device reference for the lifetime of the open */
	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
284
dfc5606d
YS
/* Block device release: drop the reference taken in rbd_open(). */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
293
602adf40
YS
294static const struct block_device_operations rbd_bd_ops = {
295 .owner = THIS_MODULE,
296 .open = rbd_open,
dfc5606d 297 .release = rbd_release,
602adf40
YS
298};
299
300/*
301 * Initialize an rbd client instance.
43ae4701 302 * We own *ceph_opts.
602adf40 303 */
f8c38929 304static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
305{
306 struct rbd_client *rbdc;
307 int ret = -ENOMEM;
308
309 dout("rbd_client_create\n");
310 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
311 if (!rbdc)
312 goto out_opt;
313
314 kref_init(&rbdc->kref);
315 INIT_LIST_HEAD(&rbdc->node);
316
bc534d86
AE
317 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
318
43ae4701 319 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 320 if (IS_ERR(rbdc->client))
bc534d86 321 goto out_mutex;
43ae4701 322 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
323
324 ret = ceph_open_session(rbdc->client);
325 if (ret < 0)
326 goto out_err;
327
432b8587 328 spin_lock(&rbd_client_list_lock);
602adf40 329 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 330 spin_unlock(&rbd_client_list_lock);
602adf40 331
bc534d86
AE
332 mutex_unlock(&ctl_mutex);
333
602adf40
YS
334 dout("rbd_client_create created %p\n", rbdc);
335 return rbdc;
336
337out_err:
338 ceph_destroy_client(rbdc->client);
bc534d86
AE
339out_mutex:
340 mutex_unlock(&ctl_mutex);
602adf40
YS
341 kfree(rbdc);
342out_opt:
43ae4701
AE
343 if (ceph_opts)
344 ceph_destroy_options(ceph_opts);
28f259b7 345 return ERR_PTR(ret);
602adf40
YS
346}
347
348/*
1f7ba331
AE
349 * Find a ceph client with specific addr and configuration. If
350 * found, bump its reference count.
602adf40 351 */
1f7ba331 352static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
353{
354 struct rbd_client *client_node;
1f7ba331 355 bool found = false;
602adf40 356
43ae4701 357 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
358 return NULL;
359
1f7ba331
AE
360 spin_lock(&rbd_client_list_lock);
361 list_for_each_entry(client_node, &rbd_client_list, node) {
362 if (!ceph_compare_options(ceph_opts, client_node->client)) {
363 kref_get(&client_node->kref);
364 found = true;
365 break;
366 }
367 }
368 spin_unlock(&rbd_client_list_lock);
369
370 return found ? client_node : NULL;
602adf40
YS
371}
372
59c2be1e
YS
/*
 * mount options
 *
 * Token values are range-encoded: tokens below Opt_last_int take an
 * integer argument, tokens between Opt_last_int and Opt_last_string
 * take a string argument, and tokens between Opt_last_string and
 * Opt_last_bool are Boolean flags.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
386
43ae4701 387static match_table_t rbd_opts_tokens = {
59c2be1e
YS
388 /* int args above */
389 /* string args above */
f84344f3 390 {Opt_read_only, "mapping.read_only"},
cc0538b6
AE
391 {Opt_read_only, "ro"}, /* Alternate spelling */
392 {Opt_read_write, "read_write"},
393 {Opt_read_write, "rw"}, /* Alternate spelling */
394 /* Boolean args above */
59c2be1e
YS
395 {-1, NULL}
396};
397
/*
 * Parse a single mount option token.  @private is the struct
 * rbd_options being filled in.  Returns 0 on success, -EINVAL for an
 * unrecognized token, or the match_int() error for a bad int arg.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The token value encodes its argument type (see enum above) */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() can only return tokens from the table */
		rbd_assert(false);
		break;
	}
	return 0;
}
438
602adf40
YS
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Parses @options (consuming rbd-specific tokens into
 * rbd_dev->rbd_opts), then either reuses a compatible existing client
 * or creates a new one.  On success rbd_dev->rbd_client is set and 0
 * is returned; on failure a negative errno.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
				       mon_addr + mon_addr_len,
				       parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
471
/*
 * Destroy ceph client
 *
 * kref release callback.  Takes rbd_client_list_lock itself to unlink
 * the client from the shared list before tearing it down.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
489
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.  Clears rbd_dev->rbd_client so the device no longer points at it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;
}
499
1fec7093
YS
/*
 * Destroy requests collection
 *
 * kref release callback for a struct rbd_req_coll.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
602adf40 511
a30b71b9
AE
512static bool rbd_image_format_valid(u32 image_format)
513{
514 return image_format == 1 || image_format == 2;
515}
516
8e94af8e
AE
/*
 * Sanity-check an on-disk (v1) image header before trusting its
 * contents: magic text must match and the snapshot counts/lengths
 * must not overflow size_t when used for allocations later.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
545
602adf40
YS
546/*
547 * Create a new header structure, translate header format from the on-disk
548 * header.
549 */
550static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 551 struct rbd_image_header_ondisk *ondisk)
602adf40 552{
ccece235 553 u32 snap_count;
58c17b0e 554 size_t len;
d2bb24e5 555 size_t size;
621901d6 556 u32 i;
602adf40 557
6a52325f
AE
558 memset(header, 0, sizeof (*header));
559
103a150f
AE
560 snap_count = le32_to_cpu(ondisk->snap_count);
561
58c17b0e
AE
562 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
563 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 564 if (!header->object_prefix)
602adf40 565 return -ENOMEM;
58c17b0e
AE
566 memcpy(header->object_prefix, ondisk->object_prefix, len);
567 header->object_prefix[len] = '\0';
00f1f36f 568
602adf40 569 if (snap_count) {
f785cc1d
AE
570 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
571
621901d6
AE
572 /* Save a copy of the snapshot names */
573
f785cc1d
AE
574 if (snap_names_len > (u64) SIZE_MAX)
575 return -EIO;
576 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 577 if (!header->snap_names)
6a52325f 578 goto out_err;
f785cc1d
AE
579 /*
580 * Note that rbd_dev_v1_header_read() guarantees
581 * the ondisk buffer we're working with has
582 * snap_names_len bytes beyond the end of the
583 * snapshot id array, this memcpy() is safe.
584 */
585 memcpy(header->snap_names, &ondisk->snaps[snap_count],
586 snap_names_len);
6a52325f 587
621901d6
AE
588 /* Record each snapshot's size */
589
d2bb24e5
AE
590 size = snap_count * sizeof (*header->snap_sizes);
591 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 592 if (!header->snap_sizes)
6a52325f 593 goto out_err;
621901d6
AE
594 for (i = 0; i < snap_count; i++)
595 header->snap_sizes[i] =
596 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 597 } else {
ccece235 598 WARN_ON(ondisk->snap_names_len);
602adf40
YS
599 header->snap_names = NULL;
600 header->snap_sizes = NULL;
601 }
849b4260 602
34b13184 603 header->features = 0; /* No features support in v1 images */
602adf40
YS
604 header->obj_order = ondisk->options.order;
605 header->crypt_type = ondisk->options.crypt_type;
606 header->comp_type = ondisk->options.comp_type;
6a52325f 607
621901d6
AE
608 /* Allocate and fill in the snapshot context */
609
f84344f3 610 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
611 size = sizeof (struct ceph_snap_context);
612 size += snap_count * sizeof (header->snapc->snaps[0]);
613 header->snapc = kzalloc(size, GFP_KERNEL);
614 if (!header->snapc)
615 goto out_err;
602adf40
YS
616
617 atomic_set(&header->snapc->nref, 1);
505cbb9b 618 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 619 header->snapc->num_snaps = snap_count;
621901d6
AE
620 for (i = 0; i < snap_count; i++)
621 header->snapc->snaps[i] =
622 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
623
624 return 0;
625
6a52325f 626out_err:
849b4260 627 kfree(header->snap_sizes);
ccece235 628 header->snap_sizes = NULL;
602adf40 629 kfree(header->snap_names);
ccece235 630 header->snap_names = NULL;
6a52325f
AE
631 kfree(header->object_prefix);
632 header->object_prefix = NULL;
ccece235 633
00f1f36f 634 return -ENOMEM;
602adf40
YS
635}
636
8836b995 637static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 638{
602adf40 639
e86924a8 640 struct rbd_snap *snap;
602adf40 641
e86924a8
AE
642 list_for_each_entry(snap, &rbd_dev->snaps, node) {
643 if (!strcmp(snap_name, snap->name)) {
644 rbd_dev->mapping.snap_id = snap->id;
645 rbd_dev->mapping.size = snap->size;
34b13184 646 rbd_dev->mapping.features = snap->features;
602adf40 647
e86924a8 648 return 0;
00f1f36f 649 }
00f1f36f 650 }
e86924a8 651
00f1f36f 652 return -ENOENT;
602adf40
YS
653}
654
5ed16177 655static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
602adf40 656{
78dc447d 657 int ret;
602adf40 658
4e1105a2 659 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 660 sizeof (RBD_SNAP_HEAD_NAME))) {
f84344f3 661 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
99c1f08f 662 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 663 rbd_dev->mapping.features = rbd_dev->header.features;
f84344f3
AE
664 rbd_dev->mapping.snap_exists = false;
665 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
e86924a8 666 ret = 0;
602adf40 667 } else {
8836b995 668 ret = snap_by_name(rbd_dev, snap_name);
602adf40
YS
669 if (ret < 0)
670 goto done;
f84344f3
AE
671 rbd_dev->mapping.snap_exists = true;
672 rbd_dev->mapping.read_only = true;
602adf40 673 }
4e1105a2 674 rbd_dev->mapping.snap_name = snap_name;
602adf40 675done:
602adf40
YS
676 return ret;
677}
678
/*
 * Free all dynamically-allocated fields of an in-memory header and
 * NULL them so a double free is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
690
65ccfe21 691static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 692{
65ccfe21
AE
693 char *name;
694 u64 segment;
695 int ret;
602adf40 696
65ccfe21
AE
697 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
698 if (!name)
699 return NULL;
700 segment = offset >> rbd_dev->header.obj_order;
701 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
702 rbd_dev->header.object_prefix, segment);
703 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
704 pr_err("error formatting segment name for #%llu (%d)\n",
705 segment, ret);
706 kfree(name);
707 name = NULL;
708 }
602adf40 709
65ccfe21
AE
710 return name;
711}
602adf40 712
65ccfe21
AE
713static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
714{
715 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 716
65ccfe21
AE
717 return offset & (segment_size - 1);
718}
719
720static u64 rbd_segment_length(struct rbd_device *rbd_dev,
721 u64 offset, u64 length)
722{
723 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
724
725 offset &= segment_size - 1;
726
aafb230e 727 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
728 if (offset + length > segment_size)
729 length = segment_size - offset;
730
731 return length;
602adf40
YS
732}
733
1fec7093
YS
734static int rbd_get_num_segments(struct rbd_image_header *header,
735 u64 ofs, u64 len)
736{
df111be6
AE
737 u64 start_seg;
738 u64 end_seg;
739
740 if (!len)
741 return 0;
742 if (len - 1 > U64_MAX - ofs)
743 return -ERANGE;
744
745 start_seg = ofs >> header->obj_order;
746 end_seg = (ofs + len - 1) >> header->obj_order;
747
1fec7093
YS
748 return end_seg - start_seg + 1;
749}
750
029bcbd8
JD
751/*
752 * returns the size of an object in the image
753 */
754static u64 rbd_obj_bytes(struct rbd_image_header *header)
755{
756 return 1 << header->obj_order;
757}
758
602adf40
YS
759/*
760 * bio helpers
761 */
762
763static void bio_chain_put(struct bio *chain)
764{
765 struct bio *tmp;
766
767 while (chain) {
768 tmp = chain;
769 chain = chain->bi_next;
770 bio_put(tmp);
771 }
772}
773
/*
 * zeros a bio chain, starting at specific offset
 *
 * Every byte at or beyond @start_ofs (measured from the start of the
 * chain) is cleared; data before it is left intact.  Used to fill
 * holes and the tails of short reads with zeroes.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero only the part past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
800
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until @len bytes are covered.  On return *old
 * points past the cloned region and *next at the first un-cloned bio
 * (or the second half of a split bio).  If the @len boundary falls
 * inside a bio, bio_split() is used and the pair is stored in *bp for
 * the caller (or the next call) to release.  Returns the cloned chain
 * or NULL on allocation failure.
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;
	int total = 0;

	/* Release any bio_pair left over from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		/* Append the clone to the new chain (tail is valid once
		 * new_chain is non-NULL) */
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	rbd_assert(total == len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
875
876/*
877 * helpers for osd request op vectors.
878 */
57cfc106
AE
879static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
880 int opcode, u32 payload_len)
602adf40 881{
57cfc106
AE
882 struct ceph_osd_req_op *ops;
883
884 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
885 if (!ops)
886 return NULL;
887
888 ops[0].op = opcode;
889
602adf40
YS
890 /*
891 * op extent offset and length will be set later on
892 * in calc_raw_layout()
893 */
57cfc106
AE
894 ops[0].payload_len = payload_len;
895
896 return ops;
602adf40
YS
897}
898
899static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
900{
901 kfree(ops);
902}
903
1fec7093
YS
/*
 * Complete sub-request @index of collection @coll with result @ret for
 * @len bytes.  Sub-requests may finish out of order; completion is
 * recorded in status[] and reported to the block layer strictly in
 * index order.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* Not part of a collection: finish the request directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes status[] updates and partial completion */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	/* Report every contiguously-completed sub-request, in order */
	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}

/* Complete the sub-request described by @req */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
947
602adf40
YS
/*
 * Send ceph osd request
 *
 * Builds and submits a single OSD request against @object_name.  If
 * @rbd_cb is NULL the call is synchronous: it waits for completion,
 * optionally reports the object version via @ver, and drops the
 * request.  Otherwise completion is handled by the callback.  When
 * @linger_req is non-NULL the request is flagged to linger (used for
 * watch requests) and returned through it.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Still complete the collection slot on failure */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* Each rbd object is laid out as a single stripe/object */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous mode: wait for completion and clean up */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1059
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests: decodes the reply,
 * zero-fills missing/short reads, completes the collection entry, and
 * releases the request and its private data.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* Reading a nonexistent object: a hole, return zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* Short read: zero the tail, report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}

/* Completion callback for requests whose reply needs no processing */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1104
602adf40
YS
1105/*
1106 * Do a synchronous ceph osd operation
1107 */
0ce1a794 1108static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1109 struct ceph_snap_context *snapc,
1110 u64 snapid,
602adf40 1111 int flags,
913d2fdc 1112 struct ceph_osd_req_op *ops,
aded07ea 1113 const char *object_name,
f8d4de6e
AE
1114 u64 ofs, u64 inbound_size,
1115 char *inbound,
59c2be1e
YS
1116 struct ceph_osd_request **linger_req,
1117 u64 *ver)
602adf40
YS
1118{
1119 int ret;
1120 struct page **pages;
1121 int num_pages;
913d2fdc 1122
aafb230e 1123 rbd_assert(ops != NULL);
602adf40 1124
f8d4de6e 1125 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1126 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1127 if (IS_ERR(pages))
1128 return PTR_ERR(pages);
602adf40 1129
0ce1a794 1130 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1131 object_name, ofs, inbound_size, NULL,
602adf40
YS
1132 pages, num_pages,
1133 flags,
1134 ops,
1fec7093 1135 NULL, 0,
59c2be1e
YS
1136 NULL,
1137 linger_req, ver);
602adf40 1138 if (ret < 0)
913d2fdc 1139 goto done;
602adf40 1140
f8d4de6e
AE
1141 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1142 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1143
602adf40
YS
1144done:
1145 ceph_release_page_vector(pages, num_pages);
1146 return ret;
1147}
1148
1149/*
1150 * Do an asynchronous ceph osd operation
1151 */
1152static int rbd_do_op(struct request *rq,
0ce1a794 1153 struct rbd_device *rbd_dev,
602adf40
YS
1154 struct ceph_snap_context *snapc,
1155 u64 snapid,
d1f57ea6 1156 int opcode, int flags,
602adf40 1157 u64 ofs, u64 len,
1fec7093
YS
1158 struct bio *bio,
1159 struct rbd_req_coll *coll,
1160 int coll_index)
602adf40
YS
1161{
1162 char *seg_name;
1163 u64 seg_ofs;
1164 u64 seg_len;
1165 int ret;
1166 struct ceph_osd_req_op *ops;
1167 u32 payload_len;
1168
65ccfe21 1169 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1170 if (!seg_name)
1171 return -ENOMEM;
65ccfe21
AE
1172 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1173 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40
YS
1174
1175 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1176
57cfc106
AE
1177 ret = -ENOMEM;
1178 ops = rbd_create_rw_ops(1, opcode, payload_len);
1179 if (!ops)
602adf40
YS
1180 goto done;
1181
1182 /* we've taken care of segment sizes earlier when we
1183 cloned the bios. We should never have a segment
1184 truncated at this point */
aafb230e 1185 rbd_assert(seg_len == len);
602adf40
YS
1186
1187 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1188 seg_name, seg_ofs, seg_len,
1189 bio,
1190 NULL, 0,
1191 flags,
1192 ops,
1fec7093 1193 coll, coll_index,
59c2be1e 1194 rbd_req_cb, 0, NULL);
11f77002
SW
1195
1196 rbd_destroy_ops(ops);
602adf40
YS
1197done:
1198 kfree(seg_name);
1199 return ret;
1200}
1201
1202/*
1203 * Request async osd write
1204 */
1205static int rbd_req_write(struct request *rq,
1206 struct rbd_device *rbd_dev,
1207 struct ceph_snap_context *snapc,
1208 u64 ofs, u64 len,
1fec7093
YS
1209 struct bio *bio,
1210 struct rbd_req_coll *coll,
1211 int coll_index)
602adf40
YS
1212{
1213 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1214 CEPH_OSD_OP_WRITE,
1215 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1fec7093 1216 ofs, len, bio, coll, coll_index);
602adf40
YS
1217}
1218
1219/*
1220 * Request async osd read
1221 */
1222static int rbd_req_read(struct request *rq,
1223 struct rbd_device *rbd_dev,
1224 u64 snapid,
1225 u64 ofs, u64 len,
1fec7093
YS
1226 struct bio *bio,
1227 struct rbd_req_coll *coll,
1228 int coll_index)
602adf40
YS
1229{
1230 return rbd_do_op(rq, rbd_dev, NULL,
b06e6a6b 1231 snapid,
602adf40
YS
1232 CEPH_OSD_OP_READ,
1233 CEPH_OSD_FLAG_READ,
1fec7093 1234 ofs, len, bio, coll, coll_index);
602adf40
YS
1235}
1236
1237/*
1238 * Request sync osd read
1239 */
0ce1a794 1240static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1241 u64 snapid,
aded07ea 1242 const char *object_name,
602adf40 1243 u64 ofs, u64 len,
59c2be1e
YS
1244 char *buf,
1245 u64 *ver)
602adf40 1246{
913d2fdc
AE
1247 struct ceph_osd_req_op *ops;
1248 int ret;
1249
1250 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1251 if (!ops)
1252 return -ENOMEM;
1253
1254 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1255 snapid,
602adf40 1256 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1257 ops, object_name, ofs, len, buf, NULL, ver);
1258 rbd_destroy_ops(ops);
1259
1260 return ret;
602adf40
YS
1261}
1262
1263/*
59c2be1e
YS
1264 * Request sync osd watch
1265 */
0ce1a794 1266static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1267 u64 ver,
7f0a24d8 1268 u64 notify_id)
59c2be1e
YS
1269{
1270 struct ceph_osd_req_op *ops;
11f77002
SW
1271 int ret;
1272
57cfc106
AE
1273 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1274 if (!ops)
1275 return -ENOMEM;
59c2be1e 1276
a71b891b 1277 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1278 ops[0].watch.cookie = notify_id;
1279 ops[0].watch.flag = 0;
1280
0ce1a794 1281 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1282 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1283 NULL, 0,
59c2be1e
YS
1284 CEPH_OSD_FLAG_READ,
1285 ops,
1fec7093 1286 NULL, 0,
59c2be1e
YS
1287 rbd_simple_req_cb, 0, NULL);
1288
1289 rbd_destroy_ops(ops);
1290 return ret;
1291}
1292
1293static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1294{
0ce1a794 1295 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1296 u64 hver;
13143d2d
SW
1297 int rc;
1298
0ce1a794 1299 if (!rbd_dev)
59c2be1e
YS
1300 return;
1301
bd919d45
AE
1302 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1303 rbd_dev->header_name, (unsigned long long) notify_id,
1304 (unsigned int) opcode);
1fe5e993 1305 rc = rbd_refresh_header(rbd_dev, &hver);
13143d2d 1306 if (rc)
f0f8cef5 1307 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1308 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1309
7f0a24d8 1310 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1311}
1312
1313/*
1314 * Request sync osd watch
1315 */
0e6f322d 1316static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1317{
1318 struct ceph_osd_req_op *ops;
0ce1a794 1319 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1320 int ret;
59c2be1e 1321
57cfc106
AE
1322 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1323 if (!ops)
1324 return -ENOMEM;
59c2be1e
YS
1325
1326 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1327 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1328 if (ret < 0)
1329 goto fail;
1330
0e6f322d 1331 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1332 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1333 ops[0].watch.flag = 1;
1334
0ce1a794 1335 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1336 CEPH_NOSNAP,
59c2be1e
YS
1337 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1338 ops,
0e6f322d
AE
1339 rbd_dev->header_name,
1340 0, 0, NULL,
0ce1a794 1341 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1342
1343 if (ret < 0)
1344 goto fail_event;
1345
1346 rbd_destroy_ops(ops);
1347 return 0;
1348
1349fail_event:
0ce1a794
AE
1350 ceph_osdc_cancel_event(rbd_dev->watch_event);
1351 rbd_dev->watch_event = NULL;
59c2be1e
YS
1352fail:
1353 rbd_destroy_ops(ops);
1354 return ret;
1355}
1356
79e3057c
YS
1357/*
1358 * Request sync osd unwatch
1359 */
070c633f 1360static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1361{
1362 struct ceph_osd_req_op *ops;
57cfc106 1363 int ret;
79e3057c 1364
57cfc106
AE
1365 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1366 if (!ops)
1367 return -ENOMEM;
79e3057c
YS
1368
1369 ops[0].watch.ver = 0;
0ce1a794 1370 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1371 ops[0].watch.flag = 0;
1372
0ce1a794 1373 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1374 CEPH_NOSNAP,
79e3057c
YS
1375 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1376 ops,
070c633f
AE
1377 rbd_dev->header_name,
1378 0, 0, NULL, NULL, NULL);
1379
79e3057c
YS
1380
1381 rbd_destroy_ops(ops);
0ce1a794
AE
1382 ceph_osdc_cancel_event(rbd_dev->watch_event);
1383 rbd_dev->watch_event = NULL;
79e3057c
YS
1384 return ret;
1385}
1386
602adf40 1387/*
3cb4a687 1388 * Synchronous osd object method call
602adf40 1389 */
0ce1a794 1390static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1391 const char *object_name,
1392 const char *class_name,
1393 const char *method_name,
3cb4a687
AE
1394 const char *outbound,
1395 size_t outbound_size,
f8d4de6e
AE
1396 char *inbound,
1397 size_t inbound_size,
3cb4a687 1398 int flags,
59c2be1e 1399 u64 *ver)
602adf40
YS
1400{
1401 struct ceph_osd_req_op *ops;
aded07ea
AE
1402 int class_name_len = strlen(class_name);
1403 int method_name_len = strlen(method_name);
3cb4a687 1404 int payload_size;
57cfc106
AE
1405 int ret;
1406
3cb4a687
AE
1407 /*
1408 * Any input parameters required by the method we're calling
1409 * will be sent along with the class and method names as
1410 * part of the message payload. That data and its size are
1411 * supplied via the indata and indata_len fields (named from
1412 * the perspective of the server side) in the OSD request
1413 * operation.
1414 */
1415 payload_size = class_name_len + method_name_len + outbound_size;
1416 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1417 if (!ops)
1418 return -ENOMEM;
602adf40 1419
aded07ea
AE
1420 ops[0].cls.class_name = class_name;
1421 ops[0].cls.class_len = (__u8) class_name_len;
1422 ops[0].cls.method_name = method_name;
1423 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1424 ops[0].cls.argc = 0;
3cb4a687
AE
1425 ops[0].cls.indata = outbound;
1426 ops[0].cls.indata_len = outbound_size;
602adf40 1427
0ce1a794 1428 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1429 CEPH_NOSNAP,
3cb4a687 1430 flags, ops,
f8d4de6e
AE
1431 object_name, 0, inbound_size, inbound,
1432 NULL, ver);
602adf40
YS
1433
1434 rbd_destroy_ops(ops);
1435
1436 dout("cls_exec returned %d\n", ret);
1437 return ret;
1438}
1439
1fec7093
YS
1440static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1441{
1442 struct rbd_req_coll *coll =
1443 kzalloc(sizeof(struct rbd_req_coll) +
1444 sizeof(struct rbd_req_status) * num_reqs,
1445 GFP_ATOMIC);
1446
1447 if (!coll)
1448 return NULL;
1449 coll->total = num_reqs;
1450 kref_init(&coll->kref);
1451 return coll;
1452}
1453
602adf40
YS
1454/*
1455 * block device queue callback
1456 */
1457static void rbd_rq_fn(struct request_queue *q)
1458{
1459 struct rbd_device *rbd_dev = q->queuedata;
1460 struct request *rq;
1461 struct bio_pair *bp = NULL;
1462
00f1f36f 1463 while ((rq = blk_fetch_request(q))) {
602adf40
YS
1464 struct bio *bio;
1465 struct bio *rq_bio, *next_bio = NULL;
1466 bool do_write;
bd919d45
AE
1467 unsigned int size;
1468 u64 op_size = 0;
602adf40 1469 u64 ofs;
1fec7093
YS
1470 int num_segs, cur_seg = 0;
1471 struct rbd_req_coll *coll;
d1d25646 1472 struct ceph_snap_context *snapc;
602adf40 1473
602adf40
YS
1474 dout("fetched request\n");
1475
1476 /* filter out block requests we don't understand */
1477 if ((rq->cmd_type != REQ_TYPE_FS)) {
1478 __blk_end_request_all(rq, 0);
00f1f36f 1479 continue;
602adf40
YS
1480 }
1481
1482 /* deduce our operation (read, write) */
1483 do_write = (rq_data_dir(rq) == WRITE);
1484
1485 size = blk_rq_bytes(rq);
593a9e7b 1486 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
602adf40 1487 rq_bio = rq->bio;
f84344f3 1488 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1489 __blk_end_request_all(rq, -EROFS);
00f1f36f 1490 continue;
602adf40
YS
1491 }
1492
1493 spin_unlock_irq(q->queue_lock);
1494
d1d25646 1495 down_read(&rbd_dev->header_rwsem);
e88a36ec 1496
f84344f3
AE
1497 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1498 !rbd_dev->mapping.snap_exists) {
e88a36ec 1499 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1500 dout("request for non-existent snapshot");
1501 spin_lock_irq(q->queue_lock);
1502 __blk_end_request_all(rq, -ENXIO);
1503 continue;
e88a36ec
JD
1504 }
1505
d1d25646
JD
1506 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1507
1508 up_read(&rbd_dev->header_rwsem);
1509
602adf40
YS
1510 dout("%s 0x%x bytes at 0x%llx\n",
1511 do_write ? "write" : "read",
bd919d45 1512 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1513
1fec7093 1514 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1515 if (num_segs <= 0) {
1516 spin_lock_irq(q->queue_lock);
1517 __blk_end_request_all(rq, num_segs);
1518 ceph_put_snap_context(snapc);
1519 continue;
1520 }
1fec7093
YS
1521 coll = rbd_alloc_coll(num_segs);
1522 if (!coll) {
1523 spin_lock_irq(q->queue_lock);
1524 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1525 ceph_put_snap_context(snapc);
00f1f36f 1526 continue;
1fec7093
YS
1527 }
1528
602adf40
YS
1529 do {
1530 /* a bio clone to be passed down to OSD req */
bd919d45 1531 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
65ccfe21 1532 op_size = rbd_segment_length(rbd_dev, ofs, size);
1fec7093 1533 kref_get(&coll->kref);
602adf40
YS
1534 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1535 op_size, GFP_ATOMIC);
1536 if (!bio) {
1fec7093
YS
1537 rbd_coll_end_req_index(rq, coll, cur_seg,
1538 -ENOMEM, op_size);
1539 goto next_seg;
602adf40
YS
1540 }
1541
1fec7093 1542
602adf40
YS
1543 /* init OSD command: write or read */
1544 if (do_write)
1545 rbd_req_write(rq, rbd_dev,
d1d25646 1546 snapc,
602adf40 1547 ofs,
1fec7093
YS
1548 op_size, bio,
1549 coll, cur_seg);
602adf40
YS
1550 else
1551 rbd_req_read(rq, rbd_dev,
f84344f3 1552 rbd_dev->mapping.snap_id,
602adf40 1553 ofs,
1fec7093
YS
1554 op_size, bio,
1555 coll, cur_seg);
602adf40 1556
1fec7093 1557next_seg:
602adf40
YS
1558 size -= op_size;
1559 ofs += op_size;
1560
1fec7093 1561 cur_seg++;
602adf40
YS
1562 rq_bio = next_bio;
1563 } while (size > 0);
1fec7093 1564 kref_put(&coll->kref, rbd_coll_release);
602adf40
YS
1565
1566 if (bp)
1567 bio_pair_release(bp);
602adf40 1568 spin_lock_irq(q->queue_lock);
d1d25646
JD
1569
1570 ceph_put_snap_context(snapc);
602adf40
YS
1571 }
1572}
1573
1574/*
1575 * a queue callback. Makes sure that we don't create a bio that spans across
1576 * multiple osd objects. One exception would be with a single page bios,
1577 * which we handle later at bio_chain_clone
1578 */
1579static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1580 struct bio_vec *bvec)
1581{
1582 struct rbd_device *rbd_dev = q->queuedata;
593a9e7b
AE
1583 unsigned int chunk_sectors;
1584 sector_t sector;
1585 unsigned int bio_sectors;
602adf40
YS
1586 int max;
1587
593a9e7b
AE
1588 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1589 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1590 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1591
602adf40 1592 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
593a9e7b 1593 + bio_sectors)) << SECTOR_SHIFT;
602adf40
YS
1594 if (max < 0)
1595 max = 0; /* bio_add cannot handle a negative return */
1596 if (max <= bvec->bv_len && bio_sectors == 0)
1597 return bvec->bv_len;
1598 return max;
1599}
1600
1601static void rbd_free_disk(struct rbd_device *rbd_dev)
1602{
1603 struct gendisk *disk = rbd_dev->disk;
1604
1605 if (!disk)
1606 return;
1607
602adf40
YS
1608 if (disk->flags & GENHD_FL_UP)
1609 del_gendisk(disk);
1610 if (disk->queue)
1611 blk_cleanup_queue(disk->queue);
1612 put_disk(disk);
1613}
1614
1615/*
4156d998
AE
1616 * Read the complete header for the given rbd device.
1617 *
1618 * Returns a pointer to a dynamically-allocated buffer containing
1619 * the complete and validated header. Caller can pass the address
1620 * of a variable that will be filled in with the version of the
1621 * header object at the time it was read.
1622 *
1623 * Returns a pointer-coded errno if a failure occurs.
602adf40 1624 */
4156d998
AE
1625static struct rbd_image_header_ondisk *
1626rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1627{
4156d998 1628 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1629 u32 snap_count = 0;
4156d998
AE
1630 u64 names_size = 0;
1631 u32 want_count;
1632 int ret;
602adf40 1633
00f1f36f 1634 /*
4156d998
AE
1635 * The complete header will include an array of its 64-bit
1636 * snapshot ids, followed by the names of those snapshots as
1637 * a contiguous block of NUL-terminated strings. Note that
1638 * the number of snapshots could change by the time we read
1639 * it in, in which case we re-read it.
00f1f36f 1640 */
4156d998
AE
1641 do {
1642 size_t size;
1643
1644 kfree(ondisk);
1645
1646 size = sizeof (*ondisk);
1647 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1648 size += names_size;
1649 ondisk = kmalloc(size, GFP_KERNEL);
1650 if (!ondisk)
1651 return ERR_PTR(-ENOMEM);
1652
1653 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1654 rbd_dev->header_name,
4156d998
AE
1655 0, size,
1656 (char *) ondisk, version);
1657
1658 if (ret < 0)
1659 goto out_err;
1660 if (WARN_ON((size_t) ret < size)) {
1661 ret = -ENXIO;
1662 pr_warning("short header read for image %s"
1663 " (want %zd got %d)\n",
1664 rbd_dev->image_name, size, ret);
1665 goto out_err;
1666 }
1667 if (!rbd_dev_ondisk_valid(ondisk)) {
1668 ret = -ENXIO;
1669 pr_warning("invalid header for image %s\n",
1670 rbd_dev->image_name);
1671 goto out_err;
81e759fb 1672 }
602adf40 1673
4156d998
AE
1674 names_size = le64_to_cpu(ondisk->snap_names_len);
1675 want_count = snap_count;
1676 snap_count = le32_to_cpu(ondisk->snap_count);
1677 } while (snap_count != want_count);
00f1f36f 1678
4156d998 1679 return ondisk;
00f1f36f 1680
4156d998
AE
1681out_err:
1682 kfree(ondisk);
1683
1684 return ERR_PTR(ret);
1685}
1686
1687/*
1688 * reload the ondisk the header
1689 */
1690static int rbd_read_header(struct rbd_device *rbd_dev,
1691 struct rbd_image_header *header)
1692{
1693 struct rbd_image_header_ondisk *ondisk;
1694 u64 ver = 0;
1695 int ret;
602adf40 1696
4156d998
AE
1697 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1698 if (IS_ERR(ondisk))
1699 return PTR_ERR(ondisk);
1700 ret = rbd_header_from_disk(header, ondisk);
1701 if (ret >= 0)
1702 header->obj_version = ver;
1703 kfree(ondisk);
1704
1705 return ret;
602adf40
YS
1706}
1707
dfc5606d
YS
1708static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1709{
1710 struct rbd_snap *snap;
a0593290 1711 struct rbd_snap *next;
dfc5606d 1712
a0593290 1713 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
14e7085d 1714 __rbd_remove_snap_dev(snap);
dfc5606d
YS
1715}
1716
602adf40
YS
1717/*
1718 * only read the first part of the ondisk header, without the snaps info
1719 */
b813623a 1720static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1721{
1722 int ret;
1723 struct rbd_image_header h;
602adf40
YS
1724
1725 ret = rbd_read_header(rbd_dev, &h);
1726 if (ret < 0)
1727 return ret;
1728
a51aa0c0
JD
1729 down_write(&rbd_dev->header_rwsem);
1730
9db4b3e3 1731 /* resized? */
f84344f3 1732 if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
474ef7ce
JD
1733 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1734
99c1f08f
AE
1735 if (size != (sector_t) rbd_dev->mapping.size) {
1736 dout("setting size to %llu sectors",
1737 (unsigned long long) size);
1738 rbd_dev->mapping.size = (u64) size;
1739 set_capacity(rbd_dev->disk, size);
1740 }
474ef7ce 1741 }
9db4b3e3 1742
849b4260 1743 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1744 kfree(rbd_dev->header.snap_sizes);
849b4260 1745 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1746 /* osd requests may still refer to snapc */
1747 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1748
b813623a
AE
1749 if (hver)
1750 *hver = h.obj_version;
a71b891b 1751 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1752 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1753 rbd_dev->header.snapc = h.snapc;
1754 rbd_dev->header.snap_names = h.snap_names;
1755 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1756 /* Free the extra copy of the object prefix */
1757 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1758 kfree(h.object_prefix);
1759
304f6808
AE
1760 ret = rbd_dev_snaps_update(rbd_dev);
1761 if (!ret)
1762 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1763
c666601a 1764 up_write(&rbd_dev->header_rwsem);
602adf40 1765
dfc5606d 1766 return ret;
602adf40
YS
1767}
1768
1fe5e993
AE
1769static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1770{
1771 int ret;
1772
1773 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1774 ret = __rbd_refresh_header(rbd_dev, hver);
1775 mutex_unlock(&ctl_mutex);
1776
1777 return ret;
1778}
1779
602adf40
YS
1780static int rbd_init_disk(struct rbd_device *rbd_dev)
1781{
1782 struct gendisk *disk;
1783 struct request_queue *q;
593a9e7b 1784 u64 segment_size;
602adf40 1785
602adf40 1786 /* create gendisk info */
602adf40
YS
1787 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1788 if (!disk)
1fcdb8aa 1789 return -ENOMEM;
602adf40 1790
f0f8cef5 1791 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1792 rbd_dev->dev_id);
602adf40
YS
1793 disk->major = rbd_dev->major;
1794 disk->first_minor = 0;
1795 disk->fops = &rbd_bd_ops;
1796 disk->private_data = rbd_dev;
1797
1798 /* init rq */
602adf40
YS
1799 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1800 if (!q)
1801 goto out_disk;
029bcbd8 1802
593a9e7b
AE
1803 /* We use the default size, but let's be explicit about it. */
1804 blk_queue_physical_block_size(q, SECTOR_SIZE);
1805
029bcbd8 1806 /* set io sizes to object size */
593a9e7b
AE
1807 segment_size = rbd_obj_bytes(&rbd_dev->header);
1808 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1809 blk_queue_max_segment_size(q, segment_size);
1810 blk_queue_io_min(q, segment_size);
1811 blk_queue_io_opt(q, segment_size);
029bcbd8 1812
602adf40
YS
1813 blk_queue_merge_bvec(q, rbd_merge_bvec);
1814 disk->queue = q;
1815
1816 q->queuedata = rbd_dev;
1817
1818 rbd_dev->disk = disk;
602adf40 1819
12f02944
AE
1820 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1821
602adf40 1822 return 0;
602adf40
YS
1823out_disk:
1824 put_disk(disk);
1fcdb8aa
AE
1825
1826 return -ENOMEM;
602adf40
YS
1827}
1828
dfc5606d
YS
1829/*
1830 sysfs
1831*/
1832
593a9e7b
AE
1833static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1834{
1835 return container_of(dev, struct rbd_device, dev);
1836}
1837
dfc5606d
YS
1838static ssize_t rbd_size_show(struct device *dev,
1839 struct device_attribute *attr, char *buf)
1840{
593a9e7b 1841 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1842 sector_t size;
1843
1844 down_read(&rbd_dev->header_rwsem);
1845 size = get_capacity(rbd_dev->disk);
1846 up_read(&rbd_dev->header_rwsem);
dfc5606d 1847
a51aa0c0 1848 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1849}
1850
34b13184
AE
1851/*
1852 * Note this shows the features for whatever's mapped, which is not
1853 * necessarily the base image.
1854 */
1855static ssize_t rbd_features_show(struct device *dev,
1856 struct device_attribute *attr, char *buf)
1857{
1858 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1859
1860 return sprintf(buf, "0x%016llx\n",
1861 (unsigned long long) rbd_dev->mapping.features);
1862}
1863
dfc5606d
YS
1864static ssize_t rbd_major_show(struct device *dev,
1865 struct device_attribute *attr, char *buf)
1866{
593a9e7b 1867 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1868
dfc5606d
YS
1869 return sprintf(buf, "%d\n", rbd_dev->major);
1870}
1871
1872static ssize_t rbd_client_id_show(struct device *dev,
1873 struct device_attribute *attr, char *buf)
602adf40 1874{
593a9e7b 1875 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1876
1dbb4399
AE
1877 return sprintf(buf, "client%lld\n",
1878 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1879}
1880
dfc5606d
YS
1881static ssize_t rbd_pool_show(struct device *dev,
1882 struct device_attribute *attr, char *buf)
602adf40 1883{
593a9e7b 1884 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
1885
1886 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1887}
1888
9bb2f334
AE
1889static ssize_t rbd_pool_id_show(struct device *dev,
1890 struct device_attribute *attr, char *buf)
1891{
1892 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1893
1894 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1895}
1896
dfc5606d
YS
1897static ssize_t rbd_name_show(struct device *dev,
1898 struct device_attribute *attr, char *buf)
1899{
593a9e7b 1900 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1901
0bed54dc 1902 return sprintf(buf, "%s\n", rbd_dev->image_name);
dfc5606d
YS
1903}
1904
589d30e0
AE
1905static ssize_t rbd_image_id_show(struct device *dev,
1906 struct device_attribute *attr, char *buf)
1907{
1908 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1909
1910 return sprintf(buf, "%s\n", rbd_dev->image_id);
1911}
1912
34b13184
AE
1913/*
1914 * Shows the name of the currently-mapped snapshot (or
1915 * RBD_SNAP_HEAD_NAME for the base image).
1916 */
dfc5606d
YS
1917static ssize_t rbd_snap_show(struct device *dev,
1918 struct device_attribute *attr,
1919 char *buf)
1920{
593a9e7b 1921 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1922
f84344f3 1923 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
dfc5606d
YS
1924}
1925
1926static ssize_t rbd_image_refresh(struct device *dev,
1927 struct device_attribute *attr,
1928 const char *buf,
1929 size_t size)
1930{
593a9e7b 1931 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 1932 int ret;
602adf40 1933
1fe5e993 1934 ret = rbd_refresh_header(rbd_dev, NULL);
b813623a
AE
1935
1936 return ret < 0 ? ret : size;
dfc5606d 1937}
602adf40 1938
dfc5606d 1939static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 1940static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
1941static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1942static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1943static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 1944static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 1945static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 1946static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
1947static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1948static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
dfc5606d
YS
1949
1950static struct attribute *rbd_attrs[] = {
1951 &dev_attr_size.attr,
34b13184 1952 &dev_attr_features.attr,
dfc5606d
YS
1953 &dev_attr_major.attr,
1954 &dev_attr_client_id.attr,
1955 &dev_attr_pool.attr,
9bb2f334 1956 &dev_attr_pool_id.attr,
dfc5606d 1957 &dev_attr_name.attr,
589d30e0 1958 &dev_attr_image_id.attr,
dfc5606d
YS
1959 &dev_attr_current_snap.attr,
1960 &dev_attr_refresh.attr,
dfc5606d
YS
1961 NULL
1962};
1963
1964static struct attribute_group rbd_attr_group = {
1965 .attrs = rbd_attrs,
1966};
1967
1968static const struct attribute_group *rbd_attr_groups[] = {
1969 &rbd_attr_group,
1970 NULL
1971};
1972
1973static void rbd_sysfs_dev_release(struct device *dev)
1974{
1975}
1976
1977static struct device_type rbd_device_type = {
1978 .name = "rbd",
1979 .groups = rbd_attr_groups,
1980 .release = rbd_sysfs_dev_release,
1981};
1982
1983
1984/*
1985 sysfs - snapshots
1986*/
1987
1988static ssize_t rbd_snap_size_show(struct device *dev,
1989 struct device_attribute *attr,
1990 char *buf)
1991{
1992 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1993
3591538f 1994 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
1995}
1996
1997static ssize_t rbd_snap_id_show(struct device *dev,
1998 struct device_attribute *attr,
1999 char *buf)
2000{
2001 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2002
3591538f 2003 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2004}
2005
34b13184
AE
2006static ssize_t rbd_snap_features_show(struct device *dev,
2007 struct device_attribute *attr,
2008 char *buf)
2009{
2010 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2011
2012 return sprintf(buf, "0x%016llx\n",
2013 (unsigned long long) snap->features);
2014}
2015
dfc5606d
YS
2016static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2017static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2018static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2019
2020static struct attribute *rbd_snap_attrs[] = {
2021 &dev_attr_snap_size.attr,
2022 &dev_attr_snap_id.attr,
34b13184 2023 &dev_attr_snap_features.attr,
dfc5606d
YS
2024 NULL,
2025};
2026
2027static struct attribute_group rbd_snap_attr_group = {
2028 .attrs = rbd_snap_attrs,
2029};
2030
2031static void rbd_snap_dev_release(struct device *dev)
2032{
2033 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2034 kfree(snap->name);
2035 kfree(snap);
2036}
2037
2038static const struct attribute_group *rbd_snap_attr_groups[] = {
2039 &rbd_snap_attr_group,
2040 NULL
2041};
2042
2043static struct device_type rbd_snap_device_type = {
2044 .groups = rbd_snap_attr_groups,
2045 .release = rbd_snap_dev_release,
2046};
2047
304f6808
AE
2048static bool rbd_snap_registered(struct rbd_snap *snap)
2049{
2050 bool ret = snap->dev.type == &rbd_snap_device_type;
2051 bool reg = device_is_registered(&snap->dev);
2052
2053 rbd_assert(!ret ^ reg);
2054
2055 return ret;
2056}
2057
14e7085d 2058static void __rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2059{
2060 list_del(&snap->node);
304f6808
AE
2061 if (device_is_registered(&snap->dev))
2062 device_unregister(&snap->dev);
dfc5606d
YS
2063}
2064
14e7085d 2065static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2066 struct device *parent)
2067{
2068 struct device *dev = &snap->dev;
2069 int ret;
2070
2071 dev->type = &rbd_snap_device_type;
2072 dev->parent = parent;
2073 dev->release = rbd_snap_dev_release;
2074 dev_set_name(dev, "snap_%s", snap->name);
304f6808
AE
2075 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2076
dfc5606d
YS
2077 ret = device_register(dev);
2078
2079 return ret;
2080}
2081
4e891e0a 2082static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2083 const char *snap_name,
34b13184
AE
2084 u64 snap_id, u64 snap_size,
2085 u64 snap_features)
dfc5606d 2086{
4e891e0a 2087 struct rbd_snap *snap;
dfc5606d 2088 int ret;
4e891e0a
AE
2089
2090 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2091 if (!snap)
4e891e0a
AE
2092 return ERR_PTR(-ENOMEM);
2093
2094 ret = -ENOMEM;
c8d18425 2095 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2096 if (!snap->name)
2097 goto err;
2098
c8d18425
AE
2099 snap->id = snap_id;
2100 snap->size = snap_size;
34b13184 2101 snap->features = snap_features;
4e891e0a
AE
2102
2103 return snap;
2104
dfc5606d
YS
2105err:
2106 kfree(snap->name);
2107 kfree(snap);
4e891e0a
AE
2108
2109 return ERR_PTR(ret);
dfc5606d
YS
2110}
2111
cd892126
AE
2112static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2113 u64 *snap_size, u64 *snap_features)
2114{
2115 char *snap_name;
2116
2117 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2118
2119 *snap_size = rbd_dev->header.snap_sizes[which];
2120 *snap_features = 0; /* No features for v1 */
2121
2122 /* Skip over names until we find the one we are looking for */
2123
2124 snap_name = rbd_dev->header.snap_names;
2125 while (which--)
2126 snap_name += strlen(snap_name) + 1;
2127
2128 return snap_name;
2129}
2130
9d475de5
AE
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Issues a synchronous "get_size" class method call against the
 * image's header object and decodes the little-endian reply into
 * *order and *snap_size.  Returns 0 on success or a negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Layout must match the on-wire reply exactly, hence packed */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2164
2165static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2166{
2167 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2168 &rbd_dev->header.obj_order,
2169 &rbd_dev->header.image_size);
2170}
2171
1e130199
AE
/*
 * Fetch the object prefix (a.k.a. block_name, the leading component
 * of data object names) for a format 2 image and record it in
 * rbd_dev->header.object_prefix.  Returns 0 or a negative errno;
 * on failure the field is left NULL.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/*
	 * Decode the length-prefixed string into a fresh allocation
	 * (freed later via kfree() of header.object_prefix).
	 */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2208
b1b5402a
AE
/*
 * Fetch the feature bits for an image snapshot (or for the base
 * image if snap_id is CEPH_NOSNAP) via the "get_features" class
 * method.  Only the compatible feature mask is returned through
 * *snap_features; the incompatible mask is logged via dout() only.
 * Returns 0 on success or a negative errno.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Two same-sized members, so no padding; packed not required */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2236
2237static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2238{
2239 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2240 &rbd_dev->header.features);
2241}
2242
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;		/* position in the new snapshot context */

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/*
	 * Walk both ordered sequences in parallel until both the
	 * context entries and the existing list are exhausted.
	 */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		/* CEPH_NOSNAP / NULL act as sentinels for an exhausted side */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/*
			 * snap is non-NULL here: the loop condition
			 * guarantees links != head when the context side
			 * is exhausted.
			 */
			if (rbd_dev->mapping.snap_id == snap->id)
				rbd_dev->mapping.snap_exists = false;
			__rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->mapping.snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_v1_snap_info(rbd_dev, index,
						&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		/*
		 * NOTE(review): this dout prints snap_count, not the
		 * current index — looks like it was meant to be "index".
		 */
		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			/*
			 * NOTE(review): the " at end\n" argument embeds a
			 * second newline into this dout line.
			 */
			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* Snapshots are immutable: id match implies full match */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2347
304f6808
AE
2348/*
2349 * Scan the list of snapshots and register the devices for any that
2350 * have not already been registered.
2351 */
2352static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2353{
2354 struct rbd_snap *snap;
2355 int ret = 0;
2356
2357 dout("%s called\n", __func__);
86ff77bb
AE
2358 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2359 return -EIO;
304f6808
AE
2360
2361 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2362 if (!rbd_snap_registered(snap)) {
2363 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2364 if (ret < 0)
2365 break;
2366 }
2367 }
2368 dout("%s: returning %d\n", __func__, ret);
2369
2370 return ret;
2371}
2372
dfc5606d
YS
/*
 * Register the rbd device in sysfs under the rbd bus; its name is
 * the numeric device id.  rbd_dev_release() is installed as the
 * release callback, so final cleanup happens when the last sysfs
 * reference is dropped.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2392
dfc5606d
YS
/*
 * Remove the rbd device from sysfs.  The rest of the teardown is
 * done by the device's release callback (rbd_dev_release) once the
 * last reference goes away.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2397
59c2be1e
YS
2398static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2399{
2400 int ret, rc;
2401
2402 do {
0e6f322d 2403 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2404 if (ret == -ERANGE) {
1fe5e993 2405 rc = rbd_refresh_header(rbd_dev, NULL);
59c2be1e
YS
2406 if (rc < 0)
2407 return rc;
2408 }
2409 } while (ret == -ERANGE);
2410
2411 return ret;
2412}
2413
/* Highest device id handed out so far; ids start at 1 (see rbd_dev_id_get) */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
2415
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() keeps ids unique without holding a lock */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 2430
1ddbe94e 2431/*
499afd5b
AE
2432 * Remove an rbd_dev from the global list, and record that its
2433 * identifier is no longer in use.
1ddbe94e 2434 */
e2839308 2435static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2436{
d184f6bf 2437 struct list_head *tmp;
de71a297 2438 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2439 int max_id;
2440
aafb230e 2441 rbd_assert(rbd_id > 0);
499afd5b 2442
e2839308
AE
2443 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2444 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
2445 spin_lock(&rbd_dev_list_lock);
2446 list_del_init(&rbd_dev->node);
d184f6bf
AE
2447
2448 /*
2449 * If the id being "put" is not the current maximum, there
2450 * is nothing special we need to do.
2451 */
e2839308 2452 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
2453 spin_unlock(&rbd_dev_list_lock);
2454 return;
2455 }
2456
2457 /*
2458 * We need to update the current maximum id. Search the
2459 * list to find out what it is. We're more likely to find
2460 * the maximum at the end, so search the list backward.
2461 */
2462 max_id = 0;
2463 list_for_each_prev(tmp, &rbd_dev_list) {
2464 struct rbd_device *rbd_dev;
2465
2466 rbd_dev = list_entry(tmp, struct rbd_device, node);
2467 if (rbd_id > max_id)
2468 max_id = rbd_id;
2469 }
499afd5b 2470 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2471
1ddbe94e 2472 /*
e2839308 2473 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
2474 * which case it now accurately reflects the new maximum.
2475 * Be careful not to overwrite the maximum value in that
2476 * case.
1ddbe94e 2477 */
e2839308
AE
2478 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2479 dout(" max dev id has been reset\n");
b7f23c36
AE
2480}
2481
e28fff26
AE
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char blanks[] = " \f\n\r\t\v";
	size_t skipped = strspn(*buf, blanks);

	*buf += skipped;		/* Find start of token */

	return strcspn(*buf, blanks);	/* Return token length */
}
2500
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit (in which case nothing is
 * copied).
 *
 * The *buf pointer is advanced past the found token even when the
 * token buffer is too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';	/* always terminate what we copy */
	}
	*buf += len;

	return len;
}
2530
ea3352f4
AE
2531/*
2532 * Finds the next token in *buf, dynamically allocates a buffer big
2533 * enough to hold a copy of it, and copies the token into the new
2534 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2535 * that a duplicate buffer is created even for a zero-length token.
2536 *
2537 * Returns a pointer to the newly-allocated duplicate, or a null
2538 * pointer if memory for the duplicate was not available. If
2539 * the lenp argument is a non-null pointer, the length of the token
2540 * (not including the '\0') is returned in *lenp.
2541 *
2542 * If successful, the *buf pointer will be updated to point beyond
2543 * the end of the found token.
2544 *
2545 * Note: uses GFP_KERNEL for allocation.
2546 */
2547static inline char *dup_token(const char **buf, size_t *lenp)
2548{
2549 char *dup;
2550 size_t len;
2551
2552 len = next_token(buf);
2553 dup = kmalloc(len + 1, GFP_KERNEL);
2554 if (!dup)
2555 return NULL;
2556
2557 memcpy(dup, *buf, len);
2558 *(dup + len) = '\0';
2559 *buf += len;
2560
2561 if (lenp)
2562 *lenp = len;
2563
2564 return dup;
2565}
2566
/*
 * Parse the argument string written to /sys/bus/rbd/add:
 *
 *	<mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 *
 * Fills in the pool_name, image_name and image_name_len fields of
 * the given rbd_dev, points *mon_addrs (and *mon_addrs_size, which
 * includes room for a terminating '\0') at the monitor address
 * token inside buf, and copies the options token into the caller's
 * options buffer.  Returns a pointer to a dynamically-allocated
 * copy of the snapshot name to map (RBD_SNAP_HEAD_NAME if none was
 * supplied), or a pointer-coded error otherwise.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);	/* bad syntax until proven otherwise */
	char *snap_name;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;	/* +1 leaves room for '\0' */
	*mon_addrs = buf;		/* points into caller's buf, not a copy */

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;		/* missing or over-long options token */

	/* From here on, failures are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);

	return snap_name;

out_err:
	/* Undo any fields we set, so the caller sees a clean rbd_dev */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2636
589d30e0
AE
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;	/* sizeof includes '\0' */
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* The extracted string is a fresh allocation (kfree'd at teardown) */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;	/* keep the field NULL on error */
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2705
a30b71b9
AE
/*
 * Probe the device as a format 1 (original) rbd image: record an
 * empty image id, derive the header object name
 * ("<image_name>" RBD_SUFFIX), and read in the image header.
 * Returns 0 on success or a negative errno, with all fields set
 * here undone on failure.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->image_id)
		return -ENOMEM;
	rbd_dev->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);	/* sizeof covers '\0' */
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_id);
	rbd_dev->image_id = NULL;

	return ret;
}
2748
/*
 * Probe the device as a format 2 image: derive the header object
 * name from the image id, then fetch the size/order, object prefix
 * and feature bits from the OSD.  v2 support is still incomplete,
 * so even a fully successful probe currently returns -ENOTSUPP.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	/* Deliberate: v2 images can be probed but not yet mapped */
	return -ENOTSUPP;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
2796
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
2821
59c2be1e
YS
/*
 * Handle a write to /sys/bus/rbd/add: parse the mapping arguments,
 * connect to the monitors, probe the image, and set up the block
 * device and its sysfs presence.  Returns count on success or a
 * negative errno, unwinding everything on failure.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;
	char *snap_name;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* count bounds the options token, so this is always big enough */
	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_out_mem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_mem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* parse add command */
	snap_name = rbd_add_parse_args(rbd_dev, buf,
				&mon_addrs, &mon_addrs_size, options, count);
	if (IS_ERR(snap_name)) {
		rc = PTR_ERR(snap_name);
		goto err_out_mem;
	}

	/*
	 * NOTE(review): if any step between here and
	 * rbd_dev_set_mapping() fails, snap_name does not appear to
	 * be freed (err_out_header only frees mapping.snap_name) —
	 * confirm ownership transfer.
	 */
	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_out_args;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_client;
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_header;

	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
	if (rc)
		goto err_out_header;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;	/* register_blkdev(0, ...) returns the major */

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	/*
	 * NOTE(review): "options" does not appear to be freed on this
	 * success path — possible leak; verify against rbd_get_client()
	 * ownership.
	 */
	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_header:
	rbd_header_free(&rbd_dev->header);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
	kfree(rbd_dev->image_id);
err_out_args:
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
err_out_mem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2964
de71a297 2965static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
2966{
2967 struct list_head *tmp;
2968 struct rbd_device *rbd_dev;
2969
e124a82f 2970 spin_lock(&rbd_dev_list_lock);
602adf40
YS
2971 list_for_each(tmp, &rbd_dev_list) {
2972 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 2973 if (rbd_dev->dev_id == dev_id) {
e124a82f 2974 spin_unlock(&rbd_dev_list_lock);
602adf40 2975 return rbd_dev;
e124a82f 2976 }
602adf40 2977 }
e124a82f 2978 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
2979 return NULL;
2980}
2981
/*
 * Release callback for an rbd device (installed by
 * rbd_bus_add_dev()).  Runs when the last sysfs reference is
 * dropped: tears down the header watch, drops the ceph client,
 * frees the disk and block major, then releases the header fields,
 * name strings, device id, and the rbd_dev itself.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop watching the header object before tearing anything down */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3016
dfc5606d
YS
/*
 * Handle a write to /sys/bus/rbd/remove: parse the target device
 * id, look the device up, and tear it down — snapshot devices
 * first, then the device itself via the sysfs release path
 * (rbd_bus_del_dev / rbd_dev_release).
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;	/* sysfs convention: consumed byte count on success */

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
3051
602adf40
YS
/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	/* Roll back the root device if bus registration fails */
	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}
3070
/* Undo rbd_sysfs_init(): unregister the bus, then the root device */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3076
/* Module entry point: set up the /sys/bus/rbd control interface */
int __init rbd_init(void)
{
	int rc;

	rc = rbd_sysfs_init();
	if (rc)
		return rc;
	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
	return 0;
}
3087
/* Module exit point: remove the sysfs presence set up in rbd_init() */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");