]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - drivers/block/rbd.c
libceph: reformat __reset_osd()
[mirror_ubuntu-jammy-kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
d4b125e9
AE
64#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
35d489f9 68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
69#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
9e15b77d
AE
73/* This allows a single page to hold an image name sent by OSD */
74#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 75#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 76
1e130199 77#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 78
d889140c
AE
79/* Feature bits */
80
81#define RBD_FEATURE_LAYERING 1
82
83/* Features supported by this (client software) implementation. */
84
85#define RBD_FEATURES_ALL (0)
86
81a89793
AE
87/*
88 * An RBD device name will be "rbd#", where the "rbd" comes from
89 * RBD_DRV_NAME above, and # is a unique integer identifier.
90 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
91 * enough to hold all possible device names.
92 */
602adf40 93#define DEV_NAME_LEN 32
81a89793 94#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 95
cc0538b6 96#define RBD_READ_ONLY_DEFAULT false
59c2be1e 97
602adf40
YS
98/*
99 * block device image metadata (in-memory version)
100 */
101struct rbd_image_header {
f84344f3 102 /* These four fields never change for a given rbd image */
849b4260 103 char *object_prefix;
34b13184 104 u64 features;
602adf40
YS
105 __u8 obj_order;
106 __u8 crypt_type;
107 __u8 comp_type;
602adf40 108
f84344f3
AE
109 /* The remaining fields need to be updated occasionally */
110 u64 image_size;
111 struct ceph_snap_context *snapc;
602adf40
YS
112 char *snap_names;
113 u64 *snap_sizes;
59c2be1e
YS
114
115 u64 obj_version;
116};
117
0d7dbfce
AE
118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122 * identify an image.
123 */
124struct rbd_spec {
125 u64 pool_id;
126 char *pool_name;
127
128 char *image_id;
129 size_t image_id_len;
130 char *image_name;
131 size_t image_name_len;
132
133 u64 snap_id;
134 char *snap_name;
135
136 struct kref kref;
137};
138
59c2be1e 139struct rbd_options {
cc0538b6 140 bool read_only;
602adf40
YS
141};
142
143/*
f0f8cef5 144 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
145 */
146struct rbd_client {
147 struct ceph_client *client;
148 struct kref kref;
149 struct list_head node;
150};
151
152/*
f0f8cef5 153 * a request completion status
602adf40 154 */
1fec7093
YS
155struct rbd_req_status {
156 int done;
157 int rc;
158 u64 bytes;
159};
160
161/*
162 * a collection of requests
163 */
164struct rbd_req_coll {
165 int total;
166 int num_done;
167 struct kref kref;
168 struct rbd_req_status status[0];
602adf40
YS
169};
170
f0f8cef5
AE
171/*
172 * a single io request
173 */
174struct rbd_request {
175 struct request *rq; /* blk layer request */
176 struct bio *bio; /* cloned bio */
177 struct page **pages; /* list of used pages */
178 u64 len;
179 int coll_index;
180 struct rbd_req_coll *coll;
181};
182
dfc5606d
YS
183struct rbd_snap {
184 struct device dev;
185 const char *name;
3591538f 186 u64 size;
dfc5606d
YS
187 struct list_head node;
188 u64 id;
34b13184 189 u64 features;
dfc5606d
YS
190};
191
f84344f3 192struct rbd_mapping {
99c1f08f 193 u64 size;
34b13184 194 u64 features;
f84344f3
AE
195 bool read_only;
196};
197
602adf40
YS
198/*
199 * a single device
200 */
201struct rbd_device {
de71a297 202 int dev_id; /* blkdev unique id */
602adf40
YS
203
204 int major; /* blkdev assigned major */
205 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 206
a30b71b9 207 u32 image_format; /* Either 1 or 2 */
602adf40
YS
208 struct rbd_client *rbd_client;
209
210 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
211
212 spinlock_t lock; /* queue lock */
213
214 struct rbd_image_header header;
daba5fdb 215 bool exists;
0d7dbfce 216 struct rbd_spec *spec;
602adf40 217
0d7dbfce 218 char *header_name;
971f839a 219
59c2be1e
YS
220 struct ceph_osd_event *watch_event;
221 struct ceph_osd_request *watch_request;
222
86b00e0d
AE
223 struct rbd_spec *parent_spec;
224 u64 parent_overlap;
225
c666601a
JD
226 /* protects updating the header */
227 struct rw_semaphore header_rwsem;
f84344f3
AE
228
229 struct rbd_mapping mapping;
602adf40
YS
230
231 struct list_head node;
dfc5606d
YS
232
233 /* list of snapshots */
234 struct list_head snaps;
235
236 /* sysfs related */
237 struct device dev;
42382b70 238 unsigned long open_count;
dfc5606d
YS
239};
240
602adf40 241static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 242
602adf40 243static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
244static DEFINE_SPINLOCK(rbd_dev_list_lock);
245
432b8587
AE
246static LIST_HEAD(rbd_client_list); /* clients */
247static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 248
304f6808
AE
249static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
250static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
251
dfc5606d 252static void rbd_dev_release(struct device *dev);
41f38c2b 253static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 254
f0f8cef5
AE
255static ssize_t rbd_add(struct bus_type *bus, const char *buf,
256 size_t count);
257static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
258 size_t count);
259
260static struct bus_attribute rbd_bus_attrs[] = {
261 __ATTR(add, S_IWUSR, NULL, rbd_add),
262 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
263 __ATTR_NULL
264};
265
266static struct bus_type rbd_bus_type = {
267 .name = "rbd",
268 .bus_attrs = rbd_bus_attrs,
269};
270
271static void rbd_root_dev_release(struct device *dev)
272{
273}
274
275static struct device rbd_root_dev = {
276 .init_name = "rbd",
277 .release = rbd_root_dev_release,
278};
279
aafb230e
AE
280#ifdef RBD_DEBUG
281#define rbd_assert(expr) \
282 if (unlikely(!(expr))) { \
283 printk(KERN_ERR "\nAssertion failure in %s() " \
284 "at line %d:\n\n" \
285 "\trbd_assert(%s);\n\n", \
286 __func__, __LINE__, #expr); \
287 BUG(); \
288 }
289#else /* !RBD_DEBUG */
290# define rbd_assert(expr) ((void) 0)
291#endif /* !RBD_DEBUG */
dfc5606d 292
117973fb
AE
293static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
294static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 295
602adf40
YS
296static int rbd_open(struct block_device *bdev, fmode_t mode)
297{
f0f8cef5 298 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 299
f84344f3 300 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
301 return -EROFS;
302
42382b70 303 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
c3e946ce 304 (void) get_device(&rbd_dev->dev);
f84344f3 305 set_device_ro(bdev, rbd_dev->mapping.read_only);
42382b70
AE
306 rbd_dev->open_count++;
307 mutex_unlock(&ctl_mutex);
340c7a2b 308
602adf40
YS
309 return 0;
310}
311
dfc5606d
YS
312static int rbd_release(struct gendisk *disk, fmode_t mode)
313{
314 struct rbd_device *rbd_dev = disk->private_data;
315
42382b70
AE
316 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
317 rbd_assert(rbd_dev->open_count > 0);
318 rbd_dev->open_count--;
c3e946ce 319 put_device(&rbd_dev->dev);
42382b70 320 mutex_unlock(&ctl_mutex);
dfc5606d
YS
321
322 return 0;
323}
324
602adf40
YS
325static const struct block_device_operations rbd_bd_ops = {
326 .owner = THIS_MODULE,
327 .open = rbd_open,
dfc5606d 328 .release = rbd_release,
602adf40
YS
329};
330
331/*
332 * Initialize an rbd client instance.
43ae4701 333 * We own *ceph_opts.
602adf40 334 */
f8c38929 335static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
336{
337 struct rbd_client *rbdc;
338 int ret = -ENOMEM;
339
340 dout("rbd_client_create\n");
341 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
342 if (!rbdc)
343 goto out_opt;
344
345 kref_init(&rbdc->kref);
346 INIT_LIST_HEAD(&rbdc->node);
347
bc534d86
AE
348 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
349
43ae4701 350 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 351 if (IS_ERR(rbdc->client))
bc534d86 352 goto out_mutex;
43ae4701 353 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
354
355 ret = ceph_open_session(rbdc->client);
356 if (ret < 0)
357 goto out_err;
358
432b8587 359 spin_lock(&rbd_client_list_lock);
602adf40 360 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 361 spin_unlock(&rbd_client_list_lock);
602adf40 362
bc534d86
AE
363 mutex_unlock(&ctl_mutex);
364
602adf40
YS
365 dout("rbd_client_create created %p\n", rbdc);
366 return rbdc;
367
368out_err:
369 ceph_destroy_client(rbdc->client);
bc534d86
AE
370out_mutex:
371 mutex_unlock(&ctl_mutex);
602adf40
YS
372 kfree(rbdc);
373out_opt:
43ae4701
AE
374 if (ceph_opts)
375 ceph_destroy_options(ceph_opts);
28f259b7 376 return ERR_PTR(ret);
602adf40
YS
377}
378
379/*
1f7ba331
AE
380 * Find a ceph client with specific addr and configuration. If
381 * found, bump its reference count.
602adf40 382 */
1f7ba331 383static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
384{
385 struct rbd_client *client_node;
1f7ba331 386 bool found = false;
602adf40 387
43ae4701 388 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
389 return NULL;
390
1f7ba331
AE
391 spin_lock(&rbd_client_list_lock);
392 list_for_each_entry(client_node, &rbd_client_list, node) {
393 if (!ceph_compare_options(ceph_opts, client_node->client)) {
394 kref_get(&client_node->kref);
395 found = true;
396 break;
397 }
398 }
399 spin_unlock(&rbd_client_list_lock);
400
401 return found ? client_node : NULL;
602adf40
YS
402}
403
59c2be1e
YS
404/*
405 * mount options
406 */
407enum {
59c2be1e
YS
408 Opt_last_int,
409 /* int args above */
410 Opt_last_string,
411 /* string args above */
cc0538b6
AE
412 Opt_read_only,
413 Opt_read_write,
414 /* Boolean args above */
415 Opt_last_bool,
59c2be1e
YS
416};
417
43ae4701 418static match_table_t rbd_opts_tokens = {
59c2be1e
YS
419 /* int args above */
420 /* string args above */
be466c1c 421 {Opt_read_only, "read_only"},
cc0538b6
AE
422 {Opt_read_only, "ro"}, /* Alternate spelling */
423 {Opt_read_write, "read_write"},
424 {Opt_read_write, "rw"}, /* Alternate spelling */
425 /* Boolean args above */
59c2be1e
YS
426 {-1, NULL}
427};
428
429static int parse_rbd_opts_token(char *c, void *private)
430{
43ae4701 431 struct rbd_options *rbd_opts = private;
59c2be1e
YS
432 substring_t argstr[MAX_OPT_ARGS];
433 int token, intval, ret;
434
43ae4701 435 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
436 if (token < 0)
437 return -EINVAL;
438
439 if (token < Opt_last_int) {
440 ret = match_int(&argstr[0], &intval);
441 if (ret < 0) {
442 pr_err("bad mount option arg (not int) "
443 "at '%s'\n", c);
444 return ret;
445 }
446 dout("got int token %d val %d\n", token, intval);
447 } else if (token > Opt_last_int && token < Opt_last_string) {
448 dout("got string token %d val %s\n", token,
449 argstr[0].from);
cc0538b6
AE
450 } else if (token > Opt_last_string && token < Opt_last_bool) {
451 dout("got Boolean token %d\n", token);
59c2be1e
YS
452 } else {
453 dout("got token %d\n", token);
454 }
455
456 switch (token) {
cc0538b6
AE
457 case Opt_read_only:
458 rbd_opts->read_only = true;
459 break;
460 case Opt_read_write:
461 rbd_opts->read_only = false;
462 break;
59c2be1e 463 default:
aafb230e
AE
464 rbd_assert(false);
465 break;
59c2be1e
YS
466 }
467 return 0;
468}
469
602adf40
YS
470/*
471 * Get a ceph client with specific addr and configuration, if one does
472 * not exist create it.
473 */
/*
 * Get a ceph client matching ceph_opts, sharing an existing one when
 * possible and creating a new one otherwise.  Ownership of ceph_opts
 * is consumed either way.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc = rbd_client_find(ceph_opts);

        if (rbdc) {
                /* Sharing an existing client; these options are redundant */
                ceph_destroy_options(ceph_opts);
                return rbdc;
        }

        return rbd_client_create(ceph_opts);
}
486
487/*
488 * Destroy ceph client
d23a4b3f 489 *
432b8587 490 * Caller must hold rbd_client_list_lock.
602adf40
YS
491 */
492static void rbd_client_release(struct kref *kref)
493{
494 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
495
496 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 497 spin_lock(&rbd_client_list_lock);
602adf40 498 list_del(&rbdc->node);
cd9d9f5d 499 spin_unlock(&rbd_client_list_lock);
602adf40
YS
500
501 ceph_destroy_client(rbdc->client);
502 kfree(rbdc);
503}
504
505/*
506 * Drop reference to ceph client node. If it's not referenced anymore, release
507 * it.
508 */
9d3997fd 509static void rbd_put_client(struct rbd_client *rbdc)
602adf40 510{
c53d5893
AE
511 if (rbdc)
512 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
513}
514
1fec7093
YS
515/*
516 * Destroy requests collection
517 */
518static void rbd_coll_release(struct kref *kref)
519{
520 struct rbd_req_coll *coll =
521 container_of(kref, struct rbd_req_coll, kref);
522
523 dout("rbd_coll_release %p\n", coll);
524 kfree(coll);
525}
602adf40 526
a30b71b9
AE
527static bool rbd_image_format_valid(u32 image_format)
528{
529 return image_format == 1 || image_format == 2;
530}
531
8e94af8e
AE
532static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
533{
103a150f
AE
534 size_t size;
535 u32 snap_count;
536
537 /* The header has to start with the magic rbd header text */
538 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
539 return false;
540
db2388b6
AE
541 /* The bio layer requires at least sector-sized I/O */
542
543 if (ondisk->options.order < SECTOR_SHIFT)
544 return false;
545
546 /* If we use u64 in a few spots we may be able to loosen this */
547
548 if (ondisk->options.order > 8 * sizeof (int) - 1)
549 return false;
550
103a150f
AE
551 /*
552 * The size of a snapshot header has to fit in a size_t, and
553 * that limits the number of snapshots.
554 */
555 snap_count = le32_to_cpu(ondisk->snap_count);
556 size = SIZE_MAX - sizeof (struct ceph_snap_context);
557 if (snap_count > size / sizeof (__le64))
558 return false;
559
560 /*
561 * Not only that, but the size of the entire the snapshot
562 * header must also be representable in a size_t.
563 */
564 size -= snap_count * sizeof (__le64);
565 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
566 return false;
567
568 return true;
8e94af8e
AE
569}
570
602adf40
YS
571/*
572 * Create a new header structure, translate header format from the on-disk
573 * header.
574 */
575static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 576 struct rbd_image_header_ondisk *ondisk)
602adf40 577{
ccece235 578 u32 snap_count;
58c17b0e 579 size_t len;
d2bb24e5 580 size_t size;
621901d6 581 u32 i;
602adf40 582
6a52325f
AE
583 memset(header, 0, sizeof (*header));
584
103a150f
AE
585 snap_count = le32_to_cpu(ondisk->snap_count);
586
58c17b0e
AE
587 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
588 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 589 if (!header->object_prefix)
602adf40 590 return -ENOMEM;
58c17b0e
AE
591 memcpy(header->object_prefix, ondisk->object_prefix, len);
592 header->object_prefix[len] = '\0';
00f1f36f 593
602adf40 594 if (snap_count) {
f785cc1d
AE
595 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
596
621901d6
AE
597 /* Save a copy of the snapshot names */
598
f785cc1d
AE
599 if (snap_names_len > (u64) SIZE_MAX)
600 return -EIO;
601 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 602 if (!header->snap_names)
6a52325f 603 goto out_err;
f785cc1d
AE
604 /*
605 * Note that rbd_dev_v1_header_read() guarantees
606 * the ondisk buffer we're working with has
607 * snap_names_len bytes beyond the end of the
608 * snapshot id array, this memcpy() is safe.
609 */
610 memcpy(header->snap_names, &ondisk->snaps[snap_count],
611 snap_names_len);
6a52325f 612
621901d6
AE
613 /* Record each snapshot's size */
614
d2bb24e5
AE
615 size = snap_count * sizeof (*header->snap_sizes);
616 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 617 if (!header->snap_sizes)
6a52325f 618 goto out_err;
621901d6
AE
619 for (i = 0; i < snap_count; i++)
620 header->snap_sizes[i] =
621 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 622 } else {
ccece235 623 WARN_ON(ondisk->snap_names_len);
602adf40
YS
624 header->snap_names = NULL;
625 header->snap_sizes = NULL;
626 }
849b4260 627
34b13184 628 header->features = 0; /* No features support in v1 images */
602adf40
YS
629 header->obj_order = ondisk->options.order;
630 header->crypt_type = ondisk->options.crypt_type;
631 header->comp_type = ondisk->options.comp_type;
6a52325f 632
621901d6
AE
633 /* Allocate and fill in the snapshot context */
634
f84344f3 635 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
636 size = sizeof (struct ceph_snap_context);
637 size += snap_count * sizeof (header->snapc->snaps[0]);
638 header->snapc = kzalloc(size, GFP_KERNEL);
639 if (!header->snapc)
640 goto out_err;
602adf40
YS
641
642 atomic_set(&header->snapc->nref, 1);
505cbb9b 643 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 644 header->snapc->num_snaps = snap_count;
621901d6
AE
645 for (i = 0; i < snap_count; i++)
646 header->snapc->snaps[i] =
647 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
648
649 return 0;
650
6a52325f 651out_err:
849b4260 652 kfree(header->snap_sizes);
ccece235 653 header->snap_sizes = NULL;
602adf40 654 kfree(header->snap_names);
ccece235 655 header->snap_names = NULL;
6a52325f
AE
656 kfree(header->object_prefix);
657 header->object_prefix = NULL;
ccece235 658
00f1f36f 659 return -ENOMEM;
602adf40
YS
660}
661
9e15b77d
AE
662static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
663{
664 struct rbd_snap *snap;
665
666 if (snap_id == CEPH_NOSNAP)
667 return RBD_SNAP_HEAD_NAME;
668
669 list_for_each_entry(snap, &rbd_dev->snaps, node)
670 if (snap_id == snap->id)
671 return snap->name;
672
673 return NULL;
674}
675
8836b995 676static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 677{
602adf40 678
e86924a8 679 struct rbd_snap *snap;
602adf40 680
e86924a8
AE
681 list_for_each_entry(snap, &rbd_dev->snaps, node) {
682 if (!strcmp(snap_name, snap->name)) {
0d7dbfce 683 rbd_dev->spec->snap_id = snap->id;
e86924a8 684 rbd_dev->mapping.size = snap->size;
34b13184 685 rbd_dev->mapping.features = snap->features;
602adf40 686
e86924a8 687 return 0;
00f1f36f 688 }
00f1f36f 689 }
e86924a8 690
00f1f36f 691 return -ENOENT;
602adf40
YS
692}
693
819d52bf 694static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 695{
78dc447d 696 int ret;
602adf40 697
0d7dbfce 698 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 699 sizeof (RBD_SNAP_HEAD_NAME))) {
0d7dbfce 700 rbd_dev->spec->snap_id = CEPH_NOSNAP;
99c1f08f 701 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 702 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 703 ret = 0;
602adf40 704 } else {
0d7dbfce 705 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
602adf40
YS
706 if (ret < 0)
707 goto done;
f84344f3 708 rbd_dev->mapping.read_only = true;
602adf40 709 }
daba5fdb 710 rbd_dev->exists = true;
602adf40 711done:
602adf40
YS
712 return ret;
713}
714
715static void rbd_header_free(struct rbd_image_header *header)
716{
849b4260 717 kfree(header->object_prefix);
d78fd7ae 718 header->object_prefix = NULL;
602adf40 719 kfree(header->snap_sizes);
d78fd7ae 720 header->snap_sizes = NULL;
849b4260 721 kfree(header->snap_names);
d78fd7ae 722 header->snap_names = NULL;
d1d25646 723 ceph_put_snap_context(header->snapc);
d78fd7ae 724 header->snapc = NULL;
602adf40
YS
725}
726
65ccfe21 727static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 728{
65ccfe21
AE
729 char *name;
730 u64 segment;
731 int ret;
602adf40 732
2fd82b9e 733 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
65ccfe21
AE
734 if (!name)
735 return NULL;
736 segment = offset >> rbd_dev->header.obj_order;
2fd82b9e 737 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
65ccfe21 738 rbd_dev->header.object_prefix, segment);
2fd82b9e 739 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
65ccfe21
AE
740 pr_err("error formatting segment name for #%llu (%d)\n",
741 segment, ret);
742 kfree(name);
743 name = NULL;
744 }
602adf40 745
65ccfe21
AE
746 return name;
747}
602adf40 748
65ccfe21
AE
749static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
750{
751 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 752
65ccfe21
AE
753 return offset & (segment_size - 1);
754}
755
756static u64 rbd_segment_length(struct rbd_device *rbd_dev,
757 u64 offset, u64 length)
758{
759 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
760
761 offset &= segment_size - 1;
762
aafb230e 763 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
764 if (offset + length > segment_size)
765 length = segment_size - offset;
766
767 return length;
602adf40
YS
768}
769
1fec7093
YS
770static int rbd_get_num_segments(struct rbd_image_header *header,
771 u64 ofs, u64 len)
772{
df111be6
AE
773 u64 start_seg;
774 u64 end_seg;
775
776 if (!len)
777 return 0;
778 if (len - 1 > U64_MAX - ofs)
779 return -ERANGE;
780
781 start_seg = ofs >> header->obj_order;
782 end_seg = (ofs + len - 1) >> header->obj_order;
783
1fec7093
YS
784 return end_seg - start_seg + 1;
785}
786
029bcbd8
JD
787/*
788 * returns the size of an object in the image
789 */
790static u64 rbd_obj_bytes(struct rbd_image_header *header)
791{
792 return 1 << header->obj_order;
793}
794
602adf40
YS
795/*
796 * bio helpers
797 */
798
799static void bio_chain_put(struct bio *chain)
800{
801 struct bio *tmp;
802
803 while (chain) {
804 tmp = chain;
805 chain = chain->bi_next;
806 bio_put(tmp);
807 }
808}
809
810/*
811 * zeros a bio chain, starting at specific offset
812 */
813static void zero_bio_chain(struct bio *chain, int start_ofs)
814{
815 struct bio_vec *bv;
816 unsigned long flags;
817 void *buf;
818 int i;
819 int pos = 0;
820
821 while (chain) {
822 bio_for_each_segment(bv, chain, i) {
823 if (pos + bv->bv_len > start_ofs) {
824 int remainder = max(start_ofs - pos, 0);
825 buf = bvec_kmap_irq(bv, &flags);
826 memset(buf + remainder, 0,
827 bv->bv_len - remainder);
85b5aaa6 828 bvec_kunmap_irq(buf, &flags);
602adf40
YS
829 }
830 pos += bv->bv_len;
831 }
832
833 chain = chain->bi_next;
834 }
835}
836
837/*
f7760dad
AE
838 * Clone a portion of a bio, starting at the given byte offset
839 * and continuing for the number of bytes indicated.
602adf40 840 */
f7760dad
AE
841static struct bio *bio_clone_range(struct bio *bio_src,
842 unsigned int offset,
843 unsigned int len,
844 gfp_t gfpmask)
602adf40 845{
f7760dad
AE
846 struct bio_vec *bv;
847 unsigned int resid;
848 unsigned short idx;
849 unsigned int voff;
850 unsigned short end_idx;
851 unsigned short vcnt;
852 struct bio *bio;
853
854 /* Handle the easy case for the caller */
855
856 if (!offset && len == bio_src->bi_size)
857 return bio_clone(bio_src, gfpmask);
858
859 if (WARN_ON_ONCE(!len))
860 return NULL;
861 if (WARN_ON_ONCE(len > bio_src->bi_size))
862 return NULL;
863 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
864 return NULL;
865
866 /* Find first affected segment... */
867
868 resid = offset;
869 __bio_for_each_segment(bv, bio_src, idx, 0) {
870 if (resid < bv->bv_len)
871 break;
872 resid -= bv->bv_len;
602adf40 873 }
f7760dad 874 voff = resid;
602adf40 875
f7760dad 876 /* ...and the last affected segment */
602adf40 877
f7760dad
AE
878 resid += len;
879 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
880 if (resid <= bv->bv_len)
881 break;
882 resid -= bv->bv_len;
883 }
884 vcnt = end_idx - idx + 1;
885
886 /* Build the clone */
887
888 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
889 if (!bio)
890 return NULL; /* ENOMEM */
602adf40 891
f7760dad
AE
892 bio->bi_bdev = bio_src->bi_bdev;
893 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
894 bio->bi_rw = bio_src->bi_rw;
895 bio->bi_flags |= 1 << BIO_CLONED;
896
897 /*
898 * Copy over our part of the bio_vec, then update the first
899 * and last (or only) entries.
900 */
901 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
902 vcnt * sizeof (struct bio_vec));
903 bio->bi_io_vec[0].bv_offset += voff;
904 if (vcnt > 1) {
905 bio->bi_io_vec[0].bv_len -= voff;
906 bio->bi_io_vec[vcnt - 1].bv_len = resid;
907 } else {
908 bio->bi_io_vec[0].bv_len = len;
602adf40
YS
909 }
910
f7760dad
AE
911 bio->bi_vcnt = vcnt;
912 bio->bi_size = len;
913 bio->bi_idx = 0;
914
915 return bio;
916}
917
918/*
919 * Clone a portion of a bio chain, starting at the given byte offset
920 * into the first bio in the source chain and continuing for the
921 * number of bytes indicated. The result is another bio chain of
922 * exactly the given length, or a null pointer on error.
923 *
924 * The bio_src and offset parameters are both in-out. On entry they
925 * refer to the first source bio and the offset into that bio where
926 * the start of data to be cloned is located.
927 *
928 * On return, bio_src is updated to refer to the bio in the source
929 * chain that contains first un-cloned byte, and *offset will
930 * contain the offset of that byte within that bio.
931 */
932static struct bio *bio_chain_clone_range(struct bio **bio_src,
933 unsigned int *offset,
934 unsigned int len,
935 gfp_t gfpmask)
936{
937 struct bio *bi = *bio_src;
938 unsigned int off = *offset;
939 struct bio *chain = NULL;
940 struct bio **end;
941
942 /* Build up a chain of clone bios up to the limit */
943
944 if (!bi || off >= bi->bi_size || !len)
945 return NULL; /* Nothing to clone */
602adf40 946
f7760dad
AE
947 end = &chain;
948 while (len) {
949 unsigned int bi_size;
950 struct bio *bio;
951
952 if (!bi)
953 goto out_err; /* EINVAL; ran out of bio's */
954 bi_size = min_t(unsigned int, bi->bi_size - off, len);
955 bio = bio_clone_range(bi, off, bi_size, gfpmask);
956 if (!bio)
957 goto out_err; /* ENOMEM */
958
959 *end = bio;
960 end = &bio->bi_next;
602adf40 961
f7760dad
AE
962 off += bi_size;
963 if (off == bi->bi_size) {
964 bi = bi->bi_next;
965 off = 0;
966 }
967 len -= bi_size;
968 }
969 *bio_src = bi;
970 *offset = off;
971
972 return chain;
973out_err:
974 bio_chain_put(chain);
602adf40 975
602adf40
YS
976 return NULL;
977}
978
979/*
980 * helpers for osd request op vectors.
981 */
57cfc106
AE
982static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
983 int opcode, u32 payload_len)
602adf40 984{
57cfc106
AE
985 struct ceph_osd_req_op *ops;
986
987 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
988 if (!ops)
989 return NULL;
990
991 ops[0].op = opcode;
992
602adf40
YS
993 /*
994 * op extent offset and length will be set later on
995 * in calc_raw_layout()
996 */
57cfc106
AE
997 ops[0].payload_len = payload_len;
998
999 return ops;
602adf40
YS
1000}
1001
/* Free an op vector built by rbd_create_rw_ops() (NULL-safe). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
        kfree(ops);
}
1006
1fec7093
YS
1007static void rbd_coll_end_req_index(struct request *rq,
1008 struct rbd_req_coll *coll,
1009 int index,
1010 int ret, u64 len)
1011{
1012 struct request_queue *q;
1013 int min, max, i;
1014
bd919d45
AE
1015 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
1016 coll, index, ret, (unsigned long long) len);
1fec7093
YS
1017
1018 if (!rq)
1019 return;
1020
1021 if (!coll) {
1022 blk_end_request(rq, ret, len);
1023 return;
1024 }
1025
1026 q = rq->q;
1027
1028 spin_lock_irq(q->queue_lock);
1029 coll->status[index].done = 1;
1030 coll->status[index].rc = ret;
1031 coll->status[index].bytes = len;
1032 max = min = coll->num_done;
1033 while (max < coll->total && coll->status[max].done)
1034 max++;
1035
1036 for (i = min; i<max; i++) {
1037 __blk_end_request(rq, coll->status[i].rc,
1038 coll->status[i].bytes);
1039 coll->num_done++;
1040 kref_put(&coll->kref, rbd_coll_release);
1041 }
1042 spin_unlock_irq(q->queue_lock);
1043}
1044
1045static void rbd_coll_end_req(struct rbd_request *req,
1046 int ret, u64 len)
1047{
1048 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
1049}
1050
602adf40
YS
1051/*
1052 * Send ceph osd request
1053 */
1054static int rbd_do_request(struct request *rq,
0ce1a794 1055 struct rbd_device *rbd_dev,
602adf40
YS
1056 struct ceph_snap_context *snapc,
1057 u64 snapid,
aded07ea 1058 const char *object_name, u64 ofs, u64 len,
602adf40
YS
1059 struct bio *bio,
1060 struct page **pages,
1061 int num_pages,
1062 int flags,
1063 struct ceph_osd_req_op *ops,
1fec7093
YS
1064 struct rbd_req_coll *coll,
1065 int coll_index,
602adf40 1066 void (*rbd_cb)(struct ceph_osd_request *req,
59c2be1e
YS
1067 struct ceph_msg *msg),
1068 struct ceph_osd_request **linger_req,
1069 u64 *ver)
602adf40
YS
1070{
1071 struct ceph_osd_request *req;
1072 struct ceph_file_layout *layout;
1073 int ret;
1074 u64 bno;
1075 struct timespec mtime = CURRENT_TIME;
1076 struct rbd_request *req_data;
1077 struct ceph_osd_request_head *reqhead;
1dbb4399 1078 struct ceph_osd_client *osdc;
602adf40 1079
602adf40 1080 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
1fec7093
YS
1081 if (!req_data) {
1082 if (coll)
1083 rbd_coll_end_req_index(rq, coll, coll_index,
1084 -ENOMEM, len);
1085 return -ENOMEM;
1086 }
1087
1088 if (coll) {
1089 req_data->coll = coll;
1090 req_data->coll_index = coll_index;
1091 }
602adf40 1092
f7760dad
AE
1093 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1094 object_name, (unsigned long long) ofs,
1095 (unsigned long long) len, coll, coll_index);
602adf40 1096
0ce1a794 1097 osdc = &rbd_dev->rbd_client->client->osdc;
1dbb4399
AE
1098 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1099 false, GFP_NOIO, pages, bio);
4ad12621 1100 if (!req) {
4ad12621 1101 ret = -ENOMEM;
602adf40
YS
1102 goto done_pages;
1103 }
1104
1105 req->r_callback = rbd_cb;
1106
1107 req_data->rq = rq;
1108 req_data->bio = bio;
1109 req_data->pages = pages;
1110 req_data->len = len;
1111
1112 req->r_priv = req_data;
1113
1114 reqhead = req->r_request->front.iov_base;
1115 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1116
aded07ea 1117 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
602adf40
YS
1118 req->r_oid_len = strlen(req->r_oid);
1119
1120 layout = &req->r_file_layout;
1121 memset(layout, 0, sizeof(*layout));
1122 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1123 layout->fl_stripe_count = cpu_to_le32(1);
1124 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
0d7dbfce 1125 layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
6cae3717
SW
1126 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1127 req, ops);
1128 rbd_assert(ret == 0);
602adf40
YS
1129
1130 ceph_osdc_build_request(req, ofs, &len,
1131 ops,
1132 snapc,
1133 &mtime,
1134 req->r_oid, req->r_oid_len);
602adf40 1135
59c2be1e 1136 if (linger_req) {
1dbb4399 1137 ceph_osdc_set_request_linger(osdc, req);
59c2be1e
YS
1138 *linger_req = req;
1139 }
1140
1dbb4399 1141 ret = ceph_osdc_start_request(osdc, req, false);
602adf40
YS
1142 if (ret < 0)
1143 goto done_err;
1144
1145 if (!rbd_cb) {
1dbb4399 1146 ret = ceph_osdc_wait_request(osdc, req);
59c2be1e
YS
1147 if (ver)
1148 *ver = le64_to_cpu(req->r_reassert_version.version);
bd919d45
AE
1149 dout("reassert_ver=%llu\n",
1150 (unsigned long long)
1151 le64_to_cpu(req->r_reassert_version.version));
602adf40
YS
1152 ceph_osdc_put_request(req);
1153 }
1154 return ret;
1155
1156done_err:
1157 bio_chain_put(req_data->bio);
1158 ceph_osdc_put_request(req);
1159done_pages:
1fec7093 1160 rbd_coll_end_req(req_data, ret, len);
602adf40 1161 kfree(req_data);
602adf40
YS
1162 return ret;
1163}
1164
1165/*
1166 * Ceph osd op callback
1167 */
1168static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1169{
1170 struct rbd_request *req_data = req->r_priv;
1171 struct ceph_osd_reply_head *replyhead;
1172 struct ceph_osd_op *op;
1173 __s32 rc;
1174 u64 bytes;
1175 int read_op;
1176
1177 /* parse reply */
1178 replyhead = msg->front.iov_base;
1179 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1180 op = (void *)(replyhead + 1);
1181 rc = le32_to_cpu(replyhead->result);
1182 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1183 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1184
bd919d45
AE
1185 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1186 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1187
1188 if (rc == -ENOENT && read_op) {
1189 zero_bio_chain(req_data->bio, 0);
1190 rc = 0;
1191 } else if (rc == 0 && read_op && bytes < req_data->len) {
1192 zero_bio_chain(req_data->bio, bytes);
1193 bytes = req_data->len;
1194 }
1195
1fec7093 1196 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1197
1198 if (req_data->bio)
1199 bio_chain_put(req_data->bio);
1200
1201 ceph_osdc_put_request(req);
1202 kfree(req_data);
1203}
1204
59c2be1e
YS
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1209
602adf40
YS
1210/*
1211 * Do a synchronous ceph osd operation
1212 */
0ce1a794 1213static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1214 struct ceph_snap_context *snapc,
1215 u64 snapid,
602adf40 1216 int flags,
913d2fdc 1217 struct ceph_osd_req_op *ops,
aded07ea 1218 const char *object_name,
f8d4de6e
AE
1219 u64 ofs, u64 inbound_size,
1220 char *inbound,
59c2be1e
YS
1221 struct ceph_osd_request **linger_req,
1222 u64 *ver)
602adf40
YS
1223{
1224 int ret;
1225 struct page **pages;
1226 int num_pages;
913d2fdc 1227
aafb230e 1228 rbd_assert(ops != NULL);
602adf40 1229
f8d4de6e 1230 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1231 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1232 if (IS_ERR(pages))
1233 return PTR_ERR(pages);
602adf40 1234
0ce1a794 1235 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1236 object_name, ofs, inbound_size, NULL,
602adf40
YS
1237 pages, num_pages,
1238 flags,
1239 ops,
1fec7093 1240 NULL, 0,
59c2be1e
YS
1241 NULL,
1242 linger_req, ver);
602adf40 1243 if (ret < 0)
913d2fdc 1244 goto done;
602adf40 1245
f8d4de6e
AE
1246 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1247 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1248
602adf40
YS
1249done:
1250 ceph_release_page_vector(pages, num_pages);
1251 return ret;
1252}
1253
1254/*
1255 * Do an asynchronous ceph osd operation
1256 */
1257static int rbd_do_op(struct request *rq,
0ce1a794 1258 struct rbd_device *rbd_dev,
602adf40 1259 struct ceph_snap_context *snapc,
602adf40 1260 u64 ofs, u64 len,
1fec7093
YS
1261 struct bio *bio,
1262 struct rbd_req_coll *coll,
1263 int coll_index)
602adf40
YS
1264{
1265 char *seg_name;
1266 u64 seg_ofs;
1267 u64 seg_len;
1268 int ret;
1269 struct ceph_osd_req_op *ops;
1270 u32 payload_len;
ff2e4bb5
AE
1271 int opcode;
1272 int flags;
4634246d 1273 u64 snapid;
602adf40 1274
65ccfe21 1275 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1276 if (!seg_name)
1277 return -ENOMEM;
65ccfe21
AE
1278 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1279 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40 1280
ff2e4bb5
AE
1281 if (rq_data_dir(rq) == WRITE) {
1282 opcode = CEPH_OSD_OP_WRITE;
1283 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
4634246d 1284 snapid = CEPH_NOSNAP;
ff2e4bb5
AE
1285 payload_len = seg_len;
1286 } else {
1287 opcode = CEPH_OSD_OP_READ;
1288 flags = CEPH_OSD_FLAG_READ;
4634246d 1289 snapc = NULL;
0d7dbfce 1290 snapid = rbd_dev->spec->snap_id;
ff2e4bb5
AE
1291 payload_len = 0;
1292 }
602adf40 1293
57cfc106
AE
1294 ret = -ENOMEM;
1295 ops = rbd_create_rw_ops(1, opcode, payload_len);
1296 if (!ops)
602adf40
YS
1297 goto done;
1298
1299 /* we've taken care of segment sizes earlier when we
1300 cloned the bios. We should never have a segment
1301 truncated at this point */
aafb230e 1302 rbd_assert(seg_len == len);
602adf40
YS
1303
1304 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1305 seg_name, seg_ofs, seg_len,
1306 bio,
1307 NULL, 0,
1308 flags,
1309 ops,
1fec7093 1310 coll, coll_index,
59c2be1e 1311 rbd_req_cb, 0, NULL);
11f77002
SW
1312
1313 rbd_destroy_ops(ops);
602adf40
YS
1314done:
1315 kfree(seg_name);
1316 return ret;
1317}
1318
602adf40
YS
1319/*
1320 * Request sync osd read
1321 */
0ce1a794 1322static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1323 u64 snapid,
aded07ea 1324 const char *object_name,
602adf40 1325 u64 ofs, u64 len,
59c2be1e
YS
1326 char *buf,
1327 u64 *ver)
602adf40 1328{
913d2fdc
AE
1329 struct ceph_osd_req_op *ops;
1330 int ret;
1331
1332 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1333 if (!ops)
1334 return -ENOMEM;
1335
1336 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1337 snapid,
602adf40 1338 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1339 ops, object_name, ofs, len, buf, NULL, ver);
1340 rbd_destroy_ops(ops);
1341
1342 return ret;
602adf40
YS
1343}
1344
1345/*
59c2be1e
YS
1346 * Request sync osd watch
1347 */
0ce1a794 1348static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1349 u64 ver,
7f0a24d8 1350 u64 notify_id)
59c2be1e
YS
1351{
1352 struct ceph_osd_req_op *ops;
11f77002
SW
1353 int ret;
1354
57cfc106
AE
1355 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1356 if (!ops)
1357 return -ENOMEM;
59c2be1e 1358
a71b891b 1359 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1360 ops[0].watch.cookie = notify_id;
1361 ops[0].watch.flag = 0;
1362
0ce1a794 1363 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1364 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1365 NULL, 0,
59c2be1e
YS
1366 CEPH_OSD_FLAG_READ,
1367 ops,
1fec7093 1368 NULL, 0,
59c2be1e
YS
1369 rbd_simple_req_cb, 0, NULL);
1370
1371 rbd_destroy_ops(ops);
1372 return ret;
1373}
1374
1375static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1376{
0ce1a794 1377 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1378 u64 hver;
13143d2d
SW
1379 int rc;
1380
0ce1a794 1381 if (!rbd_dev)
59c2be1e
YS
1382 return;
1383
bd919d45
AE
1384 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1385 rbd_dev->header_name, (unsigned long long) notify_id,
1386 (unsigned int) opcode);
117973fb 1387 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1388 if (rc)
f0f8cef5 1389 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1390 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1391
7f0a24d8 1392 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1393}
1394
1395/*
1396 * Request sync osd watch
1397 */
0e6f322d 1398static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1399{
1400 struct ceph_osd_req_op *ops;
0ce1a794 1401 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1402 int ret;
59c2be1e 1403
57cfc106
AE
1404 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1405 if (!ops)
1406 return -ENOMEM;
59c2be1e
YS
1407
1408 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1409 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1410 if (ret < 0)
1411 goto fail;
1412
0e6f322d 1413 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1414 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1415 ops[0].watch.flag = 1;
1416
0ce1a794 1417 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1418 CEPH_NOSNAP,
59c2be1e
YS
1419 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1420 ops,
0e6f322d
AE
1421 rbd_dev->header_name,
1422 0, 0, NULL,
0ce1a794 1423 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1424
1425 if (ret < 0)
1426 goto fail_event;
1427
1428 rbd_destroy_ops(ops);
1429 return 0;
1430
1431fail_event:
0ce1a794
AE
1432 ceph_osdc_cancel_event(rbd_dev->watch_event);
1433 rbd_dev->watch_event = NULL;
59c2be1e
YS
1434fail:
1435 rbd_destroy_ops(ops);
1436 return ret;
1437}
1438
79e3057c
YS
1439/*
1440 * Request sync osd unwatch
1441 */
070c633f 1442static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1443{
1444 struct ceph_osd_req_op *ops;
57cfc106 1445 int ret;
79e3057c 1446
57cfc106
AE
1447 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1448 if (!ops)
1449 return -ENOMEM;
79e3057c
YS
1450
1451 ops[0].watch.ver = 0;
0ce1a794 1452 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1453 ops[0].watch.flag = 0;
1454
0ce1a794 1455 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1456 CEPH_NOSNAP,
79e3057c
YS
1457 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1458 ops,
070c633f
AE
1459 rbd_dev->header_name,
1460 0, 0, NULL, NULL, NULL);
1461
79e3057c
YS
1462
1463 rbd_destroy_ops(ops);
0ce1a794
AE
1464 ceph_osdc_cancel_event(rbd_dev->watch_event);
1465 rbd_dev->watch_event = NULL;
79e3057c
YS
1466 return ret;
1467}
1468
602adf40 1469/*
3cb4a687 1470 * Synchronous osd object method call
602adf40 1471 */
0ce1a794 1472static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1473 const char *object_name,
1474 const char *class_name,
1475 const char *method_name,
3cb4a687
AE
1476 const char *outbound,
1477 size_t outbound_size,
f8d4de6e
AE
1478 char *inbound,
1479 size_t inbound_size,
3cb4a687 1480 int flags,
59c2be1e 1481 u64 *ver)
602adf40
YS
1482{
1483 struct ceph_osd_req_op *ops;
aded07ea
AE
1484 int class_name_len = strlen(class_name);
1485 int method_name_len = strlen(method_name);
3cb4a687 1486 int payload_size;
57cfc106
AE
1487 int ret;
1488
3cb4a687
AE
1489 /*
1490 * Any input parameters required by the method we're calling
1491 * will be sent along with the class and method names as
1492 * part of the message payload. That data and its size are
1493 * supplied via the indata and indata_len fields (named from
1494 * the perspective of the server side) in the OSD request
1495 * operation.
1496 */
1497 payload_size = class_name_len + method_name_len + outbound_size;
1498 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1499 if (!ops)
1500 return -ENOMEM;
602adf40 1501
aded07ea
AE
1502 ops[0].cls.class_name = class_name;
1503 ops[0].cls.class_len = (__u8) class_name_len;
1504 ops[0].cls.method_name = method_name;
1505 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1506 ops[0].cls.argc = 0;
3cb4a687
AE
1507 ops[0].cls.indata = outbound;
1508 ops[0].cls.indata_len = outbound_size;
602adf40 1509
0ce1a794 1510 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1511 CEPH_NOSNAP,
3cb4a687 1512 flags, ops,
f8d4de6e
AE
1513 object_name, 0, inbound_size, inbound,
1514 NULL, ver);
602adf40
YS
1515
1516 rbd_destroy_ops(ops);
1517
1518 dout("cls_exec returned %d\n", ret);
1519 return ret;
1520}
1521
1fec7093
YS
1522static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1523{
1524 struct rbd_req_coll *coll =
1525 kzalloc(sizeof(struct rbd_req_coll) +
1526 sizeof(struct rbd_req_status) * num_reqs,
1527 GFP_ATOMIC);
1528
1529 if (!coll)
1530 return NULL;
1531 coll->total = num_reqs;
1532 kref_init(&coll->kref);
1533 return coll;
1534}
1535
602adf40
YS
1536/*
1537 * block device queue callback
1538 */
1539static void rbd_rq_fn(struct request_queue *q)
1540{
1541 struct rbd_device *rbd_dev = q->queuedata;
1542 struct request *rq;
602adf40 1543
00f1f36f 1544 while ((rq = blk_fetch_request(q))) {
602adf40 1545 struct bio *bio;
602adf40 1546 bool do_write;
bd919d45 1547 unsigned int size;
602adf40 1548 u64 ofs;
1fec7093
YS
1549 int num_segs, cur_seg = 0;
1550 struct rbd_req_coll *coll;
d1d25646 1551 struct ceph_snap_context *snapc;
f7760dad 1552 unsigned int bio_offset;
602adf40 1553
602adf40
YS
1554 dout("fetched request\n");
1555
1556 /* filter out block requests we don't understand */
1557 if ((rq->cmd_type != REQ_TYPE_FS)) {
1558 __blk_end_request_all(rq, 0);
00f1f36f 1559 continue;
602adf40
YS
1560 }
1561
1562 /* deduce our operation (read, write) */
1563 do_write = (rq_data_dir(rq) == WRITE);
f84344f3 1564 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1565 __blk_end_request_all(rq, -EROFS);
00f1f36f 1566 continue;
602adf40
YS
1567 }
1568
1569 spin_unlock_irq(q->queue_lock);
1570
d1d25646 1571 down_read(&rbd_dev->header_rwsem);
e88a36ec 1572
daba5fdb 1573 if (!rbd_dev->exists) {
0d7dbfce 1574 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
e88a36ec 1575 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1576 dout("request for non-existent snapshot");
1577 spin_lock_irq(q->queue_lock);
1578 __blk_end_request_all(rq, -ENXIO);
1579 continue;
e88a36ec
JD
1580 }
1581
d1d25646
JD
1582 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1583
1584 up_read(&rbd_dev->header_rwsem);
1585
f7760dad
AE
1586 size = blk_rq_bytes(rq);
1587 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1588 bio = rq->bio;
1589
602adf40
YS
1590 dout("%s 0x%x bytes at 0x%llx\n",
1591 do_write ? "write" : "read",
bd919d45 1592 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1593
1fec7093 1594 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1595 if (num_segs <= 0) {
1596 spin_lock_irq(q->queue_lock);
1597 __blk_end_request_all(rq, num_segs);
1598 ceph_put_snap_context(snapc);
1599 continue;
1600 }
1fec7093
YS
1601 coll = rbd_alloc_coll(num_segs);
1602 if (!coll) {
1603 spin_lock_irq(q->queue_lock);
1604 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1605 ceph_put_snap_context(snapc);
00f1f36f 1606 continue;
1fec7093
YS
1607 }
1608
f7760dad 1609 bio_offset = 0;
602adf40 1610 do {
f7760dad
AE
1611 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1612 unsigned int chain_size;
1613 struct bio *bio_chain;
1614
1615 BUG_ON(limit > (u64) UINT_MAX);
1616 chain_size = (unsigned int) limit;
bd919d45 1617 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
f7760dad 1618
1fec7093 1619 kref_get(&coll->kref);
f7760dad
AE
1620
1621 /* Pass a cloned bio chain via an osd request */
1622
1623 bio_chain = bio_chain_clone_range(&bio,
1624 &bio_offset, chain_size,
1625 GFP_ATOMIC);
1626 if (bio_chain)
4634246d 1627 (void) rbd_do_op(rq, rbd_dev, snapc,
f7760dad
AE
1628 ofs, chain_size,
1629 bio_chain, coll, cur_seg);
4634246d 1630 else
1fec7093 1631 rbd_coll_end_req_index(rq, coll, cur_seg,
f7760dad
AE
1632 -ENOMEM, chain_size);
1633 size -= chain_size;
1634 ofs += chain_size;
602adf40 1635
1fec7093 1636 cur_seg++;
602adf40 1637 } while (size > 0);
1fec7093 1638 kref_put(&coll->kref, rbd_coll_release);
602adf40 1639
602adf40 1640 spin_lock_irq(q->queue_lock);
d1d25646
JD
1641
1642 ceph_put_snap_context(snapc);
602adf40
YS
1643 }
1644}
1645
1646/*
1647 * a queue callback. Makes sure that we don't create a bio that spans across
1648 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1649 * which we handle later at bio_chain_clone_range()
602adf40
YS
1650 */
1651static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1652 struct bio_vec *bvec)
1653{
1654 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
1655 sector_t sector_offset;
1656 sector_t sectors_per_obj;
1657 sector_t obj_sector_offset;
1658 int ret;
1659
1660 /*
1661 * Find how far into its rbd object the partition-relative
1662 * bio start sector is to offset relative to the enclosing
1663 * device.
1664 */
1665 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1666 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1667 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1668
1669 /*
1670 * Compute the number of bytes from that offset to the end
1671 * of the object. Account for what's already used by the bio.
1672 */
1673 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1674 if (ret > bmd->bi_size)
1675 ret -= bmd->bi_size;
1676 else
1677 ret = 0;
1678
1679 /*
1680 * Don't send back more than was asked for. And if the bio
1681 * was empty, let the whole thing through because: "Note
1682 * that a block device *must* allow a single page to be
1683 * added to an empty bio."
1684 */
1685 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1686 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1687 ret = (int) bvec->bv_len;
1688
1689 return ret;
602adf40
YS
1690}
1691
/*
 * Undo rbd_init_disk(): remove the gendisk from the system (if it was
 * added), destroy its request queue, and drop the disk reference.
 * Safe to call when no disk was ever created.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1705
1706/*
4156d998
AE
1707 * Read the complete header for the given rbd device.
1708 *
1709 * Returns a pointer to a dynamically-allocated buffer containing
1710 * the complete and validated header. Caller can pass the address
1711 * of a variable that will be filled in with the version of the
1712 * header object at the time it was read.
1713 *
1714 * Returns a pointer-coded errno if a failure occurs.
602adf40 1715 */
4156d998
AE
1716static struct rbd_image_header_ondisk *
1717rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1718{
4156d998 1719 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1720 u32 snap_count = 0;
4156d998
AE
1721 u64 names_size = 0;
1722 u32 want_count;
1723 int ret;
602adf40 1724
00f1f36f 1725 /*
4156d998
AE
1726 * The complete header will include an array of its 64-bit
1727 * snapshot ids, followed by the names of those snapshots as
1728 * a contiguous block of NUL-terminated strings. Note that
1729 * the number of snapshots could change by the time we read
1730 * it in, in which case we re-read it.
00f1f36f 1731 */
4156d998
AE
1732 do {
1733 size_t size;
1734
1735 kfree(ondisk);
1736
1737 size = sizeof (*ondisk);
1738 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1739 size += names_size;
1740 ondisk = kmalloc(size, GFP_KERNEL);
1741 if (!ondisk)
1742 return ERR_PTR(-ENOMEM);
1743
1744 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1745 rbd_dev->header_name,
4156d998
AE
1746 0, size,
1747 (char *) ondisk, version);
1748
1749 if (ret < 0)
1750 goto out_err;
1751 if (WARN_ON((size_t) ret < size)) {
1752 ret = -ENXIO;
1753 pr_warning("short header read for image %s"
1754 " (want %zd got %d)\n",
0d7dbfce 1755 rbd_dev->spec->image_name, size, ret);
4156d998
AE
1756 goto out_err;
1757 }
1758 if (!rbd_dev_ondisk_valid(ondisk)) {
1759 ret = -ENXIO;
1760 pr_warning("invalid header for image %s\n",
0d7dbfce 1761 rbd_dev->spec->image_name);
4156d998 1762 goto out_err;
81e759fb 1763 }
602adf40 1764
4156d998
AE
1765 names_size = le64_to_cpu(ondisk->snap_names_len);
1766 want_count = snap_count;
1767 snap_count = le32_to_cpu(ondisk->snap_count);
1768 } while (snap_count != want_count);
00f1f36f 1769
4156d998 1770 return ondisk;
00f1f36f 1771
4156d998
AE
1772out_err:
1773 kfree(ondisk);
1774
1775 return ERR_PTR(ret);
1776}
1777
1778/*
1779 * reload the ondisk the header
1780 */
1781static int rbd_read_header(struct rbd_device *rbd_dev,
1782 struct rbd_image_header *header)
1783{
1784 struct rbd_image_header_ondisk *ondisk;
1785 u64 ver = 0;
1786 int ret;
602adf40 1787
4156d998
AE
1788 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1789 if (IS_ERR(ondisk))
1790 return PTR_ERR(ondisk);
1791 ret = rbd_header_from_disk(header, ondisk);
1792 if (ret >= 0)
1793 header->obj_version = ver;
1794 kfree(ondisk);
1795
1796 return ret;
602adf40
YS
1797}
1798
41f38c2b 1799static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1800{
1801 struct rbd_snap *snap;
a0593290 1802 struct rbd_snap *next;
dfc5606d 1803
a0593290 1804 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1805 rbd_remove_snap_dev(snap);
dfc5606d
YS
1806}
1807
9478554a
AE
1808static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1809{
1810 sector_t size;
1811
0d7dbfce 1812 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
1813 return;
1814
1815 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1816 dout("setting size to %llu sectors", (unsigned long long) size);
1817 rbd_dev->mapping.size = (u64) size;
1818 set_capacity(rbd_dev->disk, size);
1819}
1820
602adf40
YS
1821/*
1822 * only read the first part of the ondisk header, without the snaps info
1823 */
117973fb 1824static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1825{
1826 int ret;
1827 struct rbd_image_header h;
602adf40
YS
1828
1829 ret = rbd_read_header(rbd_dev, &h);
1830 if (ret < 0)
1831 return ret;
1832
a51aa0c0
JD
1833 down_write(&rbd_dev->header_rwsem);
1834
9478554a
AE
1835 /* Update image size, and check for resize of mapped image */
1836 rbd_dev->header.image_size = h.image_size;
1837 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1838
849b4260 1839 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1840 kfree(rbd_dev->header.snap_sizes);
849b4260 1841 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1842 /* osd requests may still refer to snapc */
1843 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1844
b813623a
AE
1845 if (hver)
1846 *hver = h.obj_version;
a71b891b 1847 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1848 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1849 rbd_dev->header.snapc = h.snapc;
1850 rbd_dev->header.snap_names = h.snap_names;
1851 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1852 /* Free the extra copy of the object prefix */
1853 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1854 kfree(h.object_prefix);
1855
304f6808
AE
1856 ret = rbd_dev_snaps_update(rbd_dev);
1857 if (!ret)
1858 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1859
c666601a 1860 up_write(&rbd_dev->header_rwsem);
602adf40 1861
dfc5606d 1862 return ret;
602adf40
YS
1863}
1864
117973fb 1865static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1866{
1867 int ret;
1868
117973fb 1869 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1870 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1871 if (rbd_dev->image_format == 1)
1872 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1873 else
1874 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1875 mutex_unlock(&ctl_mutex);
1876
1877 return ret;
1878}
1879
602adf40
YS
/*
 * Allocate and configure the gendisk and request queue for this rbd
 * device and record them in rbd_dev->disk.  The disk is not added to
 * the system here (no add_disk()).  Returns 0 or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep single-object I/O via the merge_bvec callback */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1928
dfc5606d
YS
1929/*
1930 sysfs
1931*/
1932
593a9e7b
AE
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

/* sysfs: mapped size in bytes (capacity sampled under header_rwsem) */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) rbd_dev->mapping.features);
}

/* sysfs: block device major number */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}

/* sysfs: global id of the ceph client instance backing this device */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
1980
dfc5606d
YS
/* sysfs: name of the pool holding the image */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

/* sysfs: numeric id of the pool holding the image */
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		(unsigned long long) rbd_dev->spec->pool_id);
}

/* sysfs: image name, or "(unknown)" when only the image id is known */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

/* sysfs: the image's unique id */
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}
2029
86b00e0d
AE
2030/*
2031 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2032 * for the parent image. If there is no parent, simply shows
2033 * "(no parent image)".
2034 */
2035static ssize_t rbd_parent_show(struct device *dev,
2036 struct device_attribute *attr,
2037 char *buf)
2038{
2039 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2040 struct rbd_spec *spec = rbd_dev->parent_spec;
2041 int count;
2042 char *bufp = buf;
2043
2044 if (!spec)
2045 return sprintf(buf, "(no parent image)\n");
2046
2047 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2048 (unsigned long long) spec->pool_id, spec->pool_name);
2049 if (count < 0)
2050 return count;
2051 bufp += count;
2052
2053 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2054 spec->image_name ? spec->image_name : "(unknown)");
2055 if (count < 0)
2056 return count;
2057 bufp += count;
2058
2059 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2060 (unsigned long long) spec->snap_id, spec->snap_name);
2061 if (count < 0)
2062 return count;
2063 bufp += count;
2064
2065 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2066 if (count < 0)
2067 return count;
2068 bufp += count;
2069
2070 return (ssize_t) (bufp - buf);
2071}
2072
dfc5606d
YS
/*
 * sysfs: writing anything to "refresh" forces a header re-read.
 * Returns the write size on success, or the refresh error.
 */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev, NULL);

	return ret < 0 ? ret : size;
}
602adf40 2085
dfc5606d 2086static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2087static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2088static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2089static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2090static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2091static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2092static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2093static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2094static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2095static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2096static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2097
2098static struct attribute *rbd_attrs[] = {
2099 &dev_attr_size.attr,
34b13184 2100 &dev_attr_features.attr,
dfc5606d
YS
2101 &dev_attr_major.attr,
2102 &dev_attr_client_id.attr,
2103 &dev_attr_pool.attr,
9bb2f334 2104 &dev_attr_pool_id.attr,
dfc5606d 2105 &dev_attr_name.attr,
589d30e0 2106 &dev_attr_image_id.attr,
dfc5606d 2107 &dev_attr_current_snap.attr,
86b00e0d 2108 &dev_attr_parent.attr,
dfc5606d 2109 &dev_attr_refresh.attr,
dfc5606d
YS
2110 NULL
2111};
2112
2113static struct attribute_group rbd_attr_group = {
2114 .attrs = rbd_attrs,
2115};
2116
2117static const struct attribute_group *rbd_attr_groups[] = {
2118 &rbd_attr_group,
2119 NULL
2120};
2121
2122static void rbd_sysfs_dev_release(struct device *dev)
2123{
2124}
2125
2126static struct device_type rbd_device_type = {
2127 .name = "rbd",
2128 .groups = rbd_attr_groups,
2129 .release = rbd_sysfs_dev_release,
2130};
2131
2132
2133/*
2134 sysfs - snapshots
2135*/
2136
2137static ssize_t rbd_snap_size_show(struct device *dev,
2138 struct device_attribute *attr,
2139 char *buf)
2140{
2141 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2142
3591538f 2143 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2144}
2145
2146static ssize_t rbd_snap_id_show(struct device *dev,
2147 struct device_attribute *attr,
2148 char *buf)
2149{
2150 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2151
3591538f 2152 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2153}
2154
34b13184
AE
2155static ssize_t rbd_snap_features_show(struct device *dev,
2156 struct device_attribute *attr,
2157 char *buf)
2158{
2159 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2160
2161 return sprintf(buf, "0x%016llx\n",
2162 (unsigned long long) snap->features);
2163}
2164
dfc5606d
YS
/* Per-snapshot sysfs attributes (all read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device-model release callback for a snapshot device: frees the
 * snapshot name and the rbd_snap itself.  Runs when the last
 * reference on the embedded struct device is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Also used by rbd_snap_registered() to recognize a registered snap. */
static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2196
8b8fb99c
AE
/* Take an additional reference on an image spec; returns @spec. */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}
2203
2204static void rbd_spec_free(struct kref *kref);
2205static void rbd_spec_put(struct rbd_spec *spec)
2206{
2207 if (spec)
2208 kref_put(&spec->kref, rbd_spec_free);
2209}
2210
2211static struct rbd_spec *rbd_spec_alloc(void)
2212{
2213 struct rbd_spec *spec;
2214
2215 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2216 if (!spec)
2217 return NULL;
2218 kref_init(&spec->kref);
2219
2220 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2221
2222 return spec;
2223}
2224
2225static void rbd_spec_free(struct kref *kref)
2226{
2227 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2228
2229 kfree(spec->pool_name);
2230 kfree(spec->image_id);
2231 kfree(spec->image_name);
2232 kfree(spec->snap_name);
2233 kfree(spec);
2234}
2235
c53d5893
AE
2236struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2237 struct rbd_spec *spec)
2238{
2239 struct rbd_device *rbd_dev;
2240
2241 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2242 if (!rbd_dev)
2243 return NULL;
2244
2245 spin_lock_init(&rbd_dev->lock);
2246 INIT_LIST_HEAD(&rbd_dev->node);
2247 INIT_LIST_HEAD(&rbd_dev->snaps);
2248 init_rwsem(&rbd_dev->header_rwsem);
2249
2250 rbd_dev->spec = spec;
2251 rbd_dev->rbd_client = rbdc;
2252
2253 return rbd_dev;
2254}
2255
/*
 * Tear down an rbd_device created by rbd_dev_create(): drop the
 * spec references (parent and own), free the header object name,
 * release the client, then free the device itself (last, since the
 * earlier steps read its members).
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2264
304f6808
AE
2265static bool rbd_snap_registered(struct rbd_snap *snap)
2266{
2267 bool ret = snap->dev.type == &rbd_snap_device_type;
2268 bool reg = device_is_registered(&snap->dev);
2269
2270 rbd_assert(!ret ^ reg);
2271
2272 return ret;
2273}
2274
41f38c2b 2275static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2276{
2277 list_del(&snap->node);
304f6808
AE
2278 if (device_is_registered(&snap->dev))
2279 device_unregister(&snap->dev);
dfc5606d
YS
2280}
2281
14e7085d 2282static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2283 struct device *parent)
2284{
2285 struct device *dev = &snap->dev;
2286 int ret;
2287
2288 dev->type = &rbd_snap_device_type;
2289 dev->parent = parent;
2290 dev->release = rbd_snap_dev_release;
d4b125e9 2291 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2292 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2293
dfc5606d
YS
2294 ret = device_register(dev);
2295
2296 return ret;
2297}
2298
4e891e0a 2299static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2300 const char *snap_name,
34b13184
AE
2301 u64 snap_id, u64 snap_size,
2302 u64 snap_features)
dfc5606d 2303{
4e891e0a 2304 struct rbd_snap *snap;
dfc5606d 2305 int ret;
4e891e0a
AE
2306
2307 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2308 if (!snap)
4e891e0a
AE
2309 return ERR_PTR(-ENOMEM);
2310
2311 ret = -ENOMEM;
c8d18425 2312 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2313 if (!snap->name)
2314 goto err;
2315
c8d18425
AE
2316 snap->id = snap_id;
2317 snap->size = snap_size;
34b13184 2318 snap->features = snap_features;
4e891e0a
AE
2319
2320 return snap;
2321
dfc5606d
YS
2322err:
2323 kfree(snap->name);
2324 kfree(snap);
4e891e0a
AE
2325
2326 return ERR_PTR(ret);
dfc5606d
YS
2327}
2328
cd892126
AE
2329static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2330 u64 *snap_size, u64 *snap_features)
2331{
2332 char *snap_name;
2333
2334 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2335
2336 *snap_size = rbd_dev->header.snap_sizes[which];
2337 *snap_features = 0; /* No features for v1 */
2338
2339 /* Skip over names until we find the one we are looking for */
2340
2341 snap_name = rbd_dev->header.snap_names;
2342 while (which--)
2343 snap_name += strlen(snap_name) + 1;
2344
2345 return snap_name;
2346}
2347
9d475de5
AE
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Issues the "get_size" class method against the image's header
 * object and decodes the little-endian reply.  Returns 0 on
 * success or a negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Reply layout is defined by the OSD "rbd" class; keep packed. */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2381
2382static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2383{
2384 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2385 &rbd_dev->header.obj_order,
2386 &rbd_dev->header.image_size);
2387}
2388
1e130199
AE
/*
 * Fetch a format 2 image's object name prefix via the
 * "get_object_prefix" class method and record it in
 * rbd_dev->header.object_prefix (caller-owned; freed with the
 * header).  Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* The prefix is a length-prefixed string; extract a copy. */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2426
b1b5402a
AE
/*
 * Get the feature bits for an image snapshot (or the base image if
 * snap_id is CEPH_NOSNAP) via the "get_features" class method.
 * Fails with -ENXIO if the image requires incompatible features
 * this client does not implement.  Returns 0 on success or a
 * negative errno.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image we can't handle correctly. */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2460
2461static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2462{
2463 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2464 &rbd_dev->header.features);
2465}
2466
86b00e0d
AE
/*
 * Query the "get_parent" class method to learn whether this format 2
 * image is a clone.  On success, a parent spec (pool id, image id,
 * snap id) and the parent overlap are recorded in rbd_dev; if the
 * image has no parent (pool id CEPH_NOPOOL) the function succeeds
 * and records nothing.  Returns 0 or a negative errno.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	size_t len = 0;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case encoded reply size. */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	/* Decode pool id, image id string, snap id, and overlap. */
	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	parent_spec->image_id_len = len;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op if ownership transferred */

	return ret;
}
2532
9e15b77d
AE
/*
 * Look up an image's name given its id by consulting the rbd
 * directory object ("dir_get_name" class method).  Returns a
 * dynamically allocated name the caller must free, or NULL on any
 * failure (callers treat a missing name as non-fatal).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the request: the image id as a length-prefixed string. */
	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
				(u32) rbd_dev->spec->image_id_len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure is tolerated */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2582
2583/*
2584 * When a parent image gets probed, we only have the pool, image,
2585 * and snapshot ids but not the names of any of them. This call
2586 * is made later to fill in those names. It has to be done after
2587 * rbd_dev_snaps_update() has completed because some of the
2588 * information (in particular, snapshot name) is not available
2589 * until then.
2590 */
2591static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2592{
2593 struct ceph_osd_client *osdc;
2594 const char *name;
2595 void *reply_buf = NULL;
2596 int ret;
2597
2598 if (rbd_dev->spec->pool_name)
2599 return 0; /* Already have the names */
2600
2601 /* Look up the pool name */
2602
2603 osdc = &rbd_dev->rbd_client->client->osdc;
2604 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2605 if (!name)
2606 return -EIO; /* pool id too large (>= 2^31) */
2607
2608 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2609 if (!rbd_dev->spec->pool_name)
2610 return -ENOMEM;
2611
2612 /* Fetch the image name; tolerate failure here */
2613
2614 name = rbd_dev_image_name(rbd_dev);
2615 if (name) {
2616 rbd_dev->spec->image_name_len = strlen(name);
2617 rbd_dev->spec->image_name = (char *) name;
2618 } else {
2619 pr_warning(RBD_DRV_NAME "%d "
2620 "unable to get image name for image id %s\n",
2621 rbd_dev->major, rbd_dev->spec->image_id);
2622 }
2623
2624 /* Look up the snapshot name. */
2625
2626 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2627 if (!name) {
2628 ret = -EIO;
2629 goto out_err;
2630 }
2631 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2632 if(!rbd_dev->spec->snap_name)
2633 goto out_err;
2634
2635 return 0;
2636out_err:
2637 kfree(reply_buf);
2638 kfree(rbd_dev->spec->pool_name);
2639 rbd_dev->spec->pool_name = NULL;
2640
2641 return ret;
2642}
2643
6e14b1a6 2644static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2645{
2646 size_t size;
2647 int ret;
2648 void *reply_buf;
2649 void *p;
2650 void *end;
2651 u64 seq;
2652 u32 snap_count;
2653 struct ceph_snap_context *snapc;
2654 u32 i;
2655
2656 /*
2657 * We'll need room for the seq value (maximum snapshot id),
2658 * snapshot count, and array of that many snapshot ids.
2659 * For now we have a fixed upper limit on the number we're
2660 * prepared to receive.
2661 */
2662 size = sizeof (__le64) + sizeof (__le32) +
2663 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2664 reply_buf = kzalloc(size, GFP_KERNEL);
2665 if (!reply_buf)
2666 return -ENOMEM;
2667
2668 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2669 "rbd", "get_snapcontext",
2670 NULL, 0,
2671 reply_buf, size,
6e14b1a6 2672 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2673 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2674 if (ret < 0)
2675 goto out;
2676
2677 ret = -ERANGE;
2678 p = reply_buf;
2679 end = (char *) reply_buf + size;
2680 ceph_decode_64_safe(&p, end, seq, out);
2681 ceph_decode_32_safe(&p, end, snap_count, out);
2682
2683 /*
2684 * Make sure the reported number of snapshot ids wouldn't go
2685 * beyond the end of our buffer. But before checking that,
2686 * make sure the computed size of the snapshot context we
2687 * allocate is representable in a size_t.
2688 */
2689 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2690 / sizeof (u64)) {
2691 ret = -EINVAL;
2692 goto out;
2693 }
2694 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2695 goto out;
2696
2697 size = sizeof (struct ceph_snap_context) +
2698 snap_count * sizeof (snapc->snaps[0]);
2699 snapc = kmalloc(size, GFP_KERNEL);
2700 if (!snapc) {
2701 ret = -ENOMEM;
2702 goto out;
2703 }
2704
2705 atomic_set(&snapc->nref, 1);
2706 snapc->seq = seq;
2707 snapc->num_snaps = snap_count;
2708 for (i = 0; i < snap_count; i++)
2709 snapc->snaps[i] = ceph_decode_64(&p);
2710
2711 rbd_dev->header.snapc = snapc;
2712
2713 dout(" snap context seq = %llu, snap_count = %u\n",
2714 (unsigned long long) seq, (unsigned int) snap_count);
2715
2716out:
2717 kfree(reply_buf);
2718
2719 return 0;
2720}
2721
b8b1e2db
AE
/*
 * Fetch the name of the snapshot at index @which in the image's
 * snapshot context via the "get_snapshot_name" class method.
 * Returns a dynamically allocated name the caller must free, or an
 * ERR_PTR on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	/* The request carries the snap id in little-endian form. */
	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2765
2766static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2767 u64 *snap_size, u64 *snap_features)
2768{
2769 __le64 snap_id;
2770 u8 order;
2771 int ret;
2772
2773 snap_id = rbd_dev->header.snapc->snaps[which];
2774 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2775 if (ret)
2776 return ERR_PTR(ret);
2777 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2778 if (ret)
2779 return ERR_PTR(ret);
2780
2781 return rbd_dev_v2_snap_name(rbd_dev, which);
2782}
2783
2784static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2785 u64 *snap_size, u64 *snap_features)
2786{
2787 if (rbd_dev->image_format == 1)
2788 return rbd_dev_v1_snap_info(rbd_dev, which,
2789 snap_size, snap_features);
2790 if (rbd_dev->image_format == 2)
2791 return rbd_dev_v2_snap_info(rbd_dev, which,
2792 snap_size, snap_features);
2793 return ERR_PTR(-EINVAL);
2794}
2795
117973fb
AE
2796static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2797{
2798 int ret;
2799 __u8 obj_order;
2800
2801 down_write(&rbd_dev->header_rwsem);
2802
2803 /* Grab old order first, to see if it changes */
2804
2805 obj_order = rbd_dev->header.obj_order,
2806 ret = rbd_dev_v2_image_size(rbd_dev);
2807 if (ret)
2808 goto out;
2809 if (rbd_dev->header.obj_order != obj_order) {
2810 ret = -EIO;
2811 goto out;
2812 }
2813 rbd_update_mapping_size(rbd_dev);
2814
2815 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2816 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2817 if (ret)
2818 goto out;
2819 ret = rbd_dev_snaps_update(rbd_dev);
2820 dout("rbd_dev_snaps_update returned %d\n", ret);
2821 if (ret)
2822 goto out;
2823 ret = rbd_dev_snaps_register(rbd_dev);
2824 dout("rbd_dev_snaps_register returned %d\n", ret);
2825out:
2826 up_write(&rbd_dev->header_rwsem);
2827
2828 return ret;
2829}
2830
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;	/* position in the new snapshot context */

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/* Classic sorted-merge: walk both sequences in parallel. */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		/*
		 * Note: when snap_id == CEPH_NOSNAP the context is
		 * exhausted, which forces links != head, so snap is
		 * non-NULL in this branch.
		 */
		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it was the mapped snapshot, it's gone. */
			if (rbd_dev->spec->snap_id == snap->id)
				rbd_dev->exists = false;
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		/*
		 * NOTE(review): this dout prints snap_count where
		 * index ("entry %u") appears intended — confirm and
		 * fix in a logging-only patch.
		 */
		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* Snapshot attributes are immutable. */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2935
304f6808
AE
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 *
 * The parent rbd device must itself already be registered; stops at
 * the first registration failure and returns its errno (0 if all
 * succeed).
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
2960
dfc5606d
YS
2961static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2962{
dfc5606d 2963 struct device *dev;
cd789ab9 2964 int ret;
dfc5606d
YS
2965
2966 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 2967
cd789ab9 2968 dev = &rbd_dev->dev;
dfc5606d
YS
2969 dev->bus = &rbd_bus_type;
2970 dev->type = &rbd_device_type;
2971 dev->parent = &rbd_root_dev;
2972 dev->release = rbd_dev_release;
de71a297 2973 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 2974 ret = device_register(dev);
dfc5606d 2975
dfc5606d 2976 mutex_unlock(&ctl_mutex);
cd789ab9 2977
dfc5606d 2978 return ret;
602adf40
YS
2979}
2980
dfc5606d
YS
/* Unregister the rbd device from the bus (counterpart of rbd_bus_add_dev()). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2985
59c2be1e
YS
2986static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2987{
2988 int ret, rc;
2989
2990 do {
0e6f322d 2991 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2992 if (ret == -ERANGE) {
117973fb 2993 rc = rbd_dev_refresh(rbd_dev, NULL);
59c2be1e
YS
2994 if (rc < 0)
2995 return rc;
2996 }
2997 } while (ret == -ERANGE);
2998
2999 return ret;
3000}
3001
/* Highest device id handed out so far (ids start at 1). */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 3018
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 *
 * If the released id was the current maximum, the remaining list is
 * scanned (backward, where the maximum is most likely) for the new
 * maximum, which is installed with cmpxchg so a concurrent
 * rbd_dev_id_get() is never overwritten.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	/* NOTE(review): dev_id is stored in an int here but printed
	 * via a u64 cast elsewhere — confirm the field's width. */
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3069
e28fff26
AE
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* Find start of token */
	*buf = p;

	return strcspn(p, spaces);	/* Return token length */
}
3088
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* Same whitespace set next_token() uses. */
	static const char spaces[] = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, spaces);
	len = strcspn(*buf, spaces);
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3118
ea3352f4
AE
3119/*
3120 * Finds the next token in *buf, dynamically allocates a buffer big
3121 * enough to hold a copy of it, and copies the token into the new
3122 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3123 * that a duplicate buffer is created even for a zero-length token.
3124 *
3125 * Returns a pointer to the newly-allocated duplicate, or a null
3126 * pointer if memory for the duplicate was not available. If
3127 * the lenp argument is a non-null pointer, the length of the token
3128 * (not including the '\0') is returned in *lenp.
3129 *
3130 * If successful, the *buf pointer will be updated to point beyond
3131 * the end of the found token.
3132 *
3133 * Note: uses GFP_KERNEL for allocation.
3134 */
3135static inline char *dup_token(const char **buf, size_t *lenp)
3136{
3137 char *dup;
3138 size_t len;
3139
3140 len = next_token(buf);
3141 dup = kmalloc(len + 1, GFP_KERNEL);
3142 if (!dup)
3143 return NULL;
3144
3145 memcpy(dup, *buf, len);
3146 *(dup + len) = '\0';
3147 *buf += len;
3148
3149 if (lenp)
3150 *lenp = len;
3151
3152 return dup;
3153}
3154
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;	/* Missing monitor address(es) */
	/*
	 * The monitor address list is not copied; it points into the
	 * caller's buffer and is handed to ceph_parse_options() below
	 * as a [start, end) range.
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options)
		goto out_err;	/* Missing options */

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name)
		goto out_err;	/* Missing pool name */

	spec->image_name = dup_token(&buf, &spec->image_name_len);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	memcpy(spec->snap_name, buf, len);
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * ceph_parse_options() consumes the ceph-specific options and
	 * routes any rbd-specific ones back through
	 * parse_rbd_opts_token(), which fills in rbd_opts.
	 */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: transfer ownership of all three results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3291
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class on the OSD */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string; result is freshly allocated */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->spec->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		/* Keep the documented contract: image_id stays NULL on error */
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3369
/*
 * Probe and populate metadata for a format 1 (original-format) rbd
 * image.  Fills in the spec's (empty) image id, the header object
 * name, and the on-disk header.  Returns 0 on success or a negative
 * errno; on failure the fields set here are freed and reset to NULL.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;
	rbd_dev->spec->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	/* Undo everything this function may have set up */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3419
/*
 * Probe and populate metadata for a format 2 image.  The image id
 * must already have been recorded in rbd_dev->spec by the caller
 * (rbd_dev_image_id()).  Fetches size/order, object prefix,
 * features, optional parent (layering) info, and the snapshot
 * context from the OSDs.  Returns 0 on success or a negative errno;
 * on failure everything acquired here is released in reverse order.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Release acquired state in reverse order of acquisition */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3492
/*
 * Finish setting up a freshly probed rbd device: refresh snapshots,
 * finalize the mapping, allocate a device id and block major number,
 * create the disk, register with the rbd bus/sysfs, set up the
 * watch, and finally announce the disk.  Returns 0 on success or a
 * negative errno with everything unwound.
 *
 * NOTE: once rbd_bus_add_dev() succeeds, error cleanup is handled by
 * the sysfs release path (rbd_bus_del_dev()), not by the unwind
 * labels at the bottom.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3574
a30b71b9
AE
3575/*
3576 * Probe for the existence of the header object for the given rbd
3577 * device. For format 2 images this includes determining the image
3578 * id.
3579 */
3580static int rbd_dev_probe(struct rbd_device *rbd_dev)
3581{
3582 int ret;
3583
3584 /*
3585 * Get the id from the image id object. If it's not a
3586 * format 2 image, we'll get ENOENT back, and we'll assume
3587 * it's a format 1 image.
3588 */
3589 ret = rbd_dev_image_id(rbd_dev);
3590 if (ret)
3591 ret = rbd_dev_v1_probe(rbd_dev);
3592 else
3593 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 3594 if (ret) {
a30b71b9
AE
3595 dout("probe failed, returning %d\n", ret);
3596
83a06263
AE
3597 return ret;
3598 }
3599
3600 ret = rbd_dev_probe_finish(rbd_dev);
3601 if (ret)
3602 rbd_header_free(&rbd_dev->header);
3603
a30b71b9
AE
3604 return ret;
3605}
3606
/*
 * Handler for writes to /sys/bus/rbd/add: parse the mapping request,
 * acquire a ceph client, resolve the pool, create the rbd_device and
 * probe/register it.  Returns count (the full write consumed) on
 * success, or a negative errno.
 *
 * Ownership chain: ceph_opts -> rbd client -> rbd_dev, and
 * spec -> rbd_dev.  Each local pointer is set to NULL once ownership
 * has been handed off, so the error labels only free what this
 * function still owns.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	/* Only read_only survives from rbd_opts; the rest is ceph's */
	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3672
de71a297 3673static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3674{
3675 struct list_head *tmp;
3676 struct rbd_device *rbd_dev;
3677
e124a82f 3678 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3679 list_for_each(tmp, &rbd_dev_list) {
3680 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3681 if (rbd_dev->dev_id == dev_id) {
e124a82f 3682 spin_unlock(&rbd_dev_list_lock);
602adf40 3683 return rbd_dev;
e124a82f 3684 }
602adf40 3685 }
e124a82f 3686 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3687 return NULL;
3688}
3689
/*
 * Device-model release callback for an rbd device: tears down the
 * watch, the block device, the header, and the device id, then drops
 * the module reference taken in rbd_add().  Invoked by the driver
 * core when the device's last reference goes away
 * (via rbd_bus_del_dev()).
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request, if one was registered */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);


	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3719
/*
 * Handler for writes to /sys/bus/rbd/remove: parse the target device
 * id and unregister that device.  Returns count on success, -ENOENT
 * if no such device exists, -EBUSY if it is still open, or a parse
 * error.  Serialized against other control operations by ctl_mutex.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* Refuse to remove a mapped image that is still open */
	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	/* Final teardown continues in rbd_dev_release() via the bus */
	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
3759
602adf40
YS
3760/*
3761 * create control files in sysfs
dfc5606d 3762 * /sys/bus/rbd/...
602adf40
YS
3763 */
3764static int rbd_sysfs_init(void)
3765{
dfc5606d 3766 int ret;
602adf40 3767
fed4c143 3768 ret = device_register(&rbd_root_dev);
21079786 3769 if (ret < 0)
dfc5606d 3770 return ret;
602adf40 3771
fed4c143
AE
3772 ret = bus_register(&rbd_bus_type);
3773 if (ret < 0)
3774 device_unregister(&rbd_root_dev);
602adf40 3775
602adf40
YS
3776 return ret;
3777}
3778
/* Undo rbd_sysfs_init(): unregister the bus, then the root device. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3784
3785int __init rbd_init(void)
3786{
3787 int rc;
3788
3789 rc = rbd_sysfs_init();
3790 if (rc)
3791 return rc;
f0f8cef5 3792 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3793 return 0;
3794}
3795
/* Module unload: tear down the sysfs bus interface. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3800
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");