]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - drivers/block/rbd.c
rbd: move call osd op setup into rbd_osd_req_op_create()
[mirror_ubuntu-artful-kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
2647ba38 55/* It might be useful to have these defined elsewhere */
df111be6 56
2647ba38
AE
57#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
df111be6 61
f0f8cef5
AE
62#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
64
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
d4b125e9
AE
67#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
35d489f9 71#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
72#define RBD_MAX_OPT_LEN 1024
73
74#define RBD_SNAP_HEAD_NAME "-"
75
9e15b77d
AE
76/* This allows a single page to hold an image name sent by OSD */
77#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 78#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 79
1e130199 80#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 81
d889140c
AE
82/* Feature bits */
83
84#define RBD_FEATURE_LAYERING 1
85
86/* Features supported by this (client software) implementation. */
87
88#define RBD_FEATURES_ALL (0)
89
81a89793
AE
90/*
91 * An RBD device name will be "rbd#", where the "rbd" comes from
92 * RBD_DRV_NAME above, and # is a unique integer identifier.
93 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
94 * enough to hold all possible device names.
95 */
602adf40 96#define DEV_NAME_LEN 32
81a89793 97#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 98
cc0538b6 99#define RBD_READ_ONLY_DEFAULT false
59c2be1e 100
602adf40
YS
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names */
	u64 features;		/* RBD_FEATURE_* bits; 0 for v1 images */
	__u8 obj_order;		/* object size is 1 << obj_order bytes */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size, in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* copy of the on-disk snapshot name data */
	u64 *snap_sizes;	/* per-snapshot image sizes */

	u64 obj_version;	/* header object version */
};
120
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	char *image_name;	/* may be NULL (see comment above) */

	u64 snap_id;
	char *snap_name;

	struct kref kref;	/* shared between parent and child devices */
};
158
/* Per-mapping options parsed from the "add" command line */
struct rbd_options {
	bool read_only;		/* map the device read-only */
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying libceph client */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry in rbd_client_list */
};
171
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;		/* nonzero once this entry has completed */
	s32 rc;			/* completion status code */
	u64 bytes;		/* bytes transferred */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;		/* number of entries in status[] */
	int num_done;		/* entries completed so far, in order */
	struct kref kref;	/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* one per sub-request */
};

/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;
	int coll_index;			/* index into coll->status[] */
	struct rbd_req_coll *coll;
};

/* In-memory record of a single snapshot of an image */
struct rbd_snap {
	struct device dev;		/* sysfs representation */
	const char *name;
	u64 size;
	struct list_head node;		/* entry in rbd_dev->snaps */
	u64 id;
	u64 features;
};

/* Size/features/mode of the currently mapped image or snapshot */
struct rbd_mapping {
	u64 size;
	u64 features;
	bool read_only;
};
217
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;	/* shared, refcounted ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	atomic_t exists;	/* set once a mapping is established */
	struct rbd_spec *spec;	/* identity of the mapped image/snapshot */

	char *header_name;	/* presumably the header object name — confirm */

	struct ceph_file_layout layout;

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	struct rbd_spec *parent_spec;	/* non-NULL for a layered (child) image */
	u64 parent_overlap;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;

	struct list_head node;	/* NOTE(review): presumably rbd_dev_list entry */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
	unsigned long open_count;	/* guarded by ctl_mutex (see rbd_open) */
};
262
602adf40 263static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 264
602adf40 265static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
266static DEFINE_SPINLOCK(rbd_dev_list_lock);
267
432b8587
AE
268static LIST_HEAD(rbd_client_list); /* clients */
269static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 270
304f6808
AE
271static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
272static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
273
dfc5606d 274static void rbd_dev_release(struct device *dev);
41f38c2b 275static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 276
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* sysfs bus attributes: /sys/bus/rbd/{add,remove}, write-only (root) */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name = "rbd",
	.bus_attrs = rbd_bus_attrs,
};

/* No-op release: rbd_root_dev is static, not dynamically allocated */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
301
/*
 * rbd_warn() - emit a KERN_WARNING message tagged with the most
 * specific identification available for the device: disk name, then
 * image name, then image id, then the raw pointer.  A NULL rbd_dev
 * is allowed.  The trailing newline is supplied here, so @fmt should
 * not include one.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
328
/*
 * rbd_assert() - check an invariant; log the failed expression and
 * BUG() if it does not hold.  Compiles to a no-op unless RBD_DEBUG
 * is defined (see the top of this file).
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 341
117973fb
AE
342static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
343static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 344
/*
 * Block device open: refuse writable opens of a read-only mapping,
 * then pin the device (sysfs reference) and bump open_count under
 * ctl_mutex.  Paired with rbd_release().
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
360
/*
 * Block device release: drop the open count and the device reference
 * taken in rbd_open(), under the same ctl_mutex.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
373
/* Block device operations; only open/release are provided here */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
379
380/*
381 * Initialize an rbd client instance.
43ae4701 382 * We own *ceph_opts.
602adf40 383 */
f8c38929 384static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
385{
386 struct rbd_client *rbdc;
387 int ret = -ENOMEM;
388
389 dout("rbd_client_create\n");
390 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
391 if (!rbdc)
392 goto out_opt;
393
394 kref_init(&rbdc->kref);
395 INIT_LIST_HEAD(&rbdc->node);
396
bc534d86
AE
397 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
398
43ae4701 399 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 400 if (IS_ERR(rbdc->client))
bc534d86 401 goto out_mutex;
43ae4701 402 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
403
404 ret = ceph_open_session(rbdc->client);
405 if (ret < 0)
406 goto out_err;
407
432b8587 408 spin_lock(&rbd_client_list_lock);
602adf40 409 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 410 spin_unlock(&rbd_client_list_lock);
602adf40 411
bc534d86
AE
412 mutex_unlock(&ctl_mutex);
413
602adf40
YS
414 dout("rbd_client_create created %p\n", rbdc);
415 return rbdc;
416
417out_err:
418 ceph_destroy_client(rbdc->client);
bc534d86
AE
419out_mutex:
420 mutex_unlock(&ctl_mutex);
602adf40
YS
421 kfree(rbdc);
422out_opt:
43ae4701
AE
423 if (ceph_opts)
424 ceph_destroy_options(ceph_opts);
28f259b7 425 return ERR_PTR(ret);
602adf40
YS
426}
427
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.  Returns NULL when nothing
 * matches, or immediately when CEPH_OPT_NOSHARE forbids sharing.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* take the ref before dropping the list lock */
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
452
/*
 * mount options
 *
 * Token values below Opt_last_int take an integer argument, those
 * between Opt_last_int and Opt_last_string take a string, and those
 * between Opt_last_string and Opt_last_bool are Boolean flags.
 * Currently only the Boolean read-only/read-write flags exist.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

/* Token table for match_token(); consumed by parse_rbd_opts_token() */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
477
/*
 * Parse one rbd mount-option token into @private, which is a
 * struct rbd_options (NOTE(review): presumably invoked as a callback
 * for each token libceph's option parser does not consume — confirm
 * against the caller).  Returns 0 on success, -EINVAL for an
 * unrecognized token, or the match_int() error for a bad int arg.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned something not in the enum */
		rbd_assert(false);
		break;
	}
	return 0;
}
518
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Ownership of @ceph_opts is consumed either
 * way: destroyed here when an existing client is reused, or handed to
 * rbd_client_create() otherwise.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}
535
/*
 * Destroy ceph client.  kref release callback; unlinks the client
 * from rbd_client_list, taking rbd_client_list_lock itself (so the
 * caller must NOT already hold it).
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node.  If it's not referenced anymore,
 * release it.  A NULL @rbdc is a no-op.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}
563
/*
 * Destroy requests collection: kref release callback for
 * struct rbd_req_coll.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
602adf40 575
a30b71b9
AE
576static bool rbd_image_format_valid(u32 image_format)
577{
578 return image_format == 1 || image_format == 2;
579}
580
/*
 * Sanity-check a format 1 on-disk image header: magic text, object
 * order in a usable range, and snapshot count/name sizes that won't
 * overflow later size_t arithmetic in rbd_header_from_disk().
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
619
602adf40
YS
620/*
621 * Create a new header structure, translate header format from the on-disk
622 * header.
623 */
624static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 625 struct rbd_image_header_ondisk *ondisk)
602adf40 626{
ccece235 627 u32 snap_count;
58c17b0e 628 size_t len;
d2bb24e5 629 size_t size;
621901d6 630 u32 i;
602adf40 631
6a52325f
AE
632 memset(header, 0, sizeof (*header));
633
103a150f
AE
634 snap_count = le32_to_cpu(ondisk->snap_count);
635
58c17b0e
AE
636 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
637 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 638 if (!header->object_prefix)
602adf40 639 return -ENOMEM;
58c17b0e
AE
640 memcpy(header->object_prefix, ondisk->object_prefix, len);
641 header->object_prefix[len] = '\0';
00f1f36f 642
602adf40 643 if (snap_count) {
f785cc1d
AE
644 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
645
621901d6
AE
646 /* Save a copy of the snapshot names */
647
f785cc1d
AE
648 if (snap_names_len > (u64) SIZE_MAX)
649 return -EIO;
650 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 651 if (!header->snap_names)
6a52325f 652 goto out_err;
f785cc1d
AE
653 /*
654 * Note that rbd_dev_v1_header_read() guarantees
655 * the ondisk buffer we're working with has
656 * snap_names_len bytes beyond the end of the
657 * snapshot id array, this memcpy() is safe.
658 */
659 memcpy(header->snap_names, &ondisk->snaps[snap_count],
660 snap_names_len);
6a52325f 661
621901d6
AE
662 /* Record each snapshot's size */
663
d2bb24e5
AE
664 size = snap_count * sizeof (*header->snap_sizes);
665 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 666 if (!header->snap_sizes)
6a52325f 667 goto out_err;
621901d6
AE
668 for (i = 0; i < snap_count; i++)
669 header->snap_sizes[i] =
670 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 671 } else {
ccece235 672 WARN_ON(ondisk->snap_names_len);
602adf40
YS
673 header->snap_names = NULL;
674 header->snap_sizes = NULL;
675 }
849b4260 676
34b13184 677 header->features = 0; /* No features support in v1 images */
602adf40
YS
678 header->obj_order = ondisk->options.order;
679 header->crypt_type = ondisk->options.crypt_type;
680 header->comp_type = ondisk->options.comp_type;
6a52325f 681
621901d6
AE
682 /* Allocate and fill in the snapshot context */
683
f84344f3 684 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
685 size = sizeof (struct ceph_snap_context);
686 size += snap_count * sizeof (header->snapc->snaps[0]);
687 header->snapc = kzalloc(size, GFP_KERNEL);
688 if (!header->snapc)
689 goto out_err;
602adf40
YS
690
691 atomic_set(&header->snapc->nref, 1);
505cbb9b 692 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 693 header->snapc->num_snaps = snap_count;
621901d6
AE
694 for (i = 0; i < snap_count; i++)
695 header->snapc->snaps[i] =
696 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
697
698 return 0;
699
6a52325f 700out_err:
849b4260 701 kfree(header->snap_sizes);
ccece235 702 header->snap_sizes = NULL;
602adf40 703 kfree(header->snap_names);
ccece235 704 header->snap_names = NULL;
6a52325f
AE
705 kfree(header->object_prefix);
706 header->object_prefix = NULL;
ccece235 707
00f1f36f 708 return -ENOMEM;
602adf40
YS
709}
710
/*
 * Return the name of the snapshot with the given id, or
 * RBD_SNAP_HEAD_NAME for CEPH_NOSNAP.  Returns NULL if no snapshot
 * with that id is on the device's snapshot list.
 */
static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}

/*
 * Look up a snapshot by name; if found, record its id in the device
 * spec and its size/features in the mapping.  Returns 0 on success,
 * -ENOENT if no snapshot has that name.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
742
/*
 * Establish the device mapping from its spec: the special snapshot
 * name "-" (RBD_SNAP_HEAD_NAME) maps the base image, otherwise the
 * named snapshot is looked up and the mapping forced read-only.
 * Marks the device as existing on success.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		/* snapshots can only be mapped read-only */
		rbd_dev->mapping.read_only = true;
	}
	atomic_set(&rbd_dev->exists, 1);
done:
	return ret;
}
763
/*
 * Free everything rbd_header_from_disk() allocated.  Safe to call on
 * a header with NULL pointers; pointers are cleared afterwards to
 * guard against double free.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
775
/*
 * Build the object name for the segment containing image byte
 * @offset: "<object_prefix>.<segment number as 12 hex digits>".
 * Returns a kmalloc'd string the caller must free, or NULL on
 * allocation or formatting failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		/* snprintf error or truncated name */
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
602adf40 797
65ccfe21
AE
798static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
799{
800 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 801
65ccfe21
AE
802 return offset & (segment_size - 1);
803}
804
/*
 * Return how many bytes of a request starting at image offset
 * @offset can be served from a single segment: @length, clipped to
 * the end of the containing segment.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	/* offset + length must not wrap around u64 */
	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
818
/*
 * Number of segments the byte range [ofs, ofs + len) touches.
 * Returns 0 for an empty range and -ERANGE if ofs + len would wrap.
 *
 * NOTE(review): the count is computed in u64 but returned as int; a
 * huge range with a small obj_order could overflow the return type —
 * confirm callers bound len.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
835
029bcbd8
JD
836/*
837 * returns the size of an object in the image
838 */
839static u64 rbd_obj_bytes(struct rbd_image_header *header)
840{
841 return 1 << header->obj_order;
842}
843
/*
 * bio helpers
 */

/* Drop one reference on every bio in the chain. */
static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}
858
/*
 * zeros a bio chain, starting at specific offset
 *
 * Every byte of data in the chain at or beyond @start_ofs (measured
 * from the start of the chain) is cleared; segments entirely before
 * the offset are untouched.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* bytes seen so far across the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero from start_ofs (or segment start) */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
					bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
885
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns NULL on bad arguments or allocation failure.  The clone
 * shares the source's pages (BIO_CLONED); only the bio_vec table is
 * copied and its first/last entries trimmed to the requested range.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;	/* byte offset of the range within segment idx */

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
		vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
966
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* source chain ended before len bytes were cloned */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
1029
/*
 * Allocate a zeroed osd request op with the given opcode.  Note that
 * the ofs and len parameters are accepted but unused here; callers
 * needing the extent fields filled in should use
 * rbd_osd_req_op_create() instead.  Returns NULL on allocation
 * failure.
 */
static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len)
{
	struct ceph_osd_req_op *op;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;

	op->op = opcode;

	return op;
}

/* Free an op allocated by rbd_create_rw_op(); NULL is a no-op. */
static void rbd_destroy_op(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1047
8d23bf29
AE
1048struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1049{
1050 struct ceph_osd_req_op *op;
1051 va_list args;
2647ba38 1052 size_t size;
8d23bf29
AE
1053
1054 op = kzalloc(sizeof (*op), GFP_NOIO);
1055 if (!op)
1056 return NULL;
1057 op->op = opcode;
1058 va_start(args, opcode);
1059 switch (opcode) {
1060 case CEPH_OSD_OP_READ:
1061 case CEPH_OSD_OP_WRITE:
1062 /* rbd_osd_req_op_create(READ, offset, length) */
1063 /* rbd_osd_req_op_create(WRITE, offset, length) */
1064 op->extent.offset = va_arg(args, u64);
1065 op->extent.length = va_arg(args, u64);
1066 if (opcode == CEPH_OSD_OP_WRITE)
1067 op->payload_len = op->extent.length;
1068 break;
2647ba38
AE
1069 case CEPH_OSD_OP_CALL:
1070 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1071 op->cls.class_name = va_arg(args, char *);
1072 size = strlen(op->cls.class_name);
1073 rbd_assert(size <= (size_t) U8_MAX);
1074 op->cls.class_len = size;
1075 op->payload_len = size;
1076
1077 op->cls.method_name = va_arg(args, char *);
1078 size = strlen(op->cls.method_name);
1079 rbd_assert(size <= (size_t) U8_MAX);
1080 op->cls.method_len = size;
1081 op->payload_len += size;
1082
1083 op->cls.argc = 0;
1084 op->cls.indata = va_arg(args, void *);
1085 size = va_arg(args, size_t);
1086 rbd_assert(size <= (size_t) U32_MAX);
1087 op->cls.indata_len = (u32) size;
1088 op->payload_len += size;
1089 break;
8d23bf29
AE
1090 default:
1091 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1092 kfree(op);
1093 op = NULL;
1094 break;
1095 }
1096 va_end(args);
1097
1098 return op;
1099}
1100
/* Free an op created by rbd_osd_req_op_create() (NULL is tolerated). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1105
1fec7093
YS
1106static void rbd_coll_end_req_index(struct request *rq,
1107 struct rbd_req_coll *coll,
1108 int index,
8986cb37 1109 s32 ret, u64 len)
1fec7093
YS
1110{
1111 struct request_queue *q;
1112 int min, max, i;
1113
bd919d45 1114 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
8986cb37 1115 coll, index, (int)ret, (unsigned long long)len);
1fec7093
YS
1116
1117 if (!rq)
1118 return;
1119
1120 if (!coll) {
1121 blk_end_request(rq, ret, len);
1122 return;
1123 }
1124
1125 q = rq->q;
1126
1127 spin_lock_irq(q->queue_lock);
1128 coll->status[index].done = 1;
1129 coll->status[index].rc = ret;
1130 coll->status[index].bytes = len;
1131 max = min = coll->num_done;
1132 while (max < coll->total && coll->status[max].done)
1133 max++;
1134
1135 for (i = min; i<max; i++) {
8986cb37 1136 __blk_end_request(rq, (int)coll->status[i].rc,
1fec7093
YS
1137 coll->status[i].bytes);
1138 coll->num_done++;
1139 kref_put(&coll->kref, rbd_coll_release);
1140 }
1141 spin_unlock_irq(q->queue_lock);
1142}
1143
725afc97 1144static void rbd_coll_end_req(struct rbd_request *rbd_req,
8986cb37 1145 s32 ret, u64 len)
1fec7093 1146{
725afc97
AE
1147 rbd_coll_end_req_index(rbd_req->rq,
1148 rbd_req->coll, rbd_req->coll_index,
1149 ret, len);
1fec7093
YS
1150}
1151
602adf40
YS
1152/*
1153 * Send ceph osd request
1154 */
1155static int rbd_do_request(struct request *rq,
0ce1a794 1156 struct rbd_device *rbd_dev,
602adf40
YS
1157 struct ceph_snap_context *snapc,
1158 u64 snapid,
aded07ea 1159 const char *object_name, u64 ofs, u64 len,
602adf40
YS
1160 struct bio *bio,
1161 struct page **pages,
1162 int num_pages,
1163 int flags,
30573d68 1164 struct ceph_osd_req_op *op,
1fec7093
YS
1165 struct rbd_req_coll *coll,
1166 int coll_index,
5f29ddd4
AE
1167 void (*rbd_cb)(struct ceph_osd_request *,
1168 struct ceph_msg *),
59c2be1e
YS
1169 struct ceph_osd_request **linger_req,
1170 u64 *ver)
602adf40 1171{
2e53c6c3 1172 struct ceph_osd_client *osdc;
5f29ddd4 1173 struct ceph_osd_request *osd_req;
2e53c6c3 1174 struct rbd_request *rbd_req = NULL;
602adf40 1175 struct timespec mtime = CURRENT_TIME;
2e53c6c3 1176 int ret;
602adf40 1177
f7760dad
AE
1178 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1179 object_name, (unsigned long long) ofs,
1180 (unsigned long long) len, coll, coll_index);
602adf40 1181
0ce1a794 1182 osdc = &rbd_dev->rbd_client->client->osdc;
30573d68 1183 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
2e53c6c3
AE
1184 if (!osd_req)
1185 return -ENOMEM;
602adf40 1186
d178a9e7 1187 osd_req->r_flags = flags;
54a54007
AE
1188 osd_req->r_pages = pages;
1189 if (bio) {
1190 osd_req->r_bio = bio;
1191 bio_get(osd_req->r_bio);
1192 }
602adf40 1193
18216657 1194 if (coll) {
2e53c6c3
AE
1195 ret = -ENOMEM;
1196 rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
1197 if (!rbd_req)
1198 goto done_osd_req;
1199
1200 rbd_req->rq = rq;
1201 rbd_req->bio = bio;
1202 rbd_req->pages = pages;
1203 rbd_req->len = len;
1204 rbd_req->coll = coll;
18216657 1205 rbd_req->coll_index = coll_index;
2e53c6c3 1206 }
602adf40 1207
2e53c6c3 1208 osd_req->r_callback = rbd_cb;
5f29ddd4 1209 osd_req->r_priv = rbd_req;
602adf40 1210
5f29ddd4
AE
1211 strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
1212 osd_req->r_oid_len = strlen(osd_req->r_oid);
602adf40 1213
0903e875 1214 osd_req->r_file_layout = rbd_dev->layout; /* struct */
e01e7927
AE
1215 osd_req->r_num_pages = calc_pages_for(ofs, len);
1216 osd_req->r_page_alignment = ofs & ~PAGE_MASK;
602adf40 1217
30573d68 1218 ceph_osdc_build_request(osd_req, ofs, len, 1, op,
ae7ca4a3 1219 snapc, snapid, &mtime);
602adf40 1220
59c2be1e 1221 if (linger_req) {
5f29ddd4
AE
1222 ceph_osdc_set_request_linger(osdc, osd_req);
1223 *linger_req = osd_req;
59c2be1e
YS
1224 }
1225
5f29ddd4 1226 ret = ceph_osdc_start_request(osdc, osd_req, false);
602adf40
YS
1227 if (ret < 0)
1228 goto done_err;
1229
1230 if (!rbd_cb) {
5f29ddd4
AE
1231 u64 version;
1232
1233 ret = ceph_osdc_wait_request(osdc, osd_req);
1234 version = le64_to_cpu(osd_req->r_reassert_version.version);
59c2be1e 1235 if (ver)
5f29ddd4
AE
1236 *ver = version;
1237 dout("reassert_ver=%llu\n", (unsigned long long) version);
1238 ceph_osdc_put_request(osd_req);
602adf40
YS
1239 }
1240 return ret;
1241
1242done_err:
2e53c6c3
AE
1243 if (bio)
1244 bio_chain_put(osd_req->r_bio);
725afc97 1245 kfree(rbd_req);
2e53c6c3
AE
1246done_osd_req:
1247 ceph_osdc_put_request(osd_req);
1248
602adf40
YS
1249 return ret;
1250}
1251
1252/*
1253 * Ceph osd op callback
1254 */
5f29ddd4 1255static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
602adf40 1256{
5f29ddd4 1257 struct rbd_request *rbd_req = osd_req->r_priv;
602adf40
YS
1258 struct ceph_osd_reply_head *replyhead;
1259 struct ceph_osd_op *op;
8986cb37 1260 s32 rc;
602adf40
YS
1261 u64 bytes;
1262 int read_op;
1263
1264 /* parse reply */
1265 replyhead = msg->front.iov_base;
1266 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1267 op = (void *)(replyhead + 1);
8986cb37 1268 rc = (s32)le32_to_cpu(replyhead->result);
602adf40 1269 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1270 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1271
bd919d45
AE
1272 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1273 (unsigned long long) bytes, read_op, (int) rc);
602adf40 1274
8986cb37 1275 if (rc == (s32)-ENOENT && read_op) {
725afc97 1276 zero_bio_chain(rbd_req->bio, 0);
602adf40 1277 rc = 0;
725afc97
AE
1278 } else if (rc == 0 && read_op && bytes < rbd_req->len) {
1279 zero_bio_chain(rbd_req->bio, bytes);
1280 bytes = rbd_req->len;
602adf40
YS
1281 }
1282
725afc97 1283 rbd_coll_end_req(rbd_req, rc, bytes);
602adf40 1284
725afc97
AE
1285 if (rbd_req->bio)
1286 bio_chain_put(rbd_req->bio);
602adf40 1287
5f29ddd4 1288 ceph_osdc_put_request(osd_req);
725afc97 1289 kfree(rbd_req);
602adf40
YS
1290}
1291
5f29ddd4
AE
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1297
602adf40
YS
1298/*
1299 * Do a synchronous ceph osd operation
1300 */
0ce1a794 1301static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40 1302 int flags,
30573d68 1303 struct ceph_osd_req_op *op,
aded07ea 1304 const char *object_name,
f8d4de6e
AE
1305 u64 ofs, u64 inbound_size,
1306 char *inbound,
59c2be1e
YS
1307 struct ceph_osd_request **linger_req,
1308 u64 *ver)
602adf40
YS
1309{
1310 int ret;
1311 struct page **pages;
1312 int num_pages;
913d2fdc 1313
30573d68 1314 rbd_assert(op != NULL);
602adf40 1315
f8d4de6e 1316 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1317 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1318 if (IS_ERR(pages))
1319 return PTR_ERR(pages);
602adf40 1320
25704ac9 1321 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
f8d4de6e 1322 object_name, ofs, inbound_size, NULL,
602adf40
YS
1323 pages, num_pages,
1324 flags,
30573d68 1325 op,
1fec7093 1326 NULL, 0,
59c2be1e
YS
1327 NULL,
1328 linger_req, ver);
602adf40 1329 if (ret < 0)
913d2fdc 1330 goto done;
602adf40 1331
f8d4de6e
AE
1332 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1333 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1334
602adf40
YS
1335done:
1336 ceph_release_page_vector(pages, num_pages);
1337 return ret;
1338}
1339
1340/*
1341 * Do an asynchronous ceph osd operation
1342 */
1343static int rbd_do_op(struct request *rq,
0ce1a794 1344 struct rbd_device *rbd_dev,
602adf40 1345 struct ceph_snap_context *snapc,
602adf40 1346 u64 ofs, u64 len,
1fec7093
YS
1347 struct bio *bio,
1348 struct rbd_req_coll *coll,
1349 int coll_index)
602adf40
YS
1350{
1351 char *seg_name;
1352 u64 seg_ofs;
1353 u64 seg_len;
1354 int ret;
139b4318 1355 struct ceph_osd_req_op *op;
ff2e4bb5
AE
1356 int opcode;
1357 int flags;
4634246d 1358 u64 snapid;
602adf40 1359
65ccfe21 1360 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1361 if (!seg_name)
1362 return -ENOMEM;
65ccfe21
AE
1363 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1364 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40 1365
ff2e4bb5
AE
1366 if (rq_data_dir(rq) == WRITE) {
1367 opcode = CEPH_OSD_OP_WRITE;
1368 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
4634246d 1369 snapid = CEPH_NOSNAP;
ff2e4bb5
AE
1370 } else {
1371 opcode = CEPH_OSD_OP_READ;
1372 flags = CEPH_OSD_FLAG_READ;
a7b4c65f 1373 rbd_assert(!snapc);
0d7dbfce 1374 snapid = rbd_dev->spec->snap_id;
ff2e4bb5 1375 }
602adf40 1376
57cfc106 1377 ret = -ENOMEM;
8d23bf29 1378 op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
139b4318 1379 if (!op)
602adf40
YS
1380 goto done;
1381
1382 /* we've taken care of segment sizes earlier when we
1383 cloned the bios. We should never have a segment
1384 truncated at this point */
aafb230e 1385 rbd_assert(seg_len == len);
602adf40
YS
1386
1387 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1388 seg_name, seg_ofs, seg_len,
1389 bio,
1390 NULL, 0,
1391 flags,
30573d68 1392 op,
1fec7093 1393 coll, coll_index,
59c2be1e 1394 rbd_req_cb, 0, NULL);
cd323ac0
AE
1395 if (ret < 0)
1396 rbd_coll_end_req_index(rq, coll, coll_index,
1397 (s32)ret, seg_len);
8d23bf29 1398 rbd_osd_req_op_destroy(op);
602adf40
YS
1399done:
1400 kfree(seg_name);
1401 return ret;
1402}
1403
602adf40
YS
1404/*
1405 * Request sync osd read
1406 */
0ce1a794 1407static int rbd_req_sync_read(struct rbd_device *rbd_dev,
aded07ea 1408 const char *object_name,
602adf40 1409 u64 ofs, u64 len,
59c2be1e
YS
1410 char *buf,
1411 u64 *ver)
602adf40 1412{
139b4318 1413 struct ceph_osd_req_op *op;
913d2fdc
AE
1414 int ret;
1415
8d23bf29 1416 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
139b4318 1417 if (!op)
913d2fdc
AE
1418 return -ENOMEM;
1419
25704ac9 1420 ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
30573d68 1421 op, object_name, ofs, len, buf, NULL, ver);
8d23bf29 1422 rbd_osd_req_op_destroy(op);
913d2fdc
AE
1423
1424 return ret;
602adf40
YS
1425}
1426
1427/*
59c2be1e
YS
1428 * Request sync osd watch
1429 */
0ce1a794 1430static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1431 u64 ver,
7f0a24d8 1432 u64 notify_id)
59c2be1e 1433{
139b4318 1434 struct ceph_osd_req_op *op;
11f77002
SW
1435 int ret;
1436
8d23bf29 1437 op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0);
139b4318 1438 if (!op)
57cfc106 1439 return -ENOMEM;
59c2be1e 1440
139b4318
AE
1441 op->watch.ver = cpu_to_le64(ver);
1442 op->watch.cookie = notify_id;
1443 op->watch.flag = 0;
59c2be1e 1444
0ce1a794 1445 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1446 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1447 NULL, 0,
59c2be1e 1448 CEPH_OSD_FLAG_READ,
30573d68 1449 op,
1fec7093 1450 NULL, 0,
59c2be1e
YS
1451 rbd_simple_req_cb, 0, NULL);
1452
139b4318 1453 rbd_destroy_op(op);
59c2be1e
YS
1454 return ret;
1455}
1456
1457static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1458{
0ce1a794 1459 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1460 u64 hver;
13143d2d
SW
1461 int rc;
1462
0ce1a794 1463 if (!rbd_dev)
59c2be1e
YS
1464 return;
1465
bd919d45
AE
1466 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1467 rbd_dev->header_name, (unsigned long long) notify_id,
1468 (unsigned int) opcode);
117973fb 1469 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1470 if (rc)
06ecc6cb
AE
1471 rbd_warn(rbd_dev, "got notification but failed to "
1472 " update snaps: %d\n", rc);
59c2be1e 1473
7f0a24d8 1474 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1475}
1476
1477/*
907703d0
AE
1478 * Request sync osd watch/unwatch. The value of "start" determines
1479 * whether a watch request is being initiated or torn down.
59c2be1e 1480 */
907703d0 1481static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
59c2be1e 1482{
139b4318 1483 struct ceph_osd_req_op *op;
907703d0
AE
1484 struct ceph_osd_request **linger_req = NULL;
1485 __le64 version = 0;
57cfc106 1486 int ret;
59c2be1e 1487
8d23bf29 1488 op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0);
139b4318 1489 if (!op)
57cfc106 1490 return -ENOMEM;
59c2be1e 1491
907703d0
AE
1492 if (start) {
1493 struct ceph_osd_client *osdc;
79e3057c 1494
907703d0
AE
1495 osdc = &rbd_dev->rbd_client->client->osdc;
1496 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1497 &rbd_dev->watch_event);
1498 if (ret < 0)
1499 goto done;
1500 version = cpu_to_le64(rbd_dev->header.obj_version);
1501 linger_req = &rbd_dev->watch_request;
1502 }
79e3057c 1503
907703d0 1504 op->watch.ver = version;
139b4318 1505 op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
907703d0 1506 op->watch.flag = (u8) start ? 1 : 0;
79e3057c 1507
25704ac9 1508 ret = rbd_req_sync_op(rbd_dev,
79e3057c 1509 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
907703d0
AE
1510 op, rbd_dev->header_name,
1511 0, 0, NULL, linger_req, NULL);
79e3057c 1512
907703d0
AE
1513 if (!start || ret < 0) {
1514 ceph_osdc_cancel_event(rbd_dev->watch_event);
1515 rbd_dev->watch_event = NULL;
1516 }
1517done:
139b4318 1518 rbd_destroy_op(op);
907703d0 1519
79e3057c
YS
1520 return ret;
1521}
1522
602adf40 1523/*
3cb4a687 1524 * Synchronous osd object method call
602adf40 1525 */
0ce1a794 1526static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1527 const char *object_name,
1528 const char *class_name,
1529 const char *method_name,
3cb4a687
AE
1530 const char *outbound,
1531 size_t outbound_size,
f8d4de6e
AE
1532 char *inbound,
1533 size_t inbound_size,
59c2be1e 1534 u64 *ver)
602adf40 1535{
139b4318 1536 struct ceph_osd_req_op *op;
57cfc106
AE
1537 int ret;
1538
3cb4a687
AE
1539 /*
1540 * Any input parameters required by the method we're calling
1541 * will be sent along with the class and method names as
1542 * part of the message payload. That data and its size are
1543 * supplied via the indata and indata_len fields (named from
1544 * the perspective of the server side) in the OSD request
1545 * operation.
1546 */
2647ba38
AE
1547 op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1548 method_name, outbound, outbound_size);
139b4318 1549 if (!op)
57cfc106 1550 return -ENOMEM;
602adf40 1551
30573d68 1552 ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
f8d4de6e
AE
1553 object_name, 0, inbound_size, inbound,
1554 NULL, ver);
602adf40 1555
2647ba38 1556 rbd_osd_req_op_destroy(op);
602adf40
YS
1557
1558 dout("cls_exec returned %d\n", ret);
1559 return ret;
1560}
1561
1fec7093
YS
1562static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1563{
1564 struct rbd_req_coll *coll =
1565 kzalloc(sizeof(struct rbd_req_coll) +
1566 sizeof(struct rbd_req_status) * num_reqs,
1567 GFP_ATOMIC);
1568
1569 if (!coll)
1570 return NULL;
1571 coll->total = num_reqs;
1572 kref_init(&coll->kref);
1573 return coll;
1574}
1575
8295cda7
AE
1576static int rbd_dev_do_request(struct request *rq,
1577 struct rbd_device *rbd_dev,
1578 struct ceph_snap_context *snapc,
1579 u64 ofs, unsigned int size,
1580 struct bio *bio_chain)
1581{
1582 int num_segs;
1583 struct rbd_req_coll *coll;
1584 unsigned int bio_offset;
1585 int cur_seg = 0;
1586
1587 dout("%s 0x%x bytes at 0x%llx\n",
1588 rq_data_dir(rq) == WRITE ? "write" : "read",
1589 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1590
1591 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1592 if (num_segs <= 0)
1593 return num_segs;
1594
1595 coll = rbd_alloc_coll(num_segs);
1596 if (!coll)
1597 return -ENOMEM;
1598
1599 bio_offset = 0;
1600 do {
1601 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1602 unsigned int clone_size;
1603 struct bio *bio_clone;
1604
1605 BUG_ON(limit > (u64)UINT_MAX);
1606 clone_size = (unsigned int)limit;
1607 dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);
1608
1609 kref_get(&coll->kref);
1610
1611 /* Pass a cloned bio chain via an osd request */
1612
1613 bio_clone = bio_chain_clone_range(&bio_chain,
1614 &bio_offset, clone_size,
1615 GFP_ATOMIC);
1616 if (bio_clone)
1617 (void)rbd_do_op(rq, rbd_dev, snapc,
1618 ofs, clone_size,
1619 bio_clone, coll, cur_seg);
1620 else
1621 rbd_coll_end_req_index(rq, coll, cur_seg,
1622 (s32)-ENOMEM,
1623 clone_size);
1624 size -= clone_size;
1625 ofs += clone_size;
1626
1627 cur_seg++;
1628 } while (size > 0);
1629 kref_put(&coll->kref, rbd_coll_release);
1630
1631 return 0;
1632}
1633
602adf40
YS
1634/*
1635 * block device queue callback
1636 */
1637static void rbd_rq_fn(struct request_queue *q)
1638{
1639 struct rbd_device *rbd_dev = q->queuedata;
b395e8b5 1640 bool read_only = rbd_dev->mapping.read_only;
602adf40 1641 struct request *rq;
602adf40 1642
00f1f36f 1643 while ((rq = blk_fetch_request(q))) {
b395e8b5
AE
1644 struct ceph_snap_context *snapc = NULL;
1645 unsigned int size = 0;
8295cda7 1646 int result;
602adf40 1647
602adf40
YS
1648 dout("fetched request\n");
1649
b395e8b5
AE
1650 /* Filter out block requests we don't understand */
1651
602adf40
YS
1652 if ((rq->cmd_type != REQ_TYPE_FS)) {
1653 __blk_end_request_all(rq, 0);
00f1f36f 1654 continue;
602adf40 1655 }
b395e8b5 1656 spin_unlock_irq(q->queue_lock);
602adf40 1657
a7b4c65f
AE
1658 /* Write requests need a reference to the snapshot context */
1659
1660 if (rq_data_dir(rq) == WRITE) {
1661 result = -EROFS;
1662 if (read_only) /* Can't write to a read-only device */
1663 goto out_end_request;
1664
1665 /*
1666 * Note that each osd request will take its
1667 * own reference to the snapshot context
1668 * supplied. The reference we take here
1669 * just guarantees the one we provide stays
1670 * valid.
1671 */
1672 down_read(&rbd_dev->header_rwsem);
b395e8b5 1673 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
a7b4c65f 1674 up_read(&rbd_dev->header_rwsem);
b395e8b5 1675 rbd_assert(snapc != NULL);
a7b4c65f 1676 } else if (!atomic_read(&rbd_dev->exists)) {
0d7dbfce 1677 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
d1d25646 1678 dout("request for non-existent snapshot");
b395e8b5
AE
1679 result = -ENXIO;
1680 goto out_end_request;
e88a36ec
JD
1681 }
1682
f7760dad 1683 size = blk_rq_bytes(rq);
b395e8b5
AE
1684 result = rbd_dev_do_request(rq, rbd_dev, snapc,
1685 blk_rq_pos(rq) * SECTOR_SIZE,
1686 size, rq->bio);
1687out_end_request:
a7b4c65f
AE
1688 if (snapc)
1689 ceph_put_snap_context(snapc);
8295cda7
AE
1690 spin_lock_irq(q->queue_lock);
1691 if (!size || result < 0)
1692 __blk_end_request_all(rq, result);
602adf40
YS
1693 }
1694}
1695
1696/*
1697 * a queue callback. Makes sure that we don't create a bio that spans across
1698 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1699 * which we handle later at bio_chain_clone_range()
602adf40
YS
1700 */
1701static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1702 struct bio_vec *bvec)
1703{
1704 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
1705 sector_t sector_offset;
1706 sector_t sectors_per_obj;
1707 sector_t obj_sector_offset;
1708 int ret;
1709
1710 /*
1711 * Find how far into its rbd object the partition-relative
1712 * bio start sector is to offset relative to the enclosing
1713 * device.
1714 */
1715 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1716 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1717 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1718
1719 /*
1720 * Compute the number of bytes from that offset to the end
1721 * of the object. Account for what's already used by the bio.
1722 */
1723 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1724 if (ret > bmd->bi_size)
1725 ret -= bmd->bi_size;
1726 else
1727 ret = 0;
1728
1729 /*
1730 * Don't send back more than was asked for. And if the bio
1731 * was empty, let the whole thing through because: "Note
1732 * that a block device *must* allow a single page to be
1733 * added to an empty bio."
1734 */
1735 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1736 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1737 ret = (int) bvec->bv_len;
1738
1739 return ret;
602adf40
YS
1740}
1741
1742static void rbd_free_disk(struct rbd_device *rbd_dev)
1743{
1744 struct gendisk *disk = rbd_dev->disk;
1745
1746 if (!disk)
1747 return;
1748
602adf40
YS
1749 if (disk->flags & GENHD_FL_UP)
1750 del_gendisk(disk);
1751 if (disk->queue)
1752 blk_cleanup_queue(disk->queue);
1753 put_disk(disk);
1754}
1755
1756/*
4156d998
AE
1757 * Read the complete header for the given rbd device.
1758 *
1759 * Returns a pointer to a dynamically-allocated buffer containing
1760 * the complete and validated header. Caller can pass the address
1761 * of a variable that will be filled in with the version of the
1762 * header object at the time it was read.
1763 *
1764 * Returns a pointer-coded errno if a failure occurs.
602adf40 1765 */
4156d998
AE
1766static struct rbd_image_header_ondisk *
1767rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1768{
4156d998 1769 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1770 u32 snap_count = 0;
4156d998
AE
1771 u64 names_size = 0;
1772 u32 want_count;
1773 int ret;
602adf40 1774
00f1f36f 1775 /*
4156d998
AE
1776 * The complete header will include an array of its 64-bit
1777 * snapshot ids, followed by the names of those snapshots as
1778 * a contiguous block of NUL-terminated strings. Note that
1779 * the number of snapshots could change by the time we read
1780 * it in, in which case we re-read it.
00f1f36f 1781 */
4156d998
AE
1782 do {
1783 size_t size;
1784
1785 kfree(ondisk);
1786
1787 size = sizeof (*ondisk);
1788 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1789 size += names_size;
1790 ondisk = kmalloc(size, GFP_KERNEL);
1791 if (!ondisk)
1792 return ERR_PTR(-ENOMEM);
1793
4775618d 1794 ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
4156d998
AE
1795 0, size,
1796 (char *) ondisk, version);
1797
1798 if (ret < 0)
1799 goto out_err;
1800 if (WARN_ON((size_t) ret < size)) {
1801 ret = -ENXIO;
06ecc6cb
AE
1802 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
1803 size, ret);
4156d998
AE
1804 goto out_err;
1805 }
1806 if (!rbd_dev_ondisk_valid(ondisk)) {
1807 ret = -ENXIO;
06ecc6cb 1808 rbd_warn(rbd_dev, "invalid header");
4156d998 1809 goto out_err;
81e759fb 1810 }
602adf40 1811
4156d998
AE
1812 names_size = le64_to_cpu(ondisk->snap_names_len);
1813 want_count = snap_count;
1814 snap_count = le32_to_cpu(ondisk->snap_count);
1815 } while (snap_count != want_count);
00f1f36f 1816
4156d998 1817 return ondisk;
00f1f36f 1818
4156d998
AE
1819out_err:
1820 kfree(ondisk);
1821
1822 return ERR_PTR(ret);
1823}
1824
1825/*
1826 * reload the ondisk the header
1827 */
1828static int rbd_read_header(struct rbd_device *rbd_dev,
1829 struct rbd_image_header *header)
1830{
1831 struct rbd_image_header_ondisk *ondisk;
1832 u64 ver = 0;
1833 int ret;
602adf40 1834
4156d998
AE
1835 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1836 if (IS_ERR(ondisk))
1837 return PTR_ERR(ondisk);
1838 ret = rbd_header_from_disk(header, ondisk);
1839 if (ret >= 0)
1840 header->obj_version = ver;
1841 kfree(ondisk);
1842
1843 return ret;
602adf40
YS
1844}
1845
41f38c2b 1846static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1847{
1848 struct rbd_snap *snap;
a0593290 1849 struct rbd_snap *next;
dfc5606d 1850
a0593290 1851 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1852 rbd_remove_snap_dev(snap);
dfc5606d
YS
1853}
1854
9478554a
AE
1855static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1856{
1857 sector_t size;
1858
0d7dbfce 1859 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
1860 return;
1861
1862 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1863 dout("setting size to %llu sectors", (unsigned long long) size);
1864 rbd_dev->mapping.size = (u64) size;
1865 set_capacity(rbd_dev->disk, size);
1866}
1867
602adf40
YS
1868/*
1869 * only read the first part of the ondisk header, without the snaps info
1870 */
117973fb 1871static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1872{
1873 int ret;
1874 struct rbd_image_header h;
602adf40
YS
1875
1876 ret = rbd_read_header(rbd_dev, &h);
1877 if (ret < 0)
1878 return ret;
1879
a51aa0c0
JD
1880 down_write(&rbd_dev->header_rwsem);
1881
9478554a
AE
1882 /* Update image size, and check for resize of mapped image */
1883 rbd_dev->header.image_size = h.image_size;
1884 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1885
849b4260 1886 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1887 kfree(rbd_dev->header.snap_sizes);
849b4260 1888 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1889 /* osd requests may still refer to snapc */
1890 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1891
b813623a
AE
1892 if (hver)
1893 *hver = h.obj_version;
a71b891b 1894 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1895 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1896 rbd_dev->header.snapc = h.snapc;
1897 rbd_dev->header.snap_names = h.snap_names;
1898 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1899 /* Free the extra copy of the object prefix */
1900 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1901 kfree(h.object_prefix);
1902
304f6808
AE
1903 ret = rbd_dev_snaps_update(rbd_dev);
1904 if (!ret)
1905 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1906
c666601a 1907 up_write(&rbd_dev->header_rwsem);
602adf40 1908
dfc5606d 1909 return ret;
602adf40
YS
1910}
1911
117973fb 1912static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1913{
1914 int ret;
1915
117973fb 1916 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1917 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1918 if (rbd_dev->image_format == 1)
1919 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1920 else
1921 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1922 mutex_unlock(&ctl_mutex);
1923
1924 return ret;
1925}
1926
602adf40
YS
1927static int rbd_init_disk(struct rbd_device *rbd_dev)
1928{
1929 struct gendisk *disk;
1930 struct request_queue *q;
593a9e7b 1931 u64 segment_size;
602adf40 1932
602adf40 1933 /* create gendisk info */
602adf40
YS
1934 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1935 if (!disk)
1fcdb8aa 1936 return -ENOMEM;
602adf40 1937
f0f8cef5 1938 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1939 rbd_dev->dev_id);
602adf40
YS
1940 disk->major = rbd_dev->major;
1941 disk->first_minor = 0;
1942 disk->fops = &rbd_bd_ops;
1943 disk->private_data = rbd_dev;
1944
1945 /* init rq */
602adf40
YS
1946 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1947 if (!q)
1948 goto out_disk;
029bcbd8 1949
593a9e7b
AE
1950 /* We use the default size, but let's be explicit about it. */
1951 blk_queue_physical_block_size(q, SECTOR_SIZE);
1952
029bcbd8 1953 /* set io sizes to object size */
593a9e7b
AE
1954 segment_size = rbd_obj_bytes(&rbd_dev->header);
1955 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1956 blk_queue_max_segment_size(q, segment_size);
1957 blk_queue_io_min(q, segment_size);
1958 blk_queue_io_opt(q, segment_size);
029bcbd8 1959
602adf40
YS
1960 blk_queue_merge_bvec(q, rbd_merge_bvec);
1961 disk->queue = q;
1962
1963 q->queuedata = rbd_dev;
1964
1965 rbd_dev->disk = disk;
602adf40 1966
12f02944
AE
1967 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1968
602adf40 1969 return 0;
602adf40
YS
1970out_disk:
1971 put_disk(disk);
1fcdb8aa
AE
1972
1973 return -ENOMEM;
602adf40
YS
1974}
1975
dfc5606d
YS
1976/*
1977 sysfs
1978*/
1979
593a9e7b
AE
1980static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1981{
1982 return container_of(dev, struct rbd_device, dev);
1983}
1984
dfc5606d
YS
1985static ssize_t rbd_size_show(struct device *dev,
1986 struct device_attribute *attr, char *buf)
1987{
593a9e7b 1988 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1989 sector_t size;
1990
1991 down_read(&rbd_dev->header_rwsem);
1992 size = get_capacity(rbd_dev->disk);
1993 up_read(&rbd_dev->header_rwsem);
dfc5606d 1994
a51aa0c0 1995 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1996}
1997
34b13184
AE
1998/*
1999 * Note this shows the features for whatever's mapped, which is not
2000 * necessarily the base image.
2001 */
2002static ssize_t rbd_features_show(struct device *dev,
2003 struct device_attribute *attr, char *buf)
2004{
2005 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2006
2007 return sprintf(buf, "0x%016llx\n",
2008 (unsigned long long) rbd_dev->mapping.features);
2009}
2010
dfc5606d
YS
2011static ssize_t rbd_major_show(struct device *dev,
2012 struct device_attribute *attr, char *buf)
2013{
593a9e7b 2014 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 2015
dfc5606d
YS
2016 return sprintf(buf, "%d\n", rbd_dev->major);
2017}
2018
2019static ssize_t rbd_client_id_show(struct device *dev,
2020 struct device_attribute *attr, char *buf)
602adf40 2021{
593a9e7b 2022 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2023
1dbb4399
AE
2024 return sprintf(buf, "client%lld\n",
2025 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
2026}
2027
dfc5606d
YS
2028static ssize_t rbd_pool_show(struct device *dev,
2029 struct device_attribute *attr, char *buf)
602adf40 2030{
593a9e7b 2031 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2032
0d7dbfce 2033 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
2034}
2035
9bb2f334
AE
2036static ssize_t rbd_pool_id_show(struct device *dev,
2037 struct device_attribute *attr, char *buf)
2038{
2039 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2040
0d7dbfce
AE
2041 return sprintf(buf, "%llu\n",
2042 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2043}
2044
dfc5606d
YS
2045static ssize_t rbd_name_show(struct device *dev,
2046 struct device_attribute *attr, char *buf)
2047{
593a9e7b 2048 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2049
a92ffdf8
AE
2050 if (rbd_dev->spec->image_name)
2051 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2052
2053 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2054}
2055
589d30e0
AE
2056static ssize_t rbd_image_id_show(struct device *dev,
2057 struct device_attribute *attr, char *buf)
2058{
2059 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2060
0d7dbfce 2061 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2062}
2063
34b13184
AE
2064/*
2065 * Shows the name of the currently-mapped snapshot (or
2066 * RBD_SNAP_HEAD_NAME for the base image).
2067 */
dfc5606d
YS
2068static ssize_t rbd_snap_show(struct device *dev,
2069 struct device_attribute *attr,
2070 char *buf)
2071{
593a9e7b 2072 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2073
0d7dbfce 2074 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2075}
2076
86b00e0d
AE
2077/*
2078 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2079 * for the parent image. If there is no parent, simply shows
2080 * "(no parent image)".
2081 */
2082static ssize_t rbd_parent_show(struct device *dev,
2083 struct device_attribute *attr,
2084 char *buf)
2085{
2086 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2087 struct rbd_spec *spec = rbd_dev->parent_spec;
2088 int count;
2089 char *bufp = buf;
2090
2091 if (!spec)
2092 return sprintf(buf, "(no parent image)\n");
2093
2094 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2095 (unsigned long long) spec->pool_id, spec->pool_name);
2096 if (count < 0)
2097 return count;
2098 bufp += count;
2099
2100 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2101 spec->image_name ? spec->image_name : "(unknown)");
2102 if (count < 0)
2103 return count;
2104 bufp += count;
2105
2106 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2107 (unsigned long long) spec->snap_id, spec->snap_name);
2108 if (count < 0)
2109 return count;
2110 bufp += count;
2111
2112 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2113 if (count < 0)
2114 return count;
2115 bufp += count;
2116
2117 return (ssize_t) (bufp - buf);
2118}
2119
dfc5606d
YS
2120static ssize_t rbd_image_refresh(struct device *dev,
2121 struct device_attribute *attr,
2122 const char *buf,
2123 size_t size)
2124{
593a9e7b 2125 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2126 int ret;
602adf40 2127
117973fb 2128 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2129
2130 return ret < 0 ? ret : size;
dfc5606d 2131}
602adf40 2132
dfc5606d 2133static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2134static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2135static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2136static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2137static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2138static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2139static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2140static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2141static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2142static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2143static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2144
2145static struct attribute *rbd_attrs[] = {
2146 &dev_attr_size.attr,
34b13184 2147 &dev_attr_features.attr,
dfc5606d
YS
2148 &dev_attr_major.attr,
2149 &dev_attr_client_id.attr,
2150 &dev_attr_pool.attr,
9bb2f334 2151 &dev_attr_pool_id.attr,
dfc5606d 2152 &dev_attr_name.attr,
589d30e0 2153 &dev_attr_image_id.attr,
dfc5606d 2154 &dev_attr_current_snap.attr,
86b00e0d 2155 &dev_attr_parent.attr,
dfc5606d 2156 &dev_attr_refresh.attr,
dfc5606d
YS
2157 NULL
2158};
2159
2160static struct attribute_group rbd_attr_group = {
2161 .attrs = rbd_attrs,
2162};
2163
2164static const struct attribute_group *rbd_attr_groups[] = {
2165 &rbd_attr_group,
2166 NULL
2167};
2168
2169static void rbd_sysfs_dev_release(struct device *dev)
2170{
2171}
2172
2173static struct device_type rbd_device_type = {
2174 .name = "rbd",
2175 .groups = rbd_attr_groups,
2176 .release = rbd_sysfs_dev_release,
2177};
2178
2179
2180/*
2181 sysfs - snapshots
2182*/
2183
2184static ssize_t rbd_snap_size_show(struct device *dev,
2185 struct device_attribute *attr,
2186 char *buf)
2187{
2188 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2189
3591538f 2190 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2191}
2192
2193static ssize_t rbd_snap_id_show(struct device *dev,
2194 struct device_attribute *attr,
2195 char *buf)
2196{
2197 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2198
3591538f 2199 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2200}
2201
34b13184
AE
2202static ssize_t rbd_snap_features_show(struct device *dev,
2203 struct device_attribute *attr,
2204 char *buf)
2205{
2206 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2207
2208 return sprintf(buf, "0x%016llx\n",
2209 (unsigned long long) snap->features);
2210}
2211
dfc5606d
YS
/* Per-snapshot sysfs attributes (children of the rbd device) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device-core release callback: frees the rbd_snap and its name. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2243
8b8fb99c
AE
2244static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2245{
2246 kref_get(&spec->kref);
2247
2248 return spec;
2249}
2250
2251static void rbd_spec_free(struct kref *kref);
2252static void rbd_spec_put(struct rbd_spec *spec)
2253{
2254 if (spec)
2255 kref_put(&spec->kref, rbd_spec_free);
2256}
2257
2258static struct rbd_spec *rbd_spec_alloc(void)
2259{
2260 struct rbd_spec *spec;
2261
2262 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2263 if (!spec)
2264 return NULL;
2265 kref_init(&spec->kref);
2266
2267 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2268
2269 return spec;
2270}
2271
2272static void rbd_spec_free(struct kref *kref)
2273{
2274 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2275
2276 kfree(spec->pool_name);
2277 kfree(spec->image_id);
2278 kfree(spec->image_name);
2279 kfree(spec->snap_name);
2280 kfree(spec);
2281}
2282
c53d5893
AE
/*
 * Allocate and initialize a new rbd_device.
 *
 * On success the new device holds the caller's rbdc and spec
 * pointers; rbd_dev_destroy() later drops both references.
 * Returns NULL on allocation failure.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	atomic_set(&rbd_dev->exists, 0);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	/* Stripe unit and object size are both one full object */
	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	/* pool id must fit in 32 bits here; callers check this elsewhere */
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2310
/*
 * Tear down an rbd_device created by rbd_dev_create(): drop the
 * parent and own spec references, the client reference, and free
 * the header object name and the device itself.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2319
304f6808
AE
/*
 * Returns true if this snapshot's device has been registered.
 * Uses the device type as the indicator (it is only assigned in
 * rbd_register_snap_dev()), and asserts that this agrees with
 * what the device core itself reports.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	/* !ret ^ reg is true only when ret and reg agree */
	rbd_assert(!ret ^ reg);

	return ret;
}
2329
41f38c2b 2330static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2331{
2332 list_del(&snap->node);
304f6808
AE
2333 if (device_is_registered(&snap->dev))
2334 device_unregister(&snap->dev);
dfc5606d
YS
2335}
2336
14e7085d 2337static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2338 struct device *parent)
2339{
2340 struct device *dev = &snap->dev;
2341 int ret;
2342
2343 dev->type = &rbd_snap_device_type;
2344 dev->parent = parent;
2345 dev->release = rbd_snap_dev_release;
d4b125e9 2346 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2347 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2348
dfc5606d
YS
2349 ret = device_register(dev);
2350
2351 return ret;
2352}
2353
4e891e0a 2354static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2355 const char *snap_name,
34b13184
AE
2356 u64 snap_id, u64 snap_size,
2357 u64 snap_features)
dfc5606d 2358{
4e891e0a 2359 struct rbd_snap *snap;
dfc5606d 2360 int ret;
4e891e0a
AE
2361
2362 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2363 if (!snap)
4e891e0a
AE
2364 return ERR_PTR(-ENOMEM);
2365
2366 ret = -ENOMEM;
c8d18425 2367 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2368 if (!snap->name)
2369 goto err;
2370
c8d18425
AE
2371 snap->id = snap_id;
2372 snap->size = snap_size;
34b13184 2373 snap->features = snap_features;
4e891e0a
AE
2374
2375 return snap;
2376
dfc5606d
YS
2377err:
2378 kfree(snap->name);
2379 kfree(snap);
4e891e0a
AE
2380
2381 return ERR_PTR(ret);
dfc5606d
YS
2382}
2383
cd892126
AE
2384static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2385 u64 *snap_size, u64 *snap_features)
2386{
2387 char *snap_name;
2388
2389 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2390
2391 *snap_size = rbd_dev->header.snap_sizes[which];
2392 *snap_features = 0; /* No features for v1 */
2393
2394 /* Skip over names until we find the one we are looking for */
2395
2396 snap_name = rbd_dev->header.snap_names;
2397 while (which--)
2398 snap_name += strlen(snap_name) + 1;
2399
2400 return snap_name;
2401}
2402
9d475de5
AE
2403/*
2404 * Get the size and object order for an image snapshot, or if
2405 * snap_id is CEPH_NOSNAP, gets this information for the base
2406 * image.
2407 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* On-wire reply layout of the "rbd" class "get_size" method */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	/* Synchronously invoke "get_size" on the image's header object */
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2435
2436static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2437{
2438 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2439 &rbd_dev->header.obj_order,
2440 &rbd_dev->header.image_size);
2441}
2442
1e130199
AE
/*
 * Fetch the data-object name prefix for a format 2 image via the
 * "get_object_prefix" class method and store it in the header.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	p = reply_buf;
	/* Duplicates the encoded string; header owns the result */
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2479
b1b5402a
AE
/*
 * Fetch the feature bits for the given snapshot (or the base image
 * for CEPH_NOSNAP) via the "get_features" class method.  Returns
 * -ENXIO if the image uses incompatible features we don't support.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* On-wire reply layout of the "get_features" method */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image needing features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2513
2514static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2515{
2516 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2517 &rbd_dev->header.features);
2518}
2519
86b00e0d
AE
/*
 * Query the "get_parent" class method to find out whether this v2
 * image is layered.  On success, fills in rbd_dev->parent_spec
 * (pool/image/snap ids only, no names) and parent_overlap; a pool
 * id of CEPH_NOPOOL means the image has no parent, which is not
 * an error.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case encoded reply size */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	/* Decode failures below land in out_err with -ERANGE */
	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op when ownership transferred */

	return ret;
}
2588
9e15b77d
AE
/*
 * Look up the image name for this device's image id by asking the
 * pool's RBD_DIRECTORY object ("dir_get_name" class method).
 * Returns a newly-allocated name (caller frees), or NULL if the
 * lookup fails for any reason — callers treat that as tolerable.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the length-prefixed encoding of the image id */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure is non-fatal */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2637
2638/*
2639 * When a parent image gets probed, we only have the pool, image,
2640 * and snapshot ids but not the names of any of them. This call
2641 * is made later to fill in those names. It has to be done after
2642 * rbd_dev_snaps_update() has completed because some of the
2643 * information (in particular, snapshot name) is not available
2644 * until then.
2645 */
2646static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2647{
2648 struct ceph_osd_client *osdc;
2649 const char *name;
2650 void *reply_buf = NULL;
2651 int ret;
2652
2653 if (rbd_dev->spec->pool_name)
2654 return 0; /* Already have the names */
2655
2656 /* Look up the pool name */
2657
2658 osdc = &rbd_dev->rbd_client->client->osdc;
2659 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
935dc89f
AE
2660 if (!name) {
2661 rbd_warn(rbd_dev, "there is no pool with id %llu",
2662 rbd_dev->spec->pool_id); /* Really a BUG() */
2663 return -EIO;
2664 }
9e15b77d
AE
2665
2666 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2667 if (!rbd_dev->spec->pool_name)
2668 return -ENOMEM;
2669
2670 /* Fetch the image name; tolerate failure here */
2671
2672 name = rbd_dev_image_name(rbd_dev);
69e7a02f 2673 if (name)
9e15b77d 2674 rbd_dev->spec->image_name = (char *) name;
69e7a02f 2675 else
06ecc6cb 2676 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d
AE
2677
2678 /* Look up the snapshot name. */
2679
2680 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2681 if (!name) {
935dc89f
AE
2682 rbd_warn(rbd_dev, "no snapshot with id %llu",
2683 rbd_dev->spec->snap_id); /* Really a BUG() */
9e15b77d
AE
2684 ret = -EIO;
2685 goto out_err;
2686 }
2687 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2688 if(!rbd_dev->spec->snap_name)
2689 goto out_err;
2690
2691 return 0;
2692out_err:
2693 kfree(reply_buf);
2694 kfree(rbd_dev->spec->pool_name);
2695 rbd_dev->spec->pool_name = NULL;
2696
2697 return ret;
2698}
2699
6e14b1a6 2700static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2701{
2702 size_t size;
2703 int ret;
2704 void *reply_buf;
2705 void *p;
2706 void *end;
2707 u64 seq;
2708 u32 snap_count;
2709 struct ceph_snap_context *snapc;
2710 u32 i;
2711
2712 /*
2713 * We'll need room for the seq value (maximum snapshot id),
2714 * snapshot count, and array of that many snapshot ids.
2715 * For now we have a fixed upper limit on the number we're
2716 * prepared to receive.
2717 */
2718 size = sizeof (__le64) + sizeof (__le32) +
2719 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2720 reply_buf = kzalloc(size, GFP_KERNEL);
2721 if (!reply_buf)
2722 return -ENOMEM;
2723
2724 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2725 "rbd", "get_snapcontext",
2726 NULL, 0,
07b2391f 2727 reply_buf, size, ver);
35d489f9
AE
2728 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2729 if (ret < 0)
2730 goto out;
2731
2732 ret = -ERANGE;
2733 p = reply_buf;
2734 end = (char *) reply_buf + size;
2735 ceph_decode_64_safe(&p, end, seq, out);
2736 ceph_decode_32_safe(&p, end, snap_count, out);
2737
2738 /*
2739 * Make sure the reported number of snapshot ids wouldn't go
2740 * beyond the end of our buffer. But before checking that,
2741 * make sure the computed size of the snapshot context we
2742 * allocate is representable in a size_t.
2743 */
2744 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2745 / sizeof (u64)) {
2746 ret = -EINVAL;
2747 goto out;
2748 }
2749 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2750 goto out;
2751
2752 size = sizeof (struct ceph_snap_context) +
2753 snap_count * sizeof (snapc->snaps[0]);
2754 snapc = kmalloc(size, GFP_KERNEL);
2755 if (!snapc) {
2756 ret = -ENOMEM;
2757 goto out;
2758 }
2759
2760 atomic_set(&snapc->nref, 1);
2761 snapc->seq = seq;
2762 snapc->num_snaps = snap_count;
2763 for (i = 0; i < snap_count; i++)
2764 snapc->snaps[i] = ceph_decode_64(&p);
2765
2766 rbd_dev->header.snapc = snapc;
2767
2768 dout(" snap context seq = %llu, snap_count = %u\n",
2769 (unsigned long long) seq, (unsigned int) snap_count);
2770
2771out:
2772 kfree(reply_buf);
2773
2774 return 0;
2775}
2776
b8b1e2db
AE
/*
 * Fetch the name of snapshot "which" (index into the snapshot
 * context) via the "get_snapshot_name" class method.  Returns a
 * newly-allocated name (caller frees) or an ERR_PTR().
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	/* Duplicates the string; ownership passes to the caller */
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2819
2820static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2821 u64 *snap_size, u64 *snap_features)
2822{
2823 __le64 snap_id;
2824 u8 order;
2825 int ret;
2826
2827 snap_id = rbd_dev->header.snapc->snaps[which];
2828 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2829 if (ret)
2830 return ERR_PTR(ret);
2831 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2832 if (ret)
2833 return ERR_PTR(ret);
2834
2835 return rbd_dev_v2_snap_name(rbd_dev, which);
2836}
2837
2838static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2839 u64 *snap_size, u64 *snap_features)
2840{
2841 if (rbd_dev->image_format == 1)
2842 return rbd_dev_v1_snap_info(rbd_dev, which,
2843 snap_size, snap_features);
2844 if (rbd_dev->image_format == 2)
2845 return rbd_dev_v2_snap_info(rbd_dev, which,
2846 snap_size, snap_features);
2847 return ERR_PTR(-EINVAL);
2848}
2849
117973fb
AE
2850static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2851{
2852 int ret;
2853 __u8 obj_order;
2854
2855 down_write(&rbd_dev->header_rwsem);
2856
2857 /* Grab old order first, to see if it changes */
2858
2859 obj_order = rbd_dev->header.obj_order,
2860 ret = rbd_dev_v2_image_size(rbd_dev);
2861 if (ret)
2862 goto out;
2863 if (rbd_dev->header.obj_order != obj_order) {
2864 ret = -EIO;
2865 goto out;
2866 }
2867 rbd_update_mapping_size(rbd_dev);
2868
2869 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2870 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2871 if (ret)
2872 goto out;
2873 ret = rbd_dev_snaps_update(rbd_dev);
2874 dout("rbd_dev_snaps_update returned %d\n", ret);
2875 if (ret)
2876 goto out;
2877 ret = rbd_dev_snaps_register(rbd_dev);
2878 dout("rbd_dev_snaps_register returned %d\n", ret);
2879out:
2880 up_write(&rbd_dev->header_rwsem);
2881
2882 return ret;
2883}
2884
dfc5606d 2885/*
35938150
AE
2886 * Scan the rbd device's current snapshot list and compare it to the
2887 * newly-received snapshot context. Remove any existing snapshots
2888 * not present in the new snapshot context. Add a new snapshot for
2889 * any snaphots in the snapshot context not in the current list.
2890 * And verify there are no changes to snapshots we already know
2891 * about.
2892 *
2893 * Assumes the snapshots in the snapshot context are sorted by
2894 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2895 * are also maintained in that order.)
dfc5606d 2896 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;		/* cursor into the new snap context */

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/* Walk both sorted sequences (context and list) in lockstep */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it was the mapped snapshot, it no longer exists */
			if (rbd_dev->spec->snap_id == snap->id)
				atomic_set(&rbd_dev->exists, 0);
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout("  failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout("  added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout("  already present\n");

			/* Existing metadata must match the context's */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2989
304f6808
AE
2990/*
2991 * Scan the list of snapshots and register the devices for any that
2992 * have not already been registered.
2993 */
2994static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2995{
2996 struct rbd_snap *snap;
2997 int ret = 0;
2998
2999 dout("%s called\n", __func__);
86ff77bb
AE
3000 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3001 return -EIO;
304f6808
AE
3002
3003 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3004 if (!rbd_snap_registered(snap)) {
3005 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3006 if (ret < 0)
3007 break;
3008 }
3009 }
3010 dout("%s: returning %d\n", __func__, ret);
3011
3012 return ret;
3013}
3014
dfc5606d
YS
/*
 * Register the rbd device on the rbd bus, named after its dev id.
 * NOTE(review): the _nested lock class suggests a caller may
 * already hold ctl_mutex on this path — confirm against callers.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
3034
dfc5606d
YS
3035static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3036{
3037 device_unregister(&rbd_dev->dev);
3038}
3039
59c2be1e
YS
3040static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
3041{
3042 int ret, rc;
3043
3044 do {
907703d0 3045 ret = rbd_req_sync_watch(rbd_dev, 1);
59c2be1e 3046 if (ret == -ERANGE) {
117973fb 3047 rc = rbd_dev_refresh(rbd_dev, NULL);
59c2be1e
YS
3048 if (rc < 0)
3049 return rc;
3050 }
3051 } while (ret == -ERANGE);
3052
3053 return ret;
3054}
3055
e2839308 3056static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3057
3058/*
499afd5b
AE
3059 * Get a unique rbd identifier for the given new rbd_dev, and add
3060 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 3061 */
e2839308 3062static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 3063{
e2839308 3064 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
3065
3066 spin_lock(&rbd_dev_list_lock);
3067 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3068 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
3069 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3070 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 3071}
b7f23c36 3072
1ddbe94e 3073/*
499afd5b
AE
3074 * Remove an rbd_dev from the global list, and record that its
3075 * identifier is no longer in use.
1ddbe94e 3076 */
e2839308 3077static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 3078{
d184f6bf 3079 struct list_head *tmp;
de71a297 3080 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
3081 int max_id;
3082
aafb230e 3083 rbd_assert(rbd_id > 0);
499afd5b 3084
e2839308
AE
3085 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3086 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
3087 spin_lock(&rbd_dev_list_lock);
3088 list_del_init(&rbd_dev->node);
d184f6bf
AE
3089
3090 /*
3091 * If the id being "put" is not the current maximum, there
3092 * is nothing special we need to do.
3093 */
e2839308 3094 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
3095 spin_unlock(&rbd_dev_list_lock);
3096 return;
3097 }
3098
3099 /*
3100 * We need to update the current maximum id. Search the
3101 * list to find out what it is. We're more likely to find
3102 * the maximum at the end, so search the list backward.
3103 */
3104 max_id = 0;
3105 list_for_each_prev(tmp, &rbd_dev_list) {
3106 struct rbd_device *rbd_dev;
3107
3108 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
3109 if (rbd_dev->dev_id > max_id)
3110 max_id = rbd_dev->dev_id;
d184f6bf 3111 }
499afd5b 3112 spin_unlock(&rbd_dev_list_lock);
b7f23c36 3113
1ddbe94e 3114 /*
e2839308 3115 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
3116 * which case it now accurately reflects the new maximum.
3117 * Be careful not to overwrite the maximum value in that
3118 * case.
1ddbe94e 3119 */
e2839308
AE
3120 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3121 dout(" max dev id has been reset\n");
b7f23c36
AE
3122}
3123
e28fff26
AE
3124/*
3125 * Skips over white space at *buf, and updates *buf to point to the
3126 * first found non-space character (if any). Returns the length of
593a9e7b
AE
3127 * the token (string of non-white space characters) found. Note
3128 * that *buf must be terminated with '\0'.
e28fff26
AE
3129 */
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters isspace() flags in the "C" and "POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
3142
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy only when the token (plus its terminator) fits. */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token regardless */

	return len;
}
3172
ea3352f4
AE
3173/*
3174 * Finds the next token in *buf, dynamically allocates a buffer big
3175 * enough to hold a copy of it, and copies the token into the new
3176 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3177 * that a duplicate buffer is created even for a zero-length token.
3178 *
3179 * Returns a pointer to the newly-allocated duplicate, or a null
3180 * pointer if memory for the duplicate was not available. If
3181 * the lenp argument is a non-null pointer, the length of the token
3182 * (not including the '\0') is returned in *lenp.
3183 *
3184 * If successful, the *buf pointer will be updated to point beyond
3185 * the end of the found token.
3186 *
3187 * Note: uses GFP_KERNEL for allocation.
3188 */
3189static inline char *dup_token(const char **buf, size_t *lenp)
3190{
3191 char *dup;
3192 size_t len;
3193
3194 len = next_token(buf);
4caf35f9 3195 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
3196 if (!dup)
3197 return NULL;
ea3352f4
AE
3198 *(dup + len) = '\0';
3199 *buf += len;
3200
3201 if (lenp)
3202 *lenp = len;
3203
3204 return dup;
3205}
3206
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* Not duplicated; ceph_parse_options() gets start and end pointers */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Duplicate one extra byte for the '\0' terminator added below. */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success:  transfer ownership of all three results to the caller. */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3350
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the '\0' terminator; strlen() does not. */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3426
/*
 * Probe a format 1 (original) rbd image:  record its header object
 * name and read the on-disk header into rbd_dev->header.  On error,
 * any fields set here are freed and reset to NULL.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	/* sizeof () includes the '\0' terminator for the result. */
	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3475
/*
 * Probe a format 2 rbd image:  record its header object name, then
 * fill in the image header from the individual v2 metadata objects
 * (size/order, object prefix, features, parent info, and snapshot
 * context).  On error, everything set here is freed and reset.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3548
/*
 * Complete device setup after the image header has been probed:
 * update snapshots, the spec, and the mapping; allocate a device
 * id; then register the blkdev, disk, sysfs entries, and watch.
 * The error paths unwind in reverse order of setup.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	/* register_blkdev() with major 0 returns a dynamically assigned major */
	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3630
a30b71b9
AE
3631/*
3632 * Probe for the existence of the header object for the given rbd
3633 * device. For format 2 images this includes determining the image
3634 * id.
3635 */
3636static int rbd_dev_probe(struct rbd_device *rbd_dev)
3637{
3638 int ret;
3639
3640 /*
3641 * Get the id from the image id object. If it's not a
3642 * format 2 image, we'll get ENOENT back, and we'll assume
3643 * it's a format 1 image.
3644 */
3645 ret = rbd_dev_image_id(rbd_dev);
3646 if (ret)
3647 ret = rbd_dev_v1_probe(rbd_dev);
3648 else
3649 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 3650 if (ret) {
a30b71b9
AE
3651 dout("probe failed, returning %d\n", ret);
3652
83a06263
AE
3653 return ret;
3654 }
3655
3656 ret = rbd_dev_probe_finish(rbd_dev);
3657 if (ret)
3658 rbd_header_free(&rbd_dev->header);
3659
a30b71b9
AE
3660 return ret;
3661}
3662
/*
 * Handle a write to /sys/bus/rbd/add:  parse the mapping arguments,
 * set up a ceph client and rbd_device for the requested image, and
 * probe/map it.  Returns the byte count consumed on success.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	/* NULL these so the error paths below don't drop refs twice. */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3735
de71a297 3736static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3737{
3738 struct list_head *tmp;
3739 struct rbd_device *rbd_dev;
3740
e124a82f 3741 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3742 list_for_each(tmp, &rbd_dev_list) {
3743 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3744 if (rbd_dev->dev_id == dev_id) {
e124a82f 3745 spin_unlock(&rbd_dev_list_lock);
602adf40 3746 return rbd_dev;
e124a82f 3747 }
602adf40 3748 }
e124a82f 3749 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3750 return NULL;
3751}
3752
/*
 * Release callback for an rbd device's struct device, invoked by the
 * driver core once the last reference is dropped:  tear down the
 * watch, free the disk and header, and return the device id.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before unregistering the watch. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_watch(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3781
dfc5606d
YS
3782static ssize_t rbd_remove(struct bus_type *bus,
3783 const char *buf,
3784 size_t count)
602adf40
YS
3785{
3786 struct rbd_device *rbd_dev = NULL;
3787 int target_id, rc;
3788 unsigned long ul;
3789 int ret = count;
3790
3791 rc = strict_strtoul(buf, 10, &ul);
3792 if (rc)
3793 return rc;
3794
3795 /* convert to int; abort if we lost anything in the conversion */
3796 target_id = (int) ul;
3797 if (target_id != ul)
3798 return -EINVAL;
3799
3800 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3801
3802 rbd_dev = __rbd_get_dev(target_id);
3803 if (!rbd_dev) {
3804 ret = -ENOENT;
3805 goto done;
42382b70
AE
3806 }
3807
3808 if (rbd_dev->open_count) {
3809 ret = -EBUSY;
3810 goto done;
602adf40
YS
3811 }
3812
41f38c2b 3813 rbd_remove_all_snaps(rbd_dev);
dfc5606d 3814 rbd_bus_del_dev(rbd_dev);
602adf40
YS
3815
3816done:
3817 mutex_unlock(&ctl_mutex);
aafb230e 3818
602adf40
YS
3819 return ret;
3820}
3821
602adf40
YS
3822/*
3823 * create control files in sysfs
dfc5606d 3824 * /sys/bus/rbd/...
602adf40
YS
3825 */
3826static int rbd_sysfs_init(void)
3827{
dfc5606d 3828 int ret;
602adf40 3829
fed4c143 3830 ret = device_register(&rbd_root_dev);
21079786 3831 if (ret < 0)
dfc5606d 3832 return ret;
602adf40 3833
fed4c143
AE
3834 ret = bus_register(&rbd_bus_type);
3835 if (ret < 0)
3836 device_unregister(&rbd_root_dev);
602adf40 3837
602adf40
YS
3838 return ret;
3839}
3840
/*
 * Undo rbd_sysfs_init():  unregister the bus type before the root
 * device it hangs off of (reverse order of registration).
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3846
3847int __init rbd_init(void)
3848{
3849 int rc;
3850
3851 rc = rbd_sysfs_init();
3852 if (rc)
3853 return rc;
f0f8cef5 3854 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3855 return 0;
3856}
3857
/* Module exit:  remove the driver's sysfs entries. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3862
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");