]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - drivers/block/rbd.c
rbd: update feature bits
[mirror_ubuntu-bionic-kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
f0f8cef5
AE
55#define RBD_DRV_NAME "rbd"
56#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
57
58#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
59
d4b125e9
AE
60#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
61#define RBD_MAX_SNAP_NAME_LEN \
62 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
63
35d489f9 64#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
65
66#define RBD_SNAP_HEAD_NAME "-"
67
9e15b77d
AE
68/* This allows a single page to hold an image name sent by OSD */
69#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 70#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 71
1e130199 72#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 73
d889140c
AE
74/* Feature bits */
75
5cbf6f12
AE
76#define RBD_FEATURE_LAYERING (1<<0)
77#define RBD_FEATURE_STRIPINGV2 (1<<1)
78#define RBD_FEATURES_ALL \
79 (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
d889140c
AE
80
81/* Features supported by this (client software) implementation. */
82
5cbf6f12 83#define RBD_FEATURES_SUPPORTED (0)
d889140c 84
81a89793
AE
85/*
86 * An RBD device name will be "rbd#", where the "rbd" comes from
87 * RBD_DRV_NAME above, and # is a unique integer identifier.
88 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
89 * enough to hold all possible device names.
90 */
602adf40 91#define DEV_NAME_LEN 32
81a89793 92#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40
YS
93
94/*
95 * block device image metadata (in-memory version)
96 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix used to form data object names */
	u64 features;		/* RBD_FEATURE_* mask (always 0 for v1 images) */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;	/* copied from the on-disk header */
	__u8 comp_type;		/* copied from the on-disk header */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids + sequence */
	char *snap_names;	/* snapshot name data, one entry per snapshot */
	u64 *snap_sizes;	/* per-snapshot image sizes, in bytes */

	u64 obj_version;	/* presumably the header object's version — verify */
};
113
0d7dbfce
AE
114/*
115 * An rbd image specification.
116 *
117 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
c66c6e0c
AE
118 * identify an image. Each rbd_dev structure includes a pointer to
119 * an rbd_spec structure that encapsulates this identity.
120 *
121 * Each of the id's in an rbd_spec has an associated name. For a
122 * user-mapped image, the names are supplied and the id's associated
123 * with them are looked up. For a layered image, a parent image is
124 * defined by the tuple, and the names are looked up.
125 *
126 * An rbd_dev structure contains a parent_spec pointer which is
127 * non-null if the image it represents is a child in a layered
128 * image. This pointer will refer to the rbd_spec structure used
129 * by the parent rbd_dev for its own identity (i.e., the structure
130 * is shared between the parent and child).
131 *
132 * Since these structures are populated once, during the discovery
133 * phase of image construction, they are effectively immutable so
134 * we make no effort to synchronize access to them.
135 *
136 * Note that code herein does not assume the image name is known (it
137 * could be a null pointer).
0d7dbfce
AE
138 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;	/* may be NULL (see comment above) */

	u64		snap_id;	/* CEPH_NOSNAP when mapping the head */
	char		*snap_name;

	struct kref	kref;		/* spec may be shared by parent and child */
};
151
602adf40 152/*
f0f8cef5 153 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
154 */
struct rbd_client {
	struct ceph_client	*client;	/* underlying libceph client */
	struct kref		kref;		/* shared-use reference count */
	struct list_head	node;		/* entry in rbd_client_list */
};
160
bf0d5f50
AE
161struct rbd_img_request;
162typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
163
164#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
165
166struct rbd_obj_request;
167typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
168
9969ebc5
AE
169enum obj_request_type {
170 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
171};
bf0d5f50
AE
172
/*
 * A single request against one OSD object.  Normally linked into an
 * image request (img_request/links/which); its data is carried either
 * as a bio list or a page array, selected by @type.
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */

	struct rbd_img_request	*img_request;	/* owning image request */
	struct list_head	links;		/* img_request->obj_requests */
	u32			which;		/* posn image request list */

	enum obj_request_type	type;		/* selects union member below */
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};

	struct ceph_osd_request	*osd_req;	/* underlying OSD request */

	u64			xferred;	/* bytes transferred */
	u64			version;
	int			result;		/* completion status */
	atomic_t		done;		/* presumably set on completion — verify */

	rbd_obj_callback_t	callback;	/* invoked when request completes */
	struct completion	completion;	/* for synchronous waiters */

	struct kref		kref;
};
203
/*
 * A block-layer request translated into a list of object requests.
 * Writes carry a snapshot context; reads carry just a snapshot id.
 */
struct rbd_img_request {
	struct request		*rq;		/* originating block request */
	struct rbd_device	*rbd_dev;
	u64			offset;		/* starting image byte offset */
	u64			length;		/* byte count from offset */
	bool			write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64		snap_id;		/* for reads */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};
223
/*
 * Iterators over an image request's object request list.  The _safe
 * variant walks in reverse and tolerates removal of the current entry.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
bf0d5f50 230
dfc5606d
YS
/* In-core record of one snapshot of an image, plus its sysfs device */
struct rbd_snap {
	struct device	dev;		/* sysfs device for this snapshot */
	const char	*name;
	u64		size;		/* image size at this snapshot, bytes */
	struct list_head node;		/* entry in rbd_dev->snaps */
	u64		id;
	u64		features;	/* RBD_FEATURE_* mask */
};
239
/* Properties of the snapshot (or head) this device is mapped to */
struct rbd_mapping {
	u64	size;		/* mapped size, in bytes */
	u64	features;	/* RBD_FEATURE_* mask */
	bool	read_only;
};
245
602adf40
YS
246/*
247 * a single device
248 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;	/* ceph client (possibly shared) */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;		/* in-core image metadata */
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;		/* pool/image/snapshot identity */

	char			*header_name;	/* presumably the header object's name — verify */

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;	/* non-NULL for a layered child */
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;	/* mapped snapshot/head properties */

	struct list_head	node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
290
b82d167b
AE
291/*
292 * Flag bits for rbd_dev->flags. If atomicity is required,
293 * rbd_dev->lock is used to protect access.
294 *
295 * Currently, only the "removing" flag (which is coupled with the
296 * "open_count" field) requires atomic access.
297 */
6d292906
AE
/* Bit numbers within rbd_dev->flags (see comment above) */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
302
602adf40 303static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 304
602adf40 305static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
306static DEFINE_SPINLOCK(rbd_dev_list_lock);
307
432b8587
AE
308static LIST_HEAD(rbd_client_list); /* clients */
309static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 310
304f6808
AE
311static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
312static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
313
dfc5606d 314static void rbd_dev_release(struct device *dev);
41f38c2b 315static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 316
f0f8cef5
AE
317static ssize_t rbd_add(struct bus_type *bus, const char *buf,
318 size_t count);
319static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
320 size_t count);
321
/* sysfs bus files: writing /sys/bus/rbd/add and .../remove maps/unmaps */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release: rbd_root_dev is static, there is nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
341
06ecc6cb
AE
/*
 * Emit a KERN_WARNING message tagged with the most specific
 * identification available for @rbd_dev: disk name, then image name,
 * then image id, then the raw pointer.  A NULL @rbd_dev gives just
 * the driver name.  printf-style, checked via __printf(2, 3).
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
368
aafb230e
AE
#ifdef RBD_DEBUG
/*
 * Debug assertion: report the failed expression and BUG().
 * Wrapped in do { } while (0) so it expands to a single statement;
 * the previous bare-if form was a dangling-else hazard when used in
 * an unbraced if/else body (CERT PRE10-C).
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
					"at line %d:\n\n"		\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
# define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 381
117973fb
AE
382static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
383static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 384
602adf40
YS
/*
 * Open the block device.  Returns -EROFS for a write open of a
 * read-only mapping and -ENOENT if removal of the device has already
 * begun; otherwise bumps open_count and takes a device reference
 * (dropped in rbd_release()).
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	/* Check the removing flag and count the open under one lock
	 * hold so a concurrent remove can't slip in between. */
	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);	/* paired with rbd_release() */
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}
409
dfc5606d
YS
/*
 * Release the block device: drop our open count and the device
 * reference taken in rbd_open().  The pre-decrement value is sampled
 * so an underflow is caught by the assertion.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
426
602adf40
YS
/* Block device operations; rbd implements only open and release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
432
433/*
434 * Initialize an rbd client instance.
43ae4701 435 * We own *ceph_opts.
602adf40 436 */
f8c38929 437static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
438{
439 struct rbd_client *rbdc;
440 int ret = -ENOMEM;
441
37206ee5 442 dout("%s:\n", __func__);
602adf40
YS
443 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
444 if (!rbdc)
445 goto out_opt;
446
447 kref_init(&rbdc->kref);
448 INIT_LIST_HEAD(&rbdc->node);
449
bc534d86
AE
450 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
451
43ae4701 452 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 453 if (IS_ERR(rbdc->client))
bc534d86 454 goto out_mutex;
43ae4701 455 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
456
457 ret = ceph_open_session(rbdc->client);
458 if (ret < 0)
459 goto out_err;
460
432b8587 461 spin_lock(&rbd_client_list_lock);
602adf40 462 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 463 spin_unlock(&rbd_client_list_lock);
602adf40 464
bc534d86 465 mutex_unlock(&ctl_mutex);
37206ee5 466 dout("%s: rbdc %p\n", __func__, rbdc);
bc534d86 467
602adf40
YS
468 return rbdc;
469
470out_err:
471 ceph_destroy_client(rbdc->client);
bc534d86
AE
472out_mutex:
473 mutex_unlock(&ctl_mutex);
602adf40
YS
474 kfree(rbdc);
475out_opt:
43ae4701
AE
476 if (ceph_opts)
477 ceph_destroy_options(ceph_opts);
37206ee5
AE
478 dout("%s: error %d\n", __func__, ret);
479
28f259b7 480 return ERR_PTR(ret);
602adf40
YS
481}
482
483/*
1f7ba331
AE
484 * Find a ceph client with specific addr and configuration. If
485 * found, bump its reference count.
602adf40 486 */
1f7ba331 487static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
488{
489 struct rbd_client *client_node;
1f7ba331 490 bool found = false;
602adf40 491
43ae4701 492 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
493 return NULL;
494
1f7ba331
AE
495 spin_lock(&rbd_client_list_lock);
496 list_for_each_entry(client_node, &rbd_client_list, node) {
497 if (!ceph_compare_options(ceph_opts, client_node->client)) {
498 kref_get(&client_node->kref);
499 found = true;
500 break;
501 }
502 }
503 spin_unlock(&rbd_client_list_lock);
504
505 return found ? client_node : NULL;
602adf40
YS
506}
507
59c2be1e
YS
/*
 * mount options
 *
 * Tokens below Opt_last_int take an integer argument, tokens between
 * Opt_last_int and Opt_last_string take a string, and tokens between
 * Opt_last_string and Opt_last_bool are Boolean flags (this ordering
 * is relied upon by parse_rbd_opts_token()).  There are currently no
 * int or string options.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* rbd-specific (non-libceph) options parsed from a map request */
struct rbd_options {
	bool	read_only;	/* map the image read-only */
};

#define RBD_READ_ONLY_DEFAULT	false
538
59c2be1e
YS
/*
 * Parse a single rbd-specific option.  @c is one comma-separated
 * option string; @private is the struct rbd_options to fill in.
 * Returns 0 on success, -EINVAL for an unrecognized token, or the
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The range the token falls in determines its argument type;
	 * see the enum above rbd_opts_tokens. */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);	/* token table and switch out of sync */
		break;
	}
	return 0;
}
579
602adf40
YS
580/*
581 * Get a ceph client with specific addr and configuration, if one does
582 * not exist create it.
583 */
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Takes ownership of @ceph_opts either way.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* Reusing an existing client; the options are redundant */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
596
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must NOT already hold it.  (An earlier comment here said the
 * opposite, which would deadlock.)  Invoked only via kref_put() when
 * the last reference is dropped.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
614
615/*
616 * Drop reference to ceph client node. If it's not referenced anymore, release
617 * it.
618 */
9d3997fd 619static void rbd_put_client(struct rbd_client *rbdc)
602adf40 620{
c53d5893
AE
621 if (rbdc)
622 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
623}
624
a30b71b9
AE
625static bool rbd_image_format_valid(u32 image_format)
626{
627 return image_format == 1 || image_format == 2;
628}
629
8e94af8e
AE
/*
 * Sanity-check an on-disk (format 1) image header before trusting it:
 * magic text, an object order within usable bounds, and snapshot
 * count/name lengths that can't overflow later size_t computations.
 * Returns true if the header looks usable.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
668
602adf40
YS
669/*
670 * Create a new header structure, translate header format from the on-disk
671 * header.
672 */
673static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 674 struct rbd_image_header_ondisk *ondisk)
602adf40 675{
ccece235 676 u32 snap_count;
58c17b0e 677 size_t len;
d2bb24e5 678 size_t size;
621901d6 679 u32 i;
602adf40 680
6a52325f
AE
681 memset(header, 0, sizeof (*header));
682
103a150f
AE
683 snap_count = le32_to_cpu(ondisk->snap_count);
684
58c17b0e
AE
685 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
686 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 687 if (!header->object_prefix)
602adf40 688 return -ENOMEM;
58c17b0e
AE
689 memcpy(header->object_prefix, ondisk->object_prefix, len);
690 header->object_prefix[len] = '\0';
00f1f36f 691
602adf40 692 if (snap_count) {
f785cc1d
AE
693 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
694
621901d6
AE
695 /* Save a copy of the snapshot names */
696
f785cc1d
AE
697 if (snap_names_len > (u64) SIZE_MAX)
698 return -EIO;
699 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 700 if (!header->snap_names)
6a52325f 701 goto out_err;
f785cc1d
AE
702 /*
703 * Note that rbd_dev_v1_header_read() guarantees
704 * the ondisk buffer we're working with has
705 * snap_names_len bytes beyond the end of the
706 * snapshot id array, this memcpy() is safe.
707 */
708 memcpy(header->snap_names, &ondisk->snaps[snap_count],
709 snap_names_len);
6a52325f 710
621901d6
AE
711 /* Record each snapshot's size */
712
d2bb24e5
AE
713 size = snap_count * sizeof (*header->snap_sizes);
714 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 715 if (!header->snap_sizes)
6a52325f 716 goto out_err;
621901d6
AE
717 for (i = 0; i < snap_count; i++)
718 header->snap_sizes[i] =
719 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 720 } else {
ccece235 721 WARN_ON(ondisk->snap_names_len);
602adf40
YS
722 header->snap_names = NULL;
723 header->snap_sizes = NULL;
724 }
849b4260 725
34b13184 726 header->features = 0; /* No features support in v1 images */
602adf40
YS
727 header->obj_order = ondisk->options.order;
728 header->crypt_type = ondisk->options.crypt_type;
729 header->comp_type = ondisk->options.comp_type;
6a52325f 730
621901d6
AE
731 /* Allocate and fill in the snapshot context */
732
f84344f3 733 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
734 size = sizeof (struct ceph_snap_context);
735 size += snap_count * sizeof (header->snapc->snaps[0]);
736 header->snapc = kzalloc(size, GFP_KERNEL);
737 if (!header->snapc)
738 goto out_err;
602adf40
YS
739
740 atomic_set(&header->snapc->nref, 1);
505cbb9b 741 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 742 header->snapc->num_snaps = snap_count;
621901d6
AE
743 for (i = 0; i < snap_count; i++)
744 header->snapc->snaps[i] =
745 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
746
747 return 0;
748
6a52325f 749out_err:
849b4260 750 kfree(header->snap_sizes);
ccece235 751 header->snap_sizes = NULL;
602adf40 752 kfree(header->snap_names);
ccece235 753 header->snap_names = NULL;
6a52325f
AE
754 kfree(header->object_prefix);
755 header->object_prefix = NULL;
ccece235 756
00f1f36f 757 return -ENOMEM;
602adf40
YS
758}
759
9e15b77d
AE
760static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
761{
762 struct rbd_snap *snap;
763
764 if (snap_id == CEPH_NOSNAP)
765 return RBD_SNAP_HEAD_NAME;
766
767 list_for_each_entry(snap, &rbd_dev->snaps, node)
768 if (snap_id == snap->id)
769 return snap->name;
770
771 return NULL;
772}
773
8836b995 774static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 775{
602adf40 776
e86924a8 777 struct rbd_snap *snap;
602adf40 778
e86924a8
AE
779 list_for_each_entry(snap, &rbd_dev->snaps, node) {
780 if (!strcmp(snap_name, snap->name)) {
0d7dbfce 781 rbd_dev->spec->snap_id = snap->id;
e86924a8 782 rbd_dev->mapping.size = snap->size;
34b13184 783 rbd_dev->mapping.features = snap->features;
602adf40 784
e86924a8 785 return 0;
00f1f36f 786 }
00f1f36f 787 }
e86924a8 788
00f1f36f 789 return -ENOENT;
602adf40
YS
790}
791
819d52bf 792static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 793{
78dc447d 794 int ret;
602adf40 795
0d7dbfce 796 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 797 sizeof (RBD_SNAP_HEAD_NAME))) {
0d7dbfce 798 rbd_dev->spec->snap_id = CEPH_NOSNAP;
99c1f08f 799 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 800 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 801 ret = 0;
602adf40 802 } else {
0d7dbfce 803 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
602adf40
YS
804 if (ret < 0)
805 goto done;
f84344f3 806 rbd_dev->mapping.read_only = true;
602adf40 807 }
6d292906
AE
808 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
809
602adf40 810done:
602adf40
YS
811 return ret;
812}
813
/*
 * Release everything an in-core image header points to and NULL the
 * pointers so the header is safe to free or refill afterwards.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);	/* refcounted, not kfree'd */
	header->snapc = NULL;
}
825
/*
 * Return the name of the object backing the segment that contains
 * image byte @offset, formed as "<object_prefix>.%012llx".  Returns
 * NULL on allocation or formatting failure.  The caller is
 * responsible for freeing the returned name.
 */
static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		/* Truncation means the name would be wrong — fail instead */
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
602adf40 847
65ccfe21
AE
848static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
849{
850 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 851
65ccfe21
AE
852 return offset & (segment_size - 1);
853}
854
855static u64 rbd_segment_length(struct rbd_device *rbd_dev,
856 u64 offset, u64 length)
857{
858 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
859
860 offset &= segment_size - 1;
861
aafb230e 862 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
863 if (offset + length > segment_size)
864 length = segment_size - offset;
865
866 return length;
602adf40
YS
867}
868
029bcbd8
JD
869/*
870 * returns the size of an object in the image
871 */
872static u64 rbd_obj_bytes(struct rbd_image_header *header)
873{
874 return 1 << header->obj_order;
875}
876
602adf40
YS
877/*
878 * bio helpers
879 */
880
881static void bio_chain_put(struct bio *chain)
882{
883 struct bio *tmp;
884
885 while (chain) {
886 tmp = chain;
887 chain = chain->bi_next;
888 bio_put(tmp);
889 }
890}
891
892/*
893 * zeros a bio chain, starting at specific offset
894 */
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain, tracking the running
 * byte position @pos; in each segment that extends past @start_ofs,
 * the bytes from max(start_ofs, segment start) to the segment end are
 * cleared through a temporary atomic kmap.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero only the part at or past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
918
919/*
f7760dad
AE
920 * Clone a portion of a bio, starting at the given byte offset
921 * and continuing for the number of bytes indicated.
602adf40 922 */
f7760dad
AE
/*
 * Clone the byte range [@offset, @offset + @len) of @bio_src into a
 * new bio.  The full-bio case delegates to bio_clone(); otherwise the
 * affected bio_vec entries are copied and the first/last entries are
 * trimmed to the requested range.  Returns NULL on bad arguments or
 * allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;	/* byte offset of the range within segment idx */

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
		vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
999
1000/*
1001 * Clone a portion of a bio chain, starting at the given byte offset
1002 * into the first bio in the source chain and continuing for the
1003 * number of bytes indicated. The result is another bio chain of
1004 * exactly the given length, or a null pointer on error.
1005 *
1006 * The bio_src and offset parameters are both in-out. On entry they
1007 * refer to the first source bio and the offset into that bio where
1008 * the start of data to be cloned is located.
1009 *
1010 * On return, bio_src is updated to refer to the bio in the source
1011 * chain that contains first un-cloned byte, and *offset will
1012 * contain the offset of that byte within that bio.
1013 */
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Ran out of source bios before covering len */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone at most the remainder of the current source bio */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
1062
bf0d5f50
AE
/* Take a reference on an object request. */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}
1069
static void rbd_obj_request_destroy(struct kref *kref);
/* Drop a reference on an object request; frees it when the count hits 0. */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1078
/* Take a reference on an image request. */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}
1085
static void rbd_img_request_destroy(struct kref *kref);
/* Drop a reference on an image request; frees it when the count hits 0. */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1094
/*
 * Attach an object request to the tail of an image request's list.
 * Takes its own reference on the object request; the object's
 * "which" index records its position within the image request.
 * Caller must not have already added this object request elsewhere.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}
1109
/*
 * Detach an object request from its image request, reversing
 * rbd_img_obj_request_add(): unlink it, clear its back-pointer and
 * index, and drop the reference the add took.  The assertion that
 * "which" equals the decremented count implies deletions happen in
 * reverse (LIFO) order of addition.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
1127
1128static bool obj_request_type_valid(enum obj_request_type type)
1129{
1130 switch (type) {
9969ebc5 1131 case OBJ_REQUEST_NODATA:
bf0d5f50 1132 case OBJ_REQUEST_BIO:
788e2df3 1133 case OBJ_REQUEST_PAGES:
bf0d5f50
AE
1134 return true;
1135 default:
1136 return false;
1137 }
1138}
1139
bf0d5f50
AE
/*
 * Hand an object request's underlying osd request to the osd client
 * for transmission.  Returns the osd client's start status; 0 on
 * success.  "nofail" is false, so the start may fail and the caller
 * must handle a nonzero return.
 */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1147
1148static void rbd_img_request_complete(struct rbd_img_request *img_request)
1149{
37206ee5 1150 dout("%s: img %p\n", __func__, img_request);
bf0d5f50
AE
1151 if (img_request->callback)
1152 img_request->callback(img_request);
1153 else
1154 rbd_img_request_put(img_request);
1155}
1156
788e2df3
AE
/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

/*
 * Block (interruptibly) until the object request's completion is
 * signaled.  Returns 0 on completion, -ERESTARTSYS if interrupted.
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	return wait_for_completion_interruptible(&obj_request->completion);
}
1165
07741308
AE
/*
 * Reset the "done" flag.  The write barrier pairs with the read
 * barrier in obj_request_done_test().
 */
static void obj_request_done_init(struct rbd_obj_request *obj_request)
{
	atomic_set(&obj_request->done, 0);
	smp_wmb();
}
1171
/*
 * Mark an object request done.  Uses atomic_inc_return() so a second
 * completion of the same request can be detected and warned about
 * rather than silently ignored.
 */
static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	int done;

	done = atomic_inc_return(&obj_request->done);
	if (done > 1) {
		struct rbd_img_request *img_request = obj_request->img_request;
		struct rbd_device *rbd_dev;

		rbd_dev = img_request ? img_request->rbd_dev : NULL;
		rbd_warn(rbd_dev, "obj_request %p was already done\n",
			obj_request);
	}
}
1186
/*
 * Report whether the object request has been marked done.  The read
 * barrier pairs with the write barrier in obj_request_done_init().
 */
static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return atomic_read(&obj_request->done) != 0;
}
1192
6e2a4505
AE
/*
 * Completion handling for a read that is part of an image request.
 * Only bio-backed requests are expected here (BUG otherwise).
 */
static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	/*
	 * ENOENT means a hole in the image.  We zero-fill the
	 * entire length of the request.  A short read also implies
	 * zero-fill to the end of the request.  Either way we
	 * update the xferred count to indicate the whole request
	 * was satisfied.
	 */
	BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
	if (obj_request->result == -ENOENT) {
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
		obj_request->xferred = obj_request->length;
	} else if (obj_request->xferred < obj_request->length &&
			!obj_request->result) {
		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
		obj_request->xferred = obj_request->length;
	}
	obj_request_done_set(obj_request);
}
1218
bf0d5f50
AE
/*
 * Finish an object request: invoke its registered callback if any,
 * otherwise wake anyone blocked in rbd_obj_request_wait().
 */
static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}
1228
c47f9371 1229static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
39bf2c5d
AE
1230{
1231 dout("%s: obj %p\n", __func__, obj_request);
1232 obj_request_done_set(obj_request);
1233}
1234
c47f9371 1235static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
bf0d5f50 1236{
37206ee5 1237 dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
c47f9371 1238 obj_request->result, obj_request->xferred, obj_request->length);
6e2a4505
AE
1239 if (obj_request->img_request)
1240 rbd_img_obj_request_read_callback(obj_request);
1241 else
1242 obj_request_done_set(obj_request);
bf0d5f50
AE
1243}
1244
c47f9371 1245static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
bf0d5f50 1246{
1b83bef2
SW
1247 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1248 obj_request->result, obj_request->length);
1249 /*
1250 * There is no such thing as a successful short write.
1251 * Our xferred value is the number of bytes transferred
1252 * back. Set it to our originally-requested length.
1253 */
1254 obj_request->xferred = obj_request->length;
07741308 1255 obj_request_done_set(obj_request);
bf0d5f50
AE
1256}
1257
fbfab539
AE
/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1267
bf0d5f50
AE
/*
 * Main completion callback invoked by the osd client when an osd
 * request finishes.  Records the result and bytes transferred on
 * the associated object request, dispatches to the per-opcode
 * handler, and completes the object request once the handler has
 * marked it done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
	rbd_assert(osd_req == obj_request->osd_req);
	/* Part of an image request iff "which" is a valid index */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	WARN_ON(osd_req->r_num_ops != 1);	/* For now */

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	/* The handler may defer completion; only complete if marked done */
	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}
1316
2fa12320 1317static void rbd_osd_req_format(struct rbd_obj_request *obj_request,
79528734 1318 bool write_request)
430c28c3
AE
1319{
1320 struct rbd_img_request *img_request = obj_request->img_request;
8c042b0d 1321 struct ceph_osd_request *osd_req = obj_request->osd_req;
430c28c3
AE
1322 struct ceph_snap_context *snapc = NULL;
1323 u64 snap_id = CEPH_NOSNAP;
1324 struct timespec *mtime = NULL;
1325 struct timespec now;
1326
8c042b0d 1327 rbd_assert(osd_req != NULL);
430c28c3
AE
1328
1329 if (write_request) {
1330 now = CURRENT_TIME;
1331 mtime = &now;
1332 if (img_request)
1333 snapc = img_request->snapc;
2fa12320
AE
1334 } else if (img_request) {
1335 snap_id = img_request->snap_id;
8c042b0d
AE
1336 }
1337 ceph_osdc_build_request(osd_req, obj_request->offset,
79528734 1338 snapc, snap_id, mtime);
430c28c3
AE
1339}
1340
bf0d5f50
AE
/*
 * Allocate and initialize a single-op osd request on behalf of an
 * object request.  Sets read/write flags, the completion callback,
 * the target object name, and the file layout.  Returns NULL on
 * allocation failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	if (write_request)
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	else
		osd_req->r_flags = CEPH_OSD_FLAG_READ;

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	return osd_req;
}
1380
/* Release the reference rbd_osd_req_create() took on the osd request. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1385
/* object_name is assumed to be a non-null pointer and NUL-terminated */

/*
 * Allocate and initialize an object request covering [offset, length)
 * of the named rados object.  The object name is copied into the
 * same allocation, immediately after the struct, so a single kfree()
 * (via rbd_obj_request_destroy) releases everything.  Returns NULL
 * on allocation failure.
 */
static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
	if (!obj_request)
		return NULL;

	/* Name storage lives just past the struct itself */
	name = (char *)(obj_request + 1);
	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->which = BAD_WHICH;
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	obj_request_done_init(obj_request);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
		offset, length, (int)type, obj_request);

	return obj_request;
}
1419
/*
 * kref release function for object requests.  The request must have
 * been detached from any image request first.  Releases the osd
 * request and any attached data (bio chain or page vector) before
 * freeing the request itself (name storage included, since it is
 * part of the same allocation).
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}
1451
/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		/* Snapshot context is sampled under the header lock */
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->write_request = write_request;
	/* Writes carry a snap context; reads carry the mapped snap id */
	if (write_request)
		img_request->snapc = snapc;
	else
		img_request->snap_id = rbd_dev->spec->snap_id;
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		write_request ? "write" : "read", offset, length,
		img_request);

	return img_request;
}
1504
/*
 * kref release function for image requests.  Detaches (and thereby
 * drops) every attached object request, releases the snapshot
 * context reference for writes, and frees the request.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request->write_request)
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
1524
2169238d
AE
/*
 * Per-object completion callback for object requests that belong to
 * an image request.  Object requests can finish out of order, but
 * the block request must be completed in order; this only advances
 * next_completion (and blk_end_request) once the contiguous run of
 * done requests reaches the one that just completed.  When the block
 * request is fully ended, the image request itself is completed.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;	/* An earlier request hasn't finished yet */

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		/* more == false once the whole block request is ended */
		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}

	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1575
bf0d5f50
AE
1576static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1577 struct bio *bio_list)
1578{
1579 struct rbd_device *rbd_dev = img_request->rbd_dev;
1580 struct rbd_obj_request *obj_request = NULL;
1581 struct rbd_obj_request *next_obj_request;
430c28c3 1582 bool write_request = img_request->write_request;
bf0d5f50
AE
1583 unsigned int bio_offset;
1584 u64 image_offset;
1585 u64 resid;
1586 u16 opcode;
1587
37206ee5
AE
1588 dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
1589
430c28c3 1590 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
bf0d5f50
AE
1591 bio_offset = 0;
1592 image_offset = img_request->offset;
1593 rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1594 resid = img_request->length;
4dda41d3 1595 rbd_assert(resid > 0);
bf0d5f50 1596 while (resid) {
2fa12320 1597 struct ceph_osd_request *osd_req;
bf0d5f50
AE
1598 const char *object_name;
1599 unsigned int clone_size;
bf0d5f50
AE
1600 u64 offset;
1601 u64 length;
1602
1603 object_name = rbd_segment_name(rbd_dev, image_offset);
1604 if (!object_name)
1605 goto out_unwind;
1606 offset = rbd_segment_offset(rbd_dev, image_offset);
1607 length = rbd_segment_length(rbd_dev, image_offset, resid);
1608 obj_request = rbd_obj_request_create(object_name,
1609 offset, length,
1610 OBJ_REQUEST_BIO);
1611 kfree(object_name); /* object request has its own copy */
1612 if (!obj_request)
1613 goto out_unwind;
1614
1615 rbd_assert(length <= (u64) UINT_MAX);
1616 clone_size = (unsigned int) length;
1617 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1618 &bio_offset, clone_size,
1619 GFP_ATOMIC);
1620 if (!obj_request->bio_list)
1621 goto out_partial;
1622
2fa12320
AE
1623 osd_req = rbd_osd_req_create(rbd_dev, write_request,
1624 obj_request);
1625 if (!osd_req)
bf0d5f50 1626 goto out_partial;
2fa12320 1627 obj_request->osd_req = osd_req;
2169238d 1628 obj_request->callback = rbd_img_obj_callback;
430c28c3 1629
2fa12320
AE
1630 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
1631 0, 0);
a4ce40a9
AE
1632 osd_req_op_extent_osd_data_bio(osd_req, 0, write_request,
1633 obj_request->bio_list, obj_request->length);
2fa12320 1634 rbd_osd_req_format(obj_request, write_request);
430c28c3 1635
bf0d5f50
AE
1636 rbd_img_obj_request_add(img_request, obj_request);
1637
1638 image_offset += length;
1639 resid -= length;
1640 }
1641
1642 return 0;
1643
1644out_partial:
1645 rbd_obj_request_put(obj_request);
1646out_unwind:
1647 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1648 rbd_obj_request_put(obj_request);
1649
1650 return -ENOMEM;
1651}
1652
bf0d5f50
AE
/*
 * Submit every object request attached to an image request to the
 * osd client.  Returns 0 on success or the first submission error.
 * On error, already-submitted requests remain in flight; their
 * references are dropped as they complete.
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	dout("%s: img %p\n", __func__, img_request);
	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
		int ret;

		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1677
cf81b60e 1678static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
b8d70035
AE
1679 u64 ver, u64 notify_id)
1680{
1681 struct rbd_obj_request *obj_request;
2169238d 1682 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
b8d70035
AE
1683 int ret;
1684
1685 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1686 OBJ_REQUEST_NODATA);
1687 if (!obj_request)
1688 return -ENOMEM;
1689
1690 ret = -ENOMEM;
430c28c3 1691 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
b8d70035
AE
1692 if (!obj_request->osd_req)
1693 goto out;
2169238d 1694 obj_request->callback = rbd_obj_request_put;
b8d70035 1695
c99d2d4a
AE
1696 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
1697 notify_id, ver, 0);
2fa12320 1698 rbd_osd_req_format(obj_request, false);
430c28c3 1699
b8d70035 1700 ret = rbd_obj_request_submit(osdc, obj_request);
b8d70035 1701out:
cf81b60e
AE
1702 if (ret)
1703 rbd_obj_request_put(obj_request);
b8d70035
AE
1704
1705 return ret;
1706}
1707
1708static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1709{
1710 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1711 u64 hver;
1712 int rc;
1713
1714 if (!rbd_dev)
1715 return;
1716
37206ee5 1717 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
b8d70035
AE
1718 rbd_dev->header_name, (unsigned long long) notify_id,
1719 (unsigned int) opcode);
1720 rc = rbd_dev_refresh(rbd_dev, &hver);
1721 if (rc)
1722 rbd_warn(rbd_dev, "got notification but failed to "
1723 " update snaps: %d\n", rc);
1724
cf81b60e 1725 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
b8d70035
AE
1726}
1727
9969ebc5
AE
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	int ret;

	/* Starting requires no existing event/request; stopping requires both */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	rbd_osd_req_format(obj_request, true);

	/* Submit and wait synchronously for the osd's reply */
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it.  We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
1807
36be9a76
AE
/*
 * Synchronous osd object method call
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *version)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	page_count = (u32) calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	/* Pages now owned by the object request (freed with it) */
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;

	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
					class_name, method_name);
	if (outbound_size) {
		struct ceph_pagelist *pagelist;

		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
		if (!pagelist)
			goto out;

		ceph_pagelist_init(pagelist);
		/*
		 * NOTE(review): ceph_pagelist_append() can fail
		 * (allocation) but its return value is ignored here —
		 * worth confirming whether that should be checked.
		 */
		ceph_pagelist_append(pagelist, outbound, outbound_size);
		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
						pagelist);
	}
	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
					obj_request->pages, inbound_size,
					0, false, false);
	rbd_osd_req_format(obj_request, false);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	ret = 0;
	/* Copy the reply payload into the caller's inbound buffer */
	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
1893
bf0d5f50 1894static void rbd_request_fn(struct request_queue *q)
cc344fa1 1895 __releases(q->queue_lock) __acquires(q->queue_lock)
bf0d5f50
AE
1896{
1897 struct rbd_device *rbd_dev = q->queuedata;
1898 bool read_only = rbd_dev->mapping.read_only;
1899 struct request *rq;
1900 int result;
1901
1902 while ((rq = blk_fetch_request(q))) {
1903 bool write_request = rq_data_dir(rq) == WRITE;
1904 struct rbd_img_request *img_request;
1905 u64 offset;
1906 u64 length;
1907
1908 /* Ignore any non-FS requests that filter through. */
1909
1910 if (rq->cmd_type != REQ_TYPE_FS) {
4dda41d3
AE
1911 dout("%s: non-fs request type %d\n", __func__,
1912 (int) rq->cmd_type);
1913 __blk_end_request_all(rq, 0);
1914 continue;
1915 }
1916
1917 /* Ignore/skip any zero-length requests */
1918
1919 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
1920 length = (u64) blk_rq_bytes(rq);
1921
1922 if (!length) {
1923 dout("%s: zero-length request\n", __func__);
bf0d5f50
AE
1924 __blk_end_request_all(rq, 0);
1925 continue;
1926 }
1927
1928 spin_unlock_irq(q->queue_lock);
1929
1930 /* Disallow writes to a read-only device */
1931
1932 if (write_request) {
1933 result = -EROFS;
1934 if (read_only)
1935 goto end_request;
1936 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
1937 }
1938
6d292906
AE
1939 /*
1940 * Quit early if the mapped snapshot no longer
1941 * exists. It's still possible the snapshot will
1942 * have disappeared by the time our request arrives
1943 * at the osd, but there's no sense in sending it if
1944 * we already know.
1945 */
1946 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
bf0d5f50
AE
1947 dout("request for non-existent snapshot");
1948 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1949 result = -ENXIO;
1950 goto end_request;
1951 }
1952
bf0d5f50
AE
1953 result = -EINVAL;
1954 if (WARN_ON(offset && length > U64_MAX - offset + 1))
1955 goto end_request; /* Shouldn't happen */
1956
1957 result = -ENOMEM;
1958 img_request = rbd_img_request_create(rbd_dev, offset, length,
1959 write_request);
1960 if (!img_request)
1961 goto end_request;
1962
1963 img_request->rq = rq;
1964
1965 result = rbd_img_request_fill_bio(img_request, rq->bio);
1966 if (!result)
1967 result = rbd_img_request_submit(img_request);
1968 if (result)
1969 rbd_img_request_put(img_request);
1970end_request:
1971 spin_lock_irq(q->queue_lock);
1972 if (result < 0) {
1973 rbd_warn(rbd_dev, "obj_request %s result %d\n",
1974 write_request ? "write" : "read", result);
1975 __blk_end_request_all(rq, result);
1976 }
1977 }
1978}
1979
602adf40
YS
1980/*
1981 * a queue callback. Makes sure that we don't create a bio that spans across
1982 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1983 * which we handle later at bio_chain_clone_range()
602adf40
YS
1984 */
1985static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1986 struct bio_vec *bvec)
1987{
1988 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
1989 sector_t sector_offset;
1990 sector_t sectors_per_obj;
1991 sector_t obj_sector_offset;
1992 int ret;
1993
1994 /*
1995 * Find how far into its rbd object the partition-relative
1996 * bio start sector is to offset relative to the enclosing
1997 * device.
1998 */
1999 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2000 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2001 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2002
2003 /*
2004 * Compute the number of bytes from that offset to the end
2005 * of the object. Account for what's already used by the bio.
2006 */
2007 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2008 if (ret > bmd->bi_size)
2009 ret -= bmd->bi_size;
2010 else
2011 ret = 0;
2012
2013 /*
2014 * Don't send back more than was asked for. And if the bio
2015 * was empty, let the whole thing through because: "Note
2016 * that a block device *must* allow a single page to be
2017 * added to an empty bio."
2018 */
2019 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2020 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2021 ret = (int) bvec->bv_len;
2022
2023 return ret;
602adf40
YS
2024}
2025
2026static void rbd_free_disk(struct rbd_device *rbd_dev)
2027{
2028 struct gendisk *disk = rbd_dev->disk;
2029
2030 if (!disk)
2031 return;
2032
602adf40
YS
2033 if (disk->flags & GENHD_FL_UP)
2034 del_gendisk(disk);
2035 if (disk->queue)
2036 blk_cleanup_queue(disk->queue);
2037 put_disk(disk);
2038}
2039
788e2df3
AE
2040static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2041 const char *object_name,
2042 u64 offset, u64 length,
2043 char *buf, u64 *version)
2044
2045{
2169238d 2046 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
788e2df3 2047 struct rbd_obj_request *obj_request;
788e2df3
AE
2048 struct page **pages = NULL;
2049 u32 page_count;
1ceae7ef 2050 size_t size;
788e2df3
AE
2051 int ret;
2052
2053 page_count = (u32) calc_pages_for(offset, length);
2054 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2055 if (IS_ERR(pages))
2056 ret = PTR_ERR(pages);
2057
2058 ret = -ENOMEM;
2059 obj_request = rbd_obj_request_create(object_name, offset, length,
36be9a76 2060 OBJ_REQUEST_PAGES);
788e2df3
AE
2061 if (!obj_request)
2062 goto out;
2063
2064 obj_request->pages = pages;
2065 obj_request->page_count = page_count;
2066
430c28c3 2067 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
788e2df3
AE
2068 if (!obj_request->osd_req)
2069 goto out;
2070
c99d2d4a
AE
2071 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2072 offset, length, 0, 0);
a4ce40a9
AE
2073 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false,
2074 obj_request->pages,
44cd188d
AE
2075 obj_request->length,
2076 obj_request->offset & ~PAGE_MASK,
2077 false, false);
2fa12320 2078 rbd_osd_req_format(obj_request, false);
430c28c3 2079
788e2df3
AE
2080 ret = rbd_obj_request_submit(osdc, obj_request);
2081 if (ret)
2082 goto out;
2083 ret = rbd_obj_request_wait(obj_request);
2084 if (ret)
2085 goto out;
2086
2087 ret = obj_request->result;
2088 if (ret < 0)
2089 goto out;
1ceae7ef
AE
2090
2091 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2092 size = (size_t) obj_request->xferred;
903bb32e 2093 ceph_copy_from_page_vector(pages, buf, 0, size);
23ed6e13
AE
2094 rbd_assert(size <= (size_t) INT_MAX);
2095 ret = (int) size;
788e2df3
AE
2096 if (version)
2097 *version = obj_request->version;
2098out:
2099 if (obj_request)
2100 rbd_obj_request_put(obj_request);
2101 else
2102 ceph_release_page_vector(pages, page_count);
2103
2104 return ret;
2105}
2106
602adf40 2107/*
4156d998
AE
2108 * Read the complete header for the given rbd device.
2109 *
2110 * Returns a pointer to a dynamically-allocated buffer containing
2111 * the complete and validated header. Caller can pass the address
2112 * of a variable that will be filled in with the version of the
2113 * header object at the time it was read.
2114 *
2115 * Returns a pointer-coded errno if a failure occurs.
602adf40 2116 */
4156d998
AE
2117static struct rbd_image_header_ondisk *
2118rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 2119{
4156d998 2120 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 2121 u32 snap_count = 0;
4156d998
AE
2122 u64 names_size = 0;
2123 u32 want_count;
2124 int ret;
602adf40 2125
00f1f36f 2126 /*
4156d998
AE
2127 * The complete header will include an array of its 64-bit
2128 * snapshot ids, followed by the names of those snapshots as
2129 * a contiguous block of NUL-terminated strings. Note that
2130 * the number of snapshots could change by the time we read
2131 * it in, in which case we re-read it.
00f1f36f 2132 */
4156d998
AE
2133 do {
2134 size_t size;
2135
2136 kfree(ondisk);
2137
2138 size = sizeof (*ondisk);
2139 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2140 size += names_size;
2141 ondisk = kmalloc(size, GFP_KERNEL);
2142 if (!ondisk)
2143 return ERR_PTR(-ENOMEM);
2144
788e2df3 2145 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
4156d998
AE
2146 0, size,
2147 (char *) ondisk, version);
4156d998
AE
2148 if (ret < 0)
2149 goto out_err;
2150 if (WARN_ON((size_t) ret < size)) {
2151 ret = -ENXIO;
06ecc6cb
AE
2152 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2153 size, ret);
4156d998
AE
2154 goto out_err;
2155 }
2156 if (!rbd_dev_ondisk_valid(ondisk)) {
2157 ret = -ENXIO;
06ecc6cb 2158 rbd_warn(rbd_dev, "invalid header");
4156d998 2159 goto out_err;
81e759fb 2160 }
602adf40 2161
4156d998
AE
2162 names_size = le64_to_cpu(ondisk->snap_names_len);
2163 want_count = snap_count;
2164 snap_count = le32_to_cpu(ondisk->snap_count);
2165 } while (snap_count != want_count);
00f1f36f 2166
4156d998 2167 return ondisk;
00f1f36f 2168
4156d998
AE
2169out_err:
2170 kfree(ondisk);
2171
2172 return ERR_PTR(ret);
2173}
2174
2175/*
2176 * reload the ondisk the header
2177 */
2178static int rbd_read_header(struct rbd_device *rbd_dev,
2179 struct rbd_image_header *header)
2180{
2181 struct rbd_image_header_ondisk *ondisk;
2182 u64 ver = 0;
2183 int ret;
602adf40 2184
4156d998
AE
2185 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2186 if (IS_ERR(ondisk))
2187 return PTR_ERR(ondisk);
2188 ret = rbd_header_from_disk(header, ondisk);
2189 if (ret >= 0)
2190 header->obj_version = ver;
2191 kfree(ondisk);
2192
2193 return ret;
602adf40
YS
2194}
2195
41f38c2b 2196static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
2197{
2198 struct rbd_snap *snap;
a0593290 2199 struct rbd_snap *next;
dfc5606d 2200
a0593290 2201 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 2202 rbd_remove_snap_dev(snap);
dfc5606d
YS
2203}
2204
9478554a
AE
2205static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2206{
2207 sector_t size;
2208
0d7dbfce 2209 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
2210 return;
2211
2212 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2213 dout("setting size to %llu sectors", (unsigned long long) size);
2214 rbd_dev->mapping.size = (u64) size;
2215 set_capacity(rbd_dev->disk, size);
2216}
2217
602adf40
YS
2218/*
2219 * only read the first part of the ondisk header, without the snaps info
2220 */
117973fb 2221static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
2222{
2223 int ret;
2224 struct rbd_image_header h;
602adf40
YS
2225
2226 ret = rbd_read_header(rbd_dev, &h);
2227 if (ret < 0)
2228 return ret;
2229
a51aa0c0
JD
2230 down_write(&rbd_dev->header_rwsem);
2231
9478554a
AE
2232 /* Update image size, and check for resize of mapped image */
2233 rbd_dev->header.image_size = h.image_size;
2234 rbd_update_mapping_size(rbd_dev);
9db4b3e3 2235
849b4260 2236 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 2237 kfree(rbd_dev->header.snap_sizes);
849b4260 2238 kfree(rbd_dev->header.snap_names);
d1d25646
JD
2239 /* osd requests may still refer to snapc */
2240 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 2241
b813623a
AE
2242 if (hver)
2243 *hver = h.obj_version;
a71b891b 2244 rbd_dev->header.obj_version = h.obj_version;
93a24e08 2245 rbd_dev->header.image_size = h.image_size;
602adf40
YS
2246 rbd_dev->header.snapc = h.snapc;
2247 rbd_dev->header.snap_names = h.snap_names;
2248 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
2249 /* Free the extra copy of the object prefix */
2250 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2251 kfree(h.object_prefix);
2252
304f6808
AE
2253 ret = rbd_dev_snaps_update(rbd_dev);
2254 if (!ret)
2255 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 2256
c666601a 2257 up_write(&rbd_dev->header_rwsem);
602adf40 2258
dfc5606d 2259 return ret;
602adf40
YS
2260}
2261
117973fb 2262static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
2263{
2264 int ret;
2265
117973fb 2266 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 2267 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
2268 if (rbd_dev->image_format == 1)
2269 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2270 else
2271 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
2272 mutex_unlock(&ctl_mutex);
2273
2274 return ret;
2275}
2276
602adf40
YS
2277static int rbd_init_disk(struct rbd_device *rbd_dev)
2278{
2279 struct gendisk *disk;
2280 struct request_queue *q;
593a9e7b 2281 u64 segment_size;
602adf40 2282
602adf40 2283 /* create gendisk info */
602adf40
YS
2284 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2285 if (!disk)
1fcdb8aa 2286 return -ENOMEM;
602adf40 2287
f0f8cef5 2288 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 2289 rbd_dev->dev_id);
602adf40
YS
2290 disk->major = rbd_dev->major;
2291 disk->first_minor = 0;
2292 disk->fops = &rbd_bd_ops;
2293 disk->private_data = rbd_dev;
2294
bf0d5f50 2295 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
602adf40
YS
2296 if (!q)
2297 goto out_disk;
029bcbd8 2298
593a9e7b
AE
2299 /* We use the default size, but let's be explicit about it. */
2300 blk_queue_physical_block_size(q, SECTOR_SIZE);
2301
029bcbd8 2302 /* set io sizes to object size */
593a9e7b
AE
2303 segment_size = rbd_obj_bytes(&rbd_dev->header);
2304 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2305 blk_queue_max_segment_size(q, segment_size);
2306 blk_queue_io_min(q, segment_size);
2307 blk_queue_io_opt(q, segment_size);
029bcbd8 2308
602adf40
YS
2309 blk_queue_merge_bvec(q, rbd_merge_bvec);
2310 disk->queue = q;
2311
2312 q->queuedata = rbd_dev;
2313
2314 rbd_dev->disk = disk;
602adf40 2315
12f02944
AE
2316 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2317
602adf40 2318 return 0;
602adf40
YS
2319out_disk:
2320 put_disk(disk);
1fcdb8aa
AE
2321
2322 return -ENOMEM;
602adf40
YS
2323}
2324
dfc5606d
YS
2325/*
2326 sysfs
2327*/
2328
593a9e7b
AE
2329static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2330{
2331 return container_of(dev, struct rbd_device, dev);
2332}
2333
dfc5606d
YS
2334static ssize_t rbd_size_show(struct device *dev,
2335 struct device_attribute *attr, char *buf)
2336{
593a9e7b 2337 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
2338 sector_t size;
2339
2340 down_read(&rbd_dev->header_rwsem);
2341 size = get_capacity(rbd_dev->disk);
2342 up_read(&rbd_dev->header_rwsem);
dfc5606d 2343
a51aa0c0 2344 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
2345}
2346
34b13184
AE
2347/*
2348 * Note this shows the features for whatever's mapped, which is not
2349 * necessarily the base image.
2350 */
2351static ssize_t rbd_features_show(struct device *dev,
2352 struct device_attribute *attr, char *buf)
2353{
2354 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2355
2356 return sprintf(buf, "0x%016llx\n",
2357 (unsigned long long) rbd_dev->mapping.features);
2358}
2359
dfc5606d
YS
2360static ssize_t rbd_major_show(struct device *dev,
2361 struct device_attribute *attr, char *buf)
2362{
593a9e7b 2363 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 2364
dfc5606d
YS
2365 return sprintf(buf, "%d\n", rbd_dev->major);
2366}
2367
2368static ssize_t rbd_client_id_show(struct device *dev,
2369 struct device_attribute *attr, char *buf)
602adf40 2370{
593a9e7b 2371 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2372
1dbb4399
AE
2373 return sprintf(buf, "client%lld\n",
2374 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
2375}
2376
dfc5606d
YS
2377static ssize_t rbd_pool_show(struct device *dev,
2378 struct device_attribute *attr, char *buf)
602adf40 2379{
593a9e7b 2380 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2381
0d7dbfce 2382 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
2383}
2384
9bb2f334
AE
2385static ssize_t rbd_pool_id_show(struct device *dev,
2386 struct device_attribute *attr, char *buf)
2387{
2388 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2389
0d7dbfce
AE
2390 return sprintf(buf, "%llu\n",
2391 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2392}
2393
dfc5606d
YS
2394static ssize_t rbd_name_show(struct device *dev,
2395 struct device_attribute *attr, char *buf)
2396{
593a9e7b 2397 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2398
a92ffdf8
AE
2399 if (rbd_dev->spec->image_name)
2400 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2401
2402 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2403}
2404
589d30e0
AE
2405static ssize_t rbd_image_id_show(struct device *dev,
2406 struct device_attribute *attr, char *buf)
2407{
2408 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2409
0d7dbfce 2410 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2411}
2412
34b13184
AE
2413/*
2414 * Shows the name of the currently-mapped snapshot (or
2415 * RBD_SNAP_HEAD_NAME for the base image).
2416 */
dfc5606d
YS
2417static ssize_t rbd_snap_show(struct device *dev,
2418 struct device_attribute *attr,
2419 char *buf)
2420{
593a9e7b 2421 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2422
0d7dbfce 2423 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2424}
2425
86b00e0d
AE
2426/*
2427 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2428 * for the parent image. If there is no parent, simply shows
2429 * "(no parent image)".
2430 */
2431static ssize_t rbd_parent_show(struct device *dev,
2432 struct device_attribute *attr,
2433 char *buf)
2434{
2435 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2436 struct rbd_spec *spec = rbd_dev->parent_spec;
2437 int count;
2438 char *bufp = buf;
2439
2440 if (!spec)
2441 return sprintf(buf, "(no parent image)\n");
2442
2443 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2444 (unsigned long long) spec->pool_id, spec->pool_name);
2445 if (count < 0)
2446 return count;
2447 bufp += count;
2448
2449 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2450 spec->image_name ? spec->image_name : "(unknown)");
2451 if (count < 0)
2452 return count;
2453 bufp += count;
2454
2455 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2456 (unsigned long long) spec->snap_id, spec->snap_name);
2457 if (count < 0)
2458 return count;
2459 bufp += count;
2460
2461 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2462 if (count < 0)
2463 return count;
2464 bufp += count;
2465
2466 return (ssize_t) (bufp - buf);
2467}
2468
dfc5606d
YS
2469static ssize_t rbd_image_refresh(struct device *dev,
2470 struct device_attribute *attr,
2471 const char *buf,
2472 size_t size)
2473{
593a9e7b 2474 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2475 int ret;
602adf40 2476
117973fb 2477 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2478
2479 return ret < 0 ? ret : size;
dfc5606d 2480}
602adf40 2481
dfc5606d 2482static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2483static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2484static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2485static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2486static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2487static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2488static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2489static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2490static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2491static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2492static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2493
2494static struct attribute *rbd_attrs[] = {
2495 &dev_attr_size.attr,
34b13184 2496 &dev_attr_features.attr,
dfc5606d
YS
2497 &dev_attr_major.attr,
2498 &dev_attr_client_id.attr,
2499 &dev_attr_pool.attr,
9bb2f334 2500 &dev_attr_pool_id.attr,
dfc5606d 2501 &dev_attr_name.attr,
589d30e0 2502 &dev_attr_image_id.attr,
dfc5606d 2503 &dev_attr_current_snap.attr,
86b00e0d 2504 &dev_attr_parent.attr,
dfc5606d 2505 &dev_attr_refresh.attr,
dfc5606d
YS
2506 NULL
2507};
2508
2509static struct attribute_group rbd_attr_group = {
2510 .attrs = rbd_attrs,
2511};
2512
2513static const struct attribute_group *rbd_attr_groups[] = {
2514 &rbd_attr_group,
2515 NULL
2516};
2517
2518static void rbd_sysfs_dev_release(struct device *dev)
2519{
2520}
2521
2522static struct device_type rbd_device_type = {
2523 .name = "rbd",
2524 .groups = rbd_attr_groups,
2525 .release = rbd_sysfs_dev_release,
2526};
2527
2528
2529/*
2530 sysfs - snapshots
2531*/
2532
2533static ssize_t rbd_snap_size_show(struct device *dev,
2534 struct device_attribute *attr,
2535 char *buf)
2536{
2537 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2538
3591538f 2539 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2540}
2541
2542static ssize_t rbd_snap_id_show(struct device *dev,
2543 struct device_attribute *attr,
2544 char *buf)
2545{
2546 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2547
3591538f 2548 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2549}
2550
34b13184
AE
2551static ssize_t rbd_snap_features_show(struct device *dev,
2552 struct device_attribute *attr,
2553 char *buf)
2554{
2555 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2556
2557 return sprintf(buf, "0x%016llx\n",
2558 (unsigned long long) snap->features);
2559}
2560
dfc5606d
YS
2561static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2562static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2563static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2564
2565static struct attribute *rbd_snap_attrs[] = {
2566 &dev_attr_snap_size.attr,
2567 &dev_attr_snap_id.attr,
34b13184 2568 &dev_attr_snap_features.attr,
dfc5606d
YS
2569 NULL,
2570};
2571
2572static struct attribute_group rbd_snap_attr_group = {
2573 .attrs = rbd_snap_attrs,
2574};
2575
2576static void rbd_snap_dev_release(struct device *dev)
2577{
2578 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2579 kfree(snap->name);
2580 kfree(snap);
2581}
2582
2583static const struct attribute_group *rbd_snap_attr_groups[] = {
2584 &rbd_snap_attr_group,
2585 NULL
2586};
2587
2588static struct device_type rbd_snap_device_type = {
2589 .groups = rbd_snap_attr_groups,
2590 .release = rbd_snap_dev_release,
2591};
2592
8b8fb99c
AE
2593static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2594{
2595 kref_get(&spec->kref);
2596
2597 return spec;
2598}
2599
2600static void rbd_spec_free(struct kref *kref);
2601static void rbd_spec_put(struct rbd_spec *spec)
2602{
2603 if (spec)
2604 kref_put(&spec->kref, rbd_spec_free);
2605}
2606
2607static struct rbd_spec *rbd_spec_alloc(void)
2608{
2609 struct rbd_spec *spec;
2610
2611 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2612 if (!spec)
2613 return NULL;
2614 kref_init(&spec->kref);
2615
2616 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2617
2618 return spec;
2619}
2620
2621static void rbd_spec_free(struct kref *kref)
2622{
2623 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2624
2625 kfree(spec->pool_name);
2626 kfree(spec->image_id);
2627 kfree(spec->image_name);
2628 kfree(spec->snap_name);
2629 kfree(spec);
2630}
2631
cc344fa1 2632static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
c53d5893
AE
2633 struct rbd_spec *spec)
2634{
2635 struct rbd_device *rbd_dev;
2636
2637 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2638 if (!rbd_dev)
2639 return NULL;
2640
2641 spin_lock_init(&rbd_dev->lock);
6d292906 2642 rbd_dev->flags = 0;
c53d5893
AE
2643 INIT_LIST_HEAD(&rbd_dev->node);
2644 INIT_LIST_HEAD(&rbd_dev->snaps);
2645 init_rwsem(&rbd_dev->header_rwsem);
2646
2647 rbd_dev->spec = spec;
2648 rbd_dev->rbd_client = rbdc;
2649
0903e875
AE
2650 /* Initialize the layout used for all rbd requests */
2651
2652 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2653 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2654 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2655 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2656
c53d5893
AE
2657 return rbd_dev;
2658}
2659
2660static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2661{
86b00e0d 2662 rbd_spec_put(rbd_dev->parent_spec);
c53d5893
AE
2663 kfree(rbd_dev->header_name);
2664 rbd_put_client(rbd_dev->rbd_client);
2665 rbd_spec_put(rbd_dev->spec);
2666 kfree(rbd_dev);
2667}
2668
304f6808
AE
2669static bool rbd_snap_registered(struct rbd_snap *snap)
2670{
2671 bool ret = snap->dev.type == &rbd_snap_device_type;
2672 bool reg = device_is_registered(&snap->dev);
2673
2674 rbd_assert(!ret ^ reg);
2675
2676 return ret;
2677}
2678
41f38c2b 2679static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2680{
2681 list_del(&snap->node);
304f6808
AE
2682 if (device_is_registered(&snap->dev))
2683 device_unregister(&snap->dev);
dfc5606d
YS
2684}
2685
14e7085d 2686static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2687 struct device *parent)
2688{
2689 struct device *dev = &snap->dev;
2690 int ret;
2691
2692 dev->type = &rbd_snap_device_type;
2693 dev->parent = parent;
2694 dev->release = rbd_snap_dev_release;
d4b125e9 2695 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2696 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2697
dfc5606d
YS
2698 ret = device_register(dev);
2699
2700 return ret;
2701}
2702
4e891e0a 2703static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2704 const char *snap_name,
34b13184
AE
2705 u64 snap_id, u64 snap_size,
2706 u64 snap_features)
dfc5606d 2707{
4e891e0a 2708 struct rbd_snap *snap;
dfc5606d 2709 int ret;
4e891e0a
AE
2710
2711 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2712 if (!snap)
4e891e0a
AE
2713 return ERR_PTR(-ENOMEM);
2714
2715 ret = -ENOMEM;
c8d18425 2716 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2717 if (!snap->name)
2718 goto err;
2719
c8d18425
AE
2720 snap->id = snap_id;
2721 snap->size = snap_size;
34b13184 2722 snap->features = snap_features;
4e891e0a
AE
2723
2724 return snap;
2725
dfc5606d
YS
2726err:
2727 kfree(snap->name);
2728 kfree(snap);
4e891e0a
AE
2729
2730 return ERR_PTR(ret);
dfc5606d
YS
2731}
2732
cd892126
AE
2733static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2734 u64 *snap_size, u64 *snap_features)
2735{
2736 char *snap_name;
2737
2738 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2739
2740 *snap_size = rbd_dev->header.snap_sizes[which];
2741 *snap_features = 0; /* No features for v1 */
2742
2743 /* Skip over names until we find the one we are looking for */
2744
2745 snap_name = rbd_dev->header.snap_names;
2746 while (which--)
2747 snap_name += strlen(snap_name) + 1;
2748
2749 return snap_name;
2750}
2751
9d475de5
AE
2752/*
2753 * Get the size and object order for an image snapshot, or if
2754 * snap_id is CEPH_NOSNAP, gets this information for the base
2755 * image.
2756 */
2757static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2758 u8 *order, u64 *snap_size)
2759{
2760 __le64 snapid = cpu_to_le64(snap_id);
2761 int ret;
2762 struct {
2763 u8 order;
2764 __le64 size;
2765 } __attribute__ ((packed)) size_buf = { 0 };
2766
36be9a76 2767 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
9d475de5
AE
2768 "rbd", "get_size",
2769 (char *) &snapid, sizeof (snapid),
07b2391f 2770 (char *) &size_buf, sizeof (size_buf), NULL);
36be9a76 2771 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
9d475de5
AE
2772 if (ret < 0)
2773 return ret;
2774
2775 *order = size_buf.order;
2776 *snap_size = le64_to_cpu(size_buf.size);
2777
2778 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2779 (unsigned long long) snap_id, (unsigned int) *order,
2780 (unsigned long long) *snap_size);
2781
2782 return 0;
2783}
2784
2785static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2786{
2787 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2788 &rbd_dev->header.obj_order,
2789 &rbd_dev->header.image_size);
2790}
2791
1e130199
AE
2792static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2793{
2794 void *reply_buf;
2795 int ret;
2796 void *p;
2797
2798 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2799 if (!reply_buf)
2800 return -ENOMEM;
2801
36be9a76 2802 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
1e130199
AE
2803 "rbd", "get_object_prefix",
2804 NULL, 0,
07b2391f 2805 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
36be9a76 2806 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
1e130199
AE
2807 if (ret < 0)
2808 goto out;
2809
2810 p = reply_buf;
2811 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2812 p + RBD_OBJ_PREFIX_LEN_MAX,
2813 NULL, GFP_NOIO);
2814
2815 if (IS_ERR(rbd_dev->header.object_prefix)) {
2816 ret = PTR_ERR(rbd_dev->header.object_prefix);
2817 rbd_dev->header.object_prefix = NULL;
2818 } else {
2819 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2820 }
2821
2822out:
2823 kfree(reply_buf);
2824
2825 return ret;
2826}
2827
b1b5402a
AE
2828static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2829 u64 *snap_features)
2830{
2831 __le64 snapid = cpu_to_le64(snap_id);
2832 struct {
2833 __le64 features;
2834 __le64 incompat;
2835 } features_buf = { 0 };
d889140c 2836 u64 incompat;
b1b5402a
AE
2837 int ret;
2838
36be9a76 2839 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
b1b5402a
AE
2840 "rbd", "get_features",
2841 (char *) &snapid, sizeof (snapid),
2842 (char *) &features_buf, sizeof (features_buf),
07b2391f 2843 NULL);
36be9a76 2844 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
b1b5402a
AE
2845 if (ret < 0)
2846 return ret;
d889140c
AE
2847
2848 incompat = le64_to_cpu(features_buf.incompat);
5cbf6f12 2849 if (incompat & ~RBD_FEATURES_SUPPORTED)
b8f5c6ed 2850 return -ENXIO;
d889140c 2851
b1b5402a
AE
2852 *snap_features = le64_to_cpu(features_buf.features);
2853
2854 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2855 (unsigned long long) snap_id,
2856 (unsigned long long) *snap_features,
2857 (unsigned long long) le64_to_cpu(features_buf.incompat));
2858
2859 return 0;
2860}
2861
2862static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2863{
2864 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2865 &rbd_dev->header.features);
2866}
2867
86b00e0d
AE
2868static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2869{
2870 struct rbd_spec *parent_spec;
2871 size_t size;
2872 void *reply_buf = NULL;
2873 __le64 snapid;
2874 void *p;
2875 void *end;
2876 char *image_id;
2877 u64 overlap;
86b00e0d
AE
2878 int ret;
2879
2880 parent_spec = rbd_spec_alloc();
2881 if (!parent_spec)
2882 return -ENOMEM;
2883
2884 size = sizeof (__le64) + /* pool_id */
2885 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2886 sizeof (__le64) + /* snap_id */
2887 sizeof (__le64); /* overlap */
2888 reply_buf = kmalloc(size, GFP_KERNEL);
2889 if (!reply_buf) {
2890 ret = -ENOMEM;
2891 goto out_err;
2892 }
2893
2894 snapid = cpu_to_le64(CEPH_NOSNAP);
36be9a76 2895 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
86b00e0d
AE
2896 "rbd", "get_parent",
2897 (char *) &snapid, sizeof (snapid),
07b2391f 2898 (char *) reply_buf, size, NULL);
36be9a76 2899 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
86b00e0d
AE
2900 if (ret < 0)
2901 goto out_err;
2902
2903 ret = -ERANGE;
2904 p = reply_buf;
2905 end = (char *) reply_buf + size;
2906 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2907 if (parent_spec->pool_id == CEPH_NOPOOL)
2908 goto out; /* No parent? No problem. */
2909
0903e875
AE
2910 /* The ceph file layout needs to fit pool id in 32 bits */
2911
2912 ret = -EIO;
2913 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2914 goto out;
2915
979ed480 2916 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
86b00e0d
AE
2917 if (IS_ERR(image_id)) {
2918 ret = PTR_ERR(image_id);
2919 goto out_err;
2920 }
2921 parent_spec->image_id = image_id;
2922 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2923 ceph_decode_64_safe(&p, end, overlap, out_err);
2924
2925 rbd_dev->parent_overlap = overlap;
2926 rbd_dev->parent_spec = parent_spec;
2927 parent_spec = NULL; /* rbd_dev now owns this */
2928out:
2929 ret = 0;
2930out_err:
2931 kfree(reply_buf);
2932 rbd_spec_put(parent_spec);
2933
2934 return ret;
2935}
2936
9e15b77d
AE
2937static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2938{
2939 size_t image_id_size;
2940 char *image_id;
2941 void *p;
2942 void *end;
2943 size_t size;
2944 void *reply_buf = NULL;
2945 size_t len = 0;
2946 char *image_name = NULL;
2947 int ret;
2948
2949 rbd_assert(!rbd_dev->spec->image_name);
2950
69e7a02f
AE
2951 len = strlen(rbd_dev->spec->image_id);
2952 image_id_size = sizeof (__le32) + len;
9e15b77d
AE
2953 image_id = kmalloc(image_id_size, GFP_KERNEL);
2954 if (!image_id)
2955 return NULL;
2956
2957 p = image_id;
2958 end = (char *) image_id + image_id_size;
69e7a02f 2959 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
9e15b77d
AE
2960
2961 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2962 reply_buf = kmalloc(size, GFP_KERNEL);
2963 if (!reply_buf)
2964 goto out;
2965
36be9a76 2966 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
9e15b77d
AE
2967 "rbd", "dir_get_name",
2968 image_id, image_id_size,
07b2391f 2969 (char *) reply_buf, size, NULL);
9e15b77d
AE
2970 if (ret < 0)
2971 goto out;
2972 p = reply_buf;
2973 end = (char *) reply_buf + size;
2974 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2975 if (IS_ERR(image_name))
2976 image_name = NULL;
2977 else
2978 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2979out:
2980 kfree(reply_buf);
2981 kfree(image_id);
2982
2983 return image_name;
2984}
2985
2986/*
2987 * When a parent image gets probed, we only have the pool, image,
2988 * and snapshot ids but not the names of any of them. This call
2989 * is made later to fill in those names. It has to be done after
2990 * rbd_dev_snaps_update() has completed because some of the
2991 * information (in particular, snapshot name) is not available
2992 * until then.
2993 */
2994static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2995{
2996 struct ceph_osd_client *osdc;
2997 const char *name;
2998 void *reply_buf = NULL;
2999 int ret;
3000
3001 if (rbd_dev->spec->pool_name)
3002 return 0; /* Already have the names */
3003
3004 /* Look up the pool name */
3005
3006 osdc = &rbd_dev->rbd_client->client->osdc;
3007 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
935dc89f
AE
3008 if (!name) {
3009 rbd_warn(rbd_dev, "there is no pool with id %llu",
3010 rbd_dev->spec->pool_id); /* Really a BUG() */
3011 return -EIO;
3012 }
9e15b77d
AE
3013
3014 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3015 if (!rbd_dev->spec->pool_name)
3016 return -ENOMEM;
3017
3018 /* Fetch the image name; tolerate failure here */
3019
3020 name = rbd_dev_image_name(rbd_dev);
69e7a02f 3021 if (name)
9e15b77d 3022 rbd_dev->spec->image_name = (char *) name;
69e7a02f 3023 else
06ecc6cb 3024 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d
AE
3025
3026 /* Look up the snapshot name. */
3027
3028 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3029 if (!name) {
935dc89f
AE
3030 rbd_warn(rbd_dev, "no snapshot with id %llu",
3031 rbd_dev->spec->snap_id); /* Really a BUG() */
9e15b77d
AE
3032 ret = -EIO;
3033 goto out_err;
3034 }
3035 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3036 if(!rbd_dev->spec->snap_name)
3037 goto out_err;
3038
3039 return 0;
3040out_err:
3041 kfree(reply_buf);
3042 kfree(rbd_dev->spec->pool_name);
3043 rbd_dev->spec->pool_name = NULL;
3044
3045 return ret;
3046}
3047
6e14b1a6 3048static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
3049{
3050 size_t size;
3051 int ret;
3052 void *reply_buf;
3053 void *p;
3054 void *end;
3055 u64 seq;
3056 u32 snap_count;
3057 struct ceph_snap_context *snapc;
3058 u32 i;
3059
3060 /*
3061 * We'll need room for the seq value (maximum snapshot id),
3062 * snapshot count, and array of that many snapshot ids.
3063 * For now we have a fixed upper limit on the number we're
3064 * prepared to receive.
3065 */
3066 size = sizeof (__le64) + sizeof (__le32) +
3067 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3068 reply_buf = kzalloc(size, GFP_KERNEL);
3069 if (!reply_buf)
3070 return -ENOMEM;
3071
36be9a76 3072 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35d489f9
AE
3073 "rbd", "get_snapcontext",
3074 NULL, 0,
07b2391f 3075 reply_buf, size, ver);
36be9a76 3076 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35d489f9
AE
3077 if (ret < 0)
3078 goto out;
3079
3080 ret = -ERANGE;
3081 p = reply_buf;
3082 end = (char *) reply_buf + size;
3083 ceph_decode_64_safe(&p, end, seq, out);
3084 ceph_decode_32_safe(&p, end, snap_count, out);
3085
3086 /*
3087 * Make sure the reported number of snapshot ids wouldn't go
3088 * beyond the end of our buffer. But before checking that,
3089 * make sure the computed size of the snapshot context we
3090 * allocate is representable in a size_t.
3091 */
3092 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3093 / sizeof (u64)) {
3094 ret = -EINVAL;
3095 goto out;
3096 }
3097 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3098 goto out;
3099
3100 size = sizeof (struct ceph_snap_context) +
3101 snap_count * sizeof (snapc->snaps[0]);
3102 snapc = kmalloc(size, GFP_KERNEL);
3103 if (!snapc) {
3104 ret = -ENOMEM;
3105 goto out;
3106 }
3107
3108 atomic_set(&snapc->nref, 1);
3109 snapc->seq = seq;
3110 snapc->num_snaps = snap_count;
3111 for (i = 0; i < snap_count; i++)
3112 snapc->snaps[i] = ceph_decode_64(&p);
3113
3114 rbd_dev->header.snapc = snapc;
3115
3116 dout(" snap context seq = %llu, snap_count = %u\n",
3117 (unsigned long long) seq, (unsigned int) snap_count);
3118
3119out:
3120 kfree(reply_buf);
3121
3122 return 0;
3123}
3124
b8b1e2db
AE
3125static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3126{
3127 size_t size;
3128 void *reply_buf;
3129 __le64 snap_id;
3130 int ret;
3131 void *p;
3132 void *end;
b8b1e2db
AE
3133 char *snap_name;
3134
3135 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3136 reply_buf = kmalloc(size, GFP_KERNEL);
3137 if (!reply_buf)
3138 return ERR_PTR(-ENOMEM);
3139
3140 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
36be9a76 3141 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
b8b1e2db
AE
3142 "rbd", "get_snapshot_name",
3143 (char *) &snap_id, sizeof (snap_id),
07b2391f 3144 reply_buf, size, NULL);
36be9a76 3145 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
b8b1e2db
AE
3146 if (ret < 0)
3147 goto out;
3148
3149 p = reply_buf;
3150 end = (char *) reply_buf + size;
e5c35534 3151 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
3152 if (IS_ERR(snap_name)) {
3153 ret = PTR_ERR(snap_name);
3154 goto out;
3155 } else {
3156 dout(" snap_id 0x%016llx snap_name = %s\n",
3157 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3158 }
3159 kfree(reply_buf);
3160
3161 return snap_name;
3162out:
3163 kfree(reply_buf);
3164
3165 return ERR_PTR(ret);
3166}
3167
3168static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3169 u64 *snap_size, u64 *snap_features)
3170{
e0b49868 3171 u64 snap_id;
b8b1e2db
AE
3172 u8 order;
3173 int ret;
3174
3175 snap_id = rbd_dev->header.snapc->snaps[which];
3176 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3177 if (ret)
3178 return ERR_PTR(ret);
3179 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3180 if (ret)
3181 return ERR_PTR(ret);
3182
3183 return rbd_dev_v2_snap_name(rbd_dev, which);
3184}
3185
3186static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3187 u64 *snap_size, u64 *snap_features)
3188{
3189 if (rbd_dev->image_format == 1)
3190 return rbd_dev_v1_snap_info(rbd_dev, which,
3191 snap_size, snap_features);
3192 if (rbd_dev->image_format == 2)
3193 return rbd_dev_v2_snap_info(rbd_dev, which,
3194 snap_size, snap_features);
3195 return ERR_PTR(-EINVAL);
3196}
3197
117973fb
AE
3198static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3199{
3200 int ret;
3201 __u8 obj_order;
3202
3203 down_write(&rbd_dev->header_rwsem);
3204
3205 /* Grab old order first, to see if it changes */
3206
3207 obj_order = rbd_dev->header.obj_order,
3208 ret = rbd_dev_v2_image_size(rbd_dev);
3209 if (ret)
3210 goto out;
3211 if (rbd_dev->header.obj_order != obj_order) {
3212 ret = -EIO;
3213 goto out;
3214 }
3215 rbd_update_mapping_size(rbd_dev);
3216
3217 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3218 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3219 if (ret)
3220 goto out;
3221 ret = rbd_dev_snaps_update(rbd_dev);
3222 dout("rbd_dev_snaps_update returned %d\n", ret);
3223 if (ret)
3224 goto out;
3225 ret = rbd_dev_snaps_register(rbd_dev);
3226 dout("rbd_dev_snaps_register returned %d\n", ret);
3227out:
3228 up_write(&rbd_dev->header_rwsem);
3229
3230 return ret;
3231}
3232
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 *
 * NOTE(review): callers appear to differ on locking — the refresh
 * path holds header_rwsem for write, the probe path relies on the
 * device not yet being visible.  Confirm before changing callers.
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;	/* position in the (new) snapshot context */

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/* Merge-walk the new context and the existing list in parallel */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		/* CEPH_NOSNAP acts as a sentinel when either side runs out */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/*
			 * A previously-existing snapshot is not in
			 * the new snap context.
			 *
			 * If the now missing snapshot is the one the
			 * image is mapped to, clear its exists flag
			 * so we can avoid sending any more requests
			 * to it.
			 */
			if (rbd_dev->spec->snap_id == snap->id)
				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
			rbd_remove_snap_dev(snap);
			/*
			 * NOTE(review): snap->id is read again below after
			 * rbd_remove_snap_dev(); if that call frees snap
			 * this is a use-after-free — confirm its semantics.
			 */
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		/* Fetch name/size/features for the context entry at index */
		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		/*
		 * NOTE(review): this dout prints snap_count where "index"
		 * looks intended — the message reads "entry %u".
		 */
		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/*
			 * NOTE(review): snap_name is not freed on this
			 * path; for format 2 images rbd_dev_snap_info()
			 * appears to return an allocation, so this looks
			 * like a leak — confirm v1 vs v2 ownership.
			 */

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
3344
304f6808
AE
3345/*
3346 * Scan the list of snapshots and register the devices for any that
3347 * have not already been registered.
3348 */
3349static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3350{
3351 struct rbd_snap *snap;
3352 int ret = 0;
3353
37206ee5 3354 dout("%s:\n", __func__);
86ff77bb
AE
3355 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3356 return -EIO;
304f6808
AE
3357
3358 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3359 if (!rbd_snap_registered(snap)) {
3360 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3361 if (ret < 0)
3362 break;
3363 }
3364 }
3365 dout("%s: returning %d\n", __func__, ret);
3366
3367 return ret;
3368}
3369
dfc5606d
YS
3370static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3371{
dfc5606d 3372 struct device *dev;
cd789ab9 3373 int ret;
dfc5606d
YS
3374
3375 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 3376
cd789ab9 3377 dev = &rbd_dev->dev;
dfc5606d
YS
3378 dev->bus = &rbd_bus_type;
3379 dev->type = &rbd_device_type;
3380 dev->parent = &rbd_root_dev;
3381 dev->release = rbd_dev_release;
de71a297 3382 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 3383 ret = device_register(dev);
dfc5606d 3384
dfc5606d 3385 mutex_unlock(&ctl_mutex);
cd789ab9 3386
dfc5606d 3387 return ret;
602adf40
YS
3388}
3389
dfc5606d
YS
/* Remove the rbd device from sysfs (counterpart of rbd_bus_add_dev()). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3394
e2839308 3395static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3396
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() means the first id handed out is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 3411
1ddbe94e 3412/*
499afd5b
AE
3413 * Remove an rbd_dev from the global list, and record that its
3414 * identifier is no longer in use.
1ddbe94e 3415 */
e2839308 3416static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 3417{
d184f6bf 3418 struct list_head *tmp;
de71a297 3419 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
3420 int max_id;
3421
aafb230e 3422 rbd_assert(rbd_id > 0);
499afd5b 3423
e2839308
AE
3424 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3425 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
3426 spin_lock(&rbd_dev_list_lock);
3427 list_del_init(&rbd_dev->node);
d184f6bf
AE
3428
3429 /*
3430 * If the id being "put" is not the current maximum, there
3431 * is nothing special we need to do.
3432 */
e2839308 3433 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
3434 spin_unlock(&rbd_dev_list_lock);
3435 return;
3436 }
3437
3438 /*
3439 * We need to update the current maximum id. Search the
3440 * list to find out what it is. We're more likely to find
3441 * the maximum at the end, so search the list backward.
3442 */
3443 max_id = 0;
3444 list_for_each_prev(tmp, &rbd_dev_list) {
3445 struct rbd_device *rbd_dev;
3446
3447 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
3448 if (rbd_dev->dev_id > max_id)
3449 max_id = rbd_dev->dev_id;
d184f6bf 3450 }
499afd5b 3451 spin_unlock(&rbd_dev_list_lock);
b7f23c36 3452
1ddbe94e 3453 /*
e2839308 3454 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
3455 * which case it now accurately reflects the new maximum.
3456 * Be careful not to overwrite the maximum value in that
3457 * case.
1ddbe94e 3458 */
e2839308
AE
3459 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3460 dout(" max dev id has been reset\n");
b7f23c36
AE
3461}
3462
e28fff26
AE
3463/*
3464 * Skips over white space at *buf, and updates *buf to point to the
3465 * first found non-space character (if any). Returns the length of
593a9e7b
AE
3466 * the token (string of non-white space characters) found. Note
3467 * that *buf must be terminated with '\0'.
e28fff26
AE
3468 */
3469static inline size_t next_token(const char **buf)
3470{
3471 /*
3472 * These are the characters that produce nonzero for
3473 * isspace() in the "C" and "POSIX" locales.
3474 */
3475 const char *spaces = " \f\n\r\t\v";
3476
3477 *buf += strspn(*buf, spaces); /* Find start of token */
3478
3479 return strcspn(*buf, spaces); /* Return token length */
3480}
3481
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* Characters isspace() flags in the "C" and "POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, spaces);		/* find start of token */
	len = strcspn(*buf, spaces);		/* token length */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;				/* advance even if too big */

	return len;
}
3511
ea3352f4
AE
3512/*
3513 * Finds the next token in *buf, dynamically allocates a buffer big
3514 * enough to hold a copy of it, and copies the token into the new
3515 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3516 * that a duplicate buffer is created even for a zero-length token.
3517 *
3518 * Returns a pointer to the newly-allocated duplicate, or a null
3519 * pointer if memory for the duplicate was not available. If
3520 * the lenp argument is a non-null pointer, the length of the token
3521 * (not including the '\0') is returned in *lenp.
3522 *
3523 * If successful, the *buf pointer will be updated to point beyond
3524 * the end of the found token.
3525 *
3526 * Note: uses GFP_KERNEL for allocation.
3527 */
3528static inline char *dup_token(const char **buf, size_t *lenp)
3529{
3530 char *dup;
3531 size_t len;
3532
3533 len = next_token(buf);
4caf35f9 3534 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
3535 if (!dup)
3536 return NULL;
ea3352f4
AE
3537 *(dup + len) = '\0';
3538 *buf += len;
3539
3540 if (lenp)
3541 *lenp = len;
3542
3543 return dup;
3544}
3545
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* Monitor addresses are not copied; they are parsed in place below */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;	/* default errno for the empty-token checks below */
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Duplicate len+1 bytes, then force the terminator ourselves */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand ownership of all three results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3689
589d30e0
AE
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* GFP_NOIO: may be called on the block I/O path during refresh */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	/* sizeof (RBD_ID_PREFIX) includes the '\0', so this fits exactly */
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * NOTE(review): the buffer holds sizeof(__le32) +
	 * RBD_IMAGE_ID_LEN_MAX bytes but only RBD_IMAGE_ID_LEN_MAX is
	 * passed as its capacity here — confirm this is intentional.
	 */
	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;	/* leave field unchanged */
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3764
a30b71b9
AE
3765static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3766{
3767 int ret;
3768 size_t size;
3769
3770 /* Version 1 images have no id; empty string is used */
3771
0d7dbfce
AE
3772 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3773 if (!rbd_dev->spec->image_id)
a30b71b9 3774 return -ENOMEM;
a30b71b9
AE
3775
3776 /* Record the header object name for this rbd image. */
3777
69e7a02f 3778 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
a30b71b9
AE
3779 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3780 if (!rbd_dev->header_name) {
3781 ret = -ENOMEM;
3782 goto out_err;
3783 }
0d7dbfce
AE
3784 sprintf(rbd_dev->header_name, "%s%s",
3785 rbd_dev->spec->image_name, RBD_SUFFIX);
a30b71b9
AE
3786
3787 /* Populate rbd image metadata */
3788
3789 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3790 if (ret < 0)
3791 goto out_err;
86b00e0d
AE
3792
3793 /* Version 1 images have no parent (no layering) */
3794
3795 rbd_dev->parent_spec = NULL;
3796 rbd_dev->parent_overlap = 0;
3797
a30b71b9
AE
3798 rbd_dev->image_format = 1;
3799
3800 dout("discovered version 1 image, header name is %s\n",
3801 rbd_dev->header_name);
3802
3803 return 0;
3804
3805out_err:
3806 kfree(rbd_dev->header_name);
3807 rbd_dev->header_name = NULL;
0d7dbfce
AE
3808 kfree(rbd_dev->spec->image_id);
3809 rbd_dev->spec->image_id = NULL;
a30b71b9
AE
3810
3811 return ret;
3812}
3813
/*
 * Finish probing a format 2 image: build the header object name
 * from the (already-known) image id, then fetch the image's size,
 * object prefix, features, parent info (if layered) and snapshot
 * context from the OSD.  On failure, everything acquired here is
 * released again so the rbd_dev is left as it was found.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	/* sizeof (RBD_HEADER_PREFIX) includes the '\0' terminator */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything this function may have set, newest first */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3886
83a06263
AE
/*
 * Second stage of probing, common to both image formats: build the
 * snapshot list, fill in the spec names, set up the mapping, then
 * allocate a device id and wire up the block device and sysfs
 * entries.  Once rbd_bus_add_dev() has succeeded, error cleanup is
 * handed off to the sysfs release path (rbd_bus_del_dev()); before
 * that point each acquired resource is unwound explicitly.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	/* register_blkdev(0, ...) dynamically allocates a major number */
	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	/* Start watching the header object for change notifications */
	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3968
a30b71b9
AE
3969/*
3970 * Probe for the existence of the header object for the given rbd
3971 * device. For format 2 images this includes determining the image
3972 * id.
3973 */
3974static int rbd_dev_probe(struct rbd_device *rbd_dev)
3975{
3976 int ret;
3977
3978 /*
3979 * Get the id from the image id object. If it's not a
3980 * format 2 image, we'll get ENOENT back, and we'll assume
3981 * it's a format 1 image.
3982 */
3983 ret = rbd_dev_image_id(rbd_dev);
3984 if (ret)
3985 ret = rbd_dev_v1_probe(rbd_dev);
3986 else
3987 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 3988 if (ret) {
a30b71b9
AE
3989 dout("probe failed, returning %d\n", ret);
3990
83a06263
AE
3991 return ret;
3992 }
3993
3994 ret = rbd_dev_probe_finish(rbd_dev);
3995 if (ret)
3996 rbd_header_free(&rbd_dev->header);
3997
a30b71b9
AE
3998 return ret;
3999}
4000
59c2be1e
YS
/*
 * sysfs "add" handler (/sys/bus/rbd/add): parse the user-supplied
 * mapping description, connect to the cluster, resolve the pool,
 * and probe/register the image as a block device.
 *
 * Ownership of ceph_opts, rbdc and spec is handed off step by step
 * (each pointer is NULLed once the next object owns it) so the
 * error labels below free only what is still owned here.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
4073
de71a297 4074static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
4075{
4076 struct list_head *tmp;
4077 struct rbd_device *rbd_dev;
4078
e124a82f 4079 spin_lock(&rbd_dev_list_lock);
602adf40
YS
4080 list_for_each(tmp, &rbd_dev_list) {
4081 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 4082 if (rbd_dev->dev_id == dev_id) {
e124a82f 4083 spin_unlock(&rbd_dev_list_lock);
602adf40 4084 return rbd_dev;
e124a82f 4085 }
602adf40 4086 }
e124a82f 4087 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
4088 return NULL;
4089}
4090
dfc5606d 4091static void rbd_dev_release(struct device *dev)
602adf40 4092{
593a9e7b 4093 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 4094
59c2be1e 4095 if (rbd_dev->watch_event)
9969ebc5 4096 rbd_dev_header_watch_sync(rbd_dev, 0);
602adf40
YS
4097
4098 /* clean up and free blkdev */
4099 rbd_free_disk(rbd_dev);
4100 unregister_blkdev(rbd_dev->major, rbd_dev->name);
32eec68d 4101
2ac4e75d
AE
4102 /* release allocated disk header fields */
4103 rbd_header_free(&rbd_dev->header);
4104
32eec68d 4105 /* done with the id, and with the rbd_dev */
e2839308 4106 rbd_dev_id_put(rbd_dev);
c53d5893
AE
4107 rbd_assert(rbd_dev->rbd_client != NULL);
4108 rbd_dev_destroy(rbd_dev);
602adf40
YS
4109
4110 /* release module ref */
4111 module_put(THIS_MODULE);
602adf40
YS
4112}
4113
dfc5606d
YS
4114static ssize_t rbd_remove(struct bus_type *bus,
4115 const char *buf,
4116 size_t count)
602adf40
YS
4117{
4118 struct rbd_device *rbd_dev = NULL;
4119 int target_id, rc;
4120 unsigned long ul;
4121 int ret = count;
4122
4123 rc = strict_strtoul(buf, 10, &ul);
4124 if (rc)
4125 return rc;
4126
4127 /* convert to int; abort if we lost anything in the conversion */
4128 target_id = (int) ul;
4129 if (target_id != ul)
4130 return -EINVAL;
4131
4132 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4133
4134 rbd_dev = __rbd_get_dev(target_id);
4135 if (!rbd_dev) {
4136 ret = -ENOENT;
4137 goto done;
42382b70
AE
4138 }
4139
a14ea269 4140 spin_lock_irq(&rbd_dev->lock);
b82d167b 4141 if (rbd_dev->open_count)
42382b70 4142 ret = -EBUSY;
b82d167b
AE
4143 else
4144 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
a14ea269 4145 spin_unlock_irq(&rbd_dev->lock);
b82d167b 4146 if (ret < 0)
42382b70 4147 goto done;
602adf40 4148
41f38c2b 4149 rbd_remove_all_snaps(rbd_dev);
dfc5606d 4150 rbd_bus_del_dev(rbd_dev);
602adf40
YS
4151
4152done:
4153 mutex_unlock(&ctl_mutex);
aafb230e 4154
602adf40
YS
4155 return ret;
4156}
4157
602adf40
YS
4158/*
4159 * create control files in sysfs
dfc5606d 4160 * /sys/bus/rbd/...
602adf40
YS
4161 */
4162static int rbd_sysfs_init(void)
4163{
dfc5606d 4164 int ret;
602adf40 4165
fed4c143 4166 ret = device_register(&rbd_root_dev);
21079786 4167 if (ret < 0)
dfc5606d 4168 return ret;
602adf40 4169
fed4c143
AE
4170 ret = bus_register(&rbd_bus_type);
4171 if (ret < 0)
4172 device_unregister(&rbd_root_dev);
602adf40 4173
602adf40
YS
4174 return ret;
4175}
4176
4177static void rbd_sysfs_cleanup(void)
4178{
dfc5606d 4179 bus_unregister(&rbd_bus_type);
fed4c143 4180 device_unregister(&rbd_root_dev);
602adf40
YS
4181}
4182
cc344fa1 4183static int __init rbd_init(void)
602adf40
YS
4184{
4185 int rc;
4186
1e32d34c
AE
4187 if (!libceph_compatible(NULL)) {
4188 rbd_warn(NULL, "libceph incompatibility (quitting)");
4189
4190 return -EINVAL;
4191 }
602adf40
YS
4192 rc = rbd_sysfs_init();
4193 if (rc)
4194 return rc;
f0f8cef5 4195 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
4196 return 0;
4197}
4198
cc344fa1 4199static void __exit rbd_exit(void)
602adf40
YS
4200{
4201 rbd_sysfs_cleanup();
4202}
4203
4204module_init(rbd_init);
4205module_exit(rbd_exit);
4206
4207MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4208MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4209MODULE_DESCRIPTION("rados block device");
4210
4211/* following authorship retained from original osdblk.c */
4212MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4213
4214MODULE_LICENSE("GPL");