]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - drivers/block/rbd.c
rbd: record mapped size
[mirror_ubuntu-bionic-kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
602adf40
YS
64#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_OPT_LEN 1024
66
67#define RBD_SNAP_HEAD_NAME "-"
68
81a89793
AE
69/*
70 * An RBD device name will be "rbd#", where the "rbd" comes from
71 * RBD_DRV_NAME above, and # is a unique integer identifier.
72 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
73 * enough to hold all possible device names.
74 */
602adf40 75#define DEV_NAME_LEN 32
81a89793 76#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 77
cc0538b6 78#define RBD_READ_ONLY_DEFAULT false
59c2be1e 79
602adf40
YS
80/*
81 * block device image metadata (in-memory version)
82 */
83struct rbd_image_header {
f84344f3 84 /* These four fields never change for a given rbd image */
849b4260 85 char *object_prefix;
602adf40
YS
86 __u8 obj_order;
87 __u8 crypt_type;
88 __u8 comp_type;
602adf40 89
f84344f3
AE
90 /* The remaining fields need to be updated occasionally */
91 u64 image_size;
92 struct ceph_snap_context *snapc;
602adf40
YS
93 char *snap_names;
94 u64 *snap_sizes;
59c2be1e
YS
95
96 u64 obj_version;
97};
98
99struct rbd_options {
cc0538b6 100 bool read_only;
602adf40
YS
101};
102
103/*
f0f8cef5 104 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
105 */
106struct rbd_client {
107 struct ceph_client *client;
108 struct kref kref;
109 struct list_head node;
110};
111
112/*
f0f8cef5 113 * a request completion status
602adf40 114 */
1fec7093
YS
115struct rbd_req_status {
116 int done;
117 int rc;
118 u64 bytes;
119};
120
121/*
122 * a collection of requests
123 */
124struct rbd_req_coll {
125 int total;
126 int num_done;
127 struct kref kref;
128 struct rbd_req_status status[0];
602adf40
YS
129};
130
f0f8cef5
AE
131/*
132 * a single io request
133 */
134struct rbd_request {
135 struct request *rq; /* blk layer request */
136 struct bio *bio; /* cloned bio */
137 struct page **pages; /* list of used pages */
138 u64 len;
139 int coll_index;
140 struct rbd_req_coll *coll;
141};
142
dfc5606d
YS
143struct rbd_snap {
144 struct device dev;
145 const char *name;
3591538f 146 u64 size;
dfc5606d
YS
147 struct list_head node;
148 u64 id;
149};
150
f84344f3
AE
151struct rbd_mapping {
152 char *snap_name;
153 u64 snap_id;
99c1f08f 154 u64 size;
f84344f3
AE
155 bool snap_exists;
156 bool read_only;
157};
158
602adf40
YS
159/*
160 * a single device
161 */
162struct rbd_device {
de71a297 163 int dev_id; /* blkdev unique id */
602adf40
YS
164
165 int major; /* blkdev assigned major */
166 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 167
f8c38929 168 struct rbd_options rbd_opts;
602adf40
YS
169 struct rbd_client *rbd_client;
170
171 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
172
173 spinlock_t lock; /* queue lock */
174
175 struct rbd_image_header header;
0bed54dc
AE
176 char *image_name;
177 size_t image_name_len;
178 char *header_name;
d22f76e7 179 char *pool_name;
9bb2f334 180 int pool_id;
602adf40 181
59c2be1e
YS
182 struct ceph_osd_event *watch_event;
183 struct ceph_osd_request *watch_request;
184
c666601a
JD
185 /* protects updating the header */
186 struct rw_semaphore header_rwsem;
f84344f3
AE
187
188 struct rbd_mapping mapping;
602adf40
YS
189
190 struct list_head node;
dfc5606d
YS
191
192 /* list of snapshots */
193 struct list_head snaps;
194
195 /* sysfs related */
196 struct device dev;
197};
198
602adf40 199static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 200
602adf40 201static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
202static DEFINE_SPINLOCK(rbd_dev_list_lock);
203
432b8587
AE
204static LIST_HEAD(rbd_client_list); /* clients */
205static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 206
9fcbb800 207static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
dfc5606d 208static void rbd_dev_release(struct device *dev);
dfc5606d
YS
209static ssize_t rbd_snap_add(struct device *dev,
210 struct device_attribute *attr,
211 const char *buf,
212 size_t count);
14e7085d 213static void __rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 214
f0f8cef5
AE
215static ssize_t rbd_add(struct bus_type *bus, const char *buf,
216 size_t count);
217static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
218 size_t count);
219
220static struct bus_attribute rbd_bus_attrs[] = {
221 __ATTR(add, S_IWUSR, NULL, rbd_add),
222 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
223 __ATTR_NULL
224};
225
226static struct bus_type rbd_bus_type = {
227 .name = "rbd",
228 .bus_attrs = rbd_bus_attrs,
229};
230
231static void rbd_root_dev_release(struct device *dev)
232{
233}
234
235static struct device rbd_root_dev = {
236 .init_name = "rbd",
237 .release = rbd_root_dev_release,
238};
239
aafb230e
AE
240#ifdef RBD_DEBUG
241#define rbd_assert(expr) \
242 if (unlikely(!(expr))) { \
243 printk(KERN_ERR "\nAssertion failure in %s() " \
244 "at line %d:\n\n" \
245 "\trbd_assert(%s);\n\n", \
246 __func__, __LINE__, #expr); \
247 BUG(); \
248 }
249#else /* !RBD_DEBUG */
250# define rbd_assert(expr) ((void) 0)
251#endif /* !RBD_DEBUG */
dfc5606d 252
dfc5606d
YS
253static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
254{
255 return get_device(&rbd_dev->dev);
256}
257
258static void rbd_put_dev(struct rbd_device *rbd_dev)
259{
260 put_device(&rbd_dev->dev);
261}
602adf40 262
1fe5e993 263static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 264
602adf40
YS
265static int rbd_open(struct block_device *bdev, fmode_t mode)
266{
f0f8cef5 267 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 268
f84344f3 269 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
270 return -EROFS;
271
340c7a2b 272 rbd_get_dev(rbd_dev);
f84344f3 273 set_device_ro(bdev, rbd_dev->mapping.read_only);
340c7a2b 274
602adf40
YS
275 return 0;
276}
277
dfc5606d
YS
278static int rbd_release(struct gendisk *disk, fmode_t mode)
279{
280 struct rbd_device *rbd_dev = disk->private_data;
281
282 rbd_put_dev(rbd_dev);
283
284 return 0;
285}
286
602adf40
YS
287static const struct block_device_operations rbd_bd_ops = {
288 .owner = THIS_MODULE,
289 .open = rbd_open,
dfc5606d 290 .release = rbd_release,
602adf40
YS
291};
292
293/*
294 * Initialize an rbd client instance.
43ae4701 295 * We own *ceph_opts.
602adf40 296 */
f8c38929 297static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
298{
299 struct rbd_client *rbdc;
300 int ret = -ENOMEM;
301
302 dout("rbd_client_create\n");
303 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
304 if (!rbdc)
305 goto out_opt;
306
307 kref_init(&rbdc->kref);
308 INIT_LIST_HEAD(&rbdc->node);
309
bc534d86
AE
310 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
311
43ae4701 312 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 313 if (IS_ERR(rbdc->client))
bc534d86 314 goto out_mutex;
43ae4701 315 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
316
317 ret = ceph_open_session(rbdc->client);
318 if (ret < 0)
319 goto out_err;
320
432b8587 321 spin_lock(&rbd_client_list_lock);
602adf40 322 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 323 spin_unlock(&rbd_client_list_lock);
602adf40 324
bc534d86
AE
325 mutex_unlock(&ctl_mutex);
326
602adf40
YS
327 dout("rbd_client_create created %p\n", rbdc);
328 return rbdc;
329
330out_err:
331 ceph_destroy_client(rbdc->client);
bc534d86
AE
332out_mutex:
333 mutex_unlock(&ctl_mutex);
602adf40
YS
334 kfree(rbdc);
335out_opt:
43ae4701
AE
336 if (ceph_opts)
337 ceph_destroy_options(ceph_opts);
28f259b7 338 return ERR_PTR(ret);
602adf40
YS
339}
340
341/*
1f7ba331
AE
342 * Find a ceph client with specific addr and configuration. If
343 * found, bump its reference count.
602adf40 344 */
1f7ba331 345static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
346{
347 struct rbd_client *client_node;
1f7ba331 348 bool found = false;
602adf40 349
43ae4701 350 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
351 return NULL;
352
1f7ba331
AE
353 spin_lock(&rbd_client_list_lock);
354 list_for_each_entry(client_node, &rbd_client_list, node) {
355 if (!ceph_compare_options(ceph_opts, client_node->client)) {
356 kref_get(&client_node->kref);
357 found = true;
358 break;
359 }
360 }
361 spin_unlock(&rbd_client_list_lock);
362
363 return found ? client_node : NULL;
602adf40
YS
364}
365
59c2be1e
YS
366/*
367 * mount options
368 */
369enum {
59c2be1e
YS
370 Opt_last_int,
371 /* int args above */
372 Opt_last_string,
373 /* string args above */
cc0538b6
AE
374 Opt_read_only,
375 Opt_read_write,
376 /* Boolean args above */
377 Opt_last_bool,
59c2be1e
YS
378};
379
43ae4701 380static match_table_t rbd_opts_tokens = {
59c2be1e
YS
381 /* int args above */
382 /* string args above */
f84344f3 383 {Opt_read_only, "mapping.read_only"},
cc0538b6
AE
384 {Opt_read_only, "ro"}, /* Alternate spelling */
385 {Opt_read_write, "read_write"},
386 {Opt_read_write, "rw"}, /* Alternate spelling */
387 /* Boolean args above */
59c2be1e
YS
388 {-1, NULL}
389};
390
391static int parse_rbd_opts_token(char *c, void *private)
392{
43ae4701 393 struct rbd_options *rbd_opts = private;
59c2be1e
YS
394 substring_t argstr[MAX_OPT_ARGS];
395 int token, intval, ret;
396
43ae4701 397 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
398 if (token < 0)
399 return -EINVAL;
400
401 if (token < Opt_last_int) {
402 ret = match_int(&argstr[0], &intval);
403 if (ret < 0) {
404 pr_err("bad mount option arg (not int) "
405 "at '%s'\n", c);
406 return ret;
407 }
408 dout("got int token %d val %d\n", token, intval);
409 } else if (token > Opt_last_int && token < Opt_last_string) {
410 dout("got string token %d val %s\n", token,
411 argstr[0].from);
cc0538b6
AE
412 } else if (token > Opt_last_string && token < Opt_last_bool) {
413 dout("got Boolean token %d\n", token);
59c2be1e
YS
414 } else {
415 dout("got token %d\n", token);
416 }
417
418 switch (token) {
cc0538b6
AE
419 case Opt_read_only:
420 rbd_opts->read_only = true;
421 break;
422 case Opt_read_write:
423 rbd_opts->read_only = false;
424 break;
59c2be1e 425 default:
aafb230e
AE
426 rbd_assert(false);
427 break;
59c2be1e
YS
428 }
429 return 0;
430}
431
602adf40
YS
432/*
433 * Get a ceph client with specific addr and configuration, if one does
434 * not exist create it.
435 */
f8c38929
AE
436static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
437 size_t mon_addr_len, char *options)
602adf40 438{
f8c38929 439 struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
43ae4701 440 struct ceph_options *ceph_opts;
f8c38929 441 struct rbd_client *rbdc;
59c2be1e 442
cc0538b6 443 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
602adf40 444
43ae4701
AE
445 ceph_opts = ceph_parse_options(options, mon_addr,
446 mon_addr + mon_addr_len,
447 parse_rbd_opts_token, rbd_opts);
f8c38929
AE
448 if (IS_ERR(ceph_opts))
449 return PTR_ERR(ceph_opts);
602adf40 450
1f7ba331 451 rbdc = rbd_client_find(ceph_opts);
602adf40 452 if (rbdc) {
602adf40 453 /* using an existing client */
43ae4701 454 ceph_destroy_options(ceph_opts);
f8c38929
AE
455 } else {
456 rbdc = rbd_client_create(ceph_opts);
457 if (IS_ERR(rbdc))
458 return PTR_ERR(rbdc);
602adf40 459 }
f8c38929 460 rbd_dev->rbd_client = rbdc;
602adf40 461
f8c38929 462 return 0;
602adf40
YS
463}
464
465/*
466 * Destroy ceph client
d23a4b3f 467 *
432b8587 468 * Caller must hold rbd_client_list_lock.
602adf40
YS
469 */
470static void rbd_client_release(struct kref *kref)
471{
472 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
473
474 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 475 spin_lock(&rbd_client_list_lock);
602adf40 476 list_del(&rbdc->node);
cd9d9f5d 477 spin_unlock(&rbd_client_list_lock);
602adf40
YS
478
479 ceph_destroy_client(rbdc->client);
480 kfree(rbdc);
481}
482
483/*
484 * Drop reference to ceph client node. If it's not referenced anymore, release
485 * it.
486 */
487static void rbd_put_client(struct rbd_device *rbd_dev)
488{
489 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
490 rbd_dev->rbd_client = NULL;
602adf40
YS
491}
492
1fec7093
YS
493/*
494 * Destroy requests collection
495 */
496static void rbd_coll_release(struct kref *kref)
497{
498 struct rbd_req_coll *coll =
499 container_of(kref, struct rbd_req_coll, kref);
500
501 dout("rbd_coll_release %p\n", coll);
502 kfree(coll);
503}
602adf40 504
8e94af8e
AE
505static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
506{
103a150f
AE
507 size_t size;
508 u32 snap_count;
509
510 /* The header has to start with the magic rbd header text */
511 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
512 return false;
513
514 /*
515 * The size of a snapshot header has to fit in a size_t, and
516 * that limits the number of snapshots.
517 */
518 snap_count = le32_to_cpu(ondisk->snap_count);
519 size = SIZE_MAX - sizeof (struct ceph_snap_context);
520 if (snap_count > size / sizeof (__le64))
521 return false;
522
523 /*
524 * Not only that, but the size of the entire the snapshot
525 * header must also be representable in a size_t.
526 */
527 size -= snap_count * sizeof (__le64);
528 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
529 return false;
530
531 return true;
8e94af8e
AE
532}
533
602adf40
YS
534/*
535 * Create a new header structure, translate header format from the on-disk
536 * header.
537 */
538static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 539 struct rbd_image_header_ondisk *ondisk)
602adf40 540{
ccece235 541 u32 snap_count;
58c17b0e 542 size_t len;
d2bb24e5 543 size_t size;
621901d6 544 u32 i;
602adf40 545
6a52325f
AE
546 memset(header, 0, sizeof (*header));
547
103a150f
AE
548 snap_count = le32_to_cpu(ondisk->snap_count);
549
58c17b0e
AE
550 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
551 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 552 if (!header->object_prefix)
602adf40 553 return -ENOMEM;
58c17b0e
AE
554 memcpy(header->object_prefix, ondisk->object_prefix, len);
555 header->object_prefix[len] = '\0';
00f1f36f 556
602adf40 557 if (snap_count) {
f785cc1d
AE
558 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
559
621901d6
AE
560 /* Save a copy of the snapshot names */
561
f785cc1d
AE
562 if (snap_names_len > (u64) SIZE_MAX)
563 return -EIO;
564 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 565 if (!header->snap_names)
6a52325f 566 goto out_err;
f785cc1d
AE
567 /*
568 * Note that rbd_dev_v1_header_read() guarantees
569 * the ondisk buffer we're working with has
570 * snap_names_len bytes beyond the end of the
571 * snapshot id array, this memcpy() is safe.
572 */
573 memcpy(header->snap_names, &ondisk->snaps[snap_count],
574 snap_names_len);
6a52325f 575
621901d6
AE
576 /* Record each snapshot's size */
577
d2bb24e5
AE
578 size = snap_count * sizeof (*header->snap_sizes);
579 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 580 if (!header->snap_sizes)
6a52325f 581 goto out_err;
621901d6
AE
582 for (i = 0; i < snap_count; i++)
583 header->snap_sizes[i] =
584 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 585 } else {
ccece235 586 WARN_ON(ondisk->snap_names_len);
602adf40
YS
587 header->snap_names = NULL;
588 header->snap_sizes = NULL;
589 }
849b4260 590
602adf40
YS
591 header->obj_order = ondisk->options.order;
592 header->crypt_type = ondisk->options.crypt_type;
593 header->comp_type = ondisk->options.comp_type;
6a52325f 594
621901d6
AE
595 /* Allocate and fill in the snapshot context */
596
f84344f3 597 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
598 size = sizeof (struct ceph_snap_context);
599 size += snap_count * sizeof (header->snapc->snaps[0]);
600 header->snapc = kzalloc(size, GFP_KERNEL);
601 if (!header->snapc)
602 goto out_err;
602adf40
YS
603
604 atomic_set(&header->snapc->nref, 1);
505cbb9b 605 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 606 header->snapc->num_snaps = snap_count;
621901d6
AE
607 for (i = 0; i < snap_count; i++)
608 header->snapc->snaps[i] =
609 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
610
611 return 0;
612
6a52325f 613out_err:
849b4260 614 kfree(header->snap_sizes);
ccece235 615 header->snap_sizes = NULL;
602adf40 616 kfree(header->snap_names);
ccece235 617 header->snap_names = NULL;
6a52325f
AE
618 kfree(header->object_prefix);
619 header->object_prefix = NULL;
ccece235 620
00f1f36f 621 return -ENOMEM;
602adf40
YS
622}
623
602adf40
YS
624static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
625 u64 *seq, u64 *size)
626{
627 int i;
628 char *p = header->snap_names;
629
c9aadfe7
AE
630 rbd_assert(header->snapc != NULL);
631 for (i = 0; i < header->snapc->num_snaps; i++) {
00f1f36f 632 if (!strcmp(snap_name, p)) {
602adf40 633
00f1f36f 634 /* Found it. Pass back its id and/or size */
602adf40 635
00f1f36f
AE
636 if (seq)
637 *seq = header->snapc->snaps[i];
638 if (size)
639 *size = header->snap_sizes[i];
640 return i;
641 }
642 p += strlen(p) + 1; /* Skip ahead to the next name */
643 }
644 return -ENOENT;
602adf40
YS
645}
646
99c1f08f 647static int rbd_header_set_snap(struct rbd_device *rbd_dev)
602adf40 648{
78dc447d 649 int ret;
602adf40 650
0ce1a794 651 down_write(&rbd_dev->header_rwsem);
602adf40 652
f84344f3 653 if (!memcmp(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 654 sizeof (RBD_SNAP_HEAD_NAME))) {
f84344f3 655 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
99c1f08f 656 rbd_dev->mapping.size = rbd_dev->header.image_size;
f84344f3
AE
657 rbd_dev->mapping.snap_exists = false;
658 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
602adf40 659 } else {
f84344f3
AE
660 ret = snap_by_name(&rbd_dev->header,
661 rbd_dev->mapping.snap_name,
99c1f08f
AE
662 &rbd_dev->mapping.snap_id,
663 &rbd_dev->mapping.size);
602adf40
YS
664 if (ret < 0)
665 goto done;
f84344f3
AE
666 rbd_dev->mapping.snap_exists = true;
667 rbd_dev->mapping.read_only = true;
602adf40
YS
668 }
669
670 ret = 0;
671done:
0ce1a794 672 up_write(&rbd_dev->header_rwsem);
602adf40
YS
673 return ret;
674}
675
676static void rbd_header_free(struct rbd_image_header *header)
677{
849b4260 678 kfree(header->object_prefix);
d78fd7ae 679 header->object_prefix = NULL;
602adf40 680 kfree(header->snap_sizes);
d78fd7ae 681 header->snap_sizes = NULL;
849b4260 682 kfree(header->snap_names);
d78fd7ae 683 header->snap_names = NULL;
d1d25646 684 ceph_put_snap_context(header->snapc);
d78fd7ae 685 header->snapc = NULL;
602adf40
YS
686}
687
65ccfe21 688static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 689{
65ccfe21
AE
690 char *name;
691 u64 segment;
692 int ret;
602adf40 693
65ccfe21
AE
694 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
695 if (!name)
696 return NULL;
697 segment = offset >> rbd_dev->header.obj_order;
698 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
699 rbd_dev->header.object_prefix, segment);
700 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
701 pr_err("error formatting segment name for #%llu (%d)\n",
702 segment, ret);
703 kfree(name);
704 name = NULL;
705 }
602adf40 706
65ccfe21
AE
707 return name;
708}
602adf40 709
65ccfe21
AE
710static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
711{
712 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 713
65ccfe21
AE
714 return offset & (segment_size - 1);
715}
716
717static u64 rbd_segment_length(struct rbd_device *rbd_dev,
718 u64 offset, u64 length)
719{
720 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
721
722 offset &= segment_size - 1;
723
aafb230e 724 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
725 if (offset + length > segment_size)
726 length = segment_size - offset;
727
728 return length;
602adf40
YS
729}
730
1fec7093
YS
731static int rbd_get_num_segments(struct rbd_image_header *header,
732 u64 ofs, u64 len)
733{
df111be6
AE
734 u64 start_seg;
735 u64 end_seg;
736
737 if (!len)
738 return 0;
739 if (len - 1 > U64_MAX - ofs)
740 return -ERANGE;
741
742 start_seg = ofs >> header->obj_order;
743 end_seg = (ofs + len - 1) >> header->obj_order;
744
1fec7093
YS
745 return end_seg - start_seg + 1;
746}
747
029bcbd8
JD
748/*
749 * returns the size of an object in the image
750 */
751static u64 rbd_obj_bytes(struct rbd_image_header *header)
752{
753 return 1 << header->obj_order;
754}
755
602adf40
YS
756/*
757 * bio helpers
758 */
759
760static void bio_chain_put(struct bio *chain)
761{
762 struct bio *tmp;
763
764 while (chain) {
765 tmp = chain;
766 chain = chain->bi_next;
767 bio_put(tmp);
768 }
769}
770
771/*
772 * zeros a bio chain, starting at specific offset
773 */
774static void zero_bio_chain(struct bio *chain, int start_ofs)
775{
776 struct bio_vec *bv;
777 unsigned long flags;
778 void *buf;
779 int i;
780 int pos = 0;
781
782 while (chain) {
783 bio_for_each_segment(bv, chain, i) {
784 if (pos + bv->bv_len > start_ofs) {
785 int remainder = max(start_ofs - pos, 0);
786 buf = bvec_kmap_irq(bv, &flags);
787 memset(buf + remainder, 0,
788 bv->bv_len - remainder);
85b5aaa6 789 bvec_kunmap_irq(buf, &flags);
602adf40
YS
790 }
791 pos += bv->bv_len;
792 }
793
794 chain = chain->bi_next;
795 }
796}
797
798/*
799 * bio_chain_clone - clone a chain of bios up to a certain length.
800 * might return a bio_pair that will need to be released.
801 */
802static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
803 struct bio_pair **bp,
804 int len, gfp_t gfpmask)
805{
542582fc
AE
806 struct bio *old_chain = *old;
807 struct bio *new_chain = NULL;
808 struct bio *tail;
602adf40
YS
809 int total = 0;
810
811 if (*bp) {
812 bio_pair_release(*bp);
813 *bp = NULL;
814 }
815
816 while (old_chain && (total < len)) {
542582fc
AE
817 struct bio *tmp;
818
602adf40
YS
819 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
820 if (!tmp)
821 goto err_out;
542582fc 822 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
602adf40
YS
823
824 if (total + old_chain->bi_size > len) {
825 struct bio_pair *bp;
826
827 /*
828 * this split can only happen with a single paged bio,
829 * split_bio will BUG_ON if this is not the case
830 */
831 dout("bio_chain_clone split! total=%d remaining=%d"
bd919d45
AE
832 "bi_size=%u\n",
833 total, len - total, old_chain->bi_size);
602adf40
YS
834
835 /* split the bio. We'll release it either in the next
836 call, or it will have to be released outside */
593a9e7b 837 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
602adf40
YS
838 if (!bp)
839 goto err_out;
840
841 __bio_clone(tmp, &bp->bio1);
842
843 *next = &bp->bio2;
844 } else {
845 __bio_clone(tmp, old_chain);
846 *next = old_chain->bi_next;
847 }
848
849 tmp->bi_bdev = NULL;
602adf40 850 tmp->bi_next = NULL;
542582fc 851 if (new_chain)
602adf40 852 tail->bi_next = tmp;
542582fc
AE
853 else
854 new_chain = tmp;
855 tail = tmp;
602adf40
YS
856 old_chain = old_chain->bi_next;
857
858 total += tmp->bi_size;
859 }
860
aafb230e 861 rbd_assert(total == len);
602adf40 862
602adf40
YS
863 *old = old_chain;
864
865 return new_chain;
866
867err_out:
868 dout("bio_chain_clone with err\n");
869 bio_chain_put(new_chain);
870 return NULL;
871}
872
873/*
874 * helpers for osd request op vectors.
875 */
57cfc106
AE
876static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
877 int opcode, u32 payload_len)
602adf40 878{
57cfc106
AE
879 struct ceph_osd_req_op *ops;
880
881 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
882 if (!ops)
883 return NULL;
884
885 ops[0].op = opcode;
886
602adf40
YS
887 /*
888 * op extent offset and length will be set later on
889 * in calc_raw_layout()
890 */
57cfc106
AE
891 ops[0].payload_len = payload_len;
892
893 return ops;
602adf40
YS
894}
895
896static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
897{
898 kfree(ops);
899}
900
1fec7093
YS
901static void rbd_coll_end_req_index(struct request *rq,
902 struct rbd_req_coll *coll,
903 int index,
904 int ret, u64 len)
905{
906 struct request_queue *q;
907 int min, max, i;
908
bd919d45
AE
909 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
910 coll, index, ret, (unsigned long long) len);
1fec7093
YS
911
912 if (!rq)
913 return;
914
915 if (!coll) {
916 blk_end_request(rq, ret, len);
917 return;
918 }
919
920 q = rq->q;
921
922 spin_lock_irq(q->queue_lock);
923 coll->status[index].done = 1;
924 coll->status[index].rc = ret;
925 coll->status[index].bytes = len;
926 max = min = coll->num_done;
927 while (max < coll->total && coll->status[max].done)
928 max++;
929
930 for (i = min; i<max; i++) {
931 __blk_end_request(rq, coll->status[i].rc,
932 coll->status[i].bytes);
933 coll->num_done++;
934 kref_put(&coll->kref, rbd_coll_release);
935 }
936 spin_unlock_irq(q->queue_lock);
937}
938
939static void rbd_coll_end_req(struct rbd_request *req,
940 int ret, u64 len)
941{
942 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
943}
944
602adf40
YS
945/*
946 * Send ceph osd request
947 */
948static int rbd_do_request(struct request *rq,
0ce1a794 949 struct rbd_device *rbd_dev,
602adf40
YS
950 struct ceph_snap_context *snapc,
951 u64 snapid,
aded07ea 952 const char *object_name, u64 ofs, u64 len,
602adf40
YS
953 struct bio *bio,
954 struct page **pages,
955 int num_pages,
956 int flags,
957 struct ceph_osd_req_op *ops,
1fec7093
YS
958 struct rbd_req_coll *coll,
959 int coll_index,
602adf40 960 void (*rbd_cb)(struct ceph_osd_request *req,
59c2be1e
YS
961 struct ceph_msg *msg),
962 struct ceph_osd_request **linger_req,
963 u64 *ver)
602adf40
YS
964{
965 struct ceph_osd_request *req;
966 struct ceph_file_layout *layout;
967 int ret;
968 u64 bno;
969 struct timespec mtime = CURRENT_TIME;
970 struct rbd_request *req_data;
971 struct ceph_osd_request_head *reqhead;
1dbb4399 972 struct ceph_osd_client *osdc;
602adf40 973
602adf40 974 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
1fec7093
YS
975 if (!req_data) {
976 if (coll)
977 rbd_coll_end_req_index(rq, coll, coll_index,
978 -ENOMEM, len);
979 return -ENOMEM;
980 }
981
982 if (coll) {
983 req_data->coll = coll;
984 req_data->coll_index = coll_index;
985 }
602adf40 986
bd919d45
AE
987 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
988 (unsigned long long) ofs, (unsigned long long) len);
602adf40 989
0ce1a794 990 osdc = &rbd_dev->rbd_client->client->osdc;
1dbb4399
AE
991 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
992 false, GFP_NOIO, pages, bio);
4ad12621 993 if (!req) {
4ad12621 994 ret = -ENOMEM;
602adf40
YS
995 goto done_pages;
996 }
997
998 req->r_callback = rbd_cb;
999
1000 req_data->rq = rq;
1001 req_data->bio = bio;
1002 req_data->pages = pages;
1003 req_data->len = len;
1004
1005 req->r_priv = req_data;
1006
1007 reqhead = req->r_request->front.iov_base;
1008 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1009
aded07ea 1010 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
602adf40
YS
1011 req->r_oid_len = strlen(req->r_oid);
1012
1013 layout = &req->r_file_layout;
1014 memset(layout, 0, sizeof(*layout));
1015 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1016 layout->fl_stripe_count = cpu_to_le32(1);
1017 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
0ce1a794 1018 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
1dbb4399
AE
1019 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1020 req, ops);
602adf40
YS
1021
1022 ceph_osdc_build_request(req, ofs, &len,
1023 ops,
1024 snapc,
1025 &mtime,
1026 req->r_oid, req->r_oid_len);
602adf40 1027
59c2be1e 1028 if (linger_req) {
1dbb4399 1029 ceph_osdc_set_request_linger(osdc, req);
59c2be1e
YS
1030 *linger_req = req;
1031 }
1032
1dbb4399 1033 ret = ceph_osdc_start_request(osdc, req, false);
602adf40
YS
1034 if (ret < 0)
1035 goto done_err;
1036
1037 if (!rbd_cb) {
1dbb4399 1038 ret = ceph_osdc_wait_request(osdc, req);
59c2be1e
YS
1039 if (ver)
1040 *ver = le64_to_cpu(req->r_reassert_version.version);
bd919d45
AE
1041 dout("reassert_ver=%llu\n",
1042 (unsigned long long)
1043 le64_to_cpu(req->r_reassert_version.version));
602adf40
YS
1044 ceph_osdc_put_request(req);
1045 }
1046 return ret;
1047
1048done_err:
1049 bio_chain_put(req_data->bio);
1050 ceph_osdc_put_request(req);
1051done_pages:
1fec7093 1052 rbd_coll_end_req(req_data, ret, len);
602adf40 1053 kfree(req_data);
602adf40
YS
1054 return ret;
1055}
1056
1057/*
1058 * Ceph osd op callback
1059 */
1060static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1061{
1062 struct rbd_request *req_data = req->r_priv;
1063 struct ceph_osd_reply_head *replyhead;
1064 struct ceph_osd_op *op;
1065 __s32 rc;
1066 u64 bytes;
1067 int read_op;
1068
1069 /* parse reply */
1070 replyhead = msg->front.iov_base;
1071 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1072 op = (void *)(replyhead + 1);
1073 rc = le32_to_cpu(replyhead->result);
1074 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1075 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1076
bd919d45
AE
1077 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1078 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1079
1080 if (rc == -ENOENT && read_op) {
1081 zero_bio_chain(req_data->bio, 0);
1082 rc = 0;
1083 } else if (rc == 0 && read_op && bytes < req_data->len) {
1084 zero_bio_chain(req_data->bio, bytes);
1085 bytes = req_data->len;
1086 }
1087
1fec7093 1088 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1089
1090 if (req_data->bio)
1091 bio_chain_put(req_data->bio);
1092
1093 ceph_osdc_put_request(req);
1094 kfree(req_data);
1095}
1096
59c2be1e
YS
1097static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1098{
1099 ceph_osdc_put_request(req);
1100}
1101
602adf40
YS
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector to carry the data, issues the request via
 * rbd_do_request() with no callback (which makes it synchronous), and
 * for reads copies the result into @buf.  @linger_req, if non-NULL,
 * receives the request so the caller can keep it registered (used for
 * watch).  @ver, if non-NULL, receives the object version.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL callback => rbd_do_request waits for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* on success ret is the number of bytes transferred */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1145
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the device-relative extent (@ofs, @len) onto a single backing
 * object (segment), builds a one-op request, and dispatches it with
 * rbd_req_cb as the completion handler.  The caller must have already
 * split the I/O so it does not cross a segment boundary.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* only writes carry outbound payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1198
/*
 * Request async osd write
 *
 * Thin wrapper around rbd_do_op() for writes; always targets the head
 * (CEPH_NOSNAP) and requires on-disk acknowledgement.
 */
static int rbd_req_write(struct request *rq,
			 struct rbd_device *rbd_dev,
			 struct ceph_snap_context *snapc,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
			 CEPH_OSD_OP_WRITE,
			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			 ofs, len, bio, coll, coll_index);
}
1215
/*
 * Request async osd read
 *
 * Thin wrapper around rbd_do_op() for reads from @snapid (which may be
 * CEPH_NOSNAP for the head); no snap context is needed for reads.
 */
static int rbd_req_read(struct request *rq,
			 struct rbd_device *rbd_dev,
			 u64 snapid,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	return rbd_do_op(rq, rbd_dev, NULL,
			 snapid,
			 CEPH_OSD_OP_READ,
			 CEPH_OSD_FLAG_READ,
			 ofs, len, bio, coll, coll_index);
}
1233
/*
 * Request sync osd read
 *
 * Synchronously reads @len bytes at @ofs from @object_name into @buf.
 * Returns bytes read (>= 0) or a negative errno; *ver, if non-NULL,
 * receives the object version.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_FLAG_READ,
			       ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1259
/*
 * Acknowledge a notification received on the watched header object.
 * (Issued from rbd_watch_cb() after refreshing the header.)
 *
 * Note: ver is byte-swapped here but notify_id is stored as-is;
 * presumably notify_id arrives already in wire order — TODO confirm
 * against the osd_client watch plumbing.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	/* fire-and-forget: rbd_simple_req_cb just drops the request */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1289
/*
 * Watch event callback: the header object changed, so refresh our
 * cached copy of the header and acknowledge the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack even if the refresh failed, so the osd stops resending */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1309
/*
 * Request sync osd watch
 *
 * Registers a watch on the header object so rbd_watch_cb() is invoked
 * when it changes.  The watch request is kept lingering in
 * rbd_dev->watch_request; the event handle lives in
 * rbd_dev->watch_event and is cancelled on failure.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1353
79e3057c
YS
/*
 * Request sync osd unwatch
 *
 * Tears down the watch registered by rbd_req_sync_watch(): sends a
 * WATCH op with flag 0 (unregister), then cancels the local event.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	/* event is cancelled even if the op failed */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1383
/* Context passed to rbd_notify_cb() while a sync notify is in flight. */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1387
/*
 * Callback for our own notify completions; only logs — the waiting is
 * done synchronously in rbd_req_sync_notify() via ceph_osdc_wait_event().
 */
static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	if (!rbd_dev)
		return;

	dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
			rbd_dev->header_name, (unsigned long long) notify_id,
			(unsigned int) opcode);
}
1398
/*
 * Request sync osd notify
 *
 * Sends a NOTIFY on the header object (so other watchers refresh) and
 * waits for the notification round to complete.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (!ops)
		return -ENOMEM;

	info.rbd_dev = rbd_dev;

	/* one_shot event; completion is waited for below */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;	/* seconds, presumably — TODO confirm */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       rbd_dev->header_name,
			       0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1448
602adf40
YS
/*
 * Synchronously execute an OSD class method on @object_name.
 * (The original comment said "Request sync osd read"; this is actually
 * a CEPH_OSD_OP_CALL, e.g. rbd.snap_add.)  @data/@len is the method's
 * input payload; *ver, if non-NULL, receives the object version.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
				    class_name_len + method_name_len + len);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1489
1fec7093
YS
1490static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1491{
1492 struct rbd_req_coll *coll =
1493 kzalloc(sizeof(struct rbd_req_coll) +
1494 sizeof(struct rbd_req_status) * num_reqs,
1495 GFP_ATOMIC);
1496
1497 if (!coll)
1498 return NULL;
1499 coll->total = num_reqs;
1500 kref_init(&coll->kref);
1501 return coll;
1502}
1503
602adf40
YS
/*
 * block device queue callback
 *
 * Pulls requests off the queue, splits each into per-segment bios and
 * dispatches them as async OSD reads/writes.  The queue lock is held
 * on entry to each loop iteration (blk_fetch_request requires it) and
 * is dropped while we talk to the OSD layer, then re-taken before the
 * next fetch.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* mapped snapshot may have been deleted underneath us */
		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
				!rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context for the lifetime of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			/* each in-flight segment holds a coll reference */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->mapping.snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop our own reference; callbacks hold the rest */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1623
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 *
 * Returns the number of bytes of @bvec that may be merged into the
 * current bio without crossing an object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;
	sector_t sector;
	unsigned int bio_sectors;
	int max;

	/* object size in sectors; obj_order is log2 of the object size */
	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1650
/* Tear down the gendisk (and cached header) for @rbd_dev, if present. */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1666
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header. Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings. Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* free(NULL) is a no-op on first pass */

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* retry with the sizes the header now claims */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1738
/*
 * reload the ondisk the header
 *
 * Reads the on-disk (v1) header and decodes it into @header;
 * records the object version on success.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	kfree(ondisk);

	return ret;
}
1759
1760/*
1761 * create a snapshot
1762 */
0ce1a794 1763static int rbd_header_add_snap(struct rbd_device *rbd_dev,
602adf40
YS
1764 const char *snap_name,
1765 gfp_t gfp_flags)
1766{
1767 int name_len = strlen(snap_name);
1768 u64 new_snapid;
1769 int ret;
916d4d67 1770 void *data, *p, *e;
1dbb4399 1771 struct ceph_mon_client *monc;
602adf40
YS
1772
1773 /* we should create a snapshot only if we're pointing at the head */
f84344f3 1774 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
602adf40
YS
1775 return -EINVAL;
1776
0ce1a794
AE
1777 monc = &rbd_dev->rbd_client->client->monc;
1778 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
bd919d45 1779 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
602adf40
YS
1780 if (ret < 0)
1781 return ret;
1782
1783 data = kmalloc(name_len + 16, gfp_flags);
1784 if (!data)
1785 return -ENOMEM;
1786
916d4d67
SW
1787 p = data;
1788 e = data + name_len + 16;
602adf40 1789
916d4d67
SW
1790 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1791 ceph_encode_64_safe(&p, e, new_snapid, bad);
602adf40 1792
0bed54dc 1793 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
0ce1a794 1794 "rbd", "snap_add",
d67d4be5 1795 data, p - data, NULL);
602adf40 1796
916d4d67 1797 kfree(data);
602adf40 1798
505cbb9b 1799 return ret < 0 ? ret : 0;
602adf40
YS
1800bad:
1801 return -ERANGE;
1802}
1803
dfc5606d
YS
/* Unregister and free every snapshot device attached to @rbd_dev. */
static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	/* _safe variant: __rbd_remove_snap_dev unlinks each node */
	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		__rbd_remove_snap_dev(snap);
}
1812
602adf40
YS
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the header and swaps the refreshed fields into
 * rbd_dev->header under header_rwsem, resizing the block device if the
 * image grew/shrank and reconciling the snapshot device list.
 * Caller must hold ctl_mutex (see rbd_refresh_header()).
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		if (size != (sector_t) rbd_dev->mapping.size) {
			dout("setting size to %llu sectors",
				(unsigned long long) size);
			rbd_dev->mapping.size = (u64) size;
			set_capacity(rbd_dev->disk, size);
		}
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snap_devs_update(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1862
1fe5e993
AE
/* Locked wrapper: refresh the header while holding ctl_mutex. */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	ret = __rbd_refresh_header(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1873
602adf40
YS
/*
 * Read the image header, populate the snapshot list, select the mapped
 * snapshot, then create and announce the gendisk / request queue.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snap_devs_update(rbd_dev);
	if (rc)
		return rc;

	rc = rbd_header_set_snap(rbd_dev);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	/* finally, announce the disk to the world */
	set_capacity(disk, (sector_t) rbd_dev->mapping.size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long) rbd_dev->mapping.size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1944
dfc5606d
YS
1945/*
1946 sysfs
1947*/
1948
593a9e7b
AE
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1953
dfc5606d
YS
/* sysfs "size": mapped size in bytes, read under header_rwsem. */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1966
/* sysfs "major": the device's block major number. */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
1974
/* sysfs "client_id": the ceph client instance id (e.g. "client1234"). */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
1983
dfc5606d
YS
/* sysfs "pool": name of the rados pool backing the image. */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
1991
9bb2f334
AE
/* sysfs "pool_id": numeric id of the backing rados pool. */
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->pool_id);
}
1999
dfc5606d
YS
/* sysfs "name": the rbd image name. */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->image_name);
}
2007
/* sysfs "current_snap": name of the mapped snapshot (or head). */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
}
2016
/* sysfs "refresh" (write-only): force a header re-read. */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_refresh_header(rbd_dev, NULL);

	return ret < 0 ? ret : size;
}
602adf40 2029
dfc5606d
YS
/* Per-device sysfs attributes and the rbd bus device type. */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Device lifetime is managed elsewhere; nothing to free here. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2071
2072
2073/*
2074 sysfs - snapshots
2075*/
2076
/* sysfs "snap_size": image size (bytes) at the time of the snapshot. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
2085
/* sysfs "snap_id": the snapshot's rados snap id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2094
/* Per-snapshot sysfs attributes and device type. */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Called when the snap device's last reference drops: frees the snap. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2124
/*
 * Unlink a snapshot from the device list and unregister its sysfs
 * device (freeing happens in rbd_snap_dev_release()).
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2130
/* Register a snapshot's sysfs device ("snap_<name>") under @parent. */
static int rbd_register_snap_dev(struct rbd_snap *snap,
				  struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "snap_%s", snap->name);
	ret = device_register(dev);

	return ret;
}
2145
4e891e0a
AE
/*
 * Allocate an rbd_snap for snapshot index @i of the header's snap
 * context, and register its sysfs device if the parent device is
 * already registered.  Returns the snap or a pointer-coded errno.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					      int i, const char *name)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->size = rbd_dev->header.snap_sizes[i];
	snap->id = rbd_dev->header.snapc->snaps[i];
	if (device_is_registered(&rbd_dev->dev)) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			goto err;
	}

	return snap;

err:
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2177
2178/*
35938150
AE
2179 * Scan the rbd device's current snapshot list and compare it to the
2180 * newly-received snapshot context. Remove any existing snapshots
2181 * not present in the new snapshot context. Add a new snapshot for
2182 * any snaphots in the snapshot context not in the current list.
2183 * And verify there are no changes to snapshots we already know
2184 * about.
2185 *
2186 * Assumes the snapshots in the snapshot context are sorted by
2187 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2188 * are also maintained in that order.)
dfc5606d 2189 */
9fcbb800 2190static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
dfc5606d 2191{
35938150
AE
2192 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2193 const u32 snap_count = snapc->num_snaps;
2194 char *snap_name = rbd_dev->header.snap_names;
2195 struct list_head *head = &rbd_dev->snaps;
2196 struct list_head *links = head->next;
2197 u32 index = 0;
dfc5606d 2198
9fcbb800 2199 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2200 while (index < snap_count || links != head) {
2201 u64 snap_id;
2202 struct rbd_snap *snap;
dfc5606d 2203
35938150
AE
2204 snap_id = index < snap_count ? snapc->snaps[index]
2205 : CEPH_NOSNAP;
2206 snap = links != head ? list_entry(links, struct rbd_snap, node)
2207 : NULL;
aafb230e 2208 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2209
35938150
AE
2210 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2211 struct list_head *next = links->next;
dfc5606d 2212
35938150 2213 /* Existing snapshot not in the new snap context */
dfc5606d 2214
f84344f3
AE
2215 if (rbd_dev->mapping.snap_id == snap->id)
2216 rbd_dev->mapping.snap_exists = false;
35938150 2217 __rbd_remove_snap_dev(snap);
9fcbb800 2218 dout("%ssnap id %llu has been removed\n",
f84344f3
AE
2219 rbd_dev->mapping.snap_id == snap->id ?
2220 "mapped " : "",
9fcbb800 2221 (unsigned long long) snap->id);
35938150
AE
2222
2223 /* Done with this list entry; advance */
2224
2225 links = next;
dfc5606d
YS
2226 continue;
2227 }
35938150 2228
9fcbb800
AE
2229 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2230 (unsigned long long) snap_id);
35938150
AE
2231 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2232 struct rbd_snap *new_snap;
2233
2234 /* We haven't seen this snapshot before */
2235
2236 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2237 snap_name);
9fcbb800
AE
2238 if (IS_ERR(new_snap)) {
2239 int err = PTR_ERR(new_snap);
2240
2241 dout(" failed to add dev, error %d\n", err);
2242
2243 return err;
2244 }
35938150
AE
2245
2246 /* New goes before existing, or at end of list */
2247
9fcbb800 2248 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2249 if (snap)
2250 list_add_tail(&new_snap->node, &snap->node);
2251 else
523f3258 2252 list_add_tail(&new_snap->node, head);
35938150
AE
2253 } else {
2254 /* Already have this one */
2255
9fcbb800
AE
2256 dout(" already present\n");
2257
aafb230e
AE
2258 rbd_assert(snap->size ==
2259 rbd_dev->header.snap_sizes[index]);
2260 rbd_assert(!strcmp(snap->name, snap_name));
35938150
AE
2261
2262 /* Done with this list entry; advance */
2263
2264 links = links->next;
dfc5606d 2265 }
35938150
AE
2266
2267 /* Advance to the next entry in the snapshot context */
2268
2269 index++;
2270 snap_name += strlen(snap_name) + 1;
dfc5606d 2271 }
9fcbb800 2272 dout("%s: done\n", __func__);
dfc5606d
YS
2273
2274 return 0;
2275}
2276
/*
 * Register the rbd device on the rbd bus (named by its numeric
 * dev_id) and then register a device for each snapshot currently
 * on its list.  Takes ctl_mutex for the duration.
 *
 * Returns 0 on success.  If a snapshot registration fails, the rbd
 * device itself stays registered; the caller is expected to clean
 * up via rbd_bus_del_dev().
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2304
/*
 * Unregister the rbd device from the bus.  Final teardown of the
 * rbd_dev happens in rbd_dev_release() once sysfs drops the last
 * reference to the embedded struct device.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2309
59c2be1e
YS
2310static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2311{
2312 int ret, rc;
2313
2314 do {
0e6f322d 2315 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2316 if (ret == -ERANGE) {
1fe5e993 2317 rc = rbd_refresh_header(rbd_dev, NULL);
59c2be1e
YS
2318 if (rc < 0)
2319 return rc;
2320 }
2321 } while (ret == -ERANGE);
2322
2323 return ret;
2324}
2325
/* Highest rbd device id handed out so far (see rbd_dev_id_put()) */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic increment makes id allocation itself lock-free */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 2342
1ddbe94e 2343/*
499afd5b
AE
2344 * Remove an rbd_dev from the global list, and record that its
2345 * identifier is no longer in use.
1ddbe94e 2346 */
e2839308 2347static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2348{
d184f6bf 2349 struct list_head *tmp;
de71a297 2350 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2351 int max_id;
2352
aafb230e 2353 rbd_assert(rbd_id > 0);
499afd5b 2354
e2839308
AE
2355 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2356 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
2357 spin_lock(&rbd_dev_list_lock);
2358 list_del_init(&rbd_dev->node);
d184f6bf
AE
2359
2360 /*
2361 * If the id being "put" is not the current maximum, there
2362 * is nothing special we need to do.
2363 */
e2839308 2364 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
2365 spin_unlock(&rbd_dev_list_lock);
2366 return;
2367 }
2368
2369 /*
2370 * We need to update the current maximum id. Search the
2371 * list to find out what it is. We're more likely to find
2372 * the maximum at the end, so search the list backward.
2373 */
2374 max_id = 0;
2375 list_for_each_prev(tmp, &rbd_dev_list) {
2376 struct rbd_device *rbd_dev;
2377
2378 rbd_dev = list_entry(tmp, struct rbd_device, node);
2379 if (rbd_id > max_id)
2380 max_id = rbd_id;
2381 }
499afd5b 2382 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2383
1ddbe94e 2384 /*
e2839308 2385 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
2386 * which case it now accurately reflects the new maximum.
2387 * Be careful not to overwrite the maximum value in that
2388 * case.
1ddbe94e 2389 */
e2839308
AE
2390 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2391 dout(" max dev id has been reset\n");
b7f23c36
AE
2392}
2393
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	size_t leading = strspn(*buf, spaces);

	*buf += leading;		/* Skip to start of token */

	return strcspn(*buf, spaces);	/* Length of token */
}
2412
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2442
ea3352f4
AE
2443/*
2444 * Finds the next token in *buf, dynamically allocates a buffer big
2445 * enough to hold a copy of it, and copies the token into the new
2446 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2447 * that a duplicate buffer is created even for a zero-length token.
2448 *
2449 * Returns a pointer to the newly-allocated duplicate, or a null
2450 * pointer if memory for the duplicate was not available. If
2451 * the lenp argument is a non-null pointer, the length of the token
2452 * (not including the '\0') is returned in *lenp.
2453 *
2454 * If successful, the *buf pointer will be updated to point beyond
2455 * the end of the found token.
2456 *
2457 * Note: uses GFP_KERNEL for allocation.
2458 */
2459static inline char *dup_token(const char **buf, size_t *lenp)
2460{
2461 char *dup;
2462 size_t len;
2463
2464 len = next_token(buf);
2465 dup = kmalloc(len + 1, GFP_KERNEL);
2466 if (!dup)
2467 return NULL;
2468
2469 memcpy(dup, *buf, len);
2470 *(dup + len) = '\0';
2471 *buf += len;
2472
2473 if (lenp)
2474 *lenp = len;
2475
2476 return dup;
2477}
2478
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 *
 * Returns 0 on success; -EINVAL for a missing/oversized required
 * token, -ENOMEM on allocation failure.  On failure all fields it
 * allocated are freed and reset to NULL.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	/* Monitor addresses are returned by reference into buf, not copied */
	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->mapping.snap_name = dup_token(&buf, &len);
	if (!rbd_dev->mapping.snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->mapping.snap_name);
		rbd_dev->mapping.snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->mapping.snap_name)
			goto out_err;

		memcpy(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* Free everything allocated so far; NULL the fields for the caller */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2561
/*
 * sysfs "add" handler (/sys/bus/rbd/add): parse the user-supplied
 * mapping specification, connect to the cluster, and set up the
 * block device and its sysfs representation.
 *
 * buf format: "<mon_addrs> <options> <pool> <image> [<snap>]"
 * Returns count on success, or a negative errno.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the mapping */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* count bounds the options token, so this is always big enough */
	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_nomem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_nomem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* parse add command */
	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
				options, count);
	if (rc)
		goto err_put_id;

	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_put_id;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	/* register our block device; major 0 requests a dynamic major */
	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->major = rc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 *
	 * Set up and announce blkdev mapping.
	 */
	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	rbd_put_client(rbd_dev);
err_put_id:
	/* pool_name is only set once parsing succeeded at least that far */
	if (rbd_dev->pool_name) {
		kfree(rbd_dev->mapping.snap_name);
		kfree(rbd_dev->header_name);
		kfree(rbd_dev->image_name);
		kfree(rbd_dev->pool_name);
	}
	rbd_dev_id_put(rbd_dev);
err_nomem:
	/* kfree(NULL) is a no-op, so these are safe on early failures */
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2668
de71a297 2669static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
2670{
2671 struct list_head *tmp;
2672 struct rbd_device *rbd_dev;
2673
e124a82f 2674 spin_lock(&rbd_dev_list_lock);
602adf40
YS
2675 list_for_each(tmp, &rbd_dev_list) {
2676 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 2677 if (rbd_dev->dev_id == dev_id) {
e124a82f 2678 spin_unlock(&rbd_dev_list_lock);
602adf40 2679 return rbd_dev;
e124a82f 2680 }
602adf40 2681 }
e124a82f 2682 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
2683 return NULL;
2684}
2685
/*
 * Release callback for an rbd device, invoked when sysfs drops the
 * last reference after rbd_bus_del_dev().  Tears down the watch,
 * releases the client, frees the disk and all allocated names,
 * returns the device id, and drops the module reference taken in
 * rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2716
/*
 * sysfs "remove" handler (/sys/bus/rbd/remove): tear down the rbd
 * device whose numeric id is given in buf.  Snapshot devices are
 * removed first; final cleanup of the rbd_dev itself happens later
 * in rbd_dev_release() when its last reference is dropped.
 *
 * Returns count on success, -ENOENT if no such device, or the
 * errno from parsing the id.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
2751
/*
 * sysfs handler to create a new snapshot of the mapped image.  The
 * snapshot name is taken from buf, the header is refreshed, and the
 * cluster is notified so other clients pick up the change.
 *
 * Returns count on success or a negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * snprintf() with size count copies at most count - 1 bytes,
	 * dropping the last byte of buf -- presumably the trailing
	 * newline from the sysfs write.  NOTE(review): a name written
	 * without a newline would be silently truncated; confirm.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2792
602adf40
YS
2793/*
2794 * create control files in sysfs
dfc5606d 2795 * /sys/bus/rbd/...
602adf40
YS
2796 */
2797static int rbd_sysfs_init(void)
2798{
dfc5606d 2799 int ret;
602adf40 2800
fed4c143 2801 ret = device_register(&rbd_root_dev);
21079786 2802 if (ret < 0)
dfc5606d 2803 return ret;
602adf40 2804
fed4c143
AE
2805 ret = bus_register(&rbd_bus_type);
2806 if (ret < 0)
2807 device_unregister(&rbd_root_dev);
602adf40 2808
602adf40
YS
2809 return ret;
2810}
2811
/* Tear down sysfs entries in the reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2817
2818int __init rbd_init(void)
2819{
2820 int rc;
2821
2822 rc = rbd_sysfs_init();
2823 if (rc)
2824 return rc;
f0f8cef5 2825 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
2826 return 0;
2827}
2828
/* Module exit point: remove the sysfs bus and root device */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2833
2834module_init(rbd_init);
2835module_exit(rbd_exit);
2836
2837MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2838MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2839MODULE_DESCRIPTION("rados block device");
2840
2841/* following authorship retained from original osdblk.c */
2842MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2843
2844MODULE_LICENSE("GPL");