]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - drivers/block/rbd.c
Linux 3.6
[mirror_ubuntu-bionic-kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
593a9e7b
AE
44/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
50#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52
f0f8cef5
AE
53#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
55
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57
602adf40
YS
58#define RBD_MAX_SNAP_NAME_LEN 32
59#define RBD_MAX_OPT_LEN 1024
60
61#define RBD_SNAP_HEAD_NAME "-"
62
81a89793
AE
63/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier.
66 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
67 * enough to hold all possible device names.
68 */
602adf40 69#define DEV_NAME_LEN 32
81a89793 70#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 71
59c2be1e
YS
72#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
602adf40
YS
74/*
75 * block device image metadata (in-memory version)
76 */
77struct rbd_image_header {
78 u64 image_size;
849b4260 79 char *object_prefix;
602adf40
YS
80 __u8 obj_order;
81 __u8 crypt_type;
82 __u8 comp_type;
602adf40
YS
83 struct ceph_snap_context *snapc;
84 size_t snap_names_len;
602adf40
YS
85 u32 total_snaps;
86
87 char *snap_names;
88 u64 *snap_sizes;
59c2be1e
YS
89
90 u64 obj_version;
91};
92
/*
 * rbd-specific options parsed by parse_rbd_opts_token().
 */
struct rbd_options {
	int notify_timeout;	/* defaults to RBD_NOTIFY_TIMEOUT_DEFAULT */
};
96
97/*
f0f8cef5 98 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
99 */
100struct rbd_client {
101 struct ceph_client *client;
59c2be1e 102 struct rbd_options *rbd_opts;
602adf40
YS
103 struct kref kref;
104 struct list_head node;
105};
106
107/*
f0f8cef5 108 * a request completion status
602adf40 109 */
1fec7093
YS
110struct rbd_req_status {
111 int done;
112 int rc;
113 u64 bytes;
114};
115
116/*
117 * a collection of requests
118 */
119struct rbd_req_coll {
120 int total;
121 int num_done;
122 struct kref kref;
123 struct rbd_req_status status[0];
602adf40
YS
124};
125
f0f8cef5
AE
126/*
127 * a single io request
128 */
129struct rbd_request {
130 struct request *rq; /* blk layer request */
131 struct bio *bio; /* cloned bio */
132 struct page **pages; /* list of used pages */
133 u64 len;
134 int coll_index;
135 struct rbd_req_coll *coll;
136};
137
dfc5606d
YS
138struct rbd_snap {
139 struct device dev;
140 const char *name;
3591538f 141 u64 size;
dfc5606d
YS
142 struct list_head node;
143 u64 id;
144};
145
602adf40
YS
146/*
147 * a single device
148 */
149struct rbd_device {
de71a297 150 int dev_id; /* blkdev unique id */
602adf40
YS
151
152 int major; /* blkdev assigned major */
153 struct gendisk *disk; /* blkdev's gendisk and rq */
154 struct request_queue *q;
155
602adf40
YS
156 struct rbd_client *rbd_client;
157
158 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
159
160 spinlock_t lock; /* queue lock */
161
162 struct rbd_image_header header;
0bed54dc
AE
163 char *image_name;
164 size_t image_name_len;
165 char *header_name;
d22f76e7 166 char *pool_name;
9bb2f334 167 int pool_id;
602adf40 168
59c2be1e
YS
169 struct ceph_osd_event *watch_event;
170 struct ceph_osd_request *watch_request;
171
c666601a
JD
172 /* protects updating the header */
173 struct rw_semaphore header_rwsem;
e88a36ec 174 /* name of the snapshot this device reads from */
820a5f3e 175 char *snap_name;
e88a36ec 176 /* id of the snapshot this device reads from */
77dfe99f 177 u64 snap_id; /* current snapshot id */
e88a36ec
JD
178 /* whether the snap_id this device reads from still exists */
179 bool snap_exists;
180 int read_only;
602adf40
YS
181
182 struct list_head node;
dfc5606d
YS
183
184 /* list of snapshots */
185 struct list_head snaps;
186
187 /* sysfs related */
188 struct device dev;
189};
190
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* All rbd devices */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All rbd clients; rbd_client_list_lock guards the list */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 198
dfc5606d
YS
199static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
200static void rbd_dev_release(struct device *dev);
dfc5606d
YS
201static ssize_t rbd_snap_add(struct device *dev,
202 struct device_attribute *attr,
203 const char *buf,
204 size_t count);
14e7085d 205static void __rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 206
f0f8cef5
AE
207static ssize_t rbd_add(struct bus_type *bus, const char *buf,
208 size_t count);
209static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
210 size_t count);
211
212static struct bus_attribute rbd_bus_attrs[] = {
213 __ATTR(add, S_IWUSR, NULL, rbd_add),
214 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
215 __ATTR_NULL
216};
217
218static struct bus_type rbd_bus_type = {
219 .name = "rbd",
220 .bus_attrs = rbd_bus_attrs,
221};
222
/*
 * Release callback for the statically defined root device; there is
 * nothing to free, so it is intentionally empty.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
226
227static struct device rbd_root_dev = {
228 .init_name = "rbd",
229 .release = rbd_root_dev_release,
230};
231
dfc5606d 232
dfc5606d
YS
233static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
234{
235 return get_device(&rbd_dev->dev);
236}
237
238static void rbd_put_dev(struct rbd_device *rbd_dev)
239{
240 put_device(&rbd_dev->dev);
241}
602adf40 242
1fe5e993 243static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 244
602adf40
YS
245static int rbd_open(struct block_device *bdev, fmode_t mode)
246{
f0f8cef5 247 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 248
602adf40
YS
249 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
250 return -EROFS;
251
340c7a2b
AE
252 rbd_get_dev(rbd_dev);
253 set_device_ro(bdev, rbd_dev->read_only);
254
602adf40
YS
255 return 0;
256}
257
dfc5606d
YS
258static int rbd_release(struct gendisk *disk, fmode_t mode)
259{
260 struct rbd_device *rbd_dev = disk->private_data;
261
262 rbd_put_dev(rbd_dev);
263
264 return 0;
265}
266
602adf40
YS
267static const struct block_device_operations rbd_bd_ops = {
268 .owner = THIS_MODULE,
269 .open = rbd_open,
dfc5606d 270 .release = rbd_release,
602adf40
YS
271};
272
273/*
274 * Initialize an rbd client instance.
43ae4701 275 * We own *ceph_opts.
602adf40 276 */
43ae4701 277static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
59c2be1e 278 struct rbd_options *rbd_opts)
602adf40
YS
279{
280 struct rbd_client *rbdc;
281 int ret = -ENOMEM;
282
283 dout("rbd_client_create\n");
284 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
285 if (!rbdc)
286 goto out_opt;
287
288 kref_init(&rbdc->kref);
289 INIT_LIST_HEAD(&rbdc->node);
290
bc534d86
AE
291 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
292
43ae4701 293 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 294 if (IS_ERR(rbdc->client))
bc534d86 295 goto out_mutex;
43ae4701 296 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
297
298 ret = ceph_open_session(rbdc->client);
299 if (ret < 0)
300 goto out_err;
301
59c2be1e
YS
302 rbdc->rbd_opts = rbd_opts;
303
432b8587 304 spin_lock(&rbd_client_list_lock);
602adf40 305 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 306 spin_unlock(&rbd_client_list_lock);
602adf40 307
bc534d86
AE
308 mutex_unlock(&ctl_mutex);
309
602adf40
YS
310 dout("rbd_client_create created %p\n", rbdc);
311 return rbdc;
312
313out_err:
314 ceph_destroy_client(rbdc->client);
bc534d86
AE
315out_mutex:
316 mutex_unlock(&ctl_mutex);
602adf40
YS
317 kfree(rbdc);
318out_opt:
43ae4701
AE
319 if (ceph_opts)
320 ceph_destroy_options(ceph_opts);
28f259b7 321 return ERR_PTR(ret);
602adf40
YS
322}
323
324/*
325 * Find a ceph client with specific addr and configuration.
326 */
43ae4701 327static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
328{
329 struct rbd_client *client_node;
330
43ae4701 331 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
332 return NULL;
333
334 list_for_each_entry(client_node, &rbd_client_list, node)
43ae4701 335 if (!ceph_compare_options(ceph_opts, client_node->client))
602adf40
YS
336 return client_node;
337 return NULL;
338}
339
59c2be1e
YS
/*
 * mount options
 *
 * Tokens below Opt_last_int take an integer argument; tokens between
 * Opt_last_int and Opt_last_string take a string argument.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};
350
43ae4701 351static match_table_t rbd_opts_tokens = {
59c2be1e
YS
352 {Opt_notify_timeout, "notify_timeout=%d"},
353 /* int args above */
354 /* string args above */
355 {-1, NULL}
356};
357
358static int parse_rbd_opts_token(char *c, void *private)
359{
43ae4701 360 struct rbd_options *rbd_opts = private;
59c2be1e
YS
361 substring_t argstr[MAX_OPT_ARGS];
362 int token, intval, ret;
363
43ae4701 364 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
365 if (token < 0)
366 return -EINVAL;
367
368 if (token < Opt_last_int) {
369 ret = match_int(&argstr[0], &intval);
370 if (ret < 0) {
371 pr_err("bad mount option arg (not int) "
372 "at '%s'\n", c);
373 return ret;
374 }
375 dout("got int token %d val %d\n", token, intval);
376 } else if (token > Opt_last_int && token < Opt_last_string) {
377 dout("got string token %d val %s\n", token,
378 argstr[0].from);
379 } else {
380 dout("got token %d\n", token);
381 }
382
383 switch (token) {
384 case Opt_notify_timeout:
43ae4701 385 rbd_opts->notify_timeout = intval;
59c2be1e
YS
386 break;
387 default:
388 BUG_ON(token);
389 }
390 return 0;
391}
392
602adf40
YS
393/*
394 * Get a ceph client with specific addr and configuration, if one does
395 * not exist create it.
396 */
5214ecc4
AE
397static struct rbd_client *rbd_get_client(const char *mon_addr,
398 size_t mon_addr_len,
399 char *options)
602adf40
YS
400{
401 struct rbd_client *rbdc;
43ae4701 402 struct ceph_options *ceph_opts;
59c2be1e
YS
403 struct rbd_options *rbd_opts;
404
405 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
406 if (!rbd_opts)
d720bcb0 407 return ERR_PTR(-ENOMEM);
59c2be1e
YS
408
409 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
602adf40 410
43ae4701
AE
411 ceph_opts = ceph_parse_options(options, mon_addr,
412 mon_addr + mon_addr_len,
413 parse_rbd_opts_token, rbd_opts);
414 if (IS_ERR(ceph_opts)) {
d720bcb0 415 kfree(rbd_opts);
43ae4701 416 return ERR_CAST(ceph_opts);
ee57741c 417 }
602adf40 418
432b8587 419 spin_lock(&rbd_client_list_lock);
43ae4701 420 rbdc = __rbd_client_find(ceph_opts);
602adf40 421 if (rbdc) {
602adf40
YS
422 /* using an existing client */
423 kref_get(&rbdc->kref);
432b8587 424 spin_unlock(&rbd_client_list_lock);
e6994d3d 425
43ae4701 426 ceph_destroy_options(ceph_opts);
e6994d3d
AE
427 kfree(rbd_opts);
428
d720bcb0 429 return rbdc;
602adf40 430 }
432b8587 431 spin_unlock(&rbd_client_list_lock);
602adf40 432
43ae4701 433 rbdc = rbd_client_create(ceph_opts, rbd_opts);
d97081b0 434
d720bcb0
AE
435 if (IS_ERR(rbdc))
436 kfree(rbd_opts);
602adf40 437
d720bcb0 438 return rbdc;
602adf40
YS
439}
440
441/*
442 * Destroy ceph client
d23a4b3f 443 *
432b8587 444 * Caller must hold rbd_client_list_lock.
602adf40
YS
445 */
446static void rbd_client_release(struct kref *kref)
447{
448 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
449
450 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 451 spin_lock(&rbd_client_list_lock);
602adf40 452 list_del(&rbdc->node);
cd9d9f5d 453 spin_unlock(&rbd_client_list_lock);
602adf40
YS
454
455 ceph_destroy_client(rbdc->client);
59c2be1e 456 kfree(rbdc->rbd_opts);
602adf40
YS
457 kfree(rbdc);
458}
459
460/*
461 * Drop reference to ceph client node. If it's not referenced anymore, release
462 * it.
463 */
464static void rbd_put_client(struct rbd_device *rbd_dev)
465{
466 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
467 rbd_dev->rbd_client = NULL;
602adf40
YS
468}
469
1fec7093
YS
470/*
471 * Destroy requests collection
472 */
473static void rbd_coll_release(struct kref *kref)
474{
475 struct rbd_req_coll *coll =
476 container_of(kref, struct rbd_req_coll, kref);
477
478 dout("rbd_coll_release %p\n", coll);
479 kfree(coll);
480}
602adf40 481
8e94af8e
AE
482static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
483{
484 return !memcmp(&ondisk->text,
485 RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
486}
487
602adf40
YS
488/*
489 * Create a new header structure, translate header format from the on-disk
490 * header.
491 */
492static int rbd_header_from_disk(struct rbd_image_header *header,
493 struct rbd_image_header_ondisk *ondisk,
ed63f4fd 494 u32 allocated_snaps)
602adf40 495{
ccece235 496 u32 snap_count;
602adf40 497
8e94af8e 498 if (!rbd_dev_ondisk_valid(ondisk))
81e759fb 499 return -ENXIO;
81e759fb 500
00f1f36f 501 snap_count = le32_to_cpu(ondisk->snap_count);
ccece235
AE
502 if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context))
503 / sizeof (u64))
50f7c4c9 504 return -EINVAL;
602adf40 505 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
f9f9a190 506 snap_count * sizeof(u64),
ed63f4fd 507 GFP_KERNEL);
602adf40
YS
508 if (!header->snapc)
509 return -ENOMEM;
00f1f36f 510
602adf40 511 if (snap_count) {
ccece235 512 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
602adf40 513 header->snap_names = kmalloc(header->snap_names_len,
ed63f4fd 514 GFP_KERNEL);
602adf40
YS
515 if (!header->snap_names)
516 goto err_snapc;
517 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
ed63f4fd 518 GFP_KERNEL);
602adf40
YS
519 if (!header->snap_sizes)
520 goto err_names;
521 } else {
ccece235
AE
522 WARN_ON(ondisk->snap_names_len);
523 header->snap_names_len = 0;
602adf40
YS
524 header->snap_names = NULL;
525 header->snap_sizes = NULL;
526 }
849b4260
AE
527
528 header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
ed63f4fd 529 GFP_KERNEL);
849b4260
AE
530 if (!header->object_prefix)
531 goto err_sizes;
532
ca1e49a6 533 memcpy(header->object_prefix, ondisk->block_name,
602adf40 534 sizeof(ondisk->block_name));
849b4260 535 header->object_prefix[sizeof (ondisk->block_name)] = '\0';
602adf40
YS
536
537 header->image_size = le64_to_cpu(ondisk->image_size);
538 header->obj_order = ondisk->options.order;
539 header->crypt_type = ondisk->options.crypt_type;
540 header->comp_type = ondisk->options.comp_type;
541
542 atomic_set(&header->snapc->nref, 1);
505cbb9b 543 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40
YS
544 header->snapc->num_snaps = snap_count;
545 header->total_snaps = snap_count;
546
21079786 547 if (snap_count && allocated_snaps == snap_count) {
ccece235
AE
548 int i;
549
602adf40
YS
550 for (i = 0; i < snap_count; i++) {
551 header->snapc->snaps[i] =
552 le64_to_cpu(ondisk->snaps[i].id);
553 header->snap_sizes[i] =
554 le64_to_cpu(ondisk->snaps[i].image_size);
555 }
556
557 /* copy snapshot names */
ccece235 558 memcpy(header->snap_names, &ondisk->snaps[snap_count],
602adf40
YS
559 header->snap_names_len);
560 }
561
562 return 0;
563
849b4260
AE
564err_sizes:
565 kfree(header->snap_sizes);
ccece235 566 header->snap_sizes = NULL;
602adf40
YS
567err_names:
568 kfree(header->snap_names);
ccece235 569 header->snap_names = NULL;
602adf40
YS
570err_snapc:
571 kfree(header->snapc);
ccece235
AE
572 header->snapc = NULL;
573
00f1f36f 574 return -ENOMEM;
602adf40
YS
575}
576
602adf40
YS
577static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
578 u64 *seq, u64 *size)
579{
580 int i;
581 char *p = header->snap_names;
582
00f1f36f
AE
583 for (i = 0; i < header->total_snaps; i++) {
584 if (!strcmp(snap_name, p)) {
602adf40 585
00f1f36f 586 /* Found it. Pass back its id and/or size */
602adf40 587
00f1f36f
AE
588 if (seq)
589 *seq = header->snapc->snaps[i];
590 if (size)
591 *size = header->snap_sizes[i];
592 return i;
593 }
594 p += strlen(p) + 1; /* Skip ahead to the next name */
595 }
596 return -ENOENT;
602adf40
YS
597}
598
0ce1a794 599static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
602adf40 600{
78dc447d 601 int ret;
602adf40 602
0ce1a794 603 down_write(&rbd_dev->header_rwsem);
602adf40 604
0ce1a794 605 if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 606 sizeof (RBD_SNAP_HEAD_NAME))) {
0ce1a794 607 rbd_dev->snap_id = CEPH_NOSNAP;
e88a36ec 608 rbd_dev->snap_exists = false;
0ce1a794 609 rbd_dev->read_only = 0;
602adf40 610 if (size)
78dc447d 611 *size = rbd_dev->header.image_size;
602adf40 612 } else {
78dc447d
AE
613 u64 snap_id = 0;
614
615 ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
616 &snap_id, size);
602adf40
YS
617 if (ret < 0)
618 goto done;
78dc447d 619 rbd_dev->snap_id = snap_id;
e88a36ec 620 rbd_dev->snap_exists = true;
0ce1a794 621 rbd_dev->read_only = 1;
602adf40
YS
622 }
623
624 ret = 0;
625done:
0ce1a794 626 up_write(&rbd_dev->header_rwsem);
602adf40
YS
627 return ret;
628}
629
630static void rbd_header_free(struct rbd_image_header *header)
631{
849b4260 632 kfree(header->object_prefix);
602adf40 633 kfree(header->snap_sizes);
849b4260 634 kfree(header->snap_names);
d1d25646 635 ceph_put_snap_context(header->snapc);
602adf40
YS
636}
637
638/*
639 * get the actual striped segment name, offset and length
640 */
641static u64 rbd_get_segment(struct rbd_image_header *header,
ca1e49a6 642 const char *object_prefix,
602adf40
YS
643 u64 ofs, u64 len,
644 char *seg_name, u64 *segofs)
645{
646 u64 seg = ofs >> header->obj_order;
647
648 if (seg_name)
649 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
ca1e49a6 650 "%s.%012llx", object_prefix, seg);
602adf40
YS
651
652 ofs = ofs & ((1 << header->obj_order) - 1);
653 len = min_t(u64, len, (1 << header->obj_order) - ofs);
654
655 if (segofs)
656 *segofs = ofs;
657
658 return len;
659}
660
1fec7093
YS
661static int rbd_get_num_segments(struct rbd_image_header *header,
662 u64 ofs, u64 len)
663{
664 u64 start_seg = ofs >> header->obj_order;
665 u64 end_seg = (ofs + len - 1) >> header->obj_order;
666 return end_seg - start_seg + 1;
667}
668
029bcbd8
JD
669/*
670 * returns the size of an object in the image
671 */
672static u64 rbd_obj_bytes(struct rbd_image_header *header)
673{
674 return 1 << header->obj_order;
675}
676
602adf40
YS
677/*
678 * bio helpers
679 */
680
681static void bio_chain_put(struct bio *chain)
682{
683 struct bio *tmp;
684
685 while (chain) {
686 tmp = chain;
687 chain = chain->bi_next;
688 bio_put(tmp);
689 }
690}
691
692/*
693 * zeros a bio chain, starting at specific offset
694 */
695static void zero_bio_chain(struct bio *chain, int start_ofs)
696{
697 struct bio_vec *bv;
698 unsigned long flags;
699 void *buf;
700 int i;
701 int pos = 0;
702
703 while (chain) {
704 bio_for_each_segment(bv, chain, i) {
705 if (pos + bv->bv_len > start_ofs) {
706 int remainder = max(start_ofs - pos, 0);
707 buf = bvec_kmap_irq(bv, &flags);
708 memset(buf + remainder, 0,
709 bv->bv_len - remainder);
85b5aaa6 710 bvec_kunmap_irq(buf, &flags);
602adf40
YS
711 }
712 pos += bv->bv_len;
713 }
714
715 chain = chain->bi_next;
716 }
717}
718
719/*
720 * bio_chain_clone - clone a chain of bios up to a certain length.
721 * might return a bio_pair that will need to be released.
722 */
723static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
724 struct bio_pair **bp,
725 int len, gfp_t gfpmask)
726{
727 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
728 int total = 0;
729
730 if (*bp) {
731 bio_pair_release(*bp);
732 *bp = NULL;
733 }
734
735 while (old_chain && (total < len)) {
736 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
737 if (!tmp)
738 goto err_out;
739
740 if (total + old_chain->bi_size > len) {
741 struct bio_pair *bp;
742
743 /*
744 * this split can only happen with a single paged bio,
745 * split_bio will BUG_ON if this is not the case
746 */
747 dout("bio_chain_clone split! total=%d remaining=%d"
bd919d45
AE
748 "bi_size=%u\n",
749 total, len - total, old_chain->bi_size);
602adf40
YS
750
751 /* split the bio. We'll release it either in the next
752 call, or it will have to be released outside */
593a9e7b 753 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
602adf40
YS
754 if (!bp)
755 goto err_out;
756
757 __bio_clone(tmp, &bp->bio1);
758
759 *next = &bp->bio2;
760 } else {
761 __bio_clone(tmp, old_chain);
762 *next = old_chain->bi_next;
763 }
764
765 tmp->bi_bdev = NULL;
766 gfpmask &= ~__GFP_WAIT;
767 tmp->bi_next = NULL;
768
769 if (!new_chain) {
770 new_chain = tail = tmp;
771 } else {
772 tail->bi_next = tmp;
773 tail = tmp;
774 }
775 old_chain = old_chain->bi_next;
776
777 total += tmp->bi_size;
778 }
779
780 BUG_ON(total < len);
781
782 if (tail)
783 tail->bi_next = NULL;
784
785 *old = old_chain;
786
787 return new_chain;
788
789err_out:
790 dout("bio_chain_clone with err\n");
791 bio_chain_put(new_chain);
792 return NULL;
793}
794
795/*
796 * helpers for osd request op vectors.
797 */
57cfc106
AE
798static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
799 int opcode, u32 payload_len)
602adf40 800{
57cfc106
AE
801 struct ceph_osd_req_op *ops;
802
803 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
804 if (!ops)
805 return NULL;
806
807 ops[0].op = opcode;
808
602adf40
YS
809 /*
810 * op extent offset and length will be set later on
811 * in calc_raw_layout()
812 */
57cfc106
AE
813 ops[0].payload_len = payload_len;
814
815 return ops;
602adf40
YS
816}
817
/*
 * Free an op vector obtained from rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
822
1fec7093
YS
823static void rbd_coll_end_req_index(struct request *rq,
824 struct rbd_req_coll *coll,
825 int index,
826 int ret, u64 len)
827{
828 struct request_queue *q;
829 int min, max, i;
830
bd919d45
AE
831 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
832 coll, index, ret, (unsigned long long) len);
1fec7093
YS
833
834 if (!rq)
835 return;
836
837 if (!coll) {
838 blk_end_request(rq, ret, len);
839 return;
840 }
841
842 q = rq->q;
843
844 spin_lock_irq(q->queue_lock);
845 coll->status[index].done = 1;
846 coll->status[index].rc = ret;
847 coll->status[index].bytes = len;
848 max = min = coll->num_done;
849 while (max < coll->total && coll->status[max].done)
850 max++;
851
852 for (i = min; i<max; i++) {
853 __blk_end_request(rq, coll->status[i].rc,
854 coll->status[i].bytes);
855 coll->num_done++;
856 kref_put(&coll->kref, rbd_coll_release);
857 }
858 spin_unlock_irq(q->queue_lock);
859}
860
861static void rbd_coll_end_req(struct rbd_request *req,
862 int ret, u64 len)
863{
864 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
865}
866
602adf40
YS
867/*
868 * Send ceph osd request
869 */
870static int rbd_do_request(struct request *rq,
0ce1a794 871 struct rbd_device *rbd_dev,
602adf40
YS
872 struct ceph_snap_context *snapc,
873 u64 snapid,
aded07ea 874 const char *object_name, u64 ofs, u64 len,
602adf40
YS
875 struct bio *bio,
876 struct page **pages,
877 int num_pages,
878 int flags,
879 struct ceph_osd_req_op *ops,
1fec7093
YS
880 struct rbd_req_coll *coll,
881 int coll_index,
602adf40 882 void (*rbd_cb)(struct ceph_osd_request *req,
59c2be1e
YS
883 struct ceph_msg *msg),
884 struct ceph_osd_request **linger_req,
885 u64 *ver)
602adf40
YS
886{
887 struct ceph_osd_request *req;
888 struct ceph_file_layout *layout;
889 int ret;
890 u64 bno;
891 struct timespec mtime = CURRENT_TIME;
892 struct rbd_request *req_data;
893 struct ceph_osd_request_head *reqhead;
1dbb4399 894 struct ceph_osd_client *osdc;
602adf40 895
602adf40 896 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
1fec7093
YS
897 if (!req_data) {
898 if (coll)
899 rbd_coll_end_req_index(rq, coll, coll_index,
900 -ENOMEM, len);
901 return -ENOMEM;
902 }
903
904 if (coll) {
905 req_data->coll = coll;
906 req_data->coll_index = coll_index;
907 }
602adf40 908
bd919d45
AE
909 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
910 (unsigned long long) ofs, (unsigned long long) len);
602adf40 911
0ce1a794 912 osdc = &rbd_dev->rbd_client->client->osdc;
1dbb4399
AE
913 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
914 false, GFP_NOIO, pages, bio);
4ad12621 915 if (!req) {
4ad12621 916 ret = -ENOMEM;
602adf40
YS
917 goto done_pages;
918 }
919
920 req->r_callback = rbd_cb;
921
922 req_data->rq = rq;
923 req_data->bio = bio;
924 req_data->pages = pages;
925 req_data->len = len;
926
927 req->r_priv = req_data;
928
929 reqhead = req->r_request->front.iov_base;
930 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
931
aded07ea 932 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
602adf40
YS
933 req->r_oid_len = strlen(req->r_oid);
934
935 layout = &req->r_file_layout;
936 memset(layout, 0, sizeof(*layout));
937 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
938 layout->fl_stripe_count = cpu_to_le32(1);
939 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
0ce1a794 940 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
1dbb4399
AE
941 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
942 req, ops);
602adf40
YS
943
944 ceph_osdc_build_request(req, ofs, &len,
945 ops,
946 snapc,
947 &mtime,
948 req->r_oid, req->r_oid_len);
602adf40 949
59c2be1e 950 if (linger_req) {
1dbb4399 951 ceph_osdc_set_request_linger(osdc, req);
59c2be1e
YS
952 *linger_req = req;
953 }
954
1dbb4399 955 ret = ceph_osdc_start_request(osdc, req, false);
602adf40
YS
956 if (ret < 0)
957 goto done_err;
958
959 if (!rbd_cb) {
1dbb4399 960 ret = ceph_osdc_wait_request(osdc, req);
59c2be1e
YS
961 if (ver)
962 *ver = le64_to_cpu(req->r_reassert_version.version);
bd919d45
AE
963 dout("reassert_ver=%llu\n",
964 (unsigned long long)
965 le64_to_cpu(req->r_reassert_version.version));
602adf40
YS
966 ceph_osdc_put_request(req);
967 }
968 return ret;
969
970done_err:
971 bio_chain_put(req_data->bio);
972 ceph_osdc_put_request(req);
973done_pages:
1fec7093 974 rbd_coll_end_req(req_data, ret, len);
602adf40 975 kfree(req_data);
602adf40
YS
976 return ret;
977}
978
979/*
980 * Ceph osd op callback
981 */
982static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
983{
984 struct rbd_request *req_data = req->r_priv;
985 struct ceph_osd_reply_head *replyhead;
986 struct ceph_osd_op *op;
987 __s32 rc;
988 u64 bytes;
989 int read_op;
990
991 /* parse reply */
992 replyhead = msg->front.iov_base;
993 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
994 op = (void *)(replyhead + 1);
995 rc = le32_to_cpu(replyhead->result);
996 bytes = le64_to_cpu(op->extent.length);
895cfcc8 997 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 998
bd919d45
AE
999 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1000 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1001
1002 if (rc == -ENOENT && read_op) {
1003 zero_bio_chain(req_data->bio, 0);
1004 rc = 0;
1005 } else if (rc == 0 && read_op && bytes < req_data->len) {
1006 zero_bio_chain(req_data->bio, bytes);
1007 bytes = req_data->len;
1008 }
1009
1fec7093 1010 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1011
1012 if (req_data->bio)
1013 bio_chain_put(req_data->bio);
1014
1015 ceph_osdc_put_request(req);
1016 kfree(req_data);
1017}
1018
59c2be1e
YS
/*
 * Minimal osd op callback: just drop the request reference; the reply
 * message is not looked at.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1023
602adf40
YS
1024/*
1025 * Do a synchronous ceph osd operation
1026 */
0ce1a794 1027static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1028 struct ceph_snap_context *snapc,
1029 u64 snapid,
602adf40 1030 int flags,
913d2fdc 1031 struct ceph_osd_req_op *ops,
aded07ea 1032 const char *object_name,
602adf40 1033 u64 ofs, u64 len,
59c2be1e
YS
1034 char *buf,
1035 struct ceph_osd_request **linger_req,
1036 u64 *ver)
602adf40
YS
1037{
1038 int ret;
1039 struct page **pages;
1040 int num_pages;
913d2fdc
AE
1041
1042 BUG_ON(ops == NULL);
602adf40
YS
1043
1044 num_pages = calc_pages_for(ofs , len);
1045 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1046 if (IS_ERR(pages))
1047 return PTR_ERR(pages);
602adf40 1048
0ce1a794 1049 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
aded07ea 1050 object_name, ofs, len, NULL,
602adf40
YS
1051 pages, num_pages,
1052 flags,
1053 ops,
1fec7093 1054 NULL, 0,
59c2be1e
YS
1055 NULL,
1056 linger_req, ver);
602adf40 1057 if (ret < 0)
913d2fdc 1058 goto done;
602adf40
YS
1059
1060 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1061 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1062
602adf40
YS
1063done:
1064 ceph_release_page_vector(pages, num_pages);
1065 return ret;
1066}
1067
1068/*
1069 * Do an asynchronous ceph osd operation
1070 */
1071static int rbd_do_op(struct request *rq,
0ce1a794 1072 struct rbd_device *rbd_dev,
602adf40
YS
1073 struct ceph_snap_context *snapc,
1074 u64 snapid,
d1f57ea6 1075 int opcode, int flags,
602adf40 1076 u64 ofs, u64 len,
1fec7093
YS
1077 struct bio *bio,
1078 struct rbd_req_coll *coll,
1079 int coll_index)
602adf40
YS
1080{
1081 char *seg_name;
1082 u64 seg_ofs;
1083 u64 seg_len;
1084 int ret;
1085 struct ceph_osd_req_op *ops;
1086 u32 payload_len;
1087
1088 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1089 if (!seg_name)
1090 return -ENOMEM;
1091
1092 seg_len = rbd_get_segment(&rbd_dev->header,
ca1e49a6 1093 rbd_dev->header.object_prefix,
602adf40
YS
1094 ofs, len,
1095 seg_name, &seg_ofs);
602adf40
YS
1096
1097 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1098
57cfc106
AE
1099 ret = -ENOMEM;
1100 ops = rbd_create_rw_ops(1, opcode, payload_len);
1101 if (!ops)
602adf40
YS
1102 goto done;
1103
1104 /* we've taken care of segment sizes earlier when we
1105 cloned the bios. We should never have a segment
1106 truncated at this point */
1107 BUG_ON(seg_len < len);
1108
1109 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1110 seg_name, seg_ofs, seg_len,
1111 bio,
1112 NULL, 0,
1113 flags,
1114 ops,
1fec7093 1115 coll, coll_index,
59c2be1e 1116 rbd_req_cb, 0, NULL);
11f77002
SW
1117
1118 rbd_destroy_ops(ops);
602adf40
YS
1119done:
1120 kfree(seg_name);
1121 return ret;
1122}
1123
1124/*
1125 * Request async osd write
1126 */
1127static int rbd_req_write(struct request *rq,
1128 struct rbd_device *rbd_dev,
1129 struct ceph_snap_context *snapc,
1130 u64 ofs, u64 len,
1fec7093
YS
1131 struct bio *bio,
1132 struct rbd_req_coll *coll,
1133 int coll_index)
602adf40
YS
1134{
1135 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1136 CEPH_OSD_OP_WRITE,
1137 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1fec7093 1138 ofs, len, bio, coll, coll_index);
602adf40
YS
1139}
1140
1141/*
1142 * Request async osd read
1143 */
1144static int rbd_req_read(struct request *rq,
1145 struct rbd_device *rbd_dev,
1146 u64 snapid,
1147 u64 ofs, u64 len,
1fec7093
YS
1148 struct bio *bio,
1149 struct rbd_req_coll *coll,
1150 int coll_index)
602adf40
YS
1151{
1152 return rbd_do_op(rq, rbd_dev, NULL,
b06e6a6b 1153 snapid,
602adf40
YS
1154 CEPH_OSD_OP_READ,
1155 CEPH_OSD_FLAG_READ,
1fec7093 1156 ofs, len, bio, coll, coll_index);
602adf40
YS
1157}
1158
1159/*
1160 * Request sync osd read
1161 */
0ce1a794 1162static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1163 u64 snapid,
aded07ea 1164 const char *object_name,
602adf40 1165 u64 ofs, u64 len,
59c2be1e
YS
1166 char *buf,
1167 u64 *ver)
602adf40 1168{
913d2fdc
AE
1169 struct ceph_osd_req_op *ops;
1170 int ret;
1171
1172 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1173 if (!ops)
1174 return -ENOMEM;
1175
1176 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1177 snapid,
602adf40 1178 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1179 ops, object_name, ofs, len, buf, NULL, ver);
1180 rbd_destroy_ops(ops);
1181
1182 return ret;
602adf40
YS
1183}
1184
1185/*
59c2be1e
YS
1186 * Request sync osd watch
1187 */
0ce1a794 1188static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1189 u64 ver,
7f0a24d8 1190 u64 notify_id)
59c2be1e
YS
1191{
1192 struct ceph_osd_req_op *ops;
11f77002
SW
1193 int ret;
1194
57cfc106
AE
1195 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1196 if (!ops)
1197 return -ENOMEM;
59c2be1e 1198
a71b891b 1199 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1200 ops[0].watch.cookie = notify_id;
1201 ops[0].watch.flag = 0;
1202
0ce1a794 1203 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1204 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1205 NULL, 0,
59c2be1e
YS
1206 CEPH_OSD_FLAG_READ,
1207 ops,
1fec7093 1208 NULL, 0,
59c2be1e
YS
1209 rbd_simple_req_cb, 0, NULL);
1210
1211 rbd_destroy_ops(ops);
1212 return ret;
1213}
1214
1215static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1216{
0ce1a794 1217 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1218 u64 hver;
13143d2d
SW
1219 int rc;
1220
0ce1a794 1221 if (!rbd_dev)
59c2be1e
YS
1222 return;
1223
bd919d45
AE
1224 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1225 rbd_dev->header_name, (unsigned long long) notify_id,
1226 (unsigned int) opcode);
1fe5e993 1227 rc = rbd_refresh_header(rbd_dev, &hver);
13143d2d 1228 if (rc)
f0f8cef5 1229 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1230 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1231
7f0a24d8 1232 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1233}
1234
1235/*
1236 * Request sync osd watch
1237 */
0e6f322d 1238static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1239{
1240 struct ceph_osd_req_op *ops;
0ce1a794 1241 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1242 int ret;
59c2be1e 1243
57cfc106
AE
1244 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1245 if (!ops)
1246 return -ENOMEM;
59c2be1e
YS
1247
1248 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1249 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1250 if (ret < 0)
1251 goto fail;
1252
0e6f322d 1253 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1254 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1255 ops[0].watch.flag = 1;
1256
0ce1a794 1257 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1258 CEPH_NOSNAP,
59c2be1e
YS
1259 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1260 ops,
0e6f322d
AE
1261 rbd_dev->header_name,
1262 0, 0, NULL,
0ce1a794 1263 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1264
1265 if (ret < 0)
1266 goto fail_event;
1267
1268 rbd_destroy_ops(ops);
1269 return 0;
1270
1271fail_event:
0ce1a794
AE
1272 ceph_osdc_cancel_event(rbd_dev->watch_event);
1273 rbd_dev->watch_event = NULL;
59c2be1e
YS
1274fail:
1275 rbd_destroy_ops(ops);
1276 return ret;
1277}
1278
79e3057c
YS
1279/*
1280 * Request sync osd unwatch
1281 */
070c633f 1282static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1283{
1284 struct ceph_osd_req_op *ops;
57cfc106 1285 int ret;
79e3057c 1286
57cfc106
AE
1287 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1288 if (!ops)
1289 return -ENOMEM;
79e3057c
YS
1290
1291 ops[0].watch.ver = 0;
0ce1a794 1292 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1293 ops[0].watch.flag = 0;
1294
0ce1a794 1295 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1296 CEPH_NOSNAP,
79e3057c
YS
1297 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1298 ops,
070c633f
AE
1299 rbd_dev->header_name,
1300 0, 0, NULL, NULL, NULL);
1301
79e3057c
YS
1302
1303 rbd_destroy_ops(ops);
0ce1a794
AE
1304 ceph_osdc_cancel_event(rbd_dev->watch_event);
1305 rbd_dev->watch_event = NULL;
79e3057c
YS
1306 return ret;
1307}
1308
59c2be1e 1309struct rbd_notify_info {
0ce1a794 1310 struct rbd_device *rbd_dev;
59c2be1e
YS
1311};
1312
1313static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1314{
0ce1a794
AE
1315 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1316 if (!rbd_dev)
59c2be1e
YS
1317 return;
1318
bd919d45
AE
1319 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1320 rbd_dev->header_name, (unsigned long long) notify_id,
1321 (unsigned int) opcode);
59c2be1e
YS
1322}
1323
1324/*
1325 * Request sync osd notify
1326 */
4cb16250 1327static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
59c2be1e
YS
1328{
1329 struct ceph_osd_req_op *ops;
0ce1a794 1330 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
59c2be1e
YS
1331 struct ceph_osd_event *event;
1332 struct rbd_notify_info info;
1333 int payload_len = sizeof(u32) + sizeof(u32);
1334 int ret;
1335
57cfc106
AE
1336 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1337 if (!ops)
1338 return -ENOMEM;
59c2be1e 1339
0ce1a794 1340 info.rbd_dev = rbd_dev;
59c2be1e
YS
1341
1342 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1343 (void *)&info, &event);
1344 if (ret < 0)
1345 goto fail;
1346
1347 ops[0].watch.ver = 1;
1348 ops[0].watch.flag = 1;
1349 ops[0].watch.cookie = event->cookie;
1350 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1351 ops[0].watch.timeout = 12;
1352
0ce1a794 1353 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1354 CEPH_NOSNAP,
59c2be1e
YS
1355 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1356 ops,
4cb16250
AE
1357 rbd_dev->header_name,
1358 0, 0, NULL, NULL, NULL);
59c2be1e
YS
1359 if (ret < 0)
1360 goto fail_event;
1361
1362 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1363 dout("ceph_osdc_wait_event returned %d\n", ret);
1364 rbd_destroy_ops(ops);
1365 return 0;
1366
1367fail_event:
1368 ceph_osdc_cancel_event(event);
1369fail:
1370 rbd_destroy_ops(ops);
1371 return ret;
1372}
1373
602adf40
YS
1374/*
1375 * Request sync osd read
1376 */
0ce1a794 1377static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1378 const char *object_name,
1379 const char *class_name,
1380 const char *method_name,
602adf40 1381 const char *data,
59c2be1e
YS
1382 int len,
1383 u64 *ver)
602adf40
YS
1384{
1385 struct ceph_osd_req_op *ops;
aded07ea
AE
1386 int class_name_len = strlen(class_name);
1387 int method_name_len = strlen(method_name);
57cfc106
AE
1388 int ret;
1389
1390 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
aded07ea 1391 class_name_len + method_name_len + len);
57cfc106
AE
1392 if (!ops)
1393 return -ENOMEM;
602adf40 1394
aded07ea
AE
1395 ops[0].cls.class_name = class_name;
1396 ops[0].cls.class_len = (__u8) class_name_len;
1397 ops[0].cls.method_name = method_name;
1398 ops[0].cls.method_len = (__u8) method_name_len;
602adf40
YS
1399 ops[0].cls.argc = 0;
1400 ops[0].cls.indata = data;
1401 ops[0].cls.indata_len = len;
1402
0ce1a794 1403 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1404 CEPH_NOSNAP,
602adf40
YS
1405 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1406 ops,
d1f57ea6 1407 object_name, 0, 0, NULL, NULL, ver);
602adf40
YS
1408
1409 rbd_destroy_ops(ops);
1410
1411 dout("cls_exec returned %d\n", ret);
1412 return ret;
1413}
1414
1fec7093
YS
1415static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1416{
1417 struct rbd_req_coll *coll =
1418 kzalloc(sizeof(struct rbd_req_coll) +
1419 sizeof(struct rbd_req_status) * num_reqs,
1420 GFP_ATOMIC);
1421
1422 if (!coll)
1423 return NULL;
1424 coll->total = num_reqs;
1425 kref_init(&coll->kref);
1426 return coll;
1427}
1428
602adf40
YS
1429/*
1430 * block device queue callback
1431 */
1432static void rbd_rq_fn(struct request_queue *q)
1433{
1434 struct rbd_device *rbd_dev = q->queuedata;
1435 struct request *rq;
1436 struct bio_pair *bp = NULL;
1437
00f1f36f 1438 while ((rq = blk_fetch_request(q))) {
602adf40
YS
1439 struct bio *bio;
1440 struct bio *rq_bio, *next_bio = NULL;
1441 bool do_write;
bd919d45
AE
1442 unsigned int size;
1443 u64 op_size = 0;
602adf40 1444 u64 ofs;
1fec7093
YS
1445 int num_segs, cur_seg = 0;
1446 struct rbd_req_coll *coll;
d1d25646 1447 struct ceph_snap_context *snapc;
602adf40
YS
1448
1449 /* peek at request from block layer */
1450 if (!rq)
1451 break;
1452
1453 dout("fetched request\n");
1454
1455 /* filter out block requests we don't understand */
1456 if ((rq->cmd_type != REQ_TYPE_FS)) {
1457 __blk_end_request_all(rq, 0);
00f1f36f 1458 continue;
602adf40
YS
1459 }
1460
1461 /* deduce our operation (read, write) */
1462 do_write = (rq_data_dir(rq) == WRITE);
1463
1464 size = blk_rq_bytes(rq);
593a9e7b 1465 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
602adf40
YS
1466 rq_bio = rq->bio;
1467 if (do_write && rbd_dev->read_only) {
1468 __blk_end_request_all(rq, -EROFS);
00f1f36f 1469 continue;
602adf40
YS
1470 }
1471
1472 spin_unlock_irq(q->queue_lock);
1473
d1d25646 1474 down_read(&rbd_dev->header_rwsem);
e88a36ec 1475
d1d25646 1476 if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
e88a36ec 1477 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1478 dout("request for non-existent snapshot");
1479 spin_lock_irq(q->queue_lock);
1480 __blk_end_request_all(rq, -ENXIO);
1481 continue;
e88a36ec
JD
1482 }
1483
d1d25646
JD
1484 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1485
1486 up_read(&rbd_dev->header_rwsem);
1487
602adf40
YS
1488 dout("%s 0x%x bytes at 0x%llx\n",
1489 do_write ? "write" : "read",
bd919d45 1490 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1491
1fec7093
YS
1492 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1493 coll = rbd_alloc_coll(num_segs);
1494 if (!coll) {
1495 spin_lock_irq(q->queue_lock);
1496 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1497 ceph_put_snap_context(snapc);
00f1f36f 1498 continue;
1fec7093
YS
1499 }
1500
602adf40
YS
1501 do {
1502 /* a bio clone to be passed down to OSD req */
bd919d45 1503 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
602adf40 1504 op_size = rbd_get_segment(&rbd_dev->header,
ca1e49a6 1505 rbd_dev->header.object_prefix,
602adf40
YS
1506 ofs, size,
1507 NULL, NULL);
1fec7093 1508 kref_get(&coll->kref);
602adf40
YS
1509 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1510 op_size, GFP_ATOMIC);
1511 if (!bio) {
1fec7093
YS
1512 rbd_coll_end_req_index(rq, coll, cur_seg,
1513 -ENOMEM, op_size);
1514 goto next_seg;
602adf40
YS
1515 }
1516
1fec7093 1517
602adf40
YS
1518 /* init OSD command: write or read */
1519 if (do_write)
1520 rbd_req_write(rq, rbd_dev,
d1d25646 1521 snapc,
602adf40 1522 ofs,
1fec7093
YS
1523 op_size, bio,
1524 coll, cur_seg);
602adf40
YS
1525 else
1526 rbd_req_read(rq, rbd_dev,
77dfe99f 1527 rbd_dev->snap_id,
602adf40 1528 ofs,
1fec7093
YS
1529 op_size, bio,
1530 coll, cur_seg);
602adf40 1531
1fec7093 1532next_seg:
602adf40
YS
1533 size -= op_size;
1534 ofs += op_size;
1535
1fec7093 1536 cur_seg++;
602adf40
YS
1537 rq_bio = next_bio;
1538 } while (size > 0);
1fec7093 1539 kref_put(&coll->kref, rbd_coll_release);
602adf40
YS
1540
1541 if (bp)
1542 bio_pair_release(bp);
602adf40 1543 spin_lock_irq(q->queue_lock);
d1d25646
JD
1544
1545 ceph_put_snap_context(snapc);
602adf40
YS
1546 }
1547}
1548
1549/*
1550 * a queue callback. Makes sure that we don't create a bio that spans across
1551 * multiple osd objects. One exception would be with a single page bios,
1552 * which we handle later at bio_chain_clone
1553 */
1554static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1555 struct bio_vec *bvec)
1556{
1557 struct rbd_device *rbd_dev = q->queuedata;
593a9e7b
AE
1558 unsigned int chunk_sectors;
1559 sector_t sector;
1560 unsigned int bio_sectors;
602adf40
YS
1561 int max;
1562
593a9e7b
AE
1563 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1564 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1565 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1566
602adf40 1567 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
593a9e7b 1568 + bio_sectors)) << SECTOR_SHIFT;
602adf40
YS
1569 if (max < 0)
1570 max = 0; /* bio_add cannot handle a negative return */
1571 if (max <= bvec->bv_len && bio_sectors == 0)
1572 return bvec->bv_len;
1573 return max;
1574}
1575
1576static void rbd_free_disk(struct rbd_device *rbd_dev)
1577{
1578 struct gendisk *disk = rbd_dev->disk;
1579
1580 if (!disk)
1581 return;
1582
1583 rbd_header_free(&rbd_dev->header);
1584
1585 if (disk->flags & GENHD_FL_UP)
1586 del_gendisk(disk);
1587 if (disk->queue)
1588 blk_cleanup_queue(disk->queue);
1589 put_disk(disk);
1590}
1591
1592/*
1593 * reload the ondisk the header
1594 */
1595static int rbd_read_header(struct rbd_device *rbd_dev,
1596 struct rbd_image_header *header)
1597{
1598 ssize_t rc;
1599 struct rbd_image_header_ondisk *dh;
50f7c4c9 1600 u32 snap_count = 0;
59c2be1e 1601 u64 ver;
00f1f36f 1602 size_t len;
602adf40 1603
00f1f36f
AE
1604 /*
1605 * First reads the fixed-size header to determine the number
1606 * of snapshots, then re-reads it, along with all snapshot
1607 * records as well as their stored names.
1608 */
1609 len = sizeof (*dh);
602adf40 1610 while (1) {
602adf40
YS
1611 dh = kmalloc(len, GFP_KERNEL);
1612 if (!dh)
1613 return -ENOMEM;
1614
1615 rc = rbd_req_sync_read(rbd_dev,
9a5d690b 1616 CEPH_NOSNAP,
0bed54dc 1617 rbd_dev->header_name,
602adf40 1618 0, len,
59c2be1e 1619 (char *)dh, &ver);
602adf40
YS
1620 if (rc < 0)
1621 goto out_dh;
1622
ed63f4fd 1623 rc = rbd_header_from_disk(header, dh, snap_count);
81e759fb 1624 if (rc < 0) {
00f1f36f 1625 if (rc == -ENXIO)
81e759fb 1626 pr_warning("unrecognized header format"
0bed54dc
AE
1627 " for image %s\n",
1628 rbd_dev->image_name);
602adf40 1629 goto out_dh;
81e759fb 1630 }
602adf40 1631
00f1f36f
AE
1632 if (snap_count == header->total_snaps)
1633 break;
1634
1635 snap_count = header->total_snaps;
1636 len = sizeof (*dh) +
1637 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1638 header->snap_names_len;
1639
1640 rbd_header_free(header);
1641 kfree(dh);
602adf40 1642 }
59c2be1e 1643 header->obj_version = ver;
602adf40
YS
1644
1645out_dh:
1646 kfree(dh);
1647 return rc;
1648}
1649
1650/*
1651 * create a snapshot
1652 */
0ce1a794 1653static int rbd_header_add_snap(struct rbd_device *rbd_dev,
602adf40
YS
1654 const char *snap_name,
1655 gfp_t gfp_flags)
1656{
1657 int name_len = strlen(snap_name);
1658 u64 new_snapid;
1659 int ret;
916d4d67 1660 void *data, *p, *e;
1dbb4399 1661 struct ceph_mon_client *monc;
602adf40
YS
1662
1663 /* we should create a snapshot only if we're pointing at the head */
0ce1a794 1664 if (rbd_dev->snap_id != CEPH_NOSNAP)
602adf40
YS
1665 return -EINVAL;
1666
0ce1a794
AE
1667 monc = &rbd_dev->rbd_client->client->monc;
1668 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
bd919d45 1669 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
602adf40
YS
1670 if (ret < 0)
1671 return ret;
1672
1673 data = kmalloc(name_len + 16, gfp_flags);
1674 if (!data)
1675 return -ENOMEM;
1676
916d4d67
SW
1677 p = data;
1678 e = data + name_len + 16;
602adf40 1679
916d4d67
SW
1680 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1681 ceph_encode_64_safe(&p, e, new_snapid, bad);
602adf40 1682
0bed54dc 1683 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
0ce1a794 1684 "rbd", "snap_add",
d67d4be5 1685 data, p - data, NULL);
602adf40 1686
916d4d67 1687 kfree(data);
602adf40 1688
505cbb9b 1689 return ret < 0 ? ret : 0;
602adf40
YS
1690bad:
1691 return -ERANGE;
1692}
1693
dfc5606d
YS
1694static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1695{
1696 struct rbd_snap *snap;
a0593290 1697 struct rbd_snap *next;
dfc5606d 1698
a0593290 1699 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
14e7085d 1700 __rbd_remove_snap_dev(snap);
dfc5606d
YS
1701}
1702
602adf40
YS
1703/*
1704 * only read the first part of the ondisk header, without the snaps info
1705 */
b813623a 1706static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1707{
1708 int ret;
1709 struct rbd_image_header h;
602adf40
YS
1710
1711 ret = rbd_read_header(rbd_dev, &h);
1712 if (ret < 0)
1713 return ret;
1714
a51aa0c0
JD
1715 down_write(&rbd_dev->header_rwsem);
1716
9db4b3e3 1717 /* resized? */
474ef7ce
JD
1718 if (rbd_dev->snap_id == CEPH_NOSNAP) {
1719 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1720
1721 dout("setting size to %llu sectors", (unsigned long long) size);
1722 set_capacity(rbd_dev->disk, size);
1723 }
9db4b3e3 1724
849b4260 1725 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1726 kfree(rbd_dev->header.snap_sizes);
849b4260 1727 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1728 /* osd requests may still refer to snapc */
1729 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1730
b813623a
AE
1731 if (hver)
1732 *hver = h.obj_version;
a71b891b 1733 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1734 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1735 rbd_dev->header.total_snaps = h.total_snaps;
1736 rbd_dev->header.snapc = h.snapc;
1737 rbd_dev->header.snap_names = h.snap_names;
dfc5606d 1738 rbd_dev->header.snap_names_len = h.snap_names_len;
602adf40 1739 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1740 /* Free the extra copy of the object prefix */
1741 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1742 kfree(h.object_prefix);
1743
dfc5606d
YS
1744 ret = __rbd_init_snaps_header(rbd_dev);
1745
c666601a 1746 up_write(&rbd_dev->header_rwsem);
602adf40 1747
dfc5606d 1748 return ret;
602adf40
YS
1749}
1750
1fe5e993
AE
1751static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1752{
1753 int ret;
1754
1755 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1756 ret = __rbd_refresh_header(rbd_dev, hver);
1757 mutex_unlock(&ctl_mutex);
1758
1759 return ret;
1760}
1761
602adf40
YS
1762static int rbd_init_disk(struct rbd_device *rbd_dev)
1763{
1764 struct gendisk *disk;
1765 struct request_queue *q;
1766 int rc;
593a9e7b 1767 u64 segment_size;
602adf40
YS
1768 u64 total_size = 0;
1769
1770 /* contact OSD, request size info about the object being mapped */
1771 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1772 if (rc)
1773 return rc;
1774
dfc5606d
YS
1775 /* no need to lock here, as rbd_dev is not registered yet */
1776 rc = __rbd_init_snaps_header(rbd_dev);
1777 if (rc)
1778 return rc;
1779
cc9d734c 1780 rc = rbd_header_set_snap(rbd_dev, &total_size);
602adf40
YS
1781 if (rc)
1782 return rc;
1783
1784 /* create gendisk info */
1785 rc = -ENOMEM;
1786 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1787 if (!disk)
1788 goto out;
1789
f0f8cef5 1790 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1791 rbd_dev->dev_id);
602adf40
YS
1792 disk->major = rbd_dev->major;
1793 disk->first_minor = 0;
1794 disk->fops = &rbd_bd_ops;
1795 disk->private_data = rbd_dev;
1796
1797 /* init rq */
1798 rc = -ENOMEM;
1799 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1800 if (!q)
1801 goto out_disk;
029bcbd8 1802
593a9e7b
AE
1803 /* We use the default size, but let's be explicit about it. */
1804 blk_queue_physical_block_size(q, SECTOR_SIZE);
1805
029bcbd8 1806 /* set io sizes to object size */
593a9e7b
AE
1807 segment_size = rbd_obj_bytes(&rbd_dev->header);
1808 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1809 blk_queue_max_segment_size(q, segment_size);
1810 blk_queue_io_min(q, segment_size);
1811 blk_queue_io_opt(q, segment_size);
029bcbd8 1812
602adf40
YS
1813 blk_queue_merge_bvec(q, rbd_merge_bvec);
1814 disk->queue = q;
1815
1816 q->queuedata = rbd_dev;
1817
1818 rbd_dev->disk = disk;
1819 rbd_dev->q = q;
1820
1821 /* finally, announce the disk to the world */
593a9e7b 1822 set_capacity(disk, total_size / SECTOR_SIZE);
602adf40
YS
1823 add_disk(disk);
1824
1825 pr_info("%s: added with size 0x%llx\n",
1826 disk->disk_name, (unsigned long long)total_size);
1827 return 0;
1828
1829out_disk:
1830 put_disk(disk);
1831out:
1832 return rc;
1833}
1834
dfc5606d
YS
1835/*
1836 sysfs
1837*/
1838
593a9e7b
AE
1839static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1840{
1841 return container_of(dev, struct rbd_device, dev);
1842}
1843
dfc5606d
YS
1844static ssize_t rbd_size_show(struct device *dev,
1845 struct device_attribute *attr, char *buf)
1846{
593a9e7b 1847 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1848 sector_t size;
1849
1850 down_read(&rbd_dev->header_rwsem);
1851 size = get_capacity(rbd_dev->disk);
1852 up_read(&rbd_dev->header_rwsem);
dfc5606d 1853
a51aa0c0 1854 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1855}
1856
1857static ssize_t rbd_major_show(struct device *dev,
1858 struct device_attribute *attr, char *buf)
1859{
593a9e7b 1860 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1861
dfc5606d
YS
1862 return sprintf(buf, "%d\n", rbd_dev->major);
1863}
1864
1865static ssize_t rbd_client_id_show(struct device *dev,
1866 struct device_attribute *attr, char *buf)
602adf40 1867{
593a9e7b 1868 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1869
1dbb4399
AE
1870 return sprintf(buf, "client%lld\n",
1871 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1872}
1873
dfc5606d
YS
1874static ssize_t rbd_pool_show(struct device *dev,
1875 struct device_attribute *attr, char *buf)
602adf40 1876{
593a9e7b 1877 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
1878
1879 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1880}
1881
9bb2f334
AE
1882static ssize_t rbd_pool_id_show(struct device *dev,
1883 struct device_attribute *attr, char *buf)
1884{
1885 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1886
1887 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1888}
1889
dfc5606d
YS
1890static ssize_t rbd_name_show(struct device *dev,
1891 struct device_attribute *attr, char *buf)
1892{
593a9e7b 1893 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1894
0bed54dc 1895 return sprintf(buf, "%s\n", rbd_dev->image_name);
dfc5606d
YS
1896}
1897
1898static ssize_t rbd_snap_show(struct device *dev,
1899 struct device_attribute *attr,
1900 char *buf)
1901{
593a9e7b 1902 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
1903
1904 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1905}
1906
1907static ssize_t rbd_image_refresh(struct device *dev,
1908 struct device_attribute *attr,
1909 const char *buf,
1910 size_t size)
1911{
593a9e7b 1912 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 1913 int ret;
602adf40 1914
1fe5e993 1915 ret = rbd_refresh_header(rbd_dev, NULL);
b813623a
AE
1916
1917 return ret < 0 ? ret : size;
dfc5606d 1918}
602adf40 1919
dfc5606d
YS
1920static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1921static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1922static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1923static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 1924static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d
YS
1925static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1926static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1927static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1928static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
dfc5606d
YS
1929
1930static struct attribute *rbd_attrs[] = {
1931 &dev_attr_size.attr,
1932 &dev_attr_major.attr,
1933 &dev_attr_client_id.attr,
1934 &dev_attr_pool.attr,
9bb2f334 1935 &dev_attr_pool_id.attr,
dfc5606d
YS
1936 &dev_attr_name.attr,
1937 &dev_attr_current_snap.attr,
1938 &dev_attr_refresh.attr,
1939 &dev_attr_create_snap.attr,
dfc5606d
YS
1940 NULL
1941};
1942
1943static struct attribute_group rbd_attr_group = {
1944 .attrs = rbd_attrs,
1945};
1946
1947static const struct attribute_group *rbd_attr_groups[] = {
1948 &rbd_attr_group,
1949 NULL
1950};
1951
1952static void rbd_sysfs_dev_release(struct device *dev)
1953{
1954}
1955
1956static struct device_type rbd_device_type = {
1957 .name = "rbd",
1958 .groups = rbd_attr_groups,
1959 .release = rbd_sysfs_dev_release,
1960};
1961
1962
1963/*
1964 sysfs - snapshots
1965*/
1966
1967static ssize_t rbd_snap_size_show(struct device *dev,
1968 struct device_attribute *attr,
1969 char *buf)
1970{
1971 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1972
3591538f 1973 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
1974}
1975
1976static ssize_t rbd_snap_id_show(struct device *dev,
1977 struct device_attribute *attr,
1978 char *buf)
1979{
1980 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1981
3591538f 1982 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
1983}
1984
1985static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1986static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1987
1988static struct attribute *rbd_snap_attrs[] = {
1989 &dev_attr_snap_size.attr,
1990 &dev_attr_snap_id.attr,
1991 NULL,
1992};
1993
1994static struct attribute_group rbd_snap_attr_group = {
1995 .attrs = rbd_snap_attrs,
1996};
1997
1998static void rbd_snap_dev_release(struct device *dev)
1999{
2000 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2001 kfree(snap->name);
2002 kfree(snap);
2003}
2004
2005static const struct attribute_group *rbd_snap_attr_groups[] = {
2006 &rbd_snap_attr_group,
2007 NULL
2008};
2009
2010static struct device_type rbd_snap_device_type = {
2011 .groups = rbd_snap_attr_groups,
2012 .release = rbd_snap_dev_release,
2013};
2014
14e7085d 2015static void __rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2016{
2017 list_del(&snap->node);
2018 device_unregister(&snap->dev);
2019}
2020
14e7085d 2021static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2022 struct device *parent)
2023{
2024 struct device *dev = &snap->dev;
2025 int ret;
2026
2027 dev->type = &rbd_snap_device_type;
2028 dev->parent = parent;
2029 dev->release = rbd_snap_dev_release;
2030 dev_set_name(dev, "snap_%s", snap->name);
2031 ret = device_register(dev);
2032
2033 return ret;
2034}
2035
4e891e0a
AE
2036static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2037 int i, const char *name)
dfc5606d 2038{
4e891e0a 2039 struct rbd_snap *snap;
dfc5606d 2040 int ret;
4e891e0a
AE
2041
2042 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2043 if (!snap)
4e891e0a
AE
2044 return ERR_PTR(-ENOMEM);
2045
2046 ret = -ENOMEM;
dfc5606d 2047 snap->name = kstrdup(name, GFP_KERNEL);
4e891e0a
AE
2048 if (!snap->name)
2049 goto err;
2050
dfc5606d
YS
2051 snap->size = rbd_dev->header.snap_sizes[i];
2052 snap->id = rbd_dev->header.snapc->snaps[i];
2053 if (device_is_registered(&rbd_dev->dev)) {
14e7085d 2054 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
dfc5606d
YS
2055 if (ret < 0)
2056 goto err;
2057 }
4e891e0a
AE
2058
2059 return snap;
2060
dfc5606d
YS
2061err:
2062 kfree(snap->name);
2063 kfree(snap);
4e891e0a
AE
2064
2065 return ERR_PTR(ret);
dfc5606d
YS
2066}
2067
/*
 * Step backwards through a list of NUL-terminated names stored back
 * to back, beginning at start.
 *
 * name points at the beginning of an entry (or just past the final
 * terminator).  Returns the beginning of the entry that precedes it,
 * or NULL when name is less than two bytes beyond start and so has no
 * predecessor.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cur;

	if (name < start + 2)
		return NULL;

	/* name[-1] terminates the previous entry; scan back from name[-2] */
	for (cur = name - 2; *cur != '\0'; cur--) {
		if (cur == start)
			return start;
	}

	/* cur rests on the terminator of the entry before the previous one */
	return cur + 1;
}
2084
2085/*
2086 * compare the old list of snapshots that we have to what's in the header
2087 * and update it accordingly. Note that the header holds the snapshots
2088 * in a reverse order (from newest to oldest) and we need to go from
2089 * older to new so that we don't get a duplicate snap name when
2090 * doing the process (e.g., removed snapshot and recreated a new
2091 * one with the same name.
2092 */
2093static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2094{
2095 const char *name, *first_name;
2096 int i = rbd_dev->header.total_snaps;
2097 struct rbd_snap *snap, *old_snap = NULL;
dfc5606d
YS
2098 struct list_head *p, *n;
2099
2100 first_name = rbd_dev->header.snap_names;
2101 name = first_name + rbd_dev->header.snap_names_len;
2102
2103 list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2104 u64 cur_id;
2105
2106 old_snap = list_entry(p, struct rbd_snap, node);
2107
2108 if (i)
2109 cur_id = rbd_dev->header.snapc->snaps[i - 1];
2110
2111 if (!i || old_snap->id < cur_id) {
e88a36ec
JD
2112 /*
2113 * old_snap->id was skipped, thus was
2114 * removed. If this rbd_dev is mapped to
2115 * the removed snapshot, record that it no
2116 * longer exists, to prevent further I/O.
2117 */
2118 if (rbd_dev->snap_id == old_snap->id)
2119 rbd_dev->snap_exists = false;
14e7085d 2120 __rbd_remove_snap_dev(old_snap);
dfc5606d
YS
2121 continue;
2122 }
2123 if (old_snap->id == cur_id) {
2124 /* we have this snapshot already */
2125 i--;
2126 name = rbd_prev_snap_name(name, first_name);
2127 continue;
2128 }
2129 for (; i > 0;
2130 i--, name = rbd_prev_snap_name(name, first_name)) {
2131 if (!name) {
2132 WARN_ON(1);
2133 return -EINVAL;
2134 }
2135 cur_id = rbd_dev->header.snapc->snaps[i];
2136 /* snapshot removal? handle it above */
2137 if (cur_id >= old_snap->id)
2138 break;
2139 /* a new snapshot */
4e891e0a
AE
2140 snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
2141 if (IS_ERR(snap))
2142 return PTR_ERR(snap);
dfc5606d
YS
2143
2144 /* note that we add it backward so using n and not p */
2145 list_add(&snap->node, n);
2146 p = &snap->node;
2147 }
2148 }
2149 /* we're done going over the old snap list, just add what's left */
2150 for (; i > 0; i--) {
2151 name = rbd_prev_snap_name(name, first_name);
2152 if (!name) {
2153 WARN_ON(1);
2154 return -EINVAL;
2155 }
4e891e0a
AE
2156 snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
2157 if (IS_ERR(snap))
2158 return PTR_ERR(snap);
dfc5606d
YS
2159 list_add(&snap->node, &rbd_dev->snaps);
2160 }
2161
2162 return 0;
2163}
2164
dfc5606d
YS
2165static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2166{
f0f8cef5 2167 int ret;
dfc5606d
YS
2168 struct device *dev;
2169 struct rbd_snap *snap;
2170
2171 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2172 dev = &rbd_dev->dev;
2173
2174 dev->bus = &rbd_bus_type;
2175 dev->type = &rbd_device_type;
2176 dev->parent = &rbd_root_dev;
2177 dev->release = rbd_dev_release;
de71a297 2178 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d
YS
2179 ret = device_register(dev);
2180 if (ret < 0)
f0f8cef5 2181 goto out;
dfc5606d
YS
2182
2183 list_for_each_entry(snap, &rbd_dev->snaps, node) {
14e7085d 2184 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
dfc5606d 2185 if (ret < 0)
602adf40
YS
2186 break;
2187 }
f0f8cef5 2188out:
dfc5606d
YS
2189 mutex_unlock(&ctl_mutex);
2190 return ret;
602adf40
YS
2191}
2192
dfc5606d
YS
2193static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2194{
2195 device_unregister(&rbd_dev->dev);
2196}
2197
59c2be1e
YS
2198static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2199{
2200 int ret, rc;
2201
2202 do {
0e6f322d 2203 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2204 if (ret == -ERANGE) {
1fe5e993 2205 rc = rbd_refresh_header(rbd_dev, NULL);
59c2be1e
YS
2206 if (rc < 0)
2207 return rc;
2208 }
2209 } while (ret == -ERANGE);
2210
2211 return ret;
2212}
2213
1ddbe94e
AE
2214static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2215
2216/*
499afd5b
AE
2217 * Get a unique rbd identifier for the given new rbd_dev, and add
2218 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 2219 */
499afd5b 2220static void rbd_id_get(struct rbd_device *rbd_dev)
b7f23c36 2221{
de71a297 2222 rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);
499afd5b
AE
2223
2224 spin_lock(&rbd_dev_list_lock);
2225 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2226 spin_unlock(&rbd_dev_list_lock);
1ddbe94e 2227}
b7f23c36 2228
1ddbe94e 2229/*
499afd5b
AE
2230 * Remove an rbd_dev from the global list, and record that its
2231 * identifier is no longer in use.
1ddbe94e 2232 */
499afd5b 2233static void rbd_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2234{
d184f6bf 2235 struct list_head *tmp;
de71a297 2236 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2237 int max_id;
2238
2239 BUG_ON(rbd_id < 1);
499afd5b
AE
2240
2241 spin_lock(&rbd_dev_list_lock);
2242 list_del_init(&rbd_dev->node);
d184f6bf
AE
2243
2244 /*
2245 * If the id being "put" is not the current maximum, there
2246 * is nothing special we need to do.
2247 */
2248 if (rbd_id != atomic64_read(&rbd_id_max)) {
2249 spin_unlock(&rbd_dev_list_lock);
2250 return;
2251 }
2252
2253 /*
2254 * We need to update the current maximum id. Search the
2255 * list to find out what it is. We're more likely to find
2256 * the maximum at the end, so search the list backward.
2257 */
2258 max_id = 0;
2259 list_for_each_prev(tmp, &rbd_dev_list) {
2260 struct rbd_device *rbd_dev;
2261
2262 rbd_dev = list_entry(tmp, struct rbd_device, node);
2263 if (rbd_id > max_id)
2264 max_id = rbd_id;
2265 }
499afd5b 2266 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2267
1ddbe94e 2268 /*
d184f6bf
AE
2269 * The max id could have been updated by rbd_id_get(), in
2270 * which case it now accurately reflects the new maximum.
2271 * Be careful not to overwrite the maximum value in that
2272 * case.
1ddbe94e 2273 */
d184f6bf 2274 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
b7f23c36
AE
2275}
2276
e28fff26
AE
/*
 * Advances *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none).  Returns the length of the token found there,
 * i.e. the number of consecutive non-white space characters.
 * *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;		/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
2295
/*
 * Locates the next token in *buf and, provided it fits (with its
 * terminating '\0') in the token buffer of token_size bytes, copies
 * it there and terminates it.  *buf must be terminated with '\0' on
 * entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * A return of 0 means no token was found; a return >= token_size
 * means the token did not fit and nothing was copied.
 *
 * In every case *buf is advanced to just past the end of the token
 * that was found, even when the token buffer was too small.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;

	if (token_len >= token_size)
		return token_len;	/* Would not fit; nothing copied */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
2325
ea3352f4
AE
2326/*
2327 * Finds the next token in *buf, dynamically allocates a buffer big
2328 * enough to hold a copy of it, and copies the token into the new
2329 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2330 * that a duplicate buffer is created even for a zero-length token.
2331 *
2332 * Returns a pointer to the newly-allocated duplicate, or a null
2333 * pointer if memory for the duplicate was not available. If
2334 * the lenp argument is a non-null pointer, the length of the token
2335 * (not including the '\0') is returned in *lenp.
2336 *
2337 * If successful, the *buf pointer will be updated to point beyond
2338 * the end of the found token.
2339 *
2340 * Note: uses GFP_KERNEL for allocation.
2341 */
2342static inline char *dup_token(const char **buf, size_t *lenp)
2343{
2344 char *dup;
2345 size_t len;
2346
2347 len = next_token(buf);
2348 dup = kmalloc(len + 1, GFP_KERNEL);
2349 if (!dup)
2350 return NULL;
2351
2352 memcpy(dup, *buf, len);
2353 *(dup + len) = '\0';
2354 *buf += len;
2355
2356 if (lenp)
2357 *lenp = len;
2358
2359 return dup;
2360}
2361
a725f65e 2362/*
0bed54dc 2363 * This fills in the pool_name, image_name, image_name_len, snap_name,
a725f65e
AE
2364 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2365 * on the list of monitor addresses and other options provided via
2366 * /sys/bus/rbd/add.
d22f76e7
AE
2367 *
2368 * Note: rbd_dev is assumed to have been initially zero-filled.
a725f65e
AE
2369 */
2370static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2371 const char *buf,
7ef3214a 2372 const char **mon_addrs,
5214ecc4 2373 size_t *mon_addrs_size,
e28fff26 2374 char *options,
0bed54dc 2375 size_t options_size)
e28fff26 2376{
d22f76e7
AE
2377 size_t len;
2378 int ret;
e28fff26
AE
2379
2380 /* The first four tokens are required */
2381
7ef3214a
AE
2382 len = next_token(&buf);
2383 if (!len)
a725f65e 2384 return -EINVAL;
5214ecc4 2385 *mon_addrs_size = len + 1;
7ef3214a
AE
2386 *mon_addrs = buf;
2387
2388 buf += len;
a725f65e 2389
e28fff26
AE
2390 len = copy_token(&buf, options, options_size);
2391 if (!len || len >= options_size)
2392 return -EINVAL;
2393
bf3e5ae1 2394 ret = -ENOMEM;
d22f76e7
AE
2395 rbd_dev->pool_name = dup_token(&buf, NULL);
2396 if (!rbd_dev->pool_name)
d22f76e7 2397 goto out_err;
e28fff26 2398
0bed54dc
AE
2399 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2400 if (!rbd_dev->image_name)
bf3e5ae1 2401 goto out_err;
a725f65e 2402
cb8627c7
AE
2403 /* Create the name of the header object */
2404
0bed54dc 2405 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
bf3e5ae1
AE
2406 + sizeof (RBD_SUFFIX),
2407 GFP_KERNEL);
0bed54dc 2408 if (!rbd_dev->header_name)
cb8627c7 2409 goto out_err;
0bed54dc 2410 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
a725f65e 2411
e28fff26 2412 /*
820a5f3e
AE
2413 * The snapshot name is optional. If none is is supplied,
2414 * we use the default value.
e28fff26 2415 */
820a5f3e
AE
2416 rbd_dev->snap_name = dup_token(&buf, &len);
2417 if (!rbd_dev->snap_name)
2418 goto out_err;
2419 if (!len) {
2420 /* Replace the empty name with the default */
2421 kfree(rbd_dev->snap_name);
2422 rbd_dev->snap_name
2423 = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2424 if (!rbd_dev->snap_name)
2425 goto out_err;
2426
e28fff26
AE
2427 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2428 sizeof (RBD_SNAP_HEAD_NAME));
849b4260 2429 }
e28fff26 2430
a725f65e 2431 return 0;
d22f76e7
AE
2432
2433out_err:
0bed54dc
AE
2434 kfree(rbd_dev->header_name);
2435 kfree(rbd_dev->image_name);
d22f76e7
AE
2436 kfree(rbd_dev->pool_name);
2437 rbd_dev->pool_name = NULL;
2438
2439 return ret;
a725f65e
AE
2440}
2441
59c2be1e
YS
2442static ssize_t rbd_add(struct bus_type *bus,
2443 const char *buf,
2444 size_t count)
602adf40 2445{
cb8627c7
AE
2446 char *options;
2447 struct rbd_device *rbd_dev = NULL;
7ef3214a
AE
2448 const char *mon_addrs = NULL;
2449 size_t mon_addrs_size = 0;
27cc2594
AE
2450 struct ceph_osd_client *osdc;
2451 int rc = -ENOMEM;
602adf40
YS
2452
2453 if (!try_module_get(THIS_MODULE))
2454 return -ENODEV;
2455
60571c7d 2456 options = kmalloc(count, GFP_KERNEL);
602adf40 2457 if (!options)
27cc2594 2458 goto err_nomem;
cb8627c7
AE
2459 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2460 if (!rbd_dev)
2461 goto err_nomem;
602adf40
YS
2462
2463 /* static rbd_device initialization */
2464 spin_lock_init(&rbd_dev->lock);
2465 INIT_LIST_HEAD(&rbd_dev->node);
dfc5606d 2466 INIT_LIST_HEAD(&rbd_dev->snaps);
c666601a 2467 init_rwsem(&rbd_dev->header_rwsem);
602adf40 2468
d184f6bf 2469 /* generate unique id: find highest unique id, add one */
499afd5b 2470 rbd_id_get(rbd_dev);
602adf40 2471
a725f65e 2472 /* Fill in the device name, now that we have its id. */
81a89793
AE
2473 BUILD_BUG_ON(DEV_NAME_LEN
2474 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
de71a297 2475 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
a725f65e 2476
602adf40 2477 /* parse add command */
7ef3214a 2478 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
e28fff26 2479 options, count);
a725f65e 2480 if (rc)
f0f8cef5 2481 goto err_put_id;
e124a82f 2482
5214ecc4
AE
2483 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2484 options);
d720bcb0
AE
2485 if (IS_ERR(rbd_dev->rbd_client)) {
2486 rc = PTR_ERR(rbd_dev->rbd_client);
f0f8cef5 2487 goto err_put_id;
d720bcb0 2488 }
602adf40 2489
602adf40 2490 /* pick the pool */
1dbb4399 2491 osdc = &rbd_dev->rbd_client->client->osdc;
602adf40
YS
2492 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2493 if (rc < 0)
2494 goto err_out_client;
9bb2f334 2495 rbd_dev->pool_id = rc;
602adf40
YS
2496
2497 /* register our block device */
27cc2594
AE
2498 rc = register_blkdev(0, rbd_dev->name);
2499 if (rc < 0)
602adf40 2500 goto err_out_client;
27cc2594 2501 rbd_dev->major = rc;
602adf40 2502
dfc5606d
YS
2503 rc = rbd_bus_add_dev(rbd_dev);
2504 if (rc)
766fc439
YS
2505 goto err_out_blkdev;
2506
32eec68d
AE
2507 /*
2508 * At this point cleanup in the event of an error is the job
2509 * of the sysfs code (initiated by rbd_bus_del_dev()).
2510 *
2511 * Set up and announce blkdev mapping.
2512 */
602adf40
YS
2513 rc = rbd_init_disk(rbd_dev);
2514 if (rc)
766fc439 2515 goto err_out_bus;
602adf40 2516
59c2be1e
YS
2517 rc = rbd_init_watch_dev(rbd_dev);
2518 if (rc)
2519 goto err_out_bus;
2520
602adf40
YS
2521 return count;
2522
766fc439 2523err_out_bus:
766fc439
YS
2524 /* this will also clean up rest of rbd_dev stuff */
2525
2526 rbd_bus_del_dev(rbd_dev);
2527 kfree(options);
766fc439
YS
2528 return rc;
2529
602adf40
YS
2530err_out_blkdev:
2531 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2532err_out_client:
2533 rbd_put_client(rbd_dev);
f0f8cef5 2534err_put_id:
cb8627c7 2535 if (rbd_dev->pool_name) {
820a5f3e 2536 kfree(rbd_dev->snap_name);
0bed54dc
AE
2537 kfree(rbd_dev->header_name);
2538 kfree(rbd_dev->image_name);
cb8627c7
AE
2539 kfree(rbd_dev->pool_name);
2540 }
499afd5b 2541 rbd_id_put(rbd_dev);
27cc2594 2542err_nomem:
27cc2594 2543 kfree(rbd_dev);
cb8627c7 2544 kfree(options);
27cc2594 2545
602adf40
YS
2546 dout("Error adding device %s\n", buf);
2547 module_put(THIS_MODULE);
27cc2594
AE
2548
2549 return (ssize_t) rc;
602adf40
YS
2550}
2551
de71a297 2552static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
2553{
2554 struct list_head *tmp;
2555 struct rbd_device *rbd_dev;
2556
e124a82f 2557 spin_lock(&rbd_dev_list_lock);
602adf40
YS
2558 list_for_each(tmp, &rbd_dev_list) {
2559 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 2560 if (rbd_dev->dev_id == dev_id) {
e124a82f 2561 spin_unlock(&rbd_dev_list_lock);
602adf40 2562 return rbd_dev;
e124a82f 2563 }
602adf40 2564 }
e124a82f 2565 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
2566 return NULL;
2567}
2568
dfc5606d 2569static void rbd_dev_release(struct device *dev)
602adf40 2570{
593a9e7b 2571 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 2572
1dbb4399
AE
2573 if (rbd_dev->watch_request) {
2574 struct ceph_client *client = rbd_dev->rbd_client->client;
2575
2576 ceph_osdc_unregister_linger_request(&client->osdc,
59c2be1e 2577 rbd_dev->watch_request);
1dbb4399 2578 }
59c2be1e 2579 if (rbd_dev->watch_event)
070c633f 2580 rbd_req_sync_unwatch(rbd_dev);
59c2be1e 2581
602adf40
YS
2582 rbd_put_client(rbd_dev);
2583
2584 /* clean up and free blkdev */
2585 rbd_free_disk(rbd_dev);
2586 unregister_blkdev(rbd_dev->major, rbd_dev->name);
32eec68d
AE
2587
2588 /* done with the id, and with the rbd_dev */
820a5f3e 2589 kfree(rbd_dev->snap_name);
0bed54dc 2590 kfree(rbd_dev->header_name);
d22f76e7 2591 kfree(rbd_dev->pool_name);
0bed54dc 2592 kfree(rbd_dev->image_name);
32eec68d 2593 rbd_id_put(rbd_dev);
602adf40
YS
2594 kfree(rbd_dev);
2595
2596 /* release module ref */
2597 module_put(THIS_MODULE);
602adf40
YS
2598}
2599
dfc5606d
YS
2600static ssize_t rbd_remove(struct bus_type *bus,
2601 const char *buf,
2602 size_t count)
602adf40
YS
2603{
2604 struct rbd_device *rbd_dev = NULL;
2605 int target_id, rc;
2606 unsigned long ul;
2607 int ret = count;
2608
2609 rc = strict_strtoul(buf, 10, &ul);
2610 if (rc)
2611 return rc;
2612
2613 /* convert to int; abort if we lost anything in the conversion */
2614 target_id = (int) ul;
2615 if (target_id != ul)
2616 return -EINVAL;
2617
2618 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2619
2620 rbd_dev = __rbd_get_dev(target_id);
2621 if (!rbd_dev) {
2622 ret = -ENOENT;
2623 goto done;
2624 }
2625
dfc5606d
YS
2626 __rbd_remove_all_snaps(rbd_dev);
2627 rbd_bus_del_dev(rbd_dev);
602adf40
YS
2628
2629done:
2630 mutex_unlock(&ctl_mutex);
2631 return ret;
2632}
2633
dfc5606d
YS
2634static ssize_t rbd_snap_add(struct device *dev,
2635 struct device_attribute *attr,
2636 const char *buf,
2637 size_t count)
602adf40 2638{
593a9e7b 2639 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
2640 int ret;
2641 char *name = kmalloc(count + 1, GFP_KERNEL);
602adf40
YS
2642 if (!name)
2643 return -ENOMEM;
2644
dfc5606d 2645 snprintf(name, count, "%s", buf);
602adf40
YS
2646
2647 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2648
602adf40
YS
2649 ret = rbd_header_add_snap(rbd_dev,
2650 name, GFP_KERNEL);
2651 if (ret < 0)
59c2be1e 2652 goto err_unlock;
602adf40 2653
b813623a 2654 ret = __rbd_refresh_header(rbd_dev, NULL);
602adf40 2655 if (ret < 0)
59c2be1e
YS
2656 goto err_unlock;
2657
2658 /* shouldn't hold ctl_mutex when notifying.. notify might
2659 trigger a watch callback that would need to get that mutex */
2660 mutex_unlock(&ctl_mutex);
2661
2662 /* make a best effort, don't error if failed */
4cb16250 2663 rbd_req_sync_notify(rbd_dev);
602adf40
YS
2664
2665 ret = count;
59c2be1e
YS
2666 kfree(name);
2667 return ret;
2668
2669err_unlock:
602adf40 2670 mutex_unlock(&ctl_mutex);
602adf40
YS
2671 kfree(name);
2672 return ret;
2673}
2674
602adf40
YS
2675/*
2676 * create control files in sysfs
dfc5606d 2677 * /sys/bus/rbd/...
602adf40
YS
2678 */
2679static int rbd_sysfs_init(void)
2680{
dfc5606d 2681 int ret;
602adf40 2682
fed4c143 2683 ret = device_register(&rbd_root_dev);
21079786 2684 if (ret < 0)
dfc5606d 2685 return ret;
602adf40 2686
fed4c143
AE
2687 ret = bus_register(&rbd_bus_type);
2688 if (ret < 0)
2689 device_unregister(&rbd_root_dev);
602adf40 2690
602adf40
YS
2691 return ret;
2692}
2693
2694static void rbd_sysfs_cleanup(void)
2695{
dfc5606d 2696 bus_unregister(&rbd_bus_type);
fed4c143 2697 device_unregister(&rbd_root_dev);
602adf40
YS
2698}
2699
2700int __init rbd_init(void)
2701{
2702 int rc;
2703
2704 rc = rbd_sysfs_init();
2705 if (rc)
2706 return rc;
f0f8cef5 2707 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
2708 return 0;
2709}
2710
2711void __exit rbd_exit(void)
2712{
2713 rbd_sysfs_cleanup();
2714}
2715
2716module_init(rbd_init);
2717module_exit(rbd_exit);
2718
2719MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2720MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2721MODULE_DESCRIPTION("rados block device");
2722
2723/* following authorship retained from original osdblk.c */
2724MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2725
2726MODULE_LICENSE("GPL");