
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/fs_parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG       /* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
        unsigned int counter;

        counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
        if (counter <= (unsigned int)INT_MAX)
                return (int)counter;

        atomic_dec(v);

        return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
        int counter;

        counter = atomic_dec_return(v);
        if (counter >= 0)
                return counter;

        atomic_inc(v);

        return -EINVAL;
}
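
/*
 * Illustrative note (not in the original driver): these saturating
 * helpers pin a counter at 0 once it reaches 0.  With v == 0,
 * atomic_fetch_add_unless(v, 1, 0) skips the add and returns 0, so
 * atomic_inc_return_safe() returns 0 and v stays 0 -- a stale
 * reference can never resurrect a counter that already dropped to
 * zero.  If v has been bumped past INT_MAX, the increment is backed
 * out with atomic_dec() and -EINVAL is returned instead.
 */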

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR            256
#define RBD_SINGLE_MAJOR_PART_SHIFT     4

#define RBD_MAX_PARENT_CHAIN_LEN        16

#define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
#define RBD_MAX_SNAP_NAME_LEN   \
                        (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME      "-"

#define BAD_SNAP_INDEX  U32_MAX         /* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX    64

#define RBD_OBJ_PREFIX_LEN_MAX  64

#define RBD_NOTIFY_TIMEOUT      5       /* seconds */
#define RBD_RETRY_DELAY         msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING            (1ULL<<0)
#define RBD_FEATURE_STRIPINGV2          (1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK      (1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP          (1ULL<<3)
#define RBD_FEATURE_FAST_DIFF           (1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN        (1ULL<<5)
#define RBD_FEATURE_DATA_POOL           (1ULL<<7)
#define RBD_FEATURE_OPERATIONS          (1ULL<<8)

#define RBD_FEATURES_ALL        (RBD_FEATURE_LAYERING |         \
                                 RBD_FEATURE_STRIPINGV2 |       \
                                 RBD_FEATURE_EXCLUSIVE_LOCK |   \
                                 RBD_FEATURE_OBJECT_MAP |       \
                                 RBD_FEATURE_FAST_DIFF |        \
                                 RBD_FEATURE_DEEP_FLATTEN |     \
                                 RBD_FEATURE_DATA_POOL |        \
                                 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED  (RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN            32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
        /* These six fields never change for a given rbd image */
        char *object_prefix;
        __u8 obj_order;
        u64 stripe_unit;
        u64 stripe_count;
        s64 data_pool_id;
        u64 features;           /* Might be changeable someday? */

        /* The remaining fields need to be updated occasionally */
        u64 image_size;
        struct ceph_snap_context *snapc;
        char *snap_names;       /* format 1 only */
        u64 *snap_sizes;        /* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
        u64             pool_id;
        const char      *pool_name;
        const char      *pool_ns;       /* NULL if default, never "" */

        const char      *image_id;
        const char      *image_name;

        u64             snap_id;
        const char      *snap_name;

        struct kref     kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
        struct ceph_client      *client;
        struct kref             kref;
        struct list_head        node;
};

struct pending_result {
        int                     result;         /* first nonzero result */
        int                     num_pending;
};

struct rbd_img_request;

enum obj_request_type {
        OBJ_REQUEST_NODATA = 1,
        OBJ_REQUEST_BIO,        /* pointer into provided bio (list) */
        OBJ_REQUEST_BVECS,      /* pointer into provided bio_vec array */
        OBJ_REQUEST_OWN_BVECS,  /* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
        OBJ_OP_READ = 1,
        OBJ_OP_WRITE,
        OBJ_OP_DISCARD,
        OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION                   (1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED             (1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS               (1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST                  (1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT       (1U << 4)

enum rbd_obj_read_state {
        RBD_OBJ_READ_START = 1,
        RBD_OBJ_READ_OBJECT,
        RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *  (image    .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC  .  not needed)   .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
        RBD_OBJ_WRITE_START = 1,
        RBD_OBJ_WRITE_PRE_OBJECT_MAP,
        RBD_OBJ_WRITE_OBJECT,
        __RBD_OBJ_WRITE_COPYUP,
        RBD_OBJ_WRITE_COPYUP,
        RBD_OBJ_WRITE_POST_OBJECT_MAP,
};

enum rbd_obj_copyup_state {
        RBD_OBJ_COPYUP_START = 1,
        RBD_OBJ_COPYUP_READ_PARENT,
        __RBD_OBJ_COPYUP_OBJECT_MAPS,
        RBD_OBJ_COPYUP_OBJECT_MAPS,
        __RBD_OBJ_COPYUP_WRITE_OBJECT,
        RBD_OBJ_COPYUP_WRITE_OBJECT,
};

struct rbd_obj_request {
        struct ceph_object_extent ex;
        unsigned int            flags;  /* RBD_OBJ_FLAG_* */
        union {
                enum rbd_obj_read_state  read_state;    /* for reads */
                enum rbd_obj_write_state write_state;   /* for writes */
        };

        struct rbd_img_request  *img_request;
        struct ceph_file_extent *img_extents;
        u32                     num_img_extents;

        union {
                struct ceph_bio_iter    bio_pos;
                struct {
                        struct ceph_bvec_iter   bvec_pos;
                        u32                     bvec_count;
                        u32                     bvec_idx;
                };
        };

        enum rbd_obj_copyup_state copyup_state;
        struct bio_vec          *copyup_bvecs;
        u32                     copyup_bvec_count;

        struct list_head        osd_reqs;       /* w/ r_private_item */

        struct mutex            state_mutex;
        struct pending_result   pending;
        struct kref             kref;
};

enum img_req_flags {
        IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
        IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
        RBD_IMG_START = 1,
        RBD_IMG_EXCLUSIVE_LOCK,
        __RBD_IMG_OBJECT_REQUESTS,
        RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
        struct rbd_device       *rbd_dev;
        enum obj_operation_type op_type;
        enum obj_request_type   data_type;
        unsigned long           flags;
        enum rbd_img_state      state;
        union {
                u64                     snap_id;        /* for reads */
                struct ceph_snap_context *snapc;        /* for writes */
        };
        struct rbd_obj_request  *obj_request;   /* obj req initiator */

        struct list_head        lock_item;
        struct list_head        object_extents; /* obj_req.ex structs */

        struct mutex            state_mutex;
        struct pending_result   pending;
        struct work_struct      work;
        int                     work_result;
};

#define for_each_obj_request(ireq, oreq) \
        list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
        list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
        RBD_WATCH_STATE_UNREGISTERED,
        RBD_WATCH_STATE_REGISTERED,
        RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
        RBD_LOCK_STATE_UNLOCKED,
        RBD_LOCK_STATE_LOCKED,
        RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
        u64 gid;
        u64 handle;
};

struct rbd_mapping {
        u64                     size;
};

/*
 * a single device
 */
struct rbd_device {
        int                     dev_id;         /* blkdev unique id */

        int                     major;          /* blkdev assigned major */
        int                     minor;
        struct gendisk          *disk;          /* blkdev's gendisk and rq */

        u32                     image_format;   /* Either 1 or 2 */
        struct rbd_client       *rbd_client;

        char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

        spinlock_t              lock;           /* queue, flags, open_count */

        struct rbd_image_header header;
        unsigned long           flags;          /* possibly lock protected */
        struct rbd_spec         *spec;
        struct rbd_options      *opts;
        char                    *config_info;   /* add{,_single_major} string */

        struct ceph_object_id   header_oid;
        struct ceph_object_locator header_oloc;

        struct ceph_file_layout layout;         /* used for all rbd requests */

        struct mutex            watch_mutex;
        enum rbd_watch_state    watch_state;
        struct ceph_osd_linger_request *watch_handle;
        u64                     watch_cookie;
        struct delayed_work     watch_dwork;

        struct rw_semaphore     lock_rwsem;
        enum rbd_lock_state     lock_state;
        char                    lock_cookie[32];
        struct rbd_client_id    owner_cid;
        struct work_struct      acquired_lock_work;
        struct work_struct      released_lock_work;
        struct delayed_work     lock_dwork;
        struct work_struct      unlock_work;
        spinlock_t              lock_lists_lock;
        struct list_head        acquiring_list;
        struct list_head        running_list;
        struct completion       acquire_wait;
        int                     acquire_err;
        struct completion       releasing_wait;

        spinlock_t              object_map_lock;
        u8                      *object_map;
        u64                     object_map_size;        /* in objects */
        u64                     object_map_flags;

        struct workqueue_struct *task_wq;

        struct rbd_spec         *parent_spec;
        u64                     parent_overlap;
        atomic_t                parent_ref;
        struct rbd_device       *parent;

        /* Block layer tags. */
        struct blk_mq_tag_set   tag_set;

        /* protects updating the header */
        struct rw_semaphore     header_rwsem;

        struct rbd_mapping      mapping;

        struct list_head        node;

        /* sysfs related */
        struct device           dev;
        unsigned long           open_count;     /* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
        RBD_DEV_FLAG_EXISTS,    /* rbd_dev_device_setup() ran */
        RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
        RBD_DEV_FLAG_READONLY,  /* -o ro or snapshot */
};

static DEFINE_MUTEX(client_mutex);      /* Serialize client creation */

static LIST_HEAD(rbd_dev_list);         /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);      /* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache        *rbd_img_request_cache;
static struct kmem_cache        *rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
        .nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
                            size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
                                      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
                                         size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
        return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
        return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
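
/*
 * Illustrative note (not in the original driver): with
 * RBD_SINGLE_MAJOR_PART_SHIFT == 4, each device owns 16 consecutive
 * minors.  E.g. dev_id 3 maps to minor 3 << 4 == 48, and minors 48-63
 * cover /dev/rbd3 and its partitions; minor_to_rbd_dev_id() inverts
 * this by discarding the low 4 partition bits.
 */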

static bool rbd_is_ro(struct rbd_device *rbd_dev)
{
        return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
}

static bool rbd_is_snap(struct rbd_device *rbd_dev)
{
        return rbd_dev->spec->snap_id != CEPH_NOSNAP;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
        lockdep_assert_held(&rbd_dev->lock_rwsem);

        return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
               rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
        bool is_lock_owner;

        down_read(&rbd_dev->lock_rwsem);
        is_lock_owner = __rbd_is_lock_owner(rbd_dev);
        up_read(&rbd_dev->lock_rwsem);
        return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
        return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
        &bus_attr_add.attr,
        &bus_attr_remove.attr,
        &bus_attr_add_single_major.attr,
        &bus_attr_remove_single_major.attr,
        &bus_attr_supported_features.attr,
        NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
                                  struct attribute *attr, int index)
{
        if (!single_major &&
            (attr == &bus_attr_add_single_major.attr ||
             attr == &bus_attr_remove_single_major.attr))
                return 0;

        return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
        .attrs = rbd_bus_attrs,
        .is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
        .name           = "rbd",
        .bus_groups     = rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
        .init_name =    "rbd",
        .release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;

        if (!rbd_dev)
                printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
        else if (rbd_dev->disk)
                printk(KERN_WARNING "%s: %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_name)
                printk(KERN_WARNING "%s: image %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_id)
                printk(KERN_WARNING "%s: id %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
        else    /* punt */
                printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
                        RBD_DRV_NAME, rbd_dev, &vaf);
        va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)                                                \
                if (unlikely(!(expr))) {                                \
                        printk(KERN_ERR "\nAssertion failure in %s() "  \
                                                "at line %d:\n\n"       \
                                        "\trbd_assert(%s);\n\n",        \
                                        __func__, __LINE__, #expr);     \
                        BUG();                                          \
                }
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)      ((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                                        u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                                u8 *order, u64 *snap_size);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
        rbd_assert(pending->num_pending > 0);

        if (*result && !pending->result)
                pending->result = *result;
        if (--pending->num_pending)
                return false;

        *result = pending->result;
        return true;
}
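
/*
 * Illustrative note (not in the original driver): pending_result_dec()
 * aggregates fan-out completions.  With num_pending == 3 and results
 * arriving as 0, -EIO, 0, the first nonzero result (-EIO) is latched;
 * only the final call returns true, handing -EIO back through *result
 * to the caller that fans the completions back in.
 */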

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
        struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
        bool removing = false;

        spin_lock_irq(&rbd_dev->lock);
        if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
                removing = true;
        else
                rbd_dev->open_count++;
        spin_unlock_irq(&rbd_dev->lock);
        if (removing)
                return -ENOENT;

        (void) get_device(&rbd_dev->dev);

        return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
        struct rbd_device *rbd_dev = disk->private_data;
        unsigned long open_count_before;

        spin_lock_irq(&rbd_dev->lock);
        open_count_before = rbd_dev->open_count--;
        spin_unlock_irq(&rbd_dev->lock);
        rbd_assert(open_count_before > 0);

        put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
        int ro;

        if (get_user(ro, (int __user *)arg))
                return -EFAULT;

        /*
         * Both images mapped read-only and snapshots can't be marked
         * read-write.
         */
        if (!ro) {
                if (rbd_is_ro(rbd_dev))
                        return -EROFS;

                rbd_assert(!rbd_is_snap(rbd_dev));
        }

        /* Let blkdev_roset() handle it */
        return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
{
        struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
        int ret;

        switch (cmd) {
        case BLKROSET:
                ret = rbd_ioctl_set_ro(rbd_dev, arg);
                break;
        default:
                ret = -ENOTTY;
        }

        return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
                                unsigned int cmd, unsigned long arg)
{
        return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
        .owner                  = THIS_MODULE,
        .open                   = rbd_open,
        .release                = rbd_release,
        .ioctl                  = rbd_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc;
        int ret = -ENOMEM;

        dout("%s:\n", __func__);
        rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
        if (!rbdc)
                goto out_opt;

        kref_init(&rbdc->kref);
        INIT_LIST_HEAD(&rbdc->node);

        rbdc->client = ceph_create_client(ceph_opts, rbdc);
        if (IS_ERR(rbdc->client))
                goto out_rbdc;
        ceph_opts = NULL;       /* Now rbdc->client is responsible for ceph_opts */

        ret = ceph_open_session(rbdc->client);
        if (ret < 0)
                goto out_client;

        spin_lock(&rbd_client_list_lock);
        list_add_tail(&rbdc->node, &rbd_client_list);
        spin_unlock(&rbd_client_list_lock);

        dout("%s: rbdc %p\n", __func__, rbdc);

        return rbdc;
out_client:
        ceph_destroy_client(rbdc->client);
out_rbdc:
        kfree(rbdc);
out_opt:
        if (ceph_opts)
                ceph_destroy_options(ceph_opts);
        dout("%s: error %d\n", __func__, ret);

        return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
        kref_get(&rbdc->kref);

        return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
        struct rbd_client *client_node;
        bool found = false;

        if (ceph_opts->flags & CEPH_OPT_NOSHARE)
                return NULL;

        spin_lock(&rbd_client_list_lock);
        list_for_each_entry(client_node, &rbd_client_list, node) {
                if (!ceph_compare_options(ceph_opts, client_node->client)) {
                        __rbd_get_client(client_node);

                        found = true;
                        break;
                }
        }
        spin_unlock(&rbd_client_list_lock);

        return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
        Opt_queue_depth,
        Opt_alloc_size,
        Opt_lock_timeout,
        /* int args above */
        Opt_pool_ns,
        Opt_compression_hint,
        /* string args above */
        Opt_read_only,
        Opt_read_write,
        Opt_lock_on_read,
        Opt_exclusive,
        Opt_notrim,
};

enum {
        Opt_compression_hint_none,
        Opt_compression_hint_compressible,
        Opt_compression_hint_incompressible,
};

static const struct constant_table rbd_param_compression_hint[] = {
        {"none",                Opt_compression_hint_none},
        {"compressible",        Opt_compression_hint_compressible},
        {"incompressible",      Opt_compression_hint_incompressible},
        {}
};

static const struct fs_parameter_spec rbd_parameters[] = {
        fsparam_u32     ("alloc_size",                  Opt_alloc_size),
        fsparam_enum    ("compression_hint",            Opt_compression_hint,
                         rbd_param_compression_hint),
        fsparam_flag    ("exclusive",                   Opt_exclusive),
        fsparam_flag    ("lock_on_read",                Opt_lock_on_read),
        fsparam_u32     ("lock_timeout",                Opt_lock_timeout),
        fsparam_flag    ("notrim",                      Opt_notrim),
        fsparam_string  ("_pool_ns",                    Opt_pool_ns),
        fsparam_u32     ("queue_depth",                 Opt_queue_depth),
        fsparam_flag    ("read_only",                   Opt_read_only),
        fsparam_flag    ("read_write",                  Opt_read_write),
        fsparam_flag    ("ro",                          Opt_read_only),
        fsparam_flag    ("rw",                          Opt_read_write),
        {}
};

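/*
 * Illustrative note (not in the original driver): these per-mapping
 * options arrive in the options field of the string written to the
 * sysfs add interface, e.g. a hypothetical
 *
 *   echo "1.2.3.4:6789 name=admin,queue_depth=256,read_only rbd foo -" \
 *       > /sys/bus/rbd/add
 *
 * The exact string above is only an example; see
 * Documentation/ABI/testing/sysfs-bus-rbd for the authoritative
 * format.
 */
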
struct rbd_options {
        int     queue_depth;
        int     alloc_size;
        unsigned long   lock_timeout;
        bool    read_only;
        bool    lock_on_read;
        bool    exclusive;
        bool    trim;

        u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
};

#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT  (64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT   false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT   false
#define RBD_TRIM_DEFAULT        true

struct rbd_parse_opts_ctx {
        struct rbd_spec         *spec;
        struct ceph_options     *copts;
        struct rbd_options      *opts;
};

static char* obj_op_name(enum obj_operation_type op_type)
{
        switch (op_type) {
        case OBJ_OP_READ:
                return "read";
        case OBJ_OP_WRITE:
                return "write";
        case OBJ_OP_DISCARD:
                return "discard";
        case OBJ_OP_ZEROOUT:
                return "zeroout";
        default:
                return "???";
        }
}
/*
 * Destroy ceph client
 *
 * Removes the client from rbd_client_list (taking rbd_client_list_lock
 * itself) and destroys it.
 */
static void rbd_client_release(struct kref *kref)
{
        struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

        dout("%s: rbdc %p\n", __func__, rbdc);
        spin_lock(&rbd_client_list_lock);
        list_del(&rbdc->node);
        spin_unlock(&rbd_client_list_lock);

        ceph_destroy_client(rbdc->client);
        kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
        if (rbdc)
                kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc;
        int ret;

        mutex_lock(&client_mutex);
        rbdc = rbd_client_find(ceph_opts);
        if (rbdc) {
                ceph_destroy_options(ceph_opts);

                /*
                 * Using an existing client.  Make sure ->pg_pools is up to
                 * date before we look up the pool id in do_rbd_add().
                 */
                ret = ceph_wait_for_latest_osdmap(rbdc->client,
                                        rbdc->client->options->mount_timeout);
                if (ret) {
                        rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
                        rbd_put_client(rbdc);
                        rbdc = ERR_PTR(ret);
                }
        } else {
                rbdc = rbd_client_create(ceph_opts);
        }
        mutex_unlock(&client_mutex);

        return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
        return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
        size_t size;
        u32 snap_count;

        /* The header has to start with the magic rbd header text */
        if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
                return false;

        /* The bio layer requires at least sector-sized I/O */

        if (ondisk->options.order < SECTOR_SHIFT)
                return false;

        /* If we use u64 in a few spots we may be able to loosen this */

        if (ondisk->options.order > 8 * sizeof (int) - 1)
                return false;

        /*
         * The size of a snapshot header has to fit in a size_t, and
         * that limits the number of snapshots.
         */
        snap_count = le32_to_cpu(ondisk->snap_count);
        size = SIZE_MAX - sizeof (struct ceph_snap_context);
        if (snap_count > size / sizeof (__le64))
                return false;

        /*
         * Not only that, but the size of the entire snapshot
         * header must also be representable in a size_t.
         */
        size -= snap_count * sizeof (__le64);
        if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
                return false;

        return true;
}
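
/*
 * Illustrative note (not in the original driver): together the two
 * order checks above bound a format 1 object size to 2^9..2^31 bytes,
 * i.e. from one 512-byte sector up to 2 GiB (order is the log2 of the
 * object size).
 */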

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
        return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
        if (rbd_dev->header.stripe_unit == 0 ||
            rbd_dev->header.stripe_count == 0) {
                rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
                rbd_dev->header.stripe_count = 1;
        }

        rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
        rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
        rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
        rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
                          rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
        RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
                                 struct rbd_image_header_ondisk *ondisk)
{
        struct rbd_image_header *header = &rbd_dev->header;
        bool first_time = header->object_prefix == NULL;
        struct ceph_snap_context *snapc;
        char *object_prefix = NULL;
        char *snap_names = NULL;
        u64 *snap_sizes = NULL;
        u32 snap_count;
        int ret = -ENOMEM;
        u32 i;

        /* Allocate this now to avoid having to handle failure below */

        if (first_time) {
                object_prefix = kstrndup(ondisk->object_prefix,
                                         sizeof(ondisk->object_prefix),
                                         GFP_KERNEL);
                if (!object_prefix)
                        return -ENOMEM;
        }

        /* Allocate the snapshot context and fill it in */

        snap_count = le32_to_cpu(ondisk->snap_count);
        snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
        if (!snapc)
                goto out_err;
        snapc->seq = le64_to_cpu(ondisk->snap_seq);
        if (snap_count) {
                struct rbd_image_snap_ondisk *snaps;
                u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

                /* We'll keep a copy of the snapshot names... */

                if (snap_names_len > (u64)SIZE_MAX)
                        goto out_2big;
                snap_names = kmalloc(snap_names_len, GFP_KERNEL);
                if (!snap_names)
                        goto out_err;

                /* ...as well as the array of their sizes. */
                snap_sizes = kmalloc_array(snap_count,
                                           sizeof(*header->snap_sizes),
                                           GFP_KERNEL);
                if (!snap_sizes)
                        goto out_err;

                /*
                 * Copy the names, and fill in each snapshot's id
                 * and size.
                 *
                 * Note that rbd_dev_v1_header_info() guarantees the
                 * ondisk buffer we're working with has
                 * snap_names_len bytes beyond the end of the
                 * snapshot id array, so this memcpy() is safe.
                 */
                memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
                snaps = ondisk->snaps;
                for (i = 0; i < snap_count; i++) {
                        snapc->snaps[i] = le64_to_cpu(snaps[i].id);
                        snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
                }
        }

        /* We won't fail any more, fill in the header */

        if (first_time) {
                header->object_prefix = object_prefix;
                header->obj_order = ondisk->options.order;
                rbd_init_layout(rbd_dev);
        } else {
                ceph_put_snap_context(header->snapc);
                kfree(header->snap_names);
                kfree(header->snap_sizes);
        }

        /* The remaining fields always get updated (when we refresh) */

        header->image_size = le64_to_cpu(ondisk->image_size);
        header->snapc = snapc;
        header->snap_names = snap_names;
        header->snap_sizes = snap_sizes;

        return 0;
out_2big:
        ret = -EIO;
out_err:
        kfree(snap_sizes);
        kfree(snap_names);
        ceph_put_snap_context(snapc);
        kfree(object_prefix);

        return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
        const char *snap_name;

        rbd_assert(which < rbd_dev->header.snapc->num_snaps);

        /* Skip over names until we find the one we are looking for */

        snap_name = rbd_dev->header.snap_names;
        while (which--)
                snap_name += strlen(snap_name) + 1;

        return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
        u64 snap_id1 = *(u64 *)s1;
        u64 snap_id2 = *(u64 *)s2;

        if (snap_id1 < snap_id2)
                return 1;
        return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
        struct ceph_snap_context *snapc = rbd_dev->header.snapc;
        u64 *found;

        found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
                                sizeof (snap_id), snapid_compare_reverse);

        return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
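
/*
 * Illustrative note (not in the original driver): because the osd
 * keeps snapc->snaps sorted highest-id-first, bsearch() with the
 * reversed comparator above works directly on it.  E.g. for snaps
 * {9, 5, 2}, looking up id 5 returns &snaps[1], so the index is 1;
 * an id not in the array yields BAD_SNAP_INDEX.
 */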

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
                                        u64 snap_id)
{
        u32 which;
        const char *snap_name;

        which = rbd_dev_snap_index(rbd_dev, snap_id);
        if (which == BAD_SNAP_INDEX)
                return ERR_PTR(-ENOENT);

        snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
        return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
        if (snap_id == CEPH_NOSNAP)
                return RBD_SNAP_HEAD_NAME;

        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
        if (rbd_dev->image_format == 1)
                return rbd_dev_v1_snap_name(rbd_dev, snap_id);

        return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                                u64 *snap_size)
{
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
        if (snap_id == CEPH_NOSNAP) {
                *snap_size = rbd_dev->header.image_size;
        } else if (rbd_dev->image_format == 1) {
                u32 which;

                which = rbd_dev_snap_index(rbd_dev, snap_id);
                if (which == BAD_SNAP_INDEX)
                        return -ENOENT;

                *snap_size = rbd_dev->header.snap_sizes[which];
        } else {
                u64 size = 0;
                int ret;

                ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
                if (ret)
                        return ret;

                *snap_size = size;
        }
        return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
        u64 snap_id = rbd_dev->spec->snap_id;
        u64 size = 0;
        int ret;

        ret = rbd_snap_size(rbd_dev, snap_id, &size);
        if (ret)
                return ret;

        rbd_dev->mapping.size = size;
        return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
        rbd_dev->mapping.size = 0;
}

static void zero_bvec(struct bio_vec *bv)
{
        void *buf;
        unsigned long flags;

        buf = bvec_kmap_irq(bv, &flags);
        memset(buf, 0, bv->bv_len);
        flush_dcache_page(bv->bv_page);
        bvec_kunmap_irq(buf, &flags);
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
        struct ceph_bio_iter it = *bio_pos;

        ceph_bio_iter_advance(&it, off);
        ceph_bio_iter_advance_step(&it, bytes, ({
                zero_bvec(&bv);
        }));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
        struct ceph_bvec_iter it = *bvec_pos;

        ceph_bvec_iter_advance(&it, off);
        ceph_bvec_iter_advance_step(&it, bytes, ({
                zero_bvec(&bv);
        }));
}

/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
                               u32 bytes)
{
        dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);

        switch (obj_req->img_request->data_type) {
        case OBJ_REQUEST_BIO:
                zero_bios(&obj_req->bio_pos, off, bytes);
                break;
        case OBJ_REQUEST_BVECS:
        case OBJ_REQUEST_OWN_BVECS:
                zero_bvecs(&obj_req->bvec_pos, off, bytes);
                break;
        default:
                BUG();
        }
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
        rbd_assert(obj_request != NULL);
        dout("%s: obj %p (was %d)\n", __func__, obj_request,
                kref_read(&obj_request->kref));
        kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
                                        struct rbd_obj_request *obj_request)
{
        rbd_assert(obj_request->img_request == NULL);

        /* Image request now owns object's original reference */
        obj_request->img_request = img_request;
        dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
                                        struct rbd_obj_request *obj_request)
{
        dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
        list_del(&obj_request->ex.oe_item);
        rbd_assert(obj_request->img_request == img_request);
        rbd_obj_request_put(obj_request);
}

static void rbd_osd_submit(struct ceph_osd_request *osd_req)
{
        struct rbd_obj_request *obj_req = osd_req->r_priv;

        dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
             __func__, osd_req, obj_req, obj_req->ex.oe_objno,
             obj_req->ex.oe_off, obj_req->ex.oe_len);
        ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
        set_bit(IMG_REQ_LAYERED, &img_request->flags);
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
        return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

        return !obj_req->ex.oe_off &&
               obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

        return obj_req->ex.oe_off + obj_req->ex.oe_len ==
                                        rbd_dev->layout.object_size;
}

/*
 * Must be called after rbd_obj_calc_img_extents().
 */
static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
{
        if (!obj_req->num_img_extents ||
            (rbd_obj_is_entire(obj_req) &&
             !obj_req->img_request->snapc->num_snaps))
                return false;

        return true;
}

static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
        return ceph_file_extents_bytes(obj_req->img_extents,
                                       obj_req->num_img_extents);
}

static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
        switch (img_req->op_type) {
        case OBJ_OP_READ:
                return false;
        case OBJ_OP_WRITE:
        case OBJ_OP_DISCARD:
        case OBJ_OP_ZEROOUT:
                return true;
        default:
                BUG();
        }
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
        struct rbd_obj_request *obj_req = osd_req->r_priv;
        int result;

        dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
             osd_req->r_result, obj_req);

        /*
         * Writes aren't allowed to return a data payload.  In some
         * guarded write cases (e.g. stat + zero on an empty object)
         * a stat response makes it through, but we don't care.
         */
        if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
                result = 0;
        else
                result = osd_req->r_result;

        rbd_obj_handle_request(obj_req, result);
}

static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
        struct rbd_obj_request *obj_request = osd_req->r_priv;
        struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
        struct ceph_options *opt = rbd_dev->rbd_client->client->options;

        osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
        osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
        struct rbd_obj_request *obj_request = osd_req->r_priv;

        osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
        ktime_get_real_ts64(&osd_req->r_mtime);
        osd_req->r_data_offset = obj_request->ex.oe_off;
}

static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
                          struct ceph_snap_context *snapc, int num_ops)
{
        struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct ceph_osd_request *req;
        const char *name_format = rbd_dev->image_format == 1 ?
                                      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
        int ret;

        req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
        if (!req)
                return ERR_PTR(-ENOMEM);

        list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
        req->r_callback = rbd_osd_req_callback;
        req->r_priv = obj_req;

        /*
         * Data objects may be stored in a separate pool, but always in
         * the same namespace in that pool as the header in its pool.
         */
        ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
        req->r_base_oloc.pool = rbd_dev->layout.pool_id;

        ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
                               rbd_dev->header.object_prefix,
                               obj_req->ex.oe_objno);
        if (ret)
                return ERR_PTR(ret);

        return req;
}

static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
        return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
                                         num_ops);
}

static struct rbd_obj_request *rbd_obj_request_create(void)
{
        struct rbd_obj_request *obj_request;

        obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
        if (!obj_request)
                return NULL;

        ceph_object_extent_init(&obj_request->ex);
        INIT_LIST_HEAD(&obj_request->osd_reqs);
        mutex_init(&obj_request->state_mutex);
        kref_init(&obj_request->kref);

        dout("%s %p\n", __func__, obj_request);
        return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
        struct rbd_obj_request *obj_request;
        struct ceph_osd_request *osd_req;
        u32 i;

        obj_request = container_of(kref, struct rbd_obj_request, kref);

        dout("%s: obj %p\n", __func__, obj_request);

        while (!list_empty(&obj_request->osd_reqs)) {
                osd_req = list_first_entry(&obj_request->osd_reqs,
                                    struct ceph_osd_request, r_private_item);
                list_del_init(&osd_req->r_private_item);
                ceph_osdc_put_request(osd_req);
        }

        switch (obj_request->img_request->data_type) {
        case OBJ_REQUEST_NODATA:
        case OBJ_REQUEST_BIO:
        case OBJ_REQUEST_BVECS:
                break;          /* Nothing to do */
        case OBJ_REQUEST_OWN_BVECS:
                kfree(obj_request->bvec_pos.bvecs);
                break;
        default:
                BUG();
        }

        kfree(obj_request->img_extents);
        if (obj_request->copyup_bvecs) {
                for (i = 0; i < obj_request->copyup_bvec_count; i++) {
                        if (obj_request->copyup_bvecs[i].bv_page)
                                __free_page(obj_request->copyup_bvecs[i].bv_page);
                }
                kfree(obj_request->copyup_bvecs);
        }

        kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
        rbd_dev_remove_parent(rbd_dev);
        rbd_spec_put(rbd_dev->parent_spec);
        rbd_dev->parent_spec = NULL;
        rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
        int counter;

        if (!rbd_dev->parent_spec)
                return;

        counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
        if (counter > 0)
                return;

        /* Last reference; clean up parent data structures */

        if (!counter)
                rbd_dev_unparent(rbd_dev);
        else
                rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
        int counter = 0;

        if (!rbd_dev->parent_spec)
                return false;

        if (rbd_dev->parent_overlap)
                counter = atomic_inc_return_safe(&rbd_dev->parent_ref);

        if (counter < 0)
                rbd_warn(rbd_dev, "parent reference overflow");

        return counter > 0;
}
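
/*
 * Illustrative note (not in the original driver): parent_ref relies on
 * the saturating helpers near the top of this file.  Once the count
 * has dropped to 0 and rbd_dev_unparent() has run,
 * atomic_inc_return_safe() keeps returning 0, so rbd_dev_parent_get()
 * can never revive a torn-down parent.
 */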

static void rbd_img_request_init(struct rbd_img_request *img_request,
                                 struct rbd_device *rbd_dev,
                                 enum obj_operation_type op_type)
{
        memset(img_request, 0, sizeof(*img_request));

        img_request->rbd_dev = rbd_dev;
        img_request->op_type = op_type;

        INIT_LIST_HEAD(&img_request->lock_item);
        INIT_LIST_HEAD(&img_request->object_extents);
        mutex_init(&img_request->state_mutex);
}

static void rbd_img_capture_header(struct rbd_img_request *img_req)
{
        struct rbd_device *rbd_dev = img_req->rbd_dev;

        lockdep_assert_held(&rbd_dev->header_rwsem);

        if (rbd_img_is_write(img_req))
                img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
        else
                img_req->snap_id = rbd_dev->spec->snap_id;

        if (rbd_dev_parent_get(rbd_dev))
                img_request_layered_set(img_req);
}

static void rbd_img_request_destroy(struct rbd_img_request *img_request)
{
        struct rbd_obj_request *obj_request;
        struct rbd_obj_request *next_obj_request;

        dout("%s: img %p\n", __func__, img_request);

        WARN_ON(!list_empty(&img_request->lock_item));
        for_each_obj_request_safe(img_request, obj_request, next_obj_request)
                rbd_img_obj_request_del(img_request, obj_request);

        if (img_request_layered_test(img_request))
                rbd_dev_parent_put(img_request->rbd_dev);

        if (rbd_img_is_write(img_request))
                ceph_put_snap_context(img_request->snapc);

        if (test_bit(IMG_REQ_CHILD, &img_request->flags))
                kmem_cache_free(rbd_img_request_cache, img_request);
}

#define BITS_PER_OBJ    2
#define OBJS_PER_BYTE   (BITS_PER_BYTE / BITS_PER_OBJ)
#define OBJ_MASK        ((1 << BITS_PER_OBJ) - 1)

static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
                                   u64 *index, u8 *shift)
{
        u32 off;

        rbd_assert(objno < rbd_dev->object_map_size);
        *index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
        *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
}
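
/*
 * Illustrative note (not in the original driver): each object gets two
 * bits, packed big-endian within a byte (OBJS_PER_BYTE == 4).  E.g.
 * objno 5 lands in byte index 1 (off 1), shift (4 - 1 - 1) * 2 == 4,
 * i.e. bits 5:4 of object_map[1].
 */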
e93f3152 1693
22e8bd51
ID
1694static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1695{
1696 u64 index;
1697 u8 shift;
e93f3152 1698
22e8bd51
ID
1699 lockdep_assert_held(&rbd_dev->object_map_lock);
1700 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1701 return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
e93f3152
AE
1702}
1703
22e8bd51 1704static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
e93f3152 1705{
22e8bd51
ID
1706 u64 index;
1707 u8 shift;
1708 u8 *p;
e93f3152 1709
22e8bd51
ID
1710 lockdep_assert_held(&rbd_dev->object_map_lock);
1711 rbd_assert(!(val & ~OBJ_MASK));
e93f3152 1712
22e8bd51
ID
1713 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1714 p = &rbd_dev->object_map[index];
1715 *p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
e93f3152
AE
1716}
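/*
 * Illustration (not part of the driver): a minimal userspace sketch of
 * the 2-bits-per-object packing used by __rbd_object_map_index() and
 * friends.  Four object states fit in each byte, MSB-first: objno 5
 * lands in byte 1 at shift (4 - 1 - 1) * 2 = 4, i.e. bits 5:4.
 */
#include <stdio.h>
#include <stdint.h>

#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(8 / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)

int main(void)
{
	uint8_t map[2] = { 0 };
	uint64_t objno = 5;
	uint64_t index = objno / OBJS_PER_BYTE;	/* which byte */
	uint8_t off = objno % OBJS_PER_BYTE;	/* slot within the byte */
	uint8_t shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
	uint8_t val = 1;	/* stand-in for OBJECT_EXISTS */

	/* same read-modify-write as __rbd_object_map_set() */
	map[index] = (map[index] & ~(OBJ_MASK << shift)) | (val << shift);
	printf("objno %llu -> byte %llu, shift %u, byte = 0x%02x\n",
	       (unsigned long long)objno, (unsigned long long)index,
	       (unsigned int)shift, (unsigned int)map[index]);
	return 0;	/* prints: objno 5 -> byte 1, shift 4, byte = 0x10 */
}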
1717
22e8bd51 1718static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1217857f 1719{
22e8bd51
ID
1720 u8 state;
1721
1722 spin_lock(&rbd_dev->object_map_lock);
1723 state = __rbd_object_map_get(rbd_dev, objno);
1724 spin_unlock(&rbd_dev->object_map_lock);
1725 return state;
3da691bf 1726}
1217857f 1727
22e8bd51 1728static bool use_object_map(struct rbd_device *rbd_dev)
3da691bf 1729{
3fe69921
ID
1730 /*
1731 * An image mapped read-only can't use the object map -- it isn't
1732 * loaded because the header lock isn't acquired. Someone else can
1733 * write to the image and update the object map behind our back.
1734 *
1735 * A snapshot can't be written to, so using the object map is always
1736 * safe.
1737 */
1738 if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
1739 return false;
1740
22e8bd51
ID
1741 return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1742 !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
3da691bf
ID
1743}
1744
22e8bd51 1745static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
3da691bf 1746{
22e8bd51 1747 u8 state;
8b3e1a56 1748
22e8bd51
ID
1749 /* fall back to default logic if object map is disabled or invalid */
1750 if (!use_object_map(rbd_dev))
1751 return true;
3da691bf 1752
22e8bd51
ID
1753 state = rbd_object_map_get(rbd_dev, objno);
1754 return state != OBJECT_NONEXISTENT;
1217857f
AE
1755}
1756
22e8bd51
ID
1757static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1758 struct ceph_object_id *oid)
13488d53 1759{
22e8bd51
ID
1760 if (snap_id == CEPH_NOSNAP)
1761 ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1762 rbd_dev->spec->image_id);
1763 else
1764 ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1765 rbd_dev->spec->image_id, snap_id);
13488d53
ID
1766}
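/*
 * Worked example (assuming RBD_OBJECT_MAP_PREFIX from rbd_types.h is
 * "rbd_object_map."): for image id "abc123", the HEAD object map lives
 * in RADOS object "rbd_object_map.abc123", and the object map of
 * snapshot id 28 in "rbd_object_map.abc123.000000000000001c".
 */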
1767
22e8bd51 1768static int rbd_object_map_lock(struct rbd_device *rbd_dev)
2169238d 1769{
22e8bd51
ID
1770 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1771 CEPH_DEFINE_OID_ONSTACK(oid);
1772 u8 lock_type;
1773 char *lock_tag;
1774 struct ceph_locker *lockers;
1775 u32 num_lockers;
1776 bool broke_lock = false;
1777 int ret;
2169238d 1778
22e8bd51 1779 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
2169238d 1780
22e8bd51
ID
1781again:
1782 ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1783 CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
1784 if (ret != -EBUSY || broke_lock) {
1785 if (ret == -EEXIST)
1786 ret = 0; /* already locked by myself */
1787 if (ret)
1788 rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
1789 return ret;
1790 }
2169238d 1791
22e8bd51
ID
1792 ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
1793 RBD_LOCK_NAME, &lock_type, &lock_tag,
1794 &lockers, &num_lockers);
1795 if (ret) {
1796 if (ret == -ENOENT)
1797 goto again;
3da691bf 1798
22e8bd51 1799 rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
86bd7998 1800 return ret;
22e8bd51 1801 }
86bd7998 1802
22e8bd51
ID
1803 kfree(lock_tag);
1804 if (num_lockers == 0)
1805 goto again;
2169238d 1806
22e8bd51
ID
1807 rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
1808 ENTITY_NAME(lockers[0].id.name));
2169238d 1809
22e8bd51
ID
1810 ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
1811 RBD_LOCK_NAME, lockers[0].id.cookie,
1812 &lockers[0].id.name);
1813 ceph_free_lockers(lockers, num_lockers);
1814 if (ret) {
1815 if (ret == -ENOENT)
1816 goto again;
13488d53 1817
22e8bd51
ID
1818 rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
1819 return ret;
3da691bf
ID
1820 }
1821
22e8bd51
ID
1822 broke_lock = true;
1823 goto again;
2169238d
AE
1824}
1825
22e8bd51 1826static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
6484cbe9 1827{
22e8bd51
ID
1828 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1829 CEPH_DEFINE_OID_ONSTACK(oid);
1830 int ret;
1831
1832 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1833
1834 ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1835 "");
1836 if (ret && ret != -ENOENT)
1837 rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
6484cbe9
ID
1838}
1839
22e8bd51 1840static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
6484cbe9 1841{
22e8bd51
ID
1842 u8 struct_v;
1843 u32 struct_len;
1844 u32 header_len;
1845 void *header_end;
6484cbe9
ID
1846 int ret;
1847
22e8bd51
ID
1848 ceph_decode_32_safe(p, end, header_len, e_inval);
1849 header_end = *p + header_len;
0c93e1b7 1850
22e8bd51
ID
1851 ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1852 &struct_len);
6484cbe9
ID
1853 if (ret)
1854 return ret;
1855
22e8bd51 1856 ceph_decode_64_safe(p, end, *object_map_size, e_inval);
6484cbe9 1857
22e8bd51 1858 *p = header_end;
6484cbe9 1859 return 0;
22e8bd51
ID
1860
1861e_inval:
1862 return -EINVAL;
6484cbe9
ID
1863}
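/*
 * A sketch of the wire format consumed above, reconstructed from the
 * decoding steps (ceph_start_decoding() reads struct_v, struct_compat
 * and struct_len):
 *
 *	le32 header_len;	 length of everything before the bit data
 *	u8   struct_v;
 *	u8   struct_compat;
 *	le32 struct_len;
 *	le64 object_map_size;	 number of objects; 2-bit states follow
 *
 * Any remaining header fields are skipped -- the decoder jumps straight
 * to header_end.
 */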
1864
22e8bd51 1865static int __rbd_object_map_load(struct rbd_device *rbd_dev)
13488d53 1866{
22e8bd51
ID
1867 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1868 CEPH_DEFINE_OID_ONSTACK(oid);
1869 struct page **pages;
1870 void *p, *end;
1871 size_t reply_len;
1872 u64 num_objects;
1873 u64 object_map_bytes;
1874 u64 object_map_size;
1875 int num_pages;
1876 int ret;
13488d53 1877
22e8bd51 1878 rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
13488d53 1879
22e8bd51
ID
1880 num_objects = ceph_get_num_objects(&rbd_dev->layout,
1881 rbd_dev->mapping.size);
1882 object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
1883 BITS_PER_BYTE);
1884 num_pages = calc_pages_for(0, object_map_bytes) + 1;
1885 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1886 if (IS_ERR(pages))
1887 return PTR_ERR(pages);
13488d53 1888
22e8bd51
ID
1889 reply_len = num_pages * PAGE_SIZE;
1890 rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
1891 ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
1892 "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
1893 NULL, 0, pages, &reply_len);
1894 if (ret)
1895 goto out;
3b434a2a 1896
22e8bd51
ID
1897 p = page_address(pages[0]);
1898 end = p + min(reply_len, (size_t)PAGE_SIZE);
1899 ret = decode_object_map_header(&p, end, &object_map_size);
1900 if (ret)
1901 goto out;
1902
1903 if (object_map_size != num_objects) {
1904 rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
1905 object_map_size, num_objects);
1906 ret = -EINVAL;
1907 goto out;
3b434a2a
JD
1908 }
1909
22e8bd51
ID
1910 if (offset_in_page(p) + object_map_bytes > reply_len) {
1911 ret = -EINVAL;
1912 goto out;
1913 }
1914
1915 rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
1916 if (!rbd_dev->object_map) {
1917 ret = -ENOMEM;
1918 goto out;
1919 }
1920
1921 rbd_dev->object_map_size = object_map_size;
1922 ceph_copy_from_page_vector(pages, rbd_dev->object_map,
1923 offset_in_page(p), object_map_bytes);
1924
1925out:
1926 ceph_release_page_vector(pages, num_pages);
1927 return ret;
1928}
3da691bf 1929
22e8bd51
ID
1930static void rbd_object_map_free(struct rbd_device *rbd_dev)
1931{
1932 kvfree(rbd_dev->object_map);
1933 rbd_dev->object_map = NULL;
1934 rbd_dev->object_map_size = 0;
3b434a2a
JD
1935}
1936
22e8bd51 1937static int rbd_object_map_load(struct rbd_device *rbd_dev)
bf0d5f50 1938{
3da691bf 1939 int ret;
37206ee5 1940
22e8bd51 1941 ret = __rbd_object_map_load(rbd_dev);
86bd7998
ID
1942 if (ret)
1943 return ret;
f1a4739f 1944
22e8bd51
ID
1945 ret = rbd_dev_v2_get_flags(rbd_dev);
1946 if (ret) {
1947 rbd_object_map_free(rbd_dev);
1948 return ret;
1949 }
1950
1951 if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
1952 rbd_warn(rbd_dev, "object map is invalid");
1953
1954 return 0;
1955}
1956
1957static int rbd_object_map_open(struct rbd_device *rbd_dev)
1958{
1959 int ret;
1960
1961 ret = rbd_object_map_lock(rbd_dev);
1962 if (ret)
1963 return ret;
1964
1965 ret = rbd_object_map_load(rbd_dev);
1966 if (ret) {
1967 rbd_object_map_unlock(rbd_dev);
1968 return ret;
1969 }
1970
1971 return 0;
1972}
1973
1974static void rbd_object_map_close(struct rbd_device *rbd_dev)
1975{
1976 rbd_object_map_free(rbd_dev);
1977 rbd_object_map_unlock(rbd_dev);
1978}
1979
1980/*
1981 * This function needs snap_id (or more precisely just something to
1982 * distinguish between HEAD and snapshot object maps), new_state and
1983 * current_state that were passed to rbd_object_map_update().
1984 *
1985 * To avoid allocating and stashing a context we piggyback on the OSD
1986 * request. A HEAD update has two ops (assert_locked and update); a
1987 * snapshot update has one op (update). For new_state and current_state
1988 * we decode our own object_map_update op, encoded in rbd_cls_object_map_update().
1989 */
1990static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
1991 struct ceph_osd_request *osd_req)
1992{
1993 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1994 struct ceph_osd_data *osd_data;
1995 u64 objno;
3f649ab7 1996 u8 state, new_state, current_state;
22e8bd51
ID
1997 bool has_current_state;
1998 void *p;
1999
2000 if (osd_req->r_result)
2001 return osd_req->r_result;
2002
2003 /*
2004 * Nothing to do for a snapshot object map.
2005 */
2006 if (osd_req->r_num_ops == 1)
2007 return 0;
2008
2009 /*
2010 * Update in-memory HEAD object map.
2011 */
2012 rbd_assert(osd_req->r_num_ops == 2);
2013 osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
2014 rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
2015
2016 p = page_address(osd_data->pages[0]);
2017 objno = ceph_decode_64(&p);
2018 rbd_assert(objno == obj_req->ex.oe_objno);
2019 rbd_assert(ceph_decode_64(&p) == objno + 1);
2020 new_state = ceph_decode_8(&p);
2021 has_current_state = ceph_decode_8(&p);
2022 if (has_current_state)
2023 current_state = ceph_decode_8(&p);
2024
2025 spin_lock(&rbd_dev->object_map_lock);
2026 state = __rbd_object_map_get(rbd_dev, objno);
2027 if (!has_current_state || current_state == state ||
2028 (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
2029 __rbd_object_map_set(rbd_dev, objno, new_state);
2030 spin_unlock(&rbd_dev->object_map_lock);
2031
2032 return 0;
2033}
2034
2035static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
2036{
2037 struct rbd_obj_request *obj_req = osd_req->r_priv;
2038 int result;
2039
2040 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
2041 osd_req->r_result, obj_req);
2042
2043 result = rbd_object_map_update_finish(obj_req, osd_req);
2044 rbd_obj_handle_request(obj_req, result);
2045}
2046
2047static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
2048{
2049 u8 state = rbd_object_map_get(rbd_dev, objno);
bf0d5f50 2050
22e8bd51
ID
2051 if (state == new_state ||
2052 (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
2053 (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
2054 return false;
2055
2056 return true;
2057}
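/*
 * Decoded, the checks above mean a HEAD object map update is elided
 * when it would be a no-op: the object is already in new_state, a
 * nonexistent object needn't be marked PENDING ahead of a delete, and
 * only a PENDING object may be moved to NONEXISTENT.
 */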
2058
2059static int rbd_cls_object_map_update(struct ceph_osd_request *req,
2060 int which, u64 objno, u8 new_state,
2061 const u8 *current_state)
2062{
2063 struct page **pages;
2064 void *p, *start;
2065 int ret;
2066
2067 ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2068 if (ret)
2069 return ret;
2070
2071 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2072 if (IS_ERR(pages))
2073 return PTR_ERR(pages);
2074
2075 p = start = page_address(pages[0]);
2076 ceph_encode_64(&p, objno);
2077 ceph_encode_64(&p, objno + 1);
2078 ceph_encode_8(&p, new_state);
2079 if (current_state) {
2080 ceph_encode_8(&p, 1);
2081 ceph_encode_8(&p, *current_state);
2082 } else {
2083 ceph_encode_8(&p, 0);
2084 }
2085
2086 osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2087 false, true);
2088 return 0;
2089}
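/*
 * For reference, the request_data page encoded above (and decoded back
 * in rbd_object_map_update_finish()) carries, in order:
 *
 *	le64 start_objno;
 *	le64 end_objno;		 exclusive; always start_objno + 1 here
 *	u8   new_state;
 *	u8   has_current_state;
 *	u8   current_state;	 only if has_current_state != 0
 */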
2090
2091/*
2092 * Return:
2093 * 0 - object map update sent
2094 * 1 - object map update isn't needed
2095 * <0 - error
2096 */
2097static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2098 u8 new_state, const u8 *current_state)
2099{
2100 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2101 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2102 struct ceph_osd_request *req;
2103 int num_ops = 1;
2104 int which = 0;
2105 int ret;
2106
2107 if (snap_id == CEPH_NOSNAP) {
2108 if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2109 return 1;
2110
2111 num_ops++; /* assert_locked */
2112 }
2113
2114 req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2115 if (!req)
2116 return -ENOMEM;
2117
2118 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2119 req->r_callback = rbd_object_map_callback;
2120 req->r_priv = obj_req;
2121
2122 rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2123 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2124 req->r_flags = CEPH_OSD_FLAG_WRITE;
2125 ktime_get_real_ts64(&req->r_mtime);
2126
2127 if (snap_id == CEPH_NOSNAP) {
2128 /*
2129 * Protect against possible race conditions during lock
2130 * ownership transitions.
2131 */
2132 ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2133 CEPH_CLS_LOCK_EXCLUSIVE, "", "");
3da691bf
ID
2134 if (ret)
2135 return ret;
22e8bd51
ID
2136 }
2137
2138 ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2139 new_state, current_state);
2140 if (ret)
2141 return ret;
2142
2143 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2144 if (ret)
2145 return ret;
13488d53 2146
22e8bd51
ID
2147 ceph_osdc_start_request(osdc, req, false);
2148 return 0;
2149}
2150
86bd7998
ID
2151static void prune_extents(struct ceph_file_extent *img_extents,
2152 u32 *num_img_extents, u64 overlap)
e93f3152 2153{
86bd7998 2154 u32 cnt = *num_img_extents;
e93f3152 2155
86bd7998
ID
2156 /* drop extents completely beyond the overlap */
2157 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2158 cnt--;
e93f3152 2159
86bd7998
ID
2160 if (cnt) {
2161 struct ceph_file_extent *ex = &img_extents[cnt - 1];
e93f3152 2162
86bd7998
ID
2163 /* trim final overlapping extent */
2164 if (ex->fe_off + ex->fe_len > overlap)
2165 ex->fe_len = overlap - ex->fe_off;
2166 }
e93f3152 2167
86bd7998 2168 *num_img_extents = cnt;
e93f3152
AE
2169}
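/*
 * Illustration (not part of the driver): a userspace sketch of the
 * pruning logic with a stand-in extent type.  With overlap = 4M and
 * extents {0,1M} {3M,2M} {6M,1M}, the last extent is dropped entirely
 * and the middle one is trimmed to {3M,1M}.
 */
#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t off, len; };

static void prune(struct ext *e, uint32_t *cnt, uint64_t overlap)
{
	while (*cnt && e[*cnt - 1].off >= overlap)
		(*cnt)--;				/* fully beyond overlap */
	if (*cnt && e[*cnt - 1].off + e[*cnt - 1].len > overlap)
		e[*cnt - 1].len = overlap - e[*cnt - 1].off; /* trim tail */
}

int main(void)
{
	struct ext e[] = { { 0, 1 << 20 }, { 3 << 20, 2 << 20 },
			   { 6 << 20, 1 << 20 } };
	uint32_t cnt = 3;

	prune(e, &cnt, 4 << 20);	/* overlap = 4M */
	printf("%u extents, last %llu~%llu\n", (unsigned int)cnt,
	       (unsigned long long)e[cnt - 1].off,
	       (unsigned long long)e[cnt - 1].len);
	return 0;	/* prints: 2 extents, last 3145728~1048576 */
}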
2170
86bd7998
ID
2171/*
2172 * Determine the byte range(s) covered by either just the object extent
2173 * or the entire object in the parent image.
2174 */
2175static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2176 bool entire)
e93f3152 2177{
86bd7998
ID
2178 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2179 int ret;
e93f3152 2180
86bd7998
ID
2181 if (!rbd_dev->parent_overlap)
2182 return 0;
e93f3152 2183
86bd7998
ID
2184 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2185 entire ? 0 : obj_req->ex.oe_off,
2186 entire ? rbd_dev->layout.object_size :
2187 obj_req->ex.oe_len,
2188 &obj_req->img_extents,
2189 &obj_req->num_img_extents);
2190 if (ret)
2191 return ret;
e93f3152 2192
86bd7998
ID
2193 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2194 rbd_dev->parent_overlap);
2195 return 0;
e93f3152
AE
2196}
2197
bcbab1db 2198static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
1217857f 2199{
bcbab1db
ID
2200 struct rbd_obj_request *obj_req = osd_req->r_priv;
2201
ecc633ca 2202 switch (obj_req->img_request->data_type) {
3da691bf 2203 case OBJ_REQUEST_BIO:
bcbab1db 2204 osd_req_op_extent_osd_data_bio(osd_req, which,
3da691bf 2205 &obj_req->bio_pos,
43df3d35 2206 obj_req->ex.oe_len);
3da691bf
ID
2207 break;
2208 case OBJ_REQUEST_BVECS:
afb97888 2209 case OBJ_REQUEST_OWN_BVECS:
3da691bf 2210 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
43df3d35 2211 obj_req->ex.oe_len);
afb97888 2212 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
bcbab1db 2213 osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
3da691bf
ID
2214 &obj_req->bvec_pos);
2215 break;
2216 default:
16809372 2217 BUG();
1217857f 2218 }
3da691bf 2219}
1217857f 2220
bcbab1db 2221static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
3da691bf
ID
2222{
2223 struct page **pages;
8b3e1a56 2224
3da691bf
ID
2225 /*
2226 * The response data for a STAT call consists of:
2227 * le64 length;
2228 * struct {
2229 * le32 tv_sec;
2230 * le32 tv_nsec;
2231 * } mtime;
2232 */
2233 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2234 if (IS_ERR(pages))
2235 return PTR_ERR(pages);
2236
bcbab1db
ID
2237 osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2238 osd_req_op_raw_data_in_pages(osd_req, which, pages,
3da691bf
ID
2239 8 + sizeof(struct ceph_timespec),
2240 0, false, true);
2241 return 0;
1217857f
AE
2242}
2243
b5ae8cbc
ID
2244static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2245 u32 bytes)
2246{
2247 struct rbd_obj_request *obj_req = osd_req->r_priv;
2248 int ret;
2249
2250 ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2251 if (ret)
2252 return ret;
2253
2254 osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2255 obj_req->copyup_bvec_count, bytes);
2256 return 0;
2257}
2258
ea9b743c
ID
2259static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
2260{
2261 obj_req->read_state = RBD_OBJ_READ_START;
2262 return 0;
2263}
2264
bcbab1db
ID
2265static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2266 int which)
2169238d 2267{
bcbab1db 2268 struct rbd_obj_request *obj_req = osd_req->r_priv;
3da691bf
ID
2269 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2270 u16 opcode;
2169238d 2271
8b5bec5c
ID
2272 if (!use_object_map(rbd_dev) ||
2273 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2274 osd_req_op_alloc_hint_init(osd_req, which++,
2275 rbd_dev->layout.object_size,
d3798acc 2276 rbd_dev->layout.object_size,
dc1dad8e 2277 rbd_dev->opts->alloc_hint_flags);
8b5bec5c 2278 }
2169238d 2279
3da691bf
ID
2280 if (rbd_obj_is_entire(obj_req))
2281 opcode = CEPH_OSD_OP_WRITEFULL;
2282 else
2283 opcode = CEPH_OSD_OP_WRITE;
2169238d 2284
bcbab1db 2285 osd_req_op_extent_init(osd_req, which, opcode,
43df3d35 2286 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
bcbab1db 2287 rbd_osd_setup_data(osd_req, which);
3da691bf 2288}
2169238d 2289
ea9b743c 2290static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
3da691bf 2291{
3da691bf
ID
2292 int ret;
2293
86bd7998
ID
2294 /* reverse map the entire object onto the parent */
2295 ret = rbd_obj_calc_img_extents(obj_req, true);
2296 if (ret)
2297 return ret;
2298
0ad5d953
ID
2299 if (rbd_obj_copyup_enabled(obj_req))
2300 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2301
85b5e6d1 2302 obj_req->write_state = RBD_OBJ_WRITE_START;
3da691bf 2303 return 0;
2169238d
AE
2304}
2305
6484cbe9
ID
2306static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2307{
2308 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2309 CEPH_OSD_OP_ZERO;
2310}
2311
27bbd911
ID
2312static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2313 int which)
2314{
2315 struct rbd_obj_request *obj_req = osd_req->r_priv;
2316
2317 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2318 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2319 osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
13488d53 2320 } else {
27bbd911
ID
2321 osd_req_op_extent_init(osd_req, which,
2322 truncate_or_zero_opcode(obj_req),
2323 obj_req->ex.oe_off, obj_req->ex.oe_len,
2324 0, 0);
2325 }
2326}
2327
ea9b743c 2328static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
6484cbe9 2329{
0c93e1b7 2330 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
27bbd911 2331 u64 off, next_off;
6484cbe9
ID
2332 int ret;
2333
0c93e1b7
ID
2334 /*
2335 * Align the range to alloc_size boundary and punt on discards
2336 * that are too small to free up any space.
2337 *
2338 * alloc_size == object_size && is_tail() is a special case for
2339 * filestore with filestore_punch_hole = false, needed to allow
2340 * truncate (in addition to delete).
2341 */
2342 if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2343 !rbd_obj_is_tail(obj_req)) {
27bbd911
ID
2344 off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2345 next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2346 rbd_dev->opts->alloc_size);
0c93e1b7
ID
2347 if (off >= next_off)
2348 return 1;
27bbd911
ID
2349
2350 dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2351 obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2352 off, next_off - off);
2353 obj_req->ex.oe_off = off;
2354 obj_req->ex.oe_len = next_off - off;
0c93e1b7
ID
2355 }
2356
6484cbe9
ID
2357 /* reverse map the entire object onto the parent */
2358 ret = rbd_obj_calc_img_extents(obj_req, true);
2359 if (ret)
2360 return ret;
2361
22e8bd51 2362 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
0ad5d953
ID
2363 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2364 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2365
85b5e6d1 2366 obj_req->write_state = RBD_OBJ_WRITE_START;
6484cbe9
ID
2367 return 0;
2368}
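/*
 * Worked example of the alignment above: with alloc_size = 64K and a
 * discard of 100K~200K within one object, off = round_up(100K, 64K) =
 * 128K and next_off = round_down(300K, 64K) = 256K, so the request
 * shrinks to 128K~128K.  A discard too small to free a whole
 * allocation unit ends up with off >= next_off and is dropped
 * (return 1 -> rbd_img_obj_request_del() in __rbd_img_fill_request()).
 */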
2369
bcbab1db
ID
2370static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2371 int which)
3da691bf 2372{
bcbab1db 2373 struct rbd_obj_request *obj_req = osd_req->r_priv;
3b434a2a
JD
2374 u16 opcode;
2375
3da691bf 2376 if (rbd_obj_is_entire(obj_req)) {
86bd7998 2377 if (obj_req->num_img_extents) {
0ad5d953 2378 if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
bcbab1db 2379 osd_req_op_init(osd_req, which++,
9b17eb2c 2380 CEPH_OSD_OP_CREATE, 0);
3b434a2a
JD
2381 opcode = CEPH_OSD_OP_TRUNCATE;
2382 } else {
0ad5d953 2383 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
bcbab1db 2384 osd_req_op_init(osd_req, which++,
3da691bf
ID
2385 CEPH_OSD_OP_DELETE, 0);
2386 opcode = 0;
3b434a2a 2387 }
3b434a2a 2388 } else {
6484cbe9 2389 opcode = truncate_or_zero_opcode(obj_req);
3b434a2a
JD
2390 }
2391
3da691bf 2392 if (opcode)
bcbab1db 2393 osd_req_op_extent_init(osd_req, which, opcode,
43df3d35 2394 obj_req->ex.oe_off, obj_req->ex.oe_len,
3da691bf 2395 0, 0);
3b434a2a
JD
2396}
2397
ea9b743c 2398static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
bf0d5f50 2399{
3da691bf 2400 int ret;
37206ee5 2401
86bd7998
ID
2402 /* reverse map the entire object onto the parent */
2403 ret = rbd_obj_calc_img_extents(obj_req, true);
2404 if (ret)
2405 return ret;
f1a4739f 2406
0ad5d953
ID
2407 if (rbd_obj_copyup_enabled(obj_req))
2408 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2409 if (!obj_req->num_img_extents) {
22e8bd51 2410 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
0ad5d953
ID
2411 if (rbd_obj_is_entire(obj_req))
2412 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
3da691bf 2413 }
3b434a2a 2414
a086a1b8 2415 obj_req->write_state = RBD_OBJ_WRITE_START;
3da691bf
ID
2416 return 0;
2417}
9d4df01f 2418
a086a1b8
ID
2419static int count_write_ops(struct rbd_obj_request *obj_req)
2420{
8b5bec5c
ID
2421 struct rbd_img_request *img_req = obj_req->img_request;
2422
2423 switch (img_req->op_type) {
a086a1b8 2424 case OBJ_OP_WRITE:
8b5bec5c
ID
2425 if (!use_object_map(img_req->rbd_dev) ||
2426 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2427 return 2; /* setallochint + write/writefull */
2428
2429 return 1; /* write/writefull */
a086a1b8
ID
2430 case OBJ_OP_DISCARD:
2431 return 1; /* delete/truncate/zero */
2432 case OBJ_OP_ZEROOUT:
2433 if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2434 !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2435 return 2; /* create + truncate */
bf0d5f50 2436
a086a1b8
ID
2437 return 1; /* delete/truncate/zero */
2438 default:
2439 BUG();
3da691bf 2440 }
a086a1b8 2441}
3b434a2a 2442
a086a1b8
ID
2443static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2444 int which)
2445{
2446 struct rbd_obj_request *obj_req = osd_req->r_priv;
2447
2448 switch (obj_req->img_request->op_type) {
2449 case OBJ_OP_WRITE:
2450 __rbd_osd_setup_write_ops(osd_req, which);
2451 break;
2452 case OBJ_OP_DISCARD:
2453 __rbd_osd_setup_discard_ops(osd_req, which);
2454 break;
2455 case OBJ_OP_ZEROOUT:
2456 __rbd_osd_setup_zeroout_ops(osd_req, which);
2457 break;
2458 default:
2459 BUG();
2460 }
3da691bf 2461}
9d4df01f 2462
3da691bf 2463/*
a086a1b8
ID
2464 * Prune the list of object requests (adjust offset and/or length, drop
2465 * redundant requests). Prepare object request state machines and image
2466 * request state machine for execution.
3da691bf
ID
2467 */
2468static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2469{
0c93e1b7 2470 struct rbd_obj_request *obj_req, *next_obj_req;
3da691bf 2471 int ret;
430c28c3 2472
0c93e1b7 2473 for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
9bb0248d 2474 switch (img_req->op_type) {
3da691bf 2475 case OBJ_OP_READ:
ea9b743c 2476 ret = rbd_obj_init_read(obj_req);
3da691bf
ID
2477 break;
2478 case OBJ_OP_WRITE:
ea9b743c 2479 ret = rbd_obj_init_write(obj_req);
3da691bf
ID
2480 break;
2481 case OBJ_OP_DISCARD:
ea9b743c 2482 ret = rbd_obj_init_discard(obj_req);
3da691bf 2483 break;
6484cbe9 2484 case OBJ_OP_ZEROOUT:
ea9b743c 2485 ret = rbd_obj_init_zeroout(obj_req);
6484cbe9 2486 break;
3da691bf 2487 default:
16809372 2488 BUG();
3da691bf 2489 }
0c93e1b7 2490 if (ret < 0)
3da691bf 2491 return ret;
0c93e1b7 2492 if (ret > 0) {
0c93e1b7
ID
2493 rbd_img_obj_request_del(img_req, obj_req);
2494 continue;
2495 }
bf0d5f50
AE
2496 }
2497
0192ce2e 2498 img_req->state = RBD_IMG_START;
bf0d5f50 2499 return 0;
3da691bf 2500}
bf0d5f50 2501
5a237819
ID
2502union rbd_img_fill_iter {
2503 struct ceph_bio_iter bio_iter;
2504 struct ceph_bvec_iter bvec_iter;
2505};
bf0d5f50 2506
5a237819
ID
2507struct rbd_img_fill_ctx {
2508 enum obj_request_type pos_type;
2509 union rbd_img_fill_iter *pos;
2510 union rbd_img_fill_iter iter;
2511 ceph_object_extent_fn_t set_pos_fn;
afb97888
ID
2512 ceph_object_extent_fn_t count_fn;
2513 ceph_object_extent_fn_t copy_fn;
5a237819 2514};
bf0d5f50 2515
5a237819 2516static struct ceph_object_extent *alloc_object_extent(void *arg)
0eefd470 2517{
5a237819
ID
2518 struct rbd_img_request *img_req = arg;
2519 struct rbd_obj_request *obj_req;
0eefd470 2520
5a237819
ID
2521 obj_req = rbd_obj_request_create();
2522 if (!obj_req)
2523 return NULL;
2761713d 2524
5a237819
ID
2525 rbd_img_obj_request_add(img_req, obj_req);
2526 return &obj_req->ex;
2527}
0eefd470 2528
afb97888
ID
2529/*
2530 * While su != os && sc == 1 is technically not fancy (it's the same
2531 * layout as su == os && sc == 1), we can't use the nocopy path for it
2532 * because ->set_pos_fn() should be called only once per object.
2533 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2534 * treat su != os && sc == 1 as fancy.
2535 */
2536static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2537{
2538 return l->stripe_unit != l->object_size;
2539}
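/*
 * For example, with the default layout (su == os, sc == 1) each object
 * extent maps to one contiguous chunk of the bio, so recording a
 * starting position per object is enough.  With, say, su = 64K,
 * os = 4M, sc = 2, consecutive 64K stripe units alternate between two
 * objects, so one object's data is scattered across the bio and has to
 * be gathered into a private bio_vec array (OBJ_REQUEST_OWN_BVECS)
 * below.
 */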
0eefd470 2540
afb97888
ID
2541static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2542 struct ceph_file_extent *img_extents,
2543 u32 num_img_extents,
2544 struct rbd_img_fill_ctx *fctx)
2545{
2546 u32 i;
2547 int ret;
2548
2549 img_req->data_type = fctx->pos_type;
0eefd470
AE
2550
2551 /*
afb97888
ID
2552 * Create object requests and set each object request's starting
2553 * position in the provided bio (list) or bio_vec array.
0eefd470 2554 */
afb97888
ID
2555 fctx->iter = *fctx->pos;
2556 for (i = 0; i < num_img_extents; i++) {
2557 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2558 img_extents[i].fe_off,
2559 img_extents[i].fe_len,
2560 &img_req->object_extents,
2561 alloc_object_extent, img_req,
2562 fctx->set_pos_fn, &fctx->iter);
2563 if (ret)
2564 return ret;
2565 }
0eefd470 2566
afb97888 2567 return __rbd_img_fill_request(img_req);
0eefd470
AE
2568}
2569
5a237819
ID
2570/*
2571 * Map a list of image extents to a list of object extents, create the
2572 * corresponding object requests (normally each to a different object,
2573 * but not always) and add them to @img_req. For each object request,
afb97888 2574 * set up its data descriptor to point to the corresponding chunk(s) of
5a237819
ID
2575 * @fctx->pos data buffer.
2576 *
afb97888
ID
2577 * Because ceph_file_to_extents() will merge adjacent object extents
2578 * together, each object request's data descriptor may point to multiple
2579 * different chunks of @fctx->pos data buffer.
2580 *
5a237819
ID
2581 * @fctx->pos data buffer is assumed to be large enough.
2582 */
2583static int rbd_img_fill_request(struct rbd_img_request *img_req,
2584 struct ceph_file_extent *img_extents,
2585 u32 num_img_extents,
2586 struct rbd_img_fill_ctx *fctx)
3d7efd18 2587{
afb97888
ID
2588 struct rbd_device *rbd_dev = img_req->rbd_dev;
2589 struct rbd_obj_request *obj_req;
5a237819
ID
2590 u32 i;
2591 int ret;
2592
afb97888
ID
2593 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2594 !rbd_layout_is_fancy(&rbd_dev->layout))
2595 return rbd_img_fill_request_nocopy(img_req, img_extents,
2596 num_img_extents, fctx);
3d7efd18 2597
afb97888 2598 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
0eefd470 2599
bbea1c1a 2600 /*
afb97888
ID
2601 * Create object requests and determine ->bvec_count for each object
2602 * request. Note that the sum of ->bvec_count over all requests may
2603 * be greater than the number of bio_vecs in the provided bio (list)
2604 * or bio_vec array because when mapped, those bio_vecs can straddle
2605 * stripe unit boundaries.
bbea1c1a 2606 */
5a237819
ID
2607 fctx->iter = *fctx->pos;
2608 for (i = 0; i < num_img_extents; i++) {
afb97888 2609 ret = ceph_file_to_extents(&rbd_dev->layout,
5a237819
ID
2610 img_extents[i].fe_off,
2611 img_extents[i].fe_len,
2612 &img_req->object_extents,
2613 alloc_object_extent, img_req,
afb97888
ID
2614 fctx->count_fn, &fctx->iter);
2615 if (ret)
2616 return ret;
bbea1c1a 2617 }
0eefd470 2618
afb97888
ID
2619 for_each_obj_request(img_req, obj_req) {
2620 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2621 sizeof(*obj_req->bvec_pos.bvecs),
2622 GFP_NOIO);
2623 if (!obj_req->bvec_pos.bvecs)
2624 return -ENOMEM;
2625 }
0eefd470 2626
8785b1d4 2627 /*
afb97888
ID
2628 * Fill in each object request's private bio_vec array, splitting and
2629 * rearranging the provided bio_vecs in stripe unit chunks as needed.
8785b1d4 2630 */
afb97888
ID
2631 fctx->iter = *fctx->pos;
2632 for (i = 0; i < num_img_extents; i++) {
2633 ret = ceph_iterate_extents(&rbd_dev->layout,
2634 img_extents[i].fe_off,
2635 img_extents[i].fe_len,
2636 &img_req->object_extents,
2637 fctx->copy_fn, &fctx->iter);
5a237819
ID
2638 if (ret)
2639 return ret;
2640 }
3d7efd18 2641
5a237819
ID
2642 return __rbd_img_fill_request(img_req);
2643}
2644
2645static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2646 u64 off, u64 len)
2647{
2648 struct ceph_file_extent ex = { off, len };
a55e601b 2649 union rbd_img_fill_iter dummy = {};
5a237819
ID
2650 struct rbd_img_fill_ctx fctx = {
2651 .pos_type = OBJ_REQUEST_NODATA,
2652 .pos = &dummy,
2653 };
2654
2655 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2656}
2657
2658static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2659{
2660 struct rbd_obj_request *obj_req =
2661 container_of(ex, struct rbd_obj_request, ex);
2662 struct ceph_bio_iter *it = arg;
3d7efd18 2663
5a237819
ID
2664 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2665 obj_req->bio_pos = *it;
2666 ceph_bio_iter_advance(it, bytes);
2667}
3d7efd18 2668
afb97888
ID
2669static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2670{
2671 struct rbd_obj_request *obj_req =
2672 container_of(ex, struct rbd_obj_request, ex);
2673 struct ceph_bio_iter *it = arg;
0eefd470 2674
afb97888
ID
2675 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2676 ceph_bio_iter_advance_step(it, bytes, ({
2677 obj_req->bvec_count++;
2678 }));
0eefd470 2679
afb97888 2680}
0eefd470 2681
afb97888
ID
2682static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2683{
2684 struct rbd_obj_request *obj_req =
2685 container_of(ex, struct rbd_obj_request, ex);
2686 struct ceph_bio_iter *it = arg;
0eefd470 2687
afb97888
ID
2688 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2689 ceph_bio_iter_advance_step(it, bytes, ({
2690 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2691 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2692 }));
3d7efd18
AE
2693}
2694
5a237819
ID
2695static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2696 struct ceph_file_extent *img_extents,
2697 u32 num_img_extents,
2698 struct ceph_bio_iter *bio_pos)
2699{
2700 struct rbd_img_fill_ctx fctx = {
2701 .pos_type = OBJ_REQUEST_BIO,
2702 .pos = (union rbd_img_fill_iter *)bio_pos,
2703 .set_pos_fn = set_bio_pos,
afb97888
ID
2704 .count_fn = count_bio_bvecs,
2705 .copy_fn = copy_bio_bvecs,
5a237819 2706 };
3d7efd18 2707
5a237819
ID
2708 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2709 &fctx);
2710}
3d7efd18 2711
5a237819
ID
2712static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2713 u64 off, u64 len, struct bio *bio)
2714{
2715 struct ceph_file_extent ex = { off, len };
2716 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
3d7efd18 2717
5a237819
ID
2718 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2719}
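/*
 * A hedged usage sketch: the block-layer entry point (elsewhere in
 * this file) drives a read or write roughly like this --
 *
 *	rbd_img_request_init(img_req, rbd_dev, op_type);
 *	rbd_img_capture_header(img_req);	 under header_rwsem
 *	ret = rbd_img_fill_from_bio(img_req, off, len, rq->bio);
 *	if (!ret)
 *		rbd_img_handle_request(img_req, 0);
 *
 * (rbd_img_fill_nodata() is used instead for discard/zeroout, which
 * carry no data payload.)
 */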
a9e8ba2c 2720
5a237819
ID
2721static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2722{
2723 struct rbd_obj_request *obj_req =
2724 container_of(ex, struct rbd_obj_request, ex);
2725 struct ceph_bvec_iter *it = arg;
3d7efd18 2726
5a237819
ID
2727 obj_req->bvec_pos = *it;
2728 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2729 ceph_bvec_iter_advance(it, bytes);
2730}
3d7efd18 2731
afb97888
ID
2732static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2733{
2734 struct rbd_obj_request *obj_req =
2735 container_of(ex, struct rbd_obj_request, ex);
2736 struct ceph_bvec_iter *it = arg;
058aa991 2737
afb97888
ID
2738 ceph_bvec_iter_advance_step(it, bytes, ({
2739 obj_req->bvec_count++;
2740 }));
2741}
058aa991 2742
afb97888
ID
2743static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2744{
2745 struct rbd_obj_request *obj_req =
2746 container_of(ex, struct rbd_obj_request, ex);
2747 struct ceph_bvec_iter *it = arg;
3d7efd18 2748
afb97888
ID
2749 ceph_bvec_iter_advance_step(it, bytes, ({
2750 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2751 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2752 }));
3d7efd18
AE
2753}
2754
5a237819
ID
2755static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2756 struct ceph_file_extent *img_extents,
2757 u32 num_img_extents,
2758 struct ceph_bvec_iter *bvec_pos)
c5b5ef6c 2759{
5a237819
ID
2760 struct rbd_img_fill_ctx fctx = {
2761 .pos_type = OBJ_REQUEST_BVECS,
2762 .pos = (union rbd_img_fill_iter *)bvec_pos,
2763 .set_pos_fn = set_bvec_pos,
afb97888
ID
2764 .count_fn = count_bvecs,
2765 .copy_fn = copy_bvecs,
5a237819 2766 };
c5b5ef6c 2767
5a237819
ID
2768 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2769 &fctx);
2770}
c5b5ef6c 2771
5a237819
ID
2772static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2773 struct ceph_file_extent *img_extents,
2774 u32 num_img_extents,
2775 struct bio_vec *bvecs)
2776{
2777 struct ceph_bvec_iter it = {
2778 .bvecs = bvecs,
2779 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2780 num_img_extents) },
2781 };
c5b5ef6c 2782
5a237819
ID
2783 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2784 &it);
2785}
c5b5ef6c 2786
0192ce2e 2787static void rbd_img_handle_request_work(struct work_struct *work)
bf0d5f50 2788{
0192ce2e
ID
2789 struct rbd_img_request *img_req =
2790 container_of(work, struct rbd_img_request, work);
c5b5ef6c 2791
0192ce2e
ID
2792 rbd_img_handle_request(img_req, img_req->work_result);
2793}
c2e82414 2794
0192ce2e
ID
2795static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2796{
2797 INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2798 img_req->work_result = result;
2799 queue_work(rbd_wq, &img_req->work);
c5b5ef6c 2800}
c2e82414 2801
22e8bd51
ID
2802static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2803{
2804 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2805
2806 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2807 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2808 return true;
2809 }
2810
2811 dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2812 obj_req->ex.oe_objno);
2813 return false;
2814}
2815
85b5e6d1
ID
2816static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2817{
a086a1b8
ID
2818 struct ceph_osd_request *osd_req;
2819 int ret;
2820
2821 osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2822 if (IS_ERR(osd_req))
2823 return PTR_ERR(osd_req);
2824
2825 osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2826 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2827 rbd_osd_setup_data(osd_req, 0);
2828 rbd_osd_format_read(osd_req);
2829
2830 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2831 if (ret)
2832 return ret;
2833
2834 rbd_osd_submit(osd_req);
85b5e6d1 2835 return 0;
c5b5ef6c
AE
2836}
2837
86bd7998 2838static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
c5b5ef6c 2839{
3da691bf 2840 struct rbd_img_request *img_req = obj_req->img_request;
a52cc685 2841 struct rbd_device *parent = img_req->rbd_dev->parent;
3da691bf 2842 struct rbd_img_request *child_img_req;
c5b5ef6c
AE
2843 int ret;
2844
59e542c8 2845 child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
3da691bf 2846 if (!child_img_req)
710214e3
ID
2847 return -ENOMEM;
2848
59e542c8 2849 rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
e93aca0a
ID
2850 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2851 child_img_req->obj_request = obj_req;
a90bb0c1 2852
a52cc685
ID
2853 down_read(&parent->header_rwsem);
2854 rbd_img_capture_header(child_img_req);
2855 up_read(&parent->header_rwsem);
2856
21ed05a8
ID
2857 dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2858 obj_req);
2859
3da691bf 2860 if (!rbd_img_is_write(img_req)) {
ecc633ca 2861 switch (img_req->data_type) {
3da691bf 2862 case OBJ_REQUEST_BIO:
5a237819
ID
2863 ret = __rbd_img_fill_from_bio(child_img_req,
2864 obj_req->img_extents,
2865 obj_req->num_img_extents,
2866 &obj_req->bio_pos);
3da691bf
ID
2867 break;
2868 case OBJ_REQUEST_BVECS:
afb97888 2869 case OBJ_REQUEST_OWN_BVECS:
5a237819
ID
2870 ret = __rbd_img_fill_from_bvecs(child_img_req,
2871 obj_req->img_extents,
2872 obj_req->num_img_extents,
2873 &obj_req->bvec_pos);
3da691bf
ID
2874 break;
2875 default:
d342a15b 2876 BUG();
3da691bf
ID
2877 }
2878 } else {
5a237819
ID
2879 ret = rbd_img_fill_from_bvecs(child_img_req,
2880 obj_req->img_extents,
2881 obj_req->num_img_extents,
2882 obj_req->copyup_bvecs);
3da691bf
ID
2883 }
2884 if (ret) {
679a97d2 2885 rbd_img_request_destroy(child_img_req);
3da691bf
ID
2886 return ret;
2887 }
2888
0192ce2e
ID
2889 /* avoid parent chain recursion */
2890 rbd_img_schedule(child_img_req, 0);
3da691bf
ID
2891 return 0;
2892}
2893
85b5e6d1 2894static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
3da691bf
ID
2895{
2896 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2897 int ret;
2898
22e8bd51 2899again:
a9b67e69 2900 switch (obj_req->read_state) {
85b5e6d1
ID
2901 case RBD_OBJ_READ_START:
2902 rbd_assert(!*result);
2903
22e8bd51
ID
2904 if (!rbd_obj_may_exist(obj_req)) {
2905 *result = -ENOENT;
2906 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2907 goto again;
2908 }
2909
85b5e6d1 2910 ret = rbd_obj_read_object(obj_req);
3da691bf 2911 if (ret) {
85b5e6d1 2912 *result = ret;
3da691bf
ID
2913 return true;
2914 }
85b5e6d1
ID
2915 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2916 return false;
a9b67e69
ID
2917 case RBD_OBJ_READ_OBJECT:
2918 if (*result == -ENOENT && rbd_dev->parent_overlap) {
2919 /* reverse map this object extent onto the parent */
2920 ret = rbd_obj_calc_img_extents(obj_req, false);
86bd7998 2921 if (ret) {
54ab3b24 2922 *result = ret;
86bd7998
ID
2923 return true;
2924 }
a9b67e69
ID
2925 if (obj_req->num_img_extents) {
2926 ret = rbd_obj_read_from_parent(obj_req);
2927 if (ret) {
2928 *result = ret;
2929 return true;
2930 }
2931 obj_req->read_state = RBD_OBJ_READ_PARENT;
2932 return false;
2933 }
86bd7998 2934 }
710214e3 2935
a9b67e69
ID
2936 /*
2937 * -ENOENT means a hole in the image -- zero-fill the entire
2938 * length of the request. A short read also implies zero-fill
2939 * to the end of the request.
2940 */
2941 if (*result == -ENOENT) {
2942 rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
2943 *result = 0;
2944 } else if (*result >= 0) {
2945 if (*result < obj_req->ex.oe_len)
2946 rbd_obj_zero_range(obj_req, *result,
2947 obj_req->ex.oe_len - *result);
2948 else
2949 rbd_assert(*result == obj_req->ex.oe_len);
2950 *result = 0;
2951 }
2952 return true;
2953 case RBD_OBJ_READ_PARENT:
d435c9a7
ID
2954 /*
2955 * The parent image is read only up to the overlap -- zero-fill
2956 * from the overlap to the end of the request.
2957 */
2958 if (!*result) {
2959 u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2960
2961 if (obj_overlap < obj_req->ex.oe_len)
2962 rbd_obj_zero_range(obj_req, obj_overlap,
2963 obj_req->ex.oe_len - obj_overlap);
2964 }
a9b67e69
ID
2965 return true;
2966 default:
2967 BUG();
710214e3 2968 }
3da691bf 2969}
c5b5ef6c 2970
22e8bd51
ID
2971static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
2972{
2973 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2974
2975 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
2976 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2977
2978 if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
2979 (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
2980 dout("%s %p noop for nonexistent\n", __func__, obj_req);
2981 return true;
2982 }
2983
2984 return false;
2985}
2986
2987/*
2988 * Return:
2989 * 0 - object map update sent
2990 * 1 - object map update isn't needed
2991 * <0 - error
2992 */
2993static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
2994{
2995 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2996 u8 new_state;
2997
2998 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
2999 return 1;
3000
3001 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3002 new_state = OBJECT_PENDING;
3003 else
3004 new_state = OBJECT_EXISTS;
3005
3006 return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
3007}
3008
85b5e6d1
ID
3009static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
3010{
a086a1b8
ID
3011 struct ceph_osd_request *osd_req;
3012 int num_ops = count_write_ops(obj_req);
3013 int which = 0;
3014 int ret;
710214e3 3015
a086a1b8
ID
3016 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3017 num_ops++; /* stat */
3018
3019 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3020 if (IS_ERR(osd_req))
3021 return PTR_ERR(osd_req);
3022
3023 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3024 ret = rbd_osd_setup_stat(osd_req, which++);
3025 if (ret)
3026 return ret;
710214e3 3027 }
c5b5ef6c 3028
a086a1b8
ID
3029 rbd_osd_setup_write_ops(osd_req, which);
3030 rbd_osd_format_write(osd_req);
3031
3032 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3033 if (ret)
3034 return ret;
3035
3036 rbd_osd_submit(osd_req);
85b5e6d1 3037 return 0;
3da691bf 3038}
c5b5ef6c 3039
3da691bf
ID
3040/*
3041 * copyup_bvecs pages are never highmem pages
3042 */
3043static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3044{
3045 struct ceph_bvec_iter it = {
3046 .bvecs = bvecs,
3047 .iter = { .bi_size = bytes },
3048 };
c5b5ef6c 3049
3da691bf
ID
3050 ceph_bvec_iter_advance_step(&it, bytes, ({
3051 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3052 bv.bv_len))
3053 return false;
3054 }));
3055 return true;
c5b5ef6c
AE
3056}
3057
3a482501
ID
3058#define MODS_ONLY U32_MAX
3059
793333a3
ID
3060static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3061 u32 bytes)
b454e36d 3062{
bcbab1db 3063 struct ceph_osd_request *osd_req;
fe943d50 3064 int ret;
70d045f6 3065
3da691bf 3066 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
89a59c1c 3067 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
70d045f6 3068
bcbab1db
ID
3069 osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3070 if (IS_ERR(osd_req))
3071 return PTR_ERR(osd_req);
b454e36d 3072
b5ae8cbc 3073 ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
fe943d50
CX
3074 if (ret)
3075 return ret;
3076
bcbab1db 3077 rbd_osd_format_write(osd_req);
3da691bf 3078
bcbab1db 3079 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
89a59c1c
ID
3080 if (ret)
3081 return ret;
3082
a086a1b8 3083 rbd_osd_submit(osd_req);
89a59c1c
ID
3084 return 0;
3085}
3086
793333a3
ID
3087static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3088 u32 bytes)
b454e36d 3089{
bcbab1db 3090 struct ceph_osd_request *osd_req;
a086a1b8
ID
3091 int num_ops = count_write_ops(obj_req);
3092 int which = 0;
fe943d50 3093 int ret;
70d045f6 3094
3da691bf 3095 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
70d045f6 3096
a086a1b8
ID
3097 if (bytes != MODS_ONLY)
3098 num_ops++; /* copyup */
13488d53 3099
a086a1b8 3100 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
bcbab1db
ID
3101 if (IS_ERR(osd_req))
3102 return PTR_ERR(osd_req);
b454e36d 3103
3a482501 3104 if (bytes != MODS_ONLY) {
b5ae8cbc 3105 ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
3a482501
ID
3106 if (ret)
3107 return ret;
3da691bf 3108 }
3da691bf 3109
a086a1b8
ID
3110 rbd_osd_setup_write_ops(osd_req, which);
3111 rbd_osd_format_write(osd_req);
70d045f6 3112
bcbab1db 3113 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
26f887e0
ID
3114 if (ret)
3115 return ret;
3116
a086a1b8 3117 rbd_osd_submit(osd_req);
3da691bf 3118 return 0;
70d045f6
ID
3119}
3120
7e07efb1 3121static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
70d045f6 3122{
7e07efb1 3123 u32 i;
b454e36d 3124
7e07efb1
ID
3125 rbd_assert(!obj_req->copyup_bvecs);
3126 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3127 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3128 sizeof(*obj_req->copyup_bvecs),
3129 GFP_NOIO);
3130 if (!obj_req->copyup_bvecs)
3131 return -ENOMEM;
b454e36d 3132
7e07efb1
ID
3133 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3134 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3135
3136 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3137 if (!obj_req->copyup_bvecs[i].bv_page)
3138 return -ENOMEM;
3d7efd18 3139
7e07efb1
ID
3140 obj_req->copyup_bvecs[i].bv_offset = 0;
3141 obj_req->copyup_bvecs[i].bv_len = len;
3142 obj_overlap -= len;
3143 }
b454e36d 3144
7e07efb1
ID
3145 rbd_assert(!obj_overlap);
3146 return 0;
b454e36d
AE
3147}
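/*
 * Worked example: for obj_overlap = 9000 bytes and PAGE_SIZE = 4096,
 * calc_pages_for(0, 9000) = 3, so three bvecs are set up covering
 * 4096 + 4096 + 808 bytes, each backed by its own zero-offset page.
 */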
3148
0ad5d953
ID
3149/*
3150 * The target object doesn't exist. Read the data for the entire
3151 * target object up to the overlap point (if any) from the parent,
3152 * so we can use it for a copyup.
3153 */
793333a3 3154static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
bf0d5f50 3155{
3da691bf 3156 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3da691bf 3157 int ret;
bf0d5f50 3158
86bd7998
ID
3159 rbd_assert(obj_req->num_img_extents);
3160 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3161 rbd_dev->parent_overlap);
3162 if (!obj_req->num_img_extents) {
3da691bf
ID
3163 /*
3164 * The overlap has become 0 (most likely because the
3a482501
ID
3165 * image has been flattened). Re-submit the original write
3166 * request -- pass MODS_ONLY since the copyup isn't needed
3167 * anymore.
3da691bf 3168 */
793333a3 3169 return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
bf0d5f50
AE
3170 }
3171
86bd7998 3172 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
3da691bf
ID
3173 if (ret)
3174 return ret;
3175
86bd7998 3176 return rbd_obj_read_from_parent(obj_req);
bf0d5f50 3177}
8b3e1a56 3178
22e8bd51
ID
3179static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
3180{
3181 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3182 struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3183 u8 new_state;
3184 u32 i;
3185 int ret;
3186
3187 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3188
3189 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3190 return;
3191
3192 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3193 return;
3194
3195 for (i = 0; i < snapc->num_snaps; i++) {
3196 if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3197 i + 1 < snapc->num_snaps)
3198 new_state = OBJECT_EXISTS_CLEAN;
3199 else
3200 new_state = OBJECT_EXISTS;
3201
3202 ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3203 new_state, NULL);
3204 if (ret < 0) {
3205 obj_req->pending.result = ret;
3206 return;
3207 }
3208
3209 rbd_assert(!ret);
3210 obj_req->pending.num_pending++;
3211 }
3212}
3213
793333a3
ID
3214static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3215{
3216 u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3217 int ret;
3218
3219 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3220
3221 /*
3222 * Only send non-zero copyup data to save some I/O and network
3223 * bandwidth -- zero copyup data is equivalent to the object not
3224 * existing.
3225 */
3226 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3227 bytes = 0;
3228
3229 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3230 /*
3231 * Send a copyup request with an empty snapshot context to
3232 * deep-copyup the object through all existing snapshots.
3233 * A second request with the current snapshot context will be
3234 * sent for the actual modification.
3235 */
3236 ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3237 if (ret) {
3238 obj_req->pending.result = ret;
3239 return;
3240 }
3241
3242 obj_req->pending.num_pending++;
3243 bytes = MODS_ONLY;
3244 }
3245
3246 ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3247 if (ret) {
3248 obj_req->pending.result = ret;
3249 return;
3250 }
3251
3252 obj_req->pending.num_pending++;
3253}
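/*
 * The resulting request sequence, summarized: for a clone with
 * snapshots and non-zero parent data, request 1 is a bare copyup with
 * an empty snapc (deep-copyup through all existing snapshots) and
 * request 2 carries only the write ops (MODS_ONLY) under the current
 * snapc.  Without snapshots, a single request carries the copyup
 * payload (possibly zero-length) together with the write ops.
 */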
3254
3255static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3256{
22e8bd51 3257 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
793333a3
ID
3258 int ret;
3259
3260again:
3261 switch (obj_req->copyup_state) {
3262 case RBD_OBJ_COPYUP_START:
3263 rbd_assert(!*result);
3264
3265 ret = rbd_obj_copyup_read_parent(obj_req);
3266 if (ret) {
3267 *result = ret;
3268 return true;
3269 }
3270 if (obj_req->num_img_extents)
3271 obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3272 else
3273 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3274 return false;
3275 case RBD_OBJ_COPYUP_READ_PARENT:
3276 if (*result)
3277 return true;
3278
3279 if (is_zero_bvecs(obj_req->copyup_bvecs,
3280 rbd_obj_img_extents_bytes(obj_req))) {
3281 dout("%s %p detected zeros\n", __func__, obj_req);
3282 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3283 }
3284
22e8bd51
ID
3285 rbd_obj_copyup_object_maps(obj_req);
3286 if (!obj_req->pending.num_pending) {
3287 *result = obj_req->pending.result;
3288 obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3289 goto again;
3290 }
3291 obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3292 return false;
3293 case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3294 if (!pending_result_dec(&obj_req->pending, result))
3295 return false;
df561f66 3296 fallthrough;
22e8bd51
ID
3297 case RBD_OBJ_COPYUP_OBJECT_MAPS:
3298 if (*result) {
3299 rbd_warn(rbd_dev, "snap object map update failed: %d",
3300 *result);
3301 return true;
3302 }
3303
793333a3
ID
3304 rbd_obj_copyup_write_object(obj_req);
3305 if (!obj_req->pending.num_pending) {
3306 *result = obj_req->pending.result;
3307 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3308 goto again;
3309 }
3310 obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3311 return false;
3312 case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3313 if (!pending_result_dec(&obj_req->pending, result))
3314 return false;
df561f66 3315 fallthrough;
793333a3
ID
3316 case RBD_OBJ_COPYUP_WRITE_OBJECT:
3317 return true;
3318 default:
3319 BUG();
3320 }
3321}
3322
22e8bd51
ID
3323/*
3324 * Return:
3325 * 0 - object map update sent
3326 * 1 - object map update isn't needed
3327 * <0 - error
3328 */
3329static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
3330{
3331 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3332 u8 current_state = OBJECT_PENDING;
3333
3334 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3335 return 1;
3336
3337 if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3338 return 1;
3339
3340 return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3341 &current_state);
3342}
3343
85b5e6d1 3344static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
8b3e1a56 3345{
793333a3 3346 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3da691bf 3347 int ret;
8b3e1a56 3348
793333a3 3349again:
3da691bf 3350 switch (obj_req->write_state) {
85b5e6d1
ID
3351 case RBD_OBJ_WRITE_START:
3352 rbd_assert(!*result);
3353
22e8bd51
ID
3354 if (rbd_obj_write_is_noop(obj_req))
3355 return true;
3356
3357 ret = rbd_obj_write_pre_object_map(obj_req);
3358 if (ret < 0) {
3359 *result = ret;
3360 return true;
3361 }
3362 obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3363 if (ret > 0)
3364 goto again;
3365 return false;
3366 case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3367 if (*result) {
3368 rbd_warn(rbd_dev, "pre object map update failed: %d",
3369 *result);
3370 return true;
3371 }
85b5e6d1
ID
3372 ret = rbd_obj_write_object(obj_req);
3373 if (ret) {
3374 *result = ret;
3375 return true;
3376 }
3377 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3378 return false;
0ad5d953 3379 case RBD_OBJ_WRITE_OBJECT:
54ab3b24 3380 if (*result == -ENOENT) {
0ad5d953 3381 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
793333a3
ID
3382 *result = 0;
3383 obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3384 obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3385 goto again;
0ad5d953 3386 }
3da691bf 3387 /*
0ad5d953
ID
3388 * On a non-existent object:
3389 * a delete returns -ENOENT, truncate/zero return 0
3da691bf 3390 */
0ad5d953
ID
3391 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3392 *result = 0;
3da691bf 3393 }
a9b67e69 3394 if (*result)
3a482501 3395 return true;
8b3e1a56 3396
793333a3
ID
3397 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3398 goto again;
3399 case __RBD_OBJ_WRITE_COPYUP:
3400 if (!rbd_obj_advance_copyup(obj_req, result))
3401 return false;
df561f66 3402 fallthrough;
793333a3 3403 case RBD_OBJ_WRITE_COPYUP:
22e8bd51 3404 if (*result) {
793333a3 3405 rbd_warn(rbd_dev, "copyup failed: %d", *result);
22e8bd51
ID
3406 return true;
3407 }
3408 ret = rbd_obj_write_post_object_map(obj_req);
3409 if (ret < 0) {
3410 *result = ret;
3411 return true;
3412 }
3413 obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3414 if (ret > 0)
3415 goto again;
3416 return false;
3417 case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3418 if (*result)
3419 rbd_warn(rbd_dev, "post object map update failed: %d",
3420 *result);
793333a3 3421 return true;
3da691bf 3422 default:
c6244b3b 3423 BUG();
3da691bf
ID
3424 }
3425}
02c74fba 3426
3da691bf 3427/*
0ad5d953 3428 * Return true if @obj_req is completed.
3da691bf 3429 */
54ab3b24
ID
3430static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3431 int *result)
3da691bf 3432{
0ad5d953 3433 struct rbd_img_request *img_req = obj_req->img_request;
0192ce2e 3434 struct rbd_device *rbd_dev = img_req->rbd_dev;
0ad5d953
ID
3435 bool done;
3436
85b5e6d1 3437 mutex_lock(&obj_req->state_mutex);
0ad5d953 3438 if (!rbd_img_is_write(img_req))
85b5e6d1 3439 done = rbd_obj_advance_read(obj_req, result);
0ad5d953 3440 else
85b5e6d1
ID
3441 done = rbd_obj_advance_write(obj_req, result);
3442 mutex_unlock(&obj_req->state_mutex);
0ad5d953 3443
0192ce2e
ID
3444 if (done && *result) {
3445 rbd_assert(*result < 0);
3446 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3447 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3448 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3449 }
0ad5d953 3450 return done;
3da691bf 3451}
02c74fba 3452
0192ce2e
ID
3453/*
3454 * This is open-coded in rbd_img_handle_request() to avoid parent chain
3455 * recursion.
3456 */
3457static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3458{
3459 if (__rbd_obj_handle_request(obj_req, &result))
3460 rbd_img_handle_request(obj_req->img_request, result);
3461}
3462
static bool need_exclusive_lock(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
		return false;

	if (rbd_is_ro(rbd_dev))
		return false;

	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
	if (rbd_dev->opts->lock_on_read ||
	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
		return true;

	return rbd_img_is_write(img_req);
}

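/*
 * Track @img_req on the appropriate lock list: acquiring_list while
 * the exclusive lock is being acquired, running_list once it is held.
 * Returns true if the lock is already held.
 */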
static bool rbd_lock_add_request(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool locked;

	lockdep_assert_held(&rbd_dev->lock_rwsem);
	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
	spin_lock(&rbd_dev->lock_lists_lock);
	rbd_assert(list_empty(&img_req->lock_item));
	if (!locked)
		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
	else
		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
	spin_unlock(&rbd_dev->lock_lists_lock);
	return locked;
}

static void rbd_lock_del_request(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool need_wakeup;

	lockdep_assert_held(&rbd_dev->lock_rwsem);
	spin_lock(&rbd_dev->lock_lists_lock);
	rbd_assert(!list_empty(&img_req->lock_item));
	list_del_init(&img_req->lock_item);
	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
		       list_empty(&rbd_dev->running_list));
	spin_unlock(&rbd_dev->lock_lists_lock);
	if (need_wakeup)
		complete(&rbd_dev->releasing_wait);
}

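/*
 * Return:
 *   1 - caller may proceed (lock not needed or already held)
 *   0 - lock acquisition has been queued, caller should wait
 *  <0 - error (-EROFS if the lock was lost on an exclusive mapping)
 */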
static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;

	if (!need_exclusive_lock(img_req))
		return 1;

	if (rbd_lock_add_request(img_req))
		return 1;

	if (rbd_dev->opts->exclusive) {
		WARN_ON(1); /* lock got released? */
		return -EROFS;
	}

	/*
	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
	 * and cancel_delayed_work() in wake_lock_waiters().
	 */
	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	return 0;
}

static void rbd_img_object_requests(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req;

	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);

	for_each_obj_request(img_req, obj_req) {
		int result = 0;

		if (__rbd_obj_handle_request(obj_req, &result)) {
			if (result) {
				img_req->pending.result = result;
				return;
			}
		} else {
			img_req->pending.num_pending++;
		}
	}
}

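/*
 * Image request state machine:
 *
 *   RBD_IMG_START -> RBD_IMG_EXCLUSIVE_LOCK ->
 *   [__RBD_IMG_OBJECT_REQUESTS ->] RBD_IMG_OBJECT_REQUESTS -> done
 *
 * __RBD_IMG_OBJECT_REQUESTS is skipped if all object requests complete
 * synchronously in rbd_img_object_requests().
 */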
static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	int ret;

again:
	switch (img_req->state) {
	case RBD_IMG_START:
		rbd_assert(!*result);

		ret = rbd_img_exclusive_lock(img_req);
		if (ret < 0) {
			*result = ret;
			return true;
		}
		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
		if (ret > 0)
			goto again;
		return false;
	case RBD_IMG_EXCLUSIVE_LOCK:
		if (*result)
			return true;

		rbd_assert(!need_exclusive_lock(img_req) ||
			   __rbd_is_lock_owner(rbd_dev));

		rbd_img_object_requests(img_req);
		if (!img_req->pending.num_pending) {
			*result = img_req->pending.result;
			img_req->state = RBD_IMG_OBJECT_REQUESTS;
			goto again;
		}
		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
		return false;
	case __RBD_IMG_OBJECT_REQUESTS:
		if (!pending_result_dec(&img_req->pending, result))
			return false;
		fallthrough;
	case RBD_IMG_OBJECT_REQUESTS:
		return true;
	default:
		BUG();
	}
}

/*
 * Return true if @img_req is completed.
 */
static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
				     int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	if (need_exclusive_lock(img_req)) {
		down_read(&rbd_dev->lock_rwsem);
		mutex_lock(&img_req->state_mutex);
		done = rbd_img_advance(img_req, result);
		if (done)
			rbd_lock_del_request(img_req);
		mutex_unlock(&img_req->state_mutex);
		up_read(&rbd_dev->lock_rwsem);
	} else {
		mutex_lock(&img_req->state_mutex);
		done = rbd_img_advance(img_req, result);
		mutex_unlock(&img_req->state_mutex);
	}

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s%s result %d",
			 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
			 obj_op_name(img_req->op_type), *result);
	}
	return done;
}

static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
{
again:
	if (!__rbd_img_handle_request(img_req, &result))
		return;

	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		struct rbd_obj_request *obj_req = img_req->obj_request;

		rbd_img_request_destroy(img_req);
		if (__rbd_obj_handle_request(obj_req, &result)) {
			img_req = obj_req->img_request;
			goto again;
		}
	} else {
		struct request *rq = blk_mq_rq_from_pdu(img_req);

		rbd_img_request_destroy(img_req);
		blk_mq_end_request(rq, errno_to_blk_status(result));
	}
}

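/*
 * Exclusive lock ownership is tracked by a client id: the client's
 * global id (gid) plus the watch cookie (handle).  rbd_empty_cid (all
 * zeroes) means "no known owner".
 */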
static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}

static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}

static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	__rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

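/*
 * Ask the current lock owner to release the lock.  Returns the owner's
 * decoded ResponseMessage result (0 if the owner acked, -EROFS if it
 * refused), -ETIMEDOUT if no owner responded, or -EIO if more than one
 * peer claimed ownership.
 */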
static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

/*
 * Wake up whoever is waiting for the lock: either image request state
 * machine(s) or rbd_add_acquire_lock() (i.e. "rbd map").
 */
static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
{
	struct rbd_img_request *img_req;

	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	lockdep_assert_held_write(&rbd_dev->lock_rwsem);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (!completion_done(&rbd_dev->acquire_wait)) {
		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
			   list_empty(&rbd_dev->running_list));
		rbd_dev->acquire_err = result;
		complete_all(&rbd_dev->acquire_wait);
		return;
	}

	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
		mutex_lock(&img_req->state_mutex);
		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
		rbd_img_schedule(img_req, result);
		mutex_unlock(&img_req->state_mutex);
	}

	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
}

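/*
 * Query the current holder of the exclusive lock.  Returns 0 with
 * *lockers filled in (possibly empty), or -EBUSY if the header is
 * locked by an external mechanism or with a shared lock.
 */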
static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

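/*
 * Determine whether the lock owner is still alive by looking for its
 * watch on the header object.  Returns 1 if a matching watch is found
 * (and records the owner's cid), 0 if not, or a negative error.
 */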
static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		if (!memcmp(&watchers[i].addr, &locker->info.addr,
			    sizeof(locker->info.addr)) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;

		ret = find_watcher(rbd_dev, lockers);
		if (ret)
			goto out; /* request lock or error */

		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blocklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}

static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
{
	int ret;

	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
		ret = rbd_object_map_open(rbd_dev);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Return:
 *   0 - lock acquired
 *   1 - caller should call rbd_request_lock()
 *  <0 - error
 */
static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
{
	int ret;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		up_read(&rbd_dev->lock_rwsem);
		return 0;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		up_write(&rbd_dev->lock_rwsem);
		return 0;
	}

	ret = rbd_try_lock(rbd_dev);
	if (ret < 0) {
		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
		if (ret == -EBLOCKLISTED)
			goto out;

		ret = 1; /* request lock anyway */
	}
	if (ret > 0) {
		up_write(&rbd_dev->lock_rwsem);
		return ret;
	}

	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
	rbd_assert(list_empty(&rbd_dev->running_list));

	ret = rbd_post_acquire_action(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
		/*
		 * Can't stay in RBD_LOCK_STATE_LOCKED because
		 * rbd_lock_add_request() would let the request through,
		 * assuming that e.g. object map is locked and loaded.
		 */
		rbd_unlock(rbd_dev);
	}

out:
	wake_lock_waiters(rbd_dev, ret);
	up_write(&rbd_dev->lock_rwsem);
	return ret;
}

static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, lock_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	ret = rbd_try_acquire_lock(rbd_dev);
	if (ret <= 0) {
		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		down_write(&rbd_dev->lock_rwsem);
		wake_lock_waiters(rbd_dev, ret);
		up_write(&rbd_dev->lock_rwsem);
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}

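/*
 * Transition to RBD_LOCK_STATE_RELEASING and wait for in-flight image
 * requests on the running_list to drain.  Returns true if the lock is
 * quiesced and may be released, false if the lock state changed in
 * the meantime.
 */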
static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
{
	bool need_wait;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	lockdep_assert_held_write(&rbd_dev->lock_rwsem);

	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	/*
	 * Ensure that all in-flight IO is flushed.
	 */
	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
	need_wait = !list_empty(&rbd_dev->running_list);
	downgrade_write(&rbd_dev->lock_rwsem);
	if (need_wait)
		wait_for_completion(&rbd_dev->releasing_wait);
	up_read(&rbd_dev->lock_rwsem);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_assert(list_empty(&rbd_dev->running_list));
	return true;
}

static void rbd_pre_release_action(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
		rbd_object_map_close(rbd_dev);
}

static void __rbd_release_lock(struct rbd_device *rbd_dev)
{
	rbd_assert(list_empty(&rbd_dev->running_list));

	rbd_pre_release_action(rbd_dev);
	rbd_unlock(rbd_dev);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_release_lock(struct rbd_device *rbd_dev)
{
	if (!rbd_quiesce_lock(rbd_dev))
		return;

	__rbd_release_lock(rbd_dev);

	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO while draining the running
	 * list otherwise.  We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_handle_released_lock() by
	 * way of maybe_kick_acquire().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
}

static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static void maybe_kick_acquire(struct rbd_device *rbd_dev)
{
	bool have_requests;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	if (__rbd_is_lock_owner(rbd_dev))
		return;

	spin_lock(&rbd_dev->lock_lists_lock);
	have_requests = !list_empty(&rbd_dev->acquiring_list);
	spin_unlock(&rbd_dev->lock_lists_lock);
	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	}
}

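/*
 * Handlers for ACQUIRED_LOCK and RELEASED_LOCK notifications from
 * peers.  They update the cached owner cid and, if we are waiting for
 * the lock ourselves, kick lock_dwork via maybe_kick_acquire().
 */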
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}

static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

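/*
 * Watch callback: decodes the NotifyMessage and dispatches on the
 * notify op.  Every notification must be acknowledged, with a
 * ResponseMessage where the op calls for one.
 */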
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}

static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

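/*
 * Watch error callback: forget the cached lock owner and schedule a
 * rewatch.  If we held the exclusive lock, it is re-acquired from
 * rbd_reregister_watch().
 */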
static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

/*
 * header_rwsem must not be held to avoid a deadlock with
 * rbd_dev_refresh() when flushing notifies.
 */
static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

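/*
 * The lock cookie embeds the watch cookie, so after a rewatch the lock
 * must be updated with the new cookie (or released and re-acquired
 * from scratch on OSDs that don't support cookie update).
 */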
/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	if (!rbd_quiesce_lock(rbd_dev))
		return;

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		__rbd_release_lock(rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
		wake_lock_waiters(rbd_dev, 0);
	}
}

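/*
 * Re-establish the watch after an error, re-acquiring the exclusive
 * lock if it was held.  Waiters are failed if the rewatch fails
 * permanently (e.g. the client is blocklisted or the image is gone).
 */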
static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret != -EBLOCKLISTED && ret != -ENOENT) {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
			mutex_unlock(&rbd_dev->watch_mutex);
			return;
		}

		mutex_unlock(&rbd_dev->watch_mutex);
		down_write(&rbd_dev->lock_rwsem);
		wake_lock_waiters(rbd_dev, ret);
		up_write(&rbd_dev->lock_rwsem);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc,
			       const char *method_name,
			       const void *outbound,
			       size_t outbound_size,
			       void *inbound,
			       size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     &reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

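/*
 * Body of a queued image request: validate the range against the
 * mapping size, build the object request(s) and kick off the image
 * request state machine.  Runs from rbd_wq.
 */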
static void rbd_queue_workfn(struct work_struct *work)
{
	struct rbd_img_request *img_request =
	    container_of(work, struct rbd_img_request, work);
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	enum obj_operation_type op_type = img_request->op_type;
	struct request *rq = blk_mq_rq_from_pdu(img_request);
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	u64 mapping_size;
	int result;

	/* Ignore/skip any zero-length requests */
	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_img_request;
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	rbd_img_capture_header(img_request);
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_img_request;
	}

	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
	     img_request, obj_op_name(op_type), offset, length);

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_handle_request(img_request, 0);
	return;

err_img_request:
	rbd_img_request_destroy(img_request);
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

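/*
 * blk-mq ->queue_rq() handler: map the block layer op to an rbd object
 * operation, reject writes on read-only mappings and hand the request
 * off to a workqueue (see rbd_queue_workfn() above).
 */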
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct rbd_device *rbd_dev = hctx->queue->queuedata;
	struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
	enum obj_operation_type op_type;

	switch (req_op(bd->rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
		return BLK_STS_IOERR;
	}

	rbd_img_request_init(img_req, rbd_dev, op_type);

	if (rbd_img_is_write(img_req)) {
		if (rbd_is_ro(rbd_dev)) {
			rbd_warn(rbd_dev, "%s on read-only mapping",
				 obj_op_name(img_req->op_type));
			return BLK_STS_IOERR;
		}
		rbd_assert(!rbd_is_snap(rbd_dev));
	}

	INIT_WORK(&img_req->work, rbd_queue_workfn);
	queue_work(rbd_wq, &img_req->work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

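/*
 * Synchronously read up to @buf_len bytes from the start of the given
 * object into @buf.  Returns the number of bytes read on success.
 */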
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk_size(rbd_dev->disk, true);
	}
}

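/*
 * Re-read the image header (and parent info for v2 images) and, if
 * the size of the mapping changed, propagate the new size to the
 * block device.
 */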
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	rbd_assert(!rbd_is_snap(rbd_dev));
	rbd_dev->mapping.size = rbd_dev->header.image_size;

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
};

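/*
 * Allocate the gendisk, set up the blk-mq tag set and request queue,
 * and apply queue limits derived from the rbd image layout (object
 * set size, allocation size, discard support).
 */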
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	unsigned int objset_bytes =
	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
	int err;

	/* create gendisk info */
	disk = alloc_disk(single_major ?
			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
			  RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->flags |= GENHD_FL_EXT_DEVT;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
	rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
		goto out_disk;

	q = blk_mq_init_queue(&rbd_dev->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
	}

	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
	q->limits.max_sectors = queue_max_hw_sectors(q);
	blk_queue_max_segments(q, USHRT_MAX);
	blk_queue_max_segment_size(q, UINT_MAX);
	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);

	if (rbd_dev->opts->trim) {
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
	}

	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);

	/*
	 * disk_release() expects a queue ref from add_disk() and will
	 * put it.  Hold an extra ref until add_disk() is called.
	 */
	WARN_ON(!blk_get_queue(q));
	disk->queue = q;
	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_tag_set:
	blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
	put_disk(disk);
	return err;
}

dfc5606d
YS
5045/*
5046 sysfs
5047*/
5048
593a9e7b
AE
5049static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5050{
5051 return container_of(dev, struct rbd_device, dev);
5052}
5053
dfc5606d
YS
5054static ssize_t rbd_size_show(struct device *dev,
5055 struct device_attribute *attr, char *buf)
5056{
593a9e7b 5057 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0 5058
fc71d833
AE
5059 return sprintf(buf, "%llu\n",
5060 (unsigned long long)rbd_dev->mapping.size);
dfc5606d
YS
5061}
5062
34b13184
AE
5063static ssize_t rbd_features_show(struct device *dev,
5064 struct device_attribute *attr, char *buf)
5065{
5066 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5067
fa58bcad 5068 return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
34b13184
AE
5069}
5070
dfc5606d
YS
5071static ssize_t rbd_major_show(struct device *dev,
5072 struct device_attribute *attr, char *buf)
5073{
593a9e7b 5074 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 5075
fc71d833
AE
5076 if (rbd_dev->major)
5077 return sprintf(buf, "%d\n", rbd_dev->major);
5078
5079 return sprintf(buf, "(none)\n");
dd82fff1
ID
5080}
5081
5082static ssize_t rbd_minor_show(struct device *dev,
5083 struct device_attribute *attr, char *buf)
5084{
5085 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
fc71d833 5086
dd82fff1 5087 return sprintf(buf, "%d\n", rbd_dev->minor);
dfc5606d
YS
5088}
5089
005a07bf
ID
5090static ssize_t rbd_client_addr_show(struct device *dev,
5091 struct device_attribute *attr, char *buf)
5092{
5093 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5094 struct ceph_entity_addr *client_addr =
5095 ceph_client_addr(rbd_dev->rbd_client->client);
5096
5097 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5098 le32_to_cpu(client_addr->nonce));
5099}
5100
dfc5606d
YS
5101static ssize_t rbd_client_id_show(struct device *dev,
5102 struct device_attribute *attr, char *buf)
602adf40 5103{
593a9e7b 5104 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5105
1dbb4399 5106 return sprintf(buf, "client%lld\n",
033268a5 5107 ceph_client_gid(rbd_dev->rbd_client->client));
602adf40
YS
5108}
5109
267fb90b
MC
5110static ssize_t rbd_cluster_fsid_show(struct device *dev,
5111 struct device_attribute *attr, char *buf)
5112{
5113 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5114
5115 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5116}
5117
0d6d1e9c
MC
5118static ssize_t rbd_config_info_show(struct device *dev,
5119 struct device_attribute *attr, char *buf)
5120{
5121 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5122
f44d04e6
ID
5123 if (!capable(CAP_SYS_ADMIN))
5124 return -EPERM;
5125
0d6d1e9c 5126 return sprintf(buf, "%s\n", rbd_dev->config_info);
602adf40
YS
5127}
5128
dfc5606d
YS
5129static ssize_t rbd_pool_show(struct device *dev,
5130 struct device_attribute *attr, char *buf)
602adf40 5131{
593a9e7b 5132 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5133
0d7dbfce 5134 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
5135}
5136
9bb2f334
AE
5137static ssize_t rbd_pool_id_show(struct device *dev,
5138 struct device_attribute *attr, char *buf)
5139{
5140 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5141
0d7dbfce 5142 return sprintf(buf, "%llu\n",
fc71d833 5143 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
5144}
5145
b26c047b
ID
5146static ssize_t rbd_pool_ns_show(struct device *dev,
5147 struct device_attribute *attr, char *buf)
5148{
5149 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5150
5151 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5152}
5153
dfc5606d
YS
5154static ssize_t rbd_name_show(struct device *dev,
5155 struct device_attribute *attr, char *buf)
5156{
593a9e7b 5157 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5158
a92ffdf8
AE
5159 if (rbd_dev->spec->image_name)
5160 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5161
5162 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
5163}
5164
589d30e0
AE
5165static ssize_t rbd_image_id_show(struct device *dev,
5166 struct device_attribute *attr, char *buf)
5167{
5168 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5169
0d7dbfce 5170 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
5171}
5172
34b13184
AE
5173/*
5174 * Shows the name of the currently-mapped snapshot (or
5175 * RBD_SNAP_HEAD_NAME for the base image).
5176 */
dfc5606d
YS
5177static ssize_t rbd_snap_show(struct device *dev,
5178 struct device_attribute *attr,
5179 char *buf)
5180{
593a9e7b 5181 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 5182
0d7dbfce 5183 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
5184}
5185
92a58671
MC
5186static ssize_t rbd_snap_id_show(struct device *dev,
5187 struct device_attribute *attr, char *buf)
5188{
5189 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5190
5191 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5192}
5193
86b00e0d 5194/*
ff96128f
ID
5195 * For a v2 image, shows the chain of parent images, separated by empty
5196 * lines. For v1 images or if there is no parent, shows "(no parent
5197 * image)".
86b00e0d
AE
5198 */
5199static ssize_t rbd_parent_show(struct device *dev,
ff96128f
ID
5200 struct device_attribute *attr,
5201 char *buf)
86b00e0d
AE
5202{
5203 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
ff96128f 5204 ssize_t count = 0;
86b00e0d 5205
ff96128f 5206 if (!rbd_dev->parent)
86b00e0d
AE
5207 return sprintf(buf, "(no parent image)\n");
5208
ff96128f
ID
5209 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5210 struct rbd_spec *spec = rbd_dev->parent_spec;
5211
5212 count += sprintf(&buf[count], "%s"
5213 "pool_id %llu\npool_name %s\n"
e92c0eaf 5214 "pool_ns %s\n"
ff96128f
ID
5215 "image_id %s\nimage_name %s\n"
5216 "snap_id %llu\nsnap_name %s\n"
5217 "overlap %llu\n",
5218 !count ? "" : "\n", /* first? */
5219 spec->pool_id, spec->pool_name,
e92c0eaf 5220 spec->pool_ns ?: "",
ff96128f
ID
5221 spec->image_id, spec->image_name ?: "(unknown)",
5222 spec->snap_id, spec->snap_name,
5223 rbd_dev->parent_overlap);
5224 }
5225
5226 return count;
86b00e0d
AE
5227}
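
For reference, a mapped clone whose chain is one level deep might show a parent attribute like this (all values below are illustrative, not from a real cluster; successive entries of a longer chain are separated by empty lines, per the format string above):

	pool_id 2
	pool_name rbd
	pool_ns
	image_id 1058e3829fd7
	image_name base-image
	snap_id 4
	snap_name golden
	overlap 10737418240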
5228
dfc5606d
YS
5229static ssize_t rbd_image_refresh(struct device *dev,
5230 struct device_attribute *attr,
5231 const char *buf,
5232 size_t size)
5233{
593a9e7b 5234 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 5235 int ret;
602adf40 5236
f44d04e6
ID
5237 if (!capable(CAP_SYS_ADMIN))
5238 return -EPERM;
5239
cc4a38bd 5240 ret = rbd_dev_refresh(rbd_dev);
e627db08 5241 if (ret)
52bb1f9b 5242 return ret;
b813623a 5243
52bb1f9b 5244 return size;
dfc5606d 5245}
602adf40 5246
5657a819
JP
5247static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5248static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5249static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5250static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5251static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5252static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5253static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5254static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5255static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5256static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
b26c047b 5257static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
5657a819
JP
5258static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5259static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5260static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5261static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5262static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5263static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
dfc5606d
YS
5264
5265static struct attribute *rbd_attrs[] = {
5266 &dev_attr_size.attr,
34b13184 5267 &dev_attr_features.attr,
dfc5606d 5268 &dev_attr_major.attr,
dd82fff1 5269 &dev_attr_minor.attr,
005a07bf 5270 &dev_attr_client_addr.attr,
dfc5606d 5271 &dev_attr_client_id.attr,
267fb90b 5272 &dev_attr_cluster_fsid.attr,
0d6d1e9c 5273 &dev_attr_config_info.attr,
dfc5606d 5274 &dev_attr_pool.attr,
9bb2f334 5275 &dev_attr_pool_id.attr,
b26c047b 5276 &dev_attr_pool_ns.attr,
dfc5606d 5277 &dev_attr_name.attr,
589d30e0 5278 &dev_attr_image_id.attr,
dfc5606d 5279 &dev_attr_current_snap.attr,
92a58671 5280 &dev_attr_snap_id.attr,
86b00e0d 5281 &dev_attr_parent.attr,
dfc5606d 5282 &dev_attr_refresh.attr,
dfc5606d
YS
5283 NULL
5284};
5285
5286static struct attribute_group rbd_attr_group = {
5287 .attrs = rbd_attrs,
5288};
5289
5290static const struct attribute_group *rbd_attr_groups[] = {
5291 &rbd_attr_group,
5292 NULL
5293};
5294
6cac4695 5295static void rbd_dev_release(struct device *dev);
dfc5606d 5296
b9942bc9 5297static const struct device_type rbd_device_type = {
dfc5606d
YS
5298 .name = "rbd",
5299 .groups = rbd_attr_groups,
6cac4695 5300 .release = rbd_dev_release,
dfc5606d
YS
5301};
5302
8b8fb99c
AE
5303static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5304{
5305 kref_get(&spec->kref);
5306
5307 return spec;
5308}
5309
5310static void rbd_spec_free(struct kref *kref);
5311static void rbd_spec_put(struct rbd_spec *spec)
5312{
5313 if (spec)
5314 kref_put(&spec->kref, rbd_spec_free);
5315}
5316
5317static struct rbd_spec *rbd_spec_alloc(void)
5318{
5319 struct rbd_spec *spec;
5320
5321 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5322 if (!spec)
5323 return NULL;
04077599
ID
5324
5325 spec->pool_id = CEPH_NOPOOL;
5326 spec->snap_id = CEPH_NOSNAP;
8b8fb99c
AE
5327 kref_init(&spec->kref);
5328
8b8fb99c
AE
5329 return spec;
5330}
5331
5332static void rbd_spec_free(struct kref *kref)
5333{
5334 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5335
5336 kfree(spec->pool_name);
b26c047b 5337 kfree(spec->pool_ns);
8b8fb99c
AE
5338 kfree(spec->image_id);
5339 kfree(spec->image_name);
5340 kfree(spec->snap_name);
5341 kfree(spec);
5342}
5343
1643dfa4 5344static void rbd_dev_free(struct rbd_device *rbd_dev)
dd5ac32d 5345{
99d16943 5346 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
ed95b21a 5347 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
dd5ac32d 5348
c41d13a3 5349 ceph_oid_destroy(&rbd_dev->header_oid);
6b6dddbe 5350 ceph_oloc_destroy(&rbd_dev->header_oloc);
0d6d1e9c 5351 kfree(rbd_dev->config_info);
c41d13a3 5352
dd5ac32d
ID
5353 rbd_put_client(rbd_dev->rbd_client);
5354 rbd_spec_put(rbd_dev->spec);
5355 kfree(rbd_dev->opts);
5356 kfree(rbd_dev);
1643dfa4
ID
5357}
5358
5359static void rbd_dev_release(struct device *dev)
5360{
5361 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5362 bool need_put = !!rbd_dev->opts;
5363
5364 if (need_put) {
5365 destroy_workqueue(rbd_dev->task_wq);
5366 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5367 }
5368
5369 rbd_dev_free(rbd_dev);
dd5ac32d
ID
5370
5371 /*
5372 * This is racy, but way better than putting the module outside of
5373 * the release callback. The race window is pretty small, so
5374 * doing something similar to dm (dm-builtin.c) is overkill.
5375 */
5376 if (need_put)
5377 module_put(THIS_MODULE);
5378}
5379
1643dfa4
ID
5380static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
5381 struct rbd_spec *spec)
c53d5893
AE
5382{
5383 struct rbd_device *rbd_dev;
5384
1643dfa4 5385 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
c53d5893
AE
5386 if (!rbd_dev)
5387 return NULL;
5388
5389 spin_lock_init(&rbd_dev->lock);
5390 INIT_LIST_HEAD(&rbd_dev->node);
c53d5893
AE
5391 init_rwsem(&rbd_dev->header_rwsem);
5392
7e97332e 5393 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
c41d13a3 5394 ceph_oid_init(&rbd_dev->header_oid);
431a02cd 5395 rbd_dev->header_oloc.pool = spec->pool_id;
b26c047b
ID
5396 if (spec->pool_ns) {
5397 WARN_ON(!*spec->pool_ns);
5398 rbd_dev->header_oloc.pool_ns =
5399 ceph_find_or_create_string(spec->pool_ns,
5400 strlen(spec->pool_ns));
5401 }
c41d13a3 5402
99d16943
ID
5403 mutex_init(&rbd_dev->watch_mutex);
5404 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5405 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5406
ed95b21a
ID
5407 init_rwsem(&rbd_dev->lock_rwsem);
5408 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5409 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5410 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5411 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5412 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
e1fddc8f 5413 spin_lock_init(&rbd_dev->lock_lists_lock);
637cd060 5414 INIT_LIST_HEAD(&rbd_dev->acquiring_list);
e1fddc8f 5415 INIT_LIST_HEAD(&rbd_dev->running_list);
637cd060 5416 init_completion(&rbd_dev->acquire_wait);
e1fddc8f 5417 init_completion(&rbd_dev->releasing_wait);
ed95b21a 5418
22e8bd51 5419 spin_lock_init(&rbd_dev->object_map_lock);
ed95b21a 5420
dd5ac32d
ID
5421 rbd_dev->dev.bus = &rbd_bus_type;
5422 rbd_dev->dev.type = &rbd_device_type;
5423 rbd_dev->dev.parent = &rbd_root_dev;
dd5ac32d
ID
5424 device_initialize(&rbd_dev->dev);
5425
c53d5893 5426 rbd_dev->rbd_client = rbdc;
d147543d 5427 rbd_dev->spec = spec;
0903e875 5428
1643dfa4
ID
5429 return rbd_dev;
5430}
5431
5432/*
5433 * Create an rbd_dev for a mapping (as opposed to a parent image).
5434 */
5435static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5436 struct rbd_spec *spec,
5437 struct rbd_options *opts)
5438{
5439 struct rbd_device *rbd_dev;
5440
5441 rbd_dev = __rbd_dev_create(rbdc, spec);
5442 if (!rbd_dev)
5443 return NULL;
5444
5445 rbd_dev->opts = opts;
5446
5447 /* get an id and fill in device name */
5448 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5449 minor_to_rbd_dev_id(1 << MINORBITS),
5450 GFP_KERNEL);
5451 if (rbd_dev->dev_id < 0)
5452 goto fail_rbd_dev;
5453
5454 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5455 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5456 rbd_dev->name);
5457 if (!rbd_dev->task_wq)
5458 goto fail_dev_id;
dd5ac32d 5459
1643dfa4
ID
5460 /* we have a ref from do_rbd_add() */
5461 __module_get(THIS_MODULE);
dd5ac32d 5462
1643dfa4 5463 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
c53d5893 5464 return rbd_dev;
1643dfa4
ID
5465
5466fail_dev_id:
5467 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5468fail_rbd_dev:
5469 rbd_dev_free(rbd_dev);
5470 return NULL;
c53d5893
AE
5471}
5472
5473static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5474{
dd5ac32d
ID
5475 if (rbd_dev)
5476 put_device(&rbd_dev->dev);
c53d5893
AE
5477}
5478
9d475de5
AE
5479/*
5480 * Get the size and object order for an image snapshot or, if
5481 * snap_id is CEPH_NOSNAP, for the base image.
5483 */
5484static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5485 u8 *order, u64 *snap_size)
5486{
5487 __le64 snapid = cpu_to_le64(snap_id);
5488 int ret;
5489 struct {
5490 u8 order;
5491 __le64 size;
5492 } __attribute__ ((packed)) size_buf = { 0 };
5493
ecd4a68a
ID
5494 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5495 &rbd_dev->header_oloc, "get_size",
5496 &snapid, sizeof(snapid),
5497 &size_buf, sizeof(size_buf));
36be9a76 5498 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
9d475de5
AE
5499 if (ret < 0)
5500 return ret;
57385b51
AE
5501 if (ret < sizeof (size_buf))
5502 return -ERANGE;
9d475de5 5503
c3545579 5504 if (order) {
c86f86e9 5505 *order = size_buf.order;
c3545579
JD
5506 dout(" order %u", (unsigned int)*order);
5507 }
9d475de5
AE
5508 *snap_size = le64_to_cpu(size_buf.size);
5509
c3545579
JD
5510 dout(" snap_id 0x%016llx snap_size = %llu\n",
5511 (unsigned long long)snap_id,
57385b51 5512 (unsigned long long)*snap_size);
9d475de5
AE
5513
5514 return 0;
5515}
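
The reply decoded above is a tiny packed structure. As a minimal standalone sketch (plain userspace C, not kernel code), assuming a complete reply in buf, the same wire format can be decoded by hand:

	#include <stdint.h>
	#include <stddef.h>

	/* "get_size" reply: 1 byte of object order, then a
	 * little-endian 64-bit size -- 9 bytes total. */
	static int decode_get_size_reply(const uint8_t *buf, size_t len,
					 uint8_t *order, uint64_t *size)
	{
		uint64_t v = 0;
		int i;

		if (len < 9)		/* short reply, cf. the -ERANGE check */
			return -1;

		*order = buf[0];
		for (i = 7; i >= 0; i--)	/* assemble little-endian u64 */
			v = (v << 8) | buf[1 + i];
		*size = v;
		return 0;
	}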
5516
5517static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5518{
5519 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5520 &rbd_dev->header.obj_order,
5521 &rbd_dev->header.image_size);
5522}
5523
1e130199
AE
5524static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5525{
5435d206 5526 size_t size;
1e130199
AE
5527 void *reply_buf;
5528 int ret;
5529 void *p;
5530
5435d206
DY
5531 /* Response will be an encoded string, which includes a length */
5532 size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5533 reply_buf = kzalloc(size, GFP_KERNEL);
1e130199
AE
5534 if (!reply_buf)
5535 return -ENOMEM;
5536
ecd4a68a
ID
5537 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5538 &rbd_dev->header_oloc, "get_object_prefix",
5435d206 5539 NULL, 0, reply_buf, size);
36be9a76 5540 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
1e130199
AE
5541 if (ret < 0)
5542 goto out;
5543
5544 p = reply_buf;
5545 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
57385b51
AE
5546 p + ret, NULL, GFP_NOIO);
5547 ret = 0;
1e130199
AE
5548
5549 if (IS_ERR(rbd_dev->header.object_prefix)) {
5550 ret = PTR_ERR(rbd_dev->header.object_prefix);
5551 rbd_dev->header.object_prefix = NULL;
5552 } else {
5553 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
5554 }
1e130199
AE
5555out:
5556 kfree(reply_buf);
5557
5558 return ret;
5559}
5560
b1b5402a 5561static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
196e2d6d 5562 bool read_only, u64 *snap_features)
b1b5402a 5563{
196e2d6d
ID
5564 struct {
5565 __le64 snap_id;
5566 u8 read_only;
5567 } features_in;
b1b5402a
AE
5568 struct {
5569 __le64 features;
5570 __le64 incompat;
4157976b 5571 } __attribute__ ((packed)) features_buf = { 0 };
d3767f0f 5572 u64 unsup;
b1b5402a
AE
5573 int ret;
5574
196e2d6d
ID
5575 features_in.snap_id = cpu_to_le64(snap_id);
5576 features_in.read_only = read_only;
5577
ecd4a68a
ID
5578 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5579 &rbd_dev->header_oloc, "get_features",
196e2d6d 5580 &features_in, sizeof(features_in),
ecd4a68a 5581 &features_buf, sizeof(features_buf));
36be9a76 5582 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
b1b5402a
AE
5583 if (ret < 0)
5584 return ret;
57385b51
AE
5585 if (ret < sizeof (features_buf))
5586 return -ERANGE;
d889140c 5587
d3767f0f
ID
5588 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5589 if (unsup) {
5590 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5591 unsup);
b8f5c6ed 5592 return -ENXIO;
d3767f0f 5593 }
d889140c 5594
b1b5402a
AE
5595 *snap_features = le64_to_cpu(features_buf.features);
5596
5597 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
57385b51
AE
5598 (unsigned long long)snap_id,
5599 (unsigned long long)*snap_features,
5600 (unsigned long long)le64_to_cpu(features_buf.incompat));
b1b5402a
AE
5601
5602 return 0;
5603}
5604
5605static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5606{
5607 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
196e2d6d
ID
5608 rbd_is_ro(rbd_dev),
5609 &rbd_dev->header.features);
b1b5402a
AE
5610}
5611
22e8bd51
ID
5612/*
5613 * These are generic image flags, but since they are used only for
5614 * the object map, store them in rbd_dev->object_map_flags.
5615 *
5616 * For the same reason, this function is called only on object map
5617 * (re)load and not on header refresh.
5618 */
5619static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5620{
5621 __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5622 __le64 flags;
5623 int ret;
5624
5625 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5626 &rbd_dev->header_oloc, "get_flags",
5627 &snapid, sizeof(snapid),
5628 &flags, sizeof(flags));
5629 if (ret < 0)
5630 return ret;
5631 if (ret < sizeof(flags))
5632 return -EBADMSG;
5633
5634 rbd_dev->object_map_flags = le64_to_cpu(flags);
5635 return 0;
5636}
5637
eb3b2d6b
ID
5638struct parent_image_info {
5639 u64 pool_id;
e92c0eaf 5640 const char *pool_ns;
eb3b2d6b
ID
5641 const char *image_id;
5642 u64 snap_id;
5643
e92c0eaf 5644 bool has_overlap;
eb3b2d6b
ID
5645 u64 overlap;
5646};
5647
e92c0eaf
ID
5648/*
5649 * The caller is responsible for @pii.
5650 */
5651static int decode_parent_image_spec(void **p, void *end,
5652 struct parent_image_info *pii)
5653{
5654 u8 struct_v;
5655 u32 struct_len;
5656 int ret;
5657
5658 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5659 &struct_v, &struct_len);
5660 if (ret)
5661 return ret;
5662
5663 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5664 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5665 if (IS_ERR(pii->pool_ns)) {
5666 ret = PTR_ERR(pii->pool_ns);
5667 pii->pool_ns = NULL;
5668 return ret;
5669 }
5670 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5671 if (IS_ERR(pii->image_id)) {
5672 ret = PTR_ERR(pii->image_id);
5673 pii->image_id = NULL;
5674 return ret;
5675 }
5676 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5677 return 0;
5678
5679e_inval:
5680 return -EINVAL;
5681}
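
ceph_extract_encoded_string() consumes a length-prefixed string. A minimal standalone sketch of that wire format (userspace C, not the kernel helper; error handling reduced to NULL returns):

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	/* Encoded string: little-endian 32-bit length, then exactly
	 * that many bytes, with no NUL terminator on the wire. */
	static char *extract_encoded_string(const uint8_t **p, const uint8_t *end)
	{
		uint32_t len;
		char *s;

		if (end - *p < 4)			/* room for the length? */
			return NULL;
		len = (uint32_t)(*p)[0] | (uint32_t)(*p)[1] << 8 |
		      (uint32_t)(*p)[2] << 16 | (uint32_t)(*p)[3] << 24;
		*p += 4;
		if ((size_t)(end - *p) < len)		/* room for the payload? */
			return NULL;
		s = malloc(len + 1);
		if (!s)
			return NULL;
		memcpy(s, *p, len);
		s[len] = '\0';				/* terminate for C callers */
		*p += len;
		return s;
	}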
5682
5683static int __get_parent_info(struct rbd_device *rbd_dev,
5684 struct page *req_page,
5685 struct page *reply_page,
5686 struct parent_image_info *pii)
5687{
5688 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5689 size_t reply_len = PAGE_SIZE;
5690 void *p, *end;
5691 int ret;
5692
5693 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5694 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
68ada915 5695 req_page, sizeof(u64), &reply_page, &reply_len);
e92c0eaf
ID
5696 if (ret)
5697 return ret == -EOPNOTSUPP ? 1 : ret;
5698
5699 p = page_address(reply_page);
5700 end = p + reply_len;
5701 ret = decode_parent_image_spec(&p, end, pii);
5702 if (ret)
5703 return ret;
5704
5705 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5706 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
68ada915 5707 req_page, sizeof(u64), &reply_page, &reply_len);
e92c0eaf
ID
5708 if (ret)
5709 return ret;
5710
5711 p = page_address(reply_page);
5712 end = p + reply_len;
5713 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5714 if (pii->has_overlap)
5715 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5716
5717 return 0;
5718
5719e_inval:
5720 return -EINVAL;
5721}
5722
eb3b2d6b
ID
5723/*
5724 * The caller is responsible for @pii.
5725 */
5726static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5727 struct page *req_page,
5728 struct page *reply_page,
5729 struct parent_image_info *pii)
5730{
5731 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5732 size_t reply_len = PAGE_SIZE;
5733 void *p, *end;
5734 int ret;
5735
5736 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5737 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
68ada915 5738 req_page, sizeof(u64), &reply_page, &reply_len);
eb3b2d6b
ID
5739 if (ret)
5740 return ret;
5741
5742 p = page_address(reply_page);
5743 end = p + reply_len;
5744 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5745 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5746 if (IS_ERR(pii->image_id)) {
5747 ret = PTR_ERR(pii->image_id);
5748 pii->image_id = NULL;
5749 return ret;
5750 }
5751 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
e92c0eaf 5752 pii->has_overlap = true;
eb3b2d6b
ID
5753 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5754
5755 return 0;
5756
5757e_inval:
5758 return -EINVAL;
5759}
5760
5761static int get_parent_info(struct rbd_device *rbd_dev,
5762 struct parent_image_info *pii)
5763{
5764 struct page *req_page, *reply_page;
5765 void *p;
5766 int ret;
5767
5768 req_page = alloc_page(GFP_KERNEL);
5769 if (!req_page)
5770 return -ENOMEM;
5771
5772 reply_page = alloc_page(GFP_KERNEL);
5773 if (!reply_page) {
5774 __free_page(req_page);
5775 return -ENOMEM;
5776 }
5777
5778 p = page_address(req_page);
5779 ceph_encode_64(&p, rbd_dev->spec->snap_id);
e92c0eaf
ID
5780 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5781 if (ret > 0)
5782 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5783 pii);
eb3b2d6b
ID
5784
5785 __free_page(req_page);
5786 __free_page(reply_page);
5787 return ret;
5788}
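
A note on the fallback above, since the positive-return convention is easy to miss:

	/*
	 * __get_parent_info() maps -EOPNOTSUPP from the "parent_get"
	 * class method to 1, so a positive return means the OSD
	 * predates that method and get_parent_info() retries with the
	 * legacy "get_parent" method via __get_parent_info_legacy().
	 */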
5789
86b00e0d
AE
5790static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5791{
5792 struct rbd_spec *parent_spec;
eb3b2d6b 5793 struct parent_image_info pii = { 0 };
86b00e0d
AE
5794 int ret;
5795
5796 parent_spec = rbd_spec_alloc();
5797 if (!parent_spec)
5798 return -ENOMEM;
5799
eb3b2d6b
ID
5800 ret = get_parent_info(rbd_dev, &pii);
5801 if (ret)
86b00e0d 5802 goto out_err;
86b00e0d 5803
e92c0eaf
ID
5804 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5805 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5806 pii.has_overlap, pii.overlap);
86b00e0d 5807
e92c0eaf 5808 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
392a9dad
AE
5809 /*
5810 * Either the parent never existed, or we have a
5811 * record of it but the image got flattened, so it no
5812 * longer has a parent. When the parent of a
5813 * layered image disappears we immediately set the
5814 * overlap to 0. The effect of this is that all new
5815 * requests will be treated as if the image had no
5816 * parent.
e92c0eaf
ID
5817 *
5818 * If !pii.has_overlap, the parent image spec is not
5819 * applicable. It's there to avoid duplication in each
5820 * snapshot record.
392a9dad
AE
5821 */
5822 if (rbd_dev->parent_overlap) {
5823 rbd_dev->parent_overlap = 0;
392a9dad
AE
5824 rbd_dev_parent_put(rbd_dev);
5825 pr_info("%s: clone image has been flattened\n",
5826 rbd_dev->disk->disk_name);
5827 }
5828
86b00e0d 5829 goto out; /* No parent? No problem. */
392a9dad 5830 }
86b00e0d 5831
0903e875
AE
5832 /* The ceph file layout needs the pool id to fit in 32 bits */
5833
5834 ret = -EIO;
eb3b2d6b 5835 if (pii.pool_id > (u64)U32_MAX) {
9584d508 5836 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
eb3b2d6b 5837 (unsigned long long)pii.pool_id, U32_MAX);
86b00e0d
AE
5838 goto out_err;
5839 }
86b00e0d 5840
3b5cf2a2
AE
5841 /*
5842 * The parent won't change (except when the clone is
5843 * flattened, which is handled above). So we only need to
5844 * record the parent spec if we have not already done so.
5845 */
5846 if (!rbd_dev->parent_spec) {
eb3b2d6b 5847 parent_spec->pool_id = pii.pool_id;
e92c0eaf
ID
5848 if (pii.pool_ns && *pii.pool_ns) {
5849 parent_spec->pool_ns = pii.pool_ns;
5850 pii.pool_ns = NULL;
5851 }
eb3b2d6b
ID
5852 parent_spec->image_id = pii.image_id;
5853 pii.image_id = NULL;
5854 parent_spec->snap_id = pii.snap_id;
b26c047b 5855
70cf49cf
AE
5856 rbd_dev->parent_spec = parent_spec;
5857 parent_spec = NULL; /* rbd_dev now owns this */
3b5cf2a2
AE
5858 }
5859
5860 /*
cf32bd9c
ID
5861 * We always update the parent overlap. If it's zero we issue
5862 * a warning, as we will proceed as if there was no parent.
3b5cf2a2 5863 */
eb3b2d6b 5864 if (!pii.overlap) {
3b5cf2a2 5865 if (parent_spec) {
cf32bd9c
ID
5866 /* refresh, careful to warn just once */
5867 if (rbd_dev->parent_overlap)
5868 rbd_warn(rbd_dev,
5869 "clone now standalone (overlap became 0)");
3b5cf2a2 5870 } else {
cf32bd9c
ID
5871 /* initial probe */
5872 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
3b5cf2a2 5873 }
70cf49cf 5874 }
eb3b2d6b 5875 rbd_dev->parent_overlap = pii.overlap;
cf32bd9c 5876
86b00e0d
AE
5877out:
5878 ret = 0;
5879out_err:
e92c0eaf 5880 kfree(pii.pool_ns);
eb3b2d6b 5881 kfree(pii.image_id);
86b00e0d 5882 rbd_spec_put(parent_spec);
86b00e0d
AE
5883 return ret;
5884}
5885
cc070d59
AE
5886static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5887{
5888 struct {
5889 __le64 stripe_unit;
5890 __le64 stripe_count;
5891 } __attribute__ ((packed)) striping_info_buf = { 0 };
5892 size_t size = sizeof (striping_info_buf);
5893 void *p;
cc070d59
AE
5894 int ret;
5895
ecd4a68a
ID
5896 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5897 &rbd_dev->header_oloc, "get_stripe_unit_count",
5898 NULL, 0, &striping_info_buf, size);
cc070d59
AE
5899 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5900 if (ret < 0)
5901 return ret;
5902 if (ret < size)
5903 return -ERANGE;
5904
cc070d59 5905 p = &striping_info_buf;
b1331852
ID
5906 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5907 rbd_dev->header.stripe_count = ceph_decode_64(&p);
cc070d59
AE
5908 return 0;
5909}
5910
7e97332e
ID
5911static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5912{
5913 __le64 data_pool_id;
5914 int ret;
5915
5916 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5917 &rbd_dev->header_oloc, "get_data_pool",
5918 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5919 if (ret < 0)
5920 return ret;
5921 if (ret < sizeof(data_pool_id))
5922 return -EBADMSG;
5923
5924 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5925 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5926 return 0;
5927}
5928
9e15b77d
AE
5929static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5930{
ecd4a68a 5931 CEPH_DEFINE_OID_ONSTACK(oid);
9e15b77d
AE
5932 size_t image_id_size;
5933 char *image_id;
5934 void *p;
5935 void *end;
5936 size_t size;
5937 void *reply_buf = NULL;
5938 size_t len = 0;
5939 char *image_name = NULL;
5940 int ret;
5941
5942 rbd_assert(!rbd_dev->spec->image_name);
5943
69e7a02f
AE
5944 len = strlen(rbd_dev->spec->image_id);
5945 image_id_size = sizeof (__le32) + len;
9e15b77d
AE
5946 image_id = kmalloc(image_id_size, GFP_KERNEL);
5947 if (!image_id)
5948 return NULL;
5949
5950 p = image_id;
4157976b 5951 end = image_id + image_id_size;
57385b51 5952 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
9e15b77d
AE
5953
5954 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5955 reply_buf = kmalloc(size, GFP_KERNEL);
5956 if (!reply_buf)
5957 goto out;
5958
ecd4a68a
ID
5959 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5960 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5961 "dir_get_name", image_id, image_id_size,
5962 reply_buf, size);
9e15b77d
AE
5963 if (ret < 0)
5964 goto out;
5965 p = reply_buf;
f40eb349
AE
5966 end = reply_buf + ret;
5967
9e15b77d
AE
5968 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5969 if (IS_ERR(image_name))
5970 image_name = NULL;
5971 else
5972 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5973out:
5974 kfree(reply_buf);
5975 kfree(image_id);
5976
5977 return image_name;
5978}
5979
2ad3d716
AE
5980static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5981{
5982 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5983 const char *snap_name;
5984 u32 which = 0;
5985
5986 /* Skip over names until we find the one we are looking for */
5987
5988 snap_name = rbd_dev->header.snap_names;
5989 while (which < snapc->num_snaps) {
5990 if (!strcmp(name, snap_name))
5991 return snapc->snaps[which];
5992 snap_name += strlen(snap_name) + 1;
5993 which++;
5994 }
5995 return CEPH_NOSNAP;
5996}
5997
5998static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5999{
6000 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6001 u32 which;
6002 bool found = false;
6003 u64 snap_id;
6004
6005 for (which = 0; !found && which < snapc->num_snaps; which++) {
6006 const char *snap_name;
6007
6008 snap_id = snapc->snaps[which];
6009 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
efadc98a
JD
6010 if (IS_ERR(snap_name)) {
6011 /* ignore no-longer existing snapshots */
6012 if (PTR_ERR(snap_name) == -ENOENT)
6013 continue;
6014 else
6015 break;
6016 }
2ad3d716
AE
6017 found = !strcmp(name, snap_name);
6018 kfree(snap_name);
6019 }
6020 return found ? snap_id : CEPH_NOSNAP;
6021}
6022
6023/*
6024 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
6025 * no snapshot by that name is found, or if an error occurs.
6026 */
6027static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6028{
6029 if (rbd_dev->image_format == 1)
6030 return rbd_v1_snap_id_by_name(rbd_dev, name);
6031
6032 return rbd_v2_snap_id_by_name(rbd_dev, name);
6033}
6034
9e15b77d 6035/*
04077599
ID
6036 * An image being mapped will have everything but the snap id.
6037 */
6038static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6039{
6040 struct rbd_spec *spec = rbd_dev->spec;
6041
6042 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6043 rbd_assert(spec->image_id && spec->image_name);
6044 rbd_assert(spec->snap_name);
6045
6046 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6047 u64 snap_id;
6048
6049 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6050 if (snap_id == CEPH_NOSNAP)
6051 return -ENOENT;
6052
6053 spec->snap_id = snap_id;
6054 } else {
6055 spec->snap_id = CEPH_NOSNAP;
6056 }
6057
6058 return 0;
6059}
6060
6061/*
6062 * A parent image will have all ids but none of the names.
e1d4213f 6063 *
04077599
ID
6064 * All names in an rbd spec are dynamically allocated. It's OK if we
6065 * can't figure out the name for an image id.
9e15b77d 6066 */
04077599 6067static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
9e15b77d 6068{
2e9f7f1c
AE
6069 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6070 struct rbd_spec *spec = rbd_dev->spec;
6071 const char *pool_name;
6072 const char *image_name;
6073 const char *snap_name;
9e15b77d
AE
6074 int ret;
6075
04077599
ID
6076 rbd_assert(spec->pool_id != CEPH_NOPOOL);
6077 rbd_assert(spec->image_id);
6078 rbd_assert(spec->snap_id != CEPH_NOSNAP);
9e15b77d 6079
2e9f7f1c 6080 /* Get the pool name; we have to make our own copy of this */
9e15b77d 6081
2e9f7f1c
AE
6082 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6083 if (!pool_name) {
6084 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
935dc89f
AE
6085 return -EIO;
6086 }
2e9f7f1c
AE
6087 pool_name = kstrdup(pool_name, GFP_KERNEL);
6088 if (!pool_name)
9e15b77d
AE
6089 return -ENOMEM;
6090
6091 /* Fetch the image name; tolerate failure here */
6092
2e9f7f1c
AE
6093 image_name = rbd_dev_image_name(rbd_dev);
6094 if (!image_name)
06ecc6cb 6095 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d 6096
04077599 6097 /* Fetch the snapshot name */
9e15b77d 6098
2e9f7f1c 6099 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
da6a6b63
JD
6100 if (IS_ERR(snap_name)) {
6101 ret = PTR_ERR(snap_name);
9e15b77d 6102 goto out_err;
2e9f7f1c
AE
6103 }
6104
6105 spec->pool_name = pool_name;
6106 spec->image_name = image_name;
6107 spec->snap_name = snap_name;
9e15b77d
AE
6108
6109 return 0;
04077599 6110
9e15b77d 6111out_err:
2e9f7f1c
AE
6112 kfree(image_name);
6113 kfree(pool_name);
9e15b77d
AE
6114 return ret;
6115}
6116
cc4a38bd 6117static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
35d489f9
AE
6118{
6119 size_t size;
6120 int ret;
6121 void *reply_buf;
6122 void *p;
6123 void *end;
6124 u64 seq;
6125 u32 snap_count;
6126 struct ceph_snap_context *snapc;
6127 u32 i;
6128
6129 /*
6130 * We'll need room for the seq value (maximum snapshot id),
6131 * snapshot count, and an array of that many snapshot ids.
6132 * For now we have a fixed upper limit on the number we're
6133 * prepared to receive.
6134 */
6135 size = sizeof (__le64) + sizeof (__le32) +
6136 RBD_MAX_SNAP_COUNT * sizeof (__le64);
6137 reply_buf = kzalloc(size, GFP_KERNEL);
6138 if (!reply_buf)
6139 return -ENOMEM;
6140
ecd4a68a
ID
6141 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6142 &rbd_dev->header_oloc, "get_snapcontext",
6143 NULL, 0, reply_buf, size);
36be9a76 6144 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35d489f9
AE
6145 if (ret < 0)
6146 goto out;
6147
35d489f9 6148 p = reply_buf;
57385b51
AE
6149 end = reply_buf + ret;
6150 ret = -ERANGE;
35d489f9
AE
6151 ceph_decode_64_safe(&p, end, seq, out);
6152 ceph_decode_32_safe(&p, end, snap_count, out);
6153
6154 /*
6155 * Make sure the reported number of snapshot ids wouldn't go
6156 * beyond the end of our buffer. But before checking that,
6157 * make sure the computed size of the snapshot context we
6158 * allocate is representable in a size_t.
6159 */
6160 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6161 / sizeof (u64)) {
6162 ret = -EINVAL;
6163 goto out;
6164 }
6165 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6166 goto out;
468521c1 6167 ret = 0;
35d489f9 6168
812164f8 6169 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
35d489f9
AE
6170 if (!snapc) {
6171 ret = -ENOMEM;
6172 goto out;
6173 }
35d489f9 6174 snapc->seq = seq;
35d489f9
AE
6175 for (i = 0; i < snap_count; i++)
6176 snapc->snaps[i] = ceph_decode_64(&p);
6177
49ece554 6178 ceph_put_snap_context(rbd_dev->header.snapc);
35d489f9
AE
6179 rbd_dev->header.snapc = snapc;
6180
6181 dout(" snap context seq = %llu, snap_count = %u\n",
57385b51 6182 (unsigned long long)seq, (unsigned int)snap_count);
35d489f9
AE
6183out:
6184 kfree(reply_buf);
6185
57385b51 6186 return ret;
35d489f9
AE
6187}
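
The SIZE_MAX check above guards the multiplication before it can wrap. A minimal standalone sketch of the same guard (userspace C; the header struct is an illustrative stand-in for struct ceph_snap_context):

	#include <stdint.h>
	#include <stddef.h>

	struct snapc_hdr {
		uint64_t seq;
		uint32_t num_snaps;
	};

	/* True if a snap context with snap_count ids would overflow size_t. */
	static int snapc_size_overflows(uint32_t snap_count)
	{
		return snap_count >
		       (SIZE_MAX - sizeof(struct snapc_hdr)) / sizeof(uint64_t);
	}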
6188
54cac61f
AE
6189static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6190 u64 snap_id)
b8b1e2db
AE
6191{
6192 size_t size;
6193 void *reply_buf;
54cac61f 6194 __le64 snapid;
b8b1e2db
AE
6195 int ret;
6196 void *p;
6197 void *end;
b8b1e2db
AE
6198 char *snap_name;
6199
6200 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6201 reply_buf = kmalloc(size, GFP_KERNEL);
6202 if (!reply_buf)
6203 return ERR_PTR(-ENOMEM);
6204
54cac61f 6205 snapid = cpu_to_le64(snap_id);
ecd4a68a
ID
6206 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6207 &rbd_dev->header_oloc, "get_snapshot_name",
6208 &snapid, sizeof(snapid), reply_buf, size);
36be9a76 6209 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
f40eb349
AE
6210 if (ret < 0) {
6211 snap_name = ERR_PTR(ret);
b8b1e2db 6212 goto out;
f40eb349 6213 }
b8b1e2db
AE
6214
6215 p = reply_buf;
f40eb349 6216 end = reply_buf + ret;
e5c35534 6217 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
f40eb349 6218 if (IS_ERR(snap_name))
b8b1e2db 6219 goto out;
b8b1e2db 6220
f40eb349 6221 dout(" snap_id 0x%016llx snap_name = %s\n",
54cac61f 6222 (unsigned long long)snap_id, snap_name);
b8b1e2db
AE
6223out:
6224 kfree(reply_buf);
6225
f40eb349 6226 return snap_name;
b8b1e2db
AE
6227}
6228
2df3fac7 6229static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
117973fb 6230{
2df3fac7 6231 bool first_time = rbd_dev->header.object_prefix == NULL;
117973fb 6232 int ret;
117973fb 6233
1617e40c
JD
6234 ret = rbd_dev_v2_image_size(rbd_dev);
6235 if (ret)
cfbf6377 6236 return ret;
1617e40c 6237
2df3fac7
AE
6238 if (first_time) {
6239 ret = rbd_dev_v2_header_onetime(rbd_dev);
6240 if (ret)
cfbf6377 6241 return ret;
2df3fac7
AE
6242 }
6243
cc4a38bd 6244 ret = rbd_dev_v2_snap_context(rbd_dev);
d194cd1d
ID
6245 if (ret && first_time) {
6246 kfree(rbd_dev->header.object_prefix);
6247 rbd_dev->header.object_prefix = NULL;
6248 }
117973fb
AE
6249
6250 return ret;
6251}
6252
a720ae09
ID
6253static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6254{
6255 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6256
6257 if (rbd_dev->image_format == 1)
6258 return rbd_dev_v1_header_info(rbd_dev);
6259
6260 return rbd_dev_v2_header_info(rbd_dev);
6261}
6262
e28fff26
AE
6263/*
6264 * Skips over white space at *buf, and updates *buf to point to the
6265 * first found non-space character (if any). Returns the length of
593a9e7b
AE
6266 * the token (string of non-white space characters) found. Note
6267 * that *buf must be terminated with '\0'.
e28fff26
AE
6268 */
6269static inline size_t next_token(const char **buf)
6270{
6271 /*
6272 * These are the characters that produce nonzero for
6273 * isspace() in the "C" and "POSIX" locales.
6274 */
6275 const char *spaces = " \f\n\r\t\v";
6276
6277 *buf += strspn(*buf, spaces); /* Find start of token */
6278
6279 return strcspn(*buf, spaces); /* Return token length */
6280}
6281
ea3352f4
AE
6282/*
6283 * Finds the next token in *buf, dynamically allocates a buffer big
6284 * enough to hold a copy of it, and copies the token into the new
6285 * buffer. The copy is guaranteed to be terminated with '\0'. Note
6286 * that a duplicate buffer is created even for a zero-length token.
6287 *
6288 * Returns a pointer to the newly-allocated duplicate, or a null
6289 * pointer if memory for the duplicate was not available. If
6290 * the lenp argument is a non-null pointer, the length of the token
6291 * (not including the '\0') is returned in *lenp.
6292 *
6293 * If successful, the *buf pointer will be updated to point beyond
6294 * the end of the found token.
6295 *
6296 * Note: uses GFP_KERNEL for allocation.
6297 */
6298static inline char *dup_token(const char **buf, size_t *lenp)
6299{
6300 char *dup;
6301 size_t len;
6302
6303 len = next_token(buf);
4caf35f9 6304 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
6305 if (!dup)
6306 return NULL;
ea3352f4
AE
6307 *(dup + len) = '\0';
6308 *buf += len;
6309
6310 if (lenp)
6311 *lenp = len;
6312
6313 return dup;
6314}
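
A usage sketch for the two helpers above (kernel context assumed; the input string and variable names are made up). Each dup_token() call skips leading whitespace, duplicates one token and advances the cursor:

	static void tokenize_demo(void)
	{
		const char *buf = "  rbd myimage mysnap";
		char *pool = dup_token(&buf, NULL);  /* "rbd", buf -> " myimage mysnap" */
		char *image = dup_token(&buf, NULL); /* "myimage", buf -> " mysnap" */
		char *snap = dup_token(&buf, NULL);  /* "mysnap", buf -> "" */

		/* allocation-failure checks omitted for brevity */
		kfree(pool);
		kfree(image);
		kfree(snap);
	}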
6315
82995cc6
DH
6316static int rbd_parse_param(struct fs_parameter *param,
6317 struct rbd_parse_opts_ctx *pctx)
6318{
6319 struct rbd_options *opt = pctx->opts;
6320 struct fs_parse_result result;
3fbb8d55 6321 struct p_log log = {.prefix = "rbd"};
82995cc6
DH
6322 int token, ret;
6323
6324 ret = ceph_parse_param(param, pctx->copts, NULL);
6325 if (ret != -ENOPARAM)
6326 return ret;
6327
d7167b14 6328 token = __fs_parse(&log, rbd_parameters, param, &result);
82995cc6
DH
6329 dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
6330 if (token < 0) {
2c3f3dc3
AV
6331 if (token == -ENOPARAM)
6332 return inval_plog(&log, "Unknown parameter '%s'",
6333 param->key);
82995cc6
DH
6334 return token;
6335 }
6336
6337 switch (token) {
6338 case Opt_queue_depth:
6339 if (result.uint_32 < 1)
6340 goto out_of_range;
6341 opt->queue_depth = result.uint_32;
6342 break;
6343 case Opt_alloc_size:
6344 if (result.uint_32 < SECTOR_SIZE)
6345 goto out_of_range;
2c3f3dc3
AV
6346 if (!is_power_of_2(result.uint_32))
6347 return inval_plog(&log, "alloc_size must be a power of 2");
82995cc6
DH
6348 opt->alloc_size = result.uint_32;
6349 break;
6350 case Opt_lock_timeout:
6351 /* 0 is "wait forever" (i.e. infinite timeout) */
6352 if (result.uint_32 > INT_MAX / 1000)
6353 goto out_of_range;
6354 opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
6355 break;
6356 case Opt_pool_ns:
6357 kfree(pctx->spec->pool_ns);
6358 pctx->spec->pool_ns = param->string;
6359 param->string = NULL;
6360 break;
dc1dad8e
ID
6361 case Opt_compression_hint:
6362 switch (result.uint_32) {
6363 case Opt_compression_hint_none:
6364 opt->alloc_hint_flags &=
6365 ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
6366 CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
6367 break;
6368 case Opt_compression_hint_compressible:
6369 opt->alloc_hint_flags |=
6370 CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6371 opt->alloc_hint_flags &=
6372 ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6373 break;
6374 case Opt_compression_hint_incompressible:
6375 opt->alloc_hint_flags |=
6376 CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6377 opt->alloc_hint_flags &=
6378 ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6379 break;
6380 default:
6381 BUG();
6382 }
6383 break;
82995cc6
DH
6384 case Opt_read_only:
6385 opt->read_only = true;
6386 break;
6387 case Opt_read_write:
6388 opt->read_only = false;
6389 break;
6390 case Opt_lock_on_read:
6391 opt->lock_on_read = true;
6392 break;
6393 case Opt_exclusive:
6394 opt->exclusive = true;
6395 break;
6396 case Opt_notrim:
6397 opt->trim = false;
6398 break;
6399 default:
6400 BUG();
6401 }
6402
6403 return 0;
6404
6405out_of_range:
2c3f3dc3 6406 return inval_plog(&log, "%s out of range", param->key);
82995cc6
DH
6407}
6408
6409/*
6410 * This duplicates most of generic_parse_monolithic(), untying it from
6411 * fs_context and skipping standard superblock and security options.
6412 */
6413static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
6414{
6415 char *key;
6416 int ret = 0;
6417
6418 dout("%s '%s'\n", __func__, options);
6419 while ((key = strsep(&options, ",")) != NULL) {
6420 if (*key) {
6421 struct fs_parameter param = {
6422 .key = key,
0f89589a 6423 .type = fs_value_is_flag,
82995cc6
DH
6424 };
6425 char *value = strchr(key, '=');
6426 size_t v_len = 0;
6427
6428 if (value) {
6429 if (value == key)
6430 continue;
6431 *value++ = 0;
6432 v_len = strlen(value);
82995cc6
DH
6433 param.string = kmemdup_nul(value, v_len,
6434 GFP_KERNEL);
6435 if (!param.string)
6436 return -ENOMEM;
0f89589a 6437 param.type = fs_value_is_string;
82995cc6
DH
6438 }
6439 param.size = v_len;
6440
6441 ret = rbd_parse_param(&param, pctx);
6442 kfree(param.string);
6443 if (ret)
6444 break;
6445 }
6446 }
6447
6448 return ret;
6449}
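
A worked example of the loop above, as a comment:

	/*
	 * Illustrative walk-through (hypothetical option string): for
	 * "queue_depth=128,read_only", strsep() yields "queue_depth=128"
	 * and then "read_only".  The first is split at '=' into key
	 * "queue_depth" with a duplicated value "128" (fs_value_is_string);
	 * the second has no value and stays fs_value_is_flag.  Each is
	 * then handed to rbd_parse_param().
	 */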
6450
a725f65e 6451/*
859c31df
AE
6452 * Parse the options provided for an "rbd add" (i.e., rbd image
6453 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
6454 * and the data written is passed here via a NUL-terminated buffer.
6455 * Returns 0 if successful or an error code otherwise.
d22f76e7 6456 *
859c31df
AE
6457 * The information extracted from these options is recorded in
6458 * the other parameters which return dynamically-allocated
6459 * structures:
6460 * ceph_opts
6461 * The address of a pointer that will refer to a ceph options
6462 * structure. Caller must release the returned pointer using
6463 * ceph_destroy_options() when it is no longer needed.
6464 * rbd_opts
6465 * Address of an rbd options pointer. Fully initialized by
6466 * this function; caller must release with kfree().
6467 * spec
6468 * Address of an rbd image specification pointer. Fully
6469 * initialized by this function based on parsed options.
6470 * Caller must release with rbd_spec_put().
6471 *
6472 * The options passed take this form:
6473 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
6474 * where:
6475 * <mon_addrs>
6476 * A comma-separated list of one or more monitor addresses.
6477 * A monitor address is an ip address, optionally followed
6478 * by a port number (separated by a colon).
6479 * I.e.: ip1[:port1][,ip2[:port2]...]
6480 * <options>
6481 * A comma-separated list of ceph and/or rbd options.
6482 * <pool_name>
6483 * The name of the rados pool containing the rbd image.
6484 * <image_name>
6485 * The name of the image in that pool to map.
6486 * <snap_id>
6487 * An optional snapshot id. If provided, the mapping will
6488 * present data from the image at the time that snapshot was
6489 * created. The image head is used if no snapshot id is
6490 * provided. Snapshot mappings are always read-only.
a725f65e 6491 */
859c31df 6492static int rbd_add_parse_args(const char *buf,
dc79b113 6493 struct ceph_options **ceph_opts,
859c31df
AE
6494 struct rbd_options **opts,
6495 struct rbd_spec **rbd_spec)
e28fff26 6496{
d22f76e7 6497 size_t len;
859c31df 6498 char *options;
0ddebc0c 6499 const char *mon_addrs;
ecb4dc22 6500 char *snap_name;
0ddebc0c 6501 size_t mon_addrs_size;
82995cc6 6502 struct rbd_parse_opts_ctx pctx = { 0 };
dc79b113 6503 int ret;
e28fff26
AE
6504
6505 /* The first four tokens are required */
6506
7ef3214a 6507 len = next_token(&buf);
4fb5d671
AE
6508 if (!len) {
6509 rbd_warn(NULL, "no monitor address(es) provided");
6510 return -EINVAL;
6511 }
0ddebc0c 6512 mon_addrs = buf;
82995cc6 6513 mon_addrs_size = len;
7ef3214a 6514 buf += len;
a725f65e 6515
dc79b113 6516 ret = -EINVAL;
f28e565a
AE
6517 options = dup_token(&buf, NULL);
6518 if (!options)
dc79b113 6519 return -ENOMEM;
4fb5d671
AE
6520 if (!*options) {
6521 rbd_warn(NULL, "no options provided");
6522 goto out_err;
6523 }
e28fff26 6524
c300156b
ID
6525 pctx.spec = rbd_spec_alloc();
6526 if (!pctx.spec)
f28e565a 6527 goto out_mem;
859c31df 6528
c300156b
ID
6529 pctx.spec->pool_name = dup_token(&buf, NULL);
6530 if (!pctx.spec->pool_name)
859c31df 6531 goto out_mem;
c300156b 6532 if (!*pctx.spec->pool_name) {
4fb5d671
AE
6533 rbd_warn(NULL, "no pool name provided");
6534 goto out_err;
6535 }
e28fff26 6536
c300156b
ID
6537 pctx.spec->image_name = dup_token(&buf, NULL);
6538 if (!pctx.spec->image_name)
f28e565a 6539 goto out_mem;
c300156b 6540 if (!*pctx.spec->image_name) {
4fb5d671
AE
6541 rbd_warn(NULL, "no image name provided");
6542 goto out_err;
6543 }
d4b125e9 6544
f28e565a
AE
6545 /*
6546 * Snapshot name is optional; default is to use "-"
6547 * (indicating the head/no snapshot).
6548 */
3feeb894 6549 len = next_token(&buf);
820a5f3e 6550 if (!len) {
3feeb894
AE
6551 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6552 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a 6553 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
dc79b113 6554 ret = -ENAMETOOLONG;
f28e565a 6555 goto out_err;
849b4260 6556 }
ecb4dc22
AE
6557 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6558 if (!snap_name)
f28e565a 6559 goto out_mem;
ecb4dc22 6560 *(snap_name + len) = '\0';
c300156b 6561 pctx.spec->snap_name = snap_name;
e5c35534 6562
82995cc6
DH
6563 pctx.copts = ceph_alloc_options();
6564 if (!pctx.copts)
6565 goto out_mem;
6566
0ddebc0c 6567 /* Initialize all rbd options to the defaults */
e28fff26 6568
c300156b
ID
6569 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6570 if (!pctx.opts)
4e9afeba
AE
6571 goto out_mem;
6572
c300156b
ID
6573 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6574 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
0c93e1b7 6575 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
c300156b
ID
6576 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6577 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6578 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6579 pctx.opts->trim = RBD_TRIM_DEFAULT;
d22f76e7 6580
82995cc6
DH
6581 ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
6582 if (ret)
dc79b113 6583 goto out_err;
859c31df 6584
82995cc6
DH
6585 ret = rbd_parse_options(options, &pctx);
6586 if (ret)
6587 goto out_err;
6588
6589 *ceph_opts = pctx.copts;
c300156b
ID
6590 *opts = pctx.opts;
6591 *rbd_spec = pctx.spec;
82995cc6 6592 kfree(options);
dc79b113 6593 return 0;
82995cc6 6594
f28e565a 6595out_mem:
dc79b113 6596 ret = -ENOMEM;
d22f76e7 6597out_err:
c300156b 6598 kfree(pctx.opts);
82995cc6 6599 ceph_destroy_options(pctx.copts);
c300156b 6600 rbd_spec_put(pctx.spec);
f28e565a 6601 kfree(options);
dc79b113 6602 return ret;
a725f65e
AE
6603}
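
For illustration, a request in the format documented above (monitor address, credentials, pool and image names are all made up):

	/*
	 *   $ echo "192.168.0.1:6789 name=admin,secret=<key> mypool myimage -" \
	 *         > /sys/bus/rbd/add
	 *
	 * The trailing "-" maps the image head rather than a snapshot.
	 */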
6604
e010dd0a
ID
6605static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6606{
6607 down_write(&rbd_dev->lock_rwsem);
6608 if (__rbd_is_lock_owner(rbd_dev))
e1fddc8f 6609 __rbd_release_lock(rbd_dev);
e010dd0a
ID
6610 up_write(&rbd_dev->lock_rwsem);
6611}
6612
637cd060
ID
6613/*
6614 * If the wait is interrupted, an error is returned even if the lock
6615 * was successfully acquired. rbd_dev_image_unlock() will release it
6616 * if needed.
6617 */
e010dd0a
ID
6618static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6619{
637cd060 6620 long ret;
2f18d466 6621
e010dd0a 6622 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
637cd060
ID
6623 if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6624 return 0;
6625
e010dd0a
ID
6626 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6627 return -EINVAL;
6628 }
6629
3fe69921 6630 if (rbd_is_ro(rbd_dev))
637cd060
ID
6631 return 0;
6632
6633 rbd_assert(!rbd_is_lock_owner(rbd_dev));
6634 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6635 ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6636 ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
25e6be21 6637 if (ret > 0) {
637cd060 6638 ret = rbd_dev->acquire_err;
25e6be21
DY
6639 } else {
6640 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
6641 if (!ret)
6642 ret = -ETIMEDOUT;
6643 }
637cd060 6644
2f18d466 6645 if (ret) {
637cd060
ID
6646 rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6647 return ret;
e010dd0a
ID
6648 }
6649
637cd060
ID
6650 /*
6651 * The lock may have been released by now, unless automatic lock
6652 * transitions are disabled.
6653 */
6654 rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
e010dd0a
ID
6655 return 0;
6656}
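
Decoding the wait above, as a comment (return-value semantics of wait_for_completion_killable_timeout()):

	/*
	 *  > 0  completion fired; the real result is in acquire_err
	 * == 0  timed out; mapped to -ETIMEDOUT after cancelling the work
	 *  < 0  wait interrupted; the negative errno is returned as is
	 */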
6657
589d30e0
AE
6658/*
6659 * An rbd format 2 image has a unique identifier, distinct from the
6660 * name given to it by the user. Internally, that identifier is
6661 * what's used to specify the names of objects related to the image.
6662 *
6663 * A special "rbd id" object is used to map an rbd image name to its
6664 * id. If that object doesn't exist, then there is no v2 rbd image
6665 * with the supplied name.
6666 *
6667 * This function will fill in the given rbd_dev's image_id field if
6668 * it can be determined, and in that case will return 0. If any
6669 * error occurs, a negative errno will be returned and the rbd_dev's
6670 * image_id field will be unchanged (and should be NULL).
6671 */
6672static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6673{
6674 int ret;
6675 size_t size;
ecd4a68a 6676 CEPH_DEFINE_OID_ONSTACK(oid);
589d30e0 6677 void *response;
c0fba368 6678 char *image_id;
2f82ee54 6679
2c0d0a10
AE
6680 /*
6681 * When probing a parent image, the image id is already
6682 * known (and the image name likely is not). There's no
c0fba368
AE
6683 * need to fetch the image id again in this case. We
6684 * do still need to set the image format though.
2c0d0a10 6685 */
c0fba368
AE
6686 if (rbd_dev->spec->image_id) {
6687 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6688
2c0d0a10 6689 return 0;
c0fba368 6690 }
2c0d0a10 6691
589d30e0
AE
6692 /*
6693 * First, see if the format 2 image id file exists, and if
6694 * so, get the image's persistent id from it.
6695 */
ecd4a68a
ID
6696 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6697 rbd_dev->spec->image_name);
6698 if (ret)
6699 return ret;
6700
6701 dout("rbd id object name is %s\n", oid.name);
589d30e0
AE
6702
6703 /* Response will be an encoded string, which includes a length */
589d30e0
AE
6704 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6705 response = kzalloc(size, GFP_NOIO);
6706 if (!response) {
6707 ret = -ENOMEM;
6708 goto out;
6709 }
6710
c0fba368
AE
6711 /* If it doesn't exist we'll assume it's a format 1 image */
6712
ecd4a68a
ID
6713 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6714 "get_id", NULL, 0,
5435d206 6715 response, size);
36be9a76 6716 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
c0fba368
AE
6717 if (ret == -ENOENT) {
6718 image_id = kstrdup("", GFP_KERNEL);
6719 ret = image_id ? 0 : -ENOMEM;
6720 if (!ret)
6721 rbd_dev->image_format = 1;
7dd440c9 6722 } else if (ret >= 0) {
c0fba368
AE
6723 void *p = response;
6724
6725 image_id = ceph_extract_encoded_string(&p, p + ret,
979ed480 6726 NULL, GFP_NOIO);
461f758a 6727 ret = PTR_ERR_OR_ZERO(image_id);
c0fba368
AE
6728 if (!ret)
6729 rbd_dev->image_format = 2;
c0fba368
AE
6730 }
6731
6732 if (!ret) {
6733 rbd_dev->spec->image_id = image_id;
6734 dout("image_id is %s\n", image_id);
589d30e0
AE
6735 }
6736out:
6737 kfree(response);
ecd4a68a 6738 ceph_oid_destroy(&oid);
589d30e0
AE
6739 return ret;
6740}
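
A sketch of the invariant established above (illustrative helper, not part of the driver): an empty image_id marks a format 1 image, a non-empty one a format 2 image.

	static int rbd_format_from_image_id(const char *image_id)
	{
		return *image_id ? 2 : 1;
	}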
6741
3abef3b3
AE
6742/*
6743 * Undo whatever state changes are made by a v1 or v2 header info
6744 * call.
6745 */
6fd48b3b
AE
6746static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6747{
6748 struct rbd_image_header *header;
6749
e69b8d41 6750 rbd_dev_parent_put(rbd_dev);
22e8bd51 6751 rbd_object_map_free(rbd_dev);
da5ef6be 6752 rbd_dev_mapping_clear(rbd_dev);
6fd48b3b
AE
6753
6754 /* Free dynamic fields from the header, then zero it out */
6755
6756 header = &rbd_dev->header;
812164f8 6757 ceph_put_snap_context(header->snapc);
6fd48b3b
AE
6758 kfree(header->snap_sizes);
6759 kfree(header->snap_names);
6760 kfree(header->object_prefix);
6761 memset(header, 0, sizeof (*header));
6762}
6763
2df3fac7 6764static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
a30b71b9
AE
6765{
6766 int ret;
a30b71b9 6767
1e130199 6768 ret = rbd_dev_v2_object_prefix(rbd_dev);
57385b51 6769 if (ret)
b1b5402a
AE
6770 goto out_err;
6771
2df3fac7
AE
6772 /*
6773 * Get and check the features for the image. Currently the
6774 * features are assumed to never change.
6775 */
b1b5402a 6776 ret = rbd_dev_v2_features(rbd_dev);
57385b51 6777 if (ret)
9d475de5 6778 goto out_err;
35d489f9 6779
cc070d59
AE
6780 /* If the image supports fancy striping, get its parameters */
6781
6782 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6783 ret = rbd_dev_v2_striping_info(rbd_dev);
6784 if (ret < 0)
6785 goto out_err;
6786 }
a30b71b9 6787
7e97332e
ID
6788 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6789 ret = rbd_dev_v2_data_pool(rbd_dev);
6790 if (ret)
6791 goto out_err;
6792 }
6793
263423f8 6794 rbd_init_layout(rbd_dev);
35152979 6795 return 0;
263423f8 6796
9d475de5 6797out_err:
642a2537 6798 rbd_dev->header.features = 0;
1e130199
AE
6799 kfree(rbd_dev->header.object_prefix);
6800 rbd_dev->header.object_prefix = NULL;
9d475de5 6801 return ret;
a30b71b9
AE
6802}
6803
6d69bb53
ID
6804/*
6805 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6806 * rbd_dev_image_probe() recursion depth, which means it's also the
6807 * length of the already discovered part of the parent chain.
6808 */
6809static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
83a06263 6810{
2f82ee54 6811 struct rbd_device *parent = NULL;
124afba2
AE
6812 int ret;
6813
6814 if (!rbd_dev->parent_spec)
6815 return 0;
124afba2 6816
6d69bb53
ID
6817 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6818 pr_info("parent chain is too long (%d)\n", depth);
6819 ret = -EINVAL;
6820 goto out_err;
6821 }
6822
1643dfa4 6823 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
1f2c6651
ID
6824 if (!parent) {
6825 ret = -ENOMEM;
124afba2 6826 goto out_err;
1f2c6651
ID
6827 }
6828
6829 /*
6830 * Images related by parent/child relationships always share
6831 * rbd_client and spec/parent_spec, so bump their refcounts.
6832 */
6833 __rbd_get_client(rbd_dev->rbd_client);
6834 rbd_spec_get(rbd_dev->parent_spec);
124afba2 6835
39258aa2
ID
6836 __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
6837
6d69bb53 6838 ret = rbd_dev_image_probe(parent, depth);
124afba2
AE
6839 if (ret < 0)
6840 goto out_err;
1f2c6651 6841
124afba2 6842 rbd_dev->parent = parent;
a2acd00e 6843 atomic_set(&rbd_dev->parent_ref, 1);
124afba2 6844 return 0;
1f2c6651 6845
124afba2 6846out_err:
1f2c6651 6847 rbd_dev_unparent(rbd_dev);
1761b229 6848 rbd_dev_destroy(parent);
124afba2
AE
6849 return ret;
6850}
6851
5769ed0c
ID
6852static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6853{
6854 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5769ed0c
ID
6855 rbd_free_disk(rbd_dev);
6856 if (!single_major)
6857 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6858}
6859
811c6688
ID
6860/*
6861 * rbd_dev->header_rwsem must be locked for write and will be unlocked
6862 * upon return.
6863 */
200a6a8b 6864static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
124afba2 6865{
83a06263 6866 int ret;
d1cf5788 6867
9b60e70b 6868 /* Record our major and minor device numbers. */
83a06263 6869
9b60e70b
ID
6870 if (!single_major) {
6871 ret = register_blkdev(0, rbd_dev->name);
6872 if (ret < 0)
1643dfa4 6873 goto err_out_unlock;
9b60e70b
ID
6874
6875 rbd_dev->major = ret;
6876 rbd_dev->minor = 0;
6877 } else {
6878 rbd_dev->major = rbd_major;
6879 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6880 }
83a06263
AE
6881
6882 /* Set up the blkdev mapping. */
6883
6884 ret = rbd_init_disk(rbd_dev);
6885 if (ret)
6886 goto err_out_blkdev;
6887
f35a4dee 6888 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
39258aa2 6889 set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
f35a4dee 6890
5769ed0c 6891 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
f35a4dee 6892 if (ret)
da5ef6be 6893 goto err_out_disk;
83a06263 6894
129b79d4 6895 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
811c6688 6896 up_write(&rbd_dev->header_rwsem);
5769ed0c 6897 return 0;
2f82ee54 6898
83a06263
AE
6899err_out_disk:
6900 rbd_free_disk(rbd_dev);
6901err_out_blkdev:
9b60e70b
ID
6902 if (!single_major)
6903 unregister_blkdev(rbd_dev->major, rbd_dev->name);
811c6688
ID
6904err_out_unlock:
6905 up_write(&rbd_dev->header_rwsem);
83a06263
AE
6906 return ret;
6907}
6908
332bb12d
AE
6909static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6910{
6911 struct rbd_spec *spec = rbd_dev->spec;
c41d13a3 6912 int ret;
332bb12d
AE
6913
6914 /* Record the header object name for this rbd image. */
6915
6916 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
332bb12d 6917 if (rbd_dev->image_format == 1)
c41d13a3
ID
6918 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6919 spec->image_name, RBD_SUFFIX);
332bb12d 6920 else
c41d13a3
ID
6921 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6922 RBD_HEADER_PREFIX, spec->image_id);
332bb12d 6923
c41d13a3 6924 return ret;
332bb12d
AE
6925}
6926
b9ef2b88
ID
6927static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6928{
6929 if (!is_snap) {
6930 pr_info("image %s/%s%s%s does not exist\n",
6931 rbd_dev->spec->pool_name,
6932 rbd_dev->spec->pool_ns ?: "",
6933 rbd_dev->spec->pool_ns ? "/" : "",
6934 rbd_dev->spec->image_name);
6935 } else {
6936 pr_info("snap %s/%s%s%s@%s does not exist\n",
6937 rbd_dev->spec->pool_name,
6938 rbd_dev->spec->pool_ns ?: "",
6939 rbd_dev->spec->pool_ns ? "/" : "",
6940 rbd_dev->spec->image_name,
6941 rbd_dev->spec->snap_name);
6942 }
6943}
6944
200a6a8b
AE
6945static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6946{
b8776051 6947 if (!rbd_is_ro(rbd_dev))
fd22aef8 6948 rbd_unregister_watch(rbd_dev);
952c48b0
ID
6949
6950 rbd_dev_unprobe(rbd_dev);
6fd48b3b
AE
6951 rbd_dev->image_format = 0;
6952 kfree(rbd_dev->spec->image_id);
6953 rbd_dev->spec->image_id = NULL;
200a6a8b
AE
6954}
6955
/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 *
 * On success, returns with header_rwsem held for write if called
 * with @depth == 0.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	bool need_watch = !rbd_is_ro(rbd_dev);
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (need_watch) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				rbd_print_dne(rbd_dev, false);
			goto err_out_format;
		}
	}

	if (!depth)
		down_write(&rbd_dev->header_rwsem);

	ret = rbd_dev_header_info(rbd_dev);
	if (ret) {
		if (ret == -ENOENT && !need_watch)
			rbd_print_dne(rbd_dev, false);
		goto err_out_probe;
	}

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			rbd_print_dne(rbd_dev, true);
		goto err_out_probe;
	}

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_probe;

	if (rbd_is_snap(rbd_dev) &&
	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
		ret = rbd_object_map_load(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
	     rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	if (!depth)
		up_write(&rbd_dev->header_rwsem);
	if (need_watch)
		rbd_unregister_watch(rbd_dev);
	rbd_dev_unprobe(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}
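
/*
 * Probe-order sketch (illustration only): for a clone chain
 * mapped -> parent -> grandparent, the recursion through
 * rbd_dev_probe_parent() looks like
 *
 *	rbd_dev_image_probe(mapped, 0)		(watch + header_rwsem)
 *	    rbd_dev_image_probe(parent, 1)	(read-only, no watch)
 *	        rbd_dev_image_probe(grandparent, 2)
 *
 * Only the depth-0 (mapped) device registers a watch and returns with
 * header_rwsem held; parent devices are read-only and skip both.
 */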

static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	/* if we are mapping a snapshot it will be a read-only mapping */
	if (rbd_dev->opts->read_only ||
	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0)
		goto err_out_rbd_dev;

	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
			 rbd_dev->layout.object_size);
		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
	}

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	rc = rbd_add_acquire_lock(rbd_dev);
	if (rc)
		goto err_out_image_lock;

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}
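
/*
 * Usage sketch (user space, not driver code; monitor address, key and
 * names are placeholders): a map request is one write of
 * "<mon_addrs> <options> <pool> <image> [<snap>]" to the bus control
 * file:
 *
 *	int fd = open("/sys/bus/rbd/add", O_WRONLY);
 *	const char *spec = "1.2.3.4:6789 name=admin,secret=XXX rbd foo";
 *
 *	if (fd < 0 || write(fd, spec, strlen(spec)) < 0)
 *		perror("rbd add");
 *
 * On success the write consumes the whole buffer (rc = count above)
 * and a /dev/rbd<N> device node appears.
 */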

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count)
{
	return do_rbd_add(bus, buf, count);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}
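
/*
 * Teardown-order sketch (illustration only): for a chain
 * mapped -> parent -> grandparent, each pass of the loop above walks
 * down to the deepest ancestor and releases it first:
 *
 *	pass 1: release grandparent, parent->parent = NULL
 *	pass 2: release parent, mapped->parent = NULL
 *
 * so the chain is dismantled bottom-up, the reverse of how
 * rbd_dev_probe_parent() built it.
 */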

static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}
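
/*
 * Usage sketch (user space, placeholders only): unmapping is one
 * write of "<dev_id>" or "<dev_id> force" to the bus control file:
 *
 *	int fd = open("/sys/bus/rbd/remove", O_WRONLY);
 *
 *	if (fd < 0 || write(fd, "0 force", strlen("0 force")) < 0)
 *		perror("rbd remove");
 *
 * Without "force" the write fails with -EBUSY while the device is
 * still open; with it, the queue is frozen and marked dying before
 * the device is torn down.
 */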

static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int __init rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void __exit rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
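
/*
 * For reference, the resulting control files (per the store functions
 * above; the single-major variants accept the same input, while plain
 * add/remove return -EINVAL once single_major is set):
 *
 *	/sys/bus/rbd/add
 *	/sys/bus/rbd/remove
 *	/sys/bus/rbd/add_single_major
 *	/sys/bus/rbd/remove_single_major
 */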

static int __init rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}
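
/*
 * Note on the pattern above: KMEM_CACHE(struct_name, flags) wraps
 * kmem_cache_create() using the struct's own name, size and alignment.
 * A minimal sketch of the same create/destroy pairing for a
 * hypothetical struct:
 *
 *	struct example_req { u64 id; };
 *	static struct kmem_cache *example_cache;
 *
 *	example_cache = KMEM_CACHE(example_req, 0);	(init)
 *	...
 *	kmem_cache_destroy(example_cache);		(exit)
 *	example_cache = NULL;
 */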

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}
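
/*
 * Setup/teardown symmetry (summary of rbd_init() above and rbd_exit()
 * below): init builds slab caches -> workqueue -> optional single
 * major -> sysfs, the error path unwinds whatever had completed in
 * reverse, and rbd_exit() unwinds the same stages in reverse order
 * (plus destroying the dev id ida).
 */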

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");