]>
git.proxmox.com Git - ceph.git/blob - ceph/src/krbd.cc
bf7e25583691835a3525194e41b9887e168f82b5
2 * Ceph - scalable distributed file system
4 * Copyright (C) 2014 Inktank Storage, Inc.
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
24 #include <sys/types.h>
27 #include "auth/KeyRing.h"
28 #include "common/errno.h"
29 #include "common/Formatter.h"
30 #include "common/module.h"
31 #include "common/run_cmd.h"
32 #include "common/safe_io.h"
33 #include "common/secret.h"
34 #include "common/TextTable.h"
35 #include "include/assert.h"
36 #include "include/stringify.h"
37 #include "include/krbd.h"
38 #include "mon/MonMap.h"
40 #include <blkid/blkid.h>
45 const static int POLL_TIMEOUT
=120000;
52 static string
get_kernel_rbd_name(const char *id
)
54 return string("/dev/rbd") + id
;
57 static int sysfs_write_rbd(const char *which
, const string
& buf
)
59 const string s
= string("/sys/bus/rbd/") + which
;
60 const string t
= s
+ "_single_major";
65 * 'add' and 'add_single_major' interfaces are identical, but if rbd
66 * kernel module is new enough and is configured to use single-major
67 * scheme, 'add' is disabled in order to prevent old userspace from
68 * doing weird things at unmap time.
70 * Same goes for 'remove' vs 'remove_single_major'.
72 fd
= open(t
.c_str(), O_WRONLY
);
74 if (errno
== ENOENT
) {
75 fd
= open(s
.c_str(), O_WRONLY
);
83 r
= safe_write(fd
, buf
.c_str(), buf
.size());
89 static int sysfs_write_rbd_add(const string
& buf
)
91 return sysfs_write_rbd("add", buf
);
94 static int sysfs_write_rbd_remove(const string
& buf
)
96 return sysfs_write_rbd("remove", buf
);
99 static int have_minor_attr(void)
102 * 'minor' attribute was added as part of single_major merge, which
103 * exposed the 'single_major' parameter. 'minor' is always present,
104 * regardless of whether single-major scheme is turned on or not.
106 * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
107 * this has to work with rbd.ko backported to various kernels.)
109 return access("/sys/module/rbd/parameters/single_major", F_OK
) == 0;
112 static int build_map_buf(CephContext
*cct
, const char *pool
, const char *image
,
113 const char *snap
, const char *options
, string
*pbuf
)
119 r
= monmap
.build_initial(cct
, cerr
);
123 list
<entity_addr_t
> mon_addr
;
124 monmap
.list_addrs(mon_addr
);
126 for (const auto &p
: mon_addr
) {
127 if (oss
.tellp() > 0) {
130 oss
<< p
.get_sockaddr();
133 oss
<< " name=" << cct
->_conf
->name
.get_id();
136 r
= keyring
.from_ceph_context(cct
);
137 if (r
== -ENOENT
&& !(cct
->_conf
->keyfile
.length() ||
138 cct
->_conf
->key
.length()))
141 cerr
<< "rbd: failed to get secret" << std::endl
;
146 string key_name
= string("client.") + cct
->_conf
->name
.get_id();
147 if (keyring
.get_secret(cct
->_conf
->name
, secret
)) {
149 secret
.encode_base64(secret_str
);
151 r
= set_kernel_secret(secret_str
.c_str(), key_name
.c_str());
154 cerr
<< "rbd: warning: secret has length 0" << std::endl
;
155 oss
<< ",key=" << key_name
;
156 } else if (r
== -ENODEV
|| r
== -ENOSYS
) {
157 // running against older kernel; fall back to secret= in options
158 oss
<< ",secret=" << secret_str
;
160 cerr
<< "rbd: failed to add secret '" << key_name
<< "' to kernel"
164 } else if (is_kernel_secret(key_name
.c_str())) {
165 oss
<< ",key=" << key_name
;
168 if (strcmp(options
, "") != 0)
169 oss
<< "," << options
;
171 oss
<< " " << pool
<< " " << image
<< " " << snap
;
177 static int wait_for_udev_add(struct udev_monitor
*mon
, const char *pool
,
178 const char *image
, const char *snap
,
181 struct udev_device
*bus_dev
= NULL
;
184 * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
185 * block device to show up. This is necessary because rbd devices
186 * and block devices aren't linked together in our sysfs layout.
189 struct pollfd fds
[1];
190 struct udev_device
*dev
;
192 fds
[0].fd
= udev_monitor_get_fd(mon
);
193 fds
[0].events
= POLLIN
;
194 if (poll(fds
, 1, POLL_TIMEOUT
) < 0)
197 dev
= udev_monitor_receive_device(mon
);
201 if (strcmp(udev_device_get_action(dev
), "add") != 0)
205 if (strcmp(udev_device_get_subsystem(dev
), "rbd") == 0) {
206 const char *this_pool
= udev_device_get_sysattr_value(dev
, "pool");
207 const char *this_image
= udev_device_get_sysattr_value(dev
, "name");
208 const char *this_snap
= udev_device_get_sysattr_value(dev
,
211 if (this_pool
&& strcmp(this_pool
, pool
) == 0 &&
212 this_image
&& strcmp(this_image
, image
) == 0 &&
213 this_snap
&& strcmp(this_snap
, snap
) == 0) {
219 if (strcmp(udev_device_get_subsystem(dev
), "block") == 0) {
220 const char *major
= udev_device_get_sysattr_value(bus_dev
, "major");
221 const char *minor
= udev_device_get_sysattr_value(bus_dev
, "minor");
222 const char *this_major
= udev_device_get_property_value(dev
, "MAJOR");
223 const char *this_minor
= udev_device_get_property_value(dev
, "MINOR");
225 assert(!minor
^ have_minor_attr());
227 if (strcmp(this_major
, major
) == 0 &&
228 (!minor
|| strcmp(this_minor
, minor
) == 0)) {
229 string name
= get_kernel_rbd_name(udev_device_get_sysname(bus_dev
));
231 assert(strcmp(udev_device_get_devnode(dev
), name
.c_str()) == 0);
234 udev_device_unref(dev
);
235 udev_device_unref(bus_dev
);
242 udev_device_unref(dev
);
248 static int do_map(struct udev
*udev
, const char *pool
, const char *image
,
249 const char *snap
, const string
& buf
, string
*pname
)
251 struct udev_monitor
*mon
;
254 mon
= udev_monitor_new_from_netlink(udev
, "udev");
258 r
= udev_monitor_filter_add_match_subsystem_devtype(mon
, "rbd", NULL
);
262 r
= udev_monitor_filter_add_match_subsystem_devtype(mon
, "block", "disk");
266 r
= udev_monitor_enable_receiving(mon
);
270 r
= sysfs_write_rbd_add(buf
);
272 cerr
<< "rbd: sysfs write failed" << std::endl
;
276 r
= wait_for_udev_add(mon
, pool
, image
, snap
, pname
);
278 cerr
<< "rbd: wait failed" << std::endl
;
283 udev_monitor_unref(mon
);
287 static int map_image(struct krbd_ctx
*ctx
, const char *pool
, const char *image
,
288 const char *snap
, const char *options
, string
*pname
)
293 if (strcmp(snap
, "") == 0)
296 r
= build_map_buf(ctx
->cct
, pool
, image
, snap
, options
, &buf
);
301 * Modprobe rbd kernel module. If it supports single-major device
302 * number allocation scheme, make sure it's turned on.
304 if (access("/sys/bus/rbd", F_OK
) != 0) {
305 const char *module_options
= NULL
;
306 if (module_has_param("rbd", "single_major"))
307 module_options
= "single_major=Y";
309 r
= module_load("rbd", module_options
);
311 cerr
<< "rbd: failed to load rbd kernel module (" << r
<< ")"
314 * Ignore the error: modprobe failing doesn't necessarily prevent
320 return do_map(ctx
->udev
, pool
, image
, snap
, buf
, pname
);
323 static int devno_to_krbd_id(struct udev
*udev
, dev_t devno
, string
*pid
)
325 struct udev_enumerate
*enm
;
326 struct udev_list_entry
*l
;
327 struct udev_device
*dev
;
330 enm
= udev_enumerate_new(udev
);
334 r
= udev_enumerate_add_match_subsystem(enm
, "rbd");
338 r
= udev_enumerate_add_match_sysattr(enm
, "major",
339 stringify(major(devno
)).c_str());
343 if (have_minor_attr()) {
344 r
= udev_enumerate_add_match_sysattr(enm
, "minor",
345 stringify(minor(devno
)).c_str());
350 r
= udev_enumerate_scan_devices(enm
);
354 l
= udev_enumerate_get_list_entry(enm
);
360 /* make sure there is only one match */
361 assert(!udev_list_entry_get_next(l
));
363 dev
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(l
));
369 *pid
= udev_device_get_sysname(dev
);
371 udev_device_unref(dev
);
373 udev_enumerate_unref(enm
);
377 static int spec_to_devno_and_krbd_id(struct udev
*udev
, const char *pool
,
378 const char *image
, const char *snap
,
379 dev_t
*pdevno
, string
*pid
)
381 struct udev_enumerate
*enm
;
382 struct udev_list_entry
*l
;
383 struct udev_device
*dev
;
384 unsigned int maj
, min
= 0;
388 enm
= udev_enumerate_new(udev
);
392 r
= udev_enumerate_add_match_subsystem(enm
, "rbd");
396 r
= udev_enumerate_add_match_sysattr(enm
, "pool", pool
);
400 r
= udev_enumerate_add_match_sysattr(enm
, "name", image
);
404 r
= udev_enumerate_add_match_sysattr(enm
, "current_snap", snap
);
408 r
= udev_enumerate_scan_devices(enm
);
412 l
= udev_enumerate_get_list_entry(enm
);
418 dev
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(l
));
424 maj
= strict_strtoll(udev_device_get_sysattr_value(dev
, "major"), 10, &err
);
426 cerr
<< "rbd: couldn't parse major: " << err
<< std::endl
;
430 if (have_minor_attr()) {
431 min
= strict_strtoll(udev_device_get_sysattr_value(dev
, "minor"), 10, &err
);
433 cerr
<< "rbd: couldn't parse minor: " << err
<< std::endl
;
440 * If an image is mapped more than once don't bother trying to unmap
441 * all devices - let users run unmap the same number of times they
444 if (udev_list_entry_get_next(l
))
445 cerr
<< "rbd: " << pool
<< "/" << image
<< "@" << snap
446 << ": mapped more than once, unmapping "
447 << get_kernel_rbd_name(udev_device_get_sysname(dev
))
448 << " only" << std::endl
;
450 *pdevno
= makedev(maj
, min
);
451 *pid
= udev_device_get_sysname(dev
);
454 udev_device_unref(dev
);
456 udev_enumerate_unref(enm
);
460 static string
build_unmap_buf(const string
& id
, const char *options
)
463 if (strcmp(options
, "") != 0) {
470 static int wait_for_udev_remove(struct udev_monitor
*mon
, dev_t devno
)
473 struct pollfd fds
[1];
474 struct udev_device
*dev
;
476 fds
[0].fd
= udev_monitor_get_fd(mon
);
477 fds
[0].events
= POLLIN
;
478 if (poll(fds
, 1, POLL_TIMEOUT
) < 0)
481 dev
= udev_monitor_receive_device(mon
);
485 if (strcmp(udev_device_get_action(dev
), "remove") == 0 &&
486 udev_device_get_devnum(dev
) == devno
) {
487 udev_device_unref(dev
);
491 udev_device_unref(dev
);
497 static int do_unmap(struct udev
*udev
, dev_t devno
, const string
& buf
)
499 struct udev_monitor
*mon
;
502 mon
= udev_monitor_new_from_netlink(udev
, "udev");
506 r
= udev_monitor_filter_add_match_subsystem_devtype(mon
, "block", "disk");
510 r
= udev_monitor_enable_receiving(mon
);
515 * On final device close(), kernel sends a block change event, in
516 * response to which udev apparently runs blkid on the device. This
517 * makes unmap fail with EBUSY, if issued right after final close().
518 * Try to circumvent this with a retry before turning to udev.
520 for (int tries
= 0; ; tries
++) {
521 r
= sysfs_write_rbd_remove(buf
);
524 } else if (r
== -EBUSY
&& tries
< 2) {
529 * libudev does not provide the "wait until the queue is empty"
530 * API or the sufficient amount of primitives to build it from.
532 string err
= run_cmd("udevadm", "settle", "--timeout", "10", NULL
);
534 cerr
<< "rbd: " << err
<< std::endl
;
537 cerr
<< "rbd: sysfs write failed" << std::endl
;
542 r
= wait_for_udev_remove(mon
, devno
);
544 cerr
<< "rbd: wait failed" << std::endl
;
549 udev_monitor_unref(mon
);
553 static int unmap_image(struct krbd_ctx
*ctx
, const char *devnode
,
557 dev_t wholedevno
= 0;
561 if (stat(devnode
, &sb
) < 0 || !S_ISBLK(sb
.st_mode
)) {
562 cerr
<< "rbd: '" << devnode
<< "' is not a block device" << std::endl
;
566 r
= blkid_devno_to_wholedisk(sb
.st_rdev
, NULL
, 0, &wholedevno
);
568 cerr
<< "rbd: couldn't compute wholedevno: " << cpp_strerror(r
)
571 * Ignore the error: we are given whole disks most of the time, and
572 * if it turns out this is a partition we will fail later anyway.
574 wholedevno
= sb
.st_rdev
;
577 r
= devno_to_krbd_id(ctx
->udev
, wholedevno
, &id
);
580 cerr
<< "rbd: '" << devnode
<< "' is not an rbd device" << std::endl
;
586 return do_unmap(ctx
->udev
, wholedevno
, build_unmap_buf(id
, options
));
589 static int unmap_image(struct krbd_ctx
*ctx
, const char *pool
,
590 const char *image
, const char *snap
,
600 r
= spec_to_devno_and_krbd_id(ctx
->udev
, pool
, image
, snap
, &devno
, &id
);
603 cerr
<< "rbd: " << pool
<< "/" << image
<< "@" << snap
604 << ": not a mapped image or snapshot" << std::endl
;
610 return do_unmap(ctx
->udev
, devno
, build_unmap_buf(id
, options
));
613 static bool dump_one_image(Formatter
*f
, TextTable
*tbl
,
614 struct udev_device
*dev
)
616 const char *id
= udev_device_get_sysname(dev
);
617 const char *pool
= udev_device_get_sysattr_value(dev
, "pool");
618 const char *image
= udev_device_get_sysattr_value(dev
, "name");
619 const char *snap
= udev_device_get_sysattr_value(dev
, "current_snap");
620 string kname
= get_kernel_rbd_name(id
);
622 if (!pool
|| !image
|| !snap
)
626 f
->open_object_section(id
);
627 f
->dump_string("pool", pool
);
628 f
->dump_string("name", image
);
629 f
->dump_string("snap", snap
);
630 f
->dump_string("device", kname
);
633 *tbl
<< id
<< pool
<< image
<< snap
<< kname
<< TextTable::endrow
;
639 static int do_dump(struct udev
*udev
, Formatter
*f
, TextTable
*tbl
)
641 struct udev_enumerate
*enm
;
642 struct udev_list_entry
*l
;
643 bool have_output
= false;
646 enm
= udev_enumerate_new(udev
);
650 r
= udev_enumerate_add_match_subsystem(enm
, "rbd");
654 r
= udev_enumerate_scan_devices(enm
);
658 udev_list_entry_foreach(l
, udev_enumerate_get_list_entry(enm
)) {
659 struct udev_device
*dev
;
661 dev
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(l
));
663 have_output
|= dump_one_image(f
, tbl
, dev
);
664 udev_device_unref(dev
);
670 udev_enumerate_unref(enm
);
674 int dump_images(struct krbd_ctx
*ctx
, Formatter
*f
)
680 f
->open_object_section("devices");
682 tbl
.define_column("id", TextTable::LEFT
, TextTable::LEFT
);
683 tbl
.define_column("pool", TextTable::LEFT
, TextTable::LEFT
);
684 tbl
.define_column("image", TextTable::LEFT
, TextTable::LEFT
);
685 tbl
.define_column("snap", TextTable::LEFT
, TextTable::LEFT
);
686 tbl
.define_column("device", TextTable::LEFT
, TextTable::LEFT
);
689 r
= do_dump(ctx
->udev
, f
, &tbl
);
702 extern "C" int krbd_create_from_context(rados_config_t cct
,
703 struct krbd_ctx
**pctx
)
705 struct krbd_ctx
*ctx
= new struct krbd_ctx();
707 ctx
->cct
= reinterpret_cast<CephContext
*>(cct
);
708 ctx
->udev
= udev_new();
718 extern "C" void krbd_destroy(struct krbd_ctx
*ctx
)
723 udev_unref(ctx
->udev
);
728 extern "C" int krbd_map(struct krbd_ctx
*ctx
, const char *pool
,
729 const char *image
, const char *snap
,
730 const char *options
, char **pdevnode
)
736 r
= map_image(ctx
, pool
, image
, snap
, options
, &name
);
740 devnode
= strdup(name
.c_str());
748 extern "C" int krbd_unmap(struct krbd_ctx
*ctx
, const char *devnode
,
751 return unmap_image(ctx
, devnode
, options
);
754 extern "C" int krbd_unmap_by_spec(struct krbd_ctx
*ctx
, const char *pool
,
755 const char *image
, const char *snap
,
758 return unmap_image(ctx
, pool
, image
, snap
, options
);
761 int krbd_showmapped(struct krbd_ctx
*ctx
, Formatter
*f
)
763 return dump_images(ctx
, f
);