]>
git.proxmox.com Git - ceph.git/blob - ceph/src/krbd.cc
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Inktank Storage, Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
24 #include <sys/sysmacros.h>
25 #include <sys/types.h>
28 #include "auth/KeyRing.h"
29 #include "common/errno.h"
30 #include "common/Formatter.h"
31 #include "common/module.h"
32 #include "common/run_cmd.h"
33 #include "common/safe_io.h"
34 #include "common/secret.h"
35 #include "common/TextTable.h"
36 #include "include/ceph_assert.h"
37 #include "include/stringify.h"
38 #include "include/krbd.h"
39 #include "mon/MonMap.h"
41 #include <blkid/blkid.h>
/*
 * Timeout handed to poll() while waiting for udev monitor events.
 * poll() takes milliseconds, so this is two minutes.
 */
static constexpr int POLL_TIMEOUT = 120000;
/*
 * Placeholder stored in krbd_spec::snap_name when no snapshot was
 * requested.  It is what gets matched against the "current_snap" sysfs
 * attribute for such specs.
 */
static const std::string SNAP_HEAD_NAME("-");

/*
 * Identifies an rbd image or snapshot by pool, namespace (may be empty),
 * image name and snapshot name.
 */
struct krbd_spec {
  std::string pool_name;
  std::string nspace_name;
  std::string image_name;
  std::string snap_name;

  /*
   * An empty snap_name is stored as SNAP_HEAD_NAME.
   *
   * Null pointers are tolerated: a null snap_name is handled like an
   * empty one and the other names become empty strings.  Constructing a
   * std::string from a null pointer (or dereferencing a null snap_name)
   * would be undefined behaviour.
   */
  krbd_spec(const char *pool_name, const char *nspace_name,
            const char *image_name, const char *snap_name)
      : pool_name(pool_name ? pool_name : ""),
        nspace_name(nspace_name ? nspace_name : ""),
        image_name(image_name ? image_name : ""),
        snap_name((snap_name && *snap_name) ? snap_name : SNAP_HEAD_NAME) { }

  /* Two specs are equal only if all four components are equal. */
  bool operator==(const krbd_spec& rhs) const {
    return pool_name == rhs.pool_name &&
           nspace_name == rhs.nspace_name &&
           image_name == rhs.image_name &&
           snap_name == rhs.snap_name;
  }
};
75 std::ostream
& operator<<(std::ostream
& os
, const krbd_spec
& spec
) {
76 os
<< spec
.pool_name
<< "/";
77 if (!spec
.nspace_name
.empty())
78 os
<< spec
.nspace_name
<< "/";
79 os
<< spec
.image_name
;
80 if (spec
.snap_name
!= SNAP_HEAD_NAME
)
81 os
<< "@" << spec
.snap_name
;
85 std::optional
<krbd_spec
> spec_from_dev(udev_device
*dev
) {
86 const char *pool_name
= udev_device_get_sysattr_value(dev
, "pool");
87 const char *nspace_name
= udev_device_get_sysattr_value(dev
, "pool_ns");
88 const char *image_name
= udev_device_get_sysattr_value(dev
, "name");
89 const char *snap_name
= udev_device_get_sysattr_value(dev
, "current_snap");
91 if (!pool_name
|| !image_name
|| !snap_name
)
94 return std::make_optional
<krbd_spec
>(
95 pool_name
, nspace_name
?: "", image_name
, snap_name
);
/*
 * Return the device node path for an rbd device id, i.e. "/dev/rbd"
 * followed by id.  id must not be null.
 */
static std::string get_kernel_rbd_name(const char *id)
{
  return std::string("/dev/rbd") + id;
}
103 static int sysfs_write_rbd(const char *which
, const string
& buf
)
105 const string s
= string("/sys/bus/rbd/") + which
;
106 const string t
= s
+ "_single_major";
111 * 'add' and 'add_single_major' interfaces are identical, but if rbd
112 * kernel module is new enough and is configured to use single-major
113 * scheme, 'add' is disabled in order to prevent old userspace from
114 * doing weird things at unmap time.
116 * Same goes for 'remove' vs 'remove_single_major'.
118 fd
= open(t
.c_str(), O_WRONLY
);
120 if (errno
== ENOENT
) {
121 fd
= open(s
.c_str(), O_WRONLY
);
129 r
= safe_write(fd
, buf
.c_str(), buf
.size());
135 static int sysfs_write_rbd_add(const string
& buf
)
137 return sysfs_write_rbd("add", buf
);
140 static int sysfs_write_rbd_remove(const string
& buf
)
142 return sysfs_write_rbd("remove", buf
);
/*
 * Return 1 if the loaded rbd module exposes the 'single_major'
 * parameter (used here as a proxy for the 'minor' sysfs attribute being
 * present), 0 otherwise.
 */
static int have_minor_attr(void)
{
  /*
   * 'minor' attribute was added as part of single_major merge, which
   * exposed the 'single_major' parameter.  'minor' is always present,
   * regardless of whether single-major scheme is turned on or not.
   *
   * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
   * this has to work with rbd.ko backported to various kernels.)
   */
  return access("/sys/module/rbd/parameters/single_major", F_OK) == 0;
}
158 static int build_map_buf(CephContext
*cct
, const krbd_spec
& spec
,
159 const char *options
, string
*pbuf
)
165 r
= monmap
.build_initial(cct
, false, cerr
);
169 list
<entity_addr_t
> mon_addr
;
170 monmap
.list_addrs(mon_addr
);
172 for (const auto &p
: mon_addr
) {
173 if (oss
.tellp() > 0) {
176 oss
<< p
.get_sockaddr();
179 oss
<< " name=" << cct
->_conf
->name
.get_id();
182 auto auth_client_required
=
183 cct
->_conf
.get_val
<std::string
>("auth_client_required");
184 if (auth_client_required
!= "none") {
185 r
= keyring
.from_ceph_context(cct
);
186 auto keyfile
= cct
->_conf
.get_val
<std::string
>("keyfile");
187 auto key
= cct
->_conf
.get_val
<std::string
>("key");
188 if (r
== -ENOENT
&& keyfile
.empty() && key
.empty())
191 cerr
<< "rbd: failed to get secret" << std::endl
;
197 string key_name
= string("client.") + cct
->_conf
->name
.get_id();
198 if (keyring
.get_secret(cct
->_conf
->name
, secret
)) {
200 secret
.encode_base64(secret_str
);
202 r
= set_kernel_secret(secret_str
.c_str(), key_name
.c_str());
205 cerr
<< "rbd: warning: secret has length 0" << std::endl
;
206 oss
<< ",key=" << key_name
;
207 } else if (r
== -ENODEV
|| r
== -ENOSYS
) {
208 // running against older kernel; fall back to secret= in options
209 oss
<< ",secret=" << secret_str
;
211 cerr
<< "rbd: failed to add secret '" << key_name
<< "' to kernel"
215 } else if (is_kernel_secret(key_name
.c_str())) {
216 oss
<< ",key=" << key_name
;
219 if (strcmp(options
, "") != 0)
220 oss
<< "," << options
;
221 if (!spec
.nspace_name
.empty())
222 oss
<< ",_pool_ns=" << spec
.nspace_name
;
224 oss
<< " " << spec
.pool_name
<< " " << spec
.image_name
<< " "
231 static int wait_for_udev_add(struct udev_monitor
*mon
, const krbd_spec
& spec
,
234 struct udev_device
*bus_dev
= nullptr;
235 std::vector
<struct udev_device
*> block_dev_vec
;
239 * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
240 * block device to show up. This is necessary because rbd devices
241 * and block devices aren't linked together in our sysfs layout.
244 struct pollfd fds
[1];
245 struct udev_device
*dev
;
247 fds
[0].fd
= udev_monitor_get_fd(mon
);
248 fds
[0].events
= POLLIN
;
249 r
= poll(fds
, 1, POLL_TIMEOUT
);
253 r
= (r
== 0) ? -ETIMEDOUT
: -errno
;
257 dev
= udev_monitor_receive_device(mon
);
261 if (strcmp(udev_device_get_action(dev
), "add") != 0)
264 if (strcmp(udev_device_get_subsystem(dev
), "rbd") == 0) {
266 auto cur_spec
= spec_from_dev(dev
);
267 if (cur_spec
&& *cur_spec
== spec
) {
272 } else if (strcmp(udev_device_get_subsystem(dev
), "block") == 0) {
273 block_dev_vec
.push_back(dev
);
278 udev_device_unref(dev
);
282 if (bus_dev
&& !block_dev_vec
.empty()) {
283 const char *major
= udev_device_get_sysattr_value(bus_dev
, "major");
284 const char *minor
= udev_device_get_sysattr_value(bus_dev
, "minor");
285 ceph_assert(!minor
^ have_minor_attr());
287 for (auto p
: block_dev_vec
) {
288 const char *this_major
= udev_device_get_property_value(p
, "MAJOR");
289 const char *this_minor
= udev_device_get_property_value(p
, "MINOR");
291 if (strcmp(this_major
, major
) == 0 &&
292 (!minor
|| strcmp(this_minor
, minor
) == 0)) {
293 string name
= get_kernel_rbd_name(udev_device_get_sysname(bus_dev
));
295 ceph_assert(strcmp(udev_device_get_devnode(p
), name
.c_str()) == 0);
305 udev_device_unref(bus_dev
);
308 for (auto p
: block_dev_vec
) {
309 udev_device_unref(p
);
315 static int do_map(struct udev
*udev
, const krbd_spec
& spec
, const string
& buf
,
318 struct udev_monitor
*mon
;
321 mon
= udev_monitor_new_from_netlink(udev
, "udev");
325 r
= udev_monitor_filter_add_match_subsystem_devtype(mon
, "rbd", nullptr);
329 r
= udev_monitor_filter_add_match_subsystem_devtype(mon
, "block", "disk");
333 r
= udev_monitor_enable_receiving(mon
);
337 r
= sysfs_write_rbd_add(buf
);
339 cerr
<< "rbd: sysfs write failed" << std::endl
;
343 r
= wait_for_udev_add(mon
, spec
, pname
);
345 cerr
<< "rbd: wait failed" << std::endl
;
350 udev_monitor_unref(mon
);
354 static int map_image(struct krbd_ctx
*ctx
, const krbd_spec
& spec
,
355 const char *options
, string
*pname
)
360 r
= build_map_buf(ctx
->cct
, spec
, options
, &buf
);
365 * Modprobe rbd kernel module. If it supports single-major device
366 * number allocation scheme, make sure it's turned on.
368 if (access("/sys/bus/rbd", F_OK
) != 0) {
369 const char *module_options
= NULL
;
370 if (module_has_param("rbd", "single_major"))
371 module_options
= "single_major=Y";
373 r
= module_load("rbd", module_options
);
375 cerr
<< "rbd: failed to load rbd kernel module (" << r
<< ")"
378 * Ignore the error: modprobe failing doesn't necessarily prevent
384 return do_map(ctx
->udev
, spec
, buf
, pname
);
387 static int devno_to_krbd_id(struct udev
*udev
, dev_t devno
, string
*pid
)
389 struct udev_enumerate
*enm
;
390 struct udev_list_entry
*l
;
391 struct udev_device
*dev
;
394 enm
= udev_enumerate_new(udev
);
398 r
= udev_enumerate_add_match_subsystem(enm
, "rbd");
402 r
= udev_enumerate_add_match_sysattr(enm
, "major",
403 stringify(major(devno
)).c_str());
407 if (have_minor_attr()) {
408 r
= udev_enumerate_add_match_sysattr(enm
, "minor",
409 stringify(minor(devno
)).c_str());
414 r
= udev_enumerate_scan_devices(enm
);
418 l
= udev_enumerate_get_list_entry(enm
);
424 /* make sure there is only one match */
425 ceph_assert(!udev_list_entry_get_next(l
));
427 dev
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(l
));
433 *pid
= udev_device_get_sysname(dev
);
435 udev_device_unref(dev
);
437 udev_enumerate_unref(enm
);
441 static int __enumerate_devices(struct udev
*udev
, const krbd_spec
& spec
,
442 bool match_nspace
, struct udev_enumerate
**penm
)
444 struct udev_enumerate
*enm
;
447 enm
= udev_enumerate_new(udev
);
451 r
= udev_enumerate_add_match_subsystem(enm
, "rbd");
455 r
= udev_enumerate_add_match_sysattr(enm
, "pool", spec
.pool_name
.c_str());
460 r
= udev_enumerate_add_match_sysattr(enm
, "pool_ns",
461 spec
.nspace_name
.c_str());
464 * Match _only_ devices that don't have pool_ns attribute.
465 * If the kernel supports namespaces, the result will be empty.
467 r
= udev_enumerate_add_nomatch_sysattr(enm
, "pool_ns", nullptr);
472 r
= udev_enumerate_add_match_sysattr(enm
, "name", spec
.image_name
.c_str());
476 r
= udev_enumerate_add_match_sysattr(enm
, "current_snap",
477 spec
.snap_name
.c_str());
481 r
= udev_enumerate_scan_devices(enm
);
489 udev_enumerate_unref(enm
);
493 static int enumerate_devices(struct udev
*udev
, const krbd_spec
& spec
,
494 struct udev_enumerate
**penm
)
496 struct udev_enumerate
*enm
;
499 r
= __enumerate_devices(udev
, spec
, true, &enm
);
504 * If no namespace is set, try again with match_nspace=false to
505 * handle older kernels. On a newer kernel the result will remain
506 * the same (i.e. empty).
508 if (!udev_enumerate_get_list_entry(enm
) && spec
.nspace_name
.empty()) {
509 udev_enumerate_unref(enm
);
510 r
= __enumerate_devices(udev
, spec
, false, &enm
);
519 static int spec_to_devno_and_krbd_id(struct udev
*udev
, const krbd_spec
& spec
,
520 dev_t
*pdevno
, string
*pid
)
522 struct udev_enumerate
*enm
;
523 struct udev_list_entry
*l
;
524 struct udev_device
*dev
;
525 unsigned int maj
, min
= 0;
529 r
= enumerate_devices(udev
, spec
, &enm
);
533 l
= udev_enumerate_get_list_entry(enm
);
539 dev
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(l
));
545 maj
= strict_strtoll(udev_device_get_sysattr_value(dev
, "major"), 10, &err
);
547 cerr
<< "rbd: couldn't parse major: " << err
<< std::endl
;
551 if (have_minor_attr()) {
552 min
= strict_strtoll(udev_device_get_sysattr_value(dev
, "minor"), 10, &err
);
554 cerr
<< "rbd: couldn't parse minor: " << err
<< std::endl
;
561 * If an image is mapped more than once don't bother trying to unmap
562 * all devices - let users run unmap the same number of times they
565 if (udev_list_entry_get_next(l
))
566 cerr
<< "rbd: " << spec
<< ": mapped more than once, unmapping "
567 << get_kernel_rbd_name(udev_device_get_sysname(dev
))
568 << " only" << std::endl
;
570 *pdevno
= makedev(maj
, min
);
571 *pid
= udev_device_get_sysname(dev
);
574 udev_device_unref(dev
);
576 udev_enumerate_unref(enm
);
580 static string
build_unmap_buf(const string
& id
, const char *options
)
583 if (strcmp(options
, "") != 0) {
590 static int wait_for_udev_remove(struct udev_monitor
*mon
, dev_t devno
)
593 struct pollfd fds
[1];
594 struct udev_device
*dev
;
597 fds
[0].fd
= udev_monitor_get_fd(mon
);
598 fds
[0].events
= POLLIN
;
599 r
= poll(fds
, 1, POLL_TIMEOUT
);
606 dev
= udev_monitor_receive_device(mon
);
610 if (strcmp(udev_device_get_action(dev
), "remove") == 0 &&
611 udev_device_get_devnum(dev
) == devno
) {
612 udev_device_unref(dev
);
616 udev_device_unref(dev
);
622 static int do_unmap(struct udev
*udev
, dev_t devno
, const string
& buf
)
624 struct udev_monitor
*mon
;
627 mon
= udev_monitor_new_from_netlink(udev
, "udev");
631 r
= udev_monitor_filter_add_match_subsystem_devtype(mon
, "block", "disk");
635 r
= udev_monitor_enable_receiving(mon
);
640 * On final device close(), kernel sends a block change event, in
641 * response to which udev apparently runs blkid on the device. This
642 * makes unmap fail with EBUSY, if issued right after final close().
643 * Try to circumvent this with a retry before turning to udev.
645 for (int tries
= 0; ; tries
++) {
646 r
= sysfs_write_rbd_remove(buf
);
649 } else if (r
== -EBUSY
&& tries
< 2) {
654 * libudev does not provide the "wait until the queue is empty"
655 * API or the sufficient amount of primitives to build it from.
657 string err
= run_cmd("udevadm", "settle", "--timeout", "10", (char*)NULL
);
659 cerr
<< "rbd: " << err
<< std::endl
;
662 cerr
<< "rbd: sysfs write failed" << std::endl
;
667 r
= wait_for_udev_remove(mon
, devno
);
669 cerr
<< "rbd: wait failed" << std::endl
;
674 udev_monitor_unref(mon
);
678 static int unmap_image(struct krbd_ctx
*ctx
, const char *devnode
,
682 dev_t wholedevno
= 0;
686 if (stat(devnode
, &sb
) < 0 || !S_ISBLK(sb
.st_mode
)) {
687 cerr
<< "rbd: '" << devnode
<< "' is not a block device" << std::endl
;
691 r
= blkid_devno_to_wholedisk(sb
.st_rdev
, NULL
, 0, &wholedevno
);
693 cerr
<< "rbd: couldn't compute wholedevno: " << cpp_strerror(r
)
696 * Ignore the error: we are given whole disks most of the time, and
697 * if it turns out this is a partition we will fail later anyway.
699 wholedevno
= sb
.st_rdev
;
702 r
= devno_to_krbd_id(ctx
->udev
, wholedevno
, &id
);
705 cerr
<< "rbd: '" << devnode
<< "' is not an rbd device" << std::endl
;
711 return do_unmap(ctx
->udev
, wholedevno
, build_unmap_buf(id
, options
));
714 static int unmap_image(struct krbd_ctx
*ctx
, const krbd_spec
& spec
,
721 r
= spec_to_devno_and_krbd_id(ctx
->udev
, spec
, &devno
, &id
);
724 cerr
<< "rbd: " << spec
<< ": not a mapped image or snapshot"
731 return do_unmap(ctx
->udev
, devno
, build_unmap_buf(id
, options
));
734 static bool dump_one_image(Formatter
*f
, TextTable
*tbl
,
735 struct udev_device
*dev
)
737 const char *id
= udev_device_get_sysname(dev
);
738 auto spec
= spec_from_dev(dev
);
739 string kname
= get_kernel_rbd_name(id
);
745 f
->open_object_section("device");
746 f
->dump_string("id", id
);
747 f
->dump_string("pool", spec
->pool_name
);
748 f
->dump_string("namespace", spec
->nspace_name
);
749 f
->dump_string("name", spec
->image_name
);
750 f
->dump_string("snap", spec
->snap_name
);
751 f
->dump_string("device", kname
);
754 *tbl
<< id
<< spec
->pool_name
<< spec
->nspace_name
<< spec
->image_name
755 << spec
->snap_name
<< kname
<< TextTable::endrow
;
761 static int do_dump(struct udev
*udev
, Formatter
*f
, TextTable
*tbl
)
763 struct udev_enumerate
*enm
;
764 struct udev_list_entry
*l
= NULL
;
765 bool have_output
= false;
768 enm
= udev_enumerate_new(udev
);
772 r
= udev_enumerate_add_match_subsystem(enm
, "rbd");
776 r
= udev_enumerate_scan_devices(enm
);
780 udev_list_entry_foreach(l
, udev_enumerate_get_list_entry(enm
)) {
781 struct udev_device
*dev
;
783 dev
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(l
));
785 have_output
|= dump_one_image(f
, tbl
, dev
);
786 udev_device_unref(dev
);
792 udev_enumerate_unref(enm
);
796 int dump_images(struct krbd_ctx
*ctx
, Formatter
*f
)
802 f
->open_array_section("devices");
804 tbl
.define_column("id", TextTable::LEFT
, TextTable::LEFT
);
805 tbl
.define_column("pool", TextTable::LEFT
, TextTable::LEFT
);
806 tbl
.define_column("namespace", TextTable::LEFT
, TextTable::LEFT
);
807 tbl
.define_column("image", TextTable::LEFT
, TextTable::LEFT
);
808 tbl
.define_column("snap", TextTable::LEFT
, TextTable::LEFT
);
809 tbl
.define_column("device", TextTable::LEFT
, TextTable::LEFT
);
812 r
= do_dump(ctx
->udev
, f
, &tbl
);
825 static int is_mapped_image(struct udev
*udev
, const krbd_spec
& spec
,
828 struct udev_enumerate
*enm
;
829 struct udev_list_entry
*l
;
832 r
= enumerate_devices(udev
, spec
, &enm
);
836 l
= udev_enumerate_get_list_entry(enm
);
838 struct udev_device
*dev
;
840 dev
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(l
));
847 *pname
= get_kernel_rbd_name(udev_device_get_sysname(dev
));
848 udev_device_unref(dev
);
850 r
= 0; /* not mapped */
854 udev_enumerate_unref(enm
);
858 extern "C" int krbd_create_from_context(rados_config_t cct
,
859 struct krbd_ctx
**pctx
)
861 struct krbd_ctx
*ctx
= new struct krbd_ctx();
863 ctx
->cct
= reinterpret_cast<CephContext
*>(cct
);
864 ctx
->udev
= udev_new();
874 extern "C" void krbd_destroy(struct krbd_ctx
*ctx
)
879 udev_unref(ctx
->udev
);
884 extern "C" int krbd_map(struct krbd_ctx
*ctx
,
885 const char *pool_name
,
886 const char *nspace_name
,
887 const char *image_name
,
888 const char *snap_name
,
892 krbd_spec
spec(pool_name
, nspace_name
, image_name
, snap_name
);
897 r
= map_image(ctx
, spec
, options
, &name
);
901 devnode
= strdup(name
.c_str());
909 extern "C" int krbd_unmap(struct krbd_ctx
*ctx
, const char *devnode
,
912 return unmap_image(ctx
, devnode
, options
);
915 extern "C" int krbd_unmap_by_spec(struct krbd_ctx
*ctx
,
916 const char *pool_name
,
917 const char *nspace_name
,
918 const char *image_name
,
919 const char *snap_name
,
922 krbd_spec
spec(pool_name
, nspace_name
, image_name
, snap_name
);
923 return unmap_image(ctx
, spec
, options
);
926 int krbd_showmapped(struct krbd_ctx
*ctx
, Formatter
*f
)
928 return dump_images(ctx
, f
);
931 extern "C" int krbd_is_mapped(struct krbd_ctx
*ctx
,
932 const char *pool_name
,
933 const char *nspace_name
,
934 const char *image_name
,
935 const char *snap_name
,
938 krbd_spec
spec(pool_name
, nspace_name
, image_name
, snap_name
);
943 r
= is_mapped_image(ctx
->udev
, spec
, &name
);
944 if (r
<= 0) /* error or not mapped */
947 devnode
= strdup(name
.c_str());