#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
+#include <tuple>
#include <unistd.h>
+#include <utility>
#include "auth/KeyRing.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/secret.h"
#include "common/TextTable.h"
+#include "common/Thread.h"
#include "include/ceph_assert.h"
#include "include/stringify.h"
#include "include/krbd.h"
#include <blkid/blkid.h>
#include <libudev.h>
-
-const static int POLL_TIMEOUT=120000;
+static const int UDEV_BUF_SIZE = 1 << 20; /* doubled to 2M (SO_RCVBUFFORCE) */
struct krbd_ctx {
CephContext *cct;
return 0;
}
-static int wait_for_udev_add(struct udev_monitor *mon, const krbd_spec& spec,
- string *pname)
+/*
+ * Wait on two event sources at once: sysfs_r_fd, from which a single
+ * int result is read with safe_read_exact(), and the udev monitor
+ * mon, whose devices are handed one by one to udev_device_handler.
+ *
+ * A negative int read from sysfs_r_fd ends the wait immediately with
+ * <that value, false>.  A true return from udev_device_handler(dev)
+ * records a udev result of 0; a udev_monitor_receive_device() failure
+ * with errno other than EINTR/EAGAIN records -errno instead.  Once
+ * both a non-negative sysfs result and a udev result are in, the
+ * function returns <udev result, true>.
+ *
+ * INT_MAX in sysfs_r/udev_r means "no result yet".  When one side has
+ * delivered, its pollfd entry is disabled and polling continues for
+ * the other side only.  poll() is called without a timeout; poll()
+ * and safe_read_exact() failures abort.
+ *
+ * dev is never unref'ed here, so udev_device_handler is expected to
+ * consume the reference it is given.
+ *
+ * Return:
+ * <kernel error, false> - didn't map
+ * <0 or udev error, true> - mapped
+ */
+template <typename F>
+static std::pair<int, bool> wait_for_mapping(int sysfs_r_fd, udev_monitor *mon,
+ F udev_device_handler)
{
- struct udev_device *bus_dev = nullptr;
- std::vector<struct udev_device*> block_dev_vec;
+ struct pollfd fds[2];
+ int sysfs_r = INT_MAX, udev_r = INT_MAX; /* INT_MAX: no result yet */
int r;
- /*
- * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
- * block device to show up. This is necessary because rbd devices
- * and block devices aren't linked together in our sysfs layout.
- */
+ fds[0].fd = sysfs_r_fd;
+ fds[0].events = POLLIN;
+ fds[1].fd = udev_monitor_get_fd(mon);
+ fds[1].events = POLLIN;
+
for (;;) {
- struct pollfd fds[1];
- struct udev_device *dev;
+ if (poll(fds, 2, -1) < 0) {
+ ceph_abort_msgf("poll failed: %d", -errno);
+ }
- fds[0].fd = udev_monitor_get_fd(mon);
- fds[0].events = POLLIN;
- r = poll(fds, 1, POLL_TIMEOUT);
- if (r > 0) {
- r = 0;
- } else {
- r = (r == 0) ? -ETIMEDOUT : -errno;
- break;
+ if (fds[0].revents) {
+ r = safe_read_exact(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
+ if (r < 0) {
+ ceph_abort_msgf("safe_read_exact failed: %d", r);
+ }
+ if (sysfs_r < 0) {
+ return std::make_pair(sysfs_r, false);
+ }
+ if (udev_r != INT_MAX) {
+ ceph_assert(!sysfs_r);
+ return std::make_pair(udev_r, true);
+ }
+ fds[0].fd = -1; /* negative fd: poll() ignores this entry from now on */
}
- dev = udev_monitor_receive_device(mon);
- if (!dev)
- continue;
+ if (fds[1].revents) {
+ for (;;) {
+ struct udev_device *dev;
+
+ dev = udev_monitor_receive_device(mon);
+ if (!dev) {
+ if (errno != EINTR && errno != EAGAIN) {
+ udev_r = -errno;
+ if (sysfs_r != INT_MAX) {
+ ceph_assert(!sysfs_r);
+ return std::make_pair(udev_r, true);
+ }
+ fds[1].fd = -1; /* udev side is done (failed); wait for sysfs only */
+ }
+ break; /* leave the receive loop and go back to poll() */
+ }
+ if (udev_device_handler(dev)) {
+ udev_r = 0;
+ if (sysfs_r != INT_MAX) {
+ ceph_assert(!sysfs_r);
+ return std::make_pair(udev_r, true);
+ }
+ fds[1].fd = -1; /* udev side is done (matched); wait for sysfs only */
+ break;
+ }
+ }
+ }
+ }
+}
- if (strcmp(udev_device_get_action(dev), "add") != 0)
- goto next;
+class UdevMapHandler {
+public:
+ UdevMapHandler(const krbd_spec *spec, std::string *pdevnode) :
+ m_spec(spec), m_pdevnode(pdevnode) {}
- if (strcmp(udev_device_get_subsystem(dev), "rbd") == 0) {
- if (!bus_dev) {
- auto cur_spec = spec_from_dev(dev);
- if (cur_spec && *cur_spec == spec) {
- bus_dev = dev;
+ /*
+ * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
+ * block device to show up. This is necessary because rbd devices
+ * and block devices aren't linked together in our sysfs layout.
+ *
+ * Called once per received udev device; consumes the reference to
+ * dev.  Only "add" events are considered:
+ * - the first "rbd" subsystem device whose spec_from_dev() equals
+ *   *m_spec is kept as m_bus_dev;
+ * - every "block" subsystem device is kept in m_block_devs;
+ * - anything else is unref'ed right away and false is returned.
+ *
+ * Once both an rbd device and at least one block device are held,
+ * the bus device's "major"/"minor" sysattrs are compared with each
+ * block device's MAJOR/MINOR properties ("minor" is asserted to be
+ * present exactly when have_minor_attr() is true and is compared
+ * only then).  On a match the name from get_kernel_rbd_name() is
+ * stored in *m_pdevnode and true is returned; otherwise false.
+ *
+ * NOTE(review): major, this_major and this_minor are passed to
+ * strcmp() without a NULL check.
+ *
+ * NOTE(review): the destructor unrefs the kept devices but the copy
+ * operations are implicit, so a copy made after devices have been
+ * kept would presumably unref them twice -- confirm no such copy
+ * is ever made.
+ */
+ bool operator()(udev_device *dev) {
+ if (strcmp(udev_device_get_action(dev), "add")) {
+ goto next;
+ }
+ if (!strcmp(udev_device_get_subsystem(dev), "rbd")) {
+ if (!m_bus_dev) {
+ auto spec = spec_from_dev(dev);
+ if (spec && *spec == *m_spec) {
+ m_bus_dev = dev; /* reference is kept until the destructor runs */
goto check;
}
}
- } else if (strcmp(udev_device_get_subsystem(dev), "block") == 0) {
- block_dev_vec.push_back(dev);
+ } else if (!strcmp(udev_device_get_subsystem(dev), "block")) {
+ m_block_devs.push_back(dev); /* reference is kept until the destructor runs */
goto check;
}
next:
udev_device_unref(dev); /* not kept: drop the reference now */
- continue;
+ return false;
check:
- if (bus_dev && !block_dev_vec.empty()) {
- const char *major = udev_device_get_sysattr_value(bus_dev, "major");
- const char *minor = udev_device_get_sysattr_value(bus_dev, "minor");
+ if (m_bus_dev && !m_block_devs.empty()) {
+ const char *major = udev_device_get_sysattr_value(m_bus_dev, "major");
+ const char *minor = udev_device_get_sysattr_value(m_bus_dev, "minor");
ceph_assert(!minor ^ have_minor_attr());
- for (auto p : block_dev_vec) {
+ for (auto p : m_block_devs) {
const char *this_major = udev_device_get_property_value(p, "MAJOR");
const char *this_minor = udev_device_get_property_value(p, "MINOR");
if (strcmp(this_major, major) == 0 &&
(!minor || strcmp(this_minor, minor) == 0)) {
- string name = get_kernel_rbd_name(udev_device_get_sysname(bus_dev));
+ string name = get_kernel_rbd_name(udev_device_get_sysname(m_bus_dev));
ceph_assert(strcmp(udev_device_get_devnode(p), name.c_str()) == 0);
- *pname = name;
- goto done;
+ *m_pdevnode = name;
+ return true;
}
}
}
+ return false;
}
-done:
- if (bus_dev) {
- udev_device_unref(bus_dev);
- }
-
- for (auto p : block_dev_vec) {
- udev_device_unref(p);
+ ~UdevMapHandler() { /* releases every device reference kept by operator() */
+ if (m_bus_dev) {
+ udev_device_unref(m_bus_dev);
+ }
+
+ for (auto p : m_block_devs) {
+ udev_device_unref(p);
+ }
}
- return r;
-}
+private:
+ udev_device *m_bus_dev = nullptr; /* matching "rbd" subsystem device, once seen */
+ std::vector<udev_device *> m_block_devs; /* every "block" add event seen so far */
+ const krbd_spec *m_spec; /* spec to match against; not owned */
+ std::string *m_pdevnode; /* out: device node name, set on a match; not owned */
+};
static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf,
string *pname)
{
struct udev_monitor *mon;
+ std::thread mapper;
+ bool mapped;
+ int fds[2];
int r;
mon = udev_monitor_new_from_netlink(udev, "udev");
if (r < 0)
goto out_mon;
+ r = udev_monitor_set_receive_buffer_size(mon, UDEV_BUF_SIZE);
+ if (r < 0) {
+ std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
+ << std::endl;
+ /* not fatal */
+ }
+
r = udev_monitor_enable_receiving(mon);
if (r < 0)
goto out_mon;
- r = sysfs_write_rbd_add(buf);
- if (r < 0) {
- cerr << "rbd: sysfs write failed" << std::endl;
+ if (pipe2(fds, O_NONBLOCK) < 0) {
+ r = -errno;
goto out_mon;
}
- r = wait_for_udev_add(mon, spec, pname);
+ mapper = make_named_thread("mapper", [&buf, sysfs_r_fd = fds[1]]() {
+ int sysfs_r = sysfs_write_rbd_add(buf);
+ int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
+ if (r < 0) {
+ ceph_abort_msgf("safe_write failed: %d", r);
+ }
+ });
+
+ std::tie(r, mapped) = wait_for_mapping(fds[0], mon,
+ UdevMapHandler(&spec, pname));
if (r < 0) {
- cerr << "rbd: wait failed" << std::endl;
- goto out_mon;
+ if (!mapped) {
+ std::cerr << "rbd: sysfs write failed" << std::endl;
+ } else {
+ std::cerr << "rbd: udev wait failed" << std::endl;
+ /* TODO: fall back to enumeration */
+ }
}
+ mapper.join();
+ close(fds[0]);
+ close(fds[1]);
+
out_mon:
udev_monitor_unref(mon);
return r;
struct udev_device *dev;
int r;
+retry:
enm = udev_enumerate_new(udev);
if (!enm)
return -ENOMEM;
}
r = udev_enumerate_scan_devices(enm);
- if (r < 0)
+ if (r < 0) {
+ if (r == -ENOENT || r == -ENODEV) {
+ std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
+ udev_enumerate_unref(enm);
+ goto retry;
+ }
goto out_enm;
+ }
l = udev_enumerate_get_list_entry(enm);
if (!l) {
struct udev_enumerate *enm;
int r;
+retry:
enm = udev_enumerate_new(udev);
if (!enm)
return -ENOMEM;
goto out_enm;
r = udev_enumerate_scan_devices(enm);
- if (r < 0)
+ if (r < 0) {
+ if (r == -ENOENT || r == -ENODEV) {
+ std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
+ udev_enumerate_unref(enm);
+ goto retry;
+ }
goto out_enm;
+ }
*penm = enm;
return 0;
return buf;
}
-static int wait_for_udev_remove(struct udev_monitor *mon, dev_t devno)
-{
- for (;;) {
- struct pollfd fds[1];
- struct udev_device *dev;
- int r;
-
- fds[0].fd = udev_monitor_get_fd(mon);
- fds[0].events = POLLIN;
- r = poll(fds, 1, POLL_TIMEOUT);
- if (r < 0)
- return -errno;
+class UdevUnmapHandler { /* udev device predicate: detects the "remove" event for m_devno */
+public:
+ UdevUnmapHandler(dev_t devno) : m_devno(devno) {}
- if (r == 0)
- return -ETIMEDOUT;
+ bool operator()(udev_device *dev) { /* true iff dev is a "remove" event whose devnum equals m_devno */
+ bool match = false;
- dev = udev_monitor_receive_device(mon);
- if (!dev)
- continue;
-
- if (strcmp(udev_device_get_action(dev), "remove") == 0 &&
- udev_device_get_devnum(dev) == devno) {
- udev_device_unref(dev);
- break;
+ if (!strcmp(udev_device_get_action(dev), "remove") &&
+ udev_device_get_devnum(dev) == m_devno) {
+ match = true;
}
-
udev_device_unref(dev); /* the reference is always consumed, match or not */
+ return match;
}
- return 0;
-}
+private:
+ dev_t m_devno; /* device number whose removal is awaited */
+};
static int do_unmap(struct udev *udev, dev_t devno, const string& buf)
{
struct udev_monitor *mon;
+ std::thread unmapper;
+ bool unmapped;
+ int fds[2];
int r;
mon = udev_monitor_new_from_netlink(udev, "udev");
if (r < 0)
goto out_mon;
+ r = udev_monitor_set_receive_buffer_size(mon, UDEV_BUF_SIZE);
+ if (r < 0) {
+ std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
+ << std::endl;
+ /* not fatal */
+ }
+
r = udev_monitor_enable_receiving(mon);
if (r < 0)
goto out_mon;
- /*
- * On final device close(), kernel sends a block change event, in
- * response to which udev apparently runs blkid on the device. This
- * makes unmap fail with EBUSY, if issued right after final close().
- * Try to circumvent this with a retry before turning to udev.
- */
- for (int tries = 0; ; tries++) {
- r = sysfs_write_rbd_remove(buf);
- if (r >= 0) {
- break;
- } else if (r == -EBUSY && tries < 2) {
- if (!tries) {
- usleep(250 * 1000);
+ if (pipe2(fds, O_NONBLOCK) < 0) {
+ r = -errno;
+ goto out_mon;
+ }
+
+ unmapper = make_named_thread("unmapper", [&buf, sysfs_r_fd = fds[1]]() {
+ /*
+ * On final device close(), kernel sends a block change event, in
+ * response to which udev apparently runs blkid on the device. This
+ * makes unmap fail with EBUSY, if issued right after final close().
+ * Try to circumvent this with a retry before turning to udev.
+ */
+ for (int tries = 0; ; tries++) {
+ int sysfs_r = sysfs_write_rbd_remove(buf);
+ if (sysfs_r == -EBUSY && tries < 2) {
+ if (!tries) {
+ usleep(250 * 1000);
+ } else {
+ /*
+ * libudev does not provide the "wait until the queue is empty"
+ * API or the sufficient amount of primitives to build it from.
+ */
+ std::string err = run_cmd("udevadm", "settle", "--timeout", "10",
+ (char *)NULL);
+ if (!err.empty())
+ std::cerr << "rbd: " << err << std::endl;
+ }
} else {
- /*
- * libudev does not provide the "wait until the queue is empty"
- * API or the sufficient amount of primitives to build it from.
- */
- string err = run_cmd("udevadm", "settle", "--timeout", "10", (char*)NULL);
- if (!err.empty())
- cerr << "rbd: " << err << std::endl;
+ int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
+ if (r < 0) {
+ ceph_abort_msgf("safe_write failed: %d", r);
+ }
+ break;
}
- } else {
- cerr << "rbd: sysfs write failed" << std::endl;
- goto out_mon;
}
- }
+ });
- r = wait_for_udev_remove(mon, devno);
+ std::tie(r, unmapped) = wait_for_mapping(fds[0], mon,
+ UdevUnmapHandler(devno));
if (r < 0) {
- cerr << "rbd: wait failed" << std::endl;
- goto out_mon;
+ if (!unmapped) {
+ std::cerr << "rbd: sysfs write failed" << std::endl;
+ } else {
+ std::cerr << "rbd: udev wait failed: " << cpp_strerror(r) << std::endl;
+ r = 0;
+ }
}
+ unmapper.join();
+ close(fds[0]);
+ close(fds[1]);
+
out_mon:
udev_monitor_unref(mon);
return r;
wholedevno = sb.st_rdev;
}
- r = devno_to_krbd_id(ctx->udev, wholedevno, &id);
- if (r < 0) {
- if (r == -ENOENT) {
- cerr << "rbd: '" << devnode << "' is not an rbd device" << std::endl;
- r = -EINVAL;
+ for (int tries = 0; ; tries++) {
+ r = devno_to_krbd_id(ctx->udev, wholedevno, &id);
+ if (r == -ENOENT && tries < 2) {
+ usleep(250 * 1000);
+ } else {
+ if (r < 0) {
+ if (r == -ENOENT) {
+ std::cerr << "rbd: '" << devnode << "' is not an rbd device"
+ << std::endl;
+ r = -EINVAL;
+ }
+ return r;
+ }
+ if (tries) {
+ std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
+ << std::endl;
+ }
+ break;
}
- return r;
}
return do_unmap(ctx->udev, wholedevno, build_unmap_buf(id, options));
string id;
int r;
- r = spec_to_devno_and_krbd_id(ctx->udev, spec, &devno, &id);
- if (r < 0) {
- if (r == -ENOENT) {
- cerr << "rbd: " << spec << ": not a mapped image or snapshot"
- << std::endl;
- r = -EINVAL;
+ for (int tries = 0; ; tries++) {
+ r = spec_to_devno_and_krbd_id(ctx->udev, spec, &devno, &id);
+ if (r == -ENOENT && tries < 2) {
+ usleep(250 * 1000);
+ } else {
+ if (r < 0) {
+ if (r == -ENOENT) {
+ std::cerr << "rbd: " << spec << ": not a mapped image or snapshot"
+ << std::endl;
+ r = -EINVAL;
+ }
+ return r;
+ }
+ if (tries) {
+ std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
+ << std::endl;
+ }
+ break;
}
- return r;
}
return do_unmap(ctx->udev, devno, build_unmap_buf(id, options));
bool have_output = false;
int r;
+retry:
enm = udev_enumerate_new(udev);
if (!enm)
return -ENOMEM;
goto out_enm;
r = udev_enumerate_scan_devices(enm);
- if (r < 0)
+ if (r < 0) {
+ if (r == -ENOENT || r == -ENODEV) {
+ std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
+ udev_enumerate_unref(enm);
+ goto retry;
+ }
goto out_enm;
+ }
udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm)) {
struct udev_device *dev;