]> git.proxmox.com Git - ceph.git/blame - ceph/src/krbd.cc
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / krbd.cc
CommitLineData
7c673cae
FG
1/*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2014 Inktank Storage, Inc.
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13#include <errno.h>
14#include <fcntl.h>
15#include <iostream>
9f95a23c 16#include <memory>
11fdf7f2 17#include <optional>
7c673cae
FG
18#include <poll.h>
19#include <sstream>
20#include <stdio.h>
21#include <stdlib.h>
22#include <string.h>
23#include <string>
24#include <sys/stat.h>
11fdf7f2 25#include <sys/sysmacros.h>
7c673cae 26#include <sys/types.h>
eafe8130 27#include <tuple>
7c673cae 28#include <unistd.h>
eafe8130 29#include <utility>
7c673cae
FG
30
31#include "auth/KeyRing.h"
32#include "common/errno.h"
33#include "common/Formatter.h"
34#include "common/module.h"
35#include "common/run_cmd.h"
36#include "common/safe_io.h"
37#include "common/secret.h"
38#include "common/TextTable.h"
eafe8130 39#include "common/Thread.h"
11fdf7f2 40#include "include/ceph_assert.h"
7c673cae
FG
41#include "include/stringify.h"
42#include "include/krbd.h"
43#include "mon/MonMap.h"
44
45#include <blkid/blkid.h>
9f95a23c 46#include <boost/algorithm/string/predicate.hpp>
adb31ebb 47#include <boost/tokenizer.hpp>
7c673cae
FG
48#include <libudev.h>
49
eafe8130 50static const int UDEV_BUF_SIZE = 1 << 20; /* doubled to 2M (SO_RCVBUFFORCE) */
9f95a23c
TL
51static const char DEVNODE_PREFIX[] = "/dev/rbd";
52static const char SNAP_HEAD_NAME[] = "-";
53
54#define DEFINE_UDEV_UPTR(what) \
55struct udev_##what##_deleter { \
56 void operator()(udev_##what *p) { \
57 udev_##what##_unref(p); \
58 } \
59}; \
60using udev_##what##_uptr = \
61 std::unique_ptr<udev_##what, udev_##what##_deleter>;
62
63DEFINE_UDEV_UPTR(monitor) /* udev_monitor_uptr */
64DEFINE_UDEV_UPTR(enumerate) /* udev_enumerate_uptr */
65DEFINE_UDEV_UPTR(device) /* udev_device_uptr */
7c673cae
FG
66
67struct krbd_ctx {
68 CephContext *cct;
69 struct udev *udev;
f91f0fd5 70 uint32_t flags; /* KRBD_CTX_F_* */
7c673cae
FG
71};
72
11fdf7f2
TL
73struct krbd_spec {
74 std::string pool_name;
75 std::string nspace_name;
76 std::string image_name;
77 std::string snap_name;
78
79 krbd_spec(const char *pool_name, const char *nspace_name,
80 const char *image_name, const char *snap_name)
81 : pool_name(pool_name),
82 nspace_name(nspace_name),
83 image_name(image_name),
84 snap_name(*snap_name ? snap_name : SNAP_HEAD_NAME) { }
85
86 bool operator==(const krbd_spec& rhs) const {
87 return pool_name == rhs.pool_name &&
88 nspace_name == rhs.nspace_name &&
89 image_name == rhs.image_name &&
90 snap_name == rhs.snap_name;
91 }
92};
93
9f95a23c
TL
94static std::ostream& operator<<(std::ostream& os, const krbd_spec& spec)
95{
11fdf7f2
TL
96 os << spec.pool_name << "/";
97 if (!spec.nspace_name.empty())
98 os << spec.nspace_name << "/";
99 os << spec.image_name;
100 if (spec.snap_name != SNAP_HEAD_NAME)
101 os << "@" << spec.snap_name;
102 return os;
103}
104
9f95a23c
TL
105static std::optional<krbd_spec> spec_from_dev(udev_device *dev)
106{
11fdf7f2
TL
107 const char *pool_name = udev_device_get_sysattr_value(dev, "pool");
108 const char *nspace_name = udev_device_get_sysattr_value(dev, "pool_ns");
109 const char *image_name = udev_device_get_sysattr_value(dev, "name");
110 const char *snap_name = udev_device_get_sysattr_value(dev, "current_snap");
111
112 if (!pool_name || !image_name || !snap_name)
113 return std::nullopt;
114
115 return std::make_optional<krbd_spec>(
116 pool_name, nspace_name ?: "", image_name, snap_name);
117}
118
9f95a23c 119static udev_device_uptr dev_from_list_entry(udev *udev, udev_list_entry *l)
7c673cae 120{
9f95a23c
TL
121 return udev_device_uptr(
122 udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)));
123}
124
125static std::string get_devnode(udev_device *dev)
126{
127 std::string devnode = DEVNODE_PREFIX;
128 devnode += udev_device_get_sysname(dev);
129 return devnode;
7c673cae
FG
130}
131
132static int sysfs_write_rbd(const char *which, const string& buf)
133{
134 const string s = string("/sys/bus/rbd/") + which;
135 const string t = s + "_single_major";
136 int fd;
137 int r;
138
139 /*
140 * 'add' and 'add_single_major' interfaces are identical, but if rbd
141 * kernel module is new enough and is configured to use single-major
142 * scheme, 'add' is disabled in order to prevent old userspace from
143 * doing weird things at unmap time.
144 *
145 * Same goes for 'remove' vs 'remove_single_major'.
146 */
147 fd = open(t.c_str(), O_WRONLY);
148 if (fd < 0) {
149 if (errno == ENOENT) {
150 fd = open(s.c_str(), O_WRONLY);
151 if (fd < 0)
152 return -errno;
153 } else {
154 return -errno;
155 }
156 }
157
158 r = safe_write(fd, buf.c_str(), buf.size());
159
160 close(fd);
161 return r;
162}
163
164static int sysfs_write_rbd_add(const string& buf)
165{
166 return sysfs_write_rbd("add", buf);
167}
168
169static int sysfs_write_rbd_remove(const string& buf)
170{
171 return sysfs_write_rbd("remove", buf);
172}
173
/*
 * Return non-zero if rbd devices expose the 'minor' sysfs attribute.
 *
 * 'minor' was added as part of the single_major merge, which also
 * exposed the 'single_major' module parameter, so the presence of
 * that parameter is used as the test.  'minor' is always present,
 * regardless of whether single-major scheme is turned on or not.
 *
 * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
 * this has to work with rbd.ko backported to various kernels.)
 */
static int have_minor_attr(void)
{
  static const char single_major_param[] =
      "/sys/module/rbd/parameters/single_major";

  return access(single_major_param, F_OK) == 0;
}
186
11fdf7f2 187static int build_map_buf(CephContext *cct, const krbd_spec& spec,
adb31ebb 188 const string& options, string *pbuf)
7c673cae 189{
adb31ebb 190 bool msgr2 = false;
7c673cae
FG
191 ostringstream oss;
192 int r;
193
adb31ebb
TL
194 boost::char_separator<char> sep(",");
195 boost::tokenizer<boost::char_separator<char>> tok(options, sep);
196 for (const auto& t : tok) {
197 if (boost::starts_with(t, "ms_mode=")) {
198 /* msgr2 unless ms_mode=legacy */
199 msgr2 = t.compare(8, t.npos, "legacy");
200 }
201 }
202
7c673cae 203 MonMap monmap;
11fdf7f2 204 r = monmap.build_initial(cct, false, cerr);
7c673cae
FG
205 if (r < 0)
206 return r;
207
adb31ebb
TL
208 /*
209 * If msgr2, filter TYPE_MSGR2 addresses. Otherwise, filter
210 * TYPE_LEGACY addresses.
211 */
212 for (const auto& p : monmap.mon_info) {
213 for (const auto& a : p.second.public_addrs.v) {
214 if ((msgr2 && a.is_msgr2()) || (!msgr2 && a.is_legacy())) {
215 if (oss.tellp() > 0) {
216 oss << ",";
217 }
218 oss << a.get_sockaddr();
219 }
7c673cae 220 }
7c673cae
FG
221 }
222
223 oss << " name=" << cct->_conf->name.get_id();
224
225 KeyRing keyring;
11fdf7f2
TL
226 auto auth_client_required =
227 cct->_conf.get_val<std::string>("auth_client_required");
228 if (auth_client_required != "none") {
224ce89b 229 r = keyring.from_ceph_context(cct);
11fdf7f2
TL
230 auto keyfile = cct->_conf.get_val<std::string>("keyfile");
231 auto key = cct->_conf.get_val<std::string>("key");
232 if (r == -ENOENT && keyfile.empty() && key.empty())
224ce89b
WB
233 r = 0;
234 if (r < 0) {
235 cerr << "rbd: failed to get secret" << std::endl;
236 return r;
237 }
7c673cae
FG
238 }
239
240 CryptoKey secret;
241 string key_name = string("client.") + cct->_conf->name.get_id();
242 if (keyring.get_secret(cct->_conf->name, secret)) {
243 string secret_str;
244 secret.encode_base64(secret_str);
245
246 r = set_kernel_secret(secret_str.c_str(), key_name.c_str());
247 if (r >= 0) {
248 if (r == 0)
249 cerr << "rbd: warning: secret has length 0" << std::endl;
250 oss << ",key=" << key_name;
251 } else if (r == -ENODEV || r == -ENOSYS) {
252 // running against older kernel; fall back to secret= in options
253 oss << ",secret=" << secret_str;
254 } else {
255 cerr << "rbd: failed to add secret '" << key_name << "' to kernel"
256 << std::endl;
257 return r;
258 }
259 } else if (is_kernel_secret(key_name.c_str())) {
260 oss << ",key=" << key_name;
261 }
262
adb31ebb 263 if (!options.empty())
7c673cae 264 oss << "," << options;
11fdf7f2
TL
265 if (!spec.nspace_name.empty())
266 oss << ",_pool_ns=" << spec.nspace_name;
7c673cae 267
11fdf7f2
TL
268 oss << " " << spec.pool_name << " " << spec.image_name << " "
269 << spec.snap_name;
7c673cae
FG
270
271 *pbuf = oss.str();
272 return 0;
273}
274
eafe8130
TL
275/*
276 * Return:
277 * <kernel error, false> - didn't map
278 * <0 or udev error, true> - mapped
279 */
280template <typename F>
281static std::pair<int, bool> wait_for_mapping(int sysfs_r_fd, udev_monitor *mon,
282 F udev_device_handler)
7c673cae 283{
eafe8130
TL
284 struct pollfd fds[2];
285 int sysfs_r = INT_MAX, udev_r = INT_MAX;
81eedcae 286 int r;
7c673cae 287
eafe8130
TL
288 fds[0].fd = sysfs_r_fd;
289 fds[0].events = POLLIN;
290 fds[1].fd = udev_monitor_get_fd(mon);
291 fds[1].events = POLLIN;
292
7c673cae 293 for (;;) {
eafe8130
TL
294 if (poll(fds, 2, -1) < 0) {
295 ceph_abort_msgf("poll failed: %d", -errno);
296 }
7c673cae 297
eafe8130
TL
298 if (fds[0].revents) {
299 r = safe_read_exact(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
300 if (r < 0) {
301 ceph_abort_msgf("safe_read_exact failed: %d", r);
302 }
303 if (sysfs_r < 0) {
304 return std::make_pair(sysfs_r, false);
305 }
306 if (udev_r != INT_MAX) {
307 ceph_assert(!sysfs_r);
308 return std::make_pair(udev_r, true);
309 }
310 fds[0].fd = -1;
81eedcae 311 }
11fdf7f2 312
eafe8130
TL
313 if (fds[1].revents) {
314 for (;;) {
9f95a23c 315 udev_device_uptr dev(udev_monitor_receive_device(mon));
eafe8130
TL
316 if (!dev) {
317 if (errno != EINTR && errno != EAGAIN) {
318 udev_r = -errno;
319 if (sysfs_r != INT_MAX) {
320 ceph_assert(!sysfs_r);
321 return std::make_pair(udev_r, true);
322 }
323 fds[1].fd = -1;
324 }
325 break;
326 }
9f95a23c 327 if (udev_device_handler(std::move(dev))) {
eafe8130
TL
328 udev_r = 0;
329 if (sysfs_r != INT_MAX) {
330 ceph_assert(!sysfs_r);
331 return std::make_pair(udev_r, true);
332 }
333 fds[1].fd = -1;
334 break;
335 }
336 }
337 }
338 }
339}
340
341class UdevMapHandler {
342public:
f67539c2
TL
343 UdevMapHandler(const krbd_spec *spec, std::string *pdevnode,
344 std::string *majnum, std::string *minnum) :
345 m_spec(spec), m_pdevnode(pdevnode), m_majnum(majnum), m_minnum(minnum) {}
7c673cae 346
eafe8130
TL
347 /*
348 * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
349 * block device to show up. This is necessary because rbd devices
350 * and block devices aren't linked together in our sysfs layout.
9f95a23c
TL
351 *
352 * Note that our "block" event can come before the "rbd" event, so
353 * all potential "block" events are gathered in m_block_devs before
354 * m_bus_dev is caught.
eafe8130 355 */
9f95a23c
TL
356 bool operator()(udev_device_uptr dev) {
357 if (strcmp(udev_device_get_action(dev.get()), "add")) {
358 return false;
eafe8130 359 }
9f95a23c 360 if (!strcmp(udev_device_get_subsystem(dev.get()), "rbd")) {
eafe8130 361 if (!m_bus_dev) {
9f95a23c 362 auto spec = spec_from_dev(dev.get());
eafe8130 363 if (spec && *spec == *m_spec) {
9f95a23c
TL
364 m_bus_dev = std::move(dev);
365 m_devnode = get_devnode(m_bus_dev.get());
7c673cae
FG
366 }
367 }
9f95a23c
TL
368 } else if (!strcmp(udev_device_get_subsystem(dev.get()), "block")) {
369 if (boost::starts_with(udev_device_get_devnode(dev.get()),
370 DEVNODE_PREFIX)) {
371 m_block_devs.push_back(std::move(dev));
372 }
81eedcae 373 }
7c673cae 374
eafe8130 375 if (m_bus_dev && !m_block_devs.empty()) {
9f95a23c
TL
376 for (const auto& p : m_block_devs) {
377 if (udev_device_get_devnode(p.get()) == m_devnode) {
9f95a23c 378 *m_pdevnode = std::move(m_devnode);
f67539c2
TL
379 *m_majnum = udev_device_get_property_value(p.get(), "MAJOR");
380 *m_minnum = udev_device_get_property_value(p.get(), "MINOR");
381 ceph_assert(*m_majnum == udev_device_get_sysattr_value(
382 m_bus_dev.get(), "major"));
383 ceph_assert(!have_minor_attr() ||
384 *m_minnum == udev_device_get_sysattr_value(
385 m_bus_dev.get(), "minor"));
eafe8130 386 return true;
7c673cae
FG
387 }
388 }
9f95a23c 389 m_block_devs.clear();
7c673cae 390 }
eafe8130 391 return false;
81eedcae 392 }
7c673cae 393
eafe8130 394private:
9f95a23c
TL
395 udev_device_uptr m_bus_dev;
396 std::vector<udev_device_uptr> m_block_devs;
397 std::string m_devnode;
eafe8130
TL
398 const krbd_spec *m_spec;
399 std::string *m_pdevnode;
f67539c2
TL
400 std::string *m_majnum;
401 std::string *m_minnum;
eafe8130 402};
7c673cae 403
f91f0fd5
TL
404static const char *get_event_source(const krbd_ctx *ctx)
405{
406 if (ctx->flags & KRBD_CTX_F_NOUDEV) {
407 /*
408 * For block devices (unlike network interfaces, they don't
409 * carry any namespace tags), the kernel broadcasts uevents
410 * into all network namespaces that are owned by the initial
411 * user namespace. This restriction is new in 4.18: starting
412 * with 2.6.35 and through 4.17 the kernel broadcast uevents
413 * into all network namespaces, period.
414 *
415 * However, when invoked from a non-initial user namespace,
416 * udev_monitor_receive_device() has always ignored both kernel
417 * and udev uevents by virtue of requiring SCM_CREDENTIALS and
418 * checking that ucred->uid == 0. When UIDs and GIDs are sent to
419 * a process in a user namespace, they are translated according
420 * to that process's UID and GID mappings and, unless root in the
421 * user namespace is mapped to the global root, that check fails.
422 * Normally they show up as 65534(nobody) because the global root
423 * is not mapped.
424 */
425 return "kernel";
426 }
427
428 /*
429 * Like most netlink messages, udev uevents don't cross network
430 * namespace boundaries and are therefore confined to the initial
431 * network namespace.
432 */
433 return "udev";
434}
435
436static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf,
11fdf7f2 437 string *pname)
7c673cae 438{
f67539c2
TL
439 std::string majnum, minnum;
440 struct stat sb;
eafe8130
TL
441 bool mapped;
442 int fds[2];
7c673cae
FG
443 int r;
444
f91f0fd5
TL
445 udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
446 get_event_source(ctx)));
7c673cae
FG
447 if (!mon)
448 return -ENOMEM;
449
9f95a23c
TL
450 r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "rbd",
451 nullptr);
7c673cae 452 if (r < 0)
9f95a23c 453 return r;
7c673cae 454
9f95a23c
TL
455 r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "block",
456 "disk");
7c673cae 457 if (r < 0)
9f95a23c 458 return r;
7c673cae 459
9f95a23c 460 r = udev_monitor_set_receive_buffer_size(mon.get(), UDEV_BUF_SIZE);
eafe8130
TL
461 if (r < 0) {
462 std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
463 << std::endl;
464 /* not fatal */
465 }
466
9f95a23c 467 r = udev_monitor_enable_receiving(mon.get());
7c673cae 468 if (r < 0)
9f95a23c 469 return r;
7c673cae 470
9f95a23c
TL
471 if (pipe2(fds, O_NONBLOCK) < 0)
472 return -errno;
7c673cae 473
9f95a23c 474 auto mapper = make_named_thread("mapper", [&buf, sysfs_r_fd = fds[1]]() {
eafe8130
TL
475 int sysfs_r = sysfs_write_rbd_add(buf);
476 int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
477 if (r < 0) {
478 ceph_abort_msgf("safe_write failed: %d", r);
479 }
480 });
481
9f95a23c 482 std::tie(r, mapped) = wait_for_mapping(fds[0], mon.get(),
f67539c2
TL
483 UdevMapHandler(&spec, pname, &majnum,
484 &minnum));
7c673cae 485 if (r < 0) {
eafe8130
TL
486 if (!mapped) {
487 std::cerr << "rbd: sysfs write failed" << std::endl;
488 } else {
489 std::cerr << "rbd: udev wait failed" << std::endl;
490 /* TODO: fall back to enumeration */
491 }
7c673cae
FG
492 }
493
eafe8130
TL
494 mapper.join();
495 close(fds[0]);
496 close(fds[1]);
f67539c2
TL
497
498 if (r < 0)
499 return r;
500
501 /*
502 * Make sure our device node is there. This is intended to help
503 * diagnose environments where "rbd map" is run from a container with
504 * a private /dev and some external mechanism (e.g. udev) is used to
505 * add the device to the container asynchronously, possibly seconds
506 * after "rbd map" successfully exits. These setups are very fragile
507 * and in some cases can even lead to data loss, depending on higher
508 * level logic and orchestration layers involved.
509 */
510 ceph_assert(mapped);
511 if (stat(pname->c_str(), &sb) < 0 || !S_ISBLK(sb.st_mode)) {
512 std::cerr << "rbd: mapping succeeded but " << *pname
513 << " is not accessible, is host /dev mounted?" << std::endl;
514 return -EINVAL;
515 }
516 if (stringify(major(sb.st_rdev)) != majnum ||
517 stringify(minor(sb.st_rdev)) != minnum) {
518 std::cerr << "rbd: mapping succeeded but " << *pname
519 << " (" << major(sb.st_rdev) << ":" << minor(sb.st_rdev)
520 << ") does not match expected " << majnum << ":" << minnum
521 << std::endl;
522 return -EINVAL;
523 }
524
525 return 0;
7c673cae
FG
526}
527
11fdf7f2
TL
528static int map_image(struct krbd_ctx *ctx, const krbd_spec& spec,
529 const char *options, string *pname)
7c673cae
FG
530{
531 string buf;
532 int r;
533
7c673cae
FG
534 /*
535 * Modprobe rbd kernel module. If it supports single-major device
536 * number allocation scheme, make sure it's turned on.
9f95a23c
TL
537 *
538 * Do this before calling build_map_buf() - it wants "ceph" key type
539 * registered.
7c673cae
FG
540 */
541 if (access("/sys/bus/rbd", F_OK) != 0) {
542 const char *module_options = NULL;
543 if (module_has_param("rbd", "single_major"))
544 module_options = "single_major=Y";
545
546 r = module_load("rbd", module_options);
547 if (r) {
548 cerr << "rbd: failed to load rbd kernel module (" << r << ")"
549 << std::endl;
550 /*
551 * Ignore the error: modprobe failing doesn't necessarily prevent
552 * from working.
553 */
554 }
555 }
556
9f95a23c
TL
557 r = build_map_buf(ctx->cct, spec, options, &buf);
558 if (r < 0)
559 return r;
560
f91f0fd5 561 return do_map(ctx, spec, buf, pname);
7c673cae
FG
562}
563
564static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
565{
9f95a23c 566 udev_enumerate_uptr enm;
7c673cae 567 struct udev_list_entry *l;
7c673cae
FG
568 int r;
569
eafe8130 570retry:
9f95a23c 571 enm.reset(udev_enumerate_new(udev));
7c673cae
FG
572 if (!enm)
573 return -ENOMEM;
574
9f95a23c 575 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
7c673cae 576 if (r < 0)
9f95a23c 577 return r;
7c673cae 578
9f95a23c 579 r = udev_enumerate_add_match_sysattr(enm.get(), "major",
7c673cae
FG
580 stringify(major(devno)).c_str());
581 if (r < 0)
9f95a23c 582 return r;
7c673cae
FG
583
584 if (have_minor_attr()) {
9f95a23c 585 r = udev_enumerate_add_match_sysattr(enm.get(), "minor",
7c673cae
FG
586 stringify(minor(devno)).c_str());
587 if (r < 0)
9f95a23c 588 return r;
7c673cae
FG
589 }
590
9f95a23c 591 r = udev_enumerate_scan_devices(enm.get());
eafe8130
TL
592 if (r < 0) {
593 if (r == -ENOENT || r == -ENODEV) {
594 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
eafe8130
TL
595 goto retry;
596 }
9f95a23c 597 return r;
eafe8130 598 }
7c673cae 599
9f95a23c
TL
600 l = udev_enumerate_get_list_entry(enm.get());
601 if (!l)
602 return -ENOENT;
7c673cae
FG
603
604 /* make sure there is only one match */
11fdf7f2 605 ceph_assert(!udev_list_entry_get_next(l));
7c673cae 606
9f95a23c
TL
607 auto dev = dev_from_list_entry(udev, l);
608 if (!dev)
609 return -ENOMEM;
7c673cae 610
9f95a23c
TL
611 *pid = udev_device_get_sysname(dev.get());
612 return 0;
7c673cae
FG
613}
614
11fdf7f2 615static int __enumerate_devices(struct udev *udev, const krbd_spec& spec,
9f95a23c 616 bool match_nspace, udev_enumerate_uptr *penm)
7c673cae 617{
9f95a23c 618 udev_enumerate_uptr enm;
7c673cae
FG
619 int r;
620
eafe8130 621retry:
9f95a23c 622 enm.reset(udev_enumerate_new(udev));
7c673cae
FG
623 if (!enm)
624 return -ENOMEM;
625
9f95a23c 626 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
7c673cae 627 if (r < 0)
9f95a23c 628 return r;
7c673cae 629
9f95a23c
TL
630 r = udev_enumerate_add_match_sysattr(enm.get(), "pool",
631 spec.pool_name.c_str());
7c673cae 632 if (r < 0)
9f95a23c 633 return r;
7c673cae 634
11fdf7f2 635 if (match_nspace) {
9f95a23c 636 r = udev_enumerate_add_match_sysattr(enm.get(), "pool_ns",
11fdf7f2
TL
637 spec.nspace_name.c_str());
638 } else {
639 /*
640 * Match _only_ devices that don't have pool_ns attribute.
641 * If the kernel supports namespaces, the result will be empty.
642 */
9f95a23c 643 r = udev_enumerate_add_nomatch_sysattr(enm.get(), "pool_ns", nullptr);
11fdf7f2
TL
644 }
645 if (r < 0)
9f95a23c 646 return r;
11fdf7f2 647
9f95a23c
TL
648 r = udev_enumerate_add_match_sysattr(enm.get(), "name",
649 spec.image_name.c_str());
7c673cae 650 if (r < 0)
9f95a23c 651 return r;
7c673cae 652
9f95a23c 653 r = udev_enumerate_add_match_sysattr(enm.get(), "current_snap",
11fdf7f2 654 spec.snap_name.c_str());
7c673cae 655 if (r < 0)
9f95a23c 656 return r;
7c673cae 657
9f95a23c 658 r = udev_enumerate_scan_devices(enm.get());
eafe8130
TL
659 if (r < 0) {
660 if (r == -ENOENT || r == -ENODEV) {
661 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
eafe8130
TL
662 goto retry;
663 }
9f95a23c 664 return r;
eafe8130 665 }
7c673cae 666
9f95a23c 667 *penm = std::move(enm);
11fdf7f2 668 return 0;
11fdf7f2
TL
669}
670
671static int enumerate_devices(struct udev *udev, const krbd_spec& spec,
9f95a23c 672 udev_enumerate_uptr *penm)
11fdf7f2 673{
9f95a23c 674 udev_enumerate_uptr enm;
11fdf7f2
TL
675 int r;
676
677 r = __enumerate_devices(udev, spec, true, &enm);
678 if (r < 0)
679 return r;
680
681 /*
682 * If no namespace is set, try again with match_nspace=false to
683 * handle older kernels. On a newer kernel the result will remain
684 * the same (i.e. empty).
685 */
9f95a23c 686 if (!udev_enumerate_get_list_entry(enm.get()) && spec.nspace_name.empty()) {
11fdf7f2
TL
687 r = __enumerate_devices(udev, spec, false, &enm);
688 if (r < 0)
689 return r;
690 }
691
9f95a23c 692 *penm = std::move(enm);
11fdf7f2
TL
693 return 0;
694}
695
696static int spec_to_devno_and_krbd_id(struct udev *udev, const krbd_spec& spec,
697 dev_t *pdevno, string *pid)
698{
9f95a23c 699 udev_enumerate_uptr enm;
11fdf7f2 700 struct udev_list_entry *l;
11fdf7f2
TL
701 unsigned int maj, min = 0;
702 string err;
703 int r;
704
705 r = enumerate_devices(udev, spec, &enm);
706 if (r < 0)
707 return r;
708
9f95a23c
TL
709 l = udev_enumerate_get_list_entry(enm.get());
710 if (!l)
711 return -ENOENT;
7c673cae 712
9f95a23c
TL
713 auto dev = dev_from_list_entry(udev, l);
714 if (!dev)
715 return -ENOMEM;
7c673cae 716
9f95a23c
TL
717 maj = strict_strtoll(udev_device_get_sysattr_value(dev.get(), "major"), 10,
718 &err);
7c673cae
FG
719 if (!err.empty()) {
720 cerr << "rbd: couldn't parse major: " << err << std::endl;
9f95a23c 721 return -EINVAL;
7c673cae
FG
722 }
723 if (have_minor_attr()) {
9f95a23c
TL
724 min = strict_strtoll(udev_device_get_sysattr_value(dev.get(), "minor"), 10,
725 &err);
7c673cae
FG
726 if (!err.empty()) {
727 cerr << "rbd: couldn't parse minor: " << err << std::endl;
9f95a23c 728 return -EINVAL;
7c673cae
FG
729 }
730 }
731
732 /*
733 * If an image is mapped more than once don't bother trying to unmap
734 * all devices - let users run unmap the same number of times they
735 * ran map.
736 */
737 if (udev_list_entry_get_next(l))
11fdf7f2 738 cerr << "rbd: " << spec << ": mapped more than once, unmapping "
9f95a23c 739 << get_devnode(dev.get()) << " only" << std::endl;
7c673cae
FG
740
741 *pdevno = makedev(maj, min);
9f95a23c
TL
742 *pid = udev_device_get_sysname(dev.get());
743 return 0;
7c673cae
FG
744}
745
/*
 * Append a space followed by options to *buf.  Nothing is appended
 * if options is an empty string.
 */
static void append_unmap_options(std::string *buf, const char *options)
{
  if (options[0] == '\0') {
    return;
  }

  buf->append(" ");
  buf->append(options);
}
753
eafe8130
TL
754class UdevUnmapHandler {
755public:
756 UdevUnmapHandler(dev_t devno) : m_devno(devno) {}
7c673cae 757
9f95a23c
TL
758 bool operator()(udev_device_uptr dev) {
759 if (strcmp(udev_device_get_action(dev.get()), "remove")) {
760 return false;
7c673cae 761 }
9f95a23c 762 return udev_device_get_devnum(dev.get()) == m_devno;
7c673cae
FG
763 }
764
eafe8130
TL
765private:
766 dev_t m_devno;
767};
7c673cae 768
f91f0fd5 769static int do_unmap(krbd_ctx *ctx, dev_t devno, const string& buf)
7c673cae 770{
eafe8130
TL
771 bool unmapped;
772 int fds[2];
7c673cae
FG
773 int r;
774
f91f0fd5
TL
775 udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
776 get_event_source(ctx)));
7c673cae
FG
777 if (!mon)
778 return -ENOMEM;
779
9f95a23c
TL
780 r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "block",
781 "disk");
7c673cae 782 if (r < 0)
9f95a23c 783 return r;
7c673cae 784
9f95a23c 785 r = udev_monitor_set_receive_buffer_size(mon.get(), UDEV_BUF_SIZE);
eafe8130
TL
786 if (r < 0) {
787 std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
788 << std::endl;
789 /* not fatal */
790 }
791
9f95a23c 792 r = udev_monitor_enable_receiving(mon.get());
7c673cae 793 if (r < 0)
9f95a23c 794 return r;
7c673cae 795
9f95a23c
TL
796 if (pipe2(fds, O_NONBLOCK) < 0)
797 return -errno;
eafe8130 798
f91f0fd5
TL
799 auto unmapper = make_named_thread(
800 "unmapper", [&buf, sysfs_r_fd = fds[1], flags = ctx->flags]() {
eafe8130
TL
801 /*
802 * On final device close(), kernel sends a block change event, in
803 * response to which udev apparently runs blkid on the device. This
804 * makes unmap fail with EBUSY, if issued right after final close().
805 * Try to circumvent this with a retry before turning to udev.
806 */
807 for (int tries = 0; ; tries++) {
808 int sysfs_r = sysfs_write_rbd_remove(buf);
809 if (sysfs_r == -EBUSY && tries < 2) {
810 if (!tries) {
811 usleep(250 * 1000);
f91f0fd5 812 } else if (!(flags & KRBD_CTX_F_NOUDEV)) {
eafe8130
TL
813 /*
814 * libudev does not provide the "wait until the queue is empty"
815 * API or the sufficient amount of primitives to build it from.
816 */
817 std::string err = run_cmd("udevadm", "settle", "--timeout", "10",
818 (char *)NULL);
819 if (!err.empty())
820 std::cerr << "rbd: " << err << std::endl;
821 }
7c673cae 822 } else {
eafe8130
TL
823 int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
824 if (r < 0) {
825 ceph_abort_msgf("safe_write failed: %d", r);
826 }
827 break;
7c673cae 828 }
7c673cae 829 }
eafe8130 830 });
7c673cae 831
9f95a23c 832 std::tie(r, unmapped) = wait_for_mapping(fds[0], mon.get(),
eafe8130 833 UdevUnmapHandler(devno));
7c673cae 834 if (r < 0) {
eafe8130
TL
835 if (!unmapped) {
836 std::cerr << "rbd: sysfs write failed" << std::endl;
837 } else {
838 std::cerr << "rbd: udev wait failed: " << cpp_strerror(r) << std::endl;
839 r = 0;
840 }
7c673cae
FG
841 }
842
eafe8130
TL
843 unmapper.join();
844 close(fds[0]);
845 close(fds[1]);
7c673cae
FG
846 return r;
847}
848
849static int unmap_image(struct krbd_ctx *ctx, const char *devnode,
850 const char *options)
851{
852 struct stat sb;
853 dev_t wholedevno = 0;
9f95a23c 854 std::string buf;
7c673cae
FG
855 int r;
856
857 if (stat(devnode, &sb) < 0 || !S_ISBLK(sb.st_mode)) {
858 cerr << "rbd: '" << devnode << "' is not a block device" << std::endl;
859 return -EINVAL;
860 }
861
862 r = blkid_devno_to_wholedisk(sb.st_rdev, NULL, 0, &wholedevno);
863 if (r < 0) {
864 cerr << "rbd: couldn't compute wholedevno: " << cpp_strerror(r)
865 << std::endl;
866 /*
867 * Ignore the error: we are given whole disks most of the time, and
868 * if it turns out this is a partition we will fail later anyway.
869 */
870 wholedevno = sb.st_rdev;
871 }
872
eafe8130 873 for (int tries = 0; ; tries++) {
9f95a23c 874 r = devno_to_krbd_id(ctx->udev, wholedevno, &buf);
eafe8130
TL
875 if (r == -ENOENT && tries < 2) {
876 usleep(250 * 1000);
877 } else {
878 if (r < 0) {
879 if (r == -ENOENT) {
880 std::cerr << "rbd: '" << devnode << "' is not an rbd device"
881 << std::endl;
882 r = -EINVAL;
883 }
884 return r;
885 }
886 if (tries) {
887 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
888 << std::endl;
889 }
890 break;
7c673cae 891 }
7c673cae
FG
892 }
893
9f95a23c 894 append_unmap_options(&buf, options);
f91f0fd5 895 return do_unmap(ctx, wholedevno, buf);
7c673cae
FG
896}
897
11fdf7f2 898static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec,
7c673cae
FG
899 const char *options)
900{
901 dev_t devno = 0;
9f95a23c 902 std::string buf;
7c673cae
FG
903 int r;
904
eafe8130 905 for (int tries = 0; ; tries++) {
9f95a23c 906 r = spec_to_devno_and_krbd_id(ctx->udev, spec, &devno, &buf);
eafe8130
TL
907 if (r == -ENOENT && tries < 2) {
908 usleep(250 * 1000);
909 } else {
910 if (r < 0) {
911 if (r == -ENOENT) {
912 std::cerr << "rbd: " << spec << ": not a mapped image or snapshot"
913 << std::endl;
914 r = -EINVAL;
915 }
916 return r;
917 }
918 if (tries) {
919 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
920 << std::endl;
921 }
922 break;
7c673cae 923 }
7c673cae
FG
924 }
925
9f95a23c 926 append_unmap_options(&buf, options);
f91f0fd5 927 return do_unmap(ctx, devno, buf);
7c673cae
FG
928}
929
930static bool dump_one_image(Formatter *f, TextTable *tbl,
931 struct udev_device *dev)
932{
11fdf7f2 933 auto spec = spec_from_dev(dev);
9f95a23c
TL
934 std::string devnode = get_devnode(dev);
935 const char *id = devnode.c_str() + sizeof(DEVNODE_PREFIX) - 1;
7c673cae 936
11fdf7f2 937 if (!spec)
7c673cae
FG
938 return false;
939
940 if (f) {
11fdf7f2
TL
941 f->open_object_section("device");
942 f->dump_string("id", id);
943 f->dump_string("pool", spec->pool_name);
944 f->dump_string("namespace", spec->nspace_name);
945 f->dump_string("name", spec->image_name);
946 f->dump_string("snap", spec->snap_name);
9f95a23c 947 f->dump_string("device", devnode);
7c673cae
FG
948 f->close_section();
949 } else {
11fdf7f2 950 *tbl << id << spec->pool_name << spec->nspace_name << spec->image_name
9f95a23c 951 << spec->snap_name << devnode << TextTable::endrow;
7c673cae
FG
952 }
953
954 return true;
955}
956
957static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl)
958{
9f95a23c 959 udev_enumerate_uptr enm;
11fdf7f2 960 struct udev_list_entry *l = NULL;
7c673cae
FG
961 bool have_output = false;
962 int r;
963
eafe8130 964retry:
9f95a23c 965 enm.reset(udev_enumerate_new(udev));
7c673cae
FG
966 if (!enm)
967 return -ENOMEM;
968
9f95a23c 969 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
7c673cae 970 if (r < 0)
9f95a23c 971 return r;
7c673cae 972
9f95a23c 973 r = udev_enumerate_scan_devices(enm.get());
eafe8130
TL
974 if (r < 0) {
975 if (r == -ENOENT || r == -ENODEV) {
976 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
eafe8130
TL
977 goto retry;
978 }
9f95a23c 979 return r;
eafe8130 980 }
7c673cae 981
9f95a23c
TL
982 udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm.get())) {
983 auto dev = dev_from_list_entry(udev, l);
7c673cae 984 if (dev) {
9f95a23c 985 have_output |= dump_one_image(f, tbl, dev.get());
7c673cae
FG
986 }
987 }
988
9f95a23c 989 return have_output;
7c673cae
FG
990}
991
9f95a23c 992static int dump_images(struct krbd_ctx *ctx, Formatter *f)
7c673cae
FG
993{
994 TextTable tbl;
995 int r;
996
997 if (f) {
11fdf7f2 998 f->open_array_section("devices");
7c673cae
FG
999 } else {
1000 tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
1001 tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
11fdf7f2 1002 tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
7c673cae
FG
1003 tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
1004 tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
1005 tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
1006 }
1007
1008 r = do_dump(ctx->udev, f, &tbl);
1009
1010 if (f) {
1011 f->close_section();
1012 f->flush(cout);
1013 } else {
1014 if (r > 0)
1015 cout << tbl;
1016 }
1017
1018 return r;
1019}
1020
11fdf7f2
TL
1021static int is_mapped_image(struct udev *udev, const krbd_spec& spec,
1022 string *pname)
1023{
9f95a23c 1024 udev_enumerate_uptr enm;
11fdf7f2
TL
1025 struct udev_list_entry *l;
1026 int r;
1027
1028 r = enumerate_devices(udev, spec, &enm);
1029 if (r < 0)
1030 return r;
1031
9f95a23c 1032 l = udev_enumerate_get_list_entry(enm.get());
11fdf7f2 1033 if (l) {
9f95a23c
TL
1034 auto dev = dev_from_list_entry(udev, l);
1035 if (!dev)
1036 return -ENOMEM;
11fdf7f2 1037
9f95a23c
TL
1038 *pname = get_devnode(dev.get());
1039 return 1;
11fdf7f2
TL
1040 }
1041
9f95a23c 1042 return 0; /* not mapped */
11fdf7f2
TL
1043}
1044
f91f0fd5 1045extern "C" int krbd_create_from_context(rados_config_t cct, uint32_t flags,
7c673cae
FG
1046 struct krbd_ctx **pctx)
1047{
1048 struct krbd_ctx *ctx = new struct krbd_ctx();
1049
1050 ctx->cct = reinterpret_cast<CephContext *>(cct);
1051 ctx->udev = udev_new();
1052 if (!ctx->udev) {
1053 delete ctx;
1054 return -ENOMEM;
1055 }
f91f0fd5 1056 ctx->flags = flags;
7c673cae
FG
1057
1058 *pctx = ctx;
1059 return 0;
1060}
1061
1062extern "C" void krbd_destroy(struct krbd_ctx *ctx)
1063{
1064 if (!ctx)
1065 return;
1066
1067 udev_unref(ctx->udev);
1068
1069 delete ctx;
1070}
1071
11fdf7f2
TL
1072extern "C" int krbd_map(struct krbd_ctx *ctx,
1073 const char *pool_name,
1074 const char *nspace_name,
1075 const char *image_name,
1076 const char *snap_name,
1077 const char *options,
1078 char **pdevnode)
7c673cae 1079{
11fdf7f2 1080 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
7c673cae
FG
1081 string name;
1082 char *devnode;
1083 int r;
1084
11fdf7f2 1085 r = map_image(ctx, spec, options, &name);
7c673cae
FG
1086 if (r < 0)
1087 return r;
1088
1089 devnode = strdup(name.c_str());
1090 if (!devnode)
1091 return -ENOMEM;
1092
1093 *pdevnode = devnode;
1094 return r;
1095}
1096
1097extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
1098 const char *options)
1099{
1100 return unmap_image(ctx, devnode, options);
1101}
1102
11fdf7f2
TL
1103extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx,
1104 const char *pool_name,
1105 const char *nspace_name,
1106 const char *image_name,
1107 const char *snap_name,
7c673cae
FG
1108 const char *options)
1109{
11fdf7f2
TL
1110 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1111 return unmap_image(ctx, spec, options);
7c673cae
FG
1112}
1113
1114int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f)
1115{
1116 return dump_images(ctx, f);
1117}
11fdf7f2
TL
1118
1119extern "C" int krbd_is_mapped(struct krbd_ctx *ctx,
1120 const char *pool_name,
1121 const char *nspace_name,
1122 const char *image_name,
1123 const char *snap_name,
1124 char **pdevnode)
1125{
1126 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1127 string name;
1128 char *devnode;
1129 int r;
1130
1131 r = is_mapped_image(ctx->udev, spec, &name);
1132 if (r <= 0) /* error or not mapped */
1133 return r;
1134
1135 devnode = strdup(name.c_str());
1136 if (!devnode)
1137 return -ENOMEM;
1138
1139 *pdevnode = devnode;
1140 return r;
1141}