]> git.proxmox.com Git - ceph.git/blob - ceph/src/krbd.cc
5e011d7a9a077becce2b83cfac72f8390948ec1d
[ceph.git] / ceph / src / krbd.cc
1 /*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2014 Inktank Storage, Inc.
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <iostream>
16 #include <memory>
17 #include <optional>
18 #include <poll.h>
19 #include <sstream>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <string>
24 #include <sys/stat.h>
25 #include <sys/sysmacros.h>
26 #include <sys/types.h>
27 #include <tuple>
28 #include <unistd.h>
29 #include <utility>
30
31 #include "auth/KeyRing.h"
32 #include "common/errno.h"
33 #include "common/Formatter.h"
34 #include "common/module.h"
35 #include "common/run_cmd.h"
36 #include "common/safe_io.h"
37 #include "common/secret.h"
38 #include "common/TextTable.h"
39 #include "common/Thread.h"
40 #include "include/ceph_assert.h"
41 #include "include/stringify.h"
42 #include "include/krbd.h"
43 #include "mon/MonMap.h"
44
45 #include <blkid/blkid.h>
46 #include <boost/algorithm/string/predicate.hpp>
47 #include <boost/tokenizer.hpp>
48 #include <libudev.h>
49
/* udev monitor receive buffer size: 1M, doubled to 2M (SO_RCVBUFFORCE) */
static constexpr int UDEV_BUF_SIZE = 1024 * 1024;

/* get_devnode() prepends this to a device's sysname */
static constexpr char DEVNODE_PREFIX[] = "/dev/rbd";

/* snapshot name that krbd_spec substitutes for an empty snap_name */
static constexpr char SNAP_HEAD_NAME[] = "-";
53
54 #define DEFINE_UDEV_UPTR(what) \
55 struct udev_##what##_deleter { \
56 void operator()(udev_##what *p) { \
57 udev_##what##_unref(p); \
58 } \
59 }; \
60 using udev_##what##_uptr = \
61 std::unique_ptr<udev_##what, udev_##what##_deleter>;
62
63 DEFINE_UDEV_UPTR(monitor) /* udev_monitor_uptr */
64 DEFINE_UDEV_UPTR(enumerate) /* udev_enumerate_uptr */
65 DEFINE_UDEV_UPTR(device) /* udev_device_uptr */
66
67 struct krbd_ctx {
68 CephContext *cct;
69 struct udev *udev;
70 uint32_t flags; /* KRBD_CTX_F_* */
71 };
72
73 struct krbd_spec {
74 std::string pool_name;
75 std::string nspace_name;
76 std::string image_name;
77 std::string snap_name;
78
79 krbd_spec(const char *pool_name, const char *nspace_name,
80 const char *image_name, const char *snap_name)
81 : pool_name(pool_name),
82 nspace_name(nspace_name),
83 image_name(image_name),
84 snap_name(*snap_name ? snap_name : SNAP_HEAD_NAME) { }
85
86 bool operator==(const krbd_spec& rhs) const {
87 return pool_name == rhs.pool_name &&
88 nspace_name == rhs.nspace_name &&
89 image_name == rhs.image_name &&
90 snap_name == rhs.snap_name;
91 }
92 };
93
94 static std::ostream& operator<<(std::ostream& os, const krbd_spec& spec)
95 {
96 os << spec.pool_name << "/";
97 if (!spec.nspace_name.empty())
98 os << spec.nspace_name << "/";
99 os << spec.image_name;
100 if (spec.snap_name != SNAP_HEAD_NAME)
101 os << "@" << spec.snap_name;
102 return os;
103 }
104
105 static std::optional<krbd_spec> spec_from_dev(udev_device *dev)
106 {
107 const char *pool_name = udev_device_get_sysattr_value(dev, "pool");
108 const char *nspace_name = udev_device_get_sysattr_value(dev, "pool_ns");
109 const char *image_name = udev_device_get_sysattr_value(dev, "name");
110 const char *snap_name = udev_device_get_sysattr_value(dev, "current_snap");
111
112 if (!pool_name || !image_name || !snap_name)
113 return std::nullopt;
114
115 return std::make_optional<krbd_spec>(
116 pool_name, nspace_name ?: "", image_name, snap_name);
117 }
118
119 static udev_device_uptr dev_from_list_entry(udev *udev, udev_list_entry *l)
120 {
121 return udev_device_uptr(
122 udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)));
123 }
124
125 static std::string get_devnode(udev_device *dev)
126 {
127 std::string devnode = DEVNODE_PREFIX;
128 devnode += udev_device_get_sysname(dev);
129 return devnode;
130 }
131
132 static int sysfs_write_rbd(const char *which, const string& buf)
133 {
134 const string s = string("/sys/bus/rbd/") + which;
135 const string t = s + "_single_major";
136 int fd;
137 int r;
138
139 /*
140 * 'add' and 'add_single_major' interfaces are identical, but if rbd
141 * kernel module is new enough and is configured to use single-major
142 * scheme, 'add' is disabled in order to prevent old userspace from
143 * doing weird things at unmap time.
144 *
145 * Same goes for 'remove' vs 'remove_single_major'.
146 */
147 fd = open(t.c_str(), O_WRONLY);
148 if (fd < 0) {
149 if (errno == ENOENT) {
150 fd = open(s.c_str(), O_WRONLY);
151 if (fd < 0)
152 return -errno;
153 } else {
154 return -errno;
155 }
156 }
157
158 r = safe_write(fd, buf.c_str(), buf.size());
159
160 close(fd);
161 return r;
162 }
163
164 static int sysfs_write_rbd_add(const string& buf)
165 {
166 return sysfs_write_rbd("add", buf);
167 }
168
169 static int sysfs_write_rbd_remove(const string& buf)
170 {
171 return sysfs_write_rbd("remove", buf);
172 }
173
/*
 * Return 1 if rbd devices carry a 'minor' sysfs attribute, 0 if not.
 *
 * 'minor' attribute was added as part of single_major merge, which
 * exposed the 'single_major' parameter.  'minor' is always present,
 * regardless of whether single-major scheme is turned on or not, so
 * the existence of the module parameter is used as the indicator.
 *
 * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
 * this has to work with rbd.ko backported to various kernels.)
 */
static int have_minor_attr(void)
{
  const bool param_exists =
      access("/sys/module/rbd/parameters/single_major", F_OK) == 0;
  return param_exists;
}
186
187 static int build_map_buf(CephContext *cct, const krbd_spec& spec,
188 const string& options, string *pbuf)
189 {
190 bool msgr2 = false;
191 ostringstream oss;
192 int r;
193
194 boost::char_separator<char> sep(",");
195 boost::tokenizer<boost::char_separator<char>> tok(options, sep);
196 for (const auto& t : tok) {
197 if (boost::starts_with(t, "ms_mode=")) {
198 /* msgr2 unless ms_mode=legacy */
199 msgr2 = t.compare(8, t.npos, "legacy");
200 }
201 }
202
203 MonMap monmap;
204 r = monmap.build_initial(cct, false, cerr);
205 if (r < 0)
206 return r;
207
208 /*
209 * If msgr2, filter TYPE_MSGR2 addresses. Otherwise, filter
210 * TYPE_LEGACY addresses.
211 */
212 for (const auto& p : monmap.mon_info) {
213 for (const auto& a : p.second.public_addrs.v) {
214 if ((msgr2 && a.is_msgr2()) || (!msgr2 && a.is_legacy())) {
215 if (oss.tellp() > 0) {
216 oss << ",";
217 }
218 oss << a.get_sockaddr();
219 }
220 }
221 }
222
223 oss << " name=" << cct->_conf->name.get_id();
224
225 KeyRing keyring;
226 auto auth_client_required =
227 cct->_conf.get_val<std::string>("auth_client_required");
228 if (auth_client_required != "none") {
229 r = keyring.from_ceph_context(cct);
230 auto keyfile = cct->_conf.get_val<std::string>("keyfile");
231 auto key = cct->_conf.get_val<std::string>("key");
232 if (r == -ENOENT && keyfile.empty() && key.empty())
233 r = 0;
234 if (r < 0) {
235 cerr << "rbd: failed to get secret" << std::endl;
236 return r;
237 }
238 }
239
240 CryptoKey secret;
241 string key_name = string("client.") + cct->_conf->name.get_id();
242 if (keyring.get_secret(cct->_conf->name, secret)) {
243 string secret_str;
244 secret.encode_base64(secret_str);
245
246 r = set_kernel_secret(secret_str.c_str(), key_name.c_str());
247 if (r >= 0) {
248 if (r == 0)
249 cerr << "rbd: warning: secret has length 0" << std::endl;
250 oss << ",key=" << key_name;
251 } else if (r == -ENODEV || r == -ENOSYS) {
252 // running against older kernel; fall back to secret= in options
253 oss << ",secret=" << secret_str;
254 } else {
255 cerr << "rbd: failed to add secret '" << key_name << "' to kernel"
256 << std::endl;
257 return r;
258 }
259 } else if (is_kernel_secret(key_name.c_str())) {
260 oss << ",key=" << key_name;
261 }
262
263 if (!options.empty())
264 oss << "," << options;
265 if (!spec.nspace_name.empty())
266 oss << ",_pool_ns=" << spec.nspace_name;
267
268 oss << " " << spec.pool_name << " " << spec.image_name << " "
269 << spec.snap_name;
270
271 *pbuf = oss.str();
272 return 0;
273 }
274
275 /*
276 * Return:
277 * <kernel error, false> - didn't map
278 * <0 or udev error, true> - mapped
279 */
280 template <typename F>
281 static std::pair<int, bool> wait_for_mapping(int sysfs_r_fd, udev_monitor *mon,
282 F udev_device_handler)
283 {
284 struct pollfd fds[2];
285 int sysfs_r = INT_MAX, udev_r = INT_MAX;
286 int r;
287
288 fds[0].fd = sysfs_r_fd;
289 fds[0].events = POLLIN;
290 fds[1].fd = udev_monitor_get_fd(mon);
291 fds[1].events = POLLIN;
292
293 for (;;) {
294 if (poll(fds, 2, -1) < 0) {
295 ceph_abort_msgf("poll failed: %d", -errno);
296 }
297
298 if (fds[0].revents) {
299 r = safe_read_exact(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
300 if (r < 0) {
301 ceph_abort_msgf("safe_read_exact failed: %d", r);
302 }
303 if (sysfs_r < 0) {
304 return std::make_pair(sysfs_r, false);
305 }
306 if (udev_r != INT_MAX) {
307 ceph_assert(!sysfs_r);
308 return std::make_pair(udev_r, true);
309 }
310 fds[0].fd = -1;
311 }
312
313 if (fds[1].revents) {
314 for (;;) {
315 udev_device_uptr dev(udev_monitor_receive_device(mon));
316 if (!dev) {
317 if (errno != EINTR && errno != EAGAIN) {
318 udev_r = -errno;
319 if (sysfs_r != INT_MAX) {
320 ceph_assert(!sysfs_r);
321 return std::make_pair(udev_r, true);
322 }
323 fds[1].fd = -1;
324 }
325 break;
326 }
327 if (udev_device_handler(std::move(dev))) {
328 udev_r = 0;
329 if (sysfs_r != INT_MAX) {
330 ceph_assert(!sysfs_r);
331 return std::make_pair(udev_r, true);
332 }
333 fds[1].fd = -1;
334 break;
335 }
336 }
337 }
338 }
339 }
340
341 class UdevMapHandler {
342 public:
343 UdevMapHandler(const krbd_spec *spec, std::string *pdevnode) :
344 m_spec(spec), m_pdevnode(pdevnode) {}
345
346 /*
347 * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
348 * block device to show up. This is necessary because rbd devices
349 * and block devices aren't linked together in our sysfs layout.
350 *
351 * Note that our "block" event can come before the "rbd" event, so
352 * all potential "block" events are gathered in m_block_devs before
353 * m_bus_dev is caught.
354 */
355 bool operator()(udev_device_uptr dev) {
356 if (strcmp(udev_device_get_action(dev.get()), "add")) {
357 return false;
358 }
359 if (!strcmp(udev_device_get_subsystem(dev.get()), "rbd")) {
360 if (!m_bus_dev) {
361 auto spec = spec_from_dev(dev.get());
362 if (spec && *spec == *m_spec) {
363 m_bus_dev = std::move(dev);
364 m_devnode = get_devnode(m_bus_dev.get());
365 }
366 }
367 } else if (!strcmp(udev_device_get_subsystem(dev.get()), "block")) {
368 if (boost::starts_with(udev_device_get_devnode(dev.get()),
369 DEVNODE_PREFIX)) {
370 m_block_devs.push_back(std::move(dev));
371 }
372 }
373
374 if (m_bus_dev && !m_block_devs.empty()) {
375 for (const auto& p : m_block_devs) {
376 if (udev_device_get_devnode(p.get()) == m_devnode) {
377 ceph_assert(!strcmp(
378 udev_device_get_sysattr_value(m_bus_dev.get(), "major"),
379 udev_device_get_property_value(p.get(), "MAJOR")));
380 ceph_assert(!have_minor_attr() || !strcmp(
381 udev_device_get_sysattr_value(m_bus_dev.get(), "minor"),
382 udev_device_get_property_value(p.get(), "MINOR")));
383 *m_pdevnode = std::move(m_devnode);
384 return true;
385 }
386 }
387 m_block_devs.clear();
388 }
389 return false;
390 }
391
392 private:
393 udev_device_uptr m_bus_dev;
394 std::vector<udev_device_uptr> m_block_devs;
395 std::string m_devnode;
396 const krbd_spec *m_spec;
397 std::string *m_pdevnode;
398 };
399
400 static const char *get_event_source(const krbd_ctx *ctx)
401 {
402 if (ctx->flags & KRBD_CTX_F_NOUDEV) {
403 /*
404 * For block devices (unlike network interfaces, they don't
405 * carry any namespace tags), the kernel broadcasts uevents
406 * into all network namespaces that are owned by the initial
407 * user namespace. This restriction is new in 4.18: starting
408 * with 2.6.35 and through 4.17 the kernel broadcast uevents
409 * into all network namespaces, period.
410 *
411 * However, when invoked from a non-initial user namespace,
412 * udev_monitor_receive_device() has always ignored both kernel
413 * and udev uevents by virtue of requiring SCM_CREDENTIALS and
414 * checking that ucred->uid == 0. When UIDs and GIDs are sent to
415 * a process in a user namespace, they are translated according
416 * to that process's UID and GID mappings and, unless root in the
417 * user namespace is mapped to the global root, that check fails.
418 * Normally they show up as 65534(nobody) because the global root
419 * is not mapped.
420 */
421 return "kernel";
422 }
423
424 /*
425 * Like most netlink messages, udev uevents don't cross network
426 * namespace boundaries and are therefore confined to the initial
427 * network namespace.
428 */
429 return "udev";
430 }
431
432 static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf,
433 string *pname)
434 {
435 bool mapped;
436 int fds[2];
437 int r;
438
439 udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
440 get_event_source(ctx)));
441 if (!mon)
442 return -ENOMEM;
443
444 r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "rbd",
445 nullptr);
446 if (r < 0)
447 return r;
448
449 r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "block",
450 "disk");
451 if (r < 0)
452 return r;
453
454 r = udev_monitor_set_receive_buffer_size(mon.get(), UDEV_BUF_SIZE);
455 if (r < 0) {
456 std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
457 << std::endl;
458 /* not fatal */
459 }
460
461 r = udev_monitor_enable_receiving(mon.get());
462 if (r < 0)
463 return r;
464
465 if (pipe2(fds, O_NONBLOCK) < 0)
466 return -errno;
467
468 auto mapper = make_named_thread("mapper", [&buf, sysfs_r_fd = fds[1]]() {
469 int sysfs_r = sysfs_write_rbd_add(buf);
470 int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
471 if (r < 0) {
472 ceph_abort_msgf("safe_write failed: %d", r);
473 }
474 });
475
476 std::tie(r, mapped) = wait_for_mapping(fds[0], mon.get(),
477 UdevMapHandler(&spec, pname));
478 if (r < 0) {
479 if (!mapped) {
480 std::cerr << "rbd: sysfs write failed" << std::endl;
481 } else {
482 std::cerr << "rbd: udev wait failed" << std::endl;
483 /* TODO: fall back to enumeration */
484 }
485 }
486
487 mapper.join();
488 close(fds[0]);
489 close(fds[1]);
490 return r;
491 }
492
493 static int map_image(struct krbd_ctx *ctx, const krbd_spec& spec,
494 const char *options, string *pname)
495 {
496 string buf;
497 int r;
498
499 /*
500 * Modprobe rbd kernel module. If it supports single-major device
501 * number allocation scheme, make sure it's turned on.
502 *
503 * Do this before calling build_map_buf() - it wants "ceph" key type
504 * registered.
505 */
506 if (access("/sys/bus/rbd", F_OK) != 0) {
507 const char *module_options = NULL;
508 if (module_has_param("rbd", "single_major"))
509 module_options = "single_major=Y";
510
511 r = module_load("rbd", module_options);
512 if (r) {
513 cerr << "rbd: failed to load rbd kernel module (" << r << ")"
514 << std::endl;
515 /*
516 * Ignore the error: modprobe failing doesn't necessarily prevent
517 * from working.
518 */
519 }
520 }
521
522 r = build_map_buf(ctx->cct, spec, options, &buf);
523 if (r < 0)
524 return r;
525
526 return do_map(ctx, spec, buf, pname);
527 }
528
529 static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
530 {
531 udev_enumerate_uptr enm;
532 struct udev_list_entry *l;
533 int r;
534
535 retry:
536 enm.reset(udev_enumerate_new(udev));
537 if (!enm)
538 return -ENOMEM;
539
540 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
541 if (r < 0)
542 return r;
543
544 r = udev_enumerate_add_match_sysattr(enm.get(), "major",
545 stringify(major(devno)).c_str());
546 if (r < 0)
547 return r;
548
549 if (have_minor_attr()) {
550 r = udev_enumerate_add_match_sysattr(enm.get(), "minor",
551 stringify(minor(devno)).c_str());
552 if (r < 0)
553 return r;
554 }
555
556 r = udev_enumerate_scan_devices(enm.get());
557 if (r < 0) {
558 if (r == -ENOENT || r == -ENODEV) {
559 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
560 goto retry;
561 }
562 return r;
563 }
564
565 l = udev_enumerate_get_list_entry(enm.get());
566 if (!l)
567 return -ENOENT;
568
569 /* make sure there is only one match */
570 ceph_assert(!udev_list_entry_get_next(l));
571
572 auto dev = dev_from_list_entry(udev, l);
573 if (!dev)
574 return -ENOMEM;
575
576 *pid = udev_device_get_sysname(dev.get());
577 return 0;
578 }
579
580 static int __enumerate_devices(struct udev *udev, const krbd_spec& spec,
581 bool match_nspace, udev_enumerate_uptr *penm)
582 {
583 udev_enumerate_uptr enm;
584 int r;
585
586 retry:
587 enm.reset(udev_enumerate_new(udev));
588 if (!enm)
589 return -ENOMEM;
590
591 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
592 if (r < 0)
593 return r;
594
595 r = udev_enumerate_add_match_sysattr(enm.get(), "pool",
596 spec.pool_name.c_str());
597 if (r < 0)
598 return r;
599
600 if (match_nspace) {
601 r = udev_enumerate_add_match_sysattr(enm.get(), "pool_ns",
602 spec.nspace_name.c_str());
603 } else {
604 /*
605 * Match _only_ devices that don't have pool_ns attribute.
606 * If the kernel supports namespaces, the result will be empty.
607 */
608 r = udev_enumerate_add_nomatch_sysattr(enm.get(), "pool_ns", nullptr);
609 }
610 if (r < 0)
611 return r;
612
613 r = udev_enumerate_add_match_sysattr(enm.get(), "name",
614 spec.image_name.c_str());
615 if (r < 0)
616 return r;
617
618 r = udev_enumerate_add_match_sysattr(enm.get(), "current_snap",
619 spec.snap_name.c_str());
620 if (r < 0)
621 return r;
622
623 r = udev_enumerate_scan_devices(enm.get());
624 if (r < 0) {
625 if (r == -ENOENT || r == -ENODEV) {
626 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
627 goto retry;
628 }
629 return r;
630 }
631
632 *penm = std::move(enm);
633 return 0;
634 }
635
636 static int enumerate_devices(struct udev *udev, const krbd_spec& spec,
637 udev_enumerate_uptr *penm)
638 {
639 udev_enumerate_uptr enm;
640 int r;
641
642 r = __enumerate_devices(udev, spec, true, &enm);
643 if (r < 0)
644 return r;
645
646 /*
647 * If no namespace is set, try again with match_nspace=false to
648 * handle older kernels. On a newer kernel the result will remain
649 * the same (i.e. empty).
650 */
651 if (!udev_enumerate_get_list_entry(enm.get()) && spec.nspace_name.empty()) {
652 r = __enumerate_devices(udev, spec, false, &enm);
653 if (r < 0)
654 return r;
655 }
656
657 *penm = std::move(enm);
658 return 0;
659 }
660
661 static int spec_to_devno_and_krbd_id(struct udev *udev, const krbd_spec& spec,
662 dev_t *pdevno, string *pid)
663 {
664 udev_enumerate_uptr enm;
665 struct udev_list_entry *l;
666 unsigned int maj, min = 0;
667 string err;
668 int r;
669
670 r = enumerate_devices(udev, spec, &enm);
671 if (r < 0)
672 return r;
673
674 l = udev_enumerate_get_list_entry(enm.get());
675 if (!l)
676 return -ENOENT;
677
678 auto dev = dev_from_list_entry(udev, l);
679 if (!dev)
680 return -ENOMEM;
681
682 maj = strict_strtoll(udev_device_get_sysattr_value(dev.get(), "major"), 10,
683 &err);
684 if (!err.empty()) {
685 cerr << "rbd: couldn't parse major: " << err << std::endl;
686 return -EINVAL;
687 }
688 if (have_minor_attr()) {
689 min = strict_strtoll(udev_device_get_sysattr_value(dev.get(), "minor"), 10,
690 &err);
691 if (!err.empty()) {
692 cerr << "rbd: couldn't parse minor: " << err << std::endl;
693 return -EINVAL;
694 }
695 }
696
697 /*
698 * If an image is mapped more than once don't bother trying to unmap
699 * all devices - let users run unmap the same number of times they
700 * ran map.
701 */
702 if (udev_list_entry_get_next(l))
703 cerr << "rbd: " << spec << ": mapped more than once, unmapping "
704 << get_devnode(dev.get()) << " only" << std::endl;
705
706 *pdevno = makedev(maj, min);
707 *pid = udev_device_get_sysname(dev.get());
708 return 0;
709 }
710
/*
 * Append " <options>" to *buf.  Nothing is appended if options is
 * NULL or an empty string.
 *
 * buf      string to extend (must not be NULL)
 * options  unmap options, may be NULL or empty
 */
static void append_unmap_options(std::string *buf, const char *options)
{
  /* a null pointer is treated the same as "no options" */
  if (!options || *options == '\0') {
    return;
  }

  buf->push_back(' ');
  buf->append(options);
}
718
719 class UdevUnmapHandler {
720 public:
721 UdevUnmapHandler(dev_t devno) : m_devno(devno) {}
722
723 bool operator()(udev_device_uptr dev) {
724 if (strcmp(udev_device_get_action(dev.get()), "remove")) {
725 return false;
726 }
727 return udev_device_get_devnum(dev.get()) == m_devno;
728 }
729
730 private:
731 dev_t m_devno;
732 };
733
734 static int do_unmap(krbd_ctx *ctx, dev_t devno, const string& buf)
735 {
736 bool unmapped;
737 int fds[2];
738 int r;
739
740 udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
741 get_event_source(ctx)));
742 if (!mon)
743 return -ENOMEM;
744
745 r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "block",
746 "disk");
747 if (r < 0)
748 return r;
749
750 r = udev_monitor_set_receive_buffer_size(mon.get(), UDEV_BUF_SIZE);
751 if (r < 0) {
752 std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
753 << std::endl;
754 /* not fatal */
755 }
756
757 r = udev_monitor_enable_receiving(mon.get());
758 if (r < 0)
759 return r;
760
761 if (pipe2(fds, O_NONBLOCK) < 0)
762 return -errno;
763
764 auto unmapper = make_named_thread(
765 "unmapper", [&buf, sysfs_r_fd = fds[1], flags = ctx->flags]() {
766 /*
767 * On final device close(), kernel sends a block change event, in
768 * response to which udev apparently runs blkid on the device. This
769 * makes unmap fail with EBUSY, if issued right after final close().
770 * Try to circumvent this with a retry before turning to udev.
771 */
772 for (int tries = 0; ; tries++) {
773 int sysfs_r = sysfs_write_rbd_remove(buf);
774 if (sysfs_r == -EBUSY && tries < 2) {
775 if (!tries) {
776 usleep(250 * 1000);
777 } else if (!(flags & KRBD_CTX_F_NOUDEV)) {
778 /*
779 * libudev does not provide the "wait until the queue is empty"
780 * API or the sufficient amount of primitives to build it from.
781 */
782 std::string err = run_cmd("udevadm", "settle", "--timeout", "10",
783 (char *)NULL);
784 if (!err.empty())
785 std::cerr << "rbd: " << err << std::endl;
786 }
787 } else {
788 int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
789 if (r < 0) {
790 ceph_abort_msgf("safe_write failed: %d", r);
791 }
792 break;
793 }
794 }
795 });
796
797 std::tie(r, unmapped) = wait_for_mapping(fds[0], mon.get(),
798 UdevUnmapHandler(devno));
799 if (r < 0) {
800 if (!unmapped) {
801 std::cerr << "rbd: sysfs write failed" << std::endl;
802 } else {
803 std::cerr << "rbd: udev wait failed: " << cpp_strerror(r) << std::endl;
804 r = 0;
805 }
806 }
807
808 unmapper.join();
809 close(fds[0]);
810 close(fds[1]);
811 return r;
812 }
813
814 static int unmap_image(struct krbd_ctx *ctx, const char *devnode,
815 const char *options)
816 {
817 struct stat sb;
818 dev_t wholedevno = 0;
819 std::string buf;
820 int r;
821
822 if (stat(devnode, &sb) < 0 || !S_ISBLK(sb.st_mode)) {
823 cerr << "rbd: '" << devnode << "' is not a block device" << std::endl;
824 return -EINVAL;
825 }
826
827 r = blkid_devno_to_wholedisk(sb.st_rdev, NULL, 0, &wholedevno);
828 if (r < 0) {
829 cerr << "rbd: couldn't compute wholedevno: " << cpp_strerror(r)
830 << std::endl;
831 /*
832 * Ignore the error: we are given whole disks most of the time, and
833 * if it turns out this is a partition we will fail later anyway.
834 */
835 wholedevno = sb.st_rdev;
836 }
837
838 for (int tries = 0; ; tries++) {
839 r = devno_to_krbd_id(ctx->udev, wholedevno, &buf);
840 if (r == -ENOENT && tries < 2) {
841 usleep(250 * 1000);
842 } else {
843 if (r < 0) {
844 if (r == -ENOENT) {
845 std::cerr << "rbd: '" << devnode << "' is not an rbd device"
846 << std::endl;
847 r = -EINVAL;
848 }
849 return r;
850 }
851 if (tries) {
852 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
853 << std::endl;
854 }
855 break;
856 }
857 }
858
859 append_unmap_options(&buf, options);
860 return do_unmap(ctx, wholedevno, buf);
861 }
862
863 static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec,
864 const char *options)
865 {
866 dev_t devno = 0;
867 std::string buf;
868 int r;
869
870 for (int tries = 0; ; tries++) {
871 r = spec_to_devno_and_krbd_id(ctx->udev, spec, &devno, &buf);
872 if (r == -ENOENT && tries < 2) {
873 usleep(250 * 1000);
874 } else {
875 if (r < 0) {
876 if (r == -ENOENT) {
877 std::cerr << "rbd: " << spec << ": not a mapped image or snapshot"
878 << std::endl;
879 r = -EINVAL;
880 }
881 return r;
882 }
883 if (tries) {
884 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
885 << std::endl;
886 }
887 break;
888 }
889 }
890
891 append_unmap_options(&buf, options);
892 return do_unmap(ctx, devno, buf);
893 }
894
895 static bool dump_one_image(Formatter *f, TextTable *tbl,
896 struct udev_device *dev)
897 {
898 auto spec = spec_from_dev(dev);
899 std::string devnode = get_devnode(dev);
900 const char *id = devnode.c_str() + sizeof(DEVNODE_PREFIX) - 1;
901
902 if (!spec)
903 return false;
904
905 if (f) {
906 f->open_object_section("device");
907 f->dump_string("id", id);
908 f->dump_string("pool", spec->pool_name);
909 f->dump_string("namespace", spec->nspace_name);
910 f->dump_string("name", spec->image_name);
911 f->dump_string("snap", spec->snap_name);
912 f->dump_string("device", devnode);
913 f->close_section();
914 } else {
915 *tbl << id << spec->pool_name << spec->nspace_name << spec->image_name
916 << spec->snap_name << devnode << TextTable::endrow;
917 }
918
919 return true;
920 }
921
922 static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl)
923 {
924 udev_enumerate_uptr enm;
925 struct udev_list_entry *l = NULL;
926 bool have_output = false;
927 int r;
928
929 retry:
930 enm.reset(udev_enumerate_new(udev));
931 if (!enm)
932 return -ENOMEM;
933
934 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
935 if (r < 0)
936 return r;
937
938 r = udev_enumerate_scan_devices(enm.get());
939 if (r < 0) {
940 if (r == -ENOENT || r == -ENODEV) {
941 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
942 goto retry;
943 }
944 return r;
945 }
946
947 udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm.get())) {
948 auto dev = dev_from_list_entry(udev, l);
949 if (dev) {
950 have_output |= dump_one_image(f, tbl, dev.get());
951 }
952 }
953
954 return have_output;
955 }
956
957 static int dump_images(struct krbd_ctx *ctx, Formatter *f)
958 {
959 TextTable tbl;
960 int r;
961
962 if (f) {
963 f->open_array_section("devices");
964 } else {
965 tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
966 tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
967 tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
968 tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
969 tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
970 tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
971 }
972
973 r = do_dump(ctx->udev, f, &tbl);
974
975 if (f) {
976 f->close_section();
977 f->flush(cout);
978 } else {
979 if (r > 0)
980 cout << tbl;
981 }
982
983 return r;
984 }
985
986 static int is_mapped_image(struct udev *udev, const krbd_spec& spec,
987 string *pname)
988 {
989 udev_enumerate_uptr enm;
990 struct udev_list_entry *l;
991 int r;
992
993 r = enumerate_devices(udev, spec, &enm);
994 if (r < 0)
995 return r;
996
997 l = udev_enumerate_get_list_entry(enm.get());
998 if (l) {
999 auto dev = dev_from_list_entry(udev, l);
1000 if (!dev)
1001 return -ENOMEM;
1002
1003 *pname = get_devnode(dev.get());
1004 return 1;
1005 }
1006
1007 return 0; /* not mapped */
1008 }
1009
1010 extern "C" int krbd_create_from_context(rados_config_t cct, uint32_t flags,
1011 struct krbd_ctx **pctx)
1012 {
1013 struct krbd_ctx *ctx = new struct krbd_ctx();
1014
1015 ctx->cct = reinterpret_cast<CephContext *>(cct);
1016 ctx->udev = udev_new();
1017 if (!ctx->udev) {
1018 delete ctx;
1019 return -ENOMEM;
1020 }
1021 ctx->flags = flags;
1022
1023 *pctx = ctx;
1024 return 0;
1025 }
1026
1027 extern "C" void krbd_destroy(struct krbd_ctx *ctx)
1028 {
1029 if (!ctx)
1030 return;
1031
1032 udev_unref(ctx->udev);
1033
1034 delete ctx;
1035 }
1036
1037 extern "C" int krbd_map(struct krbd_ctx *ctx,
1038 const char *pool_name,
1039 const char *nspace_name,
1040 const char *image_name,
1041 const char *snap_name,
1042 const char *options,
1043 char **pdevnode)
1044 {
1045 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1046 string name;
1047 char *devnode;
1048 int r;
1049
1050 r = map_image(ctx, spec, options, &name);
1051 if (r < 0)
1052 return r;
1053
1054 devnode = strdup(name.c_str());
1055 if (!devnode)
1056 return -ENOMEM;
1057
1058 *pdevnode = devnode;
1059 return r;
1060 }
1061
1062 extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
1063 const char *options)
1064 {
1065 return unmap_image(ctx, devnode, options);
1066 }
1067
1068 extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx,
1069 const char *pool_name,
1070 const char *nspace_name,
1071 const char *image_name,
1072 const char *snap_name,
1073 const char *options)
1074 {
1075 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1076 return unmap_image(ctx, spec, options);
1077 }
1078
1079 int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f)
1080 {
1081 return dump_images(ctx, f);
1082 }
1083
1084 extern "C" int krbd_is_mapped(struct krbd_ctx *ctx,
1085 const char *pool_name,
1086 const char *nspace_name,
1087 const char *image_name,
1088 const char *snap_name,
1089 char **pdevnode)
1090 {
1091 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1092 string name;
1093 char *devnode;
1094 int r;
1095
1096 r = is_mapped_image(ctx->udev, spec, &name);
1097 if (r <= 0) /* error or not mapped */
1098 return r;
1099
1100 devnode = strdup(name.c_str());
1101 if (!devnode)
1102 return -ENOMEM;
1103
1104 *pdevnode = devnode;
1105 return r;
1106 }