]> git.proxmox.com Git - ceph.git/blob - ceph/src/krbd.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / krbd.cc
1 /*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2014 Inktank Storage, Inc.
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <iostream>
16 #include <memory>
17 #include <optional>
18 #include <poll.h>
19 #include <regex>
20 #include <sstream>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <string>
25 #include <sys/stat.h>
26 #include <sys/sysmacros.h>
27 #include <sys/types.h>
28 #include <tuple>
29 #include <unistd.h>
30 #include <utility>
31
32 #include "auth/KeyRing.h"
33 #include "common/errno.h"
34 #include "common/Formatter.h"
35 #include "common/module.h"
36 #include "common/run_cmd.h"
37 #include "common/safe_io.h"
38 #include "common/secret.h"
39 #include "common/TextTable.h"
40 #include "common/Thread.h"
41 #include "include/ceph_assert.h"
42 #include "include/stringify.h"
43 #include "include/krbd.h"
44 #include "mon/MonMap.h"
45
46 #include <blkid/blkid.h>
47 #include <boost/algorithm/string/predicate.hpp>
48 #include <boost/tokenizer.hpp>
49 #include <libudev.h>
50
/*
 * Receive buffer size requested for the udev monitor sockets created in
 * this file (passed to udev_monitor_set_receive_buffer_size()).
 */
static const int UDEV_BUF_SIZE = 1 << 20; /* doubled to 2M (SO_RCVBUFFORCE) */
/* Prefix that get_devnode() puts in front of a udev sysname. */
static const char DEVNODE_PREFIX[] = "/dev/rbd";
/* Snapshot name stored in krbd_spec when an empty snapshot name is given. */
static const char SNAP_HEAD_NAME[] = "-";
54
/*
 * DEFINE_UDEV_UPTR(what) generates, for the libudev type udev_<what>:
 *   - udev_<what>_deleter: functor that calls udev_<what>_unref()
 *   - udev_<what>_uptr:    std::unique_ptr alias using that deleter
 */
#define DEFINE_UDEV_UPTR(what) \
struct udev_##what##_deleter { \
  void operator()(udev_##what *p) { \
    udev_##what##_unref(p); \
  } \
}; \
using udev_##what##_uptr = \
    std::unique_ptr<udev_##what, udev_##what##_deleter>;

DEFINE_UDEV_UPTR(monitor) /* udev_monitor_uptr */
DEFINE_UDEV_UPTR(enumerate) /* udev_enumerate_uptr */
DEFINE_UDEV_UPTR(device) /* udev_device_uptr */
67
68 using std::string;
69
/*
 * Library context: allocated by krbd_create_from_context() and freed by
 * krbd_destroy().
 */
struct krbd_ctx {
  CephContext *cct;   /* caller's rados_config_t; krbd_destroy() does not free it */
  struct udev *udev;  /* from udev_new(); dropped with udev_unref() in krbd_destroy() */
  uint32_t flags; /* KRBD_CTX_F_* */
};
75
76 struct krbd_spec {
77 std::string pool_name;
78 std::string nspace_name;
79 std::string image_name;
80 std::string snap_name;
81
82 krbd_spec(const char *pool_name, const char *nspace_name,
83 const char *image_name, const char *snap_name)
84 : pool_name(pool_name),
85 nspace_name(nspace_name),
86 image_name(image_name),
87 snap_name(*snap_name ? snap_name : SNAP_HEAD_NAME) { }
88
89 bool operator==(const krbd_spec& rhs) const {
90 return pool_name == rhs.pool_name &&
91 nspace_name == rhs.nspace_name &&
92 image_name == rhs.image_name &&
93 snap_name == rhs.snap_name;
94 }
95 };
96
97 static std::ostream& operator<<(std::ostream& os, const krbd_spec& spec)
98 {
99 os << spec.pool_name << "/";
100 if (!spec.nspace_name.empty())
101 os << spec.nspace_name << "/";
102 os << spec.image_name;
103 if (spec.snap_name != SNAP_HEAD_NAME)
104 os << "@" << spec.snap_name;
105 return os;
106 }
107
108 static std::optional<krbd_spec> spec_from_dev(udev_device *dev)
109 {
110 const char *pool_name = udev_device_get_sysattr_value(dev, "pool");
111 const char *nspace_name = udev_device_get_sysattr_value(dev, "pool_ns");
112 const char *image_name = udev_device_get_sysattr_value(dev, "name");
113 const char *snap_name = udev_device_get_sysattr_value(dev, "current_snap");
114
115 if (!pool_name || !image_name || !snap_name)
116 return std::nullopt;
117
118 return std::make_optional<krbd_spec>(
119 pool_name, nspace_name ?: "", image_name, snap_name);
120 }
121
122 static udev_device_uptr dev_from_list_entry(udev *udev, udev_list_entry *l)
123 {
124 return udev_device_uptr(
125 udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)));
126 }
127
128 static std::string get_devnode(udev_device *dev)
129 {
130 std::string devnode = DEVNODE_PREFIX;
131 devnode += udev_device_get_sysname(dev);
132 return devnode;
133 }
134
135 static int sysfs_write_rbd(const char *which, const string& buf)
136 {
137 const string s = string("/sys/bus/rbd/") + which;
138 const string t = s + "_single_major";
139 int fd;
140 int r;
141
142 /*
143 * 'add' and 'add_single_major' interfaces are identical, but if rbd
144 * kernel module is new enough and is configured to use single-major
145 * scheme, 'add' is disabled in order to prevent old userspace from
146 * doing weird things at unmap time.
147 *
148 * Same goes for 'remove' vs 'remove_single_major'.
149 */
150 fd = open(t.c_str(), O_WRONLY);
151 if (fd < 0) {
152 if (errno == ENOENT) {
153 fd = open(s.c_str(), O_WRONLY);
154 if (fd < 0)
155 return -errno;
156 } else {
157 return -errno;
158 }
159 }
160
161 r = safe_write(fd, buf.c_str(), buf.size());
162
163 close(fd);
164 return r;
165 }
166
167 static int sysfs_write_rbd_add(const string& buf)
168 {
169 return sysfs_write_rbd("add", buf);
170 }
171
172 static int sysfs_write_rbd_remove(const string& buf)
173 {
174 return sysfs_write_rbd("remove", buf);
175 }
176
/*
 * Return non-zero if the rbd module exposes the 'single_major'
 * parameter, which is used here as the indicator that rbd devices have
 * a 'minor' sysfs attribute.
 */
static int have_minor_attr(void)
{
  /*
   * 'minor' attribute was added as part of single_major merge, which
   * exposed the 'single_major' parameter.  'minor' is always present,
   * regardless of whether single-major scheme is turned on or not.
   *
   * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
   * this has to work with rbd.ko backported to various kernels.)
   */
  static const char single_major_param[] =
      "/sys/module/rbd/parameters/single_major";
  const bool present = access(single_major_param, F_OK) == 0;
  return present;
}
189
/*
 * Assemble the string that is later written to the rbd sysfs "add"
 * interface.  The result is laid out as
 *
 *   <mon addrs> name=<id>[,key=<name>|,secret=<b64>][,<options>]
 *       [,_pool_ns=<ns>] <pool> <image> <snap>
 *
 * cct     - context providing configuration (entity name, auth settings)
 * spec    - image to map
 * options - comma-separated map options; only "ms_mode=" is inspected
 *           here, the whole string is appended verbatim
 * pbuf    - receives the assembled string on success
 *
 * Returns 0 on success or a negative error code.
 */
static int build_map_buf(CephContext *cct, const krbd_spec& spec,
                         const string& options, string *pbuf)
{
  bool msgr2 = false;
  std::ostringstream oss;
  int r;

  /* scan the options for ms_mode; a later ms_mode= token overrides an earlier one */
  boost::char_separator<char> sep(",");
  boost::tokenizer<boost::char_separator<char>> tok(options, sep);
  for (const auto& t : tok) {
    if (boost::starts_with(t, "ms_mode=")) {
      /* msgr2 unless ms_mode=legacy */
      msgr2 = t.compare(8, t.npos, "legacy");
    }
  }

  MonMap monmap;
  r = monmap.build_initial(cct, false, std::cerr);
  if (r < 0)
    return r;

  /*
   * If msgr2, filter TYPE_MSGR2 addresses.  Otherwise, filter
   * TYPE_LEGACY addresses.
   */
  for (const auto& p : monmap.mon_info) {
    for (const auto& a : p.second.public_addrs.v) {
      if ((msgr2 && a.is_msgr2()) || (!msgr2 && a.is_legacy())) {
        /* comma-separate every address after the first */
        if (oss.tellp() > 0) {
          oss << ",";
        }
        oss << a.get_sockaddr();
      }
    }
  }

  /* nothing written so far means no monitor address matched the mode */
  if (oss.tellp() == 0) {
    std::cerr << "rbd: failed to get mon address (possible ms_mode mismatch)" << std::endl;
    return -ENOENT;
  }

  oss << " name=" << cct->_conf->name.get_id();

  KeyRing keyring;
  auto auth_client_required =
    cct->_conf.get_val<std::string>("auth_client_required");
  if (auth_client_required != "none") {
    r = keyring.from_ceph_context(cct);
    auto keyfile = cct->_conf.get_val<std::string>("keyfile");
    auto key = cct->_conf.get_val<std::string>("key");
    /* -ENOENT is tolerated when neither keyfile nor key is configured */
    if (r == -ENOENT && keyfile.empty() && key.empty())
      r = 0;
    if (r < 0) {
      std::cerr << "rbd: failed to get secret" << std::endl;
      return r;
    }
  }

  CryptoKey secret;
  string key_name = string("client.") + cct->_conf->name.get_id();
  if (keyring.get_secret(cct->_conf->name, secret)) {
    string secret_str;
    secret.encode_base64(secret_str);

    r = set_kernel_secret(secret_str.c_str(), key_name.c_str());
    if (r >= 0) {
      if (r == 0)
        std::cerr << "rbd: warning: secret has length 0" << std::endl;
      oss << ",key=" << key_name;
    } else if (r == -ENODEV || r == -ENOSYS) {
      // running against older kernel; fall back to secret= in options
      oss << ",secret=" << secret_str;
    } else {
      std::cerr << "rbd: failed to add secret '" << key_name << "' to kernel"
                << std::endl;
      return r;
    }
  } else if (is_kernel_secret(key_name.c_str())) {
    /* no secret in the keyring: refer to one is_kernel_secret() reports as present */
    oss << ",key=" << key_name;
  }

  if (!options.empty())
    oss << "," << options;
  if (!spec.nspace_name.empty())
    oss << ",_pool_ns=" << spec.nspace_name;

  oss << " " << spec.pool_name << " " << spec.image_name << " "
      << spec.snap_name;

  *pbuf = oss.str();
  return 0;
}
282
/*
 * Wait until both the result of the sysfs write (delivered as one int
 * through a pipe) and the awaited udev event have been seen.
 *
 * sysfs_r_fd          - read end of the pipe carrying the sysfs result
 * mon                 - udev monitor to receive devices from
 * udev_device_handler - callable taking a udev_device_uptr; returns
 *                       true once the awaited event has been observed
 *
 * Return:
 *   <kernel error, false> - didn't map
 *   <0 or udev error, true> - mapped
 */
template <typename F>
static std::pair<int, bool> wait_for_mapping(int sysfs_r_fd, udev_monitor *mon,
                                             F udev_device_handler)
{
  struct pollfd fds[2];
  /* INT_MAX marks "result not known yet" for either side */
  int sysfs_r = INT_MAX, udev_r = INT_MAX;
  int r;

  fds[0].fd = sysfs_r_fd;
  fds[0].events = POLLIN;
  fds[1].fd = udev_monitor_get_fd(mon);
  fds[1].events = POLLIN;

  for (;;) {
    /* NOTE(review): any poll() failure aborts; EINTR is not retried -- confirm intended */
    if (poll(fds, 2, -1) < 0) {
      ceph_abort_msgf("poll failed: %d", -errno);
    }

    if (fds[0].revents) {
      r = safe_read_exact(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
      if (r < 0) {
        ceph_abort_msgf("safe_read_exact failed: %d", r);
      }
      /* a failed sysfs write ends the wait regardless of udev state */
      if (sysfs_r < 0) {
        return std::make_pair(sysfs_r, false);
      }
      if (udev_r != INT_MAX) {
        ceph_assert(!sysfs_r);
        return std::make_pair(udev_r, true);
      }
      /* sysfs side done: a negative fd makes poll() skip this entry */
      fds[0].fd = -1;
    }

    if (fds[1].revents) {
      /* drain every device that is currently queued on the monitor */
      for (;;) {
        udev_device_uptr dev(udev_monitor_receive_device(mon));
        if (!dev) {
          /* EINTR/EAGAIN: go back to poll(); anything else is a udev error */
          if (errno != EINTR && errno != EAGAIN) {
            udev_r = -errno;
            if (sysfs_r != INT_MAX) {
              ceph_assert(!sysfs_r);
              return std::make_pair(udev_r, true);
            }
            fds[1].fd = -1;
          }
          break;
        }
        if (udev_device_handler(std::move(dev))) {
          udev_r = 0;
          if (sysfs_r != INT_MAX) {
            ceph_assert(!sysfs_r);
            return std::make_pair(udev_r, true);
          }
          /* udev side done: keep polling only for the sysfs result */
          fds[1].fd = -1;
          break;
        }
      }
    }
  }
}
348
/*
 * udev event handler used while mapping: recognizes the "add" events
 * that belong to the image described by spec and, on success, reports
 * the device node path and the MAJOR/MINOR property values through the
 * output pointers given to the constructor.
 */
class UdevMapHandler {
public:
  UdevMapHandler(const krbd_spec *spec, std::string *pdevnode,
                 std::string *majnum, std::string *minnum) :
    m_spec(spec), m_pdevnode(pdevnode), m_majnum(majnum), m_minnum(minnum) {}

  /*
   * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
   * block device to show up.  This is necessary because rbd devices
   * and block devices aren't linked together in our sysfs layout.
   *
   * Note that our "block" event can come before the "rbd" event, so
   * all potential "block" events are gathered in m_block_devs before
   * m_bus_dev is caught.
   *
   * Returns true once both the rbd device and its block device have
   * been seen, false otherwise.
   */
  bool operator()(udev_device_uptr dev) {
    /* only "add" events are of interest */
    if (strcmp(udev_device_get_action(dev.get()), "add")) {
      return false;
    }
    if (!strcmp(udev_device_get_subsystem(dev.get()), "rbd")) {
      /* keep the first rbd device whose sysfs attributes match the spec */
      if (!m_bus_dev) {
        auto spec = spec_from_dev(dev.get());
        if (spec && *spec == *m_spec) {
          m_bus_dev = std::move(dev);
          m_devnode = get_devnode(m_bus_dev.get());
        }
      }
    } else if (!strcmp(udev_device_get_subsystem(dev.get()), "block")) {
      if (boost::starts_with(udev_device_get_devnode(dev.get()),
                             DEVNODE_PREFIX)) {
        m_block_devs.push_back(std::move(dev));
      }
    }

    if (m_bus_dev && !m_block_devs.empty()) {
      for (const auto& p : m_block_devs) {
        if (udev_device_get_devnode(p.get()) == m_devnode) {
          *m_pdevnode = std::move(m_devnode);
          *m_majnum = udev_device_get_property_value(p.get(), "MAJOR");
          *m_minnum = udev_device_get_property_value(p.get(), "MINOR");
          /* the block device numbers must agree with the rbd device's sysfs attributes */
          ceph_assert(*m_majnum == udev_device_get_sysattr_value(
                                       m_bus_dev.get(), "major"));
          ceph_assert(!have_minor_attr() ||
                      *m_minnum == udev_device_get_sysattr_value(
                                       m_bus_dev.get(), "minor"));
          return true;
        }
      }
      /* none of the gathered block devices matched: drop them */
      m_block_devs.clear();
    }
    return false;
  }

private:
  udev_device_uptr m_bus_dev;                 /* matching "rbd" subsystem device, once seen */
  std::vector<udev_device_uptr> m_block_devs; /* candidate "block" devices seen so far */
  std::string m_devnode;                      /* device node path derived from m_bus_dev */
  const krbd_spec *m_spec;                    /* image being mapped */
  std::string *m_pdevnode;                    /* out: device node path */
  std::string *m_majnum;                      /* out: MAJOR property value */
  std::string *m_minnum;                      /* out: MINOR property value */
};
411
412 static const char *get_event_source(const krbd_ctx *ctx)
413 {
414 if (ctx->flags & KRBD_CTX_F_NOUDEV) {
415 /*
416 * For block devices (unlike network interfaces, they don't
417 * carry any namespace tags), the kernel broadcasts uevents
418 * into all network namespaces that are owned by the initial
419 * user namespace. This restriction is new in 4.18: starting
420 * with 2.6.35 and through 4.17 the kernel broadcast uevents
421 * into all network namespaces, period.
422 *
423 * However, when invoked from a non-initial user namespace,
424 * udev_monitor_receive_device() has always ignored both kernel
425 * and udev uevents by virtue of requiring SCM_CREDENTIALS and
426 * checking that ucred->uid == 0. When UIDs and GIDs are sent to
427 * a process in a user namespace, they are translated according
428 * to that process's UID and GID mappings and, unless root in the
429 * user namespace is mapped to the global root, that check fails.
430 * Normally they show up as 65534(nobody) because the global root
431 * is not mapped.
432 */
433 return "kernel";
434 }
435
436 /*
437 * Like most netlink messages, udev uevents don't cross network
438 * namespace boundaries and are therefore confined to the initial
439 * network namespace.
440 */
441 return "udev";
442 }
443
/*
 * Map an image: write buf to the sysfs "add" interface from a helper
 * thread while this thread waits for the matching udev events, then
 * verify the resulting device node.
 *
 * ctx   - krbd context (udev handle and flags)
 * spec  - image being mapped, used to recognize its udev events
 * buf   - string to write to sysfs (see build_map_buf())
 * pname - receives the device node path
 *
 * Returns 0 on success or a negative error code.
 */
static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf,
                  string *pname)
{
  std::string majnum, minnum;
  struct stat sb;
  bool mapped;
  int fds[2];
  int r;

  udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
                                                      get_event_source(ctx)));
  if (!mon)
    return -ENOMEM;

  r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "rbd",
                                                      nullptr);
  if (r < 0)
    return r;

  r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "block",
                                                      "disk");
  if (r < 0)
    return r;

  r = udev_monitor_set_receive_buffer_size(mon.get(), UDEV_BUF_SIZE);
  if (r < 0) {
    std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
              << std::endl;
    /* not fatal */
  }

  /* start receiving before the sysfs write is issued */
  r = udev_monitor_enable_receiving(mon.get());
  if (r < 0)
    return r;

  /* pipe used by the mapper thread to hand back the sysfs write result */
  if (pipe2(fds, O_NONBLOCK) < 0)
    return -errno;

  auto mapper = make_named_thread("mapper", [&buf, sysfs_r_fd = fds[1]]() {
    int sysfs_r = sysfs_write_rbd_add(buf);
    int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
    if (r < 0) {
      ceph_abort_msgf("safe_write failed: %d", r);
    }
  });

  std::tie(r, mapped) = wait_for_mapping(fds[0], mon.get(),
                                         UdevMapHandler(&spec, pname, &majnum,
                                                        &minnum));
  if (r < 0) {
    if (!mapped) {
      std::cerr << "rbd: sysfs write failed" << std::endl;
    } else {
      std::cerr << "rbd: udev wait failed" << std::endl;
      /* TODO: fall back to enumeration */
    }
  }

  /* the thread captures buf by reference and writes to fds[1]: join before closing */
  mapper.join();
  close(fds[0]);
  close(fds[1]);

  if (r < 0)
    return r;

  /*
   * Make sure our device node is there.  This is intended to help
   * diagnose environments where "rbd map" is run from a container with
   * a private /dev and some external mechanism (e.g. udev) is used to
   * add the device to the container asynchronously, possibly seconds
   * after "rbd map" successfully exits.  These setups are very fragile
   * and in some cases can even lead to data loss, depending on higher
   * level logic and orchestration layers involved.
   */
  ceph_assert(mapped);
  if (stat(pname->c_str(), &sb) < 0 || !S_ISBLK(sb.st_mode)) {
    std::cerr << "rbd: mapping succeeded but " << *pname
              << " is not accessible, is host /dev mounted?" << std::endl;
    return -EINVAL;
  }
  /* the node's device number must match what udev reported */
  if (stringify(major(sb.st_rdev)) != majnum ||
      stringify(minor(sb.st_rdev)) != minnum) {
    std::cerr << "rbd: mapping succeeded but " << *pname
              << " (" << major(sb.st_rdev) << ":" << minor(sb.st_rdev)
              << ") does not match expected " << majnum << ":" << minnum
              << std::endl;
    return -EINVAL;
  }

  return 0;
}
535
536 static int map_image(struct krbd_ctx *ctx, const krbd_spec& spec,
537 const char *options, string *pname)
538 {
539 string buf;
540 int r;
541
542 /*
543 * Modprobe rbd kernel module. If it supports single-major device
544 * number allocation scheme, make sure it's turned on.
545 *
546 * Do this before calling build_map_buf() - it wants "ceph" key type
547 * registered.
548 */
549 if (access("/sys/bus/rbd", F_OK) != 0) {
550 const char *module_options = NULL;
551 if (module_has_param("rbd", "single_major"))
552 module_options = "single_major=Y";
553
554 r = module_load("rbd", module_options);
555 if (r) {
556 std::cerr << "rbd: failed to load rbd kernel module (" << r << ")"
557 << std::endl;
558 /*
559 * Ignore the error: modprobe failing doesn't necessarily prevent
560 * from working.
561 */
562 }
563 }
564
565 r = build_map_buf(ctx->cct, spec, options, &buf);
566 if (r < 0)
567 return r;
568
569 return do_map(ctx, spec, buf, pname);
570 }
571
572 static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
573 {
574 udev_enumerate_uptr enm;
575 struct udev_list_entry *l;
576 int r;
577
578 retry:
579 enm.reset(udev_enumerate_new(udev));
580 if (!enm)
581 return -ENOMEM;
582
583 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
584 if (r < 0)
585 return r;
586
587 r = udev_enumerate_add_match_sysattr(enm.get(), "major",
588 stringify(major(devno)).c_str());
589 if (r < 0)
590 return r;
591
592 if (have_minor_attr()) {
593 r = udev_enumerate_add_match_sysattr(enm.get(), "minor",
594 stringify(minor(devno)).c_str());
595 if (r < 0)
596 return r;
597 }
598
599 r = udev_enumerate_scan_devices(enm.get());
600 if (r < 0) {
601 if (r == -ENOENT || r == -ENODEV) {
602 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
603 goto retry;
604 }
605 return r;
606 }
607
608 l = udev_enumerate_get_list_entry(enm.get());
609 if (!l)
610 return -ENOENT;
611
612 /* make sure there is only one match */
613 ceph_assert(!udev_list_entry_get_next(l));
614
615 auto dev = dev_from_list_entry(udev, l);
616 if (!dev)
617 return -ENOMEM;
618
619 *pid = udev_device_get_sysname(dev.get());
620 return 0;
621 }
622
/*
 * Escape glob metacharacters so that s is matched literally when used
 * as a glob pattern: each of '*', '?' and '[' is wrapped in square
 * brackets ("a*b" becomes "a[*]b").  All other characters, including
 * ']', are copied unchanged.
 *
 * This is a single pass over the string; compiling a std::regex on
 * every call just to substitute three characters is needlessly
 * expensive.
 */
static std::string escape_glob(const std::string& s)
{
  std::string escaped;
  escaped.reserve(s.size());

  for (const char c : s) {
    if (c == '*' || c == '?' || c == '[') {
      escaped += '[';
      escaped += c;
      escaped += ']';
    } else {
      escaped += c;
    }
  }
  return escaped;
}
629
630 static int __enumerate_devices(struct udev *udev, const krbd_spec& spec,
631 bool match_nspace, udev_enumerate_uptr *penm)
632 {
633 udev_enumerate_uptr enm;
634 int r;
635
636 retry:
637 enm.reset(udev_enumerate_new(udev));
638 if (!enm)
639 return -ENOMEM;
640
641 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
642 if (r < 0)
643 return r;
644
645 r = udev_enumerate_add_match_sysattr(enm.get(), "pool",
646 escape_glob(spec.pool_name).c_str());
647 if (r < 0)
648 return r;
649
650 if (match_nspace) {
651 r = udev_enumerate_add_match_sysattr(enm.get(), "pool_ns",
652 escape_glob(spec.nspace_name).c_str());
653 } else {
654 /*
655 * Match _only_ devices that don't have pool_ns attribute.
656 * If the kernel supports namespaces, the result will be empty.
657 */
658 r = udev_enumerate_add_nomatch_sysattr(enm.get(), "pool_ns", nullptr);
659 }
660 if (r < 0)
661 return r;
662
663 r = udev_enumerate_add_match_sysattr(enm.get(), "name",
664 escape_glob(spec.image_name).c_str());
665 if (r < 0)
666 return r;
667
668 r = udev_enumerate_add_match_sysattr(enm.get(), "current_snap",
669 escape_glob(spec.snap_name).c_str());
670 if (r < 0)
671 return r;
672
673 r = udev_enumerate_scan_devices(enm.get());
674 if (r < 0) {
675 if (r == -ENOENT || r == -ENODEV) {
676 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
677 goto retry;
678 }
679 return r;
680 }
681
682 *penm = std::move(enm);
683 return 0;
684 }
685
686 static int enumerate_devices(struct udev *udev, const krbd_spec& spec,
687 udev_enumerate_uptr *penm)
688 {
689 udev_enumerate_uptr enm;
690 int r;
691
692 r = __enumerate_devices(udev, spec, true, &enm);
693 if (r < 0)
694 return r;
695
696 /*
697 * If no namespace is set, try again with match_nspace=false to
698 * handle older kernels. On a newer kernel the result will remain
699 * the same (i.e. empty).
700 */
701 if (!udev_enumerate_get_list_entry(enm.get()) && spec.nspace_name.empty()) {
702 r = __enumerate_devices(udev, spec, false, &enm);
703 if (r < 0)
704 return r;
705 }
706
707 *penm = std::move(enm);
708 return 0;
709 }
710
711 static int spec_to_devno_and_krbd_id(struct udev *udev, const krbd_spec& spec,
712 dev_t *pdevno, string *pid)
713 {
714 udev_enumerate_uptr enm;
715 struct udev_list_entry *l;
716 unsigned int maj, min = 0;
717 string err;
718 int r;
719
720 r = enumerate_devices(udev, spec, &enm);
721 if (r < 0)
722 return r;
723
724 l = udev_enumerate_get_list_entry(enm.get());
725 if (!l)
726 return -ENOENT;
727
728 auto dev = dev_from_list_entry(udev, l);
729 if (!dev)
730 return -ENOMEM;
731
732 maj = strict_strtoll(udev_device_get_sysattr_value(dev.get(), "major"), 10,
733 &err);
734 if (!err.empty()) {
735 std::cerr << "rbd: couldn't parse major: " << err << std::endl;
736 return -EINVAL;
737 }
738 if (have_minor_attr()) {
739 min = strict_strtoll(udev_device_get_sysattr_value(dev.get(), "minor"), 10,
740 &err);
741 if (!err.empty()) {
742 std::cerr << "rbd: couldn't parse minor: " << err << std::endl;
743 return -EINVAL;
744 }
745 }
746
747 /*
748 * If an image is mapped more than once don't bother trying to unmap
749 * all devices - let users run unmap the same number of times they
750 * ran map.
751 */
752 if (udev_list_entry_get_next(l))
753 std::cerr << "rbd: " << spec << ": mapped more than once, unmapping "
754 << get_devnode(dev.get()) << " only" << std::endl;
755
756 *pdevno = makedev(maj, min);
757 *pid = udev_device_get_sysname(dev.get());
758 return 0;
759 }
760
/*
 * Append " <options>" to *buf.  Nothing is appended when options is the
 * empty string.
 */
static void append_unmap_options(std::string *buf, const char *options)
{
  if (options[0] == '\0') {
    return;
  }
  buf->append(" ").append(options);
}
768
769 class UdevUnmapHandler {
770 public:
771 UdevUnmapHandler(dev_t devno) : m_devno(devno) {}
772
773 bool operator()(udev_device_uptr dev) {
774 if (strcmp(udev_device_get_action(dev.get()), "remove")) {
775 return false;
776 }
777 return udev_device_get_devnum(dev.get()) == m_devno;
778 }
779
780 private:
781 dev_t m_devno;
782 };
783
/*
 * Unmap a device: write buf to the sysfs "remove" interface from a
 * helper thread (retrying on EBUSY) while this thread waits for the
 * "remove" udev event of devno.
 *
 * ctx   - krbd context (udev handle and flags)
 * devno - device number whose removal event is awaited
 * buf   - string to write to sysfs (device id plus optional options)
 *
 * Returns 0 on success, including when only the udev wait failed, or a
 * negative error code.
 */
static int do_unmap(krbd_ctx *ctx, dev_t devno, const string& buf)
{
  bool unmapped;
  int fds[2];
  int r;

  udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
                                                      get_event_source(ctx)));
  if (!mon)
    return -ENOMEM;

  r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "block",
                                                      "disk");
  if (r < 0)
    return r;

  r = udev_monitor_set_receive_buffer_size(mon.get(), UDEV_BUF_SIZE);
  if (r < 0) {
    std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
              << std::endl;
    /* not fatal */
  }

  /* start receiving before the sysfs write is issued */
  r = udev_monitor_enable_receiving(mon.get());
  if (r < 0)
    return r;

  /* pipe used by the unmapper thread to hand back the sysfs write result */
  if (pipe2(fds, O_NONBLOCK) < 0)
    return -errno;

  auto unmapper = make_named_thread(
      "unmapper", [&buf, sysfs_r_fd = fds[1], flags = ctx->flags]() {
    /*
     * On final device close(), kernel sends a block change event, in
     * response to which udev apparently runs blkid on the device.  This
     * makes unmap fail with EBUSY, if issued right after final close().
     * Try to circumvent this with a retry before turning to udev.
     */
    for (int tries = 0; ; tries++) {
      int sysfs_r = sysfs_write_rbd_remove(buf);
      /* at most two retries: first after a short sleep, then after udevadm settle */
      if (sysfs_r == -EBUSY && tries < 2) {
        if (!tries) {
          usleep(250 * 1000);
        } else if (!(flags & KRBD_CTX_F_NOUDEV)) {
          /*
           * libudev does not provide the "wait until the queue is empty"
           * API or the sufficient amount of primitives to build it from.
           */
          std::string err = run_cmd("udevadm", "settle", "--timeout", "10",
                                    (char *)NULL);
          if (!err.empty())
            std::cerr << "rbd: " << err << std::endl;
        }
      } else {
        /* final result (success or error): report it and stop */
        int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
        if (r < 0) {
          ceph_abort_msgf("safe_write failed: %d", r);
        }
        break;
      }
    }
  });

  std::tie(r, unmapped) = wait_for_mapping(fds[0], mon.get(),
                                           UdevUnmapHandler(devno));
  if (r < 0) {
    if (!unmapped) {
      std::cerr << "rbd: sysfs write failed" << std::endl;
    } else {
      /* the sysfs write succeeded, so a udev-side error is only reported */
      std::cerr << "rbd: udev wait failed: " << cpp_strerror(r) << std::endl;
      r = 0;
    }
  }

  /* the thread captures buf by reference and writes to fds[1]: join before closing */
  unmapper.join();
  close(fds[0]);
  close(fds[1]);
  return r;
}
863
864 static int unmap_image(struct krbd_ctx *ctx, const char *devnode,
865 const char *options)
866 {
867 struct stat sb;
868 dev_t wholedevno = 0;
869 std::string buf;
870 int r;
871
872 if (stat(devnode, &sb) < 0 || !S_ISBLK(sb.st_mode)) {
873 std::cerr << "rbd: '" << devnode << "' is not a block device" << std::endl;
874 return -EINVAL;
875 }
876
877 r = blkid_devno_to_wholedisk(sb.st_rdev, NULL, 0, &wholedevno);
878 if (r < 0) {
879 std::cerr << "rbd: couldn't compute wholedevno: " << cpp_strerror(r)
880 << std::endl;
881 /*
882 * Ignore the error: we are given whole disks most of the time, and
883 * if it turns out this is a partition we will fail later anyway.
884 */
885 wholedevno = sb.st_rdev;
886 }
887
888 for (int tries = 0; ; tries++) {
889 r = devno_to_krbd_id(ctx->udev, wholedevno, &buf);
890 if (r == -ENOENT && tries < 2) {
891 usleep(250 * 1000);
892 } else {
893 if (r < 0) {
894 if (r == -ENOENT) {
895 std::cerr << "rbd: '" << devnode << "' is not an rbd device"
896 << std::endl;
897 r = -EINVAL;
898 }
899 return r;
900 }
901 if (tries) {
902 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
903 << std::endl;
904 }
905 break;
906 }
907 }
908
909 append_unmap_options(&buf, options);
910 return do_unmap(ctx, wholedevno, buf);
911 }
912
913 static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec,
914 const char *options)
915 {
916 dev_t devno = 0;
917 std::string buf;
918 int r;
919
920 for (int tries = 0; ; tries++) {
921 r = spec_to_devno_and_krbd_id(ctx->udev, spec, &devno, &buf);
922 if (r == -ENOENT && tries < 2) {
923 usleep(250 * 1000);
924 } else {
925 if (r < 0) {
926 if (r == -ENOENT) {
927 std::cerr << "rbd: " << spec << ": not a mapped image or snapshot"
928 << std::endl;
929 r = -EINVAL;
930 }
931 return r;
932 }
933 if (tries) {
934 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
935 << std::endl;
936 }
937 break;
938 }
939 }
940
941 append_unmap_options(&buf, options);
942 return do_unmap(ctx, devno, buf);
943 }
944
945 static bool dump_one_image(Formatter *f, TextTable *tbl,
946 struct udev_device *dev)
947 {
948 auto spec = spec_from_dev(dev);
949 std::string devnode = get_devnode(dev);
950 const char *id = devnode.c_str() + sizeof(DEVNODE_PREFIX) - 1;
951
952 if (!spec)
953 return false;
954
955 if (f) {
956 f->open_object_section("device");
957 f->dump_string("id", id);
958 f->dump_string("pool", spec->pool_name);
959 f->dump_string("namespace", spec->nspace_name);
960 f->dump_string("name", spec->image_name);
961 f->dump_string("snap", spec->snap_name);
962 f->dump_string("device", devnode);
963 f->close_section();
964 } else {
965 *tbl << id << spec->pool_name << spec->nspace_name << spec->image_name
966 << spec->snap_name << devnode << TextTable::endrow;
967 }
968
969 return true;
970 }
971
972 static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl)
973 {
974 udev_enumerate_uptr enm;
975 struct udev_list_entry *l = NULL;
976 bool have_output = false;
977 int r;
978
979 retry:
980 enm.reset(udev_enumerate_new(udev));
981 if (!enm)
982 return -ENOMEM;
983
984 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
985 if (r < 0)
986 return r;
987
988 r = udev_enumerate_scan_devices(enm.get());
989 if (r < 0) {
990 if (r == -ENOENT || r == -ENODEV) {
991 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
992 goto retry;
993 }
994 return r;
995 }
996
997 udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm.get())) {
998 auto dev = dev_from_list_entry(udev, l);
999 if (dev) {
1000 have_output |= dump_one_image(f, tbl, dev.get());
1001 }
1002 }
1003
1004 return have_output;
1005 }
1006
1007 static int dump_images(struct krbd_ctx *ctx, Formatter *f)
1008 {
1009 TextTable tbl;
1010 int r;
1011
1012 if (f) {
1013 f->open_array_section("devices");
1014 } else {
1015 tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
1016 tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
1017 tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
1018 tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
1019 tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
1020 tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
1021 }
1022
1023 r = do_dump(ctx->udev, f, &tbl);
1024
1025 if (f) {
1026 f->close_section();
1027 f->flush(std::cout);
1028 } else {
1029 if (r > 0)
1030 std::cout << tbl;
1031 }
1032
1033 return r;
1034 }
1035
1036 static int is_mapped_image(struct udev *udev, const krbd_spec& spec,
1037 string *pname)
1038 {
1039 udev_enumerate_uptr enm;
1040 struct udev_list_entry *l;
1041 int r;
1042
1043 r = enumerate_devices(udev, spec, &enm);
1044 if (r < 0)
1045 return r;
1046
1047 l = udev_enumerate_get_list_entry(enm.get());
1048 if (l) {
1049 auto dev = dev_from_list_entry(udev, l);
1050 if (!dev)
1051 return -ENOMEM;
1052
1053 *pname = get_devnode(dev.get());
1054 return 1;
1055 }
1056
1057 return 0; /* not mapped */
1058 }
1059
1060 extern "C" int krbd_create_from_context(rados_config_t cct, uint32_t flags,
1061 struct krbd_ctx **pctx)
1062 {
1063 struct krbd_ctx *ctx = new struct krbd_ctx();
1064
1065 ctx->cct = reinterpret_cast<CephContext *>(cct);
1066 ctx->udev = udev_new();
1067 if (!ctx->udev) {
1068 delete ctx;
1069 return -ENOMEM;
1070 }
1071 ctx->flags = flags;
1072
1073 *pctx = ctx;
1074 return 0;
1075 }
1076
1077 extern "C" void krbd_destroy(struct krbd_ctx *ctx)
1078 {
1079 if (!ctx)
1080 return;
1081
1082 udev_unref(ctx->udev);
1083
1084 delete ctx;
1085 }
1086
1087 extern "C" int krbd_map(struct krbd_ctx *ctx,
1088 const char *pool_name,
1089 const char *nspace_name,
1090 const char *image_name,
1091 const char *snap_name,
1092 const char *options,
1093 char **pdevnode)
1094 {
1095 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1096 string name;
1097 char *devnode;
1098 int r;
1099
1100 r = map_image(ctx, spec, options, &name);
1101 if (r < 0)
1102 return r;
1103
1104 devnode = strdup(name.c_str());
1105 if (!devnode)
1106 return -ENOMEM;
1107
1108 *pdevnode = devnode;
1109 return r;
1110 }
1111
1112 extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
1113 const char *options)
1114 {
1115 return unmap_image(ctx, devnode, options);
1116 }
1117
1118 extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx,
1119 const char *pool_name,
1120 const char *nspace_name,
1121 const char *image_name,
1122 const char *snap_name,
1123 const char *options)
1124 {
1125 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1126 return unmap_image(ctx, spec, options);
1127 }
1128
1129 int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f)
1130 {
1131 return dump_images(ctx, f);
1132 }
1133
1134 extern "C" int krbd_is_mapped(struct krbd_ctx *ctx,
1135 const char *pool_name,
1136 const char *nspace_name,
1137 const char *image_name,
1138 const char *snap_name,
1139 char **pdevnode)
1140 {
1141 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1142 string name;
1143 char *devnode;
1144 int r;
1145
1146 r = is_mapped_image(ctx->udev, spec, &name);
1147 if (r <= 0) /* error or not mapped */
1148 return r;
1149
1150 devnode = strdup(name.c_str());
1151 if (!devnode)
1152 return -ENOMEM;
1153
1154 *pdevnode = devnode;
1155 return r;
1156 }