]> git.proxmox.com Git - ceph.git/blame - ceph/src/krbd.cc
update download target update for octopus release
[ceph.git] / ceph / src / krbd.cc
CommitLineData
7c673cae
FG
1/*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2014 Inktank Storage, Inc.
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13#include <errno.h>
14#include <fcntl.h>
15#include <iostream>
11fdf7f2 16#include <optional>
7c673cae
FG
17#include <poll.h>
18#include <sstream>
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include <string>
23#include <sys/stat.h>
11fdf7f2 24#include <sys/sysmacros.h>
7c673cae 25#include <sys/types.h>
eafe8130 26#include <tuple>
7c673cae 27#include <unistd.h>
eafe8130 28#include <utility>
7c673cae
FG
29
30#include "auth/KeyRing.h"
31#include "common/errno.h"
32#include "common/Formatter.h"
33#include "common/module.h"
34#include "common/run_cmd.h"
35#include "common/safe_io.h"
36#include "common/secret.h"
37#include "common/TextTable.h"
eafe8130 38#include "common/Thread.h"
11fdf7f2 39#include "include/ceph_assert.h"
7c673cae
FG
40#include "include/stringify.h"
41#include "include/krbd.h"
42#include "mon/MonMap.h"
43
44#include <blkid/blkid.h>
45#include <libudev.h>
46
/*
 * Receive buffer size requested for udev monitors (1M); per the
 * original note it ends up doubled to 2M (SO_RCVBUFFORCE).
 */
static const int UDEV_BUF_SIZE = 1024 * 1024;
7c673cae
FG
48
49struct krbd_ctx {
50 CephContext *cct;
51 struct udev *udev;
52};
53
11fdf7f2
TL
/* Snapshot name stored in a krbd_spec when no snapshot was given. */
static const std::string SNAP_HEAD_NAME = "-";
55
56struct krbd_spec {
57 std::string pool_name;
58 std::string nspace_name;
59 std::string image_name;
60 std::string snap_name;
61
62 krbd_spec(const char *pool_name, const char *nspace_name,
63 const char *image_name, const char *snap_name)
64 : pool_name(pool_name),
65 nspace_name(nspace_name),
66 image_name(image_name),
67 snap_name(*snap_name ? snap_name : SNAP_HEAD_NAME) { }
68
69 bool operator==(const krbd_spec& rhs) const {
70 return pool_name == rhs.pool_name &&
71 nspace_name == rhs.nspace_name &&
72 image_name == rhs.image_name &&
73 snap_name == rhs.snap_name;
74 }
75};
76
77std::ostream& operator<<(std::ostream& os, const krbd_spec& spec) {
78 os << spec.pool_name << "/";
79 if (!spec.nspace_name.empty())
80 os << spec.nspace_name << "/";
81 os << spec.image_name;
82 if (spec.snap_name != SNAP_HEAD_NAME)
83 os << "@" << spec.snap_name;
84 return os;
85}
86
87std::optional<krbd_spec> spec_from_dev(udev_device *dev) {
88 const char *pool_name = udev_device_get_sysattr_value(dev, "pool");
89 const char *nspace_name = udev_device_get_sysattr_value(dev, "pool_ns");
90 const char *image_name = udev_device_get_sysattr_value(dev, "name");
91 const char *snap_name = udev_device_get_sysattr_value(dev, "current_snap");
92
93 if (!pool_name || !image_name || !snap_name)
94 return std::nullopt;
95
96 return std::make_optional<krbd_spec>(
97 pool_name, nspace_name ?: "", image_name, snap_name);
98}
99
7c673cae
FG
/*
 * Return the device node path for an rbd device id, i.e. "/dev/rbd"
 * followed by id.  id must be a valid NUL-terminated string.
 */
static std::string get_kernel_rbd_name(const char *id)
{
  std::string devnode("/dev/rbd");
  devnode += id;
  return devnode;
}
104
105static int sysfs_write_rbd(const char *which, const string& buf)
106{
107 const string s = string("/sys/bus/rbd/") + which;
108 const string t = s + "_single_major";
109 int fd;
110 int r;
111
112 /*
113 * 'add' and 'add_single_major' interfaces are identical, but if rbd
114 * kernel module is new enough and is configured to use single-major
115 * scheme, 'add' is disabled in order to prevent old userspace from
116 * doing weird things at unmap time.
117 *
118 * Same goes for 'remove' vs 'remove_single_major'.
119 */
120 fd = open(t.c_str(), O_WRONLY);
121 if (fd < 0) {
122 if (errno == ENOENT) {
123 fd = open(s.c_str(), O_WRONLY);
124 if (fd < 0)
125 return -errno;
126 } else {
127 return -errno;
128 }
129 }
130
131 r = safe_write(fd, buf.c_str(), buf.size());
132
133 close(fd);
134 return r;
135}
136
137static int sysfs_write_rbd_add(const string& buf)
138{
139 return sysfs_write_rbd("add", buf);
140}
141
142static int sysfs_write_rbd_remove(const string& buf)
143{
144 return sysfs_write_rbd("remove", buf);
145}
146
/*
 * Report whether rbd devices expose a 'minor' sysfs attribute.
 *
 * 'minor' attribute was added as part of single_major merge, which
 * exposed the 'single_major' parameter.  'minor' is always present,
 * regardless of whether single-major scheme is turned on or not.
 *
 * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
 * this has to work with rbd.ko backported to various kernels.)
 *
 * Returns 1 if the 'single_major' module parameter file is
 * accessible, 0 otherwise.
 */
static int have_minor_attr(void)
{
  const bool has_single_major_param =
      access("/sys/module/rbd/parameters/single_major", F_OK) == 0;
  return has_single_major_param;
}
159
11fdf7f2
TL
160static int build_map_buf(CephContext *cct, const krbd_spec& spec,
161 const char *options, string *pbuf)
7c673cae
FG
162{
163 ostringstream oss;
164 int r;
165
166 MonMap monmap;
11fdf7f2 167 r = monmap.build_initial(cct, false, cerr);
7c673cae
FG
168 if (r < 0)
169 return r;
170
171 list<entity_addr_t> mon_addr;
172 monmap.list_addrs(mon_addr);
173
174 for (const auto &p : mon_addr) {
175 if (oss.tellp() > 0) {
176 oss << ",";
177 }
178 oss << p.get_sockaddr();
179 }
180
181 oss << " name=" << cct->_conf->name.get_id();
182
183 KeyRing keyring;
11fdf7f2
TL
184 auto auth_client_required =
185 cct->_conf.get_val<std::string>("auth_client_required");
186 if (auth_client_required != "none") {
224ce89b 187 r = keyring.from_ceph_context(cct);
11fdf7f2
TL
188 auto keyfile = cct->_conf.get_val<std::string>("keyfile");
189 auto key = cct->_conf.get_val<std::string>("key");
190 if (r == -ENOENT && keyfile.empty() && key.empty())
224ce89b
WB
191 r = 0;
192 if (r < 0) {
193 cerr << "rbd: failed to get secret" << std::endl;
194 return r;
195 }
7c673cae
FG
196 }
197
198 CryptoKey secret;
199 string key_name = string("client.") + cct->_conf->name.get_id();
200 if (keyring.get_secret(cct->_conf->name, secret)) {
201 string secret_str;
202 secret.encode_base64(secret_str);
203
204 r = set_kernel_secret(secret_str.c_str(), key_name.c_str());
205 if (r >= 0) {
206 if (r == 0)
207 cerr << "rbd: warning: secret has length 0" << std::endl;
208 oss << ",key=" << key_name;
209 } else if (r == -ENODEV || r == -ENOSYS) {
210 // running against older kernel; fall back to secret= in options
211 oss << ",secret=" << secret_str;
212 } else {
213 cerr << "rbd: failed to add secret '" << key_name << "' to kernel"
214 << std::endl;
215 return r;
216 }
217 } else if (is_kernel_secret(key_name.c_str())) {
218 oss << ",key=" << key_name;
219 }
220
221 if (strcmp(options, "") != 0)
222 oss << "," << options;
11fdf7f2
TL
223 if (!spec.nspace_name.empty())
224 oss << ",_pool_ns=" << spec.nspace_name;
7c673cae 225
11fdf7f2
TL
226 oss << " " << spec.pool_name << " " << spec.image_name << " "
227 << spec.snap_name;
7c673cae
FG
228
229 *pbuf = oss.str();
230 return 0;
231}
232
eafe8130
TL
233/*
234 * Return:
235 * <kernel error, false> - didn't map
236 * <0 or udev error, true> - mapped
237 */
238template <typename F>
239static std::pair<int, bool> wait_for_mapping(int sysfs_r_fd, udev_monitor *mon,
240 F udev_device_handler)
7c673cae 241{
eafe8130
TL
242 struct pollfd fds[2];
243 int sysfs_r = INT_MAX, udev_r = INT_MAX;
81eedcae 244 int r;
7c673cae 245
eafe8130
TL
246 fds[0].fd = sysfs_r_fd;
247 fds[0].events = POLLIN;
248 fds[1].fd = udev_monitor_get_fd(mon);
249 fds[1].events = POLLIN;
250
7c673cae 251 for (;;) {
eafe8130
TL
252 if (poll(fds, 2, -1) < 0) {
253 ceph_abort_msgf("poll failed: %d", -errno);
254 }
7c673cae 255
eafe8130
TL
256 if (fds[0].revents) {
257 r = safe_read_exact(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
258 if (r < 0) {
259 ceph_abort_msgf("safe_read_exact failed: %d", r);
260 }
261 if (sysfs_r < 0) {
262 return std::make_pair(sysfs_r, false);
263 }
264 if (udev_r != INT_MAX) {
265 ceph_assert(!sysfs_r);
266 return std::make_pair(udev_r, true);
267 }
268 fds[0].fd = -1;
81eedcae 269 }
11fdf7f2 270
eafe8130
TL
271 if (fds[1].revents) {
272 for (;;) {
273 struct udev_device *dev;
274
275 dev = udev_monitor_receive_device(mon);
276 if (!dev) {
277 if (errno != EINTR && errno != EAGAIN) {
278 udev_r = -errno;
279 if (sysfs_r != INT_MAX) {
280 ceph_assert(!sysfs_r);
281 return std::make_pair(udev_r, true);
282 }
283 fds[1].fd = -1;
284 }
285 break;
286 }
287 if (udev_device_handler(dev)) {
288 udev_r = 0;
289 if (sysfs_r != INT_MAX) {
290 ceph_assert(!sysfs_r);
291 return std::make_pair(udev_r, true);
292 }
293 fds[1].fd = -1;
294 break;
295 }
296 }
297 }
298 }
299}
300
301class UdevMapHandler {
302public:
303 UdevMapHandler(const krbd_spec *spec, std::string *pdevnode) :
304 m_spec(spec), m_pdevnode(pdevnode) {}
7c673cae 305
eafe8130
TL
306 /*
307 * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
308 * block device to show up. This is necessary because rbd devices
309 * and block devices aren't linked together in our sysfs layout.
310 */
311 bool operator()(udev_device *dev) {
312 if (strcmp(udev_device_get_action(dev), "add")) {
7c673cae 313 goto next;
eafe8130
TL
314 }
315 if (!strcmp(udev_device_get_subsystem(dev), "rbd")) {
316 if (!m_bus_dev) {
317 auto spec = spec_from_dev(dev);
318 if (spec && *spec == *m_spec) {
319 m_bus_dev = dev;
81eedcae 320 goto check;
7c673cae
FG
321 }
322 }
eafe8130
TL
323 } else if (!strcmp(udev_device_get_subsystem(dev), "block")) {
324 m_block_devs.push_back(dev);
81eedcae
TL
325 goto check;
326 }
7c673cae 327
81eedcae
TL
328next:
329 udev_device_unref(dev);
eafe8130 330 return false;
81eedcae
TL
331
332check:
eafe8130
TL
333 if (m_bus_dev && !m_block_devs.empty()) {
334 const char *major = udev_device_get_sysattr_value(m_bus_dev, "major");
335 const char *minor = udev_device_get_sysattr_value(m_bus_dev, "minor");
81eedcae
TL
336 ceph_assert(!minor ^ have_minor_attr());
337
eafe8130 338 for (auto p : m_block_devs) {
81eedcae
TL
339 const char *this_major = udev_device_get_property_value(p, "MAJOR");
340 const char *this_minor = udev_device_get_property_value(p, "MINOR");
7c673cae
FG
341
342 if (strcmp(this_major, major) == 0 &&
343 (!minor || strcmp(this_minor, minor) == 0)) {
eafe8130 344 string name = get_kernel_rbd_name(udev_device_get_sysname(m_bus_dev));
7c673cae 345
81eedcae 346 ceph_assert(strcmp(udev_device_get_devnode(p), name.c_str()) == 0);
eafe8130
TL
347 *m_pdevnode = name;
348 return true;
7c673cae
FG
349 }
350 }
351 }
eafe8130 352 return false;
81eedcae 353 }
7c673cae 354
eafe8130
TL
355 ~UdevMapHandler() {
356 if (m_bus_dev) {
357 udev_device_unref(m_bus_dev);
358 }
359
360 for (auto p : m_block_devs) {
361 udev_device_unref(p);
362 }
7c673cae
FG
363 }
364
eafe8130
TL
365private:
366 udev_device *m_bus_dev = nullptr;
367 std::vector<udev_device *> m_block_devs;
368 const krbd_spec *m_spec;
369 std::string *m_pdevnode;
370};
7c673cae 371
11fdf7f2
TL
372static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf,
373 string *pname)
7c673cae
FG
374{
375 struct udev_monitor *mon;
eafe8130
TL
376 std::thread mapper;
377 bool mapped;
378 int fds[2];
7c673cae
FG
379 int r;
380
381 mon = udev_monitor_new_from_netlink(udev, "udev");
382 if (!mon)
383 return -ENOMEM;
384
11fdf7f2 385 r = udev_monitor_filter_add_match_subsystem_devtype(mon, "rbd", nullptr);
7c673cae
FG
386 if (r < 0)
387 goto out_mon;
388
389 r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk");
390 if (r < 0)
391 goto out_mon;
392
eafe8130
TL
393 r = udev_monitor_set_receive_buffer_size(mon, UDEV_BUF_SIZE);
394 if (r < 0) {
395 std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
396 << std::endl;
397 /* not fatal */
398 }
399
7c673cae
FG
400 r = udev_monitor_enable_receiving(mon);
401 if (r < 0)
402 goto out_mon;
403
eafe8130
TL
404 if (pipe2(fds, O_NONBLOCK) < 0) {
405 r = -errno;
7c673cae
FG
406 goto out_mon;
407 }
408
eafe8130
TL
409 mapper = make_named_thread("mapper", [&buf, sysfs_r_fd = fds[1]]() {
410 int sysfs_r = sysfs_write_rbd_add(buf);
411 int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
412 if (r < 0) {
413 ceph_abort_msgf("safe_write failed: %d", r);
414 }
415 });
416
417 std::tie(r, mapped) = wait_for_mapping(fds[0], mon,
418 UdevMapHandler(&spec, pname));
7c673cae 419 if (r < 0) {
eafe8130
TL
420 if (!mapped) {
421 std::cerr << "rbd: sysfs write failed" << std::endl;
422 } else {
423 std::cerr << "rbd: udev wait failed" << std::endl;
424 /* TODO: fall back to enumeration */
425 }
7c673cae
FG
426 }
427
eafe8130
TL
428 mapper.join();
429 close(fds[0]);
430 close(fds[1]);
431
7c673cae
FG
432out_mon:
433 udev_monitor_unref(mon);
434 return r;
435}
436
11fdf7f2
TL
437static int map_image(struct krbd_ctx *ctx, const krbd_spec& spec,
438 const char *options, string *pname)
7c673cae
FG
439{
440 string buf;
441 int r;
442
11fdf7f2 443 r = build_map_buf(ctx->cct, spec, options, &buf);
7c673cae
FG
444 if (r < 0)
445 return r;
446
447 /*
448 * Modprobe rbd kernel module. If it supports single-major device
449 * number allocation scheme, make sure it's turned on.
450 */
451 if (access("/sys/bus/rbd", F_OK) != 0) {
452 const char *module_options = NULL;
453 if (module_has_param("rbd", "single_major"))
454 module_options = "single_major=Y";
455
456 r = module_load("rbd", module_options);
457 if (r) {
458 cerr << "rbd: failed to load rbd kernel module (" << r << ")"
459 << std::endl;
460 /*
461 * Ignore the error: modprobe failing doesn't necessarily prevent
462 * from working.
463 */
464 }
465 }
466
11fdf7f2 467 return do_map(ctx->udev, spec, buf, pname);
7c673cae
FG
468}
469
470static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
471{
472 struct udev_enumerate *enm;
473 struct udev_list_entry *l;
474 struct udev_device *dev;
475 int r;
476
eafe8130 477retry:
7c673cae
FG
478 enm = udev_enumerate_new(udev);
479 if (!enm)
480 return -ENOMEM;
481
482 r = udev_enumerate_add_match_subsystem(enm, "rbd");
483 if (r < 0)
484 goto out_enm;
485
486 r = udev_enumerate_add_match_sysattr(enm, "major",
487 stringify(major(devno)).c_str());
488 if (r < 0)
489 goto out_enm;
490
491 if (have_minor_attr()) {
492 r = udev_enumerate_add_match_sysattr(enm, "minor",
493 stringify(minor(devno)).c_str());
494 if (r < 0)
495 goto out_enm;
496 }
497
498 r = udev_enumerate_scan_devices(enm);
eafe8130
TL
499 if (r < 0) {
500 if (r == -ENOENT || r == -ENODEV) {
501 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
502 udev_enumerate_unref(enm);
503 goto retry;
504 }
7c673cae 505 goto out_enm;
eafe8130 506 }
7c673cae
FG
507
508 l = udev_enumerate_get_list_entry(enm);
509 if (!l) {
510 r = -ENOENT;
511 goto out_enm;
512 }
513
514 /* make sure there is only one match */
11fdf7f2 515 ceph_assert(!udev_list_entry_get_next(l));
7c673cae
FG
516
517 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
518 if (!dev) {
519 r = -ENOMEM;
520 goto out_enm;
521 }
522
523 *pid = udev_device_get_sysname(dev);
524
525 udev_device_unref(dev);
526out_enm:
527 udev_enumerate_unref(enm);
528 return r;
529}
530
11fdf7f2
TL
531static int __enumerate_devices(struct udev *udev, const krbd_spec& spec,
532 bool match_nspace, struct udev_enumerate **penm)
7c673cae
FG
533{
534 struct udev_enumerate *enm;
7c673cae
FG
535 int r;
536
eafe8130 537retry:
7c673cae
FG
538 enm = udev_enumerate_new(udev);
539 if (!enm)
540 return -ENOMEM;
541
542 r = udev_enumerate_add_match_subsystem(enm, "rbd");
543 if (r < 0)
544 goto out_enm;
545
11fdf7f2 546 r = udev_enumerate_add_match_sysattr(enm, "pool", spec.pool_name.c_str());
7c673cae
FG
547 if (r < 0)
548 goto out_enm;
549
11fdf7f2
TL
550 if (match_nspace) {
551 r = udev_enumerate_add_match_sysattr(enm, "pool_ns",
552 spec.nspace_name.c_str());
553 } else {
554 /*
555 * Match _only_ devices that don't have pool_ns attribute.
556 * If the kernel supports namespaces, the result will be empty.
557 */
558 r = udev_enumerate_add_nomatch_sysattr(enm, "pool_ns", nullptr);
559 }
560 if (r < 0)
561 goto out_enm;
562
563 r = udev_enumerate_add_match_sysattr(enm, "name", spec.image_name.c_str());
7c673cae
FG
564 if (r < 0)
565 goto out_enm;
566
11fdf7f2
TL
567 r = udev_enumerate_add_match_sysattr(enm, "current_snap",
568 spec.snap_name.c_str());
7c673cae
FG
569 if (r < 0)
570 goto out_enm;
571
572 r = udev_enumerate_scan_devices(enm);
eafe8130
TL
573 if (r < 0) {
574 if (r == -ENOENT || r == -ENODEV) {
575 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
576 udev_enumerate_unref(enm);
577 goto retry;
578 }
7c673cae 579 goto out_enm;
eafe8130 580 }
7c673cae 581
11fdf7f2
TL
582 *penm = enm;
583 return 0;
584
585out_enm:
586 udev_enumerate_unref(enm);
587 return r;
588}
589
590static int enumerate_devices(struct udev *udev, const krbd_spec& spec,
591 struct udev_enumerate **penm)
592{
593 struct udev_enumerate *enm;
594 int r;
595
596 r = __enumerate_devices(udev, spec, true, &enm);
597 if (r < 0)
598 return r;
599
600 /*
601 * If no namespace is set, try again with match_nspace=false to
602 * handle older kernels. On a newer kernel the result will remain
603 * the same (i.e. empty).
604 */
605 if (!udev_enumerate_get_list_entry(enm) && spec.nspace_name.empty()) {
606 udev_enumerate_unref(enm);
607 r = __enumerate_devices(udev, spec, false, &enm);
608 if (r < 0)
609 return r;
610 }
611
612 *penm = enm;
613 return 0;
614}
615
616static int spec_to_devno_and_krbd_id(struct udev *udev, const krbd_spec& spec,
617 dev_t *pdevno, string *pid)
618{
619 struct udev_enumerate *enm;
620 struct udev_list_entry *l;
621 struct udev_device *dev;
622 unsigned int maj, min = 0;
623 string err;
624 int r;
625
626 r = enumerate_devices(udev, spec, &enm);
627 if (r < 0)
628 return r;
629
7c673cae
FG
630 l = udev_enumerate_get_list_entry(enm);
631 if (!l) {
632 r = -ENOENT;
633 goto out_enm;
634 }
635
636 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
637 if (!dev) {
638 r = -ENOMEM;
639 goto out_enm;
640 }
641
642 maj = strict_strtoll(udev_device_get_sysattr_value(dev, "major"), 10, &err);
643 if (!err.empty()) {
644 cerr << "rbd: couldn't parse major: " << err << std::endl;
645 r = -EINVAL;
646 goto out_dev;
647 }
648 if (have_minor_attr()) {
649 min = strict_strtoll(udev_device_get_sysattr_value(dev, "minor"), 10, &err);
650 if (!err.empty()) {
651 cerr << "rbd: couldn't parse minor: " << err << std::endl;
652 r = -EINVAL;
653 goto out_dev;
654 }
655 }
656
657 /*
658 * If an image is mapped more than once don't bother trying to unmap
659 * all devices - let users run unmap the same number of times they
660 * ran map.
661 */
662 if (udev_list_entry_get_next(l))
11fdf7f2 663 cerr << "rbd: " << spec << ": mapped more than once, unmapping "
7c673cae
FG
664 << get_kernel_rbd_name(udev_device_get_sysname(dev))
665 << " only" << std::endl;
666
667 *pdevno = makedev(maj, min);
668 *pid = udev_device_get_sysname(dev);
669
670out_dev:
671 udev_device_unref(dev);
672out_enm:
673 udev_enumerate_unref(enm);
674 return r;
675}
676
/*
 * Build the string written to the rbd 'remove' interface: the device
 * id, followed by a space and the options when options is non-empty.
 * A null options pointer is treated like an empty string.
 */
static std::string build_unmap_buf(const std::string& id, const char *options)
{
  std::string buf(id);
  const bool have_options = options != nullptr && strcmp(options, "") != 0;
  if (have_options) {
    buf += " ";
    buf += options;
  }
  return buf;
}
686
eafe8130
TL
687class UdevUnmapHandler {
688public:
689 UdevUnmapHandler(dev_t devno) : m_devno(devno) {}
7c673cae 690
eafe8130
TL
691 bool operator()(udev_device *dev) {
692 bool match = false;
11fdf7f2 693
eafe8130
TL
694 if (!strcmp(udev_device_get_action(dev), "remove") &&
695 udev_device_get_devnum(dev) == m_devno) {
696 match = true;
7c673cae 697 }
7c673cae 698 udev_device_unref(dev);
eafe8130 699 return match;
7c673cae
FG
700 }
701
eafe8130
TL
702private:
703 dev_t m_devno;
704};
7c673cae
FG
705
706static int do_unmap(struct udev *udev, dev_t devno, const string& buf)
707{
708 struct udev_monitor *mon;
eafe8130
TL
709 std::thread unmapper;
710 bool unmapped;
711 int fds[2];
7c673cae
FG
712 int r;
713
714 mon = udev_monitor_new_from_netlink(udev, "udev");
715 if (!mon)
716 return -ENOMEM;
717
718 r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk");
719 if (r < 0)
720 goto out_mon;
721
eafe8130
TL
722 r = udev_monitor_set_receive_buffer_size(mon, UDEV_BUF_SIZE);
723 if (r < 0) {
724 std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
725 << std::endl;
726 /* not fatal */
727 }
728
7c673cae
FG
729 r = udev_monitor_enable_receiving(mon);
730 if (r < 0)
731 goto out_mon;
732
eafe8130
TL
733 if (pipe2(fds, O_NONBLOCK) < 0) {
734 r = -errno;
735 goto out_mon;
736 }
737
738 unmapper = make_named_thread("unmapper", [&buf, sysfs_r_fd = fds[1]]() {
739 /*
740 * On final device close(), kernel sends a block change event, in
741 * response to which udev apparently runs blkid on the device. This
742 * makes unmap fail with EBUSY, if issued right after final close().
743 * Try to circumvent this with a retry before turning to udev.
744 */
745 for (int tries = 0; ; tries++) {
746 int sysfs_r = sysfs_write_rbd_remove(buf);
747 if (sysfs_r == -EBUSY && tries < 2) {
748 if (!tries) {
749 usleep(250 * 1000);
750 } else {
751 /*
752 * libudev does not provide the "wait until the queue is empty"
753 * API or the sufficient amount of primitives to build it from.
754 */
755 std::string err = run_cmd("udevadm", "settle", "--timeout", "10",
756 (char *)NULL);
757 if (!err.empty())
758 std::cerr << "rbd: " << err << std::endl;
759 }
7c673cae 760 } else {
eafe8130
TL
761 int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
762 if (r < 0) {
763 ceph_abort_msgf("safe_write failed: %d", r);
764 }
765 break;
7c673cae 766 }
7c673cae 767 }
eafe8130 768 });
7c673cae 769
eafe8130
TL
770 std::tie(r, unmapped) = wait_for_mapping(fds[0], mon,
771 UdevUnmapHandler(devno));
7c673cae 772 if (r < 0) {
eafe8130
TL
773 if (!unmapped) {
774 std::cerr << "rbd: sysfs write failed" << std::endl;
775 } else {
776 std::cerr << "rbd: udev wait failed: " << cpp_strerror(r) << std::endl;
777 r = 0;
778 }
7c673cae
FG
779 }
780
eafe8130
TL
781 unmapper.join();
782 close(fds[0]);
783 close(fds[1]);
784
7c673cae
FG
785out_mon:
786 udev_monitor_unref(mon);
787 return r;
788}
789
790static int unmap_image(struct krbd_ctx *ctx, const char *devnode,
791 const char *options)
792{
793 struct stat sb;
794 dev_t wholedevno = 0;
795 string id;
796 int r;
797
798 if (stat(devnode, &sb) < 0 || !S_ISBLK(sb.st_mode)) {
799 cerr << "rbd: '" << devnode << "' is not a block device" << std::endl;
800 return -EINVAL;
801 }
802
803 r = blkid_devno_to_wholedisk(sb.st_rdev, NULL, 0, &wholedevno);
804 if (r < 0) {
805 cerr << "rbd: couldn't compute wholedevno: " << cpp_strerror(r)
806 << std::endl;
807 /*
808 * Ignore the error: we are given whole disks most of the time, and
809 * if it turns out this is a partition we will fail later anyway.
810 */
811 wholedevno = sb.st_rdev;
812 }
813
eafe8130
TL
814 for (int tries = 0; ; tries++) {
815 r = devno_to_krbd_id(ctx->udev, wholedevno, &id);
816 if (r == -ENOENT && tries < 2) {
817 usleep(250 * 1000);
818 } else {
819 if (r < 0) {
820 if (r == -ENOENT) {
821 std::cerr << "rbd: '" << devnode << "' is not an rbd device"
822 << std::endl;
823 r = -EINVAL;
824 }
825 return r;
826 }
827 if (tries) {
828 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
829 << std::endl;
830 }
831 break;
7c673cae 832 }
7c673cae
FG
833 }
834
835 return do_unmap(ctx->udev, wholedevno, build_unmap_buf(id, options));
836}
837
11fdf7f2 838static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec,
7c673cae
FG
839 const char *options)
840{
841 dev_t devno = 0;
842 string id;
843 int r;
844
eafe8130
TL
845 for (int tries = 0; ; tries++) {
846 r = spec_to_devno_and_krbd_id(ctx->udev, spec, &devno, &id);
847 if (r == -ENOENT && tries < 2) {
848 usleep(250 * 1000);
849 } else {
850 if (r < 0) {
851 if (r == -ENOENT) {
852 std::cerr << "rbd: " << spec << ": not a mapped image or snapshot"
853 << std::endl;
854 r = -EINVAL;
855 }
856 return r;
857 }
858 if (tries) {
859 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
860 << std::endl;
861 }
862 break;
7c673cae 863 }
7c673cae
FG
864 }
865
866 return do_unmap(ctx->udev, devno, build_unmap_buf(id, options));
867}
868
869static bool dump_one_image(Formatter *f, TextTable *tbl,
870 struct udev_device *dev)
871{
872 const char *id = udev_device_get_sysname(dev);
11fdf7f2 873 auto spec = spec_from_dev(dev);
7c673cae
FG
874 string kname = get_kernel_rbd_name(id);
875
11fdf7f2 876 if (!spec)
7c673cae
FG
877 return false;
878
879 if (f) {
11fdf7f2
TL
880 f->open_object_section("device");
881 f->dump_string("id", id);
882 f->dump_string("pool", spec->pool_name);
883 f->dump_string("namespace", spec->nspace_name);
884 f->dump_string("name", spec->image_name);
885 f->dump_string("snap", spec->snap_name);
7c673cae
FG
886 f->dump_string("device", kname);
887 f->close_section();
888 } else {
11fdf7f2
TL
889 *tbl << id << spec->pool_name << spec->nspace_name << spec->image_name
890 << spec->snap_name << kname << TextTable::endrow;
7c673cae
FG
891 }
892
893 return true;
894}
895
896static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl)
897{
898 struct udev_enumerate *enm;
11fdf7f2 899 struct udev_list_entry *l = NULL;
7c673cae
FG
900 bool have_output = false;
901 int r;
902
eafe8130 903retry:
7c673cae
FG
904 enm = udev_enumerate_new(udev);
905 if (!enm)
906 return -ENOMEM;
907
908 r = udev_enumerate_add_match_subsystem(enm, "rbd");
909 if (r < 0)
910 goto out_enm;
911
912 r = udev_enumerate_scan_devices(enm);
eafe8130
TL
913 if (r < 0) {
914 if (r == -ENOENT || r == -ENODEV) {
915 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
916 udev_enumerate_unref(enm);
917 goto retry;
918 }
7c673cae 919 goto out_enm;
eafe8130 920 }
7c673cae
FG
921
922 udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm)) {
923 struct udev_device *dev;
924
925 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
926 if (dev) {
927 have_output |= dump_one_image(f, tbl, dev);
928 udev_device_unref(dev);
929 }
930 }
931
932 r = have_output;
933out_enm:
934 udev_enumerate_unref(enm);
935 return r;
936}
937
938int dump_images(struct krbd_ctx *ctx, Formatter *f)
939{
940 TextTable tbl;
941 int r;
942
943 if (f) {
11fdf7f2 944 f->open_array_section("devices");
7c673cae
FG
945 } else {
946 tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
947 tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
11fdf7f2 948 tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
7c673cae
FG
949 tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
950 tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
951 tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
952 }
953
954 r = do_dump(ctx->udev, f, &tbl);
955
956 if (f) {
957 f->close_section();
958 f->flush(cout);
959 } else {
960 if (r > 0)
961 cout << tbl;
962 }
963
964 return r;
965}
966
11fdf7f2
TL
967static int is_mapped_image(struct udev *udev, const krbd_spec& spec,
968 string *pname)
969{
970 struct udev_enumerate *enm;
971 struct udev_list_entry *l;
972 int r;
973
974 r = enumerate_devices(udev, spec, &enm);
975 if (r < 0)
976 return r;
977
978 l = udev_enumerate_get_list_entry(enm);
979 if (l) {
980 struct udev_device *dev;
981
982 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
983 if (!dev) {
984 r = -ENOMEM;
985 goto out_enm;
986 }
987
988 r = 1;
989 *pname = get_kernel_rbd_name(udev_device_get_sysname(dev));
990 udev_device_unref(dev);
991 } else {
992 r = 0; /* not mapped */
993 }
994
995out_enm:
996 udev_enumerate_unref(enm);
997 return r;
998}
999
7c673cae
FG
1000extern "C" int krbd_create_from_context(rados_config_t cct,
1001 struct krbd_ctx **pctx)
1002{
1003 struct krbd_ctx *ctx = new struct krbd_ctx();
1004
1005 ctx->cct = reinterpret_cast<CephContext *>(cct);
1006 ctx->udev = udev_new();
1007 if (!ctx->udev) {
1008 delete ctx;
1009 return -ENOMEM;
1010 }
1011
1012 *pctx = ctx;
1013 return 0;
1014}
1015
1016extern "C" void krbd_destroy(struct krbd_ctx *ctx)
1017{
1018 if (!ctx)
1019 return;
1020
1021 udev_unref(ctx->udev);
1022
1023 delete ctx;
1024}
1025
11fdf7f2
TL
1026extern "C" int krbd_map(struct krbd_ctx *ctx,
1027 const char *pool_name,
1028 const char *nspace_name,
1029 const char *image_name,
1030 const char *snap_name,
1031 const char *options,
1032 char **pdevnode)
7c673cae 1033{
11fdf7f2 1034 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
7c673cae
FG
1035 string name;
1036 char *devnode;
1037 int r;
1038
11fdf7f2 1039 r = map_image(ctx, spec, options, &name);
7c673cae
FG
1040 if (r < 0)
1041 return r;
1042
1043 devnode = strdup(name.c_str());
1044 if (!devnode)
1045 return -ENOMEM;
1046
1047 *pdevnode = devnode;
1048 return r;
1049}
1050
1051extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
1052 const char *options)
1053{
1054 return unmap_image(ctx, devnode, options);
1055}
1056
11fdf7f2
TL
1057extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx,
1058 const char *pool_name,
1059 const char *nspace_name,
1060 const char *image_name,
1061 const char *snap_name,
7c673cae
FG
1062 const char *options)
1063{
11fdf7f2
TL
1064 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1065 return unmap_image(ctx, spec, options);
7c673cae
FG
1066}
1067
1068int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f)
1069{
1070 return dump_images(ctx, f);
1071}
11fdf7f2
TL
1072
1073extern "C" int krbd_is_mapped(struct krbd_ctx *ctx,
1074 const char *pool_name,
1075 const char *nspace_name,
1076 const char *image_name,
1077 const char *snap_name,
1078 char **pdevnode)
1079{
1080 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1081 string name;
1082 char *devnode;
1083 int r;
1084
1085 r = is_mapped_image(ctx->udev, spec, &name);
1086 if (r <= 0) /* error or not mapped */
1087 return r;
1088
1089 devnode = strdup(name.c_str());
1090 if (!devnode)
1091 return -ENOMEM;
1092
1093 *pdevnode = devnode;
1094 return r;
1095}