]> git.proxmox.com Git - ceph.git/blame - ceph/src/krbd.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / krbd.cc
CommitLineData
7c673cae
FG
1/*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2014 Inktank Storage, Inc.
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13#include <errno.h>
14#include <fcntl.h>
15#include <iostream>
9f95a23c 16#include <memory>
11fdf7f2 17#include <optional>
7c673cae 18#include <poll.h>
522d829b 19#include <regex>
7c673cae
FG
20#include <sstream>
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
24#include <string>
25#include <sys/stat.h>
11fdf7f2 26#include <sys/sysmacros.h>
7c673cae 27#include <sys/types.h>
eafe8130 28#include <tuple>
7c673cae 29#include <unistd.h>
eafe8130 30#include <utility>
7c673cae
FG
31
32#include "auth/KeyRing.h"
33#include "common/errno.h"
34#include "common/Formatter.h"
35#include "common/module.h"
36#include "common/run_cmd.h"
37#include "common/safe_io.h"
38#include "common/secret.h"
39#include "common/TextTable.h"
eafe8130 40#include "common/Thread.h"
11fdf7f2 41#include "include/ceph_assert.h"
7c673cae
FG
42#include "include/stringify.h"
43#include "include/krbd.h"
44#include "mon/MonMap.h"
45
46#include <blkid/blkid.h>
9f95a23c 47#include <boost/algorithm/string/predicate.hpp>
adb31ebb 48#include <boost/tokenizer.hpp>
7c673cae
FG
49#include <libudev.h>
50
/* Requested udev monitor receive buffer size; doubled to 2M (SO_RCVBUFFORCE). */
static constexpr int UDEV_BUF_SIZE = 1024 * 1024;
/* Device node paths are this prefix followed by the device's sysname. */
static constexpr char DEVNODE_PREFIX[] = "/dev/rbd";
/* Snapshot name stored in a krbd_spec when no snapshot name was given. */
static constexpr char SNAP_HEAD_NAME[] = "-";
54
55#define DEFINE_UDEV_UPTR(what) \
56struct udev_##what##_deleter { \
57 void operator()(udev_##what *p) { \
58 udev_##what##_unref(p); \
59 } \
60}; \
61using udev_##what##_uptr = \
62 std::unique_ptr<udev_##what, udev_##what##_deleter>;
63
64DEFINE_UDEV_UPTR(monitor) /* udev_monitor_uptr */
65DEFINE_UDEV_UPTR(enumerate) /* udev_enumerate_uptr */
66DEFINE_UDEV_UPTR(device) /* udev_device_uptr */
7c673cae 67
20effc67
TL
68using std::string;
69
7c673cae
FG
70struct krbd_ctx {
71 CephContext *cct;
72 struct udev *udev;
f91f0fd5 73 uint32_t flags; /* KRBD_CTX_F_* */
7c673cae
FG
74};
75
11fdf7f2
TL
76struct krbd_spec {
77 std::string pool_name;
78 std::string nspace_name;
79 std::string image_name;
80 std::string snap_name;
81
82 krbd_spec(const char *pool_name, const char *nspace_name,
83 const char *image_name, const char *snap_name)
84 : pool_name(pool_name),
85 nspace_name(nspace_name),
86 image_name(image_name),
87 snap_name(*snap_name ? snap_name : SNAP_HEAD_NAME) { }
88
89 bool operator==(const krbd_spec& rhs) const {
90 return pool_name == rhs.pool_name &&
91 nspace_name == rhs.nspace_name &&
92 image_name == rhs.image_name &&
93 snap_name == rhs.snap_name;
94 }
95};
96
9f95a23c
TL
97static std::ostream& operator<<(std::ostream& os, const krbd_spec& spec)
98{
11fdf7f2
TL
99 os << spec.pool_name << "/";
100 if (!spec.nspace_name.empty())
101 os << spec.nspace_name << "/";
102 os << spec.image_name;
103 if (spec.snap_name != SNAP_HEAD_NAME)
104 os << "@" << spec.snap_name;
105 return os;
106}
107
9f95a23c
TL
108static std::optional<krbd_spec> spec_from_dev(udev_device *dev)
109{
11fdf7f2
TL
110 const char *pool_name = udev_device_get_sysattr_value(dev, "pool");
111 const char *nspace_name = udev_device_get_sysattr_value(dev, "pool_ns");
112 const char *image_name = udev_device_get_sysattr_value(dev, "name");
113 const char *snap_name = udev_device_get_sysattr_value(dev, "current_snap");
114
115 if (!pool_name || !image_name || !snap_name)
116 return std::nullopt;
117
118 return std::make_optional<krbd_spec>(
119 pool_name, nspace_name ?: "", image_name, snap_name);
120}
121
9f95a23c 122static udev_device_uptr dev_from_list_entry(udev *udev, udev_list_entry *l)
7c673cae 123{
9f95a23c
TL
124 return udev_device_uptr(
125 udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)));
126}
127
128static std::string get_devnode(udev_device *dev)
129{
130 std::string devnode = DEVNODE_PREFIX;
131 devnode += udev_device_get_sysname(dev);
132 return devnode;
7c673cae
FG
133}
134
135static int sysfs_write_rbd(const char *which, const string& buf)
136{
137 const string s = string("/sys/bus/rbd/") + which;
138 const string t = s + "_single_major";
139 int fd;
140 int r;
141
142 /*
143 * 'add' and 'add_single_major' interfaces are identical, but if rbd
144 * kernel module is new enough and is configured to use single-major
145 * scheme, 'add' is disabled in order to prevent old userspace from
146 * doing weird things at unmap time.
147 *
148 * Same goes for 'remove' vs 'remove_single_major'.
149 */
150 fd = open(t.c_str(), O_WRONLY);
151 if (fd < 0) {
152 if (errno == ENOENT) {
153 fd = open(s.c_str(), O_WRONLY);
154 if (fd < 0)
155 return -errno;
156 } else {
157 return -errno;
158 }
159 }
160
161 r = safe_write(fd, buf.c_str(), buf.size());
162
163 close(fd);
164 return r;
165}
166
167static int sysfs_write_rbd_add(const string& buf)
168{
169 return sysfs_write_rbd("add", buf);
170}
171
172static int sysfs_write_rbd_remove(const string& buf)
173{
174 return sysfs_write_rbd("remove", buf);
175}
176
/*
 * Return 1 if rbd devices carry a 'minor' sysfs attribute, 0 otherwise.
 *
 * 'minor' attribute was added as part of single_major merge, which
 * exposed the 'single_major' parameter, so the presence of that
 * parameter is used as the probe.  'minor' is always present,
 * regardless of whether single-major scheme is turned on or not.
 *
 * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
 * this has to work with rbd.ko backported to various kernels.)
 */
static int have_minor_attr(void)
{
  static const char single_major_param[] =
      "/sys/module/rbd/parameters/single_major";

  if (access(single_major_param, F_OK) != 0) {
    return 0;
  }
  return 1;
}
189
11fdf7f2 190static int build_map_buf(CephContext *cct, const krbd_spec& spec,
adb31ebb 191 const string& options, string *pbuf)
7c673cae 192{
adb31ebb 193 bool msgr2 = false;
20effc67 194 std::ostringstream oss;
7c673cae
FG
195 int r;
196
adb31ebb
TL
197 boost::char_separator<char> sep(",");
198 boost::tokenizer<boost::char_separator<char>> tok(options, sep);
199 for (const auto& t : tok) {
200 if (boost::starts_with(t, "ms_mode=")) {
201 /* msgr2 unless ms_mode=legacy */
202 msgr2 = t.compare(8, t.npos, "legacy");
203 }
204 }
205
7c673cae 206 MonMap monmap;
20effc67 207 r = monmap.build_initial(cct, false, std::cerr);
7c673cae
FG
208 if (r < 0)
209 return r;
210
adb31ebb
TL
211 /*
212 * If msgr2, filter TYPE_MSGR2 addresses. Otherwise, filter
213 * TYPE_LEGACY addresses.
214 */
215 for (const auto& p : monmap.mon_info) {
216 for (const auto& a : p.second.public_addrs.v) {
217 if ((msgr2 && a.is_msgr2()) || (!msgr2 && a.is_legacy())) {
218 if (oss.tellp() > 0) {
219 oss << ",";
220 }
221 oss << a.get_sockaddr();
222 }
7c673cae 223 }
7c673cae
FG
224 }
225
20effc67
TL
226 if (oss.tellp() == 0) {
227 std::cerr << "rbd: failed to get mon address (possible ms_mode mismatch)" << std::endl;
228 return -ENOENT;
229 }
230
7c673cae
FG
231 oss << " name=" << cct->_conf->name.get_id();
232
233 KeyRing keyring;
11fdf7f2
TL
234 auto auth_client_required =
235 cct->_conf.get_val<std::string>("auth_client_required");
236 if (auth_client_required != "none") {
224ce89b 237 r = keyring.from_ceph_context(cct);
11fdf7f2
TL
238 auto keyfile = cct->_conf.get_val<std::string>("keyfile");
239 auto key = cct->_conf.get_val<std::string>("key");
240 if (r == -ENOENT && keyfile.empty() && key.empty())
224ce89b
WB
241 r = 0;
242 if (r < 0) {
20effc67 243 std::cerr << "rbd: failed to get secret" << std::endl;
224ce89b
WB
244 return r;
245 }
7c673cae
FG
246 }
247
248 CryptoKey secret;
249 string key_name = string("client.") + cct->_conf->name.get_id();
250 if (keyring.get_secret(cct->_conf->name, secret)) {
251 string secret_str;
252 secret.encode_base64(secret_str);
253
254 r = set_kernel_secret(secret_str.c_str(), key_name.c_str());
255 if (r >= 0) {
256 if (r == 0)
20effc67 257 std::cerr << "rbd: warning: secret has length 0" << std::endl;
7c673cae
FG
258 oss << ",key=" << key_name;
259 } else if (r == -ENODEV || r == -ENOSYS) {
260 // running against older kernel; fall back to secret= in options
261 oss << ",secret=" << secret_str;
262 } else {
20effc67 263 std::cerr << "rbd: failed to add secret '" << key_name << "' to kernel"
7c673cae
FG
264 << std::endl;
265 return r;
266 }
267 } else if (is_kernel_secret(key_name.c_str())) {
268 oss << ",key=" << key_name;
269 }
270
adb31ebb 271 if (!options.empty())
7c673cae 272 oss << "," << options;
11fdf7f2
TL
273 if (!spec.nspace_name.empty())
274 oss << ",_pool_ns=" << spec.nspace_name;
7c673cae 275
11fdf7f2
TL
276 oss << " " << spec.pool_name << " " << spec.image_name << " "
277 << spec.snap_name;
7c673cae
FG
278
279 *pbuf = oss.str();
280 return 0;
281}
282
eafe8130
TL
283/*
284 * Return:
285 * <kernel error, false> - didn't map
286 * <0 or udev error, true> - mapped
287 */
288template <typename F>
289static std::pair<int, bool> wait_for_mapping(int sysfs_r_fd, udev_monitor *mon,
290 F udev_device_handler)
7c673cae 291{
eafe8130
TL
292 struct pollfd fds[2];
293 int sysfs_r = INT_MAX, udev_r = INT_MAX;
81eedcae 294 int r;
7c673cae 295
eafe8130
TL
296 fds[0].fd = sysfs_r_fd;
297 fds[0].events = POLLIN;
298 fds[1].fd = udev_monitor_get_fd(mon);
299 fds[1].events = POLLIN;
300
7c673cae 301 for (;;) {
eafe8130
TL
302 if (poll(fds, 2, -1) < 0) {
303 ceph_abort_msgf("poll failed: %d", -errno);
304 }
7c673cae 305
eafe8130
TL
306 if (fds[0].revents) {
307 r = safe_read_exact(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
308 if (r < 0) {
309 ceph_abort_msgf("safe_read_exact failed: %d", r);
310 }
311 if (sysfs_r < 0) {
312 return std::make_pair(sysfs_r, false);
313 }
314 if (udev_r != INT_MAX) {
315 ceph_assert(!sysfs_r);
316 return std::make_pair(udev_r, true);
317 }
318 fds[0].fd = -1;
81eedcae 319 }
11fdf7f2 320
eafe8130
TL
321 if (fds[1].revents) {
322 for (;;) {
9f95a23c 323 udev_device_uptr dev(udev_monitor_receive_device(mon));
eafe8130
TL
324 if (!dev) {
325 if (errno != EINTR && errno != EAGAIN) {
326 udev_r = -errno;
327 if (sysfs_r != INT_MAX) {
328 ceph_assert(!sysfs_r);
329 return std::make_pair(udev_r, true);
330 }
331 fds[1].fd = -1;
332 }
333 break;
334 }
9f95a23c 335 if (udev_device_handler(std::move(dev))) {
eafe8130
TL
336 udev_r = 0;
337 if (sysfs_r != INT_MAX) {
338 ceph_assert(!sysfs_r);
339 return std::make_pair(udev_r, true);
340 }
341 fds[1].fd = -1;
342 break;
343 }
344 }
345 }
346 }
347}
348
349class UdevMapHandler {
350public:
f67539c2
TL
351 UdevMapHandler(const krbd_spec *spec, std::string *pdevnode,
352 std::string *majnum, std::string *minnum) :
353 m_spec(spec), m_pdevnode(pdevnode), m_majnum(majnum), m_minnum(minnum) {}
7c673cae 354
eafe8130
TL
355 /*
356 * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
357 * block device to show up. This is necessary because rbd devices
358 * and block devices aren't linked together in our sysfs layout.
9f95a23c
TL
359 *
360 * Note that our "block" event can come before the "rbd" event, so
361 * all potential "block" events are gathered in m_block_devs before
362 * m_bus_dev is caught.
eafe8130 363 */
9f95a23c
TL
364 bool operator()(udev_device_uptr dev) {
365 if (strcmp(udev_device_get_action(dev.get()), "add")) {
366 return false;
eafe8130 367 }
9f95a23c 368 if (!strcmp(udev_device_get_subsystem(dev.get()), "rbd")) {
eafe8130 369 if (!m_bus_dev) {
9f95a23c 370 auto spec = spec_from_dev(dev.get());
eafe8130 371 if (spec && *spec == *m_spec) {
9f95a23c
TL
372 m_bus_dev = std::move(dev);
373 m_devnode = get_devnode(m_bus_dev.get());
7c673cae
FG
374 }
375 }
9f95a23c
TL
376 } else if (!strcmp(udev_device_get_subsystem(dev.get()), "block")) {
377 if (boost::starts_with(udev_device_get_devnode(dev.get()),
378 DEVNODE_PREFIX)) {
379 m_block_devs.push_back(std::move(dev));
380 }
81eedcae 381 }
7c673cae 382
eafe8130 383 if (m_bus_dev && !m_block_devs.empty()) {
9f95a23c
TL
384 for (const auto& p : m_block_devs) {
385 if (udev_device_get_devnode(p.get()) == m_devnode) {
9f95a23c 386 *m_pdevnode = std::move(m_devnode);
f67539c2
TL
387 *m_majnum = udev_device_get_property_value(p.get(), "MAJOR");
388 *m_minnum = udev_device_get_property_value(p.get(), "MINOR");
389 ceph_assert(*m_majnum == udev_device_get_sysattr_value(
390 m_bus_dev.get(), "major"));
391 ceph_assert(!have_minor_attr() ||
392 *m_minnum == udev_device_get_sysattr_value(
393 m_bus_dev.get(), "minor"));
eafe8130 394 return true;
7c673cae
FG
395 }
396 }
9f95a23c 397 m_block_devs.clear();
7c673cae 398 }
eafe8130 399 return false;
81eedcae 400 }
7c673cae 401
eafe8130 402private:
9f95a23c
TL
403 udev_device_uptr m_bus_dev;
404 std::vector<udev_device_uptr> m_block_devs;
405 std::string m_devnode;
eafe8130
TL
406 const krbd_spec *m_spec;
407 std::string *m_pdevnode;
f67539c2
TL
408 std::string *m_majnum;
409 std::string *m_minnum;
eafe8130 410};
7c673cae 411
f91f0fd5
TL
412static const char *get_event_source(const krbd_ctx *ctx)
413{
414 if (ctx->flags & KRBD_CTX_F_NOUDEV) {
415 /*
416 * For block devices (unlike network interfaces, they don't
417 * carry any namespace tags), the kernel broadcasts uevents
418 * into all network namespaces that are owned by the initial
419 * user namespace. This restriction is new in 4.18: starting
420 * with 2.6.35 and through 4.17 the kernel broadcast uevents
421 * into all network namespaces, period.
422 *
423 * However, when invoked from a non-initial user namespace,
424 * udev_monitor_receive_device() has always ignored both kernel
425 * and udev uevents by virtue of requiring SCM_CREDENTIALS and
426 * checking that ucred->uid == 0. When UIDs and GIDs are sent to
427 * a process in a user namespace, they are translated according
428 * to that process's UID and GID mappings and, unless root in the
429 * user namespace is mapped to the global root, that check fails.
430 * Normally they show up as 65534(nobody) because the global root
431 * is not mapped.
432 */
433 return "kernel";
434 }
435
436 /*
437 * Like most netlink messages, udev uevents don't cross network
438 * namespace boundaries and are therefore confined to the initial
439 * network namespace.
440 */
441 return "udev";
442}
443
444static int do_map(krbd_ctx *ctx, const krbd_spec& spec, const string& buf,
11fdf7f2 445 string *pname)
7c673cae 446{
f67539c2
TL
447 std::string majnum, minnum;
448 struct stat sb;
eafe8130
TL
449 bool mapped;
450 int fds[2];
7c673cae
FG
451 int r;
452
f91f0fd5
TL
453 udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
454 get_event_source(ctx)));
7c673cae
FG
455 if (!mon)
456 return -ENOMEM;
457
9f95a23c
TL
458 r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "rbd",
459 nullptr);
7c673cae 460 if (r < 0)
9f95a23c 461 return r;
7c673cae 462
9f95a23c
TL
463 r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "block",
464 "disk");
7c673cae 465 if (r < 0)
9f95a23c 466 return r;
7c673cae 467
9f95a23c 468 r = udev_monitor_set_receive_buffer_size(mon.get(), UDEV_BUF_SIZE);
eafe8130
TL
469 if (r < 0) {
470 std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
471 << std::endl;
472 /* not fatal */
473 }
474
9f95a23c 475 r = udev_monitor_enable_receiving(mon.get());
7c673cae 476 if (r < 0)
9f95a23c 477 return r;
7c673cae 478
9f95a23c
TL
479 if (pipe2(fds, O_NONBLOCK) < 0)
480 return -errno;
7c673cae 481
9f95a23c 482 auto mapper = make_named_thread("mapper", [&buf, sysfs_r_fd = fds[1]]() {
eafe8130
TL
483 int sysfs_r = sysfs_write_rbd_add(buf);
484 int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
485 if (r < 0) {
486 ceph_abort_msgf("safe_write failed: %d", r);
487 }
488 });
489
9f95a23c 490 std::tie(r, mapped) = wait_for_mapping(fds[0], mon.get(),
f67539c2
TL
491 UdevMapHandler(&spec, pname, &majnum,
492 &minnum));
7c673cae 493 if (r < 0) {
eafe8130
TL
494 if (!mapped) {
495 std::cerr << "rbd: sysfs write failed" << std::endl;
496 } else {
497 std::cerr << "rbd: udev wait failed" << std::endl;
498 /* TODO: fall back to enumeration */
499 }
7c673cae
FG
500 }
501
eafe8130
TL
502 mapper.join();
503 close(fds[0]);
504 close(fds[1]);
f67539c2
TL
505
506 if (r < 0)
507 return r;
508
509 /*
510 * Make sure our device node is there. This is intended to help
511 * diagnose environments where "rbd map" is run from a container with
512 * a private /dev and some external mechanism (e.g. udev) is used to
513 * add the device to the container asynchronously, possibly seconds
514 * after "rbd map" successfully exits. These setups are very fragile
515 * and in some cases can even lead to data loss, depending on higher
516 * level logic and orchestration layers involved.
517 */
518 ceph_assert(mapped);
519 if (stat(pname->c_str(), &sb) < 0 || !S_ISBLK(sb.st_mode)) {
520 std::cerr << "rbd: mapping succeeded but " << *pname
521 << " is not accessible, is host /dev mounted?" << std::endl;
522 return -EINVAL;
523 }
524 if (stringify(major(sb.st_rdev)) != majnum ||
525 stringify(minor(sb.st_rdev)) != minnum) {
526 std::cerr << "rbd: mapping succeeded but " << *pname
527 << " (" << major(sb.st_rdev) << ":" << minor(sb.st_rdev)
528 << ") does not match expected " << majnum << ":" << minnum
529 << std::endl;
530 return -EINVAL;
531 }
532
533 return 0;
7c673cae
FG
534}
535
11fdf7f2
TL
536static int map_image(struct krbd_ctx *ctx, const krbd_spec& spec,
537 const char *options, string *pname)
7c673cae
FG
538{
539 string buf;
540 int r;
541
7c673cae
FG
542 /*
543 * Modprobe rbd kernel module. If it supports single-major device
544 * number allocation scheme, make sure it's turned on.
9f95a23c
TL
545 *
546 * Do this before calling build_map_buf() - it wants "ceph" key type
547 * registered.
7c673cae
FG
548 */
549 if (access("/sys/bus/rbd", F_OK) != 0) {
550 const char *module_options = NULL;
551 if (module_has_param("rbd", "single_major"))
552 module_options = "single_major=Y";
553
554 r = module_load("rbd", module_options);
555 if (r) {
20effc67 556 std::cerr << "rbd: failed to load rbd kernel module (" << r << ")"
7c673cae
FG
557 << std::endl;
558 /*
559 * Ignore the error: modprobe failing doesn't necessarily prevent
560 * from working.
561 */
562 }
563 }
564
9f95a23c
TL
565 r = build_map_buf(ctx->cct, spec, options, &buf);
566 if (r < 0)
567 return r;
568
f91f0fd5 569 return do_map(ctx, spec, buf, pname);
7c673cae
FG
570}
571
572static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
573{
9f95a23c 574 udev_enumerate_uptr enm;
7c673cae 575 struct udev_list_entry *l;
7c673cae
FG
576 int r;
577
eafe8130 578retry:
9f95a23c 579 enm.reset(udev_enumerate_new(udev));
7c673cae
FG
580 if (!enm)
581 return -ENOMEM;
582
9f95a23c 583 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
7c673cae 584 if (r < 0)
9f95a23c 585 return r;
7c673cae 586
9f95a23c 587 r = udev_enumerate_add_match_sysattr(enm.get(), "major",
7c673cae
FG
588 stringify(major(devno)).c_str());
589 if (r < 0)
9f95a23c 590 return r;
7c673cae
FG
591
592 if (have_minor_attr()) {
9f95a23c 593 r = udev_enumerate_add_match_sysattr(enm.get(), "minor",
7c673cae
FG
594 stringify(minor(devno)).c_str());
595 if (r < 0)
9f95a23c 596 return r;
7c673cae
FG
597 }
598
9f95a23c 599 r = udev_enumerate_scan_devices(enm.get());
eafe8130
TL
600 if (r < 0) {
601 if (r == -ENOENT || r == -ENODEV) {
602 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
eafe8130
TL
603 goto retry;
604 }
9f95a23c 605 return r;
eafe8130 606 }
7c673cae 607
9f95a23c
TL
608 l = udev_enumerate_get_list_entry(enm.get());
609 if (!l)
610 return -ENOENT;
7c673cae
FG
611
612 /* make sure there is only one match */
11fdf7f2 613 ceph_assert(!udev_list_entry_get_next(l));
7c673cae 614
9f95a23c
TL
615 auto dev = dev_from_list_entry(udev, l);
616 if (!dev)
617 return -ENOMEM;
7c673cae 618
9f95a23c
TL
619 *pid = udev_device_get_sysname(dev.get());
620 return 0;
7c673cae
FG
621}
622
522d829b
TL
// Wrap any of * ? [ between square brackets so that the result matches
// s literally when used as a glob pattern, e.g. "a*b" -> "a[*]b".
static std::string escape_glob(const std::string& s)
{
  // The pattern never changes, so compile it once rather than per call.
  static const std::regex glob_meta("([*?[])");
  return std::regex_replace(s, glob_meta, "[$1]");
}
629
11fdf7f2 630static int __enumerate_devices(struct udev *udev, const krbd_spec& spec,
9f95a23c 631 bool match_nspace, udev_enumerate_uptr *penm)
7c673cae 632{
9f95a23c 633 udev_enumerate_uptr enm;
7c673cae
FG
634 int r;
635
eafe8130 636retry:
9f95a23c 637 enm.reset(udev_enumerate_new(udev));
7c673cae
FG
638 if (!enm)
639 return -ENOMEM;
640
9f95a23c 641 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
7c673cae 642 if (r < 0)
9f95a23c 643 return r;
7c673cae 644
9f95a23c 645 r = udev_enumerate_add_match_sysattr(enm.get(), "pool",
522d829b 646 escape_glob(spec.pool_name).c_str());
7c673cae 647 if (r < 0)
9f95a23c 648 return r;
7c673cae 649
11fdf7f2 650 if (match_nspace) {
9f95a23c 651 r = udev_enumerate_add_match_sysattr(enm.get(), "pool_ns",
522d829b 652 escape_glob(spec.nspace_name).c_str());
11fdf7f2
TL
653 } else {
654 /*
655 * Match _only_ devices that don't have pool_ns attribute.
656 * If the kernel supports namespaces, the result will be empty.
657 */
9f95a23c 658 r = udev_enumerate_add_nomatch_sysattr(enm.get(), "pool_ns", nullptr);
11fdf7f2
TL
659 }
660 if (r < 0)
9f95a23c 661 return r;
11fdf7f2 662
9f95a23c 663 r = udev_enumerate_add_match_sysattr(enm.get(), "name",
522d829b 664 escape_glob(spec.image_name).c_str());
7c673cae 665 if (r < 0)
9f95a23c 666 return r;
7c673cae 667
9f95a23c 668 r = udev_enumerate_add_match_sysattr(enm.get(), "current_snap",
522d829b 669 escape_glob(spec.snap_name).c_str());
7c673cae 670 if (r < 0)
9f95a23c 671 return r;
7c673cae 672
9f95a23c 673 r = udev_enumerate_scan_devices(enm.get());
eafe8130
TL
674 if (r < 0) {
675 if (r == -ENOENT || r == -ENODEV) {
676 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
eafe8130
TL
677 goto retry;
678 }
9f95a23c 679 return r;
eafe8130 680 }
7c673cae 681
9f95a23c 682 *penm = std::move(enm);
11fdf7f2 683 return 0;
11fdf7f2
TL
684}
685
686static int enumerate_devices(struct udev *udev, const krbd_spec& spec,
9f95a23c 687 udev_enumerate_uptr *penm)
11fdf7f2 688{
9f95a23c 689 udev_enumerate_uptr enm;
11fdf7f2
TL
690 int r;
691
692 r = __enumerate_devices(udev, spec, true, &enm);
693 if (r < 0)
694 return r;
695
696 /*
697 * If no namespace is set, try again with match_nspace=false to
698 * handle older kernels. On a newer kernel the result will remain
699 * the same (i.e. empty).
700 */
9f95a23c 701 if (!udev_enumerate_get_list_entry(enm.get()) && spec.nspace_name.empty()) {
11fdf7f2
TL
702 r = __enumerate_devices(udev, spec, false, &enm);
703 if (r < 0)
704 return r;
705 }
706
9f95a23c 707 *penm = std::move(enm);
11fdf7f2
TL
708 return 0;
709}
710
711static int spec_to_devno_and_krbd_id(struct udev *udev, const krbd_spec& spec,
712 dev_t *pdevno, string *pid)
713{
9f95a23c 714 udev_enumerate_uptr enm;
11fdf7f2 715 struct udev_list_entry *l;
11fdf7f2
TL
716 unsigned int maj, min = 0;
717 string err;
718 int r;
719
720 r = enumerate_devices(udev, spec, &enm);
721 if (r < 0)
722 return r;
723
9f95a23c
TL
724 l = udev_enumerate_get_list_entry(enm.get());
725 if (!l)
726 return -ENOENT;
7c673cae 727
9f95a23c
TL
728 auto dev = dev_from_list_entry(udev, l);
729 if (!dev)
730 return -ENOMEM;
7c673cae 731
9f95a23c
TL
732 maj = strict_strtoll(udev_device_get_sysattr_value(dev.get(), "major"), 10,
733 &err);
7c673cae 734 if (!err.empty()) {
20effc67 735 std::cerr << "rbd: couldn't parse major: " << err << std::endl;
9f95a23c 736 return -EINVAL;
7c673cae
FG
737 }
738 if (have_minor_attr()) {
9f95a23c
TL
739 min = strict_strtoll(udev_device_get_sysattr_value(dev.get(), "minor"), 10,
740 &err);
7c673cae 741 if (!err.empty()) {
20effc67 742 std::cerr << "rbd: couldn't parse minor: " << err << std::endl;
9f95a23c 743 return -EINVAL;
7c673cae
FG
744 }
745 }
746
747 /*
748 * If an image is mapped more than once don't bother trying to unmap
749 * all devices - let users run unmap the same number of times they
750 * ran map.
751 */
752 if (udev_list_entry_get_next(l))
20effc67 753 std::cerr << "rbd: " << spec << ": mapped more than once, unmapping "
9f95a23c 754 << get_devnode(dev.get()) << " only" << std::endl;
7c673cae
FG
755
756 *pdevno = makedev(maj, min);
9f95a23c
TL
757 *pid = udev_device_get_sysname(dev.get());
758 return 0;
7c673cae
FG
759}
760
/*
 * Append " <options>" to *buf.  Nothing is appended when options is
 * NULL or an empty string.
 */
static void append_unmap_options(std::string *buf, const char *options)
{
  /* NULL is accepted as "no options" instead of crashing in strcmp() */
  if (options == nullptr || *options == '\0') {
    return;
  }

  *buf += " ";
  *buf += options;
}
768
eafe8130
TL
769class UdevUnmapHandler {
770public:
771 UdevUnmapHandler(dev_t devno) : m_devno(devno) {}
7c673cae 772
9f95a23c
TL
773 bool operator()(udev_device_uptr dev) {
774 if (strcmp(udev_device_get_action(dev.get()), "remove")) {
775 return false;
7c673cae 776 }
9f95a23c 777 return udev_device_get_devnum(dev.get()) == m_devno;
7c673cae
FG
778 }
779
eafe8130
TL
780private:
781 dev_t m_devno;
782};
7c673cae 783
f91f0fd5 784static int do_unmap(krbd_ctx *ctx, dev_t devno, const string& buf)
7c673cae 785{
eafe8130
TL
786 bool unmapped;
787 int fds[2];
7c673cae
FG
788 int r;
789
f91f0fd5
TL
790 udev_monitor_uptr mon(udev_monitor_new_from_netlink(ctx->udev,
791 get_event_source(ctx)));
7c673cae
FG
792 if (!mon)
793 return -ENOMEM;
794
9f95a23c
TL
795 r = udev_monitor_filter_add_match_subsystem_devtype(mon.get(), "block",
796 "disk");
7c673cae 797 if (r < 0)
9f95a23c 798 return r;
7c673cae 799
9f95a23c 800 r = udev_monitor_set_receive_buffer_size(mon.get(), UDEV_BUF_SIZE);
eafe8130
TL
801 if (r < 0) {
802 std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r)
803 << std::endl;
804 /* not fatal */
805 }
806
9f95a23c 807 r = udev_monitor_enable_receiving(mon.get());
7c673cae 808 if (r < 0)
9f95a23c 809 return r;
7c673cae 810
9f95a23c
TL
811 if (pipe2(fds, O_NONBLOCK) < 0)
812 return -errno;
eafe8130 813
f91f0fd5
TL
814 auto unmapper = make_named_thread(
815 "unmapper", [&buf, sysfs_r_fd = fds[1], flags = ctx->flags]() {
eafe8130
TL
816 /*
817 * On final device close(), kernel sends a block change event, in
818 * response to which udev apparently runs blkid on the device. This
819 * makes unmap fail with EBUSY, if issued right after final close().
820 * Try to circumvent this with a retry before turning to udev.
821 */
822 for (int tries = 0; ; tries++) {
823 int sysfs_r = sysfs_write_rbd_remove(buf);
824 if (sysfs_r == -EBUSY && tries < 2) {
825 if (!tries) {
826 usleep(250 * 1000);
f91f0fd5 827 } else if (!(flags & KRBD_CTX_F_NOUDEV)) {
eafe8130
TL
828 /*
829 * libudev does not provide the "wait until the queue is empty"
830 * API or the sufficient amount of primitives to build it from.
831 */
832 std::string err = run_cmd("udevadm", "settle", "--timeout", "10",
833 (char *)NULL);
834 if (!err.empty())
835 std::cerr << "rbd: " << err << std::endl;
836 }
7c673cae 837 } else {
eafe8130
TL
838 int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r));
839 if (r < 0) {
840 ceph_abort_msgf("safe_write failed: %d", r);
841 }
842 break;
7c673cae 843 }
7c673cae 844 }
eafe8130 845 });
7c673cae 846
9f95a23c 847 std::tie(r, unmapped) = wait_for_mapping(fds[0], mon.get(),
eafe8130 848 UdevUnmapHandler(devno));
7c673cae 849 if (r < 0) {
eafe8130
TL
850 if (!unmapped) {
851 std::cerr << "rbd: sysfs write failed" << std::endl;
852 } else {
853 std::cerr << "rbd: udev wait failed: " << cpp_strerror(r) << std::endl;
854 r = 0;
855 }
7c673cae
FG
856 }
857
eafe8130
TL
858 unmapper.join();
859 close(fds[0]);
860 close(fds[1]);
7c673cae
FG
861 return r;
862}
863
864static int unmap_image(struct krbd_ctx *ctx, const char *devnode,
865 const char *options)
866{
867 struct stat sb;
868 dev_t wholedevno = 0;
9f95a23c 869 std::string buf;
7c673cae
FG
870 int r;
871
872 if (stat(devnode, &sb) < 0 || !S_ISBLK(sb.st_mode)) {
20effc67 873 std::cerr << "rbd: '" << devnode << "' is not a block device" << std::endl;
7c673cae
FG
874 return -EINVAL;
875 }
876
877 r = blkid_devno_to_wholedisk(sb.st_rdev, NULL, 0, &wholedevno);
878 if (r < 0) {
20effc67 879 std::cerr << "rbd: couldn't compute wholedevno: " << cpp_strerror(r)
7c673cae
FG
880 << std::endl;
881 /*
882 * Ignore the error: we are given whole disks most of the time, and
883 * if it turns out this is a partition we will fail later anyway.
884 */
885 wholedevno = sb.st_rdev;
886 }
887
eafe8130 888 for (int tries = 0; ; tries++) {
9f95a23c 889 r = devno_to_krbd_id(ctx->udev, wholedevno, &buf);
eafe8130
TL
890 if (r == -ENOENT && tries < 2) {
891 usleep(250 * 1000);
892 } else {
893 if (r < 0) {
894 if (r == -ENOENT) {
895 std::cerr << "rbd: '" << devnode << "' is not an rbd device"
896 << std::endl;
897 r = -EINVAL;
898 }
899 return r;
900 }
901 if (tries) {
902 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
903 << std::endl;
904 }
905 break;
7c673cae 906 }
7c673cae
FG
907 }
908
9f95a23c 909 append_unmap_options(&buf, options);
f91f0fd5 910 return do_unmap(ctx, wholedevno, buf);
7c673cae
FG
911}
912
11fdf7f2 913static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec,
7c673cae
FG
914 const char *options)
915{
916 dev_t devno = 0;
9f95a23c 917 std::string buf;
7c673cae
FG
918 int r;
919
eafe8130 920 for (int tries = 0; ; tries++) {
9f95a23c 921 r = spec_to_devno_and_krbd_id(ctx->udev, spec, &devno, &buf);
eafe8130
TL
922 if (r == -ENOENT && tries < 2) {
923 usleep(250 * 1000);
924 } else {
925 if (r < 0) {
926 if (r == -ENOENT) {
927 std::cerr << "rbd: " << spec << ": not a mapped image or snapshot"
928 << std::endl;
929 r = -EINVAL;
930 }
931 return r;
932 }
933 if (tries) {
934 std::cerr << "rbd: udev enumerate missed a device, tries = " << tries
935 << std::endl;
936 }
937 break;
7c673cae 938 }
7c673cae
FG
939 }
940
9f95a23c 941 append_unmap_options(&buf, options);
f91f0fd5 942 return do_unmap(ctx, devno, buf);
7c673cae
FG
943}
944
945static bool dump_one_image(Formatter *f, TextTable *tbl,
946 struct udev_device *dev)
947{
11fdf7f2 948 auto spec = spec_from_dev(dev);
9f95a23c
TL
949 std::string devnode = get_devnode(dev);
950 const char *id = devnode.c_str() + sizeof(DEVNODE_PREFIX) - 1;
7c673cae 951
11fdf7f2 952 if (!spec)
7c673cae
FG
953 return false;
954
955 if (f) {
11fdf7f2
TL
956 f->open_object_section("device");
957 f->dump_string("id", id);
958 f->dump_string("pool", spec->pool_name);
959 f->dump_string("namespace", spec->nspace_name);
960 f->dump_string("name", spec->image_name);
961 f->dump_string("snap", spec->snap_name);
9f95a23c 962 f->dump_string("device", devnode);
7c673cae
FG
963 f->close_section();
964 } else {
11fdf7f2 965 *tbl << id << spec->pool_name << spec->nspace_name << spec->image_name
9f95a23c 966 << spec->snap_name << devnode << TextTable::endrow;
7c673cae
FG
967 }
968
969 return true;
970}
971
972static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl)
973{
9f95a23c 974 udev_enumerate_uptr enm;
11fdf7f2 975 struct udev_list_entry *l = NULL;
7c673cae
FG
976 bool have_output = false;
977 int r;
978
eafe8130 979retry:
9f95a23c 980 enm.reset(udev_enumerate_new(udev));
7c673cae
FG
981 if (!enm)
982 return -ENOMEM;
983
9f95a23c 984 r = udev_enumerate_add_match_subsystem(enm.get(), "rbd");
7c673cae 985 if (r < 0)
9f95a23c 986 return r;
7c673cae 987
9f95a23c 988 r = udev_enumerate_scan_devices(enm.get());
eafe8130
TL
989 if (r < 0) {
990 if (r == -ENOENT || r == -ENODEV) {
991 std::cerr << "rbd: udev enumerate failed, retrying" << std::endl;
eafe8130
TL
992 goto retry;
993 }
9f95a23c 994 return r;
eafe8130 995 }
7c673cae 996
9f95a23c
TL
997 udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm.get())) {
998 auto dev = dev_from_list_entry(udev, l);
7c673cae 999 if (dev) {
9f95a23c 1000 have_output |= dump_one_image(f, tbl, dev.get());
7c673cae
FG
1001 }
1002 }
1003
9f95a23c 1004 return have_output;
7c673cae
FG
1005}
1006
9f95a23c 1007static int dump_images(struct krbd_ctx *ctx, Formatter *f)
7c673cae
FG
1008{
1009 TextTable tbl;
1010 int r;
1011
1012 if (f) {
11fdf7f2 1013 f->open_array_section("devices");
7c673cae
FG
1014 } else {
1015 tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
1016 tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
11fdf7f2 1017 tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
7c673cae
FG
1018 tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
1019 tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
1020 tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
1021 }
1022
1023 r = do_dump(ctx->udev, f, &tbl);
1024
1025 if (f) {
1026 f->close_section();
20effc67 1027 f->flush(std::cout);
7c673cae
FG
1028 } else {
1029 if (r > 0)
20effc67 1030 std::cout << tbl;
7c673cae
FG
1031 }
1032
1033 return r;
1034}
1035
11fdf7f2
TL
1036static int is_mapped_image(struct udev *udev, const krbd_spec& spec,
1037 string *pname)
1038{
9f95a23c 1039 udev_enumerate_uptr enm;
11fdf7f2
TL
1040 struct udev_list_entry *l;
1041 int r;
1042
1043 r = enumerate_devices(udev, spec, &enm);
1044 if (r < 0)
1045 return r;
1046
9f95a23c 1047 l = udev_enumerate_get_list_entry(enm.get());
11fdf7f2 1048 if (l) {
9f95a23c
TL
1049 auto dev = dev_from_list_entry(udev, l);
1050 if (!dev)
1051 return -ENOMEM;
11fdf7f2 1052
9f95a23c
TL
1053 *pname = get_devnode(dev.get());
1054 return 1;
11fdf7f2
TL
1055 }
1056
9f95a23c 1057 return 0; /* not mapped */
11fdf7f2
TL
1058}
1059
f91f0fd5 1060extern "C" int krbd_create_from_context(rados_config_t cct, uint32_t flags,
7c673cae
FG
1061 struct krbd_ctx **pctx)
1062{
1063 struct krbd_ctx *ctx = new struct krbd_ctx();
1064
1065 ctx->cct = reinterpret_cast<CephContext *>(cct);
1066 ctx->udev = udev_new();
1067 if (!ctx->udev) {
1068 delete ctx;
1069 return -ENOMEM;
1070 }
f91f0fd5 1071 ctx->flags = flags;
7c673cae
FG
1072
1073 *pctx = ctx;
1074 return 0;
1075}
1076
1077extern "C" void krbd_destroy(struct krbd_ctx *ctx)
1078{
1079 if (!ctx)
1080 return;
1081
1082 udev_unref(ctx->udev);
1083
1084 delete ctx;
1085}
1086
11fdf7f2
TL
1087extern "C" int krbd_map(struct krbd_ctx *ctx,
1088 const char *pool_name,
1089 const char *nspace_name,
1090 const char *image_name,
1091 const char *snap_name,
1092 const char *options,
1093 char **pdevnode)
7c673cae 1094{
11fdf7f2 1095 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
7c673cae
FG
1096 string name;
1097 char *devnode;
1098 int r;
1099
11fdf7f2 1100 r = map_image(ctx, spec, options, &name);
7c673cae
FG
1101 if (r < 0)
1102 return r;
1103
1104 devnode = strdup(name.c_str());
1105 if (!devnode)
1106 return -ENOMEM;
1107
1108 *pdevnode = devnode;
1109 return r;
1110}
1111
1112extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
1113 const char *options)
1114{
1115 return unmap_image(ctx, devnode, options);
1116}
1117
11fdf7f2
TL
1118extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx,
1119 const char *pool_name,
1120 const char *nspace_name,
1121 const char *image_name,
1122 const char *snap_name,
7c673cae
FG
1123 const char *options)
1124{
11fdf7f2
TL
1125 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1126 return unmap_image(ctx, spec, options);
7c673cae
FG
1127}
1128
1129int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f)
1130{
1131 return dump_images(ctx, f);
1132}
11fdf7f2
TL
1133
1134extern "C" int krbd_is_mapped(struct krbd_ctx *ctx,
1135 const char *pool_name,
1136 const char *nspace_name,
1137 const char *image_name,
1138 const char *snap_name,
1139 char **pdevnode)
1140{
1141 krbd_spec spec(pool_name, nspace_name, image_name, snap_name);
1142 string name;
1143 char *devnode;
1144 int r;
1145
1146 r = is_mapped_image(ctx->udev, spec, &name);
1147 if (r <= 0) /* error or not mapped */
1148 return r;
1149
1150 devnode = strdup(name.c_str());
1151 if (!devnode)
1152 return -ENOMEM;
1153
1154 *pdevnode = devnode;
1155 return r;
1156}