]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * Ceph - scalable distributed file system | |
3 | * | |
4 | * Copyright (C) 2014 Inktank Storage, Inc. | |
5 | * | |
6 | * This is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License version 2.1, as published by the Free Software | |
9 | * Foundation. See file COPYING. | |
10 | * | |
11 | */ | |
12 | ||
13 | #include <errno.h> | |
14 | #include <fcntl.h> | |
15 | #include <iostream> | |
11fdf7f2 | 16 | #include <optional> |
7c673cae FG |
17 | #include <poll.h> |
18 | #include <sstream> | |
19 | #include <stdio.h> | |
20 | #include <stdlib.h> | |
21 | #include <string.h> | |
22 | #include <string> | |
23 | #include <sys/stat.h> | |
11fdf7f2 | 24 | #include <sys/sysmacros.h> |
7c673cae | 25 | #include <sys/types.h> |
eafe8130 | 26 | #include <tuple> |
7c673cae | 27 | #include <unistd.h> |
eafe8130 | 28 | #include <utility> |
7c673cae FG |
29 | |
30 | #include "auth/KeyRing.h" | |
31 | #include "common/errno.h" | |
32 | #include "common/Formatter.h" | |
33 | #include "common/module.h" | |
34 | #include "common/run_cmd.h" | |
35 | #include "common/safe_io.h" | |
36 | #include "common/secret.h" | |
37 | #include "common/TextTable.h" | |
eafe8130 | 38 | #include "common/Thread.h" |
11fdf7f2 | 39 | #include "include/ceph_assert.h" |
7c673cae FG |
40 | #include "include/stringify.h" |
41 | #include "include/krbd.h" | |
42 | #include "mon/MonMap.h" | |
43 | ||
44 | #include <blkid/blkid.h> | |
45 | #include <libudev.h> | |
46 | ||
/* udev monitor receive buffer size: doubled to 2M (SO_RCVBUFFORCE) */
static constexpr int UDEV_BUF_SIZE = 1 << 20;
7c673cae FG |
48 | |
49 | struct krbd_ctx { | |
50 | CephContext *cct; | |
51 | struct udev *udev; | |
52 | }; | |
53 | ||
11fdf7f2 TL |
/* Snapshot name stored in a spec when the caller passes an empty one. */
static const std::string SNAP_HEAD_NAME("-");

/*
 * Identifies a mapping target: pool, (possibly empty) namespace, image
 * and snapshot.  An empty snapshot name is normalized to SNAP_HEAD_NAME.
 */
struct krbd_spec {
  std::string pool_name;
  std::string nspace_name;
  std::string image_name;
  std::string snap_name;

  /* All four arguments must be non-null C strings. */
  krbd_spec(const char *pool_name, const char *nspace_name,
            const char *image_name, const char *snap_name)
    : pool_name(pool_name),
      nspace_name(nspace_name),
      image_name(image_name),
      snap_name(snap_name[0] != '\0' ? std::string(snap_name)
                                     : SNAP_HEAD_NAME) { }

  /* Two specs are equal when all four components are equal. */
  bool operator==(const krbd_spec& rhs) const {
    return std::tie(pool_name, nspace_name, image_name, snap_name) ==
           std::tie(rhs.pool_name, rhs.nspace_name, rhs.image_name,
                    rhs.snap_name);
  }
};
76 | ||
77 | std::ostream& operator<<(std::ostream& os, const krbd_spec& spec) { | |
78 | os << spec.pool_name << "/"; | |
79 | if (!spec.nspace_name.empty()) | |
80 | os << spec.nspace_name << "/"; | |
81 | os << spec.image_name; | |
82 | if (spec.snap_name != SNAP_HEAD_NAME) | |
83 | os << "@" << spec.snap_name; | |
84 | return os; | |
85 | } | |
86 | ||
87 | std::optional<krbd_spec> spec_from_dev(udev_device *dev) { | |
88 | const char *pool_name = udev_device_get_sysattr_value(dev, "pool"); | |
89 | const char *nspace_name = udev_device_get_sysattr_value(dev, "pool_ns"); | |
90 | const char *image_name = udev_device_get_sysattr_value(dev, "name"); | |
91 | const char *snap_name = udev_device_get_sysattr_value(dev, "current_snap"); | |
92 | ||
93 | if (!pool_name || !image_name || !snap_name) | |
94 | return std::nullopt; | |
95 | ||
96 | return std::make_optional<krbd_spec>( | |
97 | pool_name, nspace_name ?: "", image_name, snap_name); | |
98 | } | |
99 | ||
7c673cae FG |
/* Return the device node path for the rbd device with the given id. */
static std::string get_kernel_rbd_name(const char *id)
{
  std::string devnode("/dev/rbd");
  devnode += id;
  return devnode;
}
104 | ||
105 | static int sysfs_write_rbd(const char *which, const string& buf) | |
106 | { | |
107 | const string s = string("/sys/bus/rbd/") + which; | |
108 | const string t = s + "_single_major"; | |
109 | int fd; | |
110 | int r; | |
111 | ||
112 | /* | |
113 | * 'add' and 'add_single_major' interfaces are identical, but if rbd | |
114 | * kernel module is new enough and is configured to use single-major | |
115 | * scheme, 'add' is disabled in order to prevent old userspace from | |
116 | * doing weird things at unmap time. | |
117 | * | |
118 | * Same goes for 'remove' vs 'remove_single_major'. | |
119 | */ | |
120 | fd = open(t.c_str(), O_WRONLY); | |
121 | if (fd < 0) { | |
122 | if (errno == ENOENT) { | |
123 | fd = open(s.c_str(), O_WRONLY); | |
124 | if (fd < 0) | |
125 | return -errno; | |
126 | } else { | |
127 | return -errno; | |
128 | } | |
129 | } | |
130 | ||
131 | r = safe_write(fd, buf.c_str(), buf.size()); | |
132 | ||
133 | close(fd); | |
134 | return r; | |
135 | } | |
136 | ||
137 | static int sysfs_write_rbd_add(const string& buf) | |
138 | { | |
139 | return sysfs_write_rbd("add", buf); | |
140 | } | |
141 | ||
142 | static int sysfs_write_rbd_remove(const string& buf) | |
143 | { | |
144 | return sysfs_write_rbd("remove", buf); | |
145 | } | |
146 | ||
/*
 * Return non-zero if rbd devices are expected to have a 'minor' sysfs
 * attribute, determined by the presence of the 'single_major' module
 * parameter.
 */
static int have_minor_attr(void)
{
  /*
   * 'minor' attribute was added as part of single_major merge, which
   * exposed the 'single_major' parameter.  'minor' is always present,
   * regardless of whether single-major scheme is turned on or not.
   *
   * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
   * this has to work with rbd.ko backported to various kernels.)
   */
  static const char *const single_major_param =
      "/sys/module/rbd/parameters/single_major";

  const bool present = (access(single_major_param, F_OK) == 0);
  return present;
}
159 | ||
11fdf7f2 TL |
160 | static int build_map_buf(CephContext *cct, const krbd_spec& spec, |
161 | const char *options, string *pbuf) | |
7c673cae FG |
162 | { |
163 | ostringstream oss; | |
164 | int r; | |
165 | ||
166 | MonMap monmap; | |
11fdf7f2 | 167 | r = monmap.build_initial(cct, false, cerr); |
7c673cae FG |
168 | if (r < 0) |
169 | return r; | |
170 | ||
171 | list<entity_addr_t> mon_addr; | |
172 | monmap.list_addrs(mon_addr); | |
173 | ||
174 | for (const auto &p : mon_addr) { | |
175 | if (oss.tellp() > 0) { | |
176 | oss << ","; | |
177 | } | |
178 | oss << p.get_sockaddr(); | |
179 | } | |
180 | ||
181 | oss << " name=" << cct->_conf->name.get_id(); | |
182 | ||
183 | KeyRing keyring; | |
11fdf7f2 TL |
184 | auto auth_client_required = |
185 | cct->_conf.get_val<std::string>("auth_client_required"); | |
186 | if (auth_client_required != "none") { | |
224ce89b | 187 | r = keyring.from_ceph_context(cct); |
11fdf7f2 TL |
188 | auto keyfile = cct->_conf.get_val<std::string>("keyfile"); |
189 | auto key = cct->_conf.get_val<std::string>("key"); | |
190 | if (r == -ENOENT && keyfile.empty() && key.empty()) | |
224ce89b WB |
191 | r = 0; |
192 | if (r < 0) { | |
193 | cerr << "rbd: failed to get secret" << std::endl; | |
194 | return r; | |
195 | } | |
7c673cae FG |
196 | } |
197 | ||
198 | CryptoKey secret; | |
199 | string key_name = string("client.") + cct->_conf->name.get_id(); | |
200 | if (keyring.get_secret(cct->_conf->name, secret)) { | |
201 | string secret_str; | |
202 | secret.encode_base64(secret_str); | |
203 | ||
204 | r = set_kernel_secret(secret_str.c_str(), key_name.c_str()); | |
205 | if (r >= 0) { | |
206 | if (r == 0) | |
207 | cerr << "rbd: warning: secret has length 0" << std::endl; | |
208 | oss << ",key=" << key_name; | |
209 | } else if (r == -ENODEV || r == -ENOSYS) { | |
210 | // running against older kernel; fall back to secret= in options | |
211 | oss << ",secret=" << secret_str; | |
212 | } else { | |
213 | cerr << "rbd: failed to add secret '" << key_name << "' to kernel" | |
214 | << std::endl; | |
215 | return r; | |
216 | } | |
217 | } else if (is_kernel_secret(key_name.c_str())) { | |
218 | oss << ",key=" << key_name; | |
219 | } | |
220 | ||
221 | if (strcmp(options, "") != 0) | |
222 | oss << "," << options; | |
11fdf7f2 TL |
223 | if (!spec.nspace_name.empty()) |
224 | oss << ",_pool_ns=" << spec.nspace_name; | |
7c673cae | 225 | |
11fdf7f2 TL |
226 | oss << " " << spec.pool_name << " " << spec.image_name << " " |
227 | << spec.snap_name; | |
7c673cae FG |
228 | |
229 | *pbuf = oss.str(); | |
230 | return 0; | |
231 | } | |
232 | ||
eafe8130 TL |
233 | /* |
234 | * Return: | |
235 | * <kernel error, false> - didn't map | |
236 | * <0 or udev error, true> - mapped | |
237 | */ | |
238 | template <typename F> | |
239 | static std::pair<int, bool> wait_for_mapping(int sysfs_r_fd, udev_monitor *mon, | |
240 | F udev_device_handler) | |
7c673cae | 241 | { |
eafe8130 TL |
242 | struct pollfd fds[2]; |
243 | int sysfs_r = INT_MAX, udev_r = INT_MAX; | |
81eedcae | 244 | int r; |
7c673cae | 245 | |
eafe8130 TL |
246 | fds[0].fd = sysfs_r_fd; |
247 | fds[0].events = POLLIN; | |
248 | fds[1].fd = udev_monitor_get_fd(mon); | |
249 | fds[1].events = POLLIN; | |
250 | ||
7c673cae | 251 | for (;;) { |
eafe8130 TL |
252 | if (poll(fds, 2, -1) < 0) { |
253 | ceph_abort_msgf("poll failed: %d", -errno); | |
254 | } | |
7c673cae | 255 | |
eafe8130 TL |
256 | if (fds[0].revents) { |
257 | r = safe_read_exact(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r)); | |
258 | if (r < 0) { | |
259 | ceph_abort_msgf("safe_read_exact failed: %d", r); | |
260 | } | |
261 | if (sysfs_r < 0) { | |
262 | return std::make_pair(sysfs_r, false); | |
263 | } | |
264 | if (udev_r != INT_MAX) { | |
265 | ceph_assert(!sysfs_r); | |
266 | return std::make_pair(udev_r, true); | |
267 | } | |
268 | fds[0].fd = -1; | |
81eedcae | 269 | } |
11fdf7f2 | 270 | |
eafe8130 TL |
271 | if (fds[1].revents) { |
272 | for (;;) { | |
273 | struct udev_device *dev; | |
274 | ||
275 | dev = udev_monitor_receive_device(mon); | |
276 | if (!dev) { | |
277 | if (errno != EINTR && errno != EAGAIN) { | |
278 | udev_r = -errno; | |
279 | if (sysfs_r != INT_MAX) { | |
280 | ceph_assert(!sysfs_r); | |
281 | return std::make_pair(udev_r, true); | |
282 | } | |
283 | fds[1].fd = -1; | |
284 | } | |
285 | break; | |
286 | } | |
287 | if (udev_device_handler(dev)) { | |
288 | udev_r = 0; | |
289 | if (sysfs_r != INT_MAX) { | |
290 | ceph_assert(!sysfs_r); | |
291 | return std::make_pair(udev_r, true); | |
292 | } | |
293 | fds[1].fd = -1; | |
294 | break; | |
295 | } | |
296 | } | |
297 | } | |
298 | } | |
299 | } | |
300 | ||
301 | class UdevMapHandler { | |
302 | public: | |
303 | UdevMapHandler(const krbd_spec *spec, std::string *pdevnode) : | |
304 | m_spec(spec), m_pdevnode(pdevnode) {} | |
7c673cae | 305 | |
eafe8130 TL |
306 | /* |
307 | * Catch /sys/devices/rbd/<id>/ and wait for the corresponding | |
308 | * block device to show up. This is necessary because rbd devices | |
309 | * and block devices aren't linked together in our sysfs layout. | |
310 | */ | |
311 | bool operator()(udev_device *dev) { | |
312 | if (strcmp(udev_device_get_action(dev), "add")) { | |
7c673cae | 313 | goto next; |
eafe8130 TL |
314 | } |
315 | if (!strcmp(udev_device_get_subsystem(dev), "rbd")) { | |
316 | if (!m_bus_dev) { | |
317 | auto spec = spec_from_dev(dev); | |
318 | if (spec && *spec == *m_spec) { | |
319 | m_bus_dev = dev; | |
81eedcae | 320 | goto check; |
7c673cae FG |
321 | } |
322 | } | |
eafe8130 TL |
323 | } else if (!strcmp(udev_device_get_subsystem(dev), "block")) { |
324 | m_block_devs.push_back(dev); | |
81eedcae TL |
325 | goto check; |
326 | } | |
7c673cae | 327 | |
81eedcae TL |
328 | next: |
329 | udev_device_unref(dev); | |
eafe8130 | 330 | return false; |
81eedcae TL |
331 | |
332 | check: | |
eafe8130 TL |
333 | if (m_bus_dev && !m_block_devs.empty()) { |
334 | const char *major = udev_device_get_sysattr_value(m_bus_dev, "major"); | |
335 | const char *minor = udev_device_get_sysattr_value(m_bus_dev, "minor"); | |
81eedcae TL |
336 | ceph_assert(!minor ^ have_minor_attr()); |
337 | ||
eafe8130 | 338 | for (auto p : m_block_devs) { |
81eedcae TL |
339 | const char *this_major = udev_device_get_property_value(p, "MAJOR"); |
340 | const char *this_minor = udev_device_get_property_value(p, "MINOR"); | |
7c673cae FG |
341 | |
342 | if (strcmp(this_major, major) == 0 && | |
343 | (!minor || strcmp(this_minor, minor) == 0)) { | |
eafe8130 | 344 | string name = get_kernel_rbd_name(udev_device_get_sysname(m_bus_dev)); |
7c673cae | 345 | |
81eedcae | 346 | ceph_assert(strcmp(udev_device_get_devnode(p), name.c_str()) == 0); |
eafe8130 TL |
347 | *m_pdevnode = name; |
348 | return true; | |
7c673cae FG |
349 | } |
350 | } | |
351 | } | |
eafe8130 | 352 | return false; |
81eedcae | 353 | } |
7c673cae | 354 | |
eafe8130 TL |
355 | ~UdevMapHandler() { |
356 | if (m_bus_dev) { | |
357 | udev_device_unref(m_bus_dev); | |
358 | } | |
359 | ||
360 | for (auto p : m_block_devs) { | |
361 | udev_device_unref(p); | |
362 | } | |
7c673cae FG |
363 | } |
364 | ||
eafe8130 TL |
365 | private: |
366 | udev_device *m_bus_dev = nullptr; | |
367 | std::vector<udev_device *> m_block_devs; | |
368 | const krbd_spec *m_spec; | |
369 | std::string *m_pdevnode; | |
370 | }; | |
7c673cae | 371 | |
11fdf7f2 TL |
372 | static int do_map(struct udev *udev, const krbd_spec& spec, const string& buf, |
373 | string *pname) | |
7c673cae FG |
374 | { |
375 | struct udev_monitor *mon; | |
eafe8130 TL |
376 | std::thread mapper; |
377 | bool mapped; | |
378 | int fds[2]; | |
7c673cae FG |
379 | int r; |
380 | ||
381 | mon = udev_monitor_new_from_netlink(udev, "udev"); | |
382 | if (!mon) | |
383 | return -ENOMEM; | |
384 | ||
11fdf7f2 | 385 | r = udev_monitor_filter_add_match_subsystem_devtype(mon, "rbd", nullptr); |
7c673cae FG |
386 | if (r < 0) |
387 | goto out_mon; | |
388 | ||
389 | r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk"); | |
390 | if (r < 0) | |
391 | goto out_mon; | |
392 | ||
eafe8130 TL |
393 | r = udev_monitor_set_receive_buffer_size(mon, UDEV_BUF_SIZE); |
394 | if (r < 0) { | |
395 | std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r) | |
396 | << std::endl; | |
397 | /* not fatal */ | |
398 | } | |
399 | ||
7c673cae FG |
400 | r = udev_monitor_enable_receiving(mon); |
401 | if (r < 0) | |
402 | goto out_mon; | |
403 | ||
eafe8130 TL |
404 | if (pipe2(fds, O_NONBLOCK) < 0) { |
405 | r = -errno; | |
7c673cae FG |
406 | goto out_mon; |
407 | } | |
408 | ||
eafe8130 TL |
409 | mapper = make_named_thread("mapper", [&buf, sysfs_r_fd = fds[1]]() { |
410 | int sysfs_r = sysfs_write_rbd_add(buf); | |
411 | int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r)); | |
412 | if (r < 0) { | |
413 | ceph_abort_msgf("safe_write failed: %d", r); | |
414 | } | |
415 | }); | |
416 | ||
417 | std::tie(r, mapped) = wait_for_mapping(fds[0], mon, | |
418 | UdevMapHandler(&spec, pname)); | |
7c673cae | 419 | if (r < 0) { |
eafe8130 TL |
420 | if (!mapped) { |
421 | std::cerr << "rbd: sysfs write failed" << std::endl; | |
422 | } else { | |
423 | std::cerr << "rbd: udev wait failed" << std::endl; | |
424 | /* TODO: fall back to enumeration */ | |
425 | } | |
7c673cae FG |
426 | } |
427 | ||
eafe8130 TL |
428 | mapper.join(); |
429 | close(fds[0]); | |
430 | close(fds[1]); | |
431 | ||
7c673cae FG |
432 | out_mon: |
433 | udev_monitor_unref(mon); | |
434 | return r; | |
435 | } | |
436 | ||
11fdf7f2 TL |
437 | static int map_image(struct krbd_ctx *ctx, const krbd_spec& spec, |
438 | const char *options, string *pname) | |
7c673cae FG |
439 | { |
440 | string buf; | |
441 | int r; | |
442 | ||
11fdf7f2 | 443 | r = build_map_buf(ctx->cct, spec, options, &buf); |
7c673cae FG |
444 | if (r < 0) |
445 | return r; | |
446 | ||
447 | /* | |
448 | * Modprobe rbd kernel module. If it supports single-major device | |
449 | * number allocation scheme, make sure it's turned on. | |
450 | */ | |
451 | if (access("/sys/bus/rbd", F_OK) != 0) { | |
452 | const char *module_options = NULL; | |
453 | if (module_has_param("rbd", "single_major")) | |
454 | module_options = "single_major=Y"; | |
455 | ||
456 | r = module_load("rbd", module_options); | |
457 | if (r) { | |
458 | cerr << "rbd: failed to load rbd kernel module (" << r << ")" | |
459 | << std::endl; | |
460 | /* | |
461 | * Ignore the error: modprobe failing doesn't necessarily prevent | |
462 | * from working. | |
463 | */ | |
464 | } | |
465 | } | |
466 | ||
11fdf7f2 | 467 | return do_map(ctx->udev, spec, buf, pname); |
7c673cae FG |
468 | } |
469 | ||
470 | static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid) | |
471 | { | |
472 | struct udev_enumerate *enm; | |
473 | struct udev_list_entry *l; | |
474 | struct udev_device *dev; | |
475 | int r; | |
476 | ||
eafe8130 | 477 | retry: |
7c673cae FG |
478 | enm = udev_enumerate_new(udev); |
479 | if (!enm) | |
480 | return -ENOMEM; | |
481 | ||
482 | r = udev_enumerate_add_match_subsystem(enm, "rbd"); | |
483 | if (r < 0) | |
484 | goto out_enm; | |
485 | ||
486 | r = udev_enumerate_add_match_sysattr(enm, "major", | |
487 | stringify(major(devno)).c_str()); | |
488 | if (r < 0) | |
489 | goto out_enm; | |
490 | ||
491 | if (have_minor_attr()) { | |
492 | r = udev_enumerate_add_match_sysattr(enm, "minor", | |
493 | stringify(minor(devno)).c_str()); | |
494 | if (r < 0) | |
495 | goto out_enm; | |
496 | } | |
497 | ||
498 | r = udev_enumerate_scan_devices(enm); | |
eafe8130 TL |
499 | if (r < 0) { |
500 | if (r == -ENOENT || r == -ENODEV) { | |
501 | std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; | |
502 | udev_enumerate_unref(enm); | |
503 | goto retry; | |
504 | } | |
7c673cae | 505 | goto out_enm; |
eafe8130 | 506 | } |
7c673cae FG |
507 | |
508 | l = udev_enumerate_get_list_entry(enm); | |
509 | if (!l) { | |
510 | r = -ENOENT; | |
511 | goto out_enm; | |
512 | } | |
513 | ||
514 | /* make sure there is only one match */ | |
11fdf7f2 | 515 | ceph_assert(!udev_list_entry_get_next(l)); |
7c673cae FG |
516 | |
517 | dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)); | |
518 | if (!dev) { | |
519 | r = -ENOMEM; | |
520 | goto out_enm; | |
521 | } | |
522 | ||
523 | *pid = udev_device_get_sysname(dev); | |
524 | ||
525 | udev_device_unref(dev); | |
526 | out_enm: | |
527 | udev_enumerate_unref(enm); | |
528 | return r; | |
529 | } | |
530 | ||
11fdf7f2 TL |
531 | static int __enumerate_devices(struct udev *udev, const krbd_spec& spec, |
532 | bool match_nspace, struct udev_enumerate **penm) | |
7c673cae FG |
533 | { |
534 | struct udev_enumerate *enm; | |
7c673cae FG |
535 | int r; |
536 | ||
eafe8130 | 537 | retry: |
7c673cae FG |
538 | enm = udev_enumerate_new(udev); |
539 | if (!enm) | |
540 | return -ENOMEM; | |
541 | ||
542 | r = udev_enumerate_add_match_subsystem(enm, "rbd"); | |
543 | if (r < 0) | |
544 | goto out_enm; | |
545 | ||
11fdf7f2 | 546 | r = udev_enumerate_add_match_sysattr(enm, "pool", spec.pool_name.c_str()); |
7c673cae FG |
547 | if (r < 0) |
548 | goto out_enm; | |
549 | ||
11fdf7f2 TL |
550 | if (match_nspace) { |
551 | r = udev_enumerate_add_match_sysattr(enm, "pool_ns", | |
552 | spec.nspace_name.c_str()); | |
553 | } else { | |
554 | /* | |
555 | * Match _only_ devices that don't have pool_ns attribute. | |
556 | * If the kernel supports namespaces, the result will be empty. | |
557 | */ | |
558 | r = udev_enumerate_add_nomatch_sysattr(enm, "pool_ns", nullptr); | |
559 | } | |
560 | if (r < 0) | |
561 | goto out_enm; | |
562 | ||
563 | r = udev_enumerate_add_match_sysattr(enm, "name", spec.image_name.c_str()); | |
7c673cae FG |
564 | if (r < 0) |
565 | goto out_enm; | |
566 | ||
11fdf7f2 TL |
567 | r = udev_enumerate_add_match_sysattr(enm, "current_snap", |
568 | spec.snap_name.c_str()); | |
7c673cae FG |
569 | if (r < 0) |
570 | goto out_enm; | |
571 | ||
572 | r = udev_enumerate_scan_devices(enm); | |
eafe8130 TL |
573 | if (r < 0) { |
574 | if (r == -ENOENT || r == -ENODEV) { | |
575 | std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; | |
576 | udev_enumerate_unref(enm); | |
577 | goto retry; | |
578 | } | |
7c673cae | 579 | goto out_enm; |
eafe8130 | 580 | } |
7c673cae | 581 | |
11fdf7f2 TL |
582 | *penm = enm; |
583 | return 0; | |
584 | ||
585 | out_enm: | |
586 | udev_enumerate_unref(enm); | |
587 | return r; | |
588 | } | |
589 | ||
590 | static int enumerate_devices(struct udev *udev, const krbd_spec& spec, | |
591 | struct udev_enumerate **penm) | |
592 | { | |
593 | struct udev_enumerate *enm; | |
594 | int r; | |
595 | ||
596 | r = __enumerate_devices(udev, spec, true, &enm); | |
597 | if (r < 0) | |
598 | return r; | |
599 | ||
600 | /* | |
601 | * If no namespace is set, try again with match_nspace=false to | |
602 | * handle older kernels. On a newer kernel the result will remain | |
603 | * the same (i.e. empty). | |
604 | */ | |
605 | if (!udev_enumerate_get_list_entry(enm) && spec.nspace_name.empty()) { | |
606 | udev_enumerate_unref(enm); | |
607 | r = __enumerate_devices(udev, spec, false, &enm); | |
608 | if (r < 0) | |
609 | return r; | |
610 | } | |
611 | ||
612 | *penm = enm; | |
613 | return 0; | |
614 | } | |
615 | ||
616 | static int spec_to_devno_and_krbd_id(struct udev *udev, const krbd_spec& spec, | |
617 | dev_t *pdevno, string *pid) | |
618 | { | |
619 | struct udev_enumerate *enm; | |
620 | struct udev_list_entry *l; | |
621 | struct udev_device *dev; | |
622 | unsigned int maj, min = 0; | |
623 | string err; | |
624 | int r; | |
625 | ||
626 | r = enumerate_devices(udev, spec, &enm); | |
627 | if (r < 0) | |
628 | return r; | |
629 | ||
7c673cae FG |
630 | l = udev_enumerate_get_list_entry(enm); |
631 | if (!l) { | |
632 | r = -ENOENT; | |
633 | goto out_enm; | |
634 | } | |
635 | ||
636 | dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)); | |
637 | if (!dev) { | |
638 | r = -ENOMEM; | |
639 | goto out_enm; | |
640 | } | |
641 | ||
642 | maj = strict_strtoll(udev_device_get_sysattr_value(dev, "major"), 10, &err); | |
643 | if (!err.empty()) { | |
644 | cerr << "rbd: couldn't parse major: " << err << std::endl; | |
645 | r = -EINVAL; | |
646 | goto out_dev; | |
647 | } | |
648 | if (have_minor_attr()) { | |
649 | min = strict_strtoll(udev_device_get_sysattr_value(dev, "minor"), 10, &err); | |
650 | if (!err.empty()) { | |
651 | cerr << "rbd: couldn't parse minor: " << err << std::endl; | |
652 | r = -EINVAL; | |
653 | goto out_dev; | |
654 | } | |
655 | } | |
656 | ||
657 | /* | |
658 | * If an image is mapped more than once don't bother trying to unmap | |
659 | * all devices - let users run unmap the same number of times they | |
660 | * ran map. | |
661 | */ | |
662 | if (udev_list_entry_get_next(l)) | |
11fdf7f2 | 663 | cerr << "rbd: " << spec << ": mapped more than once, unmapping " |
7c673cae FG |
664 | << get_kernel_rbd_name(udev_device_get_sysname(dev)) |
665 | << " only" << std::endl; | |
666 | ||
667 | *pdevno = makedev(maj, min); | |
668 | *pid = udev_device_get_sysname(dev); | |
669 | ||
670 | out_dev: | |
671 | udev_device_unref(dev); | |
672 | out_enm: | |
673 | udev_enumerate_unref(enm); | |
674 | return r; | |
675 | } | |
676 | ||
/*
 * Build the string written to the rbd 'remove' interface: the device id,
 * followed by a space and the options when options is non-empty.
 */
static std::string build_unmap_buf(const std::string& id, const char *options)
{
  if (options[0] == '\0') {
    return id;
  }

  std::string out(id);
  out.append(" ").append(options);
  return out;
}
686 | ||
eafe8130 TL |
687 | class UdevUnmapHandler { |
688 | public: | |
689 | UdevUnmapHandler(dev_t devno) : m_devno(devno) {} | |
7c673cae | 690 | |
eafe8130 TL |
691 | bool operator()(udev_device *dev) { |
692 | bool match = false; | |
11fdf7f2 | 693 | |
eafe8130 TL |
694 | if (!strcmp(udev_device_get_action(dev), "remove") && |
695 | udev_device_get_devnum(dev) == m_devno) { | |
696 | match = true; | |
7c673cae | 697 | } |
7c673cae | 698 | udev_device_unref(dev); |
eafe8130 | 699 | return match; |
7c673cae FG |
700 | } |
701 | ||
eafe8130 TL |
702 | private: |
703 | dev_t m_devno; | |
704 | }; | |
7c673cae FG |
705 | |
706 | static int do_unmap(struct udev *udev, dev_t devno, const string& buf) | |
707 | { | |
708 | struct udev_monitor *mon; | |
eafe8130 TL |
709 | std::thread unmapper; |
710 | bool unmapped; | |
711 | int fds[2]; | |
7c673cae FG |
712 | int r; |
713 | ||
714 | mon = udev_monitor_new_from_netlink(udev, "udev"); | |
715 | if (!mon) | |
716 | return -ENOMEM; | |
717 | ||
718 | r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk"); | |
719 | if (r < 0) | |
720 | goto out_mon; | |
721 | ||
eafe8130 TL |
722 | r = udev_monitor_set_receive_buffer_size(mon, UDEV_BUF_SIZE); |
723 | if (r < 0) { | |
724 | std::cerr << "rbd: failed to set udev buffer size: " << cpp_strerror(r) | |
725 | << std::endl; | |
726 | /* not fatal */ | |
727 | } | |
728 | ||
7c673cae FG |
729 | r = udev_monitor_enable_receiving(mon); |
730 | if (r < 0) | |
731 | goto out_mon; | |
732 | ||
eafe8130 TL |
733 | if (pipe2(fds, O_NONBLOCK) < 0) { |
734 | r = -errno; | |
735 | goto out_mon; | |
736 | } | |
737 | ||
738 | unmapper = make_named_thread("unmapper", [&buf, sysfs_r_fd = fds[1]]() { | |
739 | /* | |
740 | * On final device close(), kernel sends a block change event, in | |
741 | * response to which udev apparently runs blkid on the device. This | |
742 | * makes unmap fail with EBUSY, if issued right after final close(). | |
743 | * Try to circumvent this with a retry before turning to udev. | |
744 | */ | |
745 | for (int tries = 0; ; tries++) { | |
746 | int sysfs_r = sysfs_write_rbd_remove(buf); | |
747 | if (sysfs_r == -EBUSY && tries < 2) { | |
748 | if (!tries) { | |
749 | usleep(250 * 1000); | |
750 | } else { | |
751 | /* | |
752 | * libudev does not provide the "wait until the queue is empty" | |
753 | * API or the sufficient amount of primitives to build it from. | |
754 | */ | |
755 | std::string err = run_cmd("udevadm", "settle", "--timeout", "10", | |
756 | (char *)NULL); | |
757 | if (!err.empty()) | |
758 | std::cerr << "rbd: " << err << std::endl; | |
759 | } | |
7c673cae | 760 | } else { |
eafe8130 TL |
761 | int r = safe_write(sysfs_r_fd, &sysfs_r, sizeof(sysfs_r)); |
762 | if (r < 0) { | |
763 | ceph_abort_msgf("safe_write failed: %d", r); | |
764 | } | |
765 | break; | |
7c673cae | 766 | } |
7c673cae | 767 | } |
eafe8130 | 768 | }); |
7c673cae | 769 | |
eafe8130 TL |
770 | std::tie(r, unmapped) = wait_for_mapping(fds[0], mon, |
771 | UdevUnmapHandler(devno)); | |
7c673cae | 772 | if (r < 0) { |
eafe8130 TL |
773 | if (!unmapped) { |
774 | std::cerr << "rbd: sysfs write failed" << std::endl; | |
775 | } else { | |
776 | std::cerr << "rbd: udev wait failed: " << cpp_strerror(r) << std::endl; | |
777 | r = 0; | |
778 | } | |
7c673cae FG |
779 | } |
780 | ||
eafe8130 TL |
781 | unmapper.join(); |
782 | close(fds[0]); | |
783 | close(fds[1]); | |
784 | ||
7c673cae FG |
785 | out_mon: |
786 | udev_monitor_unref(mon); | |
787 | return r; | |
788 | } | |
789 | ||
790 | static int unmap_image(struct krbd_ctx *ctx, const char *devnode, | |
791 | const char *options) | |
792 | { | |
793 | struct stat sb; | |
794 | dev_t wholedevno = 0; | |
795 | string id; | |
796 | int r; | |
797 | ||
798 | if (stat(devnode, &sb) < 0 || !S_ISBLK(sb.st_mode)) { | |
799 | cerr << "rbd: '" << devnode << "' is not a block device" << std::endl; | |
800 | return -EINVAL; | |
801 | } | |
802 | ||
803 | r = blkid_devno_to_wholedisk(sb.st_rdev, NULL, 0, &wholedevno); | |
804 | if (r < 0) { | |
805 | cerr << "rbd: couldn't compute wholedevno: " << cpp_strerror(r) | |
806 | << std::endl; | |
807 | /* | |
808 | * Ignore the error: we are given whole disks most of the time, and | |
809 | * if it turns out this is a partition we will fail later anyway. | |
810 | */ | |
811 | wholedevno = sb.st_rdev; | |
812 | } | |
813 | ||
eafe8130 TL |
814 | for (int tries = 0; ; tries++) { |
815 | r = devno_to_krbd_id(ctx->udev, wholedevno, &id); | |
816 | if (r == -ENOENT && tries < 2) { | |
817 | usleep(250 * 1000); | |
818 | } else { | |
819 | if (r < 0) { | |
820 | if (r == -ENOENT) { | |
821 | std::cerr << "rbd: '" << devnode << "' is not an rbd device" | |
822 | << std::endl; | |
823 | r = -EINVAL; | |
824 | } | |
825 | return r; | |
826 | } | |
827 | if (tries) { | |
828 | std::cerr << "rbd: udev enumerate missed a device, tries = " << tries | |
829 | << std::endl; | |
830 | } | |
831 | break; | |
7c673cae | 832 | } |
7c673cae FG |
833 | } |
834 | ||
835 | return do_unmap(ctx->udev, wholedevno, build_unmap_buf(id, options)); | |
836 | } | |
837 | ||
11fdf7f2 | 838 | static int unmap_image(struct krbd_ctx *ctx, const krbd_spec& spec, |
7c673cae FG |
839 | const char *options) |
840 | { | |
841 | dev_t devno = 0; | |
842 | string id; | |
843 | int r; | |
844 | ||
eafe8130 TL |
845 | for (int tries = 0; ; tries++) { |
846 | r = spec_to_devno_and_krbd_id(ctx->udev, spec, &devno, &id); | |
847 | if (r == -ENOENT && tries < 2) { | |
848 | usleep(250 * 1000); | |
849 | } else { | |
850 | if (r < 0) { | |
851 | if (r == -ENOENT) { | |
852 | std::cerr << "rbd: " << spec << ": not a mapped image or snapshot" | |
853 | << std::endl; | |
854 | r = -EINVAL; | |
855 | } | |
856 | return r; | |
857 | } | |
858 | if (tries) { | |
859 | std::cerr << "rbd: udev enumerate missed a device, tries = " << tries | |
860 | << std::endl; | |
861 | } | |
862 | break; | |
7c673cae | 863 | } |
7c673cae FG |
864 | } |
865 | ||
866 | return do_unmap(ctx->udev, devno, build_unmap_buf(id, options)); | |
867 | } | |
868 | ||
869 | static bool dump_one_image(Formatter *f, TextTable *tbl, | |
870 | struct udev_device *dev) | |
871 | { | |
872 | const char *id = udev_device_get_sysname(dev); | |
11fdf7f2 | 873 | auto spec = spec_from_dev(dev); |
7c673cae FG |
874 | string kname = get_kernel_rbd_name(id); |
875 | ||
11fdf7f2 | 876 | if (!spec) |
7c673cae FG |
877 | return false; |
878 | ||
879 | if (f) { | |
11fdf7f2 TL |
880 | f->open_object_section("device"); |
881 | f->dump_string("id", id); | |
882 | f->dump_string("pool", spec->pool_name); | |
883 | f->dump_string("namespace", spec->nspace_name); | |
884 | f->dump_string("name", spec->image_name); | |
885 | f->dump_string("snap", spec->snap_name); | |
7c673cae FG |
886 | f->dump_string("device", kname); |
887 | f->close_section(); | |
888 | } else { | |
11fdf7f2 TL |
889 | *tbl << id << spec->pool_name << spec->nspace_name << spec->image_name |
890 | << spec->snap_name << kname << TextTable::endrow; | |
7c673cae FG |
891 | } |
892 | ||
893 | return true; | |
894 | } | |
895 | ||
896 | static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl) | |
897 | { | |
898 | struct udev_enumerate *enm; | |
11fdf7f2 | 899 | struct udev_list_entry *l = NULL; |
7c673cae FG |
900 | bool have_output = false; |
901 | int r; | |
902 | ||
eafe8130 | 903 | retry: |
7c673cae FG |
904 | enm = udev_enumerate_new(udev); |
905 | if (!enm) | |
906 | return -ENOMEM; | |
907 | ||
908 | r = udev_enumerate_add_match_subsystem(enm, "rbd"); | |
909 | if (r < 0) | |
910 | goto out_enm; | |
911 | ||
912 | r = udev_enumerate_scan_devices(enm); | |
eafe8130 TL |
913 | if (r < 0) { |
914 | if (r == -ENOENT || r == -ENODEV) { | |
915 | std::cerr << "rbd: udev enumerate failed, retrying" << std::endl; | |
916 | udev_enumerate_unref(enm); | |
917 | goto retry; | |
918 | } | |
7c673cae | 919 | goto out_enm; |
eafe8130 | 920 | } |
7c673cae FG |
921 | |
922 | udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm)) { | |
923 | struct udev_device *dev; | |
924 | ||
925 | dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)); | |
926 | if (dev) { | |
927 | have_output |= dump_one_image(f, tbl, dev); | |
928 | udev_device_unref(dev); | |
929 | } | |
930 | } | |
931 | ||
932 | r = have_output; | |
933 | out_enm: | |
934 | udev_enumerate_unref(enm); | |
935 | return r; | |
936 | } | |
937 | ||
938 | int dump_images(struct krbd_ctx *ctx, Formatter *f) | |
939 | { | |
940 | TextTable tbl; | |
941 | int r; | |
942 | ||
943 | if (f) { | |
11fdf7f2 | 944 | f->open_array_section("devices"); |
7c673cae FG |
945 | } else { |
946 | tbl.define_column("id", TextTable::LEFT, TextTable::LEFT); | |
947 | tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT); | |
11fdf7f2 | 948 | tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT); |
7c673cae FG |
949 | tbl.define_column("image", TextTable::LEFT, TextTable::LEFT); |
950 | tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT); | |
951 | tbl.define_column("device", TextTable::LEFT, TextTable::LEFT); | |
952 | } | |
953 | ||
954 | r = do_dump(ctx->udev, f, &tbl); | |
955 | ||
956 | if (f) { | |
957 | f->close_section(); | |
958 | f->flush(cout); | |
959 | } else { | |
960 | if (r > 0) | |
961 | cout << tbl; | |
962 | } | |
963 | ||
964 | return r; | |
965 | } | |
966 | ||
11fdf7f2 TL |
967 | static int is_mapped_image(struct udev *udev, const krbd_spec& spec, |
968 | string *pname) | |
969 | { | |
970 | struct udev_enumerate *enm; | |
971 | struct udev_list_entry *l; | |
972 | int r; | |
973 | ||
974 | r = enumerate_devices(udev, spec, &enm); | |
975 | if (r < 0) | |
976 | return r; | |
977 | ||
978 | l = udev_enumerate_get_list_entry(enm); | |
979 | if (l) { | |
980 | struct udev_device *dev; | |
981 | ||
982 | dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)); | |
983 | if (!dev) { | |
984 | r = -ENOMEM; | |
985 | goto out_enm; | |
986 | } | |
987 | ||
988 | r = 1; | |
989 | *pname = get_kernel_rbd_name(udev_device_get_sysname(dev)); | |
990 | udev_device_unref(dev); | |
991 | } else { | |
992 | r = 0; /* not mapped */ | |
993 | } | |
994 | ||
995 | out_enm: | |
996 | udev_enumerate_unref(enm); | |
997 | return r; | |
998 | } | |
999 | ||
7c673cae FG |
1000 | extern "C" int krbd_create_from_context(rados_config_t cct, |
1001 | struct krbd_ctx **pctx) | |
1002 | { | |
1003 | struct krbd_ctx *ctx = new struct krbd_ctx(); | |
1004 | ||
1005 | ctx->cct = reinterpret_cast<CephContext *>(cct); | |
1006 | ctx->udev = udev_new(); | |
1007 | if (!ctx->udev) { | |
1008 | delete ctx; | |
1009 | return -ENOMEM; | |
1010 | } | |
1011 | ||
1012 | *pctx = ctx; | |
1013 | return 0; | |
1014 | } | |
1015 | ||
1016 | extern "C" void krbd_destroy(struct krbd_ctx *ctx) | |
1017 | { | |
1018 | if (!ctx) | |
1019 | return; | |
1020 | ||
1021 | udev_unref(ctx->udev); | |
1022 | ||
1023 | delete ctx; | |
1024 | } | |
1025 | ||
11fdf7f2 TL |
1026 | extern "C" int krbd_map(struct krbd_ctx *ctx, |
1027 | const char *pool_name, | |
1028 | const char *nspace_name, | |
1029 | const char *image_name, | |
1030 | const char *snap_name, | |
1031 | const char *options, | |
1032 | char **pdevnode) | |
7c673cae | 1033 | { |
11fdf7f2 | 1034 | krbd_spec spec(pool_name, nspace_name, image_name, snap_name); |
7c673cae FG |
1035 | string name; |
1036 | char *devnode; | |
1037 | int r; | |
1038 | ||
11fdf7f2 | 1039 | r = map_image(ctx, spec, options, &name); |
7c673cae FG |
1040 | if (r < 0) |
1041 | return r; | |
1042 | ||
1043 | devnode = strdup(name.c_str()); | |
1044 | if (!devnode) | |
1045 | return -ENOMEM; | |
1046 | ||
1047 | *pdevnode = devnode; | |
1048 | return r; | |
1049 | } | |
1050 | ||
1051 | extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode, | |
1052 | const char *options) | |
1053 | { | |
1054 | return unmap_image(ctx, devnode, options); | |
1055 | } | |
1056 | ||
11fdf7f2 TL |
1057 | extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx, |
1058 | const char *pool_name, | |
1059 | const char *nspace_name, | |
1060 | const char *image_name, | |
1061 | const char *snap_name, | |
7c673cae FG |
1062 | const char *options) |
1063 | { | |
11fdf7f2 TL |
1064 | krbd_spec spec(pool_name, nspace_name, image_name, snap_name); |
1065 | return unmap_image(ctx, spec, options); | |
7c673cae FG |
1066 | } |
1067 | ||
1068 | int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f) | |
1069 | { | |
1070 | return dump_images(ctx, f); | |
1071 | } | |
11fdf7f2 TL |
1072 | |
1073 | extern "C" int krbd_is_mapped(struct krbd_ctx *ctx, | |
1074 | const char *pool_name, | |
1075 | const char *nspace_name, | |
1076 | const char *image_name, | |
1077 | const char *snap_name, | |
1078 | char **pdevnode) | |
1079 | { | |
1080 | krbd_spec spec(pool_name, nspace_name, image_name, snap_name); | |
1081 | string name; | |
1082 | char *devnode; | |
1083 | int r; | |
1084 | ||
1085 | r = is_mapped_image(ctx->udev, spec, &name); | |
1086 | if (r <= 0) /* error or not mapped */ | |
1087 | return r; | |
1088 | ||
1089 | devnode = strdup(name.c_str()); | |
1090 | if (!devnode) | |
1091 | return -ENOMEM; | |
1092 | ||
1093 | *pdevnode = devnode; | |
1094 | return r; | |
1095 | } |