]> git.proxmox.com Git - ceph.git/blob - ceph/src/krbd.cc
bf7e25583691835a3525194e41b9887e168f82b5
[ceph.git] / ceph / src / krbd.cc
1 /*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2014 Inktank Storage, Inc.
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <iostream>
16 #include <map>
17 #include <poll.h>
18 #include <sstream>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <string>
23 #include <sys/stat.h>
24 #include <sys/types.h>
25 #include <unistd.h>
26
27 #include "auth/KeyRing.h"
28 #include "common/errno.h"
29 #include "common/Formatter.h"
30 #include "common/module.h"
31 #include "common/run_cmd.h"
32 #include "common/safe_io.h"
33 #include "common/secret.h"
34 #include "common/TextTable.h"
35 #include "include/assert.h"
36 #include "include/stringify.h"
37 #include "include/krbd.h"
38 #include "mon/MonMap.h"
39
40 #include <blkid/blkid.h>
41 #include <libudev.h>
42
43 using namespace std;
44
/* Timeout, in milliseconds, passed to poll() while waiting for udev events. */
static const int POLL_TIMEOUT = 120000;
46
47 struct krbd_ctx {
48 CephContext *cct;
49 struct udev *udev;
50 };
51
/* Build the device path for an rbd id: "/dev/rbd" followed by id. */
static string get_kernel_rbd_name(const char *id)
{
  string path("/dev/rbd");
  path += id;
  return path;
}
56
57 static int sysfs_write_rbd(const char *which, const string& buf)
58 {
59 const string s = string("/sys/bus/rbd/") + which;
60 const string t = s + "_single_major";
61 int fd;
62 int r;
63
64 /*
65 * 'add' and 'add_single_major' interfaces are identical, but if rbd
66 * kernel module is new enough and is configured to use single-major
67 * scheme, 'add' is disabled in order to prevent old userspace from
68 * doing weird things at unmap time.
69 *
70 * Same goes for 'remove' vs 'remove_single_major'.
71 */
72 fd = open(t.c_str(), O_WRONLY);
73 if (fd < 0) {
74 if (errno == ENOENT) {
75 fd = open(s.c_str(), O_WRONLY);
76 if (fd < 0)
77 return -errno;
78 } else {
79 return -errno;
80 }
81 }
82
83 r = safe_write(fd, buf.c_str(), buf.size());
84
85 close(fd);
86 return r;
87 }
88
89 static int sysfs_write_rbd_add(const string& buf)
90 {
91 return sysfs_write_rbd("add", buf);
92 }
93
94 static int sysfs_write_rbd_remove(const string& buf)
95 {
96 return sysfs_write_rbd("remove", buf);
97 }
98
/*
 * Returns 1 if /sys/module/rbd/parameters/single_major exists,
 * 0 otherwise.
 */
static int have_minor_attr(void)
{
  /*
   * 'minor' attribute was added as part of single_major merge, which
   * exposed the 'single_major' parameter.  'minor' is always present,
   * regardless of whether single-major scheme is turned on or not.
   *
   * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because
   * this has to work with rbd.ko backported to various kernels.)
   */
  if (access("/sys/module/rbd/parameters/single_major", F_OK) != 0)
    return 0;

  return 1;
}
111
112 static int build_map_buf(CephContext *cct, const char *pool, const char *image,
113 const char *snap, const char *options, string *pbuf)
114 {
115 ostringstream oss;
116 int r;
117
118 MonMap monmap;
119 r = monmap.build_initial(cct, cerr);
120 if (r < 0)
121 return r;
122
123 list<entity_addr_t> mon_addr;
124 monmap.list_addrs(mon_addr);
125
126 for (const auto &p : mon_addr) {
127 if (oss.tellp() > 0) {
128 oss << ",";
129 }
130 oss << p.get_sockaddr();
131 }
132
133 oss << " name=" << cct->_conf->name.get_id();
134
135 KeyRing keyring;
136 r = keyring.from_ceph_context(cct);
137 if (r == -ENOENT && !(cct->_conf->keyfile.length() ||
138 cct->_conf->key.length()))
139 r = 0;
140 if (r < 0) {
141 cerr << "rbd: failed to get secret" << std::endl;
142 return r;
143 }
144
145 CryptoKey secret;
146 string key_name = string("client.") + cct->_conf->name.get_id();
147 if (keyring.get_secret(cct->_conf->name, secret)) {
148 string secret_str;
149 secret.encode_base64(secret_str);
150
151 r = set_kernel_secret(secret_str.c_str(), key_name.c_str());
152 if (r >= 0) {
153 if (r == 0)
154 cerr << "rbd: warning: secret has length 0" << std::endl;
155 oss << ",key=" << key_name;
156 } else if (r == -ENODEV || r == -ENOSYS) {
157 // running against older kernel; fall back to secret= in options
158 oss << ",secret=" << secret_str;
159 } else {
160 cerr << "rbd: failed to add secret '" << key_name << "' to kernel"
161 << std::endl;
162 return r;
163 }
164 } else if (is_kernel_secret(key_name.c_str())) {
165 oss << ",key=" << key_name;
166 }
167
168 if (strcmp(options, "") != 0)
169 oss << "," << options;
170
171 oss << " " << pool << " " << image << " " << snap;
172
173 *pbuf = oss.str();
174 return 0;
175 }
176
177 static int wait_for_udev_add(struct udev_monitor *mon, const char *pool,
178 const char *image, const char *snap,
179 string *pname)
180 {
181 struct udev_device *bus_dev = NULL;
182
183 /*
184 * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
185 * block device to show up. This is necessary because rbd devices
186 * and block devices aren't linked together in our sysfs layout.
187 */
188 for (;;) {
189 struct pollfd fds[1];
190 struct udev_device *dev;
191
192 fds[0].fd = udev_monitor_get_fd(mon);
193 fds[0].events = POLLIN;
194 if (poll(fds, 1, POLL_TIMEOUT) < 0)
195 return -errno;
196
197 dev = udev_monitor_receive_device(mon);
198 if (!dev)
199 continue;
200
201 if (strcmp(udev_device_get_action(dev), "add") != 0)
202 goto next;
203
204 if (!bus_dev) {
205 if (strcmp(udev_device_get_subsystem(dev), "rbd") == 0) {
206 const char *this_pool = udev_device_get_sysattr_value(dev, "pool");
207 const char *this_image = udev_device_get_sysattr_value(dev, "name");
208 const char *this_snap = udev_device_get_sysattr_value(dev,
209 "current_snap");
210
211 if (this_pool && strcmp(this_pool, pool) == 0 &&
212 this_image && strcmp(this_image, image) == 0 &&
213 this_snap && strcmp(this_snap, snap) == 0) {
214 bus_dev = dev;
215 continue;
216 }
217 }
218 } else {
219 if (strcmp(udev_device_get_subsystem(dev), "block") == 0) {
220 const char *major = udev_device_get_sysattr_value(bus_dev, "major");
221 const char *minor = udev_device_get_sysattr_value(bus_dev, "minor");
222 const char *this_major = udev_device_get_property_value(dev, "MAJOR");
223 const char *this_minor = udev_device_get_property_value(dev, "MINOR");
224
225 assert(!minor ^ have_minor_attr());
226
227 if (strcmp(this_major, major) == 0 &&
228 (!minor || strcmp(this_minor, minor) == 0)) {
229 string name = get_kernel_rbd_name(udev_device_get_sysname(bus_dev));
230
231 assert(strcmp(udev_device_get_devnode(dev), name.c_str()) == 0);
232 *pname = name;
233
234 udev_device_unref(dev);
235 udev_device_unref(bus_dev);
236 break;
237 }
238 }
239 }
240
241 next:
242 udev_device_unref(dev);
243 }
244
245 return 0;
246 }
247
248 static int do_map(struct udev *udev, const char *pool, const char *image,
249 const char *snap, const string& buf, string *pname)
250 {
251 struct udev_monitor *mon;
252 int r;
253
254 mon = udev_monitor_new_from_netlink(udev, "udev");
255 if (!mon)
256 return -ENOMEM;
257
258 r = udev_monitor_filter_add_match_subsystem_devtype(mon, "rbd", NULL);
259 if (r < 0)
260 goto out_mon;
261
262 r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk");
263 if (r < 0)
264 goto out_mon;
265
266 r = udev_monitor_enable_receiving(mon);
267 if (r < 0)
268 goto out_mon;
269
270 r = sysfs_write_rbd_add(buf);
271 if (r < 0) {
272 cerr << "rbd: sysfs write failed" << std::endl;
273 goto out_mon;
274 }
275
276 r = wait_for_udev_add(mon, pool, image, snap, pname);
277 if (r < 0) {
278 cerr << "rbd: wait failed" << std::endl;
279 goto out_mon;
280 }
281
282 out_mon:
283 udev_monitor_unref(mon);
284 return r;
285 }
286
287 static int map_image(struct krbd_ctx *ctx, const char *pool, const char *image,
288 const char *snap, const char *options, string *pname)
289 {
290 string buf;
291 int r;
292
293 if (strcmp(snap, "") == 0)
294 snap = "-";
295
296 r = build_map_buf(ctx->cct, pool, image, snap, options, &buf);
297 if (r < 0)
298 return r;
299
300 /*
301 * Modprobe rbd kernel module. If it supports single-major device
302 * number allocation scheme, make sure it's turned on.
303 */
304 if (access("/sys/bus/rbd", F_OK) != 0) {
305 const char *module_options = NULL;
306 if (module_has_param("rbd", "single_major"))
307 module_options = "single_major=Y";
308
309 r = module_load("rbd", module_options);
310 if (r) {
311 cerr << "rbd: failed to load rbd kernel module (" << r << ")"
312 << std::endl;
313 /*
314 * Ignore the error: modprobe failing doesn't necessarily prevent
315 * from working.
316 */
317 }
318 }
319
320 return do_map(ctx->udev, pool, image, snap, buf, pname);
321 }
322
323 static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
324 {
325 struct udev_enumerate *enm;
326 struct udev_list_entry *l;
327 struct udev_device *dev;
328 int r;
329
330 enm = udev_enumerate_new(udev);
331 if (!enm)
332 return -ENOMEM;
333
334 r = udev_enumerate_add_match_subsystem(enm, "rbd");
335 if (r < 0)
336 goto out_enm;
337
338 r = udev_enumerate_add_match_sysattr(enm, "major",
339 stringify(major(devno)).c_str());
340 if (r < 0)
341 goto out_enm;
342
343 if (have_minor_attr()) {
344 r = udev_enumerate_add_match_sysattr(enm, "minor",
345 stringify(minor(devno)).c_str());
346 if (r < 0)
347 goto out_enm;
348 }
349
350 r = udev_enumerate_scan_devices(enm);
351 if (r < 0)
352 goto out_enm;
353
354 l = udev_enumerate_get_list_entry(enm);
355 if (!l) {
356 r = -ENOENT;
357 goto out_enm;
358 }
359
360 /* make sure there is only one match */
361 assert(!udev_list_entry_get_next(l));
362
363 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
364 if (!dev) {
365 r = -ENOMEM;
366 goto out_enm;
367 }
368
369 *pid = udev_device_get_sysname(dev);
370
371 udev_device_unref(dev);
372 out_enm:
373 udev_enumerate_unref(enm);
374 return r;
375 }
376
377 static int spec_to_devno_and_krbd_id(struct udev *udev, const char *pool,
378 const char *image, const char *snap,
379 dev_t *pdevno, string *pid)
380 {
381 struct udev_enumerate *enm;
382 struct udev_list_entry *l;
383 struct udev_device *dev;
384 unsigned int maj, min = 0;
385 string err;
386 int r;
387
388 enm = udev_enumerate_new(udev);
389 if (!enm)
390 return -ENOMEM;
391
392 r = udev_enumerate_add_match_subsystem(enm, "rbd");
393 if (r < 0)
394 goto out_enm;
395
396 r = udev_enumerate_add_match_sysattr(enm, "pool", pool);
397 if (r < 0)
398 goto out_enm;
399
400 r = udev_enumerate_add_match_sysattr(enm, "name", image);
401 if (r < 0)
402 goto out_enm;
403
404 r = udev_enumerate_add_match_sysattr(enm, "current_snap", snap);
405 if (r < 0)
406 goto out_enm;
407
408 r = udev_enumerate_scan_devices(enm);
409 if (r < 0)
410 goto out_enm;
411
412 l = udev_enumerate_get_list_entry(enm);
413 if (!l) {
414 r = -ENOENT;
415 goto out_enm;
416 }
417
418 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
419 if (!dev) {
420 r = -ENOMEM;
421 goto out_enm;
422 }
423
424 maj = strict_strtoll(udev_device_get_sysattr_value(dev, "major"), 10, &err);
425 if (!err.empty()) {
426 cerr << "rbd: couldn't parse major: " << err << std::endl;
427 r = -EINVAL;
428 goto out_dev;
429 }
430 if (have_minor_attr()) {
431 min = strict_strtoll(udev_device_get_sysattr_value(dev, "minor"), 10, &err);
432 if (!err.empty()) {
433 cerr << "rbd: couldn't parse minor: " << err << std::endl;
434 r = -EINVAL;
435 goto out_dev;
436 }
437 }
438
439 /*
440 * If an image is mapped more than once don't bother trying to unmap
441 * all devices - let users run unmap the same number of times they
442 * ran map.
443 */
444 if (udev_list_entry_get_next(l))
445 cerr << "rbd: " << pool << "/" << image << "@" << snap
446 << ": mapped more than once, unmapping "
447 << get_kernel_rbd_name(udev_device_get_sysname(dev))
448 << " only" << std::endl;
449
450 *pdevno = makedev(maj, min);
451 *pid = udev_device_get_sysname(dev);
452
453 out_dev:
454 udev_device_unref(dev);
455 out_enm:
456 udev_enumerate_unref(enm);
457 return r;
458 }
459
/*
 * Build the string written to the rbd 'remove' sysfs file: the device
 * id, followed by a space and options when options is non-empty.
 *
 * A NULL options is treated like an empty string; it used to be passed
 * to strcmp() unchecked.
 */
static string build_unmap_buf(const string& id, const char *options)
{
  string buf(id);
  if (options && strcmp(options, "") != 0) {
    buf += " ";
    buf += options;
  }
  return buf;
}
469
470 static int wait_for_udev_remove(struct udev_monitor *mon, dev_t devno)
471 {
472 for (;;) {
473 struct pollfd fds[1];
474 struct udev_device *dev;
475
476 fds[0].fd = udev_monitor_get_fd(mon);
477 fds[0].events = POLLIN;
478 if (poll(fds, 1, POLL_TIMEOUT) < 0)
479 return -errno;
480
481 dev = udev_monitor_receive_device(mon);
482 if (!dev)
483 continue;
484
485 if (strcmp(udev_device_get_action(dev), "remove") == 0 &&
486 udev_device_get_devnum(dev) == devno) {
487 udev_device_unref(dev);
488 break;
489 }
490
491 udev_device_unref(dev);
492 }
493
494 return 0;
495 }
496
497 static int do_unmap(struct udev *udev, dev_t devno, const string& buf)
498 {
499 struct udev_monitor *mon;
500 int r;
501
502 mon = udev_monitor_new_from_netlink(udev, "udev");
503 if (!mon)
504 return -ENOMEM;
505
506 r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk");
507 if (r < 0)
508 goto out_mon;
509
510 r = udev_monitor_enable_receiving(mon);
511 if (r < 0)
512 goto out_mon;
513
514 /*
515 * On final device close(), kernel sends a block change event, in
516 * response to which udev apparently runs blkid on the device. This
517 * makes unmap fail with EBUSY, if issued right after final close().
518 * Try to circumvent this with a retry before turning to udev.
519 */
520 for (int tries = 0; ; tries++) {
521 r = sysfs_write_rbd_remove(buf);
522 if (r >= 0) {
523 break;
524 } else if (r == -EBUSY && tries < 2) {
525 if (!tries) {
526 usleep(250 * 1000);
527 } else {
528 /*
529 * libudev does not provide the "wait until the queue is empty"
530 * API or the sufficient amount of primitives to build it from.
531 */
532 string err = run_cmd("udevadm", "settle", "--timeout", "10", NULL);
533 if (!err.empty())
534 cerr << "rbd: " << err << std::endl;
535 }
536 } else {
537 cerr << "rbd: sysfs write failed" << std::endl;
538 goto out_mon;
539 }
540 }
541
542 r = wait_for_udev_remove(mon, devno);
543 if (r < 0) {
544 cerr << "rbd: wait failed" << std::endl;
545 goto out_mon;
546 }
547
548 out_mon:
549 udev_monitor_unref(mon);
550 return r;
551 }
552
553 static int unmap_image(struct krbd_ctx *ctx, const char *devnode,
554 const char *options)
555 {
556 struct stat sb;
557 dev_t wholedevno = 0;
558 string id;
559 int r;
560
561 if (stat(devnode, &sb) < 0 || !S_ISBLK(sb.st_mode)) {
562 cerr << "rbd: '" << devnode << "' is not a block device" << std::endl;
563 return -EINVAL;
564 }
565
566 r = blkid_devno_to_wholedisk(sb.st_rdev, NULL, 0, &wholedevno);
567 if (r < 0) {
568 cerr << "rbd: couldn't compute wholedevno: " << cpp_strerror(r)
569 << std::endl;
570 /*
571 * Ignore the error: we are given whole disks most of the time, and
572 * if it turns out this is a partition we will fail later anyway.
573 */
574 wholedevno = sb.st_rdev;
575 }
576
577 r = devno_to_krbd_id(ctx->udev, wholedevno, &id);
578 if (r < 0) {
579 if (r == -ENOENT) {
580 cerr << "rbd: '" << devnode << "' is not an rbd device" << std::endl;
581 r = -EINVAL;
582 }
583 return r;
584 }
585
586 return do_unmap(ctx->udev, wholedevno, build_unmap_buf(id, options));
587 }
588
589 static int unmap_image(struct krbd_ctx *ctx, const char *pool,
590 const char *image, const char *snap,
591 const char *options)
592 {
593 dev_t devno = 0;
594 string id;
595 int r;
596
597 if (!snap)
598 snap = "-";
599
600 r = spec_to_devno_and_krbd_id(ctx->udev, pool, image, snap, &devno, &id);
601 if (r < 0) {
602 if (r == -ENOENT) {
603 cerr << "rbd: " << pool << "/" << image << "@" << snap
604 << ": not a mapped image or snapshot" << std::endl;
605 r = -EINVAL;
606 }
607 return r;
608 }
609
610 return do_unmap(ctx->udev, devno, build_unmap_buf(id, options));
611 }
612
613 static bool dump_one_image(Formatter *f, TextTable *tbl,
614 struct udev_device *dev)
615 {
616 const char *id = udev_device_get_sysname(dev);
617 const char *pool = udev_device_get_sysattr_value(dev, "pool");
618 const char *image = udev_device_get_sysattr_value(dev, "name");
619 const char *snap = udev_device_get_sysattr_value(dev, "current_snap");
620 string kname = get_kernel_rbd_name(id);
621
622 if (!pool || !image || !snap)
623 return false;
624
625 if (f) {
626 f->open_object_section(id);
627 f->dump_string("pool", pool);
628 f->dump_string("name", image);
629 f->dump_string("snap", snap);
630 f->dump_string("device", kname);
631 f->close_section();
632 } else {
633 *tbl << id << pool << image << snap << kname << TextTable::endrow;
634 }
635
636 return true;
637 }
638
639 static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl)
640 {
641 struct udev_enumerate *enm;
642 struct udev_list_entry *l;
643 bool have_output = false;
644 int r;
645
646 enm = udev_enumerate_new(udev);
647 if (!enm)
648 return -ENOMEM;
649
650 r = udev_enumerate_add_match_subsystem(enm, "rbd");
651 if (r < 0)
652 goto out_enm;
653
654 r = udev_enumerate_scan_devices(enm);
655 if (r < 0)
656 goto out_enm;
657
658 udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm)) {
659 struct udev_device *dev;
660
661 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
662 if (dev) {
663 have_output |= dump_one_image(f, tbl, dev);
664 udev_device_unref(dev);
665 }
666 }
667
668 r = have_output;
669 out_enm:
670 udev_enumerate_unref(enm);
671 return r;
672 }
673
674 int dump_images(struct krbd_ctx *ctx, Formatter *f)
675 {
676 TextTable tbl;
677 int r;
678
679 if (f) {
680 f->open_object_section("devices");
681 } else {
682 tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
683 tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
684 tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
685 tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
686 tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
687 }
688
689 r = do_dump(ctx->udev, f, &tbl);
690
691 if (f) {
692 f->close_section();
693 f->flush(cout);
694 } else {
695 if (r > 0)
696 cout << tbl;
697 }
698
699 return r;
700 }
701
702 extern "C" int krbd_create_from_context(rados_config_t cct,
703 struct krbd_ctx **pctx)
704 {
705 struct krbd_ctx *ctx = new struct krbd_ctx();
706
707 ctx->cct = reinterpret_cast<CephContext *>(cct);
708 ctx->udev = udev_new();
709 if (!ctx->udev) {
710 delete ctx;
711 return -ENOMEM;
712 }
713
714 *pctx = ctx;
715 return 0;
716 }
717
718 extern "C" void krbd_destroy(struct krbd_ctx *ctx)
719 {
720 if (!ctx)
721 return;
722
723 udev_unref(ctx->udev);
724
725 delete ctx;
726 }
727
728 extern "C" int krbd_map(struct krbd_ctx *ctx, const char *pool,
729 const char *image, const char *snap,
730 const char *options, char **pdevnode)
731 {
732 string name;
733 char *devnode;
734 int r;
735
736 r = map_image(ctx, pool, image, snap, options, &name);
737 if (r < 0)
738 return r;
739
740 devnode = strdup(name.c_str());
741 if (!devnode)
742 return -ENOMEM;
743
744 *pdevnode = devnode;
745 return r;
746 }
747
748 extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
749 const char *options)
750 {
751 return unmap_image(ctx, devnode, options);
752 }
753
754 extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx, const char *pool,
755 const char *image, const char *snap,
756 const char *options)
757 {
758 return unmap_image(ctx, pool, image, snap, options);
759 }
760
761 int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f)
762 {
763 return dump_images(ctx, f);
764 }