1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
5 * rbd-nbd - RBD in userspace
7 * Copyright (C) 2015 - 2016 Kylin Corporation
9 * Author: Yunchuan Wen <yunchuan.wen@kylin-cloud.com>
10 * Li Wang <li.wang@kylin-cloud.com>
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
19 #include "include/int_types.h"
27 #include <sys/types.h>
30 #include <linux/nbd.h>
32 #include <sys/ioctl.h>
33 #include <sys/socket.h>
37 #include <boost/regex.hpp>
39 #include "mon/MonClient.h"
40 #include "common/config.h"
41 #include "common/dout.h"
43 #include "common/errno.h"
44 #include "common/module.h"
45 #include "common/safe_io.h"
46 #include "common/TextTable.h"
47 #include "common/ceph_argparse.h"
48 #include "common/Preforker.h"
49 #include "global/global_init.h"
50 #include "global/signal_handler.h"
52 #include "include/rados/librados.hpp"
53 #include "include/rbd/librbd.hpp"
54 #include "include/stringify.h"
55 #include "include/xlist.h"
57 #define dout_context g_ceph_context
58 #define dout_subsys ceph_subsys_rbd
60 #define dout_prefix *_dout << "rbd-nbd: "
66 bool exclusive
= false;
67 bool readonly
= false;
68 bool set_max_part
= false;
78 std::cout
<< "Usage: rbd-nbd [options] map <image-or-snap-spec> Map an image to nbd device\n"
79 << " unmap <device path> Unmap nbd device\n"
80 << " list-mapped List mapped nbd devices\n"
82 << " --device <device path> Specify nbd device path\n"
83 << " --read-only Map read-only\n"
84 << " --nbds_max <limit> Override for module param nbds_max\n"
85 << " --max_part <limit> Override for module param max_part\n"
86 << " --exclusive Forbid writes by other clients\n"
88 generic_server_usage();
100 #define RBD_NBD_BLKSIZE 512UL
102 #ifdef CEPH_BIG_ENDIAN
103 #define ntohll(a) (a)
104 #elif defined(CEPH_LITTLE_ENDIAN)
105 #define ntohll(a) swab(a)
107 #error "Could not determine endianess"
109 #define htonll(a) ntohll(a)
111 static int parse_args(vector
<const char*>& args
, std::ostream
*err_msg
, Config
*cfg
);
113 static void handle_signal(int signum
)
115 assert(signum
== SIGINT
|| signum
== SIGTERM
);
116 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
117 dout(20) << __func__
<< ": " << "sending NBD_DISCONNECT" << dendl
;
118 if (ioctl(nbd
, NBD_DISCONNECT
) < 0) {
119 derr
<< "rbd-nbd: disconnect failed: " << cpp_strerror(errno
) << dendl
;
121 dout(20) << __func__
<< ": " << "disconnected" << dendl
;
129 librbd::Image
&image
;
132 NBDServer(int _fd
, librbd::Image
& _image
)
135 , lock("NBDServer::Locker")
136 , reader_thread(*this, &NBDServer::reader_entry
)
137 , writer_thread(*this, &NBDServer::writer_entry
)
142 std::atomic
<bool> terminated
= { false };
146 bool expected
= false;
147 if (terminated
.compare_exchange_strong(expected
, true)) {
148 ::shutdown(fd
, SHUT_RDWR
);
150 Mutex::Locker
l(lock
);
157 xlist
<IOContext
*>::item item
;
159 struct nbd_request request
;
160 struct nbd_reply reply
;
169 friend std::ostream
&operator<<(std::ostream
&os
, const IOContext
&ctx
);
173 xlist
<IOContext
*> io_pending
;
174 xlist
<IOContext
*> io_finished
;
176 void io_start(IOContext
*ctx
)
178 Mutex::Locker
l(lock
);
179 io_pending
.push_back(&ctx
->item
);
182 void io_finish(IOContext
*ctx
)
184 Mutex::Locker
l(lock
);
185 assert(ctx
->item
.is_on_list());
186 ctx
->item
.remove_myself();
187 io_finished
.push_back(&ctx
->item
);
191 IOContext
*wait_io_finish()
193 Mutex::Locker
l(lock
);
194 while(io_finished
.empty() && !terminated
)
197 if (io_finished
.empty())
200 IOContext
*ret
= io_finished
.front();
201 io_finished
.pop_front();
208 assert(!reader_thread
.is_started());
209 Mutex::Locker
l(lock
);
210 while(!io_pending
.empty())
213 while(!io_finished
.empty()) {
214 ceph::unique_ptr
<IOContext
> free_ctx(io_finished
.front());
215 io_finished
.pop_front();
219 static void aio_callback(librbd::completion_t cb
, void *arg
)
221 librbd::RBD::AioCompletion
*aio_completion
=
222 reinterpret_cast<librbd::RBD::AioCompletion
*>(cb
);
224 IOContext
*ctx
= reinterpret_cast<IOContext
*>(arg
);
225 int ret
= aio_completion
->get_return_value();
227 dout(20) << __func__
<< ": " << *ctx
<< dendl
;
229 if (ret
== -EINVAL
) {
230 // if shrinking an image, a pagecache writeback might reference
231 // extents outside of the range of the new image extents
232 dout(5) << __func__
<< ": masking IO out-of-bounds error" << dendl
;
238 ctx
->reply
.error
= htonl(-ret
);
239 } else if ((ctx
->command
== NBD_CMD_READ
) &&
240 ret
< static_cast<int>(ctx
->request
.len
)) {
241 int pad_byte_count
= static_cast<int> (ctx
->request
.len
) - ret
;
242 ctx
->data
.append_zero(pad_byte_count
);
243 dout(20) << __func__
<< ": " << *ctx
<< ": Pad byte count: "
244 << pad_byte_count
<< dendl
;
245 ctx
->reply
.error
= 0;
247 ctx
->reply
.error
= htonl(0);
249 ctx
->server
->io_finish(ctx
);
251 aio_completion
->release();
256 while (!terminated
) {
257 ceph::unique_ptr
<IOContext
> ctx(new IOContext());
260 dout(20) << __func__
<< ": waiting for nbd request" << dendl
;
262 int r
= safe_read_exact(fd
, &ctx
->request
, sizeof(struct nbd_request
));
264 derr
<< "failed to read nbd request header: " << cpp_strerror(r
)
269 if (ctx
->request
.magic
!= htonl(NBD_REQUEST_MAGIC
)) {
270 derr
<< "invalid nbd request header" << dendl
;
274 ctx
->request
.from
= ntohll(ctx
->request
.from
);
275 ctx
->request
.type
= ntohl(ctx
->request
.type
);
276 ctx
->request
.len
= ntohl(ctx
->request
.len
);
278 ctx
->reply
.magic
= htonl(NBD_REPLY_MAGIC
);
279 memcpy(ctx
->reply
.handle
, ctx
->request
.handle
, sizeof(ctx
->reply
.handle
));
281 ctx
->command
= ctx
->request
.type
& 0x0000ffff;
283 dout(20) << *ctx
<< ": start" << dendl
;
285 switch (ctx
->command
)
288 // NBD_DO_IT will return when pipe is closed
289 dout(0) << "disconnect request received" << dendl
;
292 bufferptr
ptr(ctx
->request
.len
);
293 r
= safe_read_exact(fd
, ptr
.c_str(), ctx
->request
.len
);
295 derr
<< *ctx
<< ": failed to read nbd request data: "
296 << cpp_strerror(r
) << dendl
;
299 ctx
->data
.push_back(ptr
);
303 IOContext
*pctx
= ctx
.release();
305 librbd::RBD::AioCompletion
*c
= new librbd::RBD::AioCompletion(pctx
, aio_callback
);
306 switch (pctx
->command
)
309 image
.aio_write(pctx
->request
.from
, pctx
->request
.len
, pctx
->data
, c
);
312 image
.aio_read(pctx
->request
.from
, pctx
->request
.len
, pctx
->data
, c
);
318 image
.aio_discard(pctx
->request
.from
, pctx
->request
.len
, c
);
321 derr
<< *pctx
<< ": invalid request command" << dendl
;
326 dout(20) << __func__
<< ": terminated" << dendl
;
331 while (!terminated
) {
332 dout(20) << __func__
<< ": waiting for io request" << dendl
;
333 ceph::unique_ptr
<IOContext
> ctx(wait_io_finish());
335 dout(20) << __func__
<< ": no io requests, terminating" << dendl
;
339 dout(20) << __func__
<< ": got: " << *ctx
<< dendl
;
341 int r
= safe_write(fd
, &ctx
->reply
, sizeof(struct nbd_reply
));
343 derr
<< *ctx
<< ": failed to write reply header: " << cpp_strerror(r
)
347 if (ctx
->command
== NBD_CMD_READ
&& ctx
->reply
.error
== htonl(0)) {
348 r
= ctx
->data
.write_fd(fd
);
350 derr
<< *ctx
<< ": failed to write replay data: " << cpp_strerror(r
)
355 dout(20) << *ctx
<< ": finish" << dendl
;
357 dout(20) << __func__
<< ": terminated" << dendl
;
360 class ThreadHelper
: public Thread
363 typedef void (NBDServer::*entry_func
)();
368 ThreadHelper(NBDServer
&_server
, entry_func _func
)
373 void* entry() override
379 } reader_thread
, writer_thread
;
386 dout(10) << __func__
<< ": starting" << dendl
;
390 reader_thread
.create("rbd_reader");
391 writer_thread
.create("rbd_writer");
398 dout(10) << __func__
<< ": terminating" << dendl
;
402 reader_thread
.join();
403 writer_thread
.join();
417 std::ostream
&operator<<(std::ostream
&os
, const NBDServer::IOContext
&ctx
) {
419 os
<< "[" << std::hex
<< ntohll(*((uint64_t *)ctx
.request
.handle
));
436 os
<< " UNKNOW(" << ctx
.command
<< ") ";
440 os
<< ctx
.request
.from
<< "~" << ctx
.request
.len
<< " "
441 << ntohl(ctx
.reply
.error
) << "]";
446 class NBDWatchCtx
: public librbd::UpdateWatchCtx
450 librados::IoCtx
&io_ctx
;
451 librbd::Image
&image
;
455 librados::IoCtx
&_io_ctx
,
456 librbd::Image
&_image
,
464 ~NBDWatchCtx() override
{}
466 void handle_notify() override
468 librbd::image_info_t info
;
469 if (image
.stat(info
, sizeof(info
)) == 0) {
470 unsigned long new_size
= info
.size
;
472 if (new_size
!= size
) {
473 if (ioctl(fd
, BLKFLSBUF
, NULL
) < 0)
474 derr
<< "invalidate page cache failed: " << cpp_strerror(errno
) << dendl
;
475 if (ioctl(fd
, NBD_SET_SIZE
, new_size
) < 0) {
476 derr
<< "resize failed: " << cpp_strerror(errno
) << dendl
;
480 if (image
.invalidate_cache() < 0)
481 derr
<< "invalidate rbd cache failed" << dendl
;
487 static int open_device(const char* path
, Config
*cfg
= nullptr, bool try_load_module
= false)
489 int nbd
= open(path
, O_RDWR
);
490 bool loaded_module
= false;
492 if (nbd
< 0 && try_load_module
&& access("/sys/module/nbd", F_OK
) != 0) {
496 param
<< "nbds_max=" << cfg
->nbds_max
;
499 param
<< " max_part=" << cfg
->max_part
;
501 r
= module_load("nbd", param
.str().c_str());
503 cerr
<< "rbd-nbd: failed to load nbd kernel module: " << cpp_strerror(-r
) << std::endl
;
506 loaded_module
= true;
508 nbd
= open(path
, O_RDWR
);
511 if (try_load_module
&& !loaded_module
&&
512 (cfg
->nbds_max
|| cfg
->set_max_part
)) {
513 cerr
<< "rbd-nbd: ignoring kernel module parameter options: nbd module already loaded"
520 static int check_device_size(int nbd_index
, unsigned long expected_size
)
522 // There are bugs with some older kernel versions that result in an
523 // overflow for large image sizes. This check is to ensure we are
526 unsigned long size
= 0;
527 std::string path
= "/sys/block/nbd" + stringify(nbd_index
) + "/size";
529 ifs
.open(path
.c_str(), std::ifstream::in
);
530 if (!ifs
.is_open()) {
531 cerr
<< "rbd-nbd: failed to open " << path
<< std::endl
;
535 size
*= RBD_NBD_BLKSIZE
;
538 // Newer kernel versions will report real size only after nbd
539 // connect. Assume this is the case and return success.
543 if (size
!= expected_size
) {
544 cerr
<< "rbd-nbd: kernel reported invalid device size (" << size
545 << ", expected " << expected_size
<< ")" << std::endl
;
552 static int do_map(int argc
, const char *argv
[], Config
*cfg
)
556 librados::Rados rados
;
558 librados::IoCtx io_ctx
;
568 librbd::image_info_t info
;
572 vector
<const char*> args
;
573 argv_to_vec(argc
, argv
, args
);
576 auto cct
= global_init(NULL
, args
, CEPH_ENTITY_TYPE_CLIENT
,
577 CODE_ENVIRONMENT_DAEMON
,
578 CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS
);
579 g_ceph_context
->_conf
->set_val_or_die("pid_file", "");
581 if (global_init_prefork(g_ceph_context
) >= 0) {
583 r
= forker
.prefork(err
);
585 cerr
<< err
<< std::endl
;
589 if (forker
.is_parent()) {
590 global_init_postfork_start(g_ceph_context
);
591 if (forker
.parent_wait(err
) != 0) {
598 common_init_finish(g_ceph_context
);
599 global_init_chdir(g_ceph_context
);
601 if (socketpair(AF_UNIX
, SOCK_STREAM
, 0, fd
) == -1) {
606 if (cfg
->devpath
.empty()) {
608 bool try_load_module
= true;
610 snprintf(dev
, sizeof(dev
), "/dev/nbd%d", index
);
612 nbd
= open_device(dev
, cfg
, try_load_module
);
613 try_load_module
= false;
616 cerr
<< "rbd-nbd: failed to find unused device" << std::endl
;
620 r
= ioctl(nbd
, NBD_SET_SOCK
, fd
[0]);
631 r
= sscanf(cfg
->devpath
.c_str(), "/dev/nbd%d", &index
);
633 cerr
<< "rbd-nbd: invalid device path: " << cfg
->devpath
634 << " (expected /dev/nbd{num})" << std::endl
;
637 nbd
= open_device(cfg
->devpath
.c_str(), cfg
, true);
640 cerr
<< "rbd-nbd: failed to open device: " << cfg
->devpath
<< std::endl
;
644 r
= ioctl(nbd
, NBD_SET_SOCK
, fd
[0]);
647 cerr
<< "rbd-nbd: the device " << cfg
->devpath
<< " is busy" << std::endl
;
653 flags
= NBD_FLAG_SEND_FLUSH
| NBD_FLAG_SEND_TRIM
| NBD_FLAG_HAS_FLAGS
;
654 if (!cfg
->snapname
.empty() || cfg
->readonly
) {
655 flags
|= NBD_FLAG_READ_ONLY
;
659 r
= rados
.init_with_context(g_ceph_context
);
667 r
= rados
.ioctx_create(cfg
->poolname
.c_str(), io_ctx
);
671 r
= rbd
.open(io_ctx
, image
, cfg
->imgname
.c_str());
675 if (cfg
->exclusive
) {
676 r
= image
.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE
);
678 cerr
<< "rbd-nbd: failed to acquire exclusive lock: " << cpp_strerror(r
)
684 if (!cfg
->snapname
.empty()) {
685 r
= image
.snap_set(cfg
->snapname
.c_str());
690 r
= image
.stat(info
, sizeof(info
));
694 r
= ioctl(nbd
, NBD_SET_BLKSIZE
, RBD_NBD_BLKSIZE
);
700 if (info
.size
> ULONG_MAX
) {
702 cerr
<< "rbd-nbd: image is too large (" << prettybyte_t(info
.size
)
703 << ", max is " << prettybyte_t(ULONG_MAX
) << ")" << std::endl
;
709 r
= ioctl(nbd
, NBD_SET_SIZE
, size
);
715 r
= check_device_size(index
, size
);
720 ioctl(nbd
, NBD_SET_FLAGS
, flags
);
722 r
= ioctl(nbd
, BLKROSET
, (unsigned long) &read_only
);
731 NBDWatchCtx
watch_ctx(nbd
, io_ctx
, image
, info
.size
);
732 r
= image
.update_watch(&watch_ctx
, &handle
);
736 cout
<< cfg
->devpath
<< std::endl
;
738 if (g_conf
->daemonize
) {
740 global_init_postfork_start(g_ceph_context
);
741 global_init_postfork_finish(g_ceph_context
);
745 NBDServer
server(fd
[1], image
);
749 init_async_signal_handler();
750 register_async_signal_handler(SIGHUP
, sighup_handler
);
751 register_async_signal_handler_oneshot(SIGINT
, handle_signal
);
752 register_async_signal_handler_oneshot(SIGTERM
, handle_signal
);
754 ioctl(nbd
, NBD_DO_IT
);
756 unregister_async_signal_handler(SIGHUP
, sighup_handler
);
757 unregister_async_signal_handler(SIGINT
, handle_signal
);
758 unregister_async_signal_handler(SIGTERM
, handle_signal
);
759 shutdown_async_signal_handler();
764 r
= image
.update_unwatch(handle
);
770 ioctl(nbd
, NBD_CLEAR_SOCK
);
771 cerr
<< "rbd-nbd: failed to map, status: " << cpp_strerror(-r
) << std::endl
;
782 forker
.exit(r
< 0 ? EXIT_FAILURE
: 0);
787 static int do_unmap(const std::string
&devpath
)
791 int nbd
= open_device(devpath
.c_str());
793 cerr
<< "rbd-nbd: failed to open device: " << devpath
<< std::endl
;
797 r
= ioctl(nbd
, NBD_DISCONNECT
);
799 cerr
<< "rbd-nbd: the device is not used" << std::endl
;
807 static int parse_imgpath(const std::string
&imgpath
, Config
*cfg
)
809 boost::regex
pattern("^(?:([^/@]+)/)?([^/@]+)(?:@([^/@]+))?$");
811 if (!boost::regex_match(imgpath
, match
, pattern
)) {
812 std::cerr
<< "rbd-nbd: invalid spec '" << imgpath
<< "'" << std::endl
;
816 if (match
[1].matched
) {
817 cfg
->poolname
= match
[1];
820 cfg
->imgname
= match
[2];
822 if (match
[3].matched
)
823 cfg
->snapname
= match
[3];
828 static int get_mapped_info(int pid
, Config
*cfg
)
831 std::string path
= "/proc/" + stringify(pid
) + "/cmdline";
834 std::vector
<const char*> args
;
836 ifs
.open(path
.c_str(), std::ifstream::in
);
837 assert (ifs
.is_open());
840 for (unsigned i
= 0; i
< cmdline
.size(); i
++) {
841 const char *arg
= &cmdline
[i
];
843 if (strcmp(basename(arg
) , "rbd-nbd") != 0) {
850 while (cmdline
[i
] != '\0') {
855 std::ostringstream err_msg
;
856 r
= parse_args(args
, &err_msg
, cfg
);
860 static int get_map_pid(const std::string
& pid_path
)
864 ifs
.open(pid_path
.c_str(), std::ifstream::in
);
865 if (!ifs
.is_open()) {
872 static int do_list_mapped_devices()
875 bool should_print
= false;
879 std::string default_pool_name
;
883 tbl
.define_column("pid", TextTable::LEFT
, TextTable::LEFT
);
884 tbl
.define_column("pool", TextTable::LEFT
, TextTable::LEFT
);
885 tbl
.define_column("image", TextTable::LEFT
, TextTable::LEFT
);
886 tbl
.define_column("snap", TextTable::LEFT
, TextTable::LEFT
);
887 tbl
.define_column("device", TextTable::LEFT
, TextTable::LEFT
);
890 std::string nbd_path
= "/sys/block/nbd" + stringify(index
);
891 if(access(nbd_path
.c_str(), F_OK
) != 0) {
894 std::string pid_path
= nbd_path
+ "/pid";
895 pid
= get_map_pid(pid_path
);
899 r
= get_mapped_info(pid
, &cfg
);
905 if (cfg
.snapname
.empty()) {
908 tbl
<< pid
<< cfg
.poolname
<< cfg
.imgname
<< cfg
.snapname
909 << "/dev/nbd" + stringify(index
) << TextTable::endrow
;
921 static int parse_args(vector
<const char*>& args
, std::ostream
*err_msg
, Config
*cfg
)
923 std::vector
<const char*>::iterator i
;
924 std::ostringstream err
;
927 config
.parse_config_files(nullptr, nullptr, 0);
929 config
.parse_argv(args
);
930 cfg
->poolname
= config
.rbd_default_pool
;
932 for (i
= args
.begin(); i
!= args
.end(); ) {
933 if (ceph_argparse_flag(args
, i
, "-h", "--help", (char*)NULL
)) {
935 } else if (ceph_argparse_witharg(args
, i
, &cfg
->devpath
, "--device", (char *)NULL
)) {
936 } else if (ceph_argparse_witharg(args
, i
, &cfg
->nbds_max
, err
, "--nbds_max", (char *)NULL
)) {
937 if (!err
.str().empty()) {
938 *err_msg
<< "rbd-nbd: " << err
.str();
941 if (cfg
->nbds_max
< 0) {
942 *err_msg
<< "rbd-nbd: Invalid argument for nbds_max!";
945 } else if (ceph_argparse_witharg(args
, i
, &cfg
->max_part
, err
, "--max_part", (char *)NULL
)) {
946 if (!err
.str().empty()) {
947 *err_msg
<< "rbd-nbd: " << err
.str();
950 if ((cfg
->max_part
< 0) || (cfg
->max_part
> 255)) {
951 *err_msg
<< "rbd-nbd: Invalid argument for max_part(0~255)!";
954 cfg
->set_max_part
= true;
955 } else if (ceph_argparse_flag(args
, i
, "--read-only", (char *)NULL
)) {
956 cfg
->readonly
= true;
957 } else if (ceph_argparse_flag(args
, i
, "--exclusive", (char *)NULL
)) {
958 cfg
->exclusive
= true;
964 if (args
.begin() != args
.end()) {
965 if (strcmp(*args
.begin(), "map") == 0) {
967 } else if (strcmp(*args
.begin(), "unmap") == 0) {
969 } else if (strcmp(*args
.begin(), "list-mapped") == 0) {
972 *err_msg
<< "rbd-nbd: unknown command: " << *args
.begin();
975 args
.erase(args
.begin());
979 *err_msg
<< "rbd-nbd: must specify command";
985 if (args
.begin() == args
.end()) {
986 *err_msg
<< "rbd-nbd: must specify image-or-snap-spec";
989 if (parse_imgpath(string(*args
.begin()), cfg
) < 0)
991 args
.erase(args
.begin());
994 if (args
.begin() == args
.end()) {
995 *err_msg
<< "rbd-nbd: must specify nbd device path";
998 cfg
->devpath
= *args
.begin();
999 args
.erase(args
.begin());
1006 if (args
.begin() != args
.end()) {
1007 *err_msg
<< "rbd-nbd: unknown args: " << *args
.begin();
1014 static int rbd_nbd(int argc
, const char *argv
[])
1018 vector
<const char*> args
;
1019 argv_to_vec(argc
, argv
, args
);
1021 std::ostringstream err_msg
;
1022 r
= parse_args(args
, &err_msg
, &cfg
);
1023 if (r
== -ENODATA
) {
1027 cerr
<< err_msg
.str() << std::endl
;
1033 if (cfg
.imgname
.empty()) {
1034 cerr
<< "rbd-nbd: image name was not specified" << std::endl
;
1038 r
= do_map(argc
, argv
, &cfg
);
1043 r
= do_unmap(cfg
.devpath
);
1048 r
= do_list_mapped_devices();
1060 int main(int argc
, const char *argv
[])
1062 int r
= rbd_nbd(argc
, argv
);
1064 return EXIT_FAILURE
;