1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
5 * rbd-nbd - RBD in userspace
7 * Copyright (C) 2015 - 2016 Kylin Corporation
9 * Author: Yunchuan Wen <yunchuan.wen@kylin-cloud.com>
10 * Li Wang <li.wang@kylin-cloud.com>
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
19 #include "include/int_types.h"
27 #include <sys/types.h>
30 #include <linux/nbd.h>
32 #include <sys/ioctl.h>
33 #include <sys/socket.h>
37 #include <boost/regex.hpp>
39 #include "mon/MonClient.h"
40 #include "common/config.h"
41 #include "common/dout.h"
43 #include "common/errno.h"
44 #include "common/module.h"
45 #include "common/safe_io.h"
46 #include "common/TextTable.h"
47 #include "common/ceph_argparse.h"
48 #include "common/Preforker.h"
49 #include "common/version.h"
50 #include "global/global_init.h"
51 #include "global/signal_handler.h"
53 #include "include/rados/librados.hpp"
54 #include "include/rbd/librbd.hpp"
55 #include "include/stringify.h"
56 #include "include/xlist.h"
58 #define dout_context g_ceph_context
59 #define dout_subsys ceph_subsys_rbd
61 #define dout_prefix *_dout << "rbd-nbd: "
// Boolean mapping options reached through `Config *cfg` elsewhere in this
// file; all three default to false.
// exclusive: set by --exclusive; do_map() then acquires
// RBD_LOCK_MODE_EXCLUSIVE on the image.
67 bool exclusive
= false;
// readonly: set by --read-only; do_map() then adds NBD_FLAG_READ_ONLY.
68 bool readonly
= false;
// set_max_part: set by parse_args() once --max_part has been accepted.
69 bool set_max_part
= false;
// Help text: streams the command and option summary below to std::cout and
// then calls generic_server_usage(). (The enclosing function signature is
// not visible in this listing.)
79 std::cout
<< "Usage: rbd-nbd [options] map <image-or-snap-spec> Map an image to nbd device\n"
80 << " unmap <device path> Unmap nbd device\n"
81 << " list-mapped List mapped nbd devices\n"
83 << " --device <device path> Specify nbd device path\n"
84 << " --read-only Map read-only\n"
85 << " --nbds_max <limit> Override for module param nbds_max\n"
86 << " --max_part <limit> Override for module param max_part\n"
87 << " --exclusive Forbid writes by other clients\n"
89 generic_server_usage();
101 #define RBD_NBD_BLKSIZE 512UL
104 #define VERSION_INFO 2
106 #ifdef CEPH_BIG_ENDIAN
107 #define ntohll(a) (a)
108 #elif defined(CEPH_LITTLE_ENDIAN)
109 #define ntohll(a) swab(a)
111 #error "Could not determine endianess"
113 #define htonll(a) ntohll(a)
// Forward declaration: get_mapped_info() calls parse_args() before its
// definition later in this file.
115 static int parse_args(vector
<const char*>& args
, std::ostream
*err_msg
, Config
*cfg
);
// Callback that do_map() registers for SIGINT and SIGTERM: logs the signal
// and issues the NBD_DISCONNECT ioctl on `nbd` (a descriptor declared
// outside this function), logging whether the ioctl failed.
117 static void handle_signal(int signum
)
// Only the two signals this handler is registered for are expected.
119 assert(signum
== SIGINT
|| signum
== SIGTERM
);
120 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
121 dout(20) << __func__
<< ": " << "sending NBD_DISCONNECT" << dendl
;
122 if (ioctl(nbd
, NBD_DISCONNECT
) < 0) {
// errno is reported as left by the failed ioctl.
123 derr
<< "rbd-nbd: disconnect failed: " << cpp_strerror(errno
) << dendl
;
125 dout(20) << __func__
<< ": " << "disconnected" << dendl
;
// Image the server issues aio_read / aio_write / aio_discard against.
133 librbd::Image
&image
;
// Constructor: receives the socket descriptor and the image, names the
// lock "NBDServer::Locker" and binds the two helper threads to
// reader_entry() and writer_entry(). The threads are only constructed
// here; they are created later via reader_thread.create() /
// writer_thread.create().
136 NBDServer(int _fd
, librbd::Image
& _image
)
139 , lock("NBDServer::Locker")
140 , reader_thread(*this, &NBDServer::reader_entry
)
141 , writer_thread(*this, &NBDServer::writer_entry
)
// Termination flag; the reader and writer loops run while it is false.
146 std::atomic
<bool> terminated
= { false };
// Shutdown path (enclosing signature not visible in this listing):
// compare_exchange_strong lets only the first caller flip `terminated`
// from false to true, and that caller shuts down both directions of the
// socket before taking `lock`.
150 bool expected
= false;
151 if (terminated
.compare_exchange_strong(expected
, true)) {
152 ::shutdown(fd
, SHUT_RDWR
);
154 Mutex::Locker
l(lock
);
// Per-request state (IOContext members; the struct header is not visible
// in this listing).
// List hook used to queue the context on io_pending / io_finished.
161 xlist
<IOContext
*>::item item
;
// Request header as read from the socket by the reader loop.
163 struct nbd_request request
;
// Reply header that the writer loop sends back.
164 struct nbd_reply reply
;
// Lets log statements stream an IOContext directly.
173 friend std::ostream
&operator<<(std::ostream
&os
, const IOContext
&ctx
);
// Requests registered by io_start() and not yet completed.
177 xlist
<IOContext
*> io_pending
;
// Requests completed by io_finish() and waiting for the writer loop.
178 xlist
<IOContext
*> io_finished
;
// Registers a request as in flight: appends its list hook to io_pending
// while holding `lock`.
180 void io_start(IOContext
*ctx
)
182 Mutex::Locker
l(lock
);
183 io_pending
.push_back(&ctx
->item
);
// Moves a completed request from the list it is currently on to
// io_finished, under `lock`. Called from aio_callback().
186 void io_finish(IOContext
*ctx
)
188 Mutex::Locker
l(lock
);
// The context must still be queued on a list at this point.
189 assert(ctx
->item
.is_on_list());
190 ctx
->item
.remove_myself();
191 io_finished
.push_back(&ctx
->item
);
// Hands the caller the oldest completed request. Under `lock`, the loop
// condition below holds while nothing has finished and the server has not
// been terminated (the loop body is not visible in this listing);
// afterwards the front of io_finished is taken and popped.
195 IOContext
*wait_io_finish()
197 Mutex::Locker
l(lock
);
198 while(io_finished
.empty() && !terminated
)
// Still empty after the loop; the statement taken in this case is not
// visible here. NOTE(review): the writer loop logs "no io requests,
// terminating" after this call, so presumably a null pointer is returned.
201 if (io_finished
.empty())
204 IOContext
*ret
= io_finished
.front();
205 io_finished
.pop_front();
// Drain fragment (enclosing signature not visible in this listing). It
// must only run once the reader thread is no longer started. Under `lock`
// it loops while io_pending is non-empty (loop body not visible), then
// frees every IOContext left on io_finished.
212 assert(!reader_thread
.is_started());
213 Mutex::Locker
l(lock
);
214 while(!io_pending
.empty())
217 while(!io_finished
.empty()) {
// unique_ptr takes ownership so the context is deleted at scope exit.
218 ceph::unique_ptr
<IOContext
> free_ctx(io_finished
.front());
219 io_finished
.pop_front();
// librbd completion callback for requests submitted by the reader loop.
// `cb` is the AioCompletion and `arg` is the IOContext that was passed to
// the AioCompletion constructor. Fills in ctx->reply.error, queues the
// context for the writer via io_finish(), and releases the completion.
// Some lines of the original body are not visible in this listing.
223 static void aio_callback(librbd::completion_t cb
, void *arg
)
225 librbd::RBD::AioCompletion
*aio_completion
=
226 reinterpret_cast<librbd::RBD::AioCompletion
*>(cb
);
228 IOContext
*ctx
= reinterpret_cast<IOContext
*>(arg
);
229 int ret
= aio_completion
->get_return_value();
231 dout(20) << __func__
<< ": " << *ctx
<< dendl
;
233 if (ret
== -EINVAL
) {
234 // if shrinking an image, a pagecache writeback might reference
235 // extents outside of the range of the new image extents
236 dout(5) << __func__
<< ": masking IO out-of-bounds error" << dendl
;
// The error code is stored positive and in network byte order.
242 ctx
->reply
.error
= htonl(-ret
);
// Short read: pad the buffer with zeros up to the requested length and
// report success.
243 } else if ((ctx
->command
== NBD_CMD_READ
) &&
244 ret
< static_cast<int>(ctx
->request
.len
)) {
245 int pad_byte_count
= static_cast<int> (ctx
->request
.len
) - ret
;
246 ctx
->data
.append_zero(pad_byte_count
);
247 dout(20) << __func__
<< ": " << *ctx
<< ": Pad byte count: "
248 << pad_byte_count
<< dendl
;
249 ctx
->reply
.error
= 0;
251 ctx
->reply
.error
= htonl(0);
// Hand the context over to the writer side.
253 ctx
->server
->io_finish(ctx
);
255 aio_completion
->release();
// Request loop fragment (enclosing signature, case labels and error exits
// are not visible in this listing). While the server is not terminated it
// reads one nbd_request header from `fd`, checks its magic, converts the
// header fields to host byte order, prepares the reply header, optionally
// reads a payload, and submits the matching asynchronous librbd operation.
260 while (!terminated
) {
261 ceph::unique_ptr
<IOContext
> ctx(new IOContext());
264 dout(20) << __func__
<< ": waiting for nbd request" << dendl
;
266 int r
= safe_read_exact(fd
, &ctx
->request
, sizeof(struct nbd_request
));
268 derr
<< "failed to read nbd request header: " << cpp_strerror(r
)
// The magic is compared in network byte order, before any conversion.
273 if (ctx
->request
.magic
!= htonl(NBD_REQUEST_MAGIC
)) {
274 derr
<< "invalid nbd request header" << dendl
;
// Convert offset, type and length to host byte order in place.
278 ctx
->request
.from
= ntohll(ctx
->request
.from
);
279 ctx
->request
.type
= ntohl(ctx
->request
.type
);
280 ctx
->request
.len
= ntohl(ctx
->request
.len
);
// Pre-fill the reply: magic plus a copy of the request handle.
282 ctx
->reply
.magic
= htonl(NBD_REPLY_MAGIC
);
283 memcpy(ctx
->reply
.handle
, ctx
->request
.handle
, sizeof(ctx
->reply
.handle
));
// The command is taken from the low 16 bits of the request type.
285 ctx
->command
= ctx
->request
.type
& 0x0000ffff;
287 dout(20) << *ctx
<< ": start" << dendl
;
289 switch (ctx
->command
)
292 // NBD_DO_IT will return when pipe is closed
293 dout(0) << "disconnect request received" << dendl
;
// Read request.len payload bytes that follow the header into ctx->data.
// NOTE(review): presumably the NBD_CMD_WRITE case - the case label is not
// visible here.
296 bufferptr
ptr(ctx
->request
.len
);
297 r
= safe_read_exact(fd
, ptr
.c_str(), ctx
->request
.len
);
299 derr
<< *ctx
<< ": failed to read nbd request data: "
300 << cpp_strerror(r
) << dendl
;
303 ctx
->data
.push_back(ptr
);
// Ownership leaves the unique_ptr: the raw pointer travels through the
// AioCompletion to aio_callback().
307 IOContext
*pctx
= ctx
.release();
309 librbd::RBD::AioCompletion
*c
= new librbd::RBD::AioCompletion(pctx
, aio_callback
);
310 switch (pctx
->command
)
313 image
.aio_write(pctx
->request
.from
, pctx
->request
.len
, pctx
->data
, c
);
316 image
.aio_read(pctx
->request
.from
, pctx
->request
.len
, pctx
->data
, c
);
322 image
.aio_discard(pctx
->request
.from
, pctx
->request
.len
, c
);
325 derr
<< *pctx
<< ": invalid request command" << dendl
;
330 dout(20) << __func__
<< ": terminated" << dendl
;
// Reply loop fragment (enclosing signature and error exits are not visible
// in this listing). While the server is not terminated it takes the next
// completed request from wait_io_finish(), writes its reply header to
// `fd`, and for successful reads also writes the data buffer.
335 while (!terminated
) {
336 dout(20) << __func__
<< ": waiting for io request" << dendl
;
// unique_ptr owns the context from here, so it is freed after the reply.
337 ceph::unique_ptr
<IOContext
> ctx(wait_io_finish());
339 dout(20) << __func__
<< ": no io requests, terminating" << dendl
;
343 dout(20) << __func__
<< ": got: " << *ctx
<< dendl
;
345 int r
= safe_write(fd
, &ctx
->reply
, sizeof(struct nbd_reply
));
347 derr
<< *ctx
<< ": failed to write reply header: " << cpp_strerror(r
)
// Only a read that completed without error carries a payload.
351 if (ctx
->command
== NBD_CMD_READ
&& ctx
->reply
.error
== htonl(0)) {
352 r
= ctx
->data
.write_fd(fd
);
// Message corrected from "replay data": this is the reply payload.
354 derr
<< *ctx
<< ": failed to write reply data: " << cpp_strerror(r
)
359 dout(20) << *ctx
<< ": finish" << dendl
;
361 dout(20) << __func__
<< ": terminated" << dendl
;
// Thread subclass built from an NBDServer reference and a pointer to one
// of its member functions; reader_thread and writer_thread are its two
// instances. The body of entry() is not visible in this listing.
364 class ThreadHelper
: public Thread
// Pointer-to-member type for the NBDServer function a helper is bound to.
367 typedef void (NBDServer::*entry_func
)();
372 ThreadHelper(NBDServer
&_server
, entry_func _func
)
377 void* entry() override
383 } reader_thread
, writer_thread
;
// Start-up fragment (enclosing signature not visible in this listing):
// logs and creates the two helper threads under the names "rbd_reader"
// and "rbd_writer".
390 dout(10) << __func__
<< ": starting" << dendl
;
394 reader_thread
.create("rbd_reader");
395 writer_thread
.create("rbd_writer");
// Stop fragment (enclosing signature not visible in this listing): logs
// and joins the reader thread, then the writer thread.
402 dout(10) << __func__
<< ": terminating" << dendl
;
406 reader_thread
.join();
407 writer_thread
.join();
// Log formatter for an IOContext. Prints "[" and the request handle in
// hex, a command label (only the fallback label is visible in this
// listing), then "<from>~<len> <error>]" with the reply error converted
// back to host byte order.
421 std::ostream
&operator<<(std::ostream
&os
, const NBDServer::IOContext
&ctx
) {
// The handle bytes are read as one 64-bit value and byte-swapped through
// ntohll for display.
423 os
<< "[" << std::hex
<< ntohll(*((uint64_t *)ctx
.request
.handle
));
// Fallback label for a command that is not recognised (spelling fixed
// from "UNKNOW").
440 os
<< " UNKNOWN(" << ctx
.command
<< ") ";
444 os
<< ctx
.request
.from
<< "~" << ctx
.request
.len
<< " "
445 << ntohl(ctx
.reply
.error
) << "]";
// Image update watcher; do_map() registers an instance through
// image.update_watch(). On notification it re-stats the image and, when
// the size differs from the remembered `size`, flushes the block device's
// buffers (BLKFLSBUF), pushes the new size with NBD_SET_SIZE and
// invalidates the rbd cache. Some lines between the resize and the cache
// invalidation are not visible in this listing.
450 class NBDWatchCtx
: public librbd::UpdateWatchCtx
454 librados::IoCtx
&io_ctx
;
455 librbd::Image
&image
;
459 librados::IoCtx
&_io_ctx
,
460 librbd::Image
&_image
,
468 ~NBDWatchCtx() override
{}
470 void handle_notify() override
472 librbd::image_info_t info
;
// A failed stat is silently ignored; nothing below runs in that case.
473 if (image
.stat(info
, sizeof(info
)) == 0) {
474 unsigned long new_size
= info
.size
;
476 if (new_size
!= size
) {
// A failed flush is logged but does not stop the resize.
477 if (ioctl(fd
, BLKFLSBUF
, NULL
) < 0)
478 derr
<< "invalidate page cache failed: " << cpp_strerror(errno
) << dendl
;
479 if (ioctl(fd
, NBD_SET_SIZE
, new_size
) < 0) {
480 derr
<< "resize failed: " << cpp_strerror(errno
) << dendl
;
484 if (image
.invalidate_cache() < 0)
485 derr
<< "invalidate rbd cache failed" << dendl
;
// Opens `path` read-write. If the open fails, try_load_module is set and
// /sys/module/nbd does not exist, the nbd kernel module is loaded with a
// parameter string built from cfg (nbds_max / max_part) and the open is
// retried. If the module was not loaded by this call but module options
// were given, a warning is printed. The return statement is not visible
// in this listing.
// NOTE(review): cfg defaults to nullptr but is dereferenced whenever
// try_load_module is true - callers that enable it must pass a Config.
491 static int open_device(const char* path
, Config
*cfg
= nullptr, bool try_load_module
= false)
493 int nbd
= open(path
, O_RDWR
);
494 bool loaded_module
= false;
// The module is loaded only when the open failed and sysfs shows no nbd
// module.
496 if (nbd
< 0 && try_load_module
&& access("/sys/module/nbd", F_OK
) != 0) {
500 param
<< "nbds_max=" << cfg
->nbds_max
;
503 param
<< " max_part=" << cfg
->max_part
;
505 r
= module_load("nbd", param
.str().c_str());
507 cerr
<< "rbd-nbd: failed to load nbd kernel module: " << cpp_strerror(-r
) << std::endl
;
510 loaded_module
= true;
// Retry now that the module is present.
512 nbd
= open(path
, O_RDWR
);
// Module parameters cannot be applied to a module that was already loaded.
515 if (try_load_module
&& !loaded_module
&&
516 (cfg
->nbds_max
|| cfg
->set_max_part
)) {
517 cerr
<< "rbd-nbd: ignoring kernel module parameter options: nbd module already loaded"
// Reads /sys/block/nbd<nbd_index>/size, scales it by RBD_NBD_BLKSIZE and
// compares it with expected_size, printing an error on mismatch. The
// return statements and the read of the sysfs value are not visible in
// this listing.
524 static int check_device_size(int nbd_index
, unsigned long expected_size
)
526 // There are bugs with some older kernel versions that result in an
527 // overflow for large image sizes. This check is to ensure we are
// not affected: a size that differs from the expected one is reported.
530 unsigned long size
= 0;
531 std::string path
= "/sys/block/nbd" + stringify(nbd_index
) + "/size";
533 ifs
.open(path
.c_str(), std::ifstream::in
);
534 if (!ifs
.is_open()) {
535 cerr
<< "rbd-nbd: failed to open " << path
<< std::endl
;
// The value read from sysfs is multiplied by the block size to get bytes.
539 size
*= RBD_NBD_BLKSIZE
;
542 // Newer kernel versions will report real size only after nbd
543 // connect. Assume this is the case and return success.
547 if (size
!= expected_size
) {
548 cerr
<< "rbd-nbd: kernel reported invalid device size (" << size
549 << ", expected " << expected_size
<< ")" << std::endl
;
// Maps the image described by cfg onto an nbd device and serves it until
// the NBD_DO_IT ioctl returns. Steps visible in this listing, in order:
// global init and prefork, socketpair creation, device selection with
// NBD_SET_SOCK, cluster/pool/image open (optional exclusive lock and
// snapshot), block size / size / flags setup, update-watch registration,
// NBDServer start-up, signal handler registration, NBD_DO_IT, and
// teardown. Most error checks and cleanup labels are not visible here.
556 static int do_map(int argc
, const char *argv
[], Config
*cfg
)
560 librados::Rados rados
;
562 librados::IoCtx io_ctx
;
572 librbd::image_info_t info
;
576 vector
<const char*> args
;
577 argv_to_vec(argc
, argv
, args
);
580 auto cct
= global_init(NULL
, args
, CEPH_ENTITY_TYPE_CLIENT
,
581 CODE_ENVIRONMENT_DAEMON
,
582 CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS
);
// The pid_file option is forced to the empty string.
583 g_ceph_context
->_conf
->set_val_or_die("pid_file", "");
585 if (global_init_prefork(g_ceph_context
) >= 0) {
587 r
= forker
.prefork(err
);
589 cerr
<< err
<< std::endl
;
// The parent process waits on the child through forker.parent_wait().
593 if (forker
.is_parent()) {
594 global_init_postfork_start(g_ceph_context
);
595 if (forker
.parent_wait(err
) != 0) {
602 common_init_finish(g_ceph_context
);
603 global_init_chdir(g_ceph_context
);
// fd[0] is handed to the kernel through NBD_SET_SOCK; fd[1] is served by
// NBDServer further down.
605 if (socketpair(AF_UNIX
, SOCK_STREAM
, 0, fd
) == -1) {
// No device path given: probe /dev/nbd<index>. try_load_module is true
// only for the first open_device() call.
610 if (cfg
->devpath
.empty()) {
612 bool try_load_module
= true;
614 snprintf(dev
, sizeof(dev
), "/dev/nbd%d", index
);
616 nbd
= open_device(dev
, cfg
, try_load_module
);
617 try_load_module
= false;
620 cerr
<< "rbd-nbd: failed to find unused device" << std::endl
;
624 r
= ioctl(nbd
, NBD_SET_SOCK
, fd
[0]);
// Explicit device path: the index is parsed out of "/dev/nbd<num>".
635 r
= sscanf(cfg
->devpath
.c_str(), "/dev/nbd%d", &index
);
637 cerr
<< "rbd-nbd: invalid device path: " << cfg
->devpath
638 << " (expected /dev/nbd{num})" << std::endl
;
641 nbd
= open_device(cfg
->devpath
.c_str(), cfg
, true);
644 cerr
<< "rbd-nbd: failed to open device: " << cfg
->devpath
<< std::endl
;
648 r
= ioctl(nbd
, NBD_SET_SOCK
, fd
[0]);
651 cerr
<< "rbd-nbd: the device " << cfg
->devpath
<< " is busy" << std::endl
;
// Flush and trim are always advertised; a snapshot mapping or --read-only
// adds the read-only flag.
657 flags
= NBD_FLAG_SEND_FLUSH
| NBD_FLAG_SEND_TRIM
| NBD_FLAG_HAS_FLAGS
;
658 if (!cfg
->snapname
.empty() || cfg
->readonly
) {
659 flags
|= NBD_FLAG_READ_ONLY
;
663 r
= rados
.init_with_context(g_ceph_context
);
671 r
= rados
.ioctx_create(cfg
->poolname
.c_str(), io_ctx
);
675 r
= rbd
.open(io_ctx
, image
, cfg
->imgname
.c_str());
679 if (cfg
->exclusive
) {
680 r
= image
.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE
);
682 cerr
<< "rbd-nbd: failed to acquire exclusive lock: " << cpp_strerror(r
)
688 if (!cfg
->snapname
.empty()) {
689 r
= image
.snap_set(cfg
->snapname
.c_str());
694 r
= image
.stat(info
, sizeof(info
));
698 r
= ioctl(nbd
, NBD_SET_BLKSIZE
, RBD_NBD_BLKSIZE
);
// Images larger than ULONG_MAX bytes are rejected with an error message.
704 if (info
.size
> ULONG_MAX
) {
706 cerr
<< "rbd-nbd: image is too large (" << prettybyte_t(info
.size
)
707 << ", max is " << prettybyte_t(ULONG_MAX
) << ")" << std::endl
;
713 r
= ioctl(nbd
, NBD_SET_SIZE
, size
);
// Cross-check the size the kernel reports in sysfs.
719 r
= check_device_size(index
, size
);
// The result of NBD_SET_FLAGS is not checked.
724 ioctl(nbd
, NBD_SET_FLAGS
, flags
);
726 r
= ioctl(nbd
, BLKROSET
, (unsigned long) &read_only
);
// The watcher lets later image resizes be propagated to the device.
735 NBDWatchCtx
watch_ctx(nbd
, io_ctx
, image
, info
.size
);
736 r
= image
.update_watch(&watch_ctx
, &handle
);
// The device path is printed on stdout.
740 cout
<< cfg
->devpath
<< std::endl
;
742 if (g_conf
->daemonize
) {
744 global_init_postfork_start(g_ceph_context
);
745 global_init_postfork_finish(g_ceph_context
);
749 NBDServer
server(fd
[1], image
);
// SIGINT and SIGTERM are routed to handle_signal as one-shot handlers.
753 init_async_signal_handler();
754 register_async_signal_handler(SIGHUP
, sighup_handler
);
755 register_async_signal_handler_oneshot(SIGINT
, handle_signal
);
756 register_async_signal_handler_oneshot(SIGTERM
, handle_signal
);
// The result of NBD_DO_IT is not checked.
758 ioctl(nbd
, NBD_DO_IT
);
760 unregister_async_signal_handler(SIGHUP
, sighup_handler
);
761 unregister_async_signal_handler(SIGINT
, handle_signal
);
762 unregister_async_signal_handler(SIGTERM
, handle_signal
);
763 shutdown_async_signal_handler();
768 r
= image
.update_unwatch(handle
);
774 ioctl(nbd
, NBD_CLEAR_SOCK
);
775 cerr
<< "rbd-nbd: failed to map, status: " << cpp_strerror(-r
) << std::endl
;
// The process exit status is EXIT_FAILURE when r is negative, else 0.
786 forker
.exit(r
< 0 ? EXIT_FAILURE
: 0);
// Unmaps a device: opens devpath through open_device() (no module
// loading) and issues NBD_DISCONNECT on it. Error checks and return
// statements are not visible in this listing; the two messages below are
// the visible failure reports.
791 static int do_unmap(const std::string
&devpath
)
795 int nbd
= open_device(devpath
.c_str());
797 cerr
<< "rbd-nbd: failed to open device: " << devpath
<< std::endl
;
801 r
= ioctl(nbd
, NBD_DISCONNECT
);
803 cerr
<< "rbd-nbd: the device is not used" << std::endl
;
// Splits an image spec of the form [pool/]image[@snap] into cfg. The
// whole string must match the pattern; none of the three parts may
// contain '/' or '@'. Return statements are not visible in this listing.
811 static int parse_imgpath(const std::string
&imgpath
, Config
*cfg
)
// Group 1: optional pool, group 2: image, group 3: optional snapshot.
813 boost::regex
pattern("^(?:([^/@]+)/)?([^/@]+)(?:@([^/@]+))?$");
815 if (!boost::regex_match(imgpath
, match
, pattern
)) {
816 std::cerr
<< "rbd-nbd: invalid spec '" << imgpath
<< "'" << std::endl
;
// The pool is overwritten only when the spec names one.
820 if (match
[1].matched
) {
821 cfg
->poolname
= match
[1];
824 cfg
->imgname
= match
[2];
826 if (match
[3].matched
)
827 cfg
->snapname
= match
[3];
// Recovers the mapping options of a running process: reads
// /proc/<pid>/cmdline, walks its NUL-separated arguments, compares the
// basename of an argument with "rbd-nbd", and feeds the collected
// arguments to parse_args() to fill cfg. Several statements, including
// the read itself and the return values, are not visible in this listing.
832 static int get_mapped_info(int pid
, Config
*cfg
)
835 std::string path
= "/proc/" + stringify(pid
) + "/cmdline";
838 std::vector
<const char*> args
;
840 ifs
.open(path
.c_str(), std::ifstream::in
);
// NOTE(review): this aborts if the cmdline file cannot be opened, e.g.
// when the process exits between the pid lookup and this open; returning
// an error may be preferable - confirm against callers.
841 assert (ifs
.is_open());
// Each argument begins at index i and runs to the next '\0'.
844 for (unsigned i
= 0; i
< cmdline
.size(); i
++) {
845 const char *arg
= &cmdline
[i
];
847 if (strcmp(basename(arg
) , "rbd-nbd") != 0) {
// Advance to the terminator of the current argument.
854 while (cmdline
[i
] != '\0') {
859 std::ostringstream err_msg
;
860 r
= parse_args(args
, &err_msg
, cfg
);
// Opens the file at pid_path for reading and checks that it opened. How
// the pid is read and what is returned are not visible in this listing;
// do_list_mapped_devices() passes "/sys/block/nbd<index>/pid" and stores
// the result as the pid.
864 static int get_map_pid(const std::string
& pid_path
)
868 ifs
.open(pid_path
.c_str(), std::ifstream::in
);
869 if (!ifs
.is_open()) {
// Builds a table of mapped devices with the columns pid, pool, image,
// snap and device. For each index it checks /sys/block/nbd<index>, reads
// the owning pid from its "pid" file, recovers the mapping options with
// get_mapped_info() and appends one row. Loop bounds, skip conditions and
// the final output are not visible in this listing.
876 static int do_list_mapped_devices()
879 bool should_print
= false;
883 std::string default_pool_name
;
887 tbl
.define_column("pid", TextTable::LEFT
, TextTable::LEFT
);
888 tbl
.define_column("pool", TextTable::LEFT
, TextTable::LEFT
);
889 tbl
.define_column("image", TextTable::LEFT
, TextTable::LEFT
);
890 tbl
.define_column("snap", TextTable::LEFT
, TextTable::LEFT
);
891 tbl
.define_column("device", TextTable::LEFT
, TextTable::LEFT
);
894 std::string nbd_path
= "/sys/block/nbd" + stringify(index
);
// Tests whether the sysfs entry for this index exists.
895 if(access(nbd_path
.c_str(), F_OK
) != 0) {
898 std::string pid_path
= nbd_path
+ "/pid";
899 pid
= get_map_pid(pid_path
);
903 r
= get_mapped_info(pid
, &cfg
);
909 if (cfg
.snapname
.empty()) {
912 tbl
<< pid
<< cfg
.poolname
<< cfg
.imgname
<< cfg
.snapname
913 << "/dev/nbd" + stringify(index
) << TextTable::endrow
;
// Parses the rbd-nbd command line into cfg. Configuration files and argv
// are parsed first and the pool name defaults to config.rbd_default_pool.
// Recognised options are then consumed from args, followed by the command
// word ("map", "unmap" or "list-mapped") and its positional argument.
// Problems are described in *err_msg. Return statements and the command
// dispatch are not visible in this listing.
925 static int parse_args(vector
<const char*>& args
, std::ostream
*err_msg
, Config
*cfg
)
927 std::vector
<const char*>::iterator i
;
928 std::ostringstream err
;
931 config
.parse_config_files(nullptr, nullptr, 0);
933 config
.parse_argv(args
);
// Default pool; parse_imgpath() overrides it when the spec names a pool.
934 cfg
->poolname
= config
.rbd_default_pool
;
// Option loop: the increment clause is empty, so the iterator is advanced
// inside the loop body.
936 for (i
= args
.begin(); i
!= args
.end(); ) {
937 if (ceph_argparse_flag(args
, i
, "-h", "--help", (char*)NULL
)) {
939 } else if (ceph_argparse_flag(args
, i
, "-v", "--version", (char*)NULL
)) {
941 } else if (ceph_argparse_witharg(args
, i
, &cfg
->devpath
, "--device", (char *)NULL
)) {
942 } else if (ceph_argparse_witharg(args
, i
, &cfg
->nbds_max
, err
, "--nbds_max", (char *)NULL
)) {
943 if (!err
.str().empty()) {
944 *err_msg
<< "rbd-nbd: " << err
.str();
// Negative nbds_max values are rejected.
947 if (cfg
->nbds_max
< 0) {
948 *err_msg
<< "rbd-nbd: Invalid argument for nbds_max!";
951 } else if (ceph_argparse_witharg(args
, i
, &cfg
->max_part
, err
, "--max_part", (char *)NULL
)) {
952 if (!err
.str().empty()) {
953 *err_msg
<< "rbd-nbd: " << err
.str();
// max_part must lie within 0..255.
956 if ((cfg
->max_part
< 0) || (cfg
->max_part
> 255)) {
957 *err_msg
<< "rbd-nbd: Invalid argument for max_part(0~255)!";
// Records that --max_part was given explicitly.
960 cfg
->set_max_part
= true;
961 } else if (ceph_argparse_flag(args
, i
, "--read-only", (char *)NULL
)) {
962 cfg
->readonly
= true;
963 } else if (ceph_argparse_flag(args
, i
, "--exclusive", (char *)NULL
)) {
964 cfg
->exclusive
= true;
// The first remaining argument is the command word.
970 if (args
.begin() != args
.end()) {
971 if (strcmp(*args
.begin(), "map") == 0) {
973 } else if (strcmp(*args
.begin(), "unmap") == 0) {
975 } else if (strcmp(*args
.begin(), "list-mapped") == 0) {
978 *err_msg
<< "rbd-nbd: unknown command: " << *args
.begin();
981 args
.erase(args
.begin());
985 *err_msg
<< "rbd-nbd: must specify command";
// Positional image spec. NOTE(review): presumably the "map" case - the
// dispatch on the command is not visible here.
991 if (args
.begin() == args
.end()) {
992 *err_msg
<< "rbd-nbd: must specify image-or-snap-spec";
995 if (parse_imgpath(string(*args
.begin()), cfg
) < 0)
997 args
.erase(args
.begin());
// Positional device path. NOTE(review): presumably the "unmap" case.
1000 if (args
.begin() == args
.end()) {
1001 *err_msg
<< "rbd-nbd: must specify nbd device path";
1004 cfg
->devpath
= *args
.begin();
1005 args
.erase(args
.begin());
// Anything left over is reported as an unknown argument.
1012 if (args
.begin() != args
.end()) {
1013 *err_msg
<< "rbd-nbd: unknown args: " << *args
.begin();
// Top-level driver: parses the command line into a Config, handles the
// help and version results, prints the collected error message, and then
// calls do_map(), do_unmap() or do_list_mapped_devices(). The command
// dispatch and return statements are not visible in this listing.
1020 static int rbd_nbd(int argc
, const char *argv
[])
1024 vector
<const char*> args
;
1025 argv_to_vec(argc
, argv
, args
);
1027 std::ostringstream err_msg
;
1028 r
= parse_args(args
, &err_msg
, &cfg
);
1029 if (r
== HELP_INFO
) {
1032 } else if (r
== VERSION_INFO
) {
1033 std::cout
<< pretty_version_to_str() << std::endl
;
1037 cerr
<< err_msg
.str() << std::endl
;
// An image name is checked for before do_map() is called.
1043 if (cfg
.imgname
.empty()) {
1044 cerr
<< "rbd-nbd: image name was not specified" << std::endl
;
// do_map() receives the original argc/argv, not the parsed vector.
1048 r
= do_map(argc
, argv
, &cfg
);
1053 r
= do_unmap(cfg
.devpath
);
1058 r
= do_list_mapped_devices();
// Program entry point: delegates to rbd_nbd(). The condition guarding the
// EXIT_FAILURE return and the other return path are not visible in this
// listing.
1070 int main(int argc
, const char *argv
[])
1072 int r
= rbd_nbd(argc
, argv
);
1074 return EXIT_FAILURE
;