1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <sys/types.h>
18 #include <boost/scoped_ptr.hpp>
25 #include "os/ObjectStore.h"
26 #include "mon/MonClient.h"
27 #include "include/ceph_features.h"
29 #include "common/config.h"
31 #include "mon/MonMap.h"
33 #include "msg/Messenger.h"
35 #include "common/Timer.h"
36 #include "common/TracepointProvider.h"
37 #include "common/ceph_argparse.h"
39 #include "global/global_init.h"
40 #include "global/signal_handler.h"
42 #include "include/color.h"
43 #include "common/errno.h"
44 #include "common/pick_address.h"
46 #include "perfglue/heap_profiler.h"
48 #include "include/assert.h"
50 #define dout_context g_ceph_context
51 #define dout_subsys ceph_subsys_osd
// File-local tracepoint provider descriptors. Each TracepointProvider::Traits
// object is constructed from the name of a tracepoint shared library and a
// second string (presumably the config option that enables that provider --
// TODO confirm against TracepointProvider). main() passes these objects as
// template arguments to TracepointProvider::initialize<>().
// NOTE(review): the second constructor argument of osd_tracepoint_traits is
// not visible in this excerpt.
55 TracepointProvider::Traits
osd_tracepoint_traits("libosd_tp.so",
57 TracepointProvider::Traits
os_tracepoint_traits("libos_tp.so",
58 "osd_objectstore_tracing");
// The function-instrumentation provider is only declared when the build
// defines WITH_OSD_INSTRUMENT_FUNCTIONS.
59 #ifdef WITH_OSD_INSTRUMENT_FUNCTIONS
60 TracepointProvider::Traits
cyg_profile_traits("libcyg_profile_tp.so",
61 "osd_function_tracing");
64 } // anonymous namespace
// Signal callback for the daemon: passes the received signal number on to the
// `osd` object (the OSD instance that main() allocates) through
// handle_signal(). main() registers this function as the one-shot async
// handler for SIGINT and SIGTERM and unregisters it during shutdown.
//
// @param signum  number of the signal being delivered.
68 void handle_osd_signal(int signum
)
71 osd
->handle_signal(signum
);
// Command-line help: writes the ceph-osd specific option summary to stdout,
// then calls generic_server_usage() (presumably to print the options shared by
// all Ceph daemons -- TODO confirm).
// NOTE(review): the enclosing function header and the end of the stream
// expression are not visible in this excerpt.
76 cout
<< "usage: ceph-osd -i <ID> [flags]\n"
77 << " --osd-data PATH data directory\n"
78 << " --osd-journal PATH\n"
79 << " journal file or block device\n"
80 << " --mkfs create a [new] data directory\n"
81 << " --mkkey generate a new secret key. This is normally used in combination with --mkfs\n"
82 << " --convert-filestore\n"
83 << " run any pending upgrade operations\n"
84 << " --flush-journal flush all data out of journal\n"
85 << " --mkjournal initialize a new journal\n"
86 << " --check-wants-journal\n"
87 << " check whether a journal is desired\n"
88 << " --check-allows-journal\n"
89 << " check whether a journal is allowed\n"
90 << " --check-needs-journal\n"
91 << " check whether a journal is required\n"
92 << " --debug_osd <N> set debug level (e.g. 10)\n"
93 << " --get-device-fsid PATH\n"
94 << " get OSD fsid for the given block device\n"
96 generic_server_usage();
99 #ifdef BUILDING_FOR_EMBEDDED
100 void cephd_preload_embedded_plugins();
101 void cephd_preload_rados_classes(OSD
*osd
);
102 extern "C" int cephd_osd(int argc
, const char **argv
)
104 int main(int argc
, const char **argv
)
107 vector
<const char*> args
;
108 argv_to_vec(argc
, argv
, args
);
111 vector
<const char*> def_args
;
112 // We want to enable leveldb's log, while allowing users to override this
113 // option, therefore we will pass it as a default argument to global_init().
114 def_args
.push_back("--leveldb-log=");
116 auto cct
= global_init(&def_args
, args
, CEPH_ENTITY_TYPE_OSD
,
117 CODE_ENVIRONMENT_DAEMON
,
119 ceph_heap_profiler_init();
123 bool mkjournal
= false;
124 bool check_wants_journal
= false;
125 bool check_allows_journal
= false;
126 bool check_needs_journal
= false;
128 bool flushjournal
= false;
129 bool dump_journal
= false;
130 bool convertfilestore
= false;
131 bool get_osd_fsid
= false;
132 bool get_cluster_fsid
= false;
133 bool get_journal_fsid
= false;
134 bool get_device_fsid
= false;
136 std::string dump_pg_log
;
139 for (std::vector
<const char*>::iterator i
= args
.begin(); i
!= args
.end(); ) {
140 if (ceph_argparse_double_dash(args
, i
)) {
142 } else if (ceph_argparse_flag(args
, i
, "-h", "--help", (char*)NULL
)) {
144 } else if (ceph_argparse_flag(args
, i
, "--mkfs", (char*)NULL
)) {
146 } else if (ceph_argparse_flag(args
, i
, "--mkjournal", (char*)NULL
)) {
148 } else if (ceph_argparse_flag(args
, i
, "--check-allows-journal", (char*)NULL
)) {
149 check_allows_journal
= true;
150 } else if (ceph_argparse_flag(args
, i
, "--check-wants-journal", (char*)NULL
)) {
151 check_wants_journal
= true;
152 } else if (ceph_argparse_flag(args
, i
, "--check-needs-journal", (char*)NULL
)) {
153 check_needs_journal
= true;
154 } else if (ceph_argparse_flag(args
, i
, "--mkkey", (char*)NULL
)) {
156 } else if (ceph_argparse_flag(args
, i
, "--flush-journal", (char*)NULL
)) {
158 } else if (ceph_argparse_flag(args
, i
, "--convert-filestore", (char*)NULL
)) {
159 convertfilestore
= true;
160 } else if (ceph_argparse_witharg(args
, i
, &val
, "--dump-pg-log", (char*)NULL
)) {
162 } else if (ceph_argparse_flag(args
, i
, "--dump-journal", (char*)NULL
)) {
164 } else if (ceph_argparse_flag(args
, i
, "--get-cluster-fsid", (char*)NULL
)) {
165 get_cluster_fsid
= true;
166 } else if (ceph_argparse_flag(args
, i
, "--get-osd-fsid", "--get-osd-uuid", (char*)NULL
)) {
168 } else if (ceph_argparse_flag(args
, i
, "--get-journal-fsid", "--get-journal-uuid", (char*)NULL
)) {
169 get_journal_fsid
= true;
170 } else if (ceph_argparse_witharg(args
, i
, &device_path
,
171 "--get-device-fsid", (char*)NULL
)) {
172 get_device_fsid
= true;
178 derr
<< "unrecognized arg " << args
[0] << dendl
;
182 if (get_journal_fsid
) {
183 device_path
= g_conf
->osd_journal
;
184 get_device_fsid
= true;
186 if (get_device_fsid
) {
188 int r
= ObjectStore::probe_block_device_fsid(g_ceph_context
, device_path
,
191 cerr
<< "failed to get device fsid for " << device_path
192 << ": " << cpp_strerror(r
) << std::endl
;
195 cout
<< uuid
<< std::endl
;
199 if (!dump_pg_log
.empty()) {
200 common_init_finish(g_ceph_context
);
203 int r
= bl
.read_file(dump_pg_log
.c_str(), &error
);
206 bufferlist::iterator p
= bl
.begin();
208 uint64_t pos
= p
.get_off();
212 catch (const buffer::error
&e
) {
213 derr
<< "failed to decode LogEntry at offset " << pos
<< dendl
;
216 derr
<< pos
<< ":\t" << e
<< dendl
;
219 derr
<< "unable to open " << dump_pg_log
<< ": " << error
<< dendl
;
226 const char *id
= g_conf
->name
.get_id().c_str();
227 int whoami
= strtol(id
, &end
, 10);
228 if (*end
|| end
== id
|| whoami
< 0) {
229 derr
<< "must specify '-i #' where # is the osd number" << dendl
;
233 if (g_conf
->osd_data
.empty()) {
234 derr
<< "must specify '--osd-data=foo' data path" << dendl
;
239 string store_type
= g_conf
->osd_objectstore
;
242 snprintf(fn
, sizeof(fn
), "%s/type", g_conf
->osd_data
.c_str());
243 int fd
= ::open(fn
, O_RDONLY
);
248 store_type
= string(bl
.c_str(), bl
.length() - 1); // drop \n
249 dout(5) << "object store type is " << store_type
<< dendl
;
254 ObjectStore
*store
= ObjectStore::create(g_ceph_context
,
258 g_conf
->osd_os_flags
);
260 derr
<< "unable to create object store" << dendl
;
264 #ifdef BUILDING_FOR_EMBEDDED
265 cephd_preload_embedded_plugins();
269 common_init_finish(g_ceph_context
);
270 MonClient
mc(g_ceph_context
);
271 if (mc
.build_initial_monmap() < 0)
273 if (mc
.get_monmap_privately() < 0)
276 if (mc
.monmap
.fsid
.is_zero()) {
277 derr
<< "must specify cluster fsid" << dendl
;
281 int err
= OSD::mkfs(g_ceph_context
, store
, g_conf
->osd_data
,
282 mc
.monmap
.fsid
, whoami
);
284 derr
<< TEXT_RED
<< " ** ERROR: error creating empty object store in "
285 << g_conf
->osd_data
<< ": " << cpp_strerror(-err
) << TEXT_NORMAL
<< dendl
;
288 derr
<< "created object store " << g_conf
->osd_data
289 << " for osd." << whoami
<< " fsid " << mc
.monmap
.fsid
<< dendl
;
292 common_init_finish(g_ceph_context
);
293 KeyRing
*keyring
= KeyRing::create_empty();
295 derr
<< "Unable to get a Ceph keyring." << dendl
;
299 EntityName
ename(g_conf
->name
);
302 int ret
= keyring
->load(g_ceph_context
, g_conf
->keyring
);
304 keyring
->get_auth(ename
, eauth
)) {
305 derr
<< "already have key in keyring " << g_conf
->keyring
<< dendl
;
307 eauth
.key
.create(g_ceph_context
, CEPH_CRYPTO_AES
);
308 keyring
->add(ename
, eauth
);
310 keyring
->encode_plaintext(bl
);
311 int r
= bl
.write_file(g_conf
->keyring
.c_str(), 0600);
313 derr
<< TEXT_RED
<< " ** ERROR: writing new keyring to " << g_conf
->keyring
314 << ": " << cpp_strerror(r
) << TEXT_NORMAL
<< dendl
;
316 derr
<< "created new key in keyring " << g_conf
->keyring
<< dendl
;
322 common_init_finish(g_ceph_context
);
323 int err
= store
->mkjournal();
325 derr
<< TEXT_RED
<< " ** ERROR: error creating fresh journal " << g_conf
->osd_journal
326 << " for object store " << g_conf
->osd_data
327 << ": " << cpp_strerror(-err
) << TEXT_NORMAL
<< dendl
;
330 derr
<< "created new journal " << g_conf
->osd_journal
331 << " for object store " << g_conf
->osd_data
<< dendl
;
334 if (check_wants_journal
) {
335 if (store
->wants_journal()) {
336 cout
<< "yes" << std::endl
;
339 cout
<< "no" << std::endl
;
343 if (check_allows_journal
) {
344 if (store
->allows_journal()) {
345 cout
<< "yes" << std::endl
;
348 cout
<< "no" << std::endl
;
352 if (check_needs_journal
) {
353 if (store
->needs_journal()) {
354 cout
<< "yes" << std::endl
;
357 cout
<< "no" << std::endl
;
362 common_init_finish(g_ceph_context
);
363 int err
= store
->mount();
365 derr
<< TEXT_RED
<< " ** ERROR: error flushing journal " << g_conf
->osd_journal
366 << " for object store " << g_conf
->osd_data
367 << ": " << cpp_strerror(-err
) << TEXT_NORMAL
<< dendl
;
368 goto flushjournal_out
;
371 derr
<< "flushed journal " << g_conf
->osd_journal
372 << " for object store " << g_conf
->osd_data
376 exit(err
< 0 ? 1 : 0);
379 common_init_finish(g_ceph_context
);
380 int err
= store
->dump_journal(cout
);
382 derr
<< TEXT_RED
<< " ** ERROR: error dumping journal " << g_conf
->osd_journal
383 << " for object store " << g_conf
->osd_data
384 << ": " << cpp_strerror(-err
) << TEXT_NORMAL
<< dendl
;
387 derr
<< "dumped journal " << g_conf
->osd_journal
388 << " for object store " << g_conf
->osd_data
395 if (convertfilestore
) {
396 int err
= store
->mount();
398 derr
<< TEXT_RED
<< " ** ERROR: error mounting store " << g_conf
->osd_data
399 << ": " << cpp_strerror(-err
) << TEXT_NORMAL
<< dendl
;
402 err
= store
->upgrade();
405 derr
<< TEXT_RED
<< " ** ERROR: error converting store " << g_conf
->osd_data
406 << ": " << cpp_strerror(-err
) << TEXT_NORMAL
<< dendl
;
413 uuid_d cluster_fsid
, osd_fsid
;
415 int r
= OSD::peek_meta(store
, magic
, cluster_fsid
, osd_fsid
, w
);
417 derr
<< TEXT_RED
<< " ** ERROR: unable to open OSD superblock on "
418 << g_conf
->osd_data
<< ": " << cpp_strerror(-r
)
419 << TEXT_NORMAL
<< dendl
;
421 derr
<< TEXT_RED
<< " ** please verify that underlying storage "
422 << "supports xattrs" << TEXT_NORMAL
<< dendl
;
427 derr
<< "OSD id " << w
<< " != my id " << whoami
<< dendl
;
430 if (strcmp(magic
.c_str(), CEPH_OSD_ONDISK_MAGIC
)) {
431 derr
<< "OSD magic " << magic
<< " != my " << CEPH_OSD_ONDISK_MAGIC
436 if (get_cluster_fsid
) {
437 cout
<< cluster_fsid
<< std::endl
;
441 cout
<< osd_fsid
<< std::endl
;
445 pick_addresses(g_ceph_context
, CEPH_PICK_ADDRESS_PUBLIC
446 |CEPH_PICK_ADDRESS_CLUSTER
);
448 if (g_conf
->public_addr
.is_blank_ip() && !g_conf
->cluster_addr
.is_blank_ip()) {
450 << " ** WARNING: specified cluster addr but not public addr; we recommend **\n"
451 << " ** you specify neither or both. **"
452 << TEXT_NORMAL
<< dendl
;
455 std::string public_msgr_type
= g_conf
->ms_public_type
.empty() ? g_conf
->get_val
<std::string
>("ms_type") : g_conf
->ms_public_type
;
456 std::string cluster_msgr_type
= g_conf
->ms_cluster_type
.empty() ? g_conf
->get_val
<std::string
>("ms_type") : g_conf
->ms_cluster_type
;
457 Messenger
*ms_public
= Messenger::create(g_ceph_context
, public_msgr_type
,
458 entity_name_t::OSD(whoami
), "client",
460 Messenger::HAS_HEAVY_TRAFFIC
|
461 Messenger::HAS_MANY_CONNECTIONS
);
462 Messenger
*ms_cluster
= Messenger::create(g_ceph_context
, cluster_msgr_type
,
463 entity_name_t::OSD(whoami
), "cluster",
465 Messenger::HAS_HEAVY_TRAFFIC
|
466 Messenger::HAS_MANY_CONNECTIONS
);
467 Messenger
*ms_hb_back_client
= Messenger::create(g_ceph_context
, cluster_msgr_type
,
468 entity_name_t::OSD(whoami
), "hb_back_client",
469 getpid(), Messenger::HEARTBEAT
);
470 Messenger
*ms_hb_front_client
= Messenger::create(g_ceph_context
, public_msgr_type
,
471 entity_name_t::OSD(whoami
), "hb_front_client",
472 getpid(), Messenger::HEARTBEAT
);
473 Messenger
*ms_hb_back_server
= Messenger::create(g_ceph_context
, cluster_msgr_type
,
474 entity_name_t::OSD(whoami
), "hb_back_server",
475 getpid(), Messenger::HEARTBEAT
);
476 Messenger
*ms_hb_front_server
= Messenger::create(g_ceph_context
, public_msgr_type
,
477 entity_name_t::OSD(whoami
), "hb_front_server",
478 getpid(), Messenger::HEARTBEAT
);
479 Messenger
*ms_objecter
= Messenger::create(g_ceph_context
, public_msgr_type
,
480 entity_name_t::OSD(whoami
), "ms_objecter",
482 if (!ms_public
|| !ms_cluster
|| !ms_hb_front_client
|| !ms_hb_back_client
|| !ms_hb_back_server
|| !ms_hb_front_server
|| !ms_objecter
)
484 ms_cluster
->set_cluster_protocol(CEPH_OSD_PROTOCOL
);
485 ms_hb_front_client
->set_cluster_protocol(CEPH_OSD_PROTOCOL
);
486 ms_hb_back_client
->set_cluster_protocol(CEPH_OSD_PROTOCOL
);
487 ms_hb_back_server
->set_cluster_protocol(CEPH_OSD_PROTOCOL
);
488 ms_hb_front_server
->set_cluster_protocol(CEPH_OSD_PROTOCOL
);
490 cout
<< "starting osd." << whoami
491 << " at " << ms_public
->get_myaddr()
492 << " osd_data " << g_conf
->osd_data
493 << " " << ((g_conf
->osd_journal
.empty()) ?
494 "(no journal)" : g_conf
->osd_journal
)
497 boost::scoped_ptr
<Throttle
> client_byte_throttler(
498 new Throttle(g_ceph_context
, "osd_client_bytes",
499 g_conf
->osd_client_message_size_cap
));
501 // All feature bits 0 - 34 should be present from dumpling v0.67 forward
502 uint64_t osd_required
=
504 CEPH_FEATURE_PGID64
|
507 ms_public
->set_default_policy(Messenger::Policy::stateless_server(0));
508 ms_public
->set_policy_throttlers(entity_name_t::TYPE_CLIENT
,
509 client_byte_throttler
.get(),
511 ms_public
->set_policy(entity_name_t::TYPE_MON
,
512 Messenger::Policy::lossy_client(CEPH_FEATURE_UID
|
513 CEPH_FEATURE_PGID64
|
514 CEPH_FEATURE_OSDENC
));
515 ms_public
->set_policy(entity_name_t::TYPE_MGR
,
516 Messenger::Policy::lossy_client(CEPH_FEATURE_UID
|
517 CEPH_FEATURE_PGID64
|
518 CEPH_FEATURE_OSDENC
));
520 //try to poison pill any OSD connections on the wrong address
521 ms_public
->set_policy(entity_name_t::TYPE_OSD
,
522 Messenger::Policy::stateless_server(0));
524 ms_cluster
->set_default_policy(Messenger::Policy::stateless_server(0));
525 ms_cluster
->set_policy(entity_name_t::TYPE_MON
, Messenger::Policy::lossy_client(0));
526 ms_cluster
->set_policy(entity_name_t::TYPE_OSD
,
527 Messenger::Policy::lossless_peer(osd_required
));
528 ms_cluster
->set_policy(entity_name_t::TYPE_CLIENT
,
529 Messenger::Policy::stateless_server(0));
531 ms_hb_front_client
->set_policy(entity_name_t::TYPE_OSD
,
532 Messenger::Policy::lossy_client(0));
533 ms_hb_back_client
->set_policy(entity_name_t::TYPE_OSD
,
534 Messenger::Policy::lossy_client(0));
535 ms_hb_back_server
->set_policy(entity_name_t::TYPE_OSD
,
536 Messenger::Policy::stateless_server(0));
537 ms_hb_front_server
->set_policy(entity_name_t::TYPE_OSD
,
538 Messenger::Policy::stateless_server(0));
540 ms_objecter
->set_default_policy(Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX
));
542 r
= ms_public
->bind(g_conf
->public_addr
);
545 r
= ms_cluster
->bind(g_conf
->cluster_addr
);
549 if (g_conf
->osd_heartbeat_use_min_delay_socket
) {
550 ms_hb_front_client
->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY
);
551 ms_hb_back_client
->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY
);
552 ms_hb_back_server
->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY
);
553 ms_hb_front_server
->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY
);
556 // hb back should bind to same ip as cluster_addr (if specified)
557 entity_addr_t hb_back_addr
= g_conf
->osd_heartbeat_addr
;
558 if (hb_back_addr
.is_blank_ip()) {
559 hb_back_addr
= g_conf
->cluster_addr
;
560 if (hb_back_addr
.is_ip())
561 hb_back_addr
.set_port(0);
563 r
= ms_hb_back_server
->bind(hb_back_addr
);
566 r
= ms_hb_back_client
->client_bind(hb_back_addr
);
570 // hb front should bind to same ip as public_addr
571 entity_addr_t hb_front_addr
= g_conf
->public_addr
;
572 if (hb_front_addr
.is_ip())
573 hb_front_addr
.set_port(0);
574 r
= ms_hb_front_server
->bind(hb_front_addr
);
577 r
= ms_hb_front_client
->client_bind(hb_front_addr
);
581 // Set up crypto, daemonize, etc.
582 global_init_daemonize(g_ceph_context
);
583 common_init_finish(g_ceph_context
);
585 TracepointProvider::initialize
<osd_tracepoint_traits
>(g_ceph_context
);
586 TracepointProvider::initialize
<os_tracepoint_traits
>(g_ceph_context
);
587 #ifdef WITH_OSD_INSTRUMENT_FUNCTIONS
588 TracepointProvider::initialize
<cyg_profile_traits
>(g_ceph_context
);
591 MonClient
mc(g_ceph_context
);
592 if (mc
.build_initial_monmap() < 0)
594 global_init_chdir(g_ceph_context
);
596 #ifndef BUILDING_FOR_EMBEDDED
597 if (global_init_preload_erasure_code(g_ceph_context
) < 0)
601 srand(time(NULL
) + getpid());
603 osd
= new OSD(g_ceph_context
,
615 g_conf
->osd_journal
);
617 int err
= osd
->pre_init();
619 derr
<< TEXT_RED
<< " ** ERROR: osd pre_init failed: " << cpp_strerror(-err
)
620 << TEXT_NORMAL
<< dendl
;
625 ms_hb_front_client
->start();
626 ms_hb_back_client
->start();
627 ms_hb_front_server
->start();
628 ms_hb_back_server
->start();
630 ms_objecter
->start();
635 derr
<< TEXT_RED
<< " ** ERROR: osd init failed: " << cpp_strerror(-err
)
636 << TEXT_NORMAL
<< dendl
;
640 #ifdef BUILDING_FOR_EMBEDDED
641 cephd_preload_rados_classes(osd
);
644 // install signal handlers
645 init_async_signal_handler();
646 register_async_signal_handler(SIGHUP
, sighup_handler
);
647 register_async_signal_handler_oneshot(SIGINT
, handle_osd_signal
);
648 register_async_signal_handler_oneshot(SIGTERM
, handle_osd_signal
);
652 if (g_conf
->inject_early_sigterm
)
653 kill(getpid(), SIGTERM
);
656 ms_hb_front_client
->wait();
657 ms_hb_back_client
->wait();
658 ms_hb_front_server
->wait();
659 ms_hb_back_server
->wait();
663 unregister_async_signal_handler(SIGHUP
, sighup_handler
);
664 unregister_async_signal_handler(SIGINT
, handle_osd_signal
);
665 unregister_async_signal_handler(SIGTERM
, handle_osd_signal
);
666 shutdown_async_signal_handler();
671 delete ms_hb_front_client
;
672 delete ms_hb_back_client
;
673 delete ms_hb_front_server
;
674 delete ms_hb_back_server
;
678 client_byte_throttler
.reset();
680 // cd on exit, so that gmon.out (if any) goes into a separate directory for each node.
682 snprintf(s
, sizeof(s
), "gmon/%d", getpid());
683 if ((mkdir(s
, 0755) == 0) && (chdir(s
) == 0)) {
684 dout(0) << "ceph-osd: gmon.out should be in " << s
<< dendl
;