]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include <unistd.h> | |
16 | ||
17 | #include "include/compat.h" | |
7c673cae FG |
18 | #include "include/types.h" |
19 | #include "include/str_list.h" | |
c07f9fc5 | 20 | |
7c673cae | 21 | #include "common/Clock.h" |
c07f9fc5 FG |
22 | #include "common/HeartbeatMap.h" |
23 | #include "common/Timer.h" | |
24 | #include "common/backport14.h" | |
7c673cae | 25 | #include "common/ceph_argparse.h" |
c07f9fc5 FG |
26 | #include "common/config.h" |
27 | #include "common/entity_name.h" | |
7c673cae | 28 | #include "common/errno.h" |
c07f9fc5 FG |
29 | #include "common/perf_counters.h" |
30 | #include "common/signal.h" | |
31 | #include "common/version.h" | |
32 | ||
33 | #include "global/signal_handler.h" | |
7c673cae FG |
34 | |
35 | #include "msg/Messenger.h" | |
36 | #include "mon/MonClient.h" | |
37 | ||
38 | #include "osdc/Objecter.h" | |
39 | ||
40 | #include "MDSMap.h" | |
41 | ||
42 | #include "MDSDaemon.h" | |
43 | #include "Server.h" | |
44 | #include "Locker.h" | |
45 | ||
46 | #include "SnapServer.h" | |
47 | #include "SnapClient.h" | |
48 | ||
7c673cae FG |
49 | #include "events/ESession.h" |
50 | #include "events/ESubtreeMap.h" | |
51 | ||
52 | #include "messages/MMDSMap.h" | |
53 | ||
54 | #include "messages/MGenericMessage.h" | |
55 | ||
56 | #include "messages/MMonCommand.h" | |
57 | #include "messages/MCommand.h" | |
58 | #include "messages/MCommandReply.h" | |
59 | ||
60 | #include "auth/AuthAuthorizeHandler.h" | |
61 | #include "auth/RotatingKeyRing.h" | |
62 | #include "auth/KeyRing.h" | |
63 | ||
7c673cae FG |
64 | #include "perfglue/cpu_profiler.h" |
65 | #include "perfglue/heap_profiler.h" | |
66 | ||
67 | #define dout_context g_ceph_context | |
68 | #define dout_subsys ceph_subsys_mds | |
69 | #undef dout_prefix | |
70 | #define dout_prefix *_dout << "mds." << name << ' ' | |
71 | ||
7c673cae | 72 | // cons/des |
94b18763 | 73 | MDSDaemon::MDSDaemon(boost::string_view n, Messenger *m, MonClient *mc) : |
7c673cae FG |
74 | Dispatcher(m->cct), |
75 | mds_lock("MDSDaemon::mds_lock"), | |
76 | stopping(false), | |
77 | timer(m->cct, mds_lock), | |
78 | beacon(m->cct, mc, n), | |
79 | authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(m->cct, | |
80 | m->cct->_conf->auth_supported.empty() ? | |
81 | m->cct->_conf->auth_cluster_required : | |
82 | m->cct->_conf->auth_supported)), | |
83 | authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(m->cct, | |
84 | m->cct->_conf->auth_supported.empty() ? | |
85 | m->cct->_conf->auth_service_required : | |
86 | m->cct->_conf->auth_supported)), | |
87 | name(n), | |
88 | messenger(m), | |
89 | monc(mc), | |
90 | mgrc(m->cct, m), | |
91 | log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS), | |
92 | mds_rank(NULL), | |
94b18763 FG |
93 | asok_hook(NULL), |
94 | starttime(mono_clock::now()) | |
7c673cae FG |
95 | { |
96 | orig_argc = 0; | |
97 | orig_argv = NULL; | |
98 | ||
99 | clog = log_client.create_channel(); | |
100 | ||
101 | monc->set_messenger(messenger); | |
102 | ||
103 | mdsmap = new MDSMap; | |
104 | } | |
105 | ||
106 | MDSDaemon::~MDSDaemon() { | |
107 | Mutex::Locker lock(mds_lock); | |
108 | ||
109 | delete mds_rank; | |
110 | mds_rank = NULL; | |
111 | delete mdsmap; | |
112 | mdsmap = NULL; | |
113 | ||
114 | delete authorize_handler_service_registry; | |
115 | delete authorize_handler_cluster_registry; | |
116 | } | |
117 | ||
118 | class MDSSocketHook : public AdminSocketHook { | |
119 | MDSDaemon *mds; | |
120 | public: | |
121 | explicit MDSSocketHook(MDSDaemon *m) : mds(m) {} | |
122 | bool call(std::string command, cmdmap_t& cmdmap, std::string format, | |
123 | bufferlist& out) override { | |
124 | stringstream ss; | |
125 | bool r = mds->asok_command(command, cmdmap, format, ss); | |
126 | out.append(ss); | |
127 | return r; | |
128 | } | |
129 | }; | |
130 | ||
131 | bool MDSDaemon::asok_command(string command, cmdmap_t& cmdmap, string format, | |
132 | ostream& ss) | |
133 | { | |
134 | dout(1) << "asok_command: " << command << " (starting...)" << dendl; | |
135 | ||
136 | Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); | |
137 | bool handled = false; | |
138 | if (command == "status") { | |
139 | dump_status(f); | |
140 | handled = true; | |
141 | } else { | |
142 | if (mds_rank == NULL) { | |
143 | dout(1) << "Can't run that command on an inactive MDS!" << dendl; | |
144 | f->dump_string("error", "mds_not_active"); | |
145 | } else { | |
146 | handled = mds_rank->handle_asok_command(command, cmdmap, f, ss); | |
147 | } | |
148 | } | |
149 | f->flush(ss); | |
150 | delete f; | |
151 | ||
152 | dout(1) << "asok_command: " << command << " (complete)" << dendl; | |
153 | ||
154 | return handled; | |
155 | } | |
156 | ||
157 | void MDSDaemon::dump_status(Formatter *f) | |
158 | { | |
159 | f->open_object_section("status"); | |
160 | f->dump_stream("cluster_fsid") << monc->get_fsid(); | |
161 | if (mds_rank) { | |
162 | f->dump_int("whoami", mds_rank->get_nodeid()); | |
163 | } else { | |
164 | f->dump_int("whoami", MDS_RANK_NONE); | |
165 | } | |
166 | ||
167 | f->dump_int("id", monc->get_global_id()); | |
168 | f->dump_string("want_state", ceph_mds_state_name(beacon.get_want_state())); | |
169 | f->dump_string("state", ceph_mds_state_name(mdsmap->get_state_gid(mds_gid_t( | |
170 | monc->get_global_id())))); | |
171 | if (mds_rank) { | |
172 | Mutex::Locker l(mds_lock); | |
173 | mds_rank->dump_status(f); | |
174 | } | |
175 | ||
176 | f->dump_unsigned("mdsmap_epoch", mdsmap->get_epoch()); | |
177 | if (mds_rank) { | |
178 | f->dump_unsigned("osdmap_epoch", mds_rank->get_osd_epoch()); | |
179 | f->dump_unsigned("osdmap_epoch_barrier", mds_rank->get_osd_epoch_barrier()); | |
180 | } else { | |
181 | f->dump_unsigned("osdmap_epoch", 0); | |
182 | f->dump_unsigned("osdmap_epoch_barrier", 0); | |
183 | } | |
94b18763 FG |
184 | |
185 | f->dump_float("uptime", get_uptime().count()); | |
186 | ||
7c673cae FG |
187 | f->close_section(); // status |
188 | } | |
189 | ||
190 | void MDSDaemon::set_up_admin_socket() | |
191 | { | |
192 | int r; | |
193 | AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); | |
194 | assert(asok_hook == nullptr); | |
195 | asok_hook = new MDSSocketHook(this); | |
196 | r = admin_socket->register_command("status", "status", asok_hook, | |
197 | "high-level status of MDS"); | |
198 | assert(r == 0); | |
199 | r = admin_socket->register_command("dump_ops_in_flight", | |
200 | "dump_ops_in_flight", asok_hook, | |
201 | "show the ops currently in flight"); | |
202 | assert(r == 0); | |
203 | r = admin_socket->register_command("ops", | |
204 | "ops", asok_hook, | |
205 | "show the ops currently in flight"); | |
206 | assert(r == 0); | |
207 | r = admin_socket->register_command("dump_blocked_ops", "dump_blocked_ops", | |
208 | asok_hook, | |
209 | "show the blocked ops currently in flight"); | |
210 | assert(r == 0); | |
211 | r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops", | |
212 | asok_hook, | |
213 | "show slowest recent ops"); | |
214 | assert(r == 0); | |
215 | r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration", | |
216 | asok_hook, | |
217 | "show slowest recent ops, sorted by op duration"); | |
218 | assert(r == 0); | |
219 | r = admin_socket->register_command("scrub_path", | |
220 | "scrub_path name=path,type=CephString " | |
221 | "name=scrubops,type=CephChoices," | |
222 | "strings=force|recursive|repair,n=N,req=false", | |
223 | asok_hook, | |
224 | "scrub an inode and output results"); | |
225 | assert(r == 0); | |
226 | r = admin_socket->register_command("tag path", | |
227 | "tag path name=path,type=CephString" | |
228 | " name=tag,type=CephString", | |
229 | asok_hook, | |
230 | "Apply scrub tag recursively"); | |
231 | assert(r == 0); | |
232 | r = admin_socket->register_command("flush_path", | |
233 | "flush_path name=path,type=CephString", | |
234 | asok_hook, | |
235 | "flush an inode (and its dirfrags)"); | |
236 | assert(r == 0); | |
237 | r = admin_socket->register_command("export dir", | |
238 | "export dir " | |
239 | "name=path,type=CephString " | |
240 | "name=rank,type=CephInt", | |
241 | asok_hook, | |
242 | "migrate a subtree to named MDS"); | |
243 | assert(r == 0); | |
244 | r = admin_socket->register_command("dump cache", | |
245 | "dump cache name=path,type=CephString,req=false", | |
246 | asok_hook, | |
247 | "dump metadata cache (optionally to a file)"); | |
248 | assert(r == 0); | |
181888fb FG |
249 | r = admin_socket->register_command("cache status", |
250 | "cache status", | |
251 | asok_hook, | |
252 | "show cache status"); | |
253 | assert(r == 0); | |
7c673cae FG |
254 | r = admin_socket->register_command("dump tree", |
255 | "dump tree " | |
256 | "name=root,type=CephString,req=true " | |
257 | "name=depth,type=CephInt,req=false ", | |
258 | asok_hook, | |
259 | "dump metadata cache for subtree"); | |
260 | assert(r == 0); | |
28e407b8 AA |
261 | r = admin_socket->register_command("dump loads", |
262 | "dump loads", | |
263 | asok_hook, | |
264 | "dump metadata loads"); | |
265 | assert(r == 0); | |
7c673cae FG |
266 | r = admin_socket->register_command("session evict", |
267 | "session evict name=client_id,type=CephString", | |
268 | asok_hook, | |
269 | "Evict a CephFS client"); | |
270 | assert(r == 0); | |
271 | r = admin_socket->register_command("osdmap barrier", | |
272 | "osdmap barrier name=target_epoch,type=CephInt", | |
273 | asok_hook, | |
274 | "Wait until the MDS has this OSD map epoch"); | |
275 | assert(r == 0); | |
276 | r = admin_socket->register_command("session ls", | |
277 | "session ls", | |
278 | asok_hook, | |
279 | "Enumerate connected CephFS clients"); | |
280 | assert(r == 0); | |
281 | r = admin_socket->register_command("flush journal", | |
282 | "flush journal", | |
283 | asok_hook, | |
284 | "Flush the journal to the backing store"); | |
285 | assert(r == 0); | |
286 | r = admin_socket->register_command("force_readonly", | |
287 | "force_readonly", | |
288 | asok_hook, | |
289 | "Force MDS to read-only mode"); | |
290 | assert(r == 0); | |
291 | r = admin_socket->register_command("get subtrees", | |
292 | "get subtrees", | |
293 | asok_hook, | |
294 | "Return the subtree map"); | |
295 | assert(r == 0); | |
296 | r = admin_socket->register_command("dirfrag split", | |
297 | "dirfrag split " | |
298 | "name=path,type=CephString,req=true " | |
299 | "name=frag,type=CephString,req=true " | |
300 | "name=bits,type=CephInt,req=true ", | |
301 | asok_hook, | |
302 | "Fragment directory by path"); | |
303 | assert(r == 0); | |
304 | r = admin_socket->register_command("dirfrag merge", | |
305 | "dirfrag merge " | |
306 | "name=path,type=CephString,req=true " | |
307 | "name=frag,type=CephString,req=true", | |
308 | asok_hook, | |
309 | "De-fragment directory by path"); | |
310 | assert(r == 0); | |
311 | r = admin_socket->register_command("dirfrag ls", | |
312 | "dirfrag ls " | |
313 | "name=path,type=CephString,req=true", | |
314 | asok_hook, | |
315 | "List fragments in directory"); | |
316 | assert(r == 0); | |
317 | } | |
318 | ||
319 | void MDSDaemon::clean_up_admin_socket() | |
320 | { | |
321 | AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); | |
322 | admin_socket->unregister_command("status"); | |
323 | admin_socket->unregister_command("dump_ops_in_flight"); | |
324 | admin_socket->unregister_command("ops"); | |
325 | admin_socket->unregister_command("dump_blocked_ops"); | |
326 | admin_socket->unregister_command("dump_historic_ops"); | |
327 | admin_socket->unregister_command("dump_historic_ops_by_duration"); | |
328 | admin_socket->unregister_command("scrub_path"); | |
329 | admin_socket->unregister_command("tag path"); | |
330 | admin_socket->unregister_command("flush_path"); | |
331 | admin_socket->unregister_command("export dir"); | |
332 | admin_socket->unregister_command("dump cache"); | |
181888fb | 333 | admin_socket->unregister_command("cache status"); |
7c673cae | 334 | admin_socket->unregister_command("dump tree"); |
28e407b8 | 335 | admin_socket->unregister_command("dump loads"); |
7c673cae FG |
336 | admin_socket->unregister_command("session evict"); |
337 | admin_socket->unregister_command("osdmap barrier"); | |
338 | admin_socket->unregister_command("session ls"); | |
339 | admin_socket->unregister_command("flush journal"); | |
340 | admin_socket->unregister_command("force_readonly"); | |
341 | admin_socket->unregister_command("get subtrees"); | |
342 | admin_socket->unregister_command("dirfrag split"); | |
343 | admin_socket->unregister_command("dirfrag merge"); | |
344 | admin_socket->unregister_command("dirfrag ls"); | |
345 | delete asok_hook; | |
346 | asok_hook = NULL; | |
347 | } | |
348 | ||
349 | const char** MDSDaemon::get_tracked_conf_keys() const | |
350 | { | |
351 | static const char* KEYS[] = { | |
352 | "mds_op_complaint_time", "mds_op_log_threshold", | |
353 | "mds_op_history_size", "mds_op_history_duration", | |
354 | "mds_enable_op_tracker", | |
355 | "mds_log_pause", | |
356 | // clog & admin clog | |
357 | "clog_to_monitors", | |
358 | "clog_to_syslog", | |
359 | "clog_to_syslog_facility", | |
360 | "clog_to_syslog_level", | |
361 | // PurgeQueue | |
362 | "mds_max_purge_ops", | |
363 | "mds_max_purge_ops_per_pg", | |
364 | "mds_max_purge_files", | |
28e407b8 | 365 | "mds_inject_migrator_session_race", |
7c673cae FG |
366 | "clog_to_graylog", |
367 | "clog_to_graylog_host", | |
368 | "clog_to_graylog_port", | |
369 | "host", | |
370 | "fsid", | |
371 | NULL | |
372 | }; | |
373 | return KEYS; | |
374 | } | |
375 | ||
376 | void MDSDaemon::handle_conf_change(const struct md_config_t *conf, | |
377 | const std::set <std::string> &changed) | |
378 | { | |
379 | // We may be called within mds_lock (via `tell`) or outwith the | |
380 | // lock (via admin socket `config set`), so handle either case. | |
381 | const bool initially_locked = mds_lock.is_locked_by_me(); | |
382 | if (!initially_locked) { | |
383 | mds_lock.Lock(); | |
384 | } | |
385 | ||
386 | if (changed.count("mds_op_complaint_time") || | |
387 | changed.count("mds_op_log_threshold")) { | |
388 | if (mds_rank) { | |
389 | mds_rank->op_tracker.set_complaint_and_threshold(conf->mds_op_complaint_time, | |
390 | conf->mds_op_log_threshold); | |
391 | } | |
392 | } | |
393 | if (changed.count("mds_op_history_size") || | |
394 | changed.count("mds_op_history_duration")) { | |
395 | if (mds_rank) { | |
396 | mds_rank->op_tracker.set_history_size_and_duration(conf->mds_op_history_size, | |
397 | conf->mds_op_history_duration); | |
398 | } | |
399 | } | |
400 | if (changed.count("mds_enable_op_tracker")) { | |
401 | if (mds_rank) { | |
402 | mds_rank->op_tracker.set_tracking(conf->mds_enable_op_tracker); | |
403 | } | |
404 | } | |
405 | if (changed.count("clog_to_monitors") || | |
406 | changed.count("clog_to_syslog") || | |
407 | changed.count("clog_to_syslog_level") || | |
408 | changed.count("clog_to_syslog_facility") || | |
409 | changed.count("clog_to_graylog") || | |
410 | changed.count("clog_to_graylog_host") || | |
411 | changed.count("clog_to_graylog_port") || | |
412 | changed.count("host") || | |
413 | changed.count("fsid")) { | |
414 | if (mds_rank) { | |
415 | mds_rank->update_log_config(); | |
416 | } | |
417 | } | |
418 | ||
419 | if (!g_conf->mds_log_pause && changed.count("mds_log_pause")) { | |
420 | if (mds_rank) { | |
421 | mds_rank->mdlog->kick_submitter(); | |
422 | } | |
423 | } | |
424 | ||
425 | if (mds_rank) { | |
426 | mds_rank->handle_conf_change(conf, changed); | |
427 | } | |
428 | ||
429 | if (!initially_locked) { | |
430 | mds_lock.Unlock(); | |
431 | } | |
432 | } | |
433 | ||
434 | ||
435 | int MDSDaemon::init() | |
436 | { | |
437 | dout(10) << sizeof(MDSCacheObject) << "\tMDSCacheObject" << dendl; | |
438 | dout(10) << sizeof(CInode) << "\tCInode" << dendl; | |
439 | dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item *7=" << 7*sizeof(elist<void*>::item) << dendl; | |
94b18763 FG |
440 | dout(10) << sizeof(CInode::mempool_inode) << "\t inode " << dendl; |
441 | dout(10) << sizeof(CInode::mempool_old_inode) << "\t old_inode " << dendl; | |
7c673cae FG |
442 | dout(10) << sizeof(nest_info_t) << "\t nest_info_t " << dendl; |
443 | dout(10) << sizeof(frag_info_t) << "\t frag_info_t " << dendl; | |
444 | dout(10) << sizeof(SimpleLock) << "\t SimpleLock *5=" << 5*sizeof(SimpleLock) << dendl; | |
445 | dout(10) << sizeof(ScatterLock) << "\t ScatterLock *3=" << 3*sizeof(ScatterLock) << dendl; | |
446 | dout(10) << sizeof(CDentry) << "\tCDentry" << dendl; | |
447 | dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item" << dendl; | |
448 | dout(10) << sizeof(SimpleLock) << "\t SimpleLock" << dendl; | |
449 | dout(10) << sizeof(CDir) << "\tCDir " << dendl; | |
450 | dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item *2=" << 2*sizeof(elist<void*>::item) << dendl; | |
451 | dout(10) << sizeof(fnode_t) << "\t fnode_t " << dendl; | |
452 | dout(10) << sizeof(nest_info_t) << "\t nest_info_t *2" << dendl; | |
453 | dout(10) << sizeof(frag_info_t) << "\t frag_info_t *2" << dendl; | |
454 | dout(10) << sizeof(Capability) << "\tCapability " << dendl; | |
455 | dout(10) << sizeof(xlist<void*>::item) << "\t xlist<>::item *2=" << 2*sizeof(xlist<void*>::item) << dendl; | |
456 | ||
457 | messenger->add_dispatcher_tail(&beacon); | |
458 | messenger->add_dispatcher_tail(this); | |
459 | ||
460 | // get monmap | |
461 | monc->set_messenger(messenger); | |
462 | ||
463 | monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD | | |
464 | CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_MGR); | |
465 | int r = 0; | |
466 | r = monc->init(); | |
467 | if (r < 0) { | |
468 | derr << "ERROR: failed to get monmap: " << cpp_strerror(-r) << dendl; | |
469 | mds_lock.Lock(); | |
470 | suicide(); | |
471 | mds_lock.Unlock(); | |
472 | return r; | |
473 | } | |
474 | ||
475 | // tell monc about log_client so it will know about mon session resets | |
476 | monc->set_log_client(&log_client); | |
477 | ||
478 | r = monc->authenticate(); | |
479 | if (r < 0) { | |
480 | derr << "ERROR: failed to authenticate: " << cpp_strerror(-r) << dendl; | |
481 | mds_lock.Lock(); | |
482 | suicide(); | |
483 | mds_lock.Unlock(); | |
484 | return r; | |
485 | } | |
486 | ||
487 | int rotating_auth_attempts = 0; | |
488 | while (monc->wait_auth_rotating(30.0) < 0) { | |
489 | if (++rotating_auth_attempts <= g_conf->max_rotating_auth_attempts) { | |
490 | derr << "unable to obtain rotating service keys; retrying" << dendl; | |
491 | continue; | |
492 | } | |
493 | derr << "ERROR: failed to refresh rotating keys, " | |
494 | << "maximum retry time reached." << dendl; | |
495 | mds_lock.Lock(); | |
496 | suicide(); | |
497 | mds_lock.Unlock(); | |
498 | return -ETIMEDOUT; | |
499 | } | |
500 | ||
501 | mgrc.init(); | |
502 | messenger->add_dispatcher_head(&mgrc); | |
503 | ||
504 | mds_lock.Lock(); | |
505 | if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) { | |
506 | dout(4) << __func__ << ": terminated already, dropping out" << dendl; | |
507 | mds_lock.Unlock(); | |
508 | return 0; | |
509 | } | |
510 | ||
511 | monc->sub_want("mdsmap", 0, 0); | |
512 | monc->sub_want("mgrmap", 0, 0); | |
513 | monc->renew_subs(); | |
514 | ||
515 | mds_lock.Unlock(); | |
516 | ||
517 | // Set up admin socket before taking mds_lock, so that ordering | |
518 | // is consistent (later we take mds_lock within asok callbacks) | |
519 | set_up_admin_socket(); | |
520 | g_conf->add_observer(this); | |
521 | mds_lock.Lock(); | |
522 | if (beacon.get_want_state() == MDSMap::STATE_DNE) { | |
523 | suicide(); // we could do something more graceful here | |
524 | dout(4) << __func__ << ": terminated already, dropping out" << dendl; | |
525 | mds_lock.Unlock(); | |
526 | return 0; | |
527 | } | |
528 | ||
529 | timer.init(); | |
530 | ||
531 | beacon.init(mdsmap); | |
532 | messenger->set_myname(entity_name_t::MDS(MDS_RANK_NONE)); | |
533 | ||
534 | // schedule tick | |
535 | reset_tick(); | |
536 | mds_lock.Unlock(); | |
537 | ||
538 | return 0; | |
539 | } | |
540 | ||
541 | void MDSDaemon::reset_tick() | |
542 | { | |
543 | // cancel old | |
544 | if (tick_event) timer.cancel_event(tick_event); | |
545 | ||
546 | // schedule | |
3efd9988 FG |
547 | tick_event = timer.add_event_after( |
548 | g_conf->mds_tick_interval, | |
549 | new FunctionContext([this](int) { | |
550 | assert(mds_lock.is_locked_by_me()); | |
551 | tick(); | |
552 | })); | |
7c673cae FG |
553 | } |
554 | ||
555 | void MDSDaemon::tick() | |
556 | { | |
7c673cae FG |
557 | // reschedule |
558 | reset_tick(); | |
559 | ||
560 | // Call through to subsystems' tick functions | |
561 | if (mds_rank) { | |
562 | mds_rank->tick(); | |
563 | } | |
564 | } | |
565 | ||
566 | void MDSDaemon::send_command_reply(MCommand *m, MDSRank *mds_rank, | |
567 | int r, bufferlist outbl, | |
94b18763 | 568 | boost::string_view outs) |
7c673cae FG |
569 | { |
570 | Session *session = static_cast<Session *>(m->get_connection()->get_priv()); | |
571 | assert(session != NULL); | |
572 | // If someone is using a closed session for sending commands (e.g. | |
573 | // the ceph CLI) then we should feel free to clean up this connection | |
574 | // as soon as we've sent them a response. | |
94b18763 FG |
575 | const bool live_session = |
576 | session->get_state_seq() > 0 && | |
577 | mds_rank && | |
578 | mds_rank->sessionmap.get_session(session->info.inst.name); | |
7c673cae FG |
579 | |
580 | if (!live_session) { | |
581 | // This session only existed to issue commands, so terminate it | |
582 | // as soon as we can. | |
583 | assert(session->is_closed()); | |
584 | session->connection->mark_disposable(); | |
7c673cae | 585 | } |
94b18763 | 586 | session->put(); |
7c673cae FG |
587 | |
588 | MCommandReply *reply = new MCommandReply(r, outs); | |
589 | reply->set_tid(m->get_tid()); | |
590 | reply->set_data(outbl); | |
591 | m->get_connection()->send_message(reply); | |
592 | } | |
593 | ||
594 | /* This function DOES put the passed message before returning*/ | |
595 | void MDSDaemon::handle_command(MCommand *m) | |
596 | { | |
597 | Session *session = static_cast<Session *>(m->get_connection()->get_priv()); | |
598 | assert(session != NULL); | |
599 | ||
600 | int r = 0; | |
601 | cmdmap_t cmdmap; | |
602 | std::stringstream ss; | |
603 | std::string outs; | |
604 | bufferlist outbl; | |
605 | Context *run_after = NULL; | |
606 | bool need_reply = true; | |
607 | ||
608 | if (!session->auth_caps.allow_all()) { | |
609 | dout(1) << __func__ | |
610 | << ": received command from client without `tell` capability: " | |
611 | << m->get_connection()->peer_addr << dendl; | |
612 | ||
613 | ss << "permission denied"; | |
614 | r = -EPERM; | |
615 | } else if (m->cmd.empty()) { | |
616 | r = -EINVAL; | |
617 | ss << "no command given"; | |
618 | outs = ss.str(); | |
619 | } else if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { | |
620 | r = -EINVAL; | |
621 | outs = ss.str(); | |
622 | } else { | |
623 | r = _handle_command(cmdmap, m, &outbl, &outs, &run_after, &need_reply); | |
624 | } | |
94b18763 | 625 | session->put(); |
7c673cae FG |
626 | |
627 | if (need_reply) { | |
628 | send_command_reply(m, mds_rank, r, outbl, outs); | |
629 | } | |
630 | ||
631 | if (run_after) { | |
632 | run_after->complete(0); | |
633 | } | |
634 | ||
635 | m->put(); | |
636 | } | |
637 | ||
638 | ||
639 | struct MDSCommand { | |
640 | string cmdstring; | |
641 | string helpstring; | |
642 | string module; | |
643 | string perm; | |
644 | string availability; | |
645 | } mds_commands[] = { | |
646 | ||
647 | #define COMMAND(parsesig, helptext, module, perm, availability) \ | |
648 | {parsesig, helptext, module, perm, availability}, | |
649 | ||
650 | COMMAND("injectargs " \ | |
651 | "name=injected_args,type=CephString,n=N", | |
652 | "inject configuration arguments into running MDS", | |
653 | "mds", "*", "cli,rest") | |
c07f9fc5 FG |
654 | COMMAND("config set " \ |
655 | "name=key,type=CephString name=value,type=CephString", | |
656 | "Set a configuration option at runtime (not persistent)", | |
657 | "mds", "*", "cli,rest") | |
7c673cae FG |
658 | COMMAND("exit", |
659 | "Terminate this MDS", | |
660 | "mds", "*", "cli,rest") | |
661 | COMMAND("respawn", | |
662 | "Restart this MDS", | |
663 | "mds", "*", "cli,rest") | |
664 | COMMAND("session kill " \ | |
665 | "name=session_id,type=CephInt", | |
666 | "End a client session", | |
667 | "mds", "*", "cli,rest") | |
668 | COMMAND("cpu_profiler " \ | |
669 | "name=arg,type=CephChoices,strings=status|flush", | |
670 | "run cpu profiling on daemon", "mds", "rw", "cli,rest") | |
671 | COMMAND("session ls " \ | |
672 | "name=filters,type=CephString,n=N,req=false", | |
673 | "List client sessions", "mds", "r", "cli,rest") | |
31f18b77 FG |
674 | COMMAND("client ls " \ |
675 | "name=filters,type=CephString,n=N,req=false", | |
676 | "List client sessions", "mds", "r", "cli,rest") | |
7c673cae FG |
677 | COMMAND("session evict " \ |
678 | "name=filters,type=CephString,n=N,req=false", | |
679 | "Evict client session(s)", "mds", "rw", "cli,rest") | |
31f18b77 FG |
680 | COMMAND("client evict " \ |
681 | "name=filters,type=CephString,n=N,req=false", | |
682 | "Evict client session(s)", "mds", "rw", "cli,rest") | |
7c673cae FG |
683 | COMMAND("damage ls", |
684 | "List detected metadata damage", "mds", "r", "cli,rest") | |
685 | COMMAND("damage rm name=damage_id,type=CephInt", | |
686 | "Remove a damage table entry", "mds", "rw", "cli,rest") | |
c07f9fc5 | 687 | COMMAND("version", "report version of MDS", "mds", "r", "cli,rest") |
7c673cae FG |
688 | COMMAND("heap " \ |
689 | "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \ | |
690 | "show heap usage info (available only if compiled with tcmalloc)", \ | |
691 | "mds", "*", "cli,rest") | |
692 | }; | |
693 | ||
694 | ||
695 | int MDSDaemon::_handle_command( | |
696 | const cmdmap_t &cmdmap, | |
697 | MCommand *m, | |
698 | bufferlist *outbl, | |
699 | std::string *outs, | |
700 | Context **run_later, | |
701 | bool *need_reply) | |
702 | { | |
703 | assert(outbl != NULL); | |
704 | assert(outs != NULL); | |
705 | ||
706 | class SuicideLater : public Context | |
707 | { | |
708 | MDSDaemon *mds; | |
709 | ||
710 | public: | |
711 | explicit SuicideLater(MDSDaemon *mds_) : mds(mds_) {} | |
712 | void finish(int r) override { | |
713 | // Wait a little to improve chances of caller getting | |
714 | // our response before seeing us disappear from mdsmap | |
715 | sleep(1); | |
716 | ||
717 | mds->suicide(); | |
718 | } | |
719 | }; | |
720 | ||
721 | ||
722 | class RespawnLater : public Context | |
723 | { | |
724 | MDSDaemon *mds; | |
725 | ||
726 | public: | |
727 | ||
728 | explicit RespawnLater(MDSDaemon *mds_) : mds(mds_) {} | |
729 | void finish(int r) override { | |
730 | // Wait a little to improve chances of caller getting | |
731 | // our response before seeing us disappear from mdsmap | |
732 | sleep(1); | |
733 | ||
734 | mds->respawn(); | |
735 | } | |
736 | }; | |
737 | ||
738 | std::stringstream ds; | |
739 | std::stringstream ss; | |
740 | std::string prefix; | |
c07f9fc5 FG |
741 | std::string format; |
742 | std::unique_ptr<Formatter> f(Formatter::create(format)); | |
7c673cae FG |
743 | cmd_getval(cct, cmdmap, "prefix", prefix); |
744 | ||
745 | int r = 0; | |
746 | ||
747 | if (prefix == "get_command_descriptions") { | |
748 | int cmdnum = 0; | |
c07f9fc5 | 749 | std::unique_ptr<JSONFormatter> f(ceph::make_unique<JSONFormatter>()); |
7c673cae FG |
750 | f->open_object_section("command_descriptions"); |
751 | for (MDSCommand *cp = mds_commands; | |
752 | cp < &mds_commands[ARRAY_SIZE(mds_commands)]; cp++) { | |
753 | ||
754 | ostringstream secname; | |
755 | secname << "cmd" << setfill('0') << std::setw(3) << cmdnum; | |
c07f9fc5 | 756 | dump_cmddesc_to_json(f.get(), secname.str(), cp->cmdstring, cp->helpstring, |
7c673cae FG |
757 | cp->module, cp->perm, cp->availability, 0); |
758 | cmdnum++; | |
759 | } | |
760 | f->close_section(); // command_descriptions | |
761 | ||
762 | f->flush(ds); | |
c07f9fc5 FG |
763 | goto out; |
764 | } | |
765 | ||
766 | cmd_getval(cct, cmdmap, "format", format); | |
767 | if (prefix == "version") { | |
768 | if (f) { | |
769 | f->open_object_section("version"); | |
770 | f->dump_string("version", pretty_version_to_str()); | |
771 | f->close_section(); | |
772 | f->flush(ds); | |
773 | } else { | |
774 | ds << pretty_version_to_str(); | |
775 | } | |
7c673cae FG |
776 | } else if (prefix == "injectargs") { |
777 | vector<string> argsvec; | |
778 | cmd_getval(cct, cmdmap, "injected_args", argsvec); | |
779 | ||
780 | if (argsvec.empty()) { | |
781 | r = -EINVAL; | |
782 | ss << "ignoring empty injectargs"; | |
783 | goto out; | |
784 | } | |
785 | string args = argsvec.front(); | |
786 | for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a) | |
787 | args += " " + *a; | |
788 | r = cct->_conf->injectargs(args, &ss); | |
c07f9fc5 FG |
789 | } else if (prefix == "config set") { |
790 | std::string key; | |
791 | cmd_getval(cct, cmdmap, "key", key); | |
792 | std::string val; | |
793 | cmd_getval(cct, cmdmap, "value", val); | |
794 | r = cct->_conf->set_val(key, val, true, &ss); | |
d2e6a577 FG |
795 | if (r == 0) { |
796 | cct->_conf->apply_changes(nullptr); | |
797 | } | |
7c673cae FG |
798 | } else if (prefix == "exit") { |
799 | // We will send response before executing | |
800 | ss << "Exiting..."; | |
801 | *run_later = new SuicideLater(this); | |
c07f9fc5 | 802 | } else if (prefix == "respawn") { |
7c673cae FG |
803 | // We will send response before executing |
804 | ss << "Respawning..."; | |
805 | *run_later = new RespawnLater(this); | |
806 | } else if (prefix == "session kill") { | |
807 | if (mds_rank == NULL) { | |
808 | r = -EINVAL; | |
809 | ss << "MDS not active"; | |
810 | goto out; | |
811 | } | |
812 | // FIXME harmonize `session kill` with admin socket session evict | |
813 | int64_t session_id = 0; | |
814 | bool got = cmd_getval(cct, cmdmap, "session_id", session_id); | |
815 | assert(got); | |
31f18b77 FG |
816 | bool killed = mds_rank->evict_client(session_id, false, |
817 | g_conf->mds_session_blacklist_on_evict, | |
818 | ss); | |
7c673cae FG |
819 | if (!killed) |
820 | r = -ENOENT; | |
821 | } else if (prefix == "heap") { | |
822 | if (!ceph_using_tcmalloc()) { | |
823 | r = -EOPNOTSUPP; | |
824 | ss << "could not issue heap profiler command -- not using tcmalloc!"; | |
825 | } else { | |
826 | string heapcmd; | |
827 | cmd_getval(cct, cmdmap, "heapcmd", heapcmd); | |
828 | vector<string> heapcmd_vec; | |
829 | get_str_vec(heapcmd, heapcmd_vec); | |
830 | ceph_heap_profiler_handle_command(heapcmd_vec, ds); | |
831 | } | |
832 | } else if (prefix == "cpu_profiler") { | |
833 | string arg; | |
834 | cmd_getval(cct, cmdmap, "arg", arg); | |
835 | vector<string> argvec; | |
836 | get_str_vec(arg, argvec); | |
837 | cpu_profiler_handle_command(argvec, ds); | |
838 | } else { | |
839 | // Give MDSRank a shot at the command | |
b32b8144 FG |
840 | if (!mds_rank) { |
841 | ss << "MDS not active"; | |
842 | r = -EINVAL; | |
843 | } | |
844 | else { | |
7c673cae FG |
845 | bool handled = mds_rank->handle_command(cmdmap, m, &r, &ds, &ss, |
846 | need_reply); | |
b32b8144 FG |
847 | if (!handled) { |
848 | // MDSDaemon doesn't know this command | |
849 | ss << "unrecognized command! " << prefix; | |
850 | r = -EINVAL; | |
7c673cae FG |
851 | } |
852 | } | |
7c673cae FG |
853 | } |
854 | ||
855 | out: | |
856 | *outs = ss.str(); | |
857 | outbl->append(ds); | |
858 | return r; | |
859 | } | |
860 | ||
/* This function deletes the passed message before returning. */

/**
 * Process an incoming MDSMap epoch from the monitors (or another MDS).
 *
 * Replaces the daemon's current map with the newly decoded one, marks
 * down peers that disappeared from the map, and then either:
 *  - suicides (incompatible compatset, or our name was taken by a
 *    higher-gid daemon),
 *  - respawns (we lost our rank, or the map tries to move us to a
 *    different rank),
 *  - handles the map ranklessly via _handle_mds_map(), or
 *  - creates/forwards to the MDSRank dispatcher.
 *
 * Ownership: always consumes (puts) @m, and deletes the previous map.
 */
void MDSDaemon::handle_mds_map(MMDSMap *m)
{
  version_t epoch = m->get_epoch();
  dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << dendl;

  // is it new?  Stale epochs are simply dropped.
  if (epoch <= mdsmap->get_epoch()) {
    dout(5) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch()
            << ", discarding" << dendl;
    m->put();
    return;
  }

  entity_addr_t addr;

  // keep old map, for a moment (deleted at the end of this function)
  MDSMap *oldmap = mdsmap;

  // decode and process
  mdsmap = new MDSMap;
  mdsmap->decode(m->get_encoded());
  const MDSMap::DaemonState new_state = mdsmap->get_state_gid(mds_gid_t(monc->get_global_id()));
  const int incarnation = mdsmap->get_inc_gid(mds_gid_t(monc->get_global_id()));

  // acknowledge this epoch to the monitor subscription machinery
  monc->sub_got("mdsmap", mdsmap->get_epoch());

  // Calculate my effective rank (either my owned rank or my
  // standby_for_rank if in standby replay)
  mds_rank_t whoami = mdsmap->get_rank_gid(mds_gid_t(monc->get_global_id()));

  // verify compatset: if the cluster map requires features we lack,
  // we cannot safely participate and must die.
  CompatSet mdsmap_compat(get_mdsmap_compat_set_all());
  dout(10) << " my compat " << mdsmap_compat << dendl;
  dout(10) << " mdsmap compat " << mdsmap->compat << dendl;
  if (!mdsmap_compat.writeable(mdsmap->compat)) {
    dout(0) << "handle_mds_map mdsmap compatset " << mdsmap->compat
            << " not writeable with daemon features " << mdsmap_compat
            << ", killing myself" << dendl;
    suicide();
    goto out;
  }

  // mark down any failed peers (gids present in the old map but gone
  // from the new one), so the messenger drops their connections.
  for (map<mds_gid_t,MDSMap::mds_info_t>::const_iterator p = oldmap->get_mds_info().begin();
       p != oldmap->get_mds_info().end();
       ++p) {
    if (mdsmap->get_mds_info().count(p->first) == 0) {
      dout(10) << " peer mds gid " << p->first << " removed from map" << dendl;
      messenger->mark_down(p->second.addr);
    }
  }

  // In standby-replay we have no owned rank; follow the rank we replay.
  if (whoami == MDS_RANK_NONE &&
      new_state == MDSMap::STATE_STANDBY_REPLAY) {
    whoami = mdsmap->get_mds_info_gid(mds_gid_t(monc->get_global_id())).standby_for_rank;
  }

  // see who i am
  addr = messenger->get_myaddr();
  dout(10) << "map says I am " << addr << " mds." << whoami << "." << incarnation
           << " state " << ceph_mds_state_name(new_state) << dendl;

  if (whoami == MDS_RANK_NONE) {
    if (mds_rank != NULL) {
      const auto myid = monc->get_global_id();
      // We have entered a rank-holding state, we shouldn't be back
      // here!
      if (g_conf->mds_enforce_unique_name) {
        if (mds_gid_t existing = mdsmap->find_mds_gid_by_name(name)) {
          const MDSMap::mds_info_t& i = mdsmap->get_info_gid(existing);
          if (i.global_id > myid) {
            dout(1) << "map replaced me with another mds." << whoami
                    << " with gid (" << i.global_id << ") larger than myself ("
                    << myid << "); quitting!" << dendl;
            // Call suicide() rather than respawn() because if someone else
            // has taken our ID, we don't want to keep restarting and
            // fighting them for the ID.
            suicide();
            m->put();
            return;
          }
        }
      }

      dout(1) << "map removed me (mds." << whoami << " gid:"
              << myid << ") from cluster due to lost contact; respawning" << dendl;
      respawn();
    }
    // MDSRank not active: process the map here to see if we have
    // been assigned a rank.
    dout(10) << __func__ << ": handling map in rankless mode" << dendl;
    _handle_mds_map(oldmap);
  } else {

    // Did we already hold a different rank? MDSMonitor shouldn't try
    // to change that out from under me!
    if (mds_rank && whoami != mds_rank->get_nodeid()) {
      derr << "Invalid rank transition " << mds_rank->get_nodeid() << "->"
           << whoami << dendl;
      respawn();
    }

    // Did I previously not hold a rank? Initialize!
    if (mds_rank == NULL) {
      mds_rank = new MDSRankDispatcher(whoami, mds_lock, clog,
          timer, beacon, mdsmap, messenger, monc,
          new FunctionContext([this](int r){respawn();}),
          new FunctionContext([this](int r){suicide();}));
      dout(10) << __func__ << ": initializing MDS rank "
               << mds_rank->get_nodeid() << dendl;
      mds_rank->init();
    }

    // MDSRank is active: let him process the map, we have no say.
    dout(10) << __func__ << ": handling map as rank "
             << mds_rank->get_nodeid() << dendl;
    mds_rank->handle_mds_map(m, oldmap);
  }

out:
  beacon.notify_mdsmap(mdsmap);
  m->put();
  delete oldmap;
}
987 | ||
988 | void MDSDaemon::_handle_mds_map(MDSMap *oldmap) | |
989 | { | |
990 | MDSMap::DaemonState new_state = mdsmap->get_state_gid(mds_gid_t(monc->get_global_id())); | |
991 | ||
992 | // Normal rankless case, we're marked as standby | |
993 | if (new_state == MDSMap::STATE_STANDBY) { | |
994 | beacon.set_want_state(mdsmap, new_state); | |
995 | dout(1) << "handle_mds_map standby" << dendl; | |
996 | ||
997 | return; | |
998 | } | |
999 | ||
1000 | // Case where we thought we were standby, but MDSMap disagrees | |
1001 | if (beacon.get_want_state() == MDSMap::STATE_STANDBY) { | |
1002 | dout(10) << "dropped out of mdsmap, try to re-add myself" << dendl; | |
1003 | new_state = MDSMap::STATE_BOOT; | |
1004 | beacon.set_want_state(mdsmap, new_state); | |
1005 | return; | |
1006 | } | |
1007 | ||
1008 | // Case where we have sent a boot beacon that isn't reflected yet | |
1009 | if (beacon.get_want_state() == MDSMap::STATE_BOOT) { | |
1010 | dout(10) << "not in map yet" << dendl; | |
1011 | } | |
1012 | } | |
1013 | ||
1014 | void MDSDaemon::handle_signal(int signum) | |
1015 | { | |
1016 | assert(signum == SIGINT || signum == SIGTERM); | |
1017 | derr << "*** got signal " << sig_str(signum) << " ***" << dendl; | |
1018 | { | |
1019 | Mutex::Locker l(mds_lock); | |
1020 | if (stopping) { | |
1021 | return; | |
1022 | } | |
1023 | suicide(); | |
1024 | } | |
1025 | } | |
1026 | ||
/**
 * Orderly shutdown of this daemon.
 *
 * Caller must hold mds_lock and must not call this twice.  Cancels the
 * tick timer, tears down the admin socket / config observer, tells the
 * monitors we are going away (state DNE), then shuts down either the
 * MDSRank (if we hold one) or the bare timer/monc/messenger.
 * Note: unlike respawn(), this function returns to the caller.
 */
void MDSDaemon::suicide()
{
  assert(mds_lock.is_locked());

  // make sure we don't suicide twice
  assert(stopping == false);
  stopping = true;

  dout(1) << "suicide. wanted state "
          << ceph_mds_state_name(beacon.get_want_state()) << dendl;

  // stop the periodic tick, if scheduled
  if (tick_event) {
    timer.cancel_event(tick_event);
    tick_event = 0;
  }

  //because add_observer is called after set_up_admin_socket
  //so we can use asok_hook to avoid assert in the remove_observer
  if (asok_hook != NULL)
    g_conf->remove_observer(this);

  clean_up_admin_socket();

  // Inform MDS we are going away, then shut down beacon
  beacon.set_want_state(mdsmap, MDSMap::STATE_DNE);
  if (!mdsmap->is_dne_gid(mds_gid_t(monc->get_global_id()))) {
    // Notify the MDSMonitor that we're dying, so that it doesn't have to
    // wait for us to go laggy. Only do this if we're actually in the
    // MDSMap, because otherwise the MDSMonitor will drop our message.
    beacon.send_and_wait(1);
  }
  beacon.shutdown();

  mgrc.shutdown();

  if (mds_rank) {
    // the rank owns (and will shut down) timer/monc/messenger usage
    mds_rank->shutdown();
  } else {
    timer.shutdown();

    monc->shutdown();
    messenger->shutdown();
  }
}
1071 | ||
/**
 * Replace the current process image by re-exec'ing our own binary.
 *
 * Rebuilds the original argv, resolves the executable path (preferring
 * Linux /proc/self/exe so we can restart even if the on-disk binary was
 * unlinked), then calls execv().  Never returns: if execv() fails we
 * abort, because callers rely on respawn() not returning.
 */
void MDSDaemon::respawn()
{
  dout(1) << "respawn" << dendl;

  // Rebuild a NULL-terminated copy of the original command line.
  char *new_argv[orig_argc+1];
  dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
  for (int i=0; i<orig_argc; i++) {
    new_argv[i] = (char *)orig_argv[i];
    dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl;
  }
  new_argv[orig_argc] = NULL;

  /* Determine the path to our executable, test if Linux /proc/self/exe exists.
   * This allows us to exec the same executable even if it has since been
   * unlinked.
   */
  char exe_path[PATH_MAX] = "";
  if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) == -1) {
    /* Print CWD for the user's interest */
    char buf[PATH_MAX];
    char *cwd = getcwd(buf, sizeof(buf));
    assert(cwd);
    dout(1) << " cwd " << cwd << dendl;

    /* Fall back to a best-effort: just running in our CWD */
    strncpy(exe_path, orig_argv[0], PATH_MAX-1);
  } else {
    dout(1) << "respawning with exe " << exe_path << dendl;
    // Exec the magic symlink itself rather than the resolved target
    // that was just logged.  NOTE(review): presumably intentional so a
    // binary replaced on disk is picked up -- confirm against history.
    strcpy(exe_path, PROCPREFIX "/proc/self/exe");
  }

  dout(1) << " exe_path " << exe_path << dendl;

  unblock_all_signals(NULL);
  execv(exe_path, new_argv);

  // Only reached if execv() failed.
  dout(0) << "respawn execv " << orig_argv[0]
          << " failed with " << cpp_strerror(errno) << dendl;

  // We have to assert out here, because suicide() returns, and callers
  // to respawn expect it never to return.
  ceph_abort();
}
1115 | ||
1116 | ||
1117 | ||
1118 | bool MDSDaemon::ms_dispatch(Message *m) | |
1119 | { | |
1120 | Mutex::Locker l(mds_lock); | |
1121 | if (stopping) { | |
1122 | return false; | |
1123 | } | |
1124 | ||
1125 | // Drop out early if shutting down | |
1126 | if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) { | |
1127 | dout(10) << " stopping, discarding " << *m << dendl; | |
1128 | m->put(); | |
1129 | return true; | |
1130 | } | |
1131 | ||
1132 | // First see if it's a daemon message | |
1133 | const bool handled_core = handle_core_message(m); | |
1134 | if (handled_core) { | |
1135 | return true; | |
1136 | } | |
1137 | ||
1138 | // Not core, try it as a rank message | |
1139 | if (mds_rank) { | |
1140 | return mds_rank->ms_dispatch(m); | |
1141 | } else { | |
1142 | return false; | |
1143 | } | |
1144 | } | |
1145 | ||
1146 | bool MDSDaemon::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) | |
1147 | { | |
1148 | dout(10) << "MDSDaemon::ms_get_authorizer type=" | |
1149 | << ceph_entity_type_name(dest_type) << dendl; | |
1150 | ||
1151 | /* monitor authorization is being handled on different layer */ | |
1152 | if (dest_type == CEPH_ENTITY_TYPE_MON) | |
1153 | return true; | |
1154 | ||
1155 | if (force_new) { | |
1156 | if (monc->wait_auth_rotating(10) < 0) | |
1157 | return false; | |
1158 | } | |
1159 | ||
1160 | *authorizer = monc->build_authorizer(dest_type); | |
1161 | return *authorizer != NULL; | |
1162 | } | |
1163 | ||
1164 | ||
/*
 * high priority messages we always process
 */
/**
 * Handle daemon-level (non-rank) messages.
 *
 * Returns true if the message was consumed here (ownership passes to
 * the handler or is put()); false if the caller should offer it to the
 * rank dispatcher instead.
 * NOTE(review): ALLOW_MESSAGES_FROM presumably validates the peer's
 * entity type -- its definition is outside this file; confirm there.
 */
bool MDSDaemon::handle_core_message(Message *m)
{
  switch (m->get_type()) {
  case CEPH_MSG_MON_MAP:
    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
    // mon map is not acted on here; just drop our ref
    m->put();
    break;

  // MDS
  case CEPH_MSG_MDS_MAP:
    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS);
    // handle_mds_map consumes the message
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;

  // OSD
  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    break;
  case CEPH_MSG_OSD_MAP:
    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);

    // only the rank cares about OSD map changes
    if (mds_rank) {
      mds_rank->handle_osd_map();
    }
    m->put();
    break;

  case MSG_MON_COMMAND:
    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
    clog->warn() << "dropping `mds tell` command from legacy monitor";
    m->put();
    break;

  default:
    // not a core message; caller retains ownership
    return false;
  }
  return true;
}
1206 | ||
/**
 * Messenger callback for a newly established outgoing connection.
 * Intentionally a no-op: the MDS takes no action on connect.
 */
void MDSDaemon::ms_handle_connect(Connection *con)
{
}
1210 | ||
1211 | bool MDSDaemon::ms_handle_reset(Connection *con) | |
1212 | { | |
1213 | if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT) | |
1214 | return false; | |
1215 | ||
1216 | Mutex::Locker l(mds_lock); | |
1217 | if (stopping) { | |
1218 | return false; | |
1219 | } | |
1220 | dout(5) << "ms_handle_reset on " << con->get_peer_addr() << dendl; | |
1221 | if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) | |
1222 | return false; | |
1223 | ||
1224 | Session *session = static_cast<Session *>(con->get_priv()); | |
1225 | if (session) { | |
1226 | if (session->is_closed()) { | |
1227 | dout(3) << "ms_handle_reset closing connection for session " << session->info.inst << dendl; | |
1228 | con->mark_down(); | |
1229 | con->set_priv(NULL); | |
1230 | } | |
1231 | session->put(); | |
1232 | } else { | |
1233 | con->mark_down(); | |
1234 | } | |
1235 | return false; | |
1236 | } | |
1237 | ||
1238 | ||
1239 | void MDSDaemon::ms_handle_remote_reset(Connection *con) | |
1240 | { | |
1241 | if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT) | |
1242 | return; | |
1243 | ||
1244 | Mutex::Locker l(mds_lock); | |
1245 | if (stopping) { | |
1246 | return; | |
1247 | } | |
1248 | ||
1249 | dout(5) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl; | |
1250 | if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) | |
1251 | return; | |
1252 | ||
1253 | Session *session = static_cast<Session *>(con->get_priv()); | |
1254 | if (session) { | |
1255 | if (session->is_closed()) { | |
1256 | dout(3) << "ms_handle_remote_reset closing connection for session " << session->info.inst << dendl; | |
1257 | con->mark_down(); | |
1258 | con->set_priv(NULL); | |
1259 | } | |
1260 | session->put(); | |
1261 | } | |
1262 | } | |
1263 | ||
/**
 * Messenger callback: our connection attempt was actively refused.
 * No recovery action is taken; returning false leaves the default
 * messenger behavior in place.
 */
bool MDSDaemon::ms_handle_refused(Connection *con)
{
  // do nothing for now
  return false;
}
1269 | ||
/**
 * Verify an incoming connection's authorizer and attach a Session.
 *
 * Selects the auth handler by peer type, verifies the authorizer
 * against our rotating keys, and on success wires a Session* (new or
 * existing) onto the connection and parses the peer's MDS caps.
 *
 * Returns true when a decision was made; the actual accept/deny result
 * is reported through @is_valid.  Note that even a denied authorizer
 * yields a `true` return here.
 */
bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
                               int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
                               bool& is_valid, CryptoKey& session_key,
                               std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
  Mutex::Locker l(mds_lock);
  if (stopping) {
    return false;
  }
  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE)
    return false;

  // Cluster-internal (MDS) peers use the cluster registry; everyone
  // else goes through the service registry.
  AuthAuthorizeHandler *authorize_handler = 0;
  switch (peer_type) {
  case CEPH_ENTITY_TYPE_MDS:
    authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
    break;
  default:
    authorize_handler = authorize_handler_service_registry->get_handler(protocol);
  }
  if (!authorize_handler) {
    dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
    is_valid = false;
    return true;
  }

  AuthCapsInfo caps_info;
  EntityName name;
  uint64_t global_id;

  // Without rotating keys (early in startup) we cannot verify anything.
  RotatingKeyRing *keys = monc->rotating_secrets.get();
  if (keys) {
    is_valid = authorize_handler->verify_authorizer(
      cct, keys,
      authorizer_data, authorizer_reply, name, global_id, caps_info,
      session_key, nullptr, challenge);
  } else {
    dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
    is_valid = false;
  }

  if (is_valid) {
    entity_name_t n(con->get_peer_type(), global_id);

    // We allow connections and assign Session instances to connections
    // even if we have not been assigned a rank, because clients with
    // "allow *" are allowed to connect and do 'tell' operations before
    // we have a rank.
    Session *s = NULL;
    if (mds_rank) {
      // If we do hold a rank, see if this is an existing client establishing
      // a new connection, rather than a new client
      s = mds_rank->sessionmap.get_session(n);
    }

    // Wire up a Session* to this connection
    // It doesn't go into a SessionMap instance until it sends an explicit
    // request to open a session (initial state of Session is `closed`)
    if (!s) {
      s = new Session;
      s->info.auth_name = name;
      s->info.inst.addr = con->get_peer_addr();
      s->info.inst.name = n;
      dout(10) << " new session " << s << " for " << s->info.inst << " con " << con << dendl;
      con->set_priv(s);
      s->connection = con;
      if (mds_rank) {
        mds_rank->kick_waiters_for_any_client_connection();
      }
    } else {
      dout(10) << " existing session " << s << " for " << s->info.inst << " existing con " << s->connection
               << ", new/authorizing con " << con << dendl;
      con->set_priv(s->get());



      // Wait until we fully accept the connection before setting
      // s->connection.  In particular, if there are multiple incoming
      // connection attempts, they will all get their authorizer
      // validated, but some of them may "lose the race" and get
      // dropped.  We only want to consider the winner(s).  See
      // ms_handle_accept().  This is important for Sessions we replay
      // from the journal on recovery that don't have established
      // messenger state; we want the con from only the winning
      // connect attempt(s).  (Normal reconnects that don't follow MDS
      // recovery are reconnected to the existing con by the
      // messenger.)
    }

    if (caps_info.allow_all) {
      // Flag for auth providers that don't provide cap strings
      s->auth_caps.set_allow_all();
    } else {
      // Parse the cap string carried by the auth provider.
      bufferlist::iterator p = caps_info.caps.begin();
      string auth_cap_str;
      try {
        ::decode(auth_cap_str, p);

        dout(10) << __func__ << ": parsing auth_cap_str='" << auth_cap_str << "'" << dendl;
        std::ostringstream errstr;
        if (!s->auth_caps.parse(g_ceph_context, auth_cap_str, &errstr)) {
          dout(1) << __func__ << ": auth cap parse error: " << errstr.str()
                  << " parsing '" << auth_cap_str << "'" << dendl;
          clog->warn() << name << " mds cap '" << auth_cap_str
                       << "' does not parse: " << errstr.str();
          is_valid = false;
        }
      } catch (buffer::error& e) {
        // Assume legacy auth, defaults to:
        //  * permit all filesystem ops
        //  * permit no `tell` ops
        dout(1) << __func__ << ": cannot decode auth caps bl of length " << caps_info.caps.length() << dendl;
        is_valid = false;
      }
    }
  }

  return true;  // we made a decision (see is_valid)
}
1389 | ||
1390 | ||
1391 | void MDSDaemon::ms_handle_accept(Connection *con) | |
1392 | { | |
1393 | Mutex::Locker l(mds_lock); | |
1394 | if (stopping) { | |
1395 | return; | |
1396 | } | |
1397 | ||
1398 | Session *s = static_cast<Session *>(con->get_priv()); | |
1399 | dout(10) << "ms_handle_accept " << con->get_peer_addr() << " con " << con << " session " << s << dendl; | |
1400 | if (s) { | |
1401 | if (s->connection != con) { | |
1402 | dout(10) << " session connection " << s->connection << " -> " << con << dendl; | |
1403 | s->connection = con; | |
1404 | ||
1405 | // send out any queued messages | |
1406 | while (!s->preopen_out_queue.empty()) { | |
1407 | con->send_message(s->preopen_out_queue.front()); | |
1408 | s->preopen_out_queue.pop_front(); | |
1409 | } | |
1410 | } | |
1411 | s->put(); | |
1412 | } | |
1413 | } | |
1414 | ||
1415 | bool MDSDaemon::is_clean_shutdown() | |
1416 | { | |
1417 | if (mds_rank) { | |
1418 | return mds_rank->is_stopped(); | |
1419 | } else { | |
1420 | return true; | |
1421 | } | |
1422 | } |