// ceph/src/mds/MDSRank.h -- from ceph.git (sources updated to 12.2.7)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef MDS_RANK_H_
16 #define MDS_RANK_H_
17
18 #include <boost/utility/string_view.hpp>
19
20 #include "common/DecayCounter.h"
21 #include "common/LogClient.h"
22 #include "common/Timer.h"
23 #include "common/TrackedOp.h"
24
25 #include "messages/MCommand.h"
26
27 #include "Beacon.h"
28 #include "DamageTable.h"
29 #include "MDSMap.h"
30 #include "SessionMap.h"
31 #include "MDCache.h"
32 #include "Migrator.h"
33 #include "MDLog.h"
34 #include "PurgeQueue.h"
35 #include "osdc/Journaler.h"
36
37 // Full .h import instead of forward declaration for PerfCounter, for the
38 // benefit of those including this header and using MDSRank::logger
39 #include "common/perf_counters.h"
40
// Perf counter indices for the per-rank "mds" PerfCounters instance
// (MDSRank::logger).  Enumerator values are implicitly consecutive
// between l_mds_first and l_mds_last, so append new counters just
// before l_mds_last and never reorder existing ones -- their numeric
// values are part of the exported stats.
enum {
  l_mds_first = 2000,
  l_mds_request,
  l_mds_reply,
  l_mds_reply_latency,
  l_mds_forward,
  l_mds_dir_fetch,
  l_mds_dir_commit,
  l_mds_dir_split,
  l_mds_dir_merge,
  l_mds_inode_max,
  l_mds_inodes,
  l_mds_inodes_top,
  l_mds_inodes_bottom,
  l_mds_inodes_pin_tail,
  l_mds_inodes_pinned,
  l_mds_inodes_expired,
  l_mds_inodes_with_caps,
  l_mds_caps,
  l_mds_subtrees,
  l_mds_traverse,
  l_mds_traverse_hit,
  l_mds_traverse_forward,
  l_mds_traverse_discover,
  l_mds_traverse_dir_fetch,
  l_mds_traverse_remote_ino,
  l_mds_traverse_lock,
  l_mds_load_cent,
  l_mds_dispatch_queue_len,
  l_mds_exported,
  l_mds_exported_inodes,
  l_mds_imported,
  l_mds_imported_inodes,
  l_mds_last,
};
76
// memory utilization
// Perf counter indices for the memory-utilization PerfCounters
// instance (presumably MDSRank::mlogger, refreshed by
// update_mlogger() -- confirm against MDSRank.cc).  Values are
// implicitly consecutive between l_mdm_first and l_mdm_last; append
// new counters before l_mdm_last rather than reordering.
enum {
  l_mdm_first = 2500,
  l_mdm_ino,
  l_mdm_inoa,
  l_mdm_inos,
  l_mdm_dir,
  l_mdm_dira,
  l_mdm_dirs,
  l_mdm_dn,
  l_mdm_dna,
  l_mdm_dns,
  l_mdm_cap,
  l_mdm_capa,
  l_mdm_caps,
  l_mdm_rss,
  l_mdm_heap,
  l_mdm_buf,
  l_mdm_last,
};
97
namespace ceph {
  struct heartbeat_handle_d;
}

// Forward declarations for the subsystems and service objects that
// MDSRank references only by pointer/reference below; their full
// definitions live in their own headers and are not needed here.
class Server;
class Locker;
class MDCache;
class MDLog;
class MDBalancer;
class InoTable;
class SnapServer;
class SnapClient;
class MDSTableServer;
class MDSTableClient;
class Messenger;
class Objecter;
class MonClient;
class Finisher;
class MMDSMap;
class ScrubStack;
118
119 /**
120 * The public part of this class's interface is what's exposed to all
121 * the various subsystems (server, mdcache, etc), such as pointers
122 * to the other subsystems, and message-sending calls.
123 */
class MDSRank {
  protected:
    // The rank this daemon instance holds in the MDS cluster;
    // immutable for the lifetime of this object.
    const mds_rank_t whoami;

    // Incarnation as seen in MDSMap at the point where a rank is
    // assigned.
    int incarnation;

  public:
    mds_rank_t get_nodeid() const { return whoami; }
    int64_t get_metadata_pool();

    // Reference to global MDS::mds_lock, so that users of MDSRank don't
    // carry around references to the outer MDS, and we can substitute
    // a separate lock here in future potentially.
    Mutex &mds_lock;

    // Monotonic time captured at rank startup (see `starttime` below).
    mono_time get_starttime() const {
      return starttime;
    }
    // Seconds elapsed since `starttime`, as a floating-point duration.
    chrono::duration<double> get_uptime() const {
      mono_time now = mono_clock::now();
      return chrono::duration<double>(now-starttime);
    }

    // Process-wide Ceph context (configuration, logging infrastructure).
    class CephContext *cct;

    bool is_daemon_stopping() const;

    // Reference to global cluster log client, just to avoid initialising
    // a separate one here.
    LogChannelRef &clog;

    // Reference to global timer utility, because MDSRank and MDSDaemon
    // currently both use the same mds_lock, so it makes sense for them
    // to share a timer.
    SafeTimer &timer;

    // Reference to the map pointer owned outside this class (note the
    // *&), so we always observe the owner's latest MDSMap.
    MDSMap *&mdsmap;

    // Interface for submitting operations to RADOS OSDs.
    Objecter *objecter;

    // sub systems
    Server *server;
    MDCache *mdcache;
    Locker *locker;
    MDLog *mdlog;
    MDBalancer *balancer;
    ScrubStack *scrubstack;
    DamageTable damage_table;


    InoTable *inotable;

    SnapServer *snapserver;
    SnapClient *snapclient;

    MDSTableClient *get_table_client(int t);
    MDSTableServer *get_table_server(int t);

    // Table of client sessions held by this rank.
    SessionMap sessionmap;
    Session *get_session(client_t client) {
      return sessionmap.get_session(entity_name_t::CLIENT(client.v));
    }
    Session *get_session(Message *m);

    // Perf counters: `logger` carries the l_mds_* stats (see
    // get_num_requests below); `mlogger` presumably carries the
    // l_mdm_* memory stats via update_mlogger() -- confirm in MDSRank.cc.
    PerfCounters *logger, *mlogger;
    // Tracks in-flight operations; consumed by check_ops_in_flight().
    OpTracker op_tracker;

    // The last different state I held before current
    MDSMap::DaemonState last_state;
    // The state assigned to me by the MDSMap
    MDSMap::DaemonState state;

    // Whether the MDS cluster as a whole is currently degraded;
    // wait_for_cluster_recovered() asserts this is set.
    bool cluster_degraded;

    MDSMap::DaemonState get_state() const { return state; }
    MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }

    // Predicates over `state` mirroring the MDSMap daemon-state machine.
    bool is_creating() const { return state == MDSMap::STATE_CREATING; }
    bool is_starting() const { return state == MDSMap::STATE_STARTING; }
    bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
    bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
    bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
    bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
    bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
    bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
    bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
    bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
    bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
    bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
    bool is_stopped() const { return mdsmap->is_stopped(whoami); }
    bool is_cluster_degraded() const { return cluster_degraded; }

    void handle_write_error(int err);

    // Fan configuration changes out to the subsystems that care
    // (currently the migrator and the purge queue).
    void handle_conf_change(const struct md_config_t *conf,
                            const std::set <std::string> &changed)
    {
      mdcache->migrator->handle_conf_change(conf, changed, *mdsmap);
      purge_queue.handle_conf_change(conf, changed, *mdsmap);
    }

    void update_mlogger();
  protected:
    // Flag to indicate we entered shutdown: anyone seeing this to be true
    // after taking mds_lock must drop out.
    bool stopping;

    // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
    // because its init/shutdown happens at the top level.
    PurgeQueue purge_queue;

    // Background thread driving progress on the queues below (the
    // entry() body lives in MDSRank.cc); signal() wakes it.
    class ProgressThread : public Thread {
      MDSRank *mds;
      Cond cond;
      public:
      explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
      void * entry() override;
      void shutdown();
      void signal() {cond.Signal();}
    } progress_thread;

    // Messages deferred while laggy, and contexts ready to complete.
    list<Message*> waiting_for_nolaggy;
    list<MDSInternalContextBase*> finished_queue;
    // Dispatch, retry, queues
    int dispatch_depth;
    void inc_dispatch_depth() { ++dispatch_depth; }
    void dec_dispatch_depth() { --dispatch_depth; }
    void retry_dispatch(Message *m);
    bool handle_deferrable_message(Message *m);
    void _advance_queues();
    bool _dispatch(Message *m, bool new_msg);

    ceph::heartbeat_handle_d *hb;  // Heartbeat for threads using mds_lock

    bool is_stale_message(Message *m) const;

    // Per-peer MDSMap epoch bookkeeping -- presumably the latest epoch
    // known to have reached each peer rank; confirm against usage.
    map<mds_rank_t, version_t> peer_mdsmap_epoch;

    ceph_tid_t last_tid;    // for mds-initiated requests (e.g. stray rename)

    // Contexts parked until this rank reaches the corresponding state;
    // pushed via the wait_for_* helpers below.
    list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
    list<MDSInternalContextBase*> waiting_for_any_client_connection;
    list<MDSInternalContextBase*> replay_queue;
    map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
    map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;

    // See set_osd_epoch_barrier()/get_osd_epoch_barrier().
    epoch_t osd_epoch_barrier;

    // Const reference to the beacon so that we can behave differently
    // when it's laggy.
    Beacon &beacon;

    /**
     * Emit clog warnings for any ops reported as warnings by optracker
     */
    void check_ops_in_flight();

    int mds_slow_req_count;

    /**
     * Share MDSMap with clients
     */
    void bcast_mds_map();  // to mounted clients
    epoch_t last_client_mdsmap_bcast;

    map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */

    void create_logger();
  public:

    // Queue a context for completion by the progress thread.
    void queue_waiter(MDSInternalContextBase *c) {
      finished_queue.push_back(c);
      progress_thread.signal();
    }
    // Queue a whole list of contexts at once; `ls` is emptied (spliced).
    void queue_waiters(std::list<MDSInternalContextBase*>& ls) {
      finished_queue.splice( finished_queue.end(), ls );
      progress_thread.signal();
    }

    MDSRank(
        mds_rank_t whoami_,
        Mutex &mds_lock_,
        LogChannelRef &clog_,
        SafeTimer &timer_,
        Beacon &beacon_,
        MDSMap *& mdsmap_,
        Messenger *msgr,
        MonClient *monc_,
        Context *respawn_hook_,
        Context *suicide_hook_);

  protected:
    // Protected: instances are destroyed through the owning
    // MDSDaemon/MDSRankDispatcher machinery, not deleted directly.
    ~MDSRank();

  public:

    // Daemon lifetime functions: these guys break the abstraction
    // and call up into the parent MDSDaemon instance.  It's kind
    // of unavoidable: if we want any depth into our calls
    // to be able to e.g. tear down the whole process, we have to
    // have a reference going all the way down.
    // >>>
    void suicide();
    void respawn();
    // <<<

    /**
     * Call this periodically if inside a potentially long running piece
     * of code while holding the mds_lock
     */
    void heartbeat_reset();

    /**
     * Report state DAMAGED to the mon, and then pass on to respawn().  Call
     * this when an unrecoverable error is encountered while attempting
     * to load an MDS rank's data structures.  This is *not* for use with
     * errors affecting normal dirfrag/inode objects -- they should be handled
     * through cleaner scrub/repair mechanisms.
     *
     * Callers must already hold mds_lock.
     */
    void damaged();

    /**
     * Wrapper around `damaged` for users who are not
     * already holding mds_lock.
     *
     * Callers must not already hold mds_lock.
     */
    void damaged_unlocked();

    utime_t get_laggy_until() const;

    // Messaging helpers: peers, clients, raw connections.
    void send_message_mds(Message *m, mds_rank_t mds);
    void forward_message_mds(Message *req, mds_rank_t mds);

    void send_message_client_counted(Message *m, client_t client);
    void send_message_client_counted(Message *m, Session *session);
    void send_message_client_counted(Message *m, Connection *connection);
    void send_message_client_counted(Message *m, const ConnectionRef& con) {
      send_message_client_counted(m, con.get());
    }
    void send_message_client(Message *m, Session *session);
    void send_message(Message *m, Connection *c);
    void send_message(Message *m, const ConnectionRef& c) {
      send_message(m, c.get());
    }

    // Park a context until the given peer rank becomes active.
    void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
      waiting_for_active_peer[who].push_back(c);
    }
    // Park a context until the cluster is no longer degraded; uses the
    // MDS_RANK_NONE slot of waiting_for_active_peer as a wildcard.
    void wait_for_cluster_recovered(MDSInternalContextBase *c) {
      assert(cluster_degraded);
      waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
    }

    void wait_for_any_client_connection(MDSInternalContextBase *c) {
      waiting_for_any_client_connection.push_back(c);
    }
    void kick_waiters_for_any_client_connection(void) {
      finish_contexts(g_ceph_context, waiting_for_any_client_connection);
    }
    void wait_for_active(MDSInternalContextBase *c) {
      waiting_for_active.push_back(c);
    }
    void wait_for_replay(MDSInternalContextBase *c) {
      waiting_for_replay.push_back(c);
    }
    void wait_for_reconnect(MDSInternalContextBase *c) {
      waiting_for_reconnect.push_back(c);
    }
    void wait_for_resolve(MDSInternalContextBase *c) {
      waiting_for_resolve.push_back(c);
    }
    // Park a context until we have seen MDSMap epoch >= e.
    void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
      waiting_for_mdsmap[e].push_back(c);
    }
    void enqueue_replay(MDSInternalContextBase *c) {
      replay_queue.push_back(c);
    }

    bool queue_one_replay();

    void set_osd_epoch_barrier(epoch_t e);
    epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
    epoch_t get_osd_epoch() const;

    // Hand out a fresh transaction id for mds-initiated requests.
    ceph_tid_t issue_tid() { return ++last_tid; }

    Finisher *finisher;

    MDSMap *get_mds_map() { return mdsmap; }

    uint64_t get_num_requests() const { return logger->get(l_mds_request); }

    int get_mds_slow_req_count() const { return mds_slow_req_count; }

    void dump_status(Formatter *f) const;

    void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
    // True if `rank` appears in our export_targets per the current MDSMap.
    bool is_export_target(mds_rank_t rank) {
      const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
      return map_targets.count(rank);
    }

    bool evict_client(int64_t session_id, bool wait, bool blacklist,
                      std::stringstream& ss, Context *on_killed=nullptr);

  protected:
    // Admin-socket / command handlers (implementations in MDSRank.cc).
    void dump_clientreplay_status(Formatter *f) const;
    void command_scrub_path(Formatter *f, boost::string_view path, vector<string>& scrubop_vec);
    void command_tag_path(Formatter *f, boost::string_view path,
                          boost::string_view tag);
    void command_flush_path(Formatter *f, boost::string_view path);
    void command_flush_journal(Formatter *f);
    void command_get_subtrees(Formatter *f);
    void command_export_dir(Formatter *f,
        boost::string_view path, mds_rank_t dest);
    bool command_dirfrag_split(
        cmdmap_t cmdmap,
        std::ostream &ss);
    bool command_dirfrag_merge(
        cmdmap_t cmdmap,
        std::ostream &ss);
    bool command_dirfrag_ls(
        cmdmap_t cmdmap,
        std::ostream &ss,
        Formatter *f);
    int _command_export_dir(boost::string_view path, mds_rank_t dest);
    int _command_flush_journal(std::stringstream *ss);
    CDir *_command_dirfrag_get(
        const cmdmap_t &cmdmap,
        std::ostream &ss);

  protected:
    Messenger *messenger;
    MonClient *monc;

    // Callbacks into the owning daemon for respawn/suicide paths.
    Context *respawn_hook;
    Context *suicide_hook;

    // Friended to access retry_dispatch
    friend class C_MDS_RetryMessage;

    // FIXME the state machine logic should be separable from the dispatch
    // logic that calls it.
    // >>>
    void calc_recovery_set();
    void request_state(MDSMap::DaemonState s);

    bool standby_replaying;  // true if current replay pass is in standby-replay mode

    typedef enum {
      // The MDSMap is available, configure default layouts and structures
      MDS_BOOT_INITIAL = 0,
      // We are ready to open some inodes
      MDS_BOOT_OPEN_ROOT,
      // We are ready to do a replay if needed
      MDS_BOOT_PREPARE_LOG,
      // Replay is complete
      MDS_BOOT_REPLAY_DONE
    } BootStep;
    friend class C_MDS_BootStart;
    friend class C_MDS_InternalBootStart;
    void boot_create();             // i am new mds.
    void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0);    // starting|replay

    // State-transition entry/exit hooks for the daemon-state machine.
    void replay_start();
    void creating_done();
    void starting_done();
    void replay_done();
    void standby_replay_restart();
    void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
    class C_MDS_StandbyReplayRestart;
    class C_MDS_StandbyReplayRestartFinish;

    void reopen_log();

    void resolve_start();
    void resolve_done();
    void reconnect_start();
    void reconnect_done();
    void rejoin_joint_start();
    void rejoin_start();
    void rejoin_done();
    void recovery_done(int oldstate);
    void clientreplay_start();
    void clientreplay_done();
    void active_start();
    void stopping_start();
    void stopping_done();

    void validate_sessions();
    // <<<

    // >>>
    void handle_mds_recovery(mds_rank_t who);
    void handle_mds_failure(mds_rank_t who);
    // <<<

    /* Update MDSMap export_targets for this rank. Called on ::tick(). */
    void update_targets(utime_t now);

  private:
    // Monotonic timestamp of rank startup; zero until set (presumably
    // in the constructor -- confirm in MDSRank.cc).
    mono_time starttime = mono_clock::zero();
};
532
533 /* This expects to be given a reference which it is responsible for.
534 * The finish function calls functions which
535 * will put the Message exactly once.*/
536 class C_MDS_RetryMessage : public MDSInternalContext {
537 protected:
538 Message *m;
539 public:
540 C_MDS_RetryMessage(MDSRank *mds, Message *m)
541 : MDSInternalContext(mds)
542 {
543 assert(m);
544 this->m = m;
545 }
546 void finish(int r) override {
547 mds->retry_dispatch(m);
548 }
549 };
550
/**
 * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
 * the service/dispatcher stuff like init/shutdown that subsystems should
 * never touch.
 */
class MDSRankDispatcher : public MDSRank
{
public:
  // Lifecycle entry points driven by the owning MDSDaemon.
  void init();
  void tick();
  void shutdown();
  // Admin-socket command handling; writes output via `f`/`ss`.
  bool handle_asok_command(std::string command, cmdmap_t& cmdmap,
                           Formatter *f, std::ostream& ss);
  // Map update notifications from the daemon.
  void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
  void handle_osd_map();
  void update_log_config();

  // Handle a MCommand; on return *need_reply says whether the caller
  // must send a reply built from *r/*ds/*ss.
  bool handle_command(
    const cmdmap_t &cmdmap,
    MCommand *m,
    int *r,
    std::stringstream *ds,
    std::stringstream *ss,
    bool *need_reply);

  void dump_sessions(const SessionFilter &filter, Formatter *f) const;
  void evict_clients(const SessionFilter &filter, MCommand *m);

  // Call into me from MDS::ms_dispatch
  bool ms_dispatch(Message *m);

  // Arguments are forwarded to the MDSRank base constructor.
  MDSRankDispatcher(
    mds_rank_t whoami_,
    Mutex &mds_lock_,
    LogChannelRef &clog_,
    SafeTimer &timer_,
    Beacon &beacon_,
    MDSMap *& mdsmap_,
    Messenger *msgr,
    MonClient *monc_,
    Context *respawn_hook_,
    Context *suicide_hook_);
};
594
// This utility for MDS and MDSRank dispatchers.
// If the message's connection peer type is not within the `peers`
// bitmask, log at level 0, put() (drop) the message, and return true
// from the *enclosing* dispatch function (i.e. "message consumed").
// Expects a `Message *m` to be in scope at the expansion site.
#define ALLOW_MESSAGES_FROM(peers) \
do { \
  if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
    dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
           << " allowing=" << #peers << " message=" << *m << dendl; \
    m->put(); \
    return true; \
  } \
} while (0)
605
606 #endif // MDS_RANK_H_
607