Source: ceph.git (git.proxmox.com mirror) — blob view of ceph/src/mds/MDSRank.h
Commit: "update sources to v12.2.5"
Path: [ceph.git] / ceph / src / mds / MDSRank.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef MDS_RANK_H_
16 #define MDS_RANK_H_
17
18 #include <boost/utility/string_view.hpp>
19
20 #include "common/DecayCounter.h"
21 #include "common/LogClient.h"
22 #include "common/Timer.h"
23 #include "common/TrackedOp.h"
24
25 #include "messages/MCommand.h"
26
27 #include "Beacon.h"
28 #include "DamageTable.h"
29 #include "MDSMap.h"
30 #include "SessionMap.h"
31 #include "MDCache.h"
32 #include "Migrator.h"
33 #include "MDLog.h"
34 #include "PurgeQueue.h"
35 #include "osdc/Journaler.h"
36
37 // Full .h import instead of forward declaration for PerfCounter, for the
38 // benefit of those including this header and using MDSRank::logger
39 #include "common/perf_counters.h"
40
// Performance-counter IDs registered on MDSRank::logger (see the
// perf_counters.h note above).  The values are contiguous within the
// [l_mds_first, l_mds_last) range starting at 2000; only the first
// entry has an explicit value, the rest follow by enum auto-increment.
41 enum {
42 l_mds_first = 2000,  // lower bound of the l_mds_* counter ID range
43 l_mds_request,  // used by get_req_rate() below
44 l_mds_reply,
45 l_mds_reply_latency,
46 l_mds_forward,
47 l_mds_dir_fetch,
48 l_mds_dir_commit,
49 l_mds_dir_split,
50 l_mds_dir_merge,
51 l_mds_inode_max,
52 l_mds_inodes,
53 l_mds_inodes_top,
54 l_mds_inodes_bottom,
55 l_mds_inodes_pin_tail,
56 l_mds_inodes_pinned,
57 l_mds_inodes_expired,
58 l_mds_inodes_with_caps,
59 l_mds_caps,
60 l_mds_subtrees,
61 l_mds_traverse,
62 l_mds_traverse_hit,
63 l_mds_traverse_forward,
64 l_mds_traverse_discover,
65 l_mds_traverse_dir_fetch,
66 l_mds_traverse_remote_ino,
67 l_mds_traverse_lock,
68 l_mds_load_cent,
69 l_mds_dispatch_queue_len,
70 l_mds_exported,
71 l_mds_exported_inodes,
72 l_mds_imported,
73 l_mds_imported_inodes,
74 l_mds_last,  // one-past-the-end sentinel for the counter range
75 };
76
77 // memory utilization
// Perf-counter IDs for per-object-type memory accounting, in the range
// starting at 2500.  Registered on the second PerfCounters instance
// (MDSRank::mlogger; see update_mlogger()).
// NOTE(review): the bare/-a/-s triples (e.g. ino/inoa/inos) appear to be
// current-count / allocated / size-style stats per type -- confirm against
// the registration code in the .cc before relying on this reading.
78 enum {
79 l_mdm_first = 2500,  // lower bound of the l_mdm_* counter ID range
80 l_mdm_ino,
81 l_mdm_inoa,
82 l_mdm_inos,
83 l_mdm_dir,
84 l_mdm_dira,
85 l_mdm_dirs,
86 l_mdm_dn,
87 l_mdm_dna,
88 l_mdm_dns,
89 l_mdm_cap,
90 l_mdm_capa,
91 l_mdm_caps,
92 l_mdm_rss,
93 l_mdm_heap,
94 l_mdm_buf,
95 l_mdm_last,  // one-past-the-end sentinel for the counter range
96 };
97
98 namespace ceph {
99 struct heartbeat_handle_d;
100 }
101
102 class Server;
103 class Locker;
104 class MDCache;
105 class MDLog;
106 class MDBalancer;
107 class InoTable;
108 class SnapServer;
109 class SnapClient;
110 class MDSTableServer;
111 class MDSTableClient;
112 class Messenger;
113 class Objecter;
114 class MonClient;
115 class Finisher;
116 class MMDSMap;
117 class ScrubStack;
118
119 /**
120 * The public part of this class's interface is what's exposed to all
121 * the various subsystems (server, mdcache, etc), such as pointers
122 * to the other subsystems, and message-sending calls.
123 */
124 class MDSRank {
125 protected:
// The rank number this instance holds; immutable for the object's
// lifetime (exposed read-only via get_nodeid()).
126 const mds_rank_t whoami;
127
128 // Incarnation as seen in MDSMap at the point where a rank is
129 // assigned.
130 int incarnation;
131
132 public:
133 mds_rank_t get_nodeid() const { return whoami; }
134 int64_t get_metadata_pool();
135
136 // Reference to global MDS::mds_lock, so that users of MDSRank don't
137 // carry around references to the outer MDS, and we can substitute
138 // a separate lock here in future potentially.
139 Mutex &mds_lock;
140
// Monotonic-clock start time and derived uptime; starttime itself is a
// private member at the bottom of the class.
141 mono_time get_starttime() const {
142 return starttime;
143 }
144 chrono::duration<double> get_uptime() const {
145 mono_time now = mono_clock::now();
146 return chrono::duration<double>(now-starttime);
147 }
148
149 class CephContext *cct;
150
151 bool is_daemon_stopping() const;
152
153 // Reference to global cluster log client, just to avoid initialising
154 // a separate one here.
155 LogChannelRef &clog;
156
157 // Reference to global timer utility, because MDSRank and MDSDaemon
158 // currently both use the same mds_lock, so it makes sense for them
159 // to share a timer.
160 SafeTimer &timer;
161
// Reference to the daemon's current MDSMap pointer (note: reference to
// pointer, so map swaps by the owner are visible here).
162 MDSMap *&mdsmap;
163
// Client for RADOS (OSD) operations.  NOTE(review): initialization and
// ownership are not visible in this header -- see the .cc.
164 Objecter *objecter;
165
166 // sub systems
167 Server *server;
168 MDCache *mdcache;
169 Locker *locker;
170 MDLog *mdlog;
171 MDBalancer *balancer;
172 ScrubStack *scrubstack;
173 DamageTable damage_table;
174
175
176 InoTable *inotable;
177
178 SnapServer *snapserver;
179 SnapClient *snapclient;
180
181 MDSTableClient *get_table_client(int t);
182 MDSTableServer *get_table_server(int t);
183
// Client session table; the get_session() helpers below look sessions up
// by client entity name.
184 SessionMap sessionmap;
185 Session *get_session(client_t client) {
186 return sessionmap.get_session(entity_name_t::CLIENT(client.v));
187 }
188 Session *get_session(Message *m);
189
// logger carries the l_mds_* counters (see get_req_rate() below);
// mlogger presumably carries the l_mdm_* memory counters -- confirm in
// create_logger()/update_mlogger().
190 PerfCounters *logger, *mlogger;
191 OpTracker op_tracker;
192
193 // The last different state I held before current
194 MDSMap::DaemonState last_state;
195 // The state assigned to me by the MDSMap
196 MDSMap::DaemonState state;
197
198 bool cluster_degraded;
199
// Accessors and predicates over the rank's current (and beacon-desired)
// MDSMap state.
200 MDSMap::DaemonState get_state() const { return state; }
201 MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }
202
203 bool is_creating() const { return state == MDSMap::STATE_CREATING; }
204 bool is_starting() const { return state == MDSMap::STATE_STARTING; }
205 bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
206 bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
207 bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
208 bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
209 bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
210 bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
211 bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
212 bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
213 bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
214 bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
215 bool is_stopped() const { return mdsmap->is_stopped(whoami); }
216 bool is_cluster_degraded() const { return cluster_degraded; }
217
218 void handle_write_error(int err);
219
// Config-change hook: currently only forwards to the purge queue.
220 void handle_conf_change(const struct md_config_t *conf,
221 const std::set <std::string> &changed)
222 {
223 purge_queue.handle_conf_change(conf, changed, *mdsmap);
224 }
225
226 void update_mlogger();
227 protected:
228 // Flag to indicate we entered shutdown: anyone seeing this to be true
229 // after taking mds_lock must drop out.
230 bool stopping;
231
232 // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
233 // because its init/shutdown happens at the top level.
234 PurgeQueue purge_queue;
235
// Worker thread woken via signal() whenever work is queued (see
// queue_waiter()/queue_waiters()); entry() is defined in the .cc.
236 class ProgressThread : public Thread {
237 MDSRank *mds;
238 Cond cond;
239 public:
240 explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
241 void * entry() override;
242 void shutdown();
243 void signal() {cond.Signal();}
244 } progress_thread;
245
// Messages/contexts parked for later processing; finished_queue is fed by
// queue_waiter()/queue_waiters(), which then signal the progress thread.
246 list<Message*> waiting_for_nolaggy;
247 list<MDSInternalContextBase*> finished_queue;
248 // Dispatch, retry, queues
249 int dispatch_depth;
250 void inc_dispatch_depth() { ++dispatch_depth; }
251 void dec_dispatch_depth() { --dispatch_depth; }
252 void retry_dispatch(Message *m);
253 bool handle_deferrable_message(Message *m);
254 void _advance_queues();
255 bool _dispatch(Message *m, bool new_msg);
256
257 ceph::heartbeat_handle_d *hb; // Heartbeat for threads using mds_lock
258
259 bool is_stale_message(Message *m) const;
260
261 map<mds_rank_t, version_t> peer_mdsmap_epoch;
262
263 ceph_tid_t last_tid; // for mds-initiated requests (e.g. stray rename)
264
// Contexts queued until the named state or event is reached; populated by
// the public wait_for_*() helpers further down.
265 list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
266 list<MDSInternalContextBase*> replay_queue;
267 map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
268 map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;
269
// OSD map epoch barrier; see set_osd_epoch_barrier()/get_osd_epoch_barrier().
270 epoch_t osd_epoch_barrier;
271
272 // Const reference to the beacon so that we can behave differently
273 // when it's laggy.
274 Beacon &beacon;
275
276 /**
277 * Emit clog warnings for any ops reported as warnings by optracker
278 */
279 void check_ops_in_flight();
280
281 int mds_slow_req_count;
282
283 /**
284 * Share MDSMap with clients
285 */
286 void bcast_mds_map(); // to mounted clients
287 epoch_t last_client_mdsmap_bcast;
288
289 map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */
290
291 void create_logger();
292 public:
293
// Queue contexts onto finished_queue and wake the progress thread.
// queue_waiters() splices (empties) the caller's list.
294 void queue_waiter(MDSInternalContextBase *c) {
295 finished_queue.push_back(c);
296 progress_thread.signal();
297 }
298 void queue_waiters(std::list<MDSInternalContextBase*>& ls) {
299 finished_queue.splice( finished_queue.end(), ls );
300 progress_thread.signal();
301 }
302
// Construction wires in references owned by the enclosing daemon; the
// respawn/suicide hooks call back up into it (see the lifetime note below).
303 MDSRank(
304 mds_rank_t whoami_,
305 Mutex &mds_lock_,
306 LogChannelRef &clog_,
307 SafeTimer &timer_,
308 Beacon &beacon_,
309 MDSMap *& mdsmap_,
310 Messenger *msgr,
311 MonClient *monc_,
312 Context *respawn_hook_,
313 Context *suicide_hook_);
314
315 protected:
// Protected dtor: an MDSRank is not destroyed through arbitrary pointers,
// only by owning/derived code.
316 ~MDSRank();
317
318 public:
319
320 // Daemon lifetime functions: these guys break the abstraction
321 // and call up into the parent MDSDaemon instance. It's kind
322 // of unavoidable: if we want any depth into our calls
323 // to be able to e.g. tear down the whole process, we have to
324 // have a reference going all the way down.
325 // >>>
326 void suicide();
327 void respawn();
328 // <<<
329
330 /**
331 * Call this periodically if inside a potentially long running piece
332 * of code while holding the mds_lock
333 */
334 void heartbeat_reset();
335
336 /**
337 * Report state DAMAGED to the mon, and then pass on to respawn(). Call
338 * this when an unrecoverable error is encountered while attempting
339 * to load an MDS rank's data structures. This is *not* for use with
340 * errors affecting normal dirfrag/inode objects -- they should be handled
341 * through cleaner scrub/repair mechanisms.
342 *
343 * Callers must already hold mds_lock.
344 */
345 void damaged();
346
347 /**
348 * Wrapper around `damaged` for users who are not
349 * already holding mds_lock.
350 *
351 * Callers must not already hold mds_lock.
352 */
353 void damaged_unlocked();
354
355 utime_t get_laggy_until() const;
356
// Message-sending helpers: to other MDS ranks, to (counted) client
// sessions, or to raw connections.  The ConnectionRef overloads simply
// unwrap and forward.
357 void send_message_mds(Message *m, mds_rank_t mds);
358 void forward_message_mds(Message *req, mds_rank_t mds);
359
360 void send_message_client_counted(Message *m, client_t client);
361 void send_message_client_counted(Message *m, Session *session);
362 void send_message_client_counted(Message *m, Connection *connection);
363 void send_message_client_counted(Message *m, const ConnectionRef& con) {
364 send_message_client_counted(m, con.get());
365 }
366 void send_message_client(Message *m, Session *session);
367 void send_message(Message *m, Connection *c);
368 void send_message(Message *m, const ConnectionRef& c) {
369 send_message(m, c.get());
370 }
371
// Register contexts to fire when the corresponding state/event arrives;
// these just append to the waiting_for_* queues declared above.
372 void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
373 waiting_for_active_peer[who].push_back(c);
374 }
// MDS_RANK_NONE is used as the "whole cluster recovered" slot in
// waiting_for_active_peer; only valid while the cluster is degraded.
375 void wait_for_cluster_recovered(MDSInternalContextBase *c) {
376 assert(cluster_degraded);
377 waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
378 }
379
380 void wait_for_active(MDSInternalContextBase *c) {
381 waiting_for_active.push_back(c);
382 }
383 void wait_for_replay(MDSInternalContextBase *c) {
384 waiting_for_replay.push_back(c);
385 }
386 void wait_for_reconnect(MDSInternalContextBase *c) {
387 waiting_for_reconnect.push_back(c);
388 }
389 void wait_for_resolve(MDSInternalContextBase *c) {
390 waiting_for_resolve.push_back(c);
391 }
392 void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
393 waiting_for_mdsmap[e].push_back(c);
394 }
395 void enqueue_replay(MDSInternalContextBase *c) {
396 replay_queue.push_back(c);
397 }
398
399 bool queue_one_replay();
400
401 void set_osd_epoch_barrier(epoch_t e);
402 epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
403 epoch_t get_osd_epoch() const;
404
// Allocate the next tid for mds-initiated requests (see last_tid above).
405 ceph_tid_t issue_tid() { return ++last_tid; }
406
407 Finisher *finisher;
408
409 MDSMap *get_mds_map() { return mdsmap; }
410
// Raw l_mds_request counter value; despite the name it is a running
// count, not a rate, as far as this header shows.
411 int get_req_rate() const { return logger->get(l_mds_request); }
412
413 int get_mds_slow_req_count() const { return mds_slow_req_count; }
414
415 void dump_status(Formatter *f) const;
416
417 void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
// True if `rank` is listed as an export target for this rank in the MDSMap.
418 bool is_export_target(mds_rank_t rank) {
419 const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
420 return map_targets.count(rank);
421 }
422
423 bool evict_client(int64_t session_id, bool wait, bool blacklist,
424 std::stringstream& ss, Context *on_killed=nullptr);
425
426 protected:
// Implementations backing the rank's command handlers (scrub, tag, flush,
// export dir, dirfrag split/merge/ls); _-prefixed variants are the
// internal workhorses.
427 void dump_clientreplay_status(Formatter *f) const;
428 void command_scrub_path(Formatter *f, boost::string_view path, vector<string>& scrubop_vec);
429 void command_tag_path(Formatter *f, boost::string_view path,
430 boost::string_view tag);
431 void command_flush_path(Formatter *f, boost::string_view path);
432 void command_flush_journal(Formatter *f);
433 void command_get_subtrees(Formatter *f);
434 void command_export_dir(Formatter *f,
435 boost::string_view path, mds_rank_t dest);
436 bool command_dirfrag_split(
437 cmdmap_t cmdmap,
438 std::ostream &ss);
439 bool command_dirfrag_merge(
440 cmdmap_t cmdmap,
441 std::ostream &ss);
442 bool command_dirfrag_ls(
443 cmdmap_t cmdmap,
444 std::ostream &ss,
445 Formatter *f);
446 int _command_export_dir(boost::string_view path, mds_rank_t dest);
447 int _command_flush_journal(std::stringstream *ss);
448 CDir *_command_dirfrag_get(
449 const cmdmap_t &cmdmap,
450 std::ostream &ss);
451
452 protected:
453 Messenger *messenger;
454 MonClient *monc;
455
// Callbacks up into the owning daemon (see the lifetime note above).
456 Context *respawn_hook;
457 Context *suicide_hook;
458
459 // Friended to access retry_dispatch
460 friend class C_MDS_RetryMessage;
461
462 // FIXME the state machine logic should be separable from the dispatch
463 // logic that calls it.
464 // >>>
465 void calc_recovery_set();
466 void request_state(MDSMap::DaemonState s);
467
468 bool standby_replaying; // true if current replay pass is in standby-replay mode
469
470 typedef enum {
471 // The MDSMap is available, configure default layouts and structures
472 MDS_BOOT_INITIAL = 0,
473 // We are ready to open some inodes
474 MDS_BOOT_OPEN_ROOT,
475 // We are ready to do a replay if needed
476 MDS_BOOT_PREPARE_LOG,
477 // Replay is complete
478 MDS_BOOT_REPLAY_DONE
479 } BootStep;
480 friend class C_MDS_BootStart;
481 friend class C_MDS_InternalBootStart;
482 void boot_create(); // i am new mds.
483 void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0); // starting|replay
484
// Entry/exit hooks for the rank state machine transitions (see the FIXME
// above about separating this from dispatch).
485 void replay_start();
486 void creating_done();
487 void starting_done();
488 void replay_done();
489 void standby_replay_restart();
490 void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
491 class C_MDS_StandbyReplayRestart;
492 class C_MDS_StandbyReplayRestartFinish;
493
494 void reopen_log();
495
496 void resolve_start();
497 void resolve_done();
498 void reconnect_start();
499 void reconnect_done();
500 void rejoin_joint_start();
501 void rejoin_start();
502 void rejoin_done();
503 void recovery_done(int oldstate);
504 void clientreplay_start();
505 void clientreplay_done();
506 void active_start();
507 void stopping_start();
508 void stopping_done();
509
510 void validate_sessions();
511 // <<<
512
513 // >>>
514 void handle_mds_recovery(mds_rank_t who);
515 void handle_mds_failure(mds_rank_t who);
516 // <<<
517
518 /* Update MDSMap export_targets for this rank. Called on ::tick(). */
519 void update_targets(utime_t now);
520
521 private:
// Monotonic start time backing get_starttime()/get_uptime(); initialized
// to zero here, presumably set when the rank starts -- confirm in the .cc.
522 mono_time starttime = mono_clock::zero();
523 };
524
525 /* This expects to be given a reference which it is responsible for.
526 * The finish function calls functions which
527 * will put the Message exactly once.*/
528 class C_MDS_RetryMessage : public MDSInternalContext {
529 protected:
530 Message *m;
531 public:
532 C_MDS_RetryMessage(MDSRank *mds, Message *m)
533 : MDSInternalContext(mds)
534 {
535 assert(m);
536 this->m = m;
537 }
538 void finish(int r) override {
539 mds->retry_dispatch(m);
540 }
541 };
542
543 /**
544 * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
545 * the service/dispatcher stuff like init/shutdown that subsystems should
546 * never touch.
547 */
548 class MDSRankDispatcher : public MDSRank
549 {
550 public:
// Lifecycle hooks driven by the owning MDSDaemon.
551 void init();
552 void tick();
553 void shutdown();
// Admin-socket ("asok") command entry point.  NOTE(review): the bool
// return presumably indicates whether the command was handled -- confirm
// against the .cc before relying on it.
554 bool handle_asok_command(std::string command, cmdmap_t& cmdmap,
555 Formatter *f, std::ostream& ss);
// React to a new MDSMap (oldmap is the previously-held map).
556 void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
557 void handle_osd_map();
558 void update_log_config();
559
// Mon/tell command handler; outputs go to *ds/*ss and *need_reply tells
// the caller whether to send a reply for MCommand m.
560 bool handle_command(
561 const cmdmap_t &cmdmap,
562 MCommand *m,
563 int *r,
564 std::stringstream *ds,
565 std::stringstream *ss,
566 bool *need_reply);
567
568 void dump_sessions(const SessionFilter &filter, Formatter *f) const;
569 void evict_clients(const SessionFilter &filter, MCommand *m);
570
571 // Call into me from MDS::ms_dispatch
572 bool ms_dispatch(Message *m);
573
// Forwards all arguments to the protected MDSRank constructor.
574 MDSRankDispatcher(
575 mds_rank_t whoami_,
576 Mutex &mds_lock_,
577 LogChannelRef &clog_,
578 SafeTimer &timer_,
579 Beacon &beacon_,
580 MDSMap *& mdsmap_,
581 Messenger *msgr,
582 MonClient *monc_,
583 Context *respawn_hook_,
584 Context *suicide_hook_);
585 };
586
587 // This utility for MDS and MDSRank dispatchers.
// Peer-type filter: if the message's connection exists and its peer type is
// not in the `peers` bitmask, log at level 0, drop the message's reference
// (m->put()) and `return true` from the enclosing dispatch function.
// Messages with no connection, or from an allowed peer, fall through.
// Requires a `Message *m` in scope and a bool-returning enclosing function.
588 #define ALLOW_MESSAGES_FROM(peers) \
589 do { \
590 if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
591 dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
592 << " allowing=" << #peers << " message=" << *m << dendl; \
593 m->put(); \
594 return true; \
595 } \
596 } while (0)
597
598 #endif // MDS_RANK_H_
599