// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef MDS_RANK_H_
#define MDS_RANK_H_

#include "common/DecayCounter.h"
#include "common/LogClient.h"
#include "common/Timer.h"
#include "common/TrackedOp.h"

#include "messages/MCommand.h"

#include "Beacon.h"
#include "DamageTable.h"
#include "MDSMap.h"
#include "SessionMap.h"
#include "MDCache.h"
#include "Migrator.h"
#include "MDLog.h"
#include "PurgeQueue.h"
#include "osdc/Journaler.h"

// Full .h import instead of forward declaration for PerfCounters, for the
// benefit of those including this header and using MDSRank::logger
#include "common/perf_counters.h"

enum {
  l_mds_first = 2000,
  l_mds_request,
  l_mds_reply,
  l_mds_reply_latency,
  l_mds_forward,
  l_mds_dir_fetch,
  l_mds_dir_commit,
  l_mds_dir_split,
  l_mds_dir_merge,
  l_mds_inode_max,
  l_mds_inodes,
  l_mds_inodes_top,
  l_mds_inodes_bottom,
  l_mds_inodes_pin_tail,
  l_mds_inodes_pinned,
  l_mds_inodes_expired,
  l_mds_inodes_with_caps,
  l_mds_caps,
  l_mds_subtrees,
  l_mds_traverse,
  l_mds_traverse_hit,
  l_mds_traverse_forward,
  l_mds_traverse_discover,
  l_mds_traverse_dir_fetch,
  l_mds_traverse_remote_ino,
  l_mds_traverse_lock,
  l_mds_load_cent,
  l_mds_dispatch_queue_len,
  l_mds_exported,
  l_mds_exported_inodes,
  l_mds_imported,
  l_mds_imported_inodes,
  l_mds_last,
};

// memory utilization
enum {
  l_mdm_first = 2500,
  l_mdm_ino,
  l_mdm_inoa,
  l_mdm_inos,
  l_mdm_dir,
  l_mdm_dira,
  l_mdm_dirs,
  l_mdm_dn,
  l_mdm_dna,
  l_mdm_dns,
  l_mdm_cap,
  l_mdm_capa,
  l_mdm_caps,
  l_mdm_rss,
  l_mdm_heap,
  l_mdm_buf,
  l_mdm_last,
};
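
/*
 * These register indices are consumed by MDSRank::create_logger() (declared
 * below), which builds the MDS PerfCounters via PerfCountersBuilder. A
 * minimal sketch of the registration pattern -- the counter names and
 * descriptions here are illustrative, not the exact strings used by
 * create_logger():
 *
 *   PerfCountersBuilder plb(g_ceph_context, "mds", l_mds_first, l_mds_last);
 *   plb.add_u64_counter(l_mds_request, "request", "Requests");
 *   plb.add_time_avg(l_mds_reply_latency, "reply_latency", "Reply latency");
 *   PerfCounters *logger = plb.create_perf_counters();
 *   g_ceph_context->get_perfcounters_collection()->add(logger);
 *
 * Values are then updated with e.g. logger->inc(l_mds_request).
 */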

namespace ceph {
  struct heartbeat_handle_d;
}

class Server;
class Locker;
class MDCache;
class MDLog;
class MDBalancer;
class InoTable;
class SnapServer;
class SnapClient;
class MDSTableServer;
class MDSTableClient;
class Messenger;
class Objecter;
class MonClient;
class Finisher;
class MMDSMap;
class ScrubStack;

/**
 * The public part of this class's interface is what's exposed to all
 * the various subsystems (server, mdcache, etc), such as pointers
 * to the other subsystems, and message-sending calls.
 */
class MDSRank {
 protected:
  const mds_rank_t whoami;

  // Incarnation as seen in MDSMap at the point where a rank is
  // assigned.
  int incarnation;

 public:
  mds_rank_t get_nodeid() const { return whoami; }
  int64_t get_metadata_pool();

  // Reference to global MDS::mds_lock, so that users of MDSRank don't
  // carry around references to the outer MDS, and so that we could
  // potentially substitute a separate lock here in the future.
  Mutex &mds_lock;

  bool is_daemon_stopping() const;

  // Reference to global cluster log client, just to avoid initialising
  // a separate one here.
  LogChannelRef &clog;

  // Reference to global timer utility, because MDSRank and MDSDaemon
  // currently both use the same mds_lock, so it makes sense for them
  // to share a timer.
  SafeTimer &timer;

  MDSMap *&mdsmap;

  Objecter *objecter;

  // sub systems
  Server *server;
  MDCache *mdcache;
  Locker *locker;
  MDLog *mdlog;
  MDBalancer *balancer;
  ScrubStack *scrubstack;
  DamageTable damage_table;

  InoTable *inotable;

  SnapServer *snapserver;
  SnapClient *snapclient;

  MDSTableClient *get_table_client(int t);
  MDSTableServer *get_table_server(int t);

  SessionMap sessionmap;
  Session *get_session(client_t client) {
    return sessionmap.get_session(entity_name_t::CLIENT(client.v));
  }

  PerfCounters *logger, *mlogger;
  OpTracker op_tracker;

  // The last state I held that was distinct from the current one
  MDSMap::DaemonState last_state;
  // The state assigned to me by the MDSMap
  MDSMap::DaemonState state;

  bool cluster_degraded;

  MDSMap::DaemonState get_state() const { return state; }
  MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }

  bool is_creating() const { return state == MDSMap::STATE_CREATING; }
  bool is_starting() const { return state == MDSMap::STATE_STARTING; }
  bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
  bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
  bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
  bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
  bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
  bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
  bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
  bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
  bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
  bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
  bool is_stopped() const { return mdsmap->is_stopped(whoami); }
  bool is_cluster_degraded() const { return cluster_degraded; }

  void handle_write_error(int err);

  void handle_conf_change(const struct md_config_t *conf,
                          const std::set<std::string> &changed)
  {
    purge_queue.handle_conf_change(conf, changed, *mdsmap);
  }

  void update_mlogger();
 protected:
  // Flag to indicate we entered shutdown: anyone seeing this to be true
  // after taking mds_lock must drop out.
  bool stopping;

  // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
  // because its init/shutdown happens at the top level.
  PurgeQueue purge_queue;

  class ProgressThread : public Thread {
    MDSRank *mds;
    Cond cond;
   public:
    explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
    void *entry() override;
    void shutdown();
    void signal() { cond.Signal(); }
  } progress_thread;

  list<Message*> waiting_for_nolaggy;
  list<MDSInternalContextBase*> finished_queue;
  // Dispatch, retry, queues
  int dispatch_depth;
  void inc_dispatch_depth() { ++dispatch_depth; }
  void dec_dispatch_depth() { --dispatch_depth; }
  void retry_dispatch(Message *m);
  bool handle_deferrable_message(Message *m);
  void _advance_queues();
  bool _dispatch(Message *m, bool new_msg);

  ceph::heartbeat_handle_d *hb;  // Heartbeat for threads using mds_lock

  bool is_stale_message(Message *m) const;

  map<mds_rank_t, version_t> peer_mdsmap_epoch;

  ceph_tid_t last_tid;  // for mds-initiated requests (e.g. stray rename)

  list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay,
    waiting_for_reconnect, waiting_for_resolve;
  list<MDSInternalContextBase*> replay_queue;
  map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
  map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;

  epoch_t osd_epoch_barrier;

  // Reference to the beacon so that we can behave differently
  // when it's laggy.
  Beacon &beacon;

  /**
   * Emit clog warnings for any ops reported as warnings by optracker
   */
  void check_ops_in_flight();

  int mds_slow_req_count;

  /**
   * Share MDSMap with clients
   */
  void bcast_mds_map();  // to mounted clients
  epoch_t last_client_mdsmap_bcast;

  map<mds_rank_t, DecayCounter> export_targets; /* targets this MDS is exporting to, or wants/attempts to export to */

  void create_logger();
 public:

  void queue_waiter(MDSInternalContextBase *c) {
    finished_queue.push_back(c);
    progress_thread.signal();
  }
  void queue_waiters(list<MDSInternalContextBase*>& ls) {
    finished_queue.splice(finished_queue.end(), ls);
    progress_thread.signal();
  }

  MDSRank(
      mds_rank_t whoami_,
      Mutex &mds_lock_,
      LogChannelRef &clog_,
      SafeTimer &timer_,
      Beacon &beacon_,
      MDSMap *&mdsmap_,
      Messenger *msgr,
      MonClient *monc_,
      Context *respawn_hook_,
      Context *suicide_hook_);

 protected:
  ~MDSRank();

 public:

  // Daemon lifetime functions: these break the abstraction
  // and call up into the parent MDSDaemon instance. It's kind
  // of unavoidable: if we want code at any depth in our call
  // chain to be able to e.g. tear down the whole process, we
  // have to have a reference going all the way down.
  // >>>
  void suicide();
  void respawn();
  // <<<

  /**
   * Call this periodically if inside a potentially long-running piece
   * of code while holding the mds_lock.
   */
  void heartbeat_reset();
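
  /*
   * Example (illustrative sketch; `inodes` and `process` are hypothetical):
   *
   *   Mutex::Locker l(mds_lock);
   *   for (auto *in : inodes) {
   *     process(in);
   *     heartbeat_reset();  // tell the heartbeat map this thread is alive
   *   }
   *
   * Without the periodic reset, a long loop under mds_lock could trip
   * the internal heartbeat timeout.
   */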

  /**
   * Report state DAMAGED to the mon, and then pass on to respawn(). Call
   * this when an unrecoverable error is encountered while attempting
   * to load an MDS rank's data structures. This is *not* for use with
   * errors affecting normal dirfrag/inode objects -- they should be handled
   * through cleaner scrub/repair mechanisms.
   *
   * Callers must already hold mds_lock.
   */
  void damaged();

  /**
   * Wrapper around `damaged` for users who are not
   * already holding mds_lock.
   *
   * Callers must not already hold mds_lock.
   */
  void damaged_unlocked();
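
  /*
   * Illustrative sketch of choosing between the two variants:
   *
   *   // from code already holding mds_lock (e.g. while loading this
   *   // rank's structures under the lock):
   *   damaged();
   *
   *   // from a thread that does not hold mds_lock (e.g. a callback
   *   // running outside the lock):
   *   damaged_unlocked();
   */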

  utime_t get_laggy_until() const;

  void send_message_mds(Message *m, mds_rank_t mds);
  void forward_message_mds(Message *req, mds_rank_t mds);

  void send_message_client_counted(Message *m, client_t client);
  void send_message_client_counted(Message *m, Session *session);
  void send_message_client_counted(Message *m, Connection *connection);
  void send_message_client_counted(Message *m, const ConnectionRef& con) {
    send_message_client_counted(m, con.get());
  }
  void send_message_client(Message *m, Session *session);
  void send_message(Message *m, Connection *c);
  void send_message(Message *m, const ConnectionRef& c) {
    send_message(m, c.get());
  }

  void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
    waiting_for_active_peer[who].push_back(c);
  }
  void wait_for_cluster_recovered(MDSInternalContextBase *c) {
    assert(cluster_degraded);
    waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
  }

  void wait_for_active(MDSInternalContextBase *c) {
    waiting_for_active.push_back(c);
  }
  void wait_for_replay(MDSInternalContextBase *c) {
    waiting_for_replay.push_back(c);
  }
  void wait_for_reconnect(MDSInternalContextBase *c) {
    waiting_for_reconnect.push_back(c);
  }
  void wait_for_resolve(MDSInternalContextBase *c) {
    waiting_for_resolve.push_back(c);
  }
  void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
    waiting_for_mdsmap[e].push_back(c);
  }
  void enqueue_replay(MDSInternalContextBase *c) {
    replay_queue.push_back(c);
  }

  bool queue_one_replay();
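
  /*
   * Example (illustrative sketch): code that cannot proceed until a
   * particular map epoch has been observed parks itself on the waiter
   * list and is re-run once the new map has been processed. The
   * `required_epoch` variable and message `m` are hypothetical.
   *
   *   if (mdsmap->get_epoch() < required_epoch) {
   *     wait_for_mdsmap(required_epoch, new C_MDS_RetryMessage(this, m));
   *     return;
   *   }
   */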

  void set_osd_epoch_barrier(epoch_t e);
  epoch_t get_osd_epoch_barrier() const { return osd_epoch_barrier; }
  epoch_t get_osd_epoch() const;

  ceph_tid_t issue_tid() { return ++last_tid; }

  Finisher *finisher;

  MDSMap *get_mds_map() { return mdsmap; }

  int get_req_rate() const { return logger->get(l_mds_request); }

  int get_mds_slow_req_count() const { return mds_slow_req_count; }

  void dump_status(Formatter *f) const;

  void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
  bool is_export_target(mds_rank_t rank) {
    const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
    return map_targets.count(rank);
  }

  bool evict_client(int64_t session_id, bool wait, bool blacklist,
                    std::stringstream& ss, Context *on_killed=nullptr);

 protected:
  void dump_clientreplay_status(Formatter *f) const;
  void command_scrub_path(Formatter *f, const string& path, vector<string>& scrubop_vec);
  void command_tag_path(Formatter *f, const string& path,
                        const string &tag);
  void command_flush_path(Formatter *f, const string& path);
  void command_flush_journal(Formatter *f);
  void command_get_subtrees(Formatter *f);
  void command_export_dir(Formatter *f,
                          const std::string &path, mds_rank_t dest);
  bool command_dirfrag_split(
      cmdmap_t cmdmap,
      std::ostream &ss);
  bool command_dirfrag_merge(
      cmdmap_t cmdmap,
      std::ostream &ss);
  bool command_dirfrag_ls(
      cmdmap_t cmdmap,
      std::ostream &ss,
      Formatter *f);
  int _command_export_dir(const std::string &path, mds_rank_t dest);
  int _command_flush_journal(std::stringstream *ss);
  CDir *_command_dirfrag_get(
      const cmdmap_t &cmdmap,
      std::ostream &ss);

 protected:
  Messenger *messenger;
  MonClient *monc;

  Context *respawn_hook;
  Context *suicide_hook;

  // Friended to access retry_dispatch
  friend class C_MDS_RetryMessage;

  // FIXME the state machine logic should be separable from the dispatch
  // logic that calls it.
  // >>>
  void calc_recovery_set();
  void request_state(MDSMap::DaemonState s);

  bool standby_replaying;  // true if current replay pass is in standby-replay mode

  typedef enum {
    // The MDSMap is available, configure default layouts and structures
    MDS_BOOT_INITIAL = 0,
    // We are ready to open some inodes
    MDS_BOOT_OPEN_ROOT,
    // We are ready to do a replay if needed
    MDS_BOOT_PREPARE_LOG,
    // Replay is complete
    MDS_BOOT_REPLAY_DONE
  } BootStep;
  friend class C_MDS_BootStart;
  friend class C_MDS_InternalBootStart;
  void boot_create();  // I am a new MDS.
  void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0);  // starting|replay

  void replay_start();
  void creating_done();
  void starting_done();
  void replay_done();
  void standby_replay_restart();
  void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
  class C_MDS_StandbyReplayRestart;
  class C_MDS_StandbyReplayRestartFinish;

  void reopen_log();

  void resolve_start();
  void resolve_done();
  void reconnect_start();
  void reconnect_done();
  void rejoin_joint_start();
  void rejoin_start();
  void rejoin_done();
  void recovery_done(int oldstate);
  void clientreplay_start();
  void clientreplay_done();
  void active_start();
  void stopping_start();
  void stopping_done();

  void validate_sessions();
  // <<<

  // >>>
  void handle_mds_recovery(mds_rank_t who);
  void handle_mds_failure(mds_rank_t who);
  // <<<

  /* Update MDSMap export_targets for this rank. Called on ::tick(). */
  void update_targets(utime_t now);
};

/* This expects to be given a reference to a Message, which it then owns.
 * The finish function calls code that will put() the Message exactly
 * once. */
class C_MDS_RetryMessage : public MDSInternalContext {
 protected:
  Message *m;
 public:
  C_MDS_RetryMessage(MDSRank *mds, Message *m)
    : MDSInternalContext(mds)
  {
    assert(m);
    this->m = m;
  }
  void finish(int r) override {
    mds->retry_dispatch(m);
  }
};
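
/*
 * Typical usage (illustrative sketch): a dispatch path that cannot make
 * progress yet hands its message to a C_MDS_RetryMessage, which re-enters
 * dispatch when the relevant waiter list is flushed:
 *
 *   if (!mds->is_active()) {
 *     mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
 *     return;
 *   }
 *
 * Ownership of `m` passes to the context; the caller must not put() it.
 */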

/**
 * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
 * the service/dispatcher stuff like init/shutdown that subsystems should
 * never touch.
 */
class MDSRankDispatcher : public MDSRank
{
 public:
  void init();
  void tick();
  void shutdown();
  bool handle_asok_command(std::string command, cmdmap_t& cmdmap,
                           Formatter *f, std::ostream& ss);
  void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
  void handle_osd_map();
  void update_log_config();

  bool handle_command(
      const cmdmap_t &cmdmap,
      MCommand *m,
      int *r,
      std::stringstream *ds,
      std::stringstream *ss,
      bool *need_reply);

  void dump_sessions(const SessionFilter &filter, Formatter *f) const;
  void evict_clients(const SessionFilter &filter, MCommand *m);

  // Call into me from MDS::ms_dispatch
  bool ms_dispatch(Message *m);

  MDSRankDispatcher(
      mds_rank_t whoami_,
      Mutex &mds_lock_,
      LogChannelRef &clog_,
      SafeTimer &timer_,
      Beacon &beacon_,
      MDSMap *&mdsmap_,
      Messenger *msgr,
      MonClient *monc_,
      Context *respawn_hook_,
      Context *suicide_hook_);
};

// This utility is for the MDS and MDSRank dispatchers.
#define ALLOW_MESSAGES_FROM(peers) \
  do { \
    if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
      dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" \
              << m->get_connection()->get_peer_type() \
              << " allowing=" << #peers << " message=" << *m << dendl; \
      m->put(); \
      return true; \
    } \
  } while (0)
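
/*
 * Example usage (illustrative sketch): the macro expects a `Message *m` in
 * scope and a bool-returning enclosing function, since it expands to
 * `return true` when the peer is filtered out. The handler name and
 * peer-type mask below are assumptions, not taken from this codebase.
 *
 *   bool MyDispatcher::handle_foo(Message *m)
 *   {
 *     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS);
 *     // ... handle the message ...
 *     return true;
 *   }
 */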

#endif // MDS_RANK_H_