ceph/src/mds/MDSRank.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2015 Red Hat
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #include <string_view>
  16 #include <typeinfo>
  17 #include "common/debug.h"
  18 #include "common/errno.h"
  19 #include "common/likely.h"
  20 #include "common/async/blocked_completion.h"
  21
  22 #include "messages/MClientRequestForward.h"
  23 #include "messages/MMDSLoadTargets.h"
  24 #include "messages/MMDSTableRequest.h"
  25 #include "messages/MMDSMetrics.h"
  26
  27 #include "mgr/MgrClient.h"
  28
  29 #include "MDSDaemon.h"
  30 #include "MDSMap.h"
  31 #include "MetricAggregator.h"
  32 #include "SnapClient.h"
  33 #include "SnapServer.h"
  34 #include "MDBalancer.h"
  35 #include "Migrator.h"
  36 #include "Locker.h"
  37 #include "InoTable.h"
  38 #include "mon/MonClient.h"
  39 #include "common/HeartbeatMap.h"
  40 #include "ScrubStack.h"
  41 #include "Mutation.h"
  42
  43
  44 #include "MDSRank.h"
  45
  46 #define dout_context g_ceph_context
  47 #define dout_subsys ceph_subsys_mds
  48 #undef dout_prefix
  49 #define dout_prefix *_dout << "mds." << whoami << '.' << incarnation << ' '
  50
  51 using std::ostream;
  52 using std::set;
  53 using std::string;
  54 using std::vector;
  55 using TOPNSPC::common::cmd_getval;
  56
/**
 * Multi-step, callback-driven flush of the MDS journal.
 *
 * Starting from send(), each step hands over to the next one, either
 * directly or through a completion callback:
 *
 *   flush_mdlog -> clear_mdlog -> trim_mdlog -> expire_segments
 *     -> trim_segments -> trim_expired_segments -> write_journal_head
 *
 * A failing step writes a human readable message to *ss and completes
 * this context with the error code. finish() forwards the final result
 * to on_finish.
 */
class C_Flush_Journal : public MDSInternalContext {
public:
  // ss receives error text; on_finish is completed with the final result.
  // whoami and incarnation are copied from the rank so that dout_prefix
  // can be expanded inside this class.
  C_Flush_Journal(MDCache *mdcache, MDLog *mdlog, MDSRank *mds,
                  std::ostream *ss, Context *on_finish)
    : MDSInternalContext(mds),
      mdcache(mdcache), mdlog(mdlog), ss(ss), on_finish(on_finish),
      whoami(mds->whoami), incarnation(mds->incarnation) {
  }

  // Entry point. mds_lock must be held (asserted). Completes right away
  // with -CEPHFS_EROFS on a read-only cache, or with 0 (no-op) when the
  // MDS is not active; otherwise starts the flush sequence.
  void send() {
    ceph_assert(ceph_mutex_is_locked(mds->mds_lock));

    dout(20) << __func__ << dendl;

    if (mdcache->is_readonly()) {
      dout(5) << __func__ << ": read-only FS" << dendl;
      complete(-CEPHFS_EROFS);
      return;
    }

    if (!mds->is_active()) {
      dout(5) << __func__ << ": MDS not active, no-op" << dendl;
      complete(0);
      return;
    }

    flush_mdlog();
  }

private:

  // Step 1: start a new log segment, flush, and wait for the log to be
  // safe; continues in handle_flush_mdlog().
  void flush_mdlog() {
    dout(20) << __func__ << dendl;

    // I need to seal off the current segment, and then mark all
    // previous segments for expiry
    mdlog->start_new_segment();

    Context *ctx = new LambdaContext([this](int r) {
        handle_flush_mdlog(r);
      });

    // Flush initially so that all the segments older than our new one
    // will be eligible for expiry
    mdlog->flush();
    mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
  }

  // Completion of step 1: abort with an error message on failure,
  // otherwise go on to the second wait_for_safe.
  void handle_flush_mdlog(int r) {
    dout(20) << __func__ << ": r=" << r << dendl;

    if (r != 0) {
      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
      complete(r);
      return;
    }

    clear_mdlog();
  }

  // Step 2: a second wait_for_safe (see the comment in the body for the
  // reason); continues in handle_clear_mdlog().
  void clear_mdlog() {
    dout(20) << __func__ << dendl;

    Context *ctx = new LambdaContext([this](int r) {
        handle_clear_mdlog(r);
      });

    // Because we may not be the last wait_for_safe context on MDLog,
    // and subsequent contexts might wake up in the middle of our
    // later trim_all and interfere with expiry (by e.g. marking
    // dirs/dentries dirty on previous log segments), we run a second
    // wait_for_safe here. See #10368
    mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
  }

  // Completion of step 2: abort with an error message on failure,
  // otherwise start trimming.
  void handle_clear_mdlog(int r) {
    dout(20) << __func__ << ": r=" << r << dendl;

    if (r != 0) {
      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
      complete(r);
      return;
    }

    trim_mdlog();
  }

  // Step 3: ask MDLog to trim everything; a non-zero return aborts the
  // sequence, otherwise wait for the expiring segments.
  void trim_mdlog() {
    // Put all the old log segments into expiring or expired state
    dout(5) << __func__ << ": beginning segment expiry" << dendl;

    int ret = mdlog->trim_all();
    if (ret != 0) {
      *ss << "Error " << ret << " (" << cpp_strerror(ret) << ") while trimming log";
      complete(ret);
      return;
    }

    expire_segments();
  }

  // Step 4: gather one sub-context per expiring segment. If there is
  // nothing to wait for, go straight to trim_segments(); otherwise
  // handle_expire_segments() runs once the gather completes.
  void expire_segments() {
    dout(20) << __func__ << dendl;

    // Attach contexts to wait for all expiring segments to expire
    MDSGatherBuilder expiry_gather(g_ceph_context);

    const auto &expiring_segments = mdlog->get_expiring_segments();
    for (auto p : expiring_segments) {
      p->wait_for_expiry(expiry_gather.new_sub());
    }
    dout(5) << __func__ << ": waiting for " << expiry_gather.num_subs_created()
            << " segments to expire" << dendl;

    if (!expiry_gather.has_subs()) {
      trim_segments();
      return;
    }

    Context *ctx = new LambdaContext([this](int r) {
        handle_expire_segments(r);
      });
    expiry_gather.set_finisher(new MDSInternalContextWrapper(mds, ctx));
    expiry_gather.activate();
  }

  // Completion of step 4; a non-zero result is treated as a bug.
  void handle_expire_segments(int r) {
    dout(20) << __func__ << ": r=" << r << dendl;

    ceph_assert(r == 0); // MDLog is not allowed to raise errors via
                         // wait_for_expiry
    trim_segments();
  }

  // Step 5: hand trim_expired_segments() to mds->finisher through a
  // C_OnFinisher. The callback takes mds_lock itself.
  // NOTE(review): presumably this defers the work to the finisher thread
  // because mds_lock is held by the caller here -- confirm.
  void trim_segments() {
    dout(20) << __func__ << dendl;

    Context *ctx = new C_OnFinisher(new LambdaContext([this](int) {
          std::lock_guard locker(mds->mds_lock);
          trim_expired_segments();
        }), mds->finisher);
    ctx->complete(0);
  }

  // Step 6: trim the expired segments, logging the journaler's
  // expire/trimmed positions before and after, then write the head.
  void trim_expired_segments() {
    dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now "
            << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
            << mdlog->get_journaler()->get_trimmed_pos() << dendl;

    // Now everyone I'm interested in is expired
    mdlog->trim_expired_segments();

    dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now "
            << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
            << mdlog->get_journaler()->get_trimmed_pos() << dendl;

    write_journal_head();
  }

  // Step 7: write the journal header. The completion callback takes
  // mds_lock before calling handle_write_head().
  void write_journal_head() {
    dout(20) << __func__ << dendl;

    Context *ctx = new LambdaContext([this](int r) {
        std::lock_guard locker(mds->mds_lock);
        handle_write_head(r);
      });
    // Flush the journal header so that readers will start from after
    // the flushed region
    mdlog->get_journaler()->write_head(ctx);
  }

  // Final step: record an error message on failure and complete this
  // context with the write_head result either way.
  void handle_write_head(int r) {
    if (r != 0) {
      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
    } else {
      dout(5) << __func__ << ": write_head complete, all done!" << dendl;
    }

    complete(r);
  }

  // Forwards the overall result to the caller-supplied context.
  void finish(int r) override {
    dout(20) << __func__ << ": r=" << r << dendl;
    on_finish->complete(r);
  }

  MDCache *mdcache;
  MDLog *mdlog;
  std::ostream *ss;     // receives error text
  Context *on_finish;   // completed from finish()

  // so as to use dout
  mds_rank_t whoami;
  int incarnation;
};
 252
/**
 * Multi-step, callback-driven "drop cache" operation.
 *
 * Starting from send(), the steps are:
 *
 *   recall_client_state (repeated while it makes progress)
 *     -> handle_recall_client_state -> flush_journal (C_Flush_Journal)
 *     -> handle_flush_journal -> trim_cache (repeated while throttled)
 *     -> cache_status -> finish
 *
 * Results are written into the Formatter f inside a "result" section,
 * and on_finish is completed with the final return code.
 */
class C_Drop_Cache : public MDSInternalContext {
public:
  // recall_timeout bounds the client recall phase (0 means no limit, see
  // recall_client_state()). recall_start is taken at construction time.
  C_Drop_Cache(Server *server, MDCache *mdcache, MDLog *mdlog,
               MDSRank *mds, uint64_t recall_timeout,
               Formatter *f, Context *on_finish)
    : MDSInternalContext(mds),
      server(server), mdcache(mdcache), mdlog(mdlog),
      recall_timeout(recall_timeout), recall_start(mono_clock::now()),
      f(f), on_finish(on_finish),
      whoami(mds->whoami), incarnation(mds->incarnation) {
  }

  // Entry point: opens the "result" section and starts recalling client
  // state. mds_lock must be held (asserted).
  void send() {
    // not really a hard requirement here, but lets ensure this in
    // case we change the logic here.
    ceph_assert(ceph_mutex_is_locked(mds->mds_lock));

    dout(20) << __func__ << dendl;
    f->open_object_section("result");
    recall_client_state();
  }

private:
  // context which completes itself (with -CEPHFS_ETIMEDOUT) after a specified
  // timeout or when explicitly completed, whichever comes first. Note
  // that the context does not destroy itself after completion -- it
  // needs to be explicitly freed.
  // NOTE(review): no explicit delete of the instances created in
  // recall_client_state() is visible in this class -- confirm ownership.
  class C_ContextTimeout : public MDSInternalContext {
  public:
    C_ContextTimeout(MDSRank *mds, uint64_t timeout, Context *on_finish)
      : MDSInternalContext(mds),
        timeout(timeout),
        on_finish(on_finish) {
    }
    // The timer event must have fired or been cancelled-and-cleared
    // before destruction.
    ~C_ContextTimeout() {
      ceph_assert(timer_task == nullptr);
    }

    // Arms the timeout on mds->timer; a timeout of 0 arms nothing.
    void start_timer() {
      if (!timeout) {
        return;
      }

      timer_task = new LambdaContext([this](int) {
          timer_task = nullptr;
          complete(-CEPHFS_ETIMEDOUT);
        });
      mds->timer.add_event_after(timeout, timer_task);
    }

    // Takes on_finish out under the lock so that it is completed at most
    // once, no matter how many times this context is completed.
    void finish(int r) override {
      Context *ctx = nullptr;
      {
        std::lock_guard locker(lock);
        std::swap(on_finish, ctx);
      }
      if (ctx != nullptr) {
        ctx->complete(r);
      }
    }
    // Cancels a still-pending timer event and runs finish(); unlike the
    // usual complete() contract documented above, it does not free this
    // object.
    void complete(int r) override {
      if (timer_task != nullptr) {
        mds->timer.cancel_event(timer_task);
      }

      finish(r);
    }

    uint64_t timeout;
    ceph::mutex lock = ceph::make_mutex("mds::context::timeout");
    Context *on_finish = nullptr;
    Context *timer_task = nullptr;
  };

  // Runs one cache trim pass and adds the count to dentries_trimmed.
  // Returns the (throttled, count) pair reported by MDCache::trim().
  // NOTE(review): the log line says "caps" while the count is accumulated
  // as dentries -- confirm which wording is intended.
  auto do_trim() {
    auto [throttled, count] = mdcache->trim(UINT64_MAX);
    dout(10) << __func__
             << (throttled ? " (throttled)" : "")
             << " trimmed " << count << " caps" << dendl;
    dentries_trimmed += count;
    return std::make_pair(throttled, count);
  }

  // One round of client cap recall.
  //
  // While the recall is throttled or still recalling caps, and the
  // timeout (if any) has not elapsed, another round is scheduled for when
  // the gather completes or a C_ContextTimeout of 1 fires, whichever comes
  // first. Otherwise the phase ends: immediately if nothing is pending,
  // with -CEPHFS_ETIMEDOUT if the timeout has passed, or once the pending
  // gather completes / the remaining time runs out.
  void recall_client_state() {
    dout(20) << __func__ << dendl;
    auto now = mono_clock::now();
    // time elapsed since construction, as a double count of seconds
    auto duration = std::chrono::duration<double>(now-recall_start).count();

    MDSGatherBuilder gather(g_ceph_context);
    auto flags = Server::RecallFlags::STEADY|Server::RecallFlags::TRIM;
    auto [throttled, count] = server->recall_client_state(&gather, flags);
    dout(10) << __func__
             << (throttled ? " (throttled)" : "")
             << " recalled " << count << " caps" << dendl;

    caps_recalled += count;
    if ((throttled || count > 0) && (recall_timeout == 0 || duration < recall_timeout)) {
      C_ContextTimeout *ctx = new C_ContextTimeout(
        mds, 1, new LambdaContext([this](int r) {
          recall_client_state();
      }));
      ctx->start_timer();
      gather.set_finisher(new MDSInternalContextWrapper(mds, ctx));
      gather.activate();
      mdlog->flush(); /* use down-time to incrementally flush log */
      do_trim(); /* use down-time to incrementally trim cache */
    } else {
      if (!gather.has_subs()) {
        return handle_recall_client_state(0);
      } else if (recall_timeout > 0 && duration > recall_timeout) {
        // out of time: let the outstanding subs finish into a no-op
        gather.set_finisher(new C_MDSInternalNoop);
        gather.activate();
        return handle_recall_client_state(-CEPHFS_ETIMEDOUT);
      } else {
        // remaining == 0 means start_timer() arms no timer, so only the
        // gather can complete the context
        uint64_t remaining = (recall_timeout == 0 ? 0 : recall_timeout-duration);
        C_ContextTimeout *ctx = new C_ContextTimeout(
          mds, remaining, new LambdaContext([this](int r) {
              handle_recall_client_state(r);
            }));

        ctx->start_timer();
        gather.set_finisher(new MDSInternalContextWrapper(mds, ctx));
        gather.activate();
      }
    }
  }

  // Reports the recall outcome in a "client_recall" section and moves on
  // to the journal flush regardless of r.
  void handle_recall_client_state(int r) {
    dout(20) << __func__ << ": r=" << r << dendl;

    // client recall section
    f->open_object_section("client_recall");
    f->dump_int("return_code", r);
    f->dump_string("message", cpp_strerror(r));
    f->dump_int("recalled", caps_recalled);
    f->close_section();

    // we can still continue after recall timeout
    flush_journal();
  }

  // Runs a C_Flush_Journal whose error text goes to the member ss and
  // whose result is delivered to handle_flush_journal().
  void flush_journal() {
    dout(20) << __func__ << dendl;

    Context *ctx = new LambdaContext([this](int r) {
        handle_flush_journal(r);
      });

    C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, mds, &ss, ctx);
    flush_journal->send();
  }

  // On failure, replaces the formatter content with an error object and
  // completes with r; on success, reports a "flush_journal" section and
  // continues with the cache trim.
  // NOTE(review): on the error path cmd_err() already closes its "result"
  // section and finish() then dumps "duration" and closes a section
  // again -- confirm the formatter output is as intended.
  void handle_flush_journal(int r) {
    dout(20) << __func__ << ": r=" << r << dendl;

    if (r != 0) {
      cmd_err(f, ss.str());
      complete(r);
      return;
    }

    // journal flush section
    f->open_object_section("flush_journal");
    f->dump_int("return_code", r);
    f->dump_string("message", ss.str());
    f->close_section();

    trim_cache();
  }

  // Trims the cache; while the trim is throttled and still making
  // progress, retries from a timer event scheduled 1.0 later, otherwise
  // reports the cache status.
  void trim_cache() {
    dout(20) << __func__ << dendl;

    auto [throttled, count] = do_trim();
    if (throttled && count > 0) {
      auto timer = new LambdaContext([this](int) {
        trim_cache();
      });
      mds->timer.add_event_after(1.0, timer);
    } else {
      cache_status();
    }
  }

  // Reports the trim total and the cache status, then completes with 0.
  void cache_status() {
    dout(20) << __func__ << dendl;

    f->open_object_section("trim_cache");
    f->dump_int("trimmed", dentries_trimmed);
    f->close_section();

    // cache status section
    mdcache->cache_status(f);

    complete(0);
  }

  // Dumps the total duration since construction, closes the section and
  // forwards the result to on_finish.
  void finish(int r) override {
    dout(20) << __func__ << ": r=" << r << dendl;

    auto d = std::chrono::duration<double>(mono_clock::now()-recall_start);
    f->dump_float("duration", d.count());

    f->close_section();
    on_finish->complete(r);
  }

  Server *server;
  MDCache *mdcache;
  MDLog *mdlog;
  uint64_t recall_timeout;  // 0 disables the recall time limit
  mono_time recall_start;   // set in the constructor
  Formatter *f;
  Context *on_finish;

  // NOTE(review): retval is not read or written anywhere in this class.
  int retval = 0;
  std::stringstream ss;           // error/message text of the journal flush
  uint64_t caps_recalled = 0;     // sum of counts from recall_client_state()
  uint64_t dentries_trimmed = 0;  // sum of counts from do_trim()

  // so as to use dout
  mds_rank_t whoami;
  int incarnation;

  // Resets the formatter and emits a single "result" object holding the
  // error string.
  void cmd_err(Formatter *f, std::string_view err) {
    f->reset();
    f->open_object_section("result");
    f->dump_string("error", err);
    f->close_section();
  }
};
 484
/**
 * Construct the rank: bind the references and pointers handed in by the
 * caller, allocate the Objecter, and create the per-rank subsystems
 * (finisher, cache, log, balancer, scrub stack, tables, server, locker).
 *
 * Nothing is started here; see MDSRankDispatcher::init().
 */
MDSRank::MDSRank(
    mds_rank_t whoami_,
    ceph::fair_mutex &mds_lock_,
    LogChannelRef &clog_,
    CommonSafeTimer<ceph::fair_mutex> &timer_,
    Beacon &beacon_,
    std::unique_ptr<MDSMap>& mdsmap_,
    Messenger *msgr,
    MonClient *monc_,
    MgrClient *mgrc,
    Context *respawn_hook_,
    Context *suicide_hook_,
    boost::asio::io_context& ioc) :
    cct(msgr->cct), mds_lock(mds_lock_), clog(clog_),
    timer(timer_), mdsmap(mdsmap_),
    objecter(new Objecter(g_ceph_context, msgr, monc_, ioc)),
    damage_table(whoami_), sessionmap(this),
    op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker,
               g_conf()->osd_num_op_tracker_shard),
    progress_thread(this), whoami(whoami_),
    // the purge queue reports write errors through this callback, which
    // takes mds_lock before calling handle_write_error()
    purge_queue(g_ceph_context, whoami_,
      mdsmap_->get_metadata_pool(), objecter,
      new LambdaContext([this](int r) {
          std::lock_guard l(mds_lock);
          handle_write_error(r);
        }
      )
    ),
    metrics_handler(cct, this),
    beacon(beacon_),
    messenger(msgr), monc(monc_), mgrc(mgrc),
    respawn_hook(respawn_hook_),
    suicide_hook(suicide_hook_),
    inject_journal_corrupt_dentry_first(g_conf().get_val<double>("mds_inject_journal_corrupt_dentry_first")),
    starttime(mono_clock::now()),
    ioc(ioc)
{
  // register the constructing thread as the "MDSRank" heartbeat worker
  hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self());

  // The metadata pool won't change in the whole life time
  // of the fs, with this we can get rid of the mds_lock
  // in many places too.
  metadata_pool = mdsmap->get_metadata_pool();

  purge_queue.update_op_limit(*mdsmap);

  objecter->unset_honor_pool_full();

  finisher = new Finisher(cct, "MDSRank", "MR_Finisher");

  mdcache = new MDCache(this, purge_queue);
  mdlog = new MDLog(this);
  balancer = new MDBalancer(this, messenger, monc);

  // needs mdcache and finisher, both created above
  scrubstack = new ScrubStack(mdcache, clog, finisher);

  inotable = new InoTable(this);
  snapserver = new SnapServer(this, monc);
  snapclient = new SnapClient(this);

  server = new Server(this, &metrics_handler);
  locker = new Locker(this, mdcache);

  // cache configuration values and configure the op tracker
  _heartbeat_reset_grace = g_conf().get_val<uint64_t>("mds_heartbeat_reset_grace");
  heartbeat_grace = g_conf().get_val<double>("mds_heartbeat_grace");
  op_tracker.set_complaint_and_threshold(cct->_conf->mds_op_complaint_time,
                                         cct->_conf->mds_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->mds_op_history_size,
                                           cct->_conf->mds_op_history_duration);

  schedule_update_timer_task();
}
 557
 558 MDSRank::~MDSRank()
 559 {
 560   if (hb) {
 561     g_ceph_context->get_heartbeat_map()->remove_worker(hb);
 562     hb = nullptr;
 563   }
 564
 565   if (scrubstack) { delete scrubstack; scrubstack = NULL; }
 566   if (mdcache) { delete mdcache; mdcache = NULL; }
 567   if (mdlog) { delete mdlog; mdlog = NULL; }
 568   if (balancer) { delete balancer; balancer = NULL; }
 569   if (inotable) { delete inotable; inotable = NULL; }
 570   if (snapserver) { delete snapserver; snapserver = NULL; }
 571   if (snapclient) { delete snapclient; snapclient = NULL; }
 572
 573   if (server) { delete server; server = 0; }
 574   if (locker) { delete locker; locker = 0; }
 575
 576   if (logger) {
 577     g_ceph_context->get_perfcounters_collection()->remove(logger);
 578     delete logger;
 579     logger = 0;
 580   }
 581   if (mlogger) {
 582     g_ceph_context->get_perfcounters_collection()->remove(mlogger);
 583     delete mlogger;
 584     mlogger = 0;
 585   }
 586
 587   delete finisher;
 588   finisher = NULL;
 589
 590   delete suicide_hook;
 591   suicide_hook = NULL;
 592
 593   delete respawn_hook;
 594   respawn_hook = NULL;
 595
 596   delete objecter;
 597   objecter = nullptr;
 598 }
 599
/**
 * Bring up the rank's runtime pieces: initialise and start the objecter
 * (registering it at the head of the messenger's dispatchers), apply the
 * log configuration, create the perf counters, publish the current
 * OSDMap, then start the progress thread, the purge queue and the
 * finisher.
 */
void MDSRankDispatcher::init()
{
  objecter->init();
  messenger->add_dispatcher_head(objecter);

  objecter->start();

  update_log_config();
  create_logger();

  // Expose the OSDMap (already populated during MDS::init) to anyone
  // who is interested in it.
  handle_osd_map();

  progress_thread.create("mds_rank_progr");

  purge_queue.init();

  finisher->start();
}
 620
 621 void MDSRank::update_targets()
 622 {
 623   // get MonMap's idea of my export_targets
 624   const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
 625
 626   dout(20) << "updating export targets, currently " << map_targets.size() << " ranks are targets" << dendl;
 627
 628   bool send = false;
 629   set<mds_rank_t> new_map_targets;
 630
 631   auto it = export_targets.begin();
 632   while (it != export_targets.end()) {
 633     mds_rank_t rank = it->first;
 634     auto &counter = it->second;
 635     dout(20) << "export target mds." << rank << " is " << counter << dendl;
 636
 637     double val = counter.get();
 638     if (val <= 0.01) {
 639       dout(15) << "export target mds." << rank << " is no longer an export target" << dendl;
 640       export_targets.erase(it++);
 641       send = true;
 642       continue;
 643     }
 644     if (!map_targets.count(rank)) {
 645       dout(15) << "export target mds." << rank << " not in map's export_targets" << dendl;
 646       send = true;
 647     }
 648     new_map_targets.insert(rank);
 649     it++;
 650   }
 651   if (new_map_targets.size() < map_targets.size()) {
 652     dout(15) << "export target map holds stale targets, sending update" << dendl;
 653     send = true;
 654   }
 655
 656   if (send) {
 657     dout(15) << "updating export_targets, now " << new_map_targets.size() << " ranks are targets" << dendl;
 658     auto m = make_message<MMDSLoadTargets>(mds_gid_t(monc->get_global_id()), new_map_targets);
 659     monc->send_mon_message(m.detach());
 660   }
 661 }
 662
 663 void MDSRank::hit_export_target(mds_rank_t rank, double amount)
 664 {
 665   double rate = g_conf()->mds_bal_target_decay;
 666   if (amount < 0.0) {
 667     amount = 100.0/g_conf()->mds_bal_target_decay; /* a good default for "i am trying to keep this export_target active" */
 668   }
 669   auto em = export_targets.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple(DecayRate(rate)));
 670   auto &counter = em.first->second;
 671   counter.hit(amount);
 672   if (em.second) {
 673     dout(15) << "hit export target (new) is " << counter << dendl;
 674   } else {
 675     dout(15) << "hit export target is " << counter << dendl;
 676   }
 677 }
 678
 679 class C_MDS_MonCommand : public MDSInternalContext {
 680   std::string cmd;
 681 public:
 682   std::string outs;
 683   C_MDS_MonCommand(MDSRank *m, std::string_view c)
 684     : MDSInternalContext(m), cmd(c) {}
 685   void finish(int r) override {
 686     mds->_mon_command_finish(r, cmd, outs);
 687   }
 688 };
 689
 690 void MDSRank::_mon_command_finish(int r, std::string_view cmd, std::string_view outs)
 691 {
 692   if (r < 0) {
 693     dout(0) << __func__ << ": mon command " << cmd << " failed with errno " << r
 694             << " (" << outs << ")" << dendl;
 695   } else {
 696     dout(1) << __func__ << ": mon command " << cmd << " succeed" << dendl;
 697   }
 698 }
 699
 700 void MDSRank::set_mdsmap_multimds_snaps_allowed()
 701 {
 702   static bool already_sent = false;
 703   if (already_sent)
 704     return;
 705
 706   CachedStackStringStream css;
 707   *css << "{\"prefix\":\"fs set\", \"fs_name\":\"" <<  mdsmap->get_fs_name() << "\", ";
 708   *css << "\"var\":\"allow_multimds_snaps\", \"val\":\"true\", ";
 709   *css << "\"confirm\":\"--yes-i-am-really-a-mds\"}";
 710   std::vector<std::string> cmd = {css->str()};
 711
 712   dout(0) << __func__ << ": sending mon command: " << cmd[0] << dendl;
 713
 714   C_MDS_MonCommand *fin = new C_MDS_MonCommand(this, cmd[0]);
 715   monc->start_mon_command(cmd, {}, nullptr, &fin->outs, new C_IO_Wrapper(this, fin));
 716
 717   already_sent = true;
 718 }
 719
/**
 * Periodic upkeep for the rank.
 *
 * Resets the heartbeat, then (unless the beacon reports laggy) checks
 * in-flight ops, wakes the progress thread, flushes the log and runs the
 * state-dependent work: log trimming, session and lock upkeep, balancer
 * and table server checks when active, export target updates, and the
 * shutdown sequence while stopping. Finally the beacon is handed the
 * latest health information.
 */
void MDSRankDispatcher::tick()
{
  heartbeat_reset();

  if (beacon.is_laggy()) {
    dout(1) << "skipping upkeep work because connection to Monitors appears laggy" << dendl;
    return;
  }

  check_ops_in_flight();

  // Wake up thread in case we used to be laggy and have waiting_for_nolaggy
  // messages to progress.
  progress_thread.signal();

  // make sure mds log flushes, trims periodically
  mdlog->flush();

  // update average session uptime
  sessionmap.update_average_session_age();

  if (is_active() || is_stopping()) {
    mdlog->trim();  // NOT during recovery!
  }

  // session and lock upkeep
  if (is_clientreplay() || is_active() || is_stopping()) {
    server->find_idle_sessions();
    server->evict_cap_revoke_non_responders();
    locker->tick();
  }

  // log
  if (logger) {
    logger->set(l_mds_subtrees, mdcache->num_subtrees());
    mdcache->log_stat();
  }

  if (is_reconnect())
    server->reconnect_tick();

  if (is_active()) {
    balancer->tick();
    mdcache->find_stale_fragment_freeze();
    mdcache->migrator->find_stale_export_freeze();

    // only the rank acting as table server does the snap server checks
    if (mdsmap->get_tableserver() == whoami) {
      snapserver->check_osd_map(false);
      // Filesystem was created by pre-mimic mds. Allow multi-active mds after
      // all old snapshots are deleted.
      if (!mdsmap->allows_multimds_snaps() &&
          snapserver->can_allow_multimds_snaps()) {
        set_mdsmap_multimds_snaps_allowed();
      }
    }

    if (whoami == 0)
      scrubstack->advance_scrub_status();
  }

  if (is_active() || is_stopping()) {
    update_targets();
  }

  // shut down?
  if (is_stopping()) {
    // NOTE(review): mdlog->trim() was already called above for the
    // stopping state in this same tick -- confirm the second call is
    // intentional.
    mdlog->trim();
    if (mdcache->shutdown_pass()) {
      uint64_t pq_progress = 0 ;
      uint64_t pq_total = 0;
      size_t pq_in_flight = 0;
      if (!purge_queue.drain(&pq_progress, &pq_total, &pq_in_flight)) {
        dout(7) << "shutdown_pass=true, but still waiting for purge queue"
                << dendl;
        // This takes unbounded time, so we must indicate progress
        // to the administrator: we do it in a slightly imperfect way
        // by sending periodic (tick frequency) clog messages while
        // in this state.
        clog->info() << "MDS rank " << whoami << " waiting for purge queue ("
          << std::dec << pq_progress << "/" << pq_total << " " << pq_in_flight
          << " files purging" << ")";
      } else {
        dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to "
                   "down:stopped" << dendl;
        stopping_done();
      }
    }
    else {
      dout(7) << "shutdown_pass=false" << dendl;
    }
  }

  // Expose ourselves to Beacon to update health indicators
  beacon.notify_health(this);
}
 815
/**
 * Shut the rank down. May only be called once (asserted).
 *
 * Marks the rank as stopping, then stops the subsystems in order: config
 * observer, timer, log, cache, purge queue, metrics, finisher, objecter,
 * monitor client, op tracker, progress thread and messenger. mds_lock is
 * released around the finisher stop and the messenger shutdown and is
 * held again when the function returns.
 */
void MDSRankDispatcher::shutdown()
{
  // It should never be possible for shutdown to get called twice, because
  // anyone picking up mds_lock checks if stopping is true and drops
  // out if it is.
  ceph_assert(stopping == false);
  stopping = true;

  dout(1) << __func__ << ": shutting down rank " << whoami << dendl;

  g_conf().remove_observer(this);

  timer.shutdown();

  // MDLog has to shut down before the finisher, because some of its
  // threads block on IOs that require finisher to complete.
  mdlog->shutdown();

  // shut down cache
  mdcache->shutdown();

  purge_queue.shutdown();

  // shutdown metrics handler/updater -- this is ok even if it was not
  // inited.
  metrics_handler.shutdown();

  // shutdown metric aggregator
  if (metric_aggregator != nullptr) {
    metric_aggregator->shutdown();
  }

  // NOTE(review): the lock is presumably dropped here because queued
  // finisher callbacks take mds_lock themselves -- confirm.
  mds_lock.unlock();
  finisher->stop(); // no flushing
  mds_lock.lock();

  if (objecter->initialized)
    objecter->shutdown();

  monc->shutdown();

  op_tracker.on_shutdown();

  progress_thread.shutdown();

  // release mds_lock for finisher/messenger threads (e.g.
  // MDSDaemon::ms_handle_reset called from Messenger).
  mds_lock.unlock();

  // shut down messenger
  messenger->shutdown();

  mds_lock.lock();

  // Workaround unclean shutdown: HeartbeatMap will assert if
  // worker is not removed (as we do in ~MDS), but ~MDS is not
  // always called after suicide.
  if (hb) {
    g_ceph_context->get_heartbeat_map()->remove_worker(hb);
    hb = NULL;
  }
}
 878
 879 /**
 880  * Helper for simple callbacks that call a void fn with no args.
 881  */
 882 class C_MDS_VoidFn : public MDSInternalContext
 883 {
 884   typedef void (MDSRank::*fn_ptr)();
 885   protected:
 886    fn_ptr fn;
 887   public:
 888   C_MDS_VoidFn(MDSRank *mds_, fn_ptr fn_)
 889     : MDSInternalContext(mds_), fn(fn_)
 890   {
 891     ceph_assert(mds_);
 892     ceph_assert(fn_);
 893   }
 894
 895   void finish(int r) override
 896   {
 897     (mds->*fn)();
 898   }
 899 };
 900
 901 MDSTableClient *MDSRank::get_table_client(int t)
 902 {
 903   switch (t) {
 904   case TABLE_ANCHOR: return NULL;
 905   case TABLE_SNAP: return snapclient;
 906   default: ceph_abort();
 907   }
 908 }
 909
 910 MDSTableServer *MDSRank::get_table_server(int t)
 911 {
 912   switch (t) {
 913   case TABLE_ANCHOR: return NULL;
 914   case TABLE_SNAP: return snapserver;
 915   default: ceph_abort();
 916   }
 917 }
 918
 919 void MDSRank::suicide()
 920 {
 921   if (suicide_hook) {
 922     suicide_hook->complete(0);
 923     suicide_hook = NULL;
 924   }
 925 }
 926
 927 void MDSRank::respawn()
 928 {
 929   if (respawn_hook) {
 930     respawn_hook->complete(0);
 931     respawn_hook = NULL;
 932   }
 933 }
 934
/**
 * Abort the daemon with the given message, flushing the monitor client's
 * log first.
 */
void MDSRank::abort(std::string_view msg)
{
  // flush before aborting; ceph_abort() ends the process
  monc->flush_log();
  ceph_abort(msg);
}
 940
/**
 * Report this rank as damaged to the monitors and respawn.
 *
 * Requires a valid rank and mds_lock held by the calling thread (both
 * asserted). The beacon is sent and waited on for at most
 * mds_mon_shutdown_timeout before respawning.
 */
void MDSRank::damaged()
{
  ceph_assert(whoami != MDS_RANK_NONE);
  ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));

  beacon.set_want_state(*mdsmap, MDSMap::STATE_DAMAGED);
  monc->flush_log();  // Flush any clog error from before we were called
  beacon.notify_health(this);  // Include latest status in our swan song
  beacon.send_and_wait(g_conf()->mds_mon_shutdown_timeout);

  // It's okay if we timed out and the mon didn't get our beacon, because
  // another daemon (or ourselves after respawn) will eventually take the
  // rank and report DAMAGED again when it hits same problem we did.

  respawn();  // Respawn into standby in case mon has other work for us
}
 957
 958 void MDSRank::damaged_unlocked()
 959 {
 960   std::lock_guard l(mds_lock);
 961   damaged();
 962 }
 963
 964 void MDSRank::handle_write_error(int err)
 965 {
 966   if (err == -CEPHFS_EBLOCKLISTED) {
 967     derr << "we have been blocklisted (fenced), respawning..." << dendl;
 968     respawn();
 969     return;
 970   }
 971
 972   if (g_conf()->mds_action_on_write_error >= 2) {
 973     derr << "unhandled write error " << cpp_strerror(err) << ", suicide..." << dendl;
 974     respawn();
 975   } else if (g_conf()->mds_action_on_write_error == 1) {
 976     derr << "unhandled write error " << cpp_strerror(err) << ", force readonly..." << dendl;
 977     mdcache->force_readonly();
 978   } else {
 979     // ignore;
 980     derr << "unhandled write error " << cpp_strerror(err) << ", ignore..." << dendl;
 981   }
 982 }
 983
 984 void MDSRank::handle_write_error_with_lock(int err)
 985 {
 986   std::scoped_lock l(mds_lock);
 987   handle_write_error(err);
 988 }
 989
 990 void *MDSRank::ProgressThread::entry()
 991 {
 992   std::unique_lock l(mds->mds_lock);
 993   while (true) {
 994     cond.wait(l, [this] {
 995       return (mds->stopping ||
 996               !mds->finished_queue.empty() ||
 997               (!mds->waiting_for_nolaggy.empty() && !mds->beacon.is_laggy()));
 998     });
 999
1000     if (mds->stopping) {
1001       break;
1002     }
1003
1004     mds->_advance_queues();
1005   }
1006
1007   return NULL;
1008 }
1009
1010
1011 void MDSRank::ProgressThread::shutdown()
1012 {
1013   ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
1014   ceph_assert(mds->stopping);
1015
1016   if (am_self()) {
1017     // Stopping is set, we will fall out of our main loop naturally
1018   } else {
1019     // Kick the thread to notice mds->stopping, and join it
1020     cond.notify_all();
1021     mds->mds_lock.unlock();
1022     if (is_started())
1023       join();
1024     mds->mds_lock.lock();
1025   }
1026 }
1027
1028 bool MDSRankDispatcher::ms_dispatch(const cref_t<Message> &m)
1029 {
1030   if (m->get_source().is_mds()) {
1031     const Message *msg = m.get();
1032     const MMDSOp *op = dynamic_cast<const MMDSOp*>(msg);
1033     if (!op)
1034       dout(0) << typeid(*msg).name() << " is not an MMDSOp type" << dendl;
1035     ceph_assert(op);
1036   }
1037   else if (m->get_source().is_client()) {
1038     Session *session = static_cast<Session*>(m->get_connection()->get_priv().get());
1039     if (session)
1040       session->last_seen = Session::clock::now();
1041   }
1042
1043   inc_dispatch_depth();
1044   bool ret = _dispatch(m, true);
1045   dec_dispatch_depth();
1046   return ret;
1047 }
1048
1049 bool MDSRank::_dispatch(const cref_t<Message> &m, bool new_msg)
1050 {
1051   if (is_stale_message(m)) {
1052     return true;
1053   }
1054   // do not proceed if this message cannot be handled
1055   if (!is_valid_message(m)) {
1056     return false;
1057   }
1058
1059   if (beacon.is_laggy()) {
1060     dout(5) << " laggy, deferring " << *m << dendl;
1061     waiting_for_nolaggy.push_back(m);
1062   } else if (new_msg && !waiting_for_nolaggy.empty()) {
1063     dout(5) << " there are deferred messages, deferring " << *m << dendl;
1064     waiting_for_nolaggy.push_back(m);
1065   } else {
1066     handle_message(m);
1067     heartbeat_reset();
1068   }
1069
1070   if (dispatch_depth > 1)
1071     return true;
1072
1073   // finish any triggered contexts
1074   _advance_queues();
1075
1076   if (beacon.is_laggy()) {
1077     // We've gone laggy during dispatch, don't do any
1078     // more housekeeping
1079     return true;
1080   }
1081
1082   // hack: thrash exports
1083   static utime_t start;
1084   utime_t now = ceph_clock_now();
1085   if (start == utime_t())
1086     start = now;
1087   /*double el = now - start;
1088   if (el > 30.0 &&
1089     el < 60.0)*/
1090   for (int i=0; i<g_conf()->mds_thrash_exports; i++) {
1091     set<mds_rank_t> s;
1092     if (!is_active()) break;
1093     mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE);
1094     if (s.size() < 2 || CInode::count() < 10)
1095       break;  // need peers for this to work.
1096     if (mdcache->migrator->get_num_exporting() > g_conf()->mds_thrash_exports * 5 ||
1097         mdcache->migrator->get_export_queue_size() > g_conf()->mds_thrash_exports * 10)
1098       break;
1099
1100     dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf()->mds_thrash_exports << dendl;
1101
1102     // pick a random dir inode
1103     CInode *in = mdcache->hack_pick_random_inode();
1104
1105     auto&& ls = in->get_dirfrags();
1106     if (!ls.empty()) {  // must be an open dir.
1107       const auto& dir = ls[rand() % ls.size()];
1108       if (!dir->get_parent_dir()) continue;    // must be linked.
1109       if (!dir->is_auth()) continue;           // must be auth.
1110
1111       mds_rank_t dest;
1112       do {
1113         int k = rand() % s.size();
1114         set<mds_rank_t>::iterator p = s.begin();
1115         while (k--) ++p;
1116         dest = *p;
1117       } while (dest == whoami);
1118       mdcache->migrator->export_dir_nicely(dir,dest);
1119     }
1120   }
1121   // hack: thrash fragments
1122   for (int i=0; i<g_conf()->mds_thrash_fragments; i++) {
1123     if (!is_active()) break;
1124     if (mdcache->get_num_fragmenting_dirs() > 5 * g_conf()->mds_thrash_fragments) break;
1125     dout(7) << "mds thrashing fragments pass " << (i+1) << "/" << g_conf()->mds_thrash_fragments << dendl;
1126
1127     // pick a random dir inode
1128     CInode *in = mdcache->hack_pick_random_inode();
1129
1130     auto&& ls = in->get_dirfrags();
1131     if (ls.empty()) continue;                // must be an open dir.
1132     CDir *dir = ls.front();
1133     if (!dir->get_parent_dir()) continue;    // must be linked.
1134     if (!dir->is_auth()) continue;           // must be auth.
1135     frag_t fg = dir->get_frag();
1136     if ((fg == frag_t() || (rand() % (1 << fg.bits()) == 0))) {
1137       mdcache->split_dir(dir, 1);
1138     } else {
1139       balancer->queue_merge(dir);
1140     }
1141   }
1142
1143   // hack: force hash root?
1144   /*
1145   if (false &&
1146       mdcache->get_root() &&
1147       mdcache->get_root()->dir &&
1148       !(mdcache->get_root()->dir->is_hashed() ||
1149         mdcache->get_root()->dir->is_hashing())) {
1150     dout(0) << "hashing root" << dendl;
1151     mdcache->migrator->hash_dir(mdcache->get_root()->dir);
1152   }
1153   */
1154
1155   update_mlogger();
1156   return true;
1157 }
1158
1159 void MDSRank::update_mlogger()
1160 {
1161   if (mlogger) {
1162     mlogger->set(l_mdm_ino, CInode::count());
1163     mlogger->set(l_mdm_dir, CDir::count());
1164     mlogger->set(l_mdm_dn, CDentry::count());
1165     mlogger->set(l_mdm_cap, Capability::count());
1166     mlogger->set(l_mdm_inoa, CInode::increments());
1167     mlogger->set(l_mdm_inos, CInode::decrements());
1168     mlogger->set(l_mdm_dira, CDir::increments());
1169     mlogger->set(l_mdm_dirs, CDir::decrements());
1170     mlogger->set(l_mdm_dna, CDentry::increments());
1171     mlogger->set(l_mdm_dns, CDentry::decrements());
1172     mlogger->set(l_mdm_capa, Capability::increments());
1173     mlogger->set(l_mdm_caps, Capability::decrements());
1174   }
1175 }
1176
1177 // message types that the mds can handle
1178 bool MDSRank::is_valid_message(const cref_t<Message> &m) {
1179   int port = m->get_type() & 0xff00;
1180   int type = m->get_type();
1181
1182   if (port == MDS_PORT_CACHE ||
1183       port == MDS_PORT_MIGRATOR ||
1184       type == CEPH_MSG_CLIENT_SESSION ||
1185       type == CEPH_MSG_CLIENT_RECONNECT ||
1186       type == CEPH_MSG_CLIENT_RECLAIM ||
1187       type == CEPH_MSG_CLIENT_REQUEST ||
1188       type == MSG_MDS_PEER_REQUEST ||
1189       type == MSG_MDS_HEARTBEAT ||
1190       type == MSG_MDS_TABLE_REQUEST ||
1191       type == MSG_MDS_LOCK ||
1192       type == MSG_MDS_INODEFILECAPS ||
1193       type == MSG_MDS_SCRUB ||
1194       type == MSG_MDS_SCRUB_STATS ||
1195       type == CEPH_MSG_CLIENT_CAPS ||
1196       type == CEPH_MSG_CLIENT_CAPRELEASE ||
1197       type == CEPH_MSG_CLIENT_LEASE) {
1198     return true;
1199   }
1200
1201   return false;
1202 }
1203
1204 /*
1205  * lower priority messages we defer if we seem laggy
1206  */
1207
/*
 * Filter a message by the peer type of its connection.
 *
 * If the message `m` has a connection and that connection's peer type has
 * no bit in common with `peers`, log at level 0 and return from the
 * enclosing function.  A message without a connection is let through.
 *
 * The macro expands in place: it relies on a message reference named `m`
 * being in scope and issues a bare `return`, so it can only be used inside
 * functions returning void.
 */
#define ALLOW_MESSAGES_FROM(peers)                                      \
  do {                                                                  \
    if (const auto& con_ = m->get_connection();                         \
        con_ && (con_->get_peer_type() & (peers)) == 0) {               \
      dout(0) << __FILE__ << "." << __LINE__                            \
              << ": filtered out request, peer=" << con_->get_peer_type() \
              << " allowing=" << #peers << " message=" << *m << dendl;  \
      return;                                                           \
    }                                                                   \
  } while (0)
1216
1217 void MDSRank::handle_message(const cref_t<Message> &m)
1218 {
1219   int port = m->get_type() & 0xff00;
1220
1221   switch (port) {
1222   case MDS_PORT_CACHE:
1223     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1224     mdcache->dispatch(m);
1225     break;
1226
1227   case MDS_PORT_MIGRATOR:
1228     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1229     mdcache->migrator->dispatch(m);
1230     break;
1231
1232   default:
1233     switch (m->get_type()) {
1234       // SERVER
1235     case CEPH_MSG_CLIENT_SESSION:
1236     case CEPH_MSG_CLIENT_RECONNECT:
1237     case CEPH_MSG_CLIENT_RECLAIM:
1238       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
1239       // fall-thru
1240     case CEPH_MSG_CLIENT_REQUEST:
1241       server->dispatch(m);
1242       break;
1243     case MSG_MDS_PEER_REQUEST:
1244       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1245       server->dispatch(m);
1246       break;
1247
1248     case MSG_MDS_HEARTBEAT:
1249       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1250       balancer->proc_message(m);
1251       break;
1252
1253     case MSG_MDS_TABLE_REQUEST:
1254       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1255       {
1256         const cref_t<MMDSTableRequest> &req = ref_cast<MMDSTableRequest>(m);
1257         if (req->op < 0) {
1258           MDSTableClient *client = get_table_client(req->table);
1259           client->handle_request(req);
1260         } else {
1261            MDSTableServer *server = get_table_server(req->table);
1262            server->handle_request(req);
1263         }
1264       }
1265       break;
1266
1267     case MSG_MDS_LOCK:
1268     case MSG_MDS_INODEFILECAPS:
1269       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1270       locker->dispatch(m);
1271       break;
1272
1273     case CEPH_MSG_CLIENT_CAPS:
1274     case CEPH_MSG_CLIENT_CAPRELEASE:
1275     case CEPH_MSG_CLIENT_LEASE:
1276       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
1277       locker->dispatch(m);
1278       break;
1279
1280     case MSG_MDS_SCRUB:
1281     case MSG_MDS_SCRUB_STATS:
1282       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1283       scrubstack->dispatch(m);
1284       break;
1285
1286     default:
1287       derr << "unrecognized message " << *m << dendl;
1288     }
1289   }
1290 }
1291
1292 /**
1293  * Advance finished_queue and waiting_for_nolaggy.
1294  *
1295  * Usually drain both queues, but may not drain waiting_for_nolaggy
1296  * if beacon is currently laggy.
1297  */
1298 void MDSRank::_advance_queues()
1299 {
1300   ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
1301
1302   if (!finished_queue.empty()) {
1303     dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl;
1304     while (!finished_queue.empty()) {
1305       auto fin = finished_queue.front();
1306       finished_queue.pop_front();
1307
1308       dout(10) << " finish " << fin << dendl;
1309       fin->complete(0);
1310
1311       heartbeat_reset();
1312     }
1313   }
1314
1315   while (!waiting_for_nolaggy.empty()) {
1316     // stop if we're laggy now!
1317     if (beacon.is_laggy())
1318       break;
1319
1320     cref_t<Message> old = waiting_for_nolaggy.front();
1321     waiting_for_nolaggy.pop_front();
1322
1323     if (!is_stale_message(old)) {
1324       dout(7) << " processing laggy deferred " << *old << dendl;
1325       ceph_assert(is_valid_message(old));
1326       handle_message(old);
1327     }
1328
1329     heartbeat_reset();
1330   }
1331 }
1332
1333 /**
1334  * Call this when you take mds_lock, or periodically if you're going to
1335  * hold the lock for a long time (e.g. iterating over clients/inodes)
1336  */
1337 void MDSRank::heartbeat_reset()
1338 {
1339   // Any thread might jump into mds_lock and call us immediately
1340   // after a call to suicide() completes, in which case MDSRank::hb
1341   // has been freed and we are a no-op.
1342   if (!hb) {
1343       ceph_assert(stopping);
1344       return;
1345   }
1346
1347   // NB not enabling suicide grace, because the mon takes care of killing us
1348   // (by blocklisting us) when we fail to send beacons, and it's simpler to
1349   // only have one way of dying.
1350   g_ceph_context->get_heartbeat_map()->reset_timeout(hb,
1351     ceph::make_timespan(heartbeat_grace),
1352     ceph::timespan::zero());
1353 }
1354
1355 bool MDSRank::is_stale_message(const cref_t<Message> &m) const
1356 {
1357   // from bad mds?
1358   if (m->get_source().is_mds()) {
1359     mds_rank_t from = mds_rank_t(m->get_source().num());
1360     bool bad = false;
1361     if (mdsmap->is_down(from)) {
1362       bad = true;
1363     } else {
1364       // FIXME: this is a convoluted check.  we should be maintaining a nice
1365       // clean map of current ConnectionRefs for current mdses!!!
1366       auto c = messenger->connect_to(CEPH_ENTITY_TYPE_MDS,
1367                                      mdsmap->get_addrs(from));
1368       if (c != m->get_connection()) {
1369         bad = true;
1370         dout(5) << " mds." << from << " should be " << c << " "
1371                 << c->get_peer_addrs() << " but this message is "
1372                 << m->get_connection() << " " << m->get_source_addrs()
1373                 << dendl;
1374       }
1375     }
1376     if (bad) {
1377       // bogus mds?
1378       if (m->get_type() == CEPH_MSG_MDS_MAP) {
1379         dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
1380                 << ", but it's an mdsmap, looking at it" << dendl;
1381       } else if (m->get_type() == MSG_MDS_CACHEEXPIRE &&
1382                  mdsmap->get_addrs(from) == m->get_source_addrs()) {
1383         dout(5) << "got " << *m << " from down mds " << m->get_source()
1384                 << ", but it's a cache_expire, looking at it" << dendl;
1385       } else {
1386         dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source()
1387                 << ", dropping" << dendl;
1388         return true;
1389       }
1390     }
1391   }
1392   return false;
1393 }
1394
1395 Session *MDSRank::get_session(const cref_t<Message> &m)
1396 {
1397   // do not carry ref
1398   auto session = static_cast<Session *>(m->get_connection()->get_priv().get());
1399   if (session) {
1400     dout(20) << "get_session have " << session << " " << session->info.inst
1401              << " state " << session->get_state_name() << dendl;
1402     // Check if we've imported an open session since (new sessions start closed)
1403     if (session->is_closed() && m->get_type() == CEPH_MSG_CLIENT_SESSION) {
1404       Session *imported_session = sessionmap.get_session(session->info.inst.name);
1405       if (imported_session && imported_session != session) {
1406         dout(10) << __func__ << " replacing connection bootstrap session "
1407                  << session << " with imported session " << imported_session
1408                  << dendl;
1409         imported_session->info.auth_name = session->info.auth_name;
1410         //assert(session->info.auth_name == imported_session->info.auth_name);
1411         ceph_assert(session->info.inst == imported_session->info.inst);
1412         imported_session->set_connection(session->get_connection().get());
1413         // send out any queued messages
1414         while (!session->preopen_out_queue.empty()) {
1415           imported_session->get_connection()->send_message2(std::move(session->preopen_out_queue.front()));
1416           session->preopen_out_queue.pop_front();
1417         }
1418         imported_session->auth_caps = session->auth_caps;
1419         imported_session->last_seen = session->last_seen;
1420         ceph_assert(session->get_nref() == 1);
1421         imported_session->get_connection()->set_priv(imported_session->get());
1422         session = imported_session;
1423       }
1424     }
1425   } else {
1426     dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
1427   }
1428   return session;
1429 }
1430
1431 void MDSRank::send_message(const ref_t<Message>& m, const ConnectionRef& c)
1432 {
1433   ceph_assert(c);
1434   c->send_message2(m);
1435 }
1436
1437 class C_MDS_RetrySendMessageMDS : public MDSInternalContext {
1438 public:
1439   C_MDS_RetrySendMessageMDS(MDSRank* mds, mds_rank_t who, ref_t<Message> m)
1440     : MDSInternalContext(mds), who(who), m(std::move(m)) {}
1441   void finish(int r) override {
1442     mds->send_message_mds(m, who);
1443   }
1444 private:
1445   mds_rank_t who;
1446   ref_t<Message> m;
1447 };
1448
1449
1450 void MDSRank::send_message_mds(const ref_t<Message>& m, mds_rank_t mds)
1451 {
1452   if (!mdsmap->is_up(mds)) {
1453     dout(10) << "send_message_mds mds." << mds << " not up, dropping " << *m << dendl;
1454     return;
1455   } else if (mdsmap->is_bootstrapping(mds)) {
1456     dout(5) << __func__ << "mds." << mds << " is bootstrapping, deferring " << *m << dendl;
1457     wait_for_bootstrapped_peer(mds, new C_MDS_RetrySendMessageMDS(this, mds, m));
1458     return;
1459   }
1460
1461   // send mdsmap first?
1462   auto addrs = mdsmap->get_addrs(mds);
1463   if (mds != whoami && peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
1464     auto _m = make_message<MMDSMap>(monc->get_fsid(), *mdsmap);
1465     send_message_mds(_m, addrs);
1466     peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
1467   }
1468
1469   // send message
1470   send_message_mds(m, addrs);
1471 }
1472
1473 void MDSRank::send_message_mds(const ref_t<Message>& m, const entity_addrvec_t &addr)
1474 {
1475   messenger->send_to_mds(ref_t<Message>(m).detach(), addr);
1476 }
1477
1478 void MDSRank::forward_message_mds(MDRequestRef& mdr, mds_rank_t mds)
1479 {
1480   ceph_assert(mds != whoami);
1481
1482   auto m = mdr->release_client_request();
1483
1484   /*
1485    * don't actually forward if non-idempotent!
1486    * client has to do it.  although the MDS will ignore duplicate requests,
1487    * the affected metadata may migrate, in which case the new authority
1488    * won't have the metareq_id in the completed request map.
1489    */
1490   // NEW: always make the client resend!
1491   bool client_must_resend = true;  //!creq->can_forward();
1492
1493   // tell the client where it should go
1494   auto session = get_session(m);
1495   if (!session) {
1496     dout(1) << "no session found, failed to forward client request " << mdr << dendl;
1497     return;
1498   }
1499   auto f = make_message<MClientRequestForward>(m->get_tid(), mds, m->get_num_fwd()+1, client_must_resend);
1500   send_message_client(f, session);
1501 }
1502
1503 void MDSRank::send_message_client_counted(const ref_t<Message>& m, client_t client)
1504 {
1505   Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v));
1506   if (session) {
1507     send_message_client_counted(m, session);
1508   } else {
1509     dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl;
1510   }
1511 }
1512
1513 void MDSRank::send_message_client_counted(const ref_t<Message>& m, const ConnectionRef& connection)
1514 {
1515   // do not carry ref
1516   auto session = static_cast<Session *>(connection->get_priv().get());
1517   if (session) {
1518     send_message_client_counted(m, session);
1519   } else {
1520     dout(10) << "send_message_client_counted has no session for " << m->get_source_inst() << dendl;
1521     // another Connection took over the Session
1522   }
1523 }
1524
1525 void MDSRank::send_message_client_counted(const ref_t<Message>& m, Session* session)
1526 {
1527   version_t seq = session->inc_push_seq();
1528   dout(10) << "send_message_client_counted " << session->info.inst.name << " seq "
1529            << seq << " " << *m << dendl;
1530   if (session->get_connection()) {
1531     session->get_connection()->send_message2(m);
1532   } else {
1533     session->preopen_out_queue.push_back(m);
1534   }
1535 }
1536
1537 void MDSRank::send_message_client(const ref_t<Message>& m, Session* session)
1538 {
1539   dout(10) << "send_message_client " << session->info.inst << " " << *m << dendl;
1540   if (session->get_connection()) {
1541     session->get_connection()->send_message2(m);
1542   } else {
1543     session->preopen_out_queue.push_back(m);
1544   }
1545 }
1546
1547 /**
1548  * This is used whenever a RADOS operation has been cancelled
1549  * or a RADOS client has been blocklisted, to cause the MDS and
1550  * any clients to wait for this OSD epoch before using any new caps.
1551  *
1552  * See doc/cephfs/eviction
1553  */
1554 void MDSRank::set_osd_epoch_barrier(epoch_t e)
1555 {
1556   dout(4) << __func__ << ": epoch=" << e << dendl;
1557   osd_epoch_barrier = e;
1558 }
1559
1560 void MDSRank::retry_dispatch(const cref_t<Message> &m)
1561 {
1562   inc_dispatch_depth();
1563   _dispatch(m, false);
1564   dec_dispatch_depth();
1565 }
1566
1567 double MDSRank::get_dispatch_queue_max_age(utime_t now) const
1568 {
1569   return messenger->get_dispatch_queue_max_age(now);
1570 }
1571
1572 bool MDSRank::is_daemon_stopping() const
1573 {
1574   return stopping;
1575 }
1576
1577 void MDSRank::request_state(MDSMap::DaemonState s)
1578 {
1579   dout(3) << "request_state " << ceph_mds_state_name(s) << dendl;
1580   beacon.set_want_state(*mdsmap, s);
1581   beacon.send();
1582 }
1583
1584
1585 class C_MDS_BootStart : public MDSInternalContext {
1586   MDSRank::BootStep nextstep;
1587 public:
1588   C_MDS_BootStart(MDSRank *m, MDSRank::BootStep n)
1589     : MDSInternalContext(m), nextstep(n) {}
1590   void finish(int r) override {
1591     mds->boot_start(nextstep, r);
1592   }
1593 };
1594
1595
1596 void MDSRank::boot_start(BootStep step, int r)
1597 {
1598   // Handle errors from previous step
1599   if (r < 0) {
1600     if (is_standby_replay() && (r == -CEPHFS_EAGAIN)) {
1601       dout(0) << "boot_start encountered an error CEPHFS_EAGAIN"
1602               << ", respawning since we fell behind journal" << dendl;
1603       respawn();
1604     } else if (r == -CEPHFS_EINVAL || r == -CEPHFS_ENOENT) {
1605       // Invalid or absent data, indicates damaged on-disk structures
1606       clog->error() << "Error loading MDS rank " << whoami << ": "
1607         << cpp_strerror(r);
1608       damaged();
1609       ceph_assert(r == 0);  // Unreachable, damaged() calls respawn()
1610     } else if (r == -CEPHFS_EROFS) {
1611       dout(0) << "boot error forcing transition to read-only; MDS will try to continue" << dendl;
1612     } else {
1613       // Completely unexpected error, give up and die
1614       dout(0) << "boot_start encountered an error, failing" << dendl;
1615       suicide();
1616       return;
1617     }
1618   }
1619
1620   ceph_assert(is_starting() || is_any_replay());
1621
1622   switch(step) {
1623     case MDS_BOOT_INITIAL:
1624       {
1625         mdcache->init_layouts();
1626
1627         MDSGatherBuilder gather(g_ceph_context,
1628             new C_MDS_BootStart(this, MDS_BOOT_OPEN_ROOT));
1629         dout(2) << "Booting: " << step << ": opening inotable" << dendl;
1630         inotable->set_rank(whoami);
1631         inotable->load(gather.new_sub());
1632
1633         dout(2) << "Booting: " << step << ": opening sessionmap" << dendl;
1634         sessionmap.set_rank(whoami);
1635         sessionmap.load(gather.new_sub());
1636
1637         dout(2) << "Booting: " << step << ": opening mds log" << dendl;
1638         mdlog->open(gather.new_sub());
1639
1640         if (is_starting()) {
1641           dout(2) << "Booting: " << step << ": opening purge queue" << dendl;
1642           purge_queue.open(new C_IO_Wrapper(this, gather.new_sub()));
1643         } else if (!standby_replaying) {
1644           dout(2) << "Booting: " << step << ": opening purge queue (async)" << dendl;
1645           purge_queue.open(NULL);
1646           dout(2) << "Booting: " << step << ": loading open file table (async)" << dendl;
1647           mdcache->open_file_table.load(nullptr);
1648         }
1649
1650         if (mdsmap->get_tableserver() == whoami) {
1651           dout(2) << "Booting: " << step << ": opening snap table" << dendl;
1652           snapserver->set_rank(whoami);
1653           snapserver->load(gather.new_sub());
1654         }
1655
1656         gather.activate();
1657       }
1658       break;
1659     case MDS_BOOT_OPEN_ROOT:
1660       {
1661         dout(2) << "Booting: " << step << ": loading/discovering base inodes" << dendl;
1662
1663         MDSGatherBuilder gather(g_ceph_context,
1664             new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));
1665
1666         if (is_starting()) {
1667           // load mydir frag for the first log segment (creating subtree map)
1668           mdcache->open_mydir_frag(gather.new_sub());
1669         } else {
1670           mdcache->open_mydir_inode(gather.new_sub());
1671         }
1672
1673         mdcache->create_global_snaprealm();
1674
1675         if (whoami == mdsmap->get_root()) {  // load root inode off disk if we are auth
1676           mdcache->open_root_inode(gather.new_sub());
1677         } else if (is_any_replay()) {
1678           // replay.  make up fake root inode to start with
1679           mdcache->create_root_inode();
1680         }
1681         gather.activate();
1682       }
1683       break;
1684     case MDS_BOOT_PREPARE_LOG:
1685       if (is_any_replay()) {
1686         dout(2) << "Booting: " << step << ": replaying mds log" << dendl;
1687         MDSGatherBuilder gather(g_ceph_context,
1688             new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
1689
1690         if (!standby_replaying) {
1691           dout(2) << "Booting: " << step << ": waiting for purge queue recovered" << dendl;
1692           purge_queue.wait_for_recovery(new C_IO_Wrapper(this, gather.new_sub()));
1693         }
1694
1695         mdlog->replay(gather.new_sub());
1696         gather.activate();
1697       } else {
1698         dout(2) << "Booting: " << step << ": positioning at end of old mds log" << dendl;
1699         mdlog->append();
1700         starting_done();
1701       }
1702       break;
1703     case MDS_BOOT_REPLAY_DONE:
1704       ceph_assert(is_any_replay());
1705
1706       // Sessiontable and inotable should be in sync after replay, validate
1707       // that they are consistent.
1708       validate_sessions();
1709
1710       replay_done();
1711       break;
1712   }
1713 }
1714
1715 void MDSRank::validate_sessions()
1716 {
1717   ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
1718   bool valid = true;
1719
1720   // Identify any sessions which have state inconsistent with other,
1721   // after they have been loaded from rados during startup.
1722   // Mitigate bugs like: http://tracker.ceph.com/issues/16842
1723   for (const auto &i : sessionmap.get_sessions()) {
1724     Session *session = i.second;
1725     ceph_assert(session->info.prealloc_inos == session->free_prealloc_inos);
1726
1727     interval_set<inodeno_t> badones;
1728     if (inotable->intersects_free(session->info.prealloc_inos, &badones)) {
1729       clog->error() << "client " << *session
1730                     << "loaded with preallocated inodes that are inconsistent with inotable";
1731       valid = false;
1732     }
1733   }
1734
1735   if (!valid) {
1736     damaged();
1737     ceph_assert(valid);
1738   }
1739 }
1740
1741 void MDSRank::starting_done()
1742 {
1743   dout(3) << "starting_done" << dendl;
1744   ceph_assert(is_starting());
1745   request_state(MDSMap::STATE_ACTIVE);
1746
1747   mdlog->start_new_segment();
1748
1749   // sync snaptable cache
1750   snapclient->sync(new C_MDSInternalNoop);
1751 }
1752
1753
1754 void MDSRank::calc_recovery_set()
1755 {
1756   // initialize gather sets
1757   set<mds_rank_t> rs;
1758   mdsmap->get_recovery_mds_set(rs);
1759   rs.erase(whoami);
1760   mdcache->set_recovery_set(rs);
1761
1762   dout(1) << " recovery set is " << rs << dendl;
1763 }
1764
1765 void MDSRank::replay_start()
1766 {
1767   dout(1) << "replay_start" << dendl;
1768
1769   if (is_standby_replay()) {
1770     standby_replaying = true;
1771     if (unlikely(g_conf().get_val<bool>("mds_standby_replay_damaged"))) {
1772       damaged();
1773     }
1774   }
1775
1776   // Check if we need to wait for a newer OSD map before starting
1777   bool const ready = objecter->with_osdmap(
1778     [this](const OSDMap& o) {
1779       return o.get_epoch() >= mdsmap->get_last_failure_osd_epoch();
1780     });
1781
1782   if (ready) {
1783     boot_start();
1784   } else {
1785     dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
1786             << " (which blocklists prior instance)" << dendl;
1787     Context *fin = new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_INITIAL));
1788     objecter->wait_for_map(
1789       mdsmap->get_last_failure_osd_epoch(),
1790       lambdafy(fin));
1791   }
1792 }
1793
1794
1795 class MDSRank::C_MDS_StandbyReplayRestartFinish : public MDSIOContext {
1796   uint64_t old_read_pos;
1797 public:
1798   C_MDS_StandbyReplayRestartFinish(MDSRank *mds_, uint64_t old_read_pos_) :
1799     MDSIOContext(mds_), old_read_pos(old_read_pos_) {}
1800   void finish(int r) override {
1801     mds->_standby_replay_restart_finish(r, old_read_pos);
1802   }
1803   void print(ostream& out) const override {
1804     out << "standby_replay_restart";
1805   }
1806 };
1807
1808 void MDSRank::_standby_replay_restart_finish(int r, uint64_t old_read_pos)
1809 {
1810   if (old_read_pos < mdlog->get_journaler()->get_trimmed_pos()) {
1811     dout(0) << "standby MDS fell behind active MDS journal's expire_pos, restarting" << dendl;
1812     respawn(); /* we're too far back, and this is easier than
1813                   trying to reset everything in the cache, etc */
1814   } else {
1815     mdlog->standby_trim_segments();
1816     boot_start(MDS_BOOT_PREPARE_LOG, r);
1817   }
1818 }
1819
1820 class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
1821 public:
1822   explicit C_MDS_StandbyReplayRestart(MDSRank *m) : MDSInternalContext(m) {}
1823   void finish(int r) override {
1824     ceph_assert(!r);
1825     mds->standby_replay_restart();
1826   }
1827 };
1828
1829 void MDSRank::standby_replay_restart()
1830 {
1831   if (standby_replaying) {
1832     /* Go around for another pass of replaying in standby */
1833     dout(5) << "Restarting replay as standby-replay" << dendl;
1834     mdlog->get_journaler()->reread_head_and_probe(
1835       new C_MDS_StandbyReplayRestartFinish(
1836         this,
1837         mdlog->get_journaler()->get_read_pos()));
1838   } else {
1839     /* We are transitioning out of standby: wait for OSD map update
1840        before making final pass */
1841     dout(1) << "standby_replay_restart (final takeover pass)" << dendl;
1842     bool ready = objecter->with_osdmap(
1843       [this](const OSDMap& o) {
1844         return o.get_epoch() >= mdsmap->get_last_failure_osd_epoch();
1845       });
1846     if (ready) {
1847       mdlog->get_journaler()->reread_head_and_probe(
1848         new C_MDS_StandbyReplayRestartFinish(
1849           this,
1850           mdlog->get_journaler()->get_read_pos()));
1851
1852       dout(1) << " opening purge_queue (async)" << dendl;
1853       purge_queue.open(NULL);
1854       dout(1) << " opening open_file_table (async)" << dendl;
1855       mdcache->open_file_table.load(nullptr);
1856     } else {
1857       auto fin = new C_IO_Wrapper(this, new C_MDS_StandbyReplayRestart(this));
1858       dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
1859               << " (which blocklists prior instance)" << dendl;
1860       objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(),
1861                              lambdafy(fin));
1862     }
1863   }
1864 }
1865
1866 void MDSRank::replay_done()
1867 {
1868   if (!standby_replaying) {
1869     dout(1) << "Finished replaying journal" << dendl;
1870   } else {
1871     dout(5) << "Finished replaying journal as standby-replay" << dendl;
1872   }
1873
1874   if (is_standby_replay()) {
1875     // The replay was done in standby state, and we are still in that state
1876     ceph_assert(standby_replaying);
1877     dout(10) << "setting replay timer" << dendl;
1878     timer.add_event_after(g_conf()->mds_replay_interval,
1879                           new C_MDS_StandbyReplayRestart(this));
1880     return;
1881   } else if (standby_replaying) {
1882     // The replay was done in standby state, we have now _left_ that state
1883     dout(10) << " last replay pass was as a standby; making final pass" << dendl;
1884     standby_replaying = false;
1885     standby_replay_restart();
1886     return;
1887   } else {
1888     // Replay is complete, journal read should be up to date
1889     ceph_assert(mdlog->get_journaler()->get_read_pos() == mdlog->get_journaler()->get_write_pos());
1890     ceph_assert(!is_standby_replay());
1891
1892     // Reformat and come back here
1893     if (mdlog->get_journaler()->get_stream_format() < g_conf()->mds_journal_format) {
1894         dout(4) << "reformatting journal on standby-replay->replay transition" << dendl;
1895         mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
1896         return;
1897     }
1898   }
1899
1900   dout(1) << "making mds journal writeable" << dendl;
1901   mdlog->get_journaler()->set_writeable();
1902   mdlog->get_journaler()->trim_tail();
1903
1904   if (mdsmap->get_tableserver() == whoami &&
1905       snapserver->upgrade_format()) {
1906     dout(1) << "upgrading snaptable format" << dendl;
1907     snapserver->save(new C_MDSInternalNoop);
1908   }
1909
1910   if (g_conf()->mds_wipe_sessions) {
1911     dout(1) << "wiping out client sessions" << dendl;
1912     sessionmap.wipe();
1913     sessionmap.save(new C_MDSInternalNoop);
1914   }
1915   if (g_conf()->mds_wipe_ino_prealloc) {
1916     dout(1) << "wiping out ino prealloc from sessions" << dendl;
1917     sessionmap.wipe_ino_prealloc();
1918     sessionmap.save(new C_MDSInternalNoop);
1919   }
1920   if (g_conf()->mds_skip_ino) {
1921     inodeno_t i = g_conf()->mds_skip_ino;
1922     dout(1) << "skipping " << i << " inodes" << dendl;
1923     inotable->skip_inos(i);
1924     inotable->save(new C_MDSInternalNoop);
1925   }
1926
1927   if (mdsmap->get_num_in_mds() == 1 &&
1928       mdsmap->get_num_failed_mds() == 0) { // just me!
1929     dout(2) << "i am alone, moving to state reconnect" << dendl;
1930     request_state(MDSMap::STATE_RECONNECT);
1931     // sync snaptable cache
1932     snapclient->sync(new C_MDSInternalNoop);
1933   } else {
1934     dout(2) << "i am not alone, moving to state resolve" << dendl;
1935     request_state(MDSMap::STATE_RESOLVE);
1936   }
1937 }
1938
1939 void MDSRank::reopen_log()
1940 {
1941   dout(1) << "reopen_log" << dendl;
1942   mdcache->rollback_uncommitted_fragments();
1943 }
1944
1945 void MDSRank::resolve_start()
1946 {
1947   dout(1) << "resolve_start" << dendl;
1948
1949   reopen_log();
1950
1951   calc_recovery_set();
1952
1953   mdcache->resolve_start(new C_MDS_VoidFn(this, &MDSRank::resolve_done));
1954   finish_contexts(g_ceph_context, waiting_for_resolve);
1955 }
1956
1957 void MDSRank::resolve_done()
1958 {
1959   dout(1) << "resolve_done" << dendl;
1960   request_state(MDSMap::STATE_RECONNECT);
1961   // sync snaptable cache
1962   snapclient->sync(new C_MDSInternalNoop);
1963 }
1964
1965 void MDSRank::apply_blocklist(const std::set<entity_addr_t> &addrs, epoch_t epoch) {
1966   auto victims = server->apply_blocklist();
1967   dout(4) << __func__ << ": killed " << victims << ", blocklisted sessions ("
1968           << addrs.size() << " blocklist entries, "
1969           << sessionmap.get_sessions().size() << ")" << dendl;
1970   if (victims) {
1971     set_osd_epoch_barrier(epoch);
1972   }
1973 }
1974
1975
1976 void MDSRank::reconnect_start()
1977 {
1978   dout(1) << "reconnect_start" << dendl;
1979
1980   if (last_state == MDSMap::STATE_REPLAY) {
1981     reopen_log();
1982   }
1983
1984   // Drop any blocklisted clients from the SessionMap before going
1985   // into reconnect, so that we don't wait for them.
1986   objecter->enable_blocklist_events();
1987   std::set<entity_addr_t> blocklist;
1988   std::set<entity_addr_t> range;
1989   epoch_t epoch = 0;
1990   objecter->with_osdmap([&blocklist, &range, &epoch](const OSDMap& o) {
1991     o.get_blocklist(&blocklist, &range);
1992       epoch = o.get_epoch();
1993   });
1994
1995   apply_blocklist(blocklist, epoch);
1996
1997   server->reconnect_clients(new C_MDS_VoidFn(this, &MDSRank::reconnect_done));
1998   finish_contexts(g_ceph_context, waiting_for_reconnect);
1999 }
2000 void MDSRank::reconnect_done()
2001 {
2002   dout(1) << "reconnect_done" << dendl;
2003   request_state(MDSMap::STATE_REJOIN);    // move to rejoin state
2004 }
2005
2006 void MDSRank::rejoin_joint_start()
2007 {
2008   dout(1) << "rejoin_joint_start" << dendl;
2009   mdcache->rejoin_send_rejoins();
2010 }
2011 void MDSRank::rejoin_start()
2012 {
2013   dout(1) << "rejoin_start" << dendl;
2014   mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
2015   finish_contexts(g_ceph_context, waiting_for_rejoin);
2016 }
2017 void MDSRank::rejoin_done()
2018 {
2019   dout(1) << "rejoin_done" << dendl;
2020   mdcache->show_subtrees();
2021   mdcache->show_cache();
2022
2023   if (mdcache->is_any_uncommitted_fragment()) {
2024     dout(1) << " waiting for uncommitted fragments" << dendl;
2025     mdcache->wait_for_uncommitted_fragments(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
2026     return;
2027   }
2028
2029   // funny case: is our cache empty?  no subtrees?
2030   if (!mdcache->is_subtrees()) {
2031     if (whoami == 0) {
2032       // The root should always have a subtree!
2033       clog->error() << "No subtrees found for root MDS rank!";
2034       damaged();
2035       ceph_assert(mdcache->is_subtrees());
2036     } else {
2037       dout(1) << " empty cache, no subtrees, leaving cluster" << dendl;
2038       request_state(MDSMap::STATE_STOPPED);
2039     }
2040     return;
2041   }
2042
2043   if (replay_queue.empty() && !server->get_num_pending_reclaim()) {
2044     request_state(MDSMap::STATE_ACTIVE);
2045   } else {
2046     replaying_requests_done = replay_queue.empty();
2047     request_state(MDSMap::STATE_CLIENTREPLAY);
2048   }
2049 }
2050
2051 void MDSRank::clientreplay_start()
2052 {
2053   dout(1) << "clientreplay_start" << dendl;
2054   finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
2055   queue_one_replay();
2056 }
2057
2058 bool MDSRank::queue_one_replay()
2059 {
2060   if (!replay_queue.empty()) {
2061     queue_waiter(replay_queue.front());
2062     replay_queue.pop_front();
2063     return true;
2064   }
2065   if (!replaying_requests_done) {
2066     replaying_requests_done = true;
2067     mdlog->flush();
2068   }
2069   maybe_clientreplay_done();
2070   return false;
2071 }
2072
2073 void MDSRank::maybe_clientreplay_done()
2074 {
2075   if (is_clientreplay() && get_want_state() == MDSMap::STATE_CLIENTREPLAY) {
2076
2077     // don't go to active if there are session waiting for being reclaimed
2078     if (replaying_requests_done && !server->get_num_pending_reclaim()) {
2079       mdlog->wait_for_safe(new C_MDS_VoidFn(this, &MDSRank::clientreplay_done));
2080       return;
2081     }
2082
2083     dout(1) << " still have " << replay_queue.size() + (int)!replaying_requests_done
2084             << " requests need to be replayed, " << server->get_num_pending_reclaim()
2085             << " sessions need to be reclaimed" << dendl;
2086   }
2087 }
2088
2089 void MDSRank::clientreplay_done()
2090 {
2091   dout(1) << "clientreplay_done" << dendl;
2092   request_state(MDSMap::STATE_ACTIVE);
2093 }
2094
2095 void MDSRank::active_start()
2096 {
2097   dout(1) << "active_start" << dendl;
2098
2099   if (last_state == MDSMap::STATE_CREATING ||
2100       last_state == MDSMap::STATE_STARTING) {
2101     mdcache->open_root();
2102   }
2103
2104   dout(10) << __func__ << ": initializing metrics handler" << dendl;
2105   metrics_handler.init();
2106   messenger->add_dispatcher_tail(&metrics_handler);
2107
2108   // metric aggregation is solely done by rank 0
2109   if (is_rank0()) {
2110     dout(10) << __func__ << ": initializing metric aggregator" << dendl;
2111     ceph_assert(metric_aggregator == nullptr);
2112     metric_aggregator = std::make_unique<MetricAggregator>(cct, this, mgrc);
2113     metric_aggregator->init();
2114     messenger->add_dispatcher_tail(metric_aggregator.get());
2115   }
2116
2117   mdcache->clean_open_file_lists();
2118   mdcache->export_remaining_imported_caps();
2119   finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
2120
2121   mdcache->reissue_all_caps();
2122
2123   finish_contexts(g_ceph_context, waiting_for_active);  // kick waiters
2124 }
2125
2126 void MDSRank::recovery_done(int oldstate)
2127 {
2128   dout(1) << "recovery_done -- successful recovery!" << dendl;
2129   ceph_assert(is_clientreplay() || is_active());
2130
2131   if (oldstate == MDSMap::STATE_CREATING)
2132     return;
2133
2134   mdcache->start_recovered_truncates();
2135   mdcache->start_purge_inodes();
2136   mdcache->start_files_to_recover();
2137
2138   mdcache->populate_mydir();
2139 }
2140
2141 void MDSRank::creating_done()
2142 {
2143   dout(1)<< "creating_done" << dendl;
2144   request_state(MDSMap::STATE_ACTIVE);
2145   // sync snaptable cache
2146   snapclient->sync(new C_MDSInternalNoop);
2147 }
2148
2149 void MDSRank::boot_create()
2150 {
2151   dout(3) << "boot_create" << dendl;
2152
2153   MDSGatherBuilder fin(g_ceph_context, new C_MDS_VoidFn(this, &MDSRank::creating_done));
2154
2155   mdcache->init_layouts();
2156
2157   inotable->set_rank(whoami);
2158   sessionmap.set_rank(whoami);
2159
2160   // start with a fresh journal
2161   dout(10) << "boot_create creating fresh journal" << dendl;
2162   mdlog->create(fin.new_sub());
2163
2164   // open new journal segment, but do not journal subtree map (yet)
2165   mdlog->prepare_new_segment();
2166
2167   if (whoami == mdsmap->get_root()) {
2168     dout(3) << "boot_create creating fresh hierarchy" << dendl;
2169     mdcache->create_empty_hierarchy(fin.get());
2170   }
2171
2172   dout(3) << "boot_create creating mydir hierarchy" << dendl;
2173   mdcache->create_mydir_hierarchy(fin.get());
2174
2175   dout(3) << "boot_create creating global snaprealm" << dendl;
2176   mdcache->create_global_snaprealm();
2177
2178   // fixme: fake out inotable (reset, pretend loaded)
2179   dout(10) << "boot_create creating fresh inotable table" << dendl;
2180   inotable->reset();
2181   inotable->save(fin.new_sub());
2182
2183   // write empty sessionmap
2184   sessionmap.save(fin.new_sub());
2185
2186   // Create empty purge queue
2187   purge_queue.create(new C_IO_Wrapper(this, fin.new_sub()));
2188
2189   // initialize tables
2190   if (mdsmap->get_tableserver() == whoami) {
2191     dout(10) << "boot_create creating fresh snaptable" << dendl;
2192     snapserver->set_rank(whoami);
2193     snapserver->reset();
2194     snapserver->save(fin.new_sub());
2195   }
2196
2197   ceph_assert(g_conf()->mds_kill_create_at != 1);
2198
2199   // ok now journal it
2200   mdlog->journal_segment_subtree_map(fin.new_sub());
2201   mdlog->flush();
2202
2203   // Usually we do this during reconnect, but creation skips that.
2204   objecter->enable_blocklist_events();
2205
2206   fin.activate();
2207 }
2208
2209 void MDSRank::stopping_start()
2210 {
2211   dout(2) << "Stopping..." << dendl;
2212
2213   if (mdsmap->get_num_in_mds() == 1 && !sessionmap.empty()) {
2214     std::vector<Session*> victims;
2215     const auto& sessions = sessionmap.get_sessions();
2216     for (const auto& p : sessions)  {
2217       if (!p.first.is_client()) {
2218         continue;
2219       }
2220
2221       Session *s = p.second;
2222       victims.push_back(s);
2223     }
2224
2225     dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
2226     ceph_assert(!victims.empty());
2227
2228     C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
2229     for (const auto &s : victims) {
2230       CachedStackStringStream css;
2231       evict_client(s->get_client().v, false,
2232                    g_conf()->mds_session_blocklist_on_evict, *css, gather.new_sub());
2233     }
2234     gather.activate();
2235   }
2236
2237   mdcache->shutdown_start();
2238 }
2239
2240 void MDSRank::stopping_done()
2241 {
2242   dout(2) << "Finished stopping..." << dendl;
2243
2244   // tell monitor we shut down cleanly.
2245   request_state(MDSMap::STATE_STOPPED);
2246 }
2247
2248 void MDSRankDispatcher::handle_mds_map(
2249     const cref_t<MMDSMap> &m,
2250     const MDSMap &oldmap)
2251 {
2252   // I am only to be passed MDSMaps in which I hold a rank
2253   ceph_assert(whoami != MDS_RANK_NONE);
2254
2255   mds_gid_t mds_gid = mds_gid_t(monc->get_global_id());
2256   MDSMap::DaemonState oldstate = oldmap.get_state_gid(mds_gid);
2257   if (oldstate == MDSMap::STATE_NULL) {
2258     // monitor may skip sending me the STANDBY map (e.g. if paxos_propose_interval is high)
2259     // Assuming I have passed STANDBY state if I got a rank in the first map.
2260     oldstate = MDSMap::STATE_STANDBY;
2261   }
2262   // I should not miss map update
2263   ceph_assert(state == oldstate);
2264   state = mdsmap->get_state_gid(mds_gid);
2265   if (state != oldstate) {
2266     last_state = oldstate;
2267     incarnation = mdsmap->get_inc_gid(mds_gid);
2268   }
2269
2270   version_t epoch = m->get_epoch();
2271
2272   // note source's map version
2273   if (m->get_source().is_mds() &&
2274       peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] < epoch) {
2275     dout(15) << " peer " << m->get_source()
2276              << " has mdsmap epoch >= " << epoch
2277              << dendl;
2278     peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] = epoch;
2279   }
2280
2281   // Validate state transitions while I hold a rank
2282   if (!MDSMap::state_transition_valid(oldstate, state)) {
2283     derr << "Invalid state transition " << ceph_mds_state_name(oldstate)
2284       << "->" << ceph_mds_state_name(state) << dendl;
2285     respawn();
2286   }
2287
2288   if (oldstate != state) {
2289     // update messenger.
2290     auto sleep_rank_change = g_conf().get_val<double>("mds_sleep_rank_change");
2291     if (unlikely(sleep_rank_change > 0)) {
2292       // This is to trigger a race where another rank tries to connect to this
2293       // MDS before an update to the messenger "myname" is processed. This race
2294       // should be closed by ranks holding messages until the rank is out of a
2295       // "bootstrapping" state.
2296       usleep(sleep_rank_change);
2297     } if (state == MDSMap::STATE_STANDBY_REPLAY) {
2298       dout(1) << "handle_mds_map i am now mds." << mds_gid << "." << incarnation
2299           << " replaying mds." << whoami << "." << incarnation << dendl;
2300       messenger->set_myname(entity_name_t::MDS(mds_gid));
2301     } else {
2302       dout(1) << "handle_mds_map i am now mds." << whoami << "." << incarnation << dendl;
2303       messenger->set_myname(entity_name_t::MDS(whoami));
2304     }
2305   }
2306
2307   // tell objecter my incarnation
2308   if (objecter->get_client_incarnation() != incarnation)
2309     objecter->set_client_incarnation(incarnation);
2310
2311   if (mdsmap->get_required_client_features() != oldmap.get_required_client_features())
2312     server->update_required_client_features();
2313
2314   // for debug
2315   if (g_conf()->mds_dump_cache_on_map)
2316     mdcache->dump_cache();
2317
2318   cluster_degraded = mdsmap->is_degraded();
2319
2320   // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap.
2321   // the 'restart' set tracks ranks that have restarted since the old mdsmap
2322   set<mds_rank_t> restart;
2323   // replaying mds does not communicate with other ranks
2324   if (state >= MDSMap::STATE_RESOLVE) {
2325     // did someone fail?
2326     //   new down?
2327     set<mds_rank_t> olddown, down;
2328     oldmap.get_down_mds_set(&olddown);
2329     mdsmap->get_down_mds_set(&down);
2330     for (const auto& r : down) {
2331       if (oldmap.have_inst(r) && olddown.count(r) == 0) {
2332         messenger->mark_down_addrs(oldmap.get_addrs(r));
2333         handle_mds_failure(r);
2334       }
2335     }
2336
2337     // did someone fail?
2338     //   did their addr/inst change?
2339     set<mds_rank_t> up;
2340     mdsmap->get_up_mds_set(up);
2341     for (const auto& r : up) {
2342       auto& info = mdsmap->get_info(r);
2343       if (oldmap.have_inst(r)) {
2344         auto& oldinfo = oldmap.get_info(r);
2345         if (info.inc != oldinfo.inc) {
2346           messenger->mark_down_addrs(oldinfo.get_addrs());
2347           if (info.state == MDSMap::STATE_REPLAY ||
2348               info.state == MDSMap::STATE_RESOLVE) {
2349             restart.insert(r);
2350             handle_mds_failure(r);
2351           } else {
2352             ceph_assert(info.state == MDSMap::STATE_STARTING ||
2353                    info.state == MDSMap::STATE_ACTIVE);
2354             // -> stopped (missing) -> starting -> active
2355             restart.insert(r);
2356             mdcache->migrator->handle_mds_failure_or_stop(r);
2357             if (mdsmap->get_tableserver() == whoami)
2358               snapserver->handle_mds_failure_or_stop(r);
2359           }
2360         }
2361       } else {
2362         if (info.state == MDSMap::STATE_REPLAY ||
2363             info.state == MDSMap::STATE_RESOLVE) {
2364           // -> starting/creating (missing) -> active (missing) -> replay -> resolve
2365           restart.insert(r);
2366           handle_mds_failure(r);
2367         } else {
2368           ceph_assert(info.state == MDSMap::STATE_CREATING ||
2369                  info.state == MDSMap::STATE_STARTING ||
2370                  info.state == MDSMap::STATE_ACTIVE);
2371         }
2372       }
2373     }
2374   }
2375
2376   // did it change?
2377   if (oldstate != state) {
2378     dout(1) << "handle_mds_map state change "
2379             << ceph_mds_state_name(oldstate) << " --> "
2380             << ceph_mds_state_name(state) << dendl;
2381     beacon.set_want_state(*mdsmap, state);
2382
2383     if (oldstate == MDSMap::STATE_STANDBY_REPLAY) {
2384         dout(10) << "Monitor activated us! Deactivating replay loop" << dendl;
2385         ceph_assert (state == MDSMap::STATE_REPLAY);
2386     } else {
2387       // did i just recover?
2388       if ((is_active() || is_clientreplay()) &&
2389           (oldstate == MDSMap::STATE_CREATING ||
2390            oldstate == MDSMap::STATE_REJOIN ||
2391            oldstate == MDSMap::STATE_RECONNECT))
2392         recovery_done(oldstate);
2393
2394       if (is_active()) {
2395         active_start();
2396       } else if (is_any_replay()) {
2397         replay_start();
2398       } else if (is_resolve()) {
2399         resolve_start();
2400       } else if (is_reconnect()) {
2401         reconnect_start();
2402       } else if (is_rejoin()) {
2403         rejoin_start();
2404       } else if (is_clientreplay()) {
2405         clientreplay_start();
2406       } else if (is_creating()) {
2407         boot_create();
2408       } else if (is_starting()) {
2409         boot_start();
2410       } else if (is_stopping()) {
2411         ceph_assert(oldstate == MDSMap::STATE_ACTIVE);
2412         stopping_start();
2413       }
2414     }
2415   }
2416
2417   // RESOLVE
2418   // is someone else newly resolving?
2419   if (state >= MDSMap::STATE_RESOLVE) {
2420     // recover snaptable
2421     if (mdsmap->get_tableserver() == whoami) {
2422       if (oldstate < MDSMap::STATE_RESOLVE) {
2423         set<mds_rank_t> s;
2424         mdsmap->get_mds_set_lower_bound(s, MDSMap::STATE_RESOLVE);
2425         snapserver->finish_recovery(s);
2426       } else {
2427         set<mds_rank_t> old_set, new_set;
2428         oldmap.get_mds_set_lower_bound(old_set, MDSMap::STATE_RESOLVE);
2429         mdsmap->get_mds_set_lower_bound(new_set, MDSMap::STATE_RESOLVE);
2430         for (const auto& r : new_set) {
2431           if (r == whoami)
2432             continue; // not me
2433           if (!old_set.count(r) || restart.count(r)) {  // newly so?
2434             snapserver->handle_mds_recovery(r);
2435           }
2436         }
2437       }
2438     }
2439
2440     if ((!oldmap.is_resolving() || !restart.empty()) && mdsmap->is_resolving()) {
2441       set<mds_rank_t> resolve;
2442       mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
2443       dout(10) << " resolve set is " << resolve << dendl;
2444       calc_recovery_set();
2445       mdcache->send_resolves();
2446     }
2447   }
2448
2449   // REJOIN
2450   // is everybody finally rejoining?
2451   if (state >= MDSMap::STATE_REJOIN) {
2452     // did we start?
2453     if (!oldmap.is_rejoining() && mdsmap->is_rejoining())
2454       rejoin_joint_start();
2455
2456     // did we finish?
2457     if (g_conf()->mds_dump_cache_after_rejoin &&
2458         oldmap.is_rejoining() && !mdsmap->is_rejoining())
2459       mdcache->dump_cache();      // for DEBUG only
2460
2461     if (oldstate >= MDSMap::STATE_REJOIN ||
2462         oldstate == MDSMap::STATE_STARTING) {
2463       // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them.
2464       set<mds_rank_t> olddis, dis;
2465       oldmap.get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN);
2466       mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN);
2467       for (const auto& r : dis) {
2468         if (r == whoami)
2469           continue; // not me
2470         if (!olddis.count(r) || restart.count(r)) {  // newly so?
2471           mdcache->kick_discovers(r);
2472           mdcache->kick_open_ino_peers(r);
2473         }
2474       }
2475     }
2476   }
2477
2478   if (oldmap.is_degraded() && !cluster_degraded && state >= MDSMap::STATE_ACTIVE) {
2479     dout(1) << "cluster recovered." << dendl;
2480     auto it = waiting_for_active_peer.find(MDS_RANK_NONE);
2481     if (it != waiting_for_active_peer.end()) {
2482       queue_waiters(it->second);
2483       waiting_for_active_peer.erase(it);
2484     }
2485   }
2486
2487   // did someone leave a "bootstrapping" state? We can't connect until then to
2488   // allow messenger "myname" updates.
2489   {
2490     std::vector<mds_rank_t> erase;
2491     for (auto& [rank, queue] : waiting_for_bootstrapping_peer) {
2492       auto state = mdsmap->get_state(rank);
2493       if (state > MDSMap::STATE_REPLAY) {
2494         queue_waiters(queue);
2495         erase.push_back(rank);
2496       }
2497     }
2498     for (const auto& rank : erase) {
2499       waiting_for_bootstrapping_peer.erase(rank);
2500     }
2501   }
2502   // for testing...
2503   if (unlikely(g_conf().get_val<bool>("mds_connect_bootstrapping"))) {
2504     std::set<mds_rank_t> bootstrapping;
2505     mdsmap->get_mds_set(bootstrapping, MDSMap::STATE_REPLAY);
2506     mdsmap->get_mds_set(bootstrapping, MDSMap::STATE_CREATING);
2507     mdsmap->get_mds_set(bootstrapping, MDSMap::STATE_STARTING);
2508     for (const auto& rank : bootstrapping) {
2509       auto m = make_message<MMDSMap>(monc->get_fsid(), *mdsmap);
2510       send_message_mds(std::move(m), rank);
2511     }
2512   }
2513
2514   // did someone go active?
2515   if (state >= MDSMap::STATE_CLIENTREPLAY &&
2516       oldstate >= MDSMap::STATE_CLIENTREPLAY) {
2517     set<mds_rank_t> oldactive, active;
2518     oldmap.get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY);
2519     mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
2520     for (const auto& r : active) {
2521       if (r == whoami)
2522         continue; // not me
2523       if (!oldactive.count(r) || restart.count(r))  // newly so?
2524         handle_mds_recovery(r);
2525     }
2526   }
2527
2528   if (is_clientreplay() || is_active() || is_stopping()) {
2529     // did anyone stop?
2530     set<mds_rank_t> oldstopped, stopped;
2531     oldmap.get_stopped_mds_set(oldstopped);
2532     mdsmap->get_stopped_mds_set(stopped);
2533     for (const auto& r : stopped)
2534       if (oldstopped.count(r) == 0) {     // newly so?
2535         mdcache->migrator->handle_mds_failure_or_stop(r);
2536         if (mdsmap->get_tableserver() == whoami)
2537           snapserver->handle_mds_failure_or_stop(r);
2538       }
2539   }
2540
2541   {
2542     map<epoch_t,MDSContext::vec >::iterator p = waiting_for_mdsmap.begin();
2543     while (p != waiting_for_mdsmap.end() && p->first <= mdsmap->get_epoch()) {
2544       MDSContext::vec ls;
2545       ls.swap(p->second);
2546       waiting_for_mdsmap.erase(p++);
2547       queue_waiters(ls);
2548     }
2549   }
2550
2551   if (is_active()) {
2552     // Before going active, set OSD epoch barrier to latest (so that
2553     // we don't risk handing out caps to clients with old OSD maps that
2554     // might not include barriers from the previous incarnation of this MDS)
2555     set_osd_epoch_barrier(objecter->with_osdmap(
2556                             std::mem_fn(&OSDMap::get_epoch)));
2557
2558     /* Now check if we should hint to the OSD that a read may follow */
2559     if (mdsmap->has_standby_replay(whoami))
2560       mdlog->set_write_iohint(0);
2561     else
2562       mdlog->set_write_iohint(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
2563   }
2564
2565   if (oldmap.get_max_mds() != mdsmap->get_max_mds()) {
2566     purge_queue.update_op_limit(*mdsmap);
2567   }
2568
2569   if (mdsmap->get_inline_data_enabled() && !oldmap.get_inline_data_enabled())
2570     dout(0) << "WARNING: inline_data support has been deprecated and will be removed in a future release" << dendl;
2571
2572   mdcache->handle_mdsmap(*mdsmap, oldmap);
2573
2574   if (metric_aggregator != nullptr) {
2575     metric_aggregator->notify_mdsmap(*mdsmap);
2576   }
2577   metrics_handler.notify_mdsmap(*mdsmap);
2578 }
2579
2580 void MDSRank::handle_mds_recovery(mds_rank_t who)
2581 {
2582   dout(5) << "handle_mds_recovery mds." << who << dendl;
2583
2584   mdcache->handle_mds_recovery(who);
2585
2586   queue_waiters(waiting_for_active_peer[who]);
2587   waiting_for_active_peer.erase(who);
2588 }
2589
2590 void MDSRank::handle_mds_failure(mds_rank_t who)
2591 {
2592   if (who == whoami) {
2593     dout(5) << "handle_mds_failure for myself; not doing anything" << dendl;
2594     return;
2595   }
2596   dout(5) << "handle_mds_failure mds." << who << dendl;
2597
2598   mdcache->handle_mds_failure(who);
2599
2600   if (mdsmap->get_tableserver() == whoami)
2601     snapserver->handle_mds_failure_or_stop(who);
2602
2603   snapclient->handle_mds_failure(who);
2604
2605   scrubstack->handle_mds_failure(who);
2606 }
2607
2608 void MDSRankDispatcher::handle_asok_command(
2609   std::string_view command,
2610   const cmdmap_t& cmdmap,
2611   Formatter *f,
2612   const bufferlist &inbl,
2613   std::function<void(int,const std::string&,bufferlist&)> on_finish)
2614 {
2615   int r = 0;
2616   CachedStackStringStream css;
2617   bufferlist outbl;
2618   dout(10) << __func__ << ": " << command << dendl;
2619   if (command == "dump_ops_in_flight") {
2620     if (!op_tracker.dump_ops_in_flight(f)) {
2621       *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2622     }
2623   } else if (command == "ops") {
2624     vector<string> flags;
2625     cmd_getval(cmdmap, "flags", flags);
2626     std::unique_lock l(mds_lock, std::defer_lock);
2627     auto lambda = OpTracker::default_dumper;
2628     if (flags.size()) {
2629       /* use std::function if we actually want to capture flags someday */
2630       lambda = [](const TrackedOp& op, Formatter* f) {
2631         auto* req = dynamic_cast<const MDRequestImpl*>(&op);
2632         if (req) {
2633           req->dump_with_mds_lock(f);
2634         } else {
2635           op.dump_type(f);
2636         }
2637       };
2638       l.lock();
2639     }
2640     if (!op_tracker.dump_ops_in_flight(f, false, {""}, false, lambda)) {
2641       *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2642     }
2643   } else if (command == "dump_blocked_ops") {
2644     if (!op_tracker.dump_ops_in_flight(f, true)) {
2645       *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2646     }
2647   } else if (command == "dump_blocked_ops_count") {
2648     if (!op_tracker.dump_ops_in_flight(f, true, {""}, true)) {
2649       *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2650     }
2651   } else if (command == "dump_historic_ops") {
2652     if (!op_tracker.dump_historic_ops(f)) {
2653       *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2654     }
2655   } else if (command == "dump_historic_ops_by_duration") {
2656     if (!op_tracker.dump_historic_ops(f, true)) {
2657       *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2658     }
2659   } else if (command == "osdmap barrier") {
2660     int64_t target_epoch = 0;
2661     bool got_val = cmd_getval(cmdmap, "target_epoch", target_epoch);
2662
2663     if (!got_val) {
2664       *css << "no target epoch given";
2665       r = -CEPHFS_EINVAL;
2666       goto out;
2667     }
2668     {
2669       std::lock_guard l(mds_lock);
2670       set_osd_epoch_barrier(target_epoch);
2671     }
2672     boost::system::error_code ec;
2673     dout(4) << __func__ << ": possibly waiting for OSD epoch " << target_epoch << dendl;
2674     objecter->wait_for_map(target_epoch, ceph::async::use_blocked[ec]);
2675   } else if (command == "session ls" ||
2676              command == "client ls") {
2677     std::lock_guard l(mds_lock);
2678     bool cap_dump = false;
2679     std::vector<std::string> filter_args;
2680     cmd_getval(cmdmap, "cap_dump", cap_dump);
2681     cmd_getval(cmdmap, "filters", filter_args);
2682
2683     SessionFilter filter;
2684     r = filter.parse(filter_args, css.get());
2685     if (r != 0) {
2686       goto out;
2687     }
2688     dump_sessions(filter, f, cap_dump);
2689   } else if (command == "session evict" ||
2690              command == "client evict") {
2691     std::lock_guard l(mds_lock);
2692     std::vector<std::string> filter_args;
2693     cmd_getval(cmdmap, "filters", filter_args);
2694
2695     SessionFilter filter;
2696     r = filter.parse(filter_args, css.get());
2697     if (r != 0) {
2698       r = -CEPHFS_EINVAL;
2699       goto out;
2700     }
2701     evict_clients(filter, on_finish);
2702     return;
2703   } else if (command == "session kill") {
2704     std::string client_id;
2705     if (!cmd_getval(cmdmap, "client_id", client_id)) {
2706       *css << "Invalid client_id specified";
2707       r = -CEPHFS_ENOENT;
2708       goto out;
2709     }
2710     std::lock_guard l(mds_lock);
2711     bool evicted = evict_client(strtol(client_id.c_str(), 0, 10), true,
2712         g_conf()->mds_session_blocklist_on_evict, *css);
2713     if (!evicted) {
2714       dout(15) << css->strv() << dendl;
2715       r = -CEPHFS_ENOENT;
2716     }
2717   } else if (command == "session config" ||
2718              command == "client config") {
2719     int64_t client_id;
2720     std::string option;
2721     std::string value;
2722
2723     cmd_getval(cmdmap, "client_id", client_id);
2724     cmd_getval(cmdmap, "option", option);
2725     bool got_value = cmd_getval(cmdmap, "value", value);
2726
2727     std::lock_guard l(mds_lock);
2728     r = config_client(client_id, !got_value, option, value, *css);
2729   } else if (command == "scrub start" ||
2730              command == "scrub_start") {
2731     if (whoami != 0) {
2732       *css << "Not rank 0";
2733       r = -CEPHFS_EXDEV;
2734       goto out;
2735     }
2736
2737     string path;
2738     string tag;
2739     vector<string> scrubop_vec;
2740     cmd_getval(cmdmap, "scrubops", scrubop_vec);
2741     cmd_getval(cmdmap, "path", path);
2742     cmd_getval(cmdmap, "tag", tag);
2743
2744     finisher->queue(
2745       new LambdaContext(
2746         [this, on_finish, f, path, tag, scrubop_vec](int r) {
2747           command_scrub_start(
2748             f, path, tag, scrubop_vec,
2749             new LambdaContext(
2750               [on_finish](int r) {
2751                 bufferlist outbl;
2752                 on_finish(r, {}, outbl);
2753               }));
2754         }));
2755     return;
2756   } else if (command == "scrub abort") {
2757     if (whoami != 0) {
2758       *css << "Not rank 0";
2759       r = -CEPHFS_EXDEV;
2760       goto out;
2761     }
2762
2763     finisher->queue(
2764       new LambdaContext(
2765         [this, on_finish, f](int r) {
2766           command_scrub_abort(
2767             f,
2768             new LambdaContext(
2769               [on_finish, f](int r) {
2770                 bufferlist outbl;
2771                 f->open_object_section("result");
2772                 f->dump_int("return_code", r);
2773                 f->close_section();
2774                 on_finish(r, {}, outbl);
2775               }));
2776         }));
2777     return;
2778   } else if (command == "scrub pause") {
2779     if (whoami != 0) {
2780       *css << "Not rank 0";
2781       r = -CEPHFS_EXDEV;
2782       goto out;
2783     }
2784
2785     finisher->queue(
2786       new LambdaContext(
2787         [this, on_finish, f](int r) {
2788           command_scrub_pause(
2789             f,
2790             new LambdaContext(
2791               [on_finish, f](int r) {
2792                 bufferlist outbl;
2793                 f->open_object_section("result");
2794                 f->dump_int("return_code", r);
2795                 f->close_section();
2796                 on_finish(r, {}, outbl);
2797               }));
2798         }));
2799     return;
2800   } else if (command == "scrub resume") {
2801     if (whoami != 0) {
2802       *css << "Not rank 0";
2803       r = -CEPHFS_EXDEV;
2804       goto out;
2805     }
2806     command_scrub_resume(f);
2807   } else if (command == "scrub status") {
2808     command_scrub_status(f);
2809   } else if (command == "tag path") {
2810     if (whoami != 0) {
2811       *css << "Not rank 0";
2812       r = -CEPHFS_EXDEV;
2813       goto out;
2814     }
2815     string path;
2816     cmd_getval(cmdmap, "path", path);
2817     string tag;
2818     cmd_getval(cmdmap, "tag", tag);
2819     command_tag_path(f, path, tag);
2820   } else if (command == "flush_path") {
2821     string path;
2822     cmd_getval(cmdmap, "path", path);
2823     command_flush_path(f, path);
2824   } else if (command == "flush journal") {
2825     command_flush_journal(f);
2826   } else if (command == "get subtrees") {
2827     command_get_subtrees(f);
2828   } else if (command == "export dir") {
2829     string path;
2830     if(!cmd_getval(cmdmap, "path", path)) {
2831       *css << "malformed path";
2832       r = -CEPHFS_EINVAL;
2833       goto out;
2834     }
2835     int64_t rank;
2836     if(!cmd_getval(cmdmap, "rank", rank)) {
2837       *css << "malformed rank";
2838       r = -CEPHFS_EINVAL;
2839       goto out;
2840     }
2841     command_export_dir(f, path, (mds_rank_t)rank);
2842   } else if (command == "dump cache") {
2843     std::lock_guard l(mds_lock);
2844     int64_t timeout = 0;
2845     cmd_getval(cmdmap, "timeout", timeout);
2846     auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
2847     if (timeout <= 0)
2848       timeout = mds_beacon_interval / 2;
2849     else if (timeout > mds_beacon_interval)
2850       timeout = mds_beacon_interval;
2851     string path;
2852     if (!cmd_getval(cmdmap, "path", path)) {
2853       r = mdcache->dump_cache(f, timeout);
2854     } else {
2855       r = mdcache->dump_cache(path, timeout);
2856     }
2857   } else if (command == "cache drop") {
2858     int64_t timeout = 0;
2859     cmd_getval(cmdmap, "timeout", timeout);
2860     finisher->queue(
2861       new LambdaContext(
2862         [this, on_finish, f, timeout](int r) {
2863           command_cache_drop(
2864             timeout, f,
2865             new LambdaContext(
2866               [on_finish](int r) {
2867                 bufferlist outbl;
2868                 on_finish(r, {}, outbl);
2869               }));
2870         }));
2871     return;
2872   } else if (command == "cache status") {
2873     std::lock_guard l(mds_lock);
2874     mdcache->cache_status(f);
2875   } else if (command == "dump tree") {
2876     command_dump_tree(cmdmap, *css, f);
2877   } else if (command == "dump loads") {
2878     std::lock_guard l(mds_lock);
2879     int64_t depth = -1;
2880     bool got = cmd_getval(cmdmap, "depth", depth);
2881     if (!got || depth < 0) {
2882       dout(10) << "no depth limit when dirfrags dump_load" << dendl;
2883     }
2884     r = balancer->dump_loads(f, depth);
2885   } else if (command == "dump snaps") {
2886     std::lock_guard l(mds_lock);
2887     string server;
2888     cmd_getval(cmdmap, "server", server);
2889     if (server == "--server") {
2890       if (mdsmap->get_tableserver() == whoami) {
2891         snapserver->dump(f);
2892       } else {
2893         r = -CEPHFS_EXDEV;
2894         *css << "Not snapserver";
2895       }
2896     } else {
2897       r = snapclient->dump_cache(f);
2898     }
2899   } else if (command == "force_readonly") {
2900     std::lock_guard l(mds_lock);
2901     mdcache->force_readonly();
2902   } else if (command == "dirfrag split") {
2903     command_dirfrag_split(cmdmap, *css);
2904   } else if (command == "dirfrag merge") {
2905     command_dirfrag_merge(cmdmap, *css);
2906   } else if (command == "dirfrag ls") {
2907     command_dirfrag_ls(cmdmap, *css, f);
2908   } else if (command == "openfiles ls") {
2909     command_openfiles_ls(f);
2910   } else if (command == "dump inode") {
2911     command_dump_inode(f, cmdmap, *css);
2912   } else if (command == "damage ls") {
2913     std::lock_guard l(mds_lock);
2914     damage_table.dump(f);
2915   } else if (command == "damage rm") {
2916     std::lock_guard l(mds_lock);
2917     damage_entry_id_t id = 0;
2918     if (!cmd_getval(cmdmap, "damage_id", (int64_t&)id)) {
2919       r = -CEPHFS_EINVAL;
2920       goto out;
2921     }
2922     damage_table.erase(id);
2923   } else {
2924     r = -CEPHFS_ENOSYS;
2925   }
2926 out:
2927   on_finish(r, css->str(), outbl);
2928 }
2929
2930 /**
2931  * This function drops the mds_lock, so don't do anything with
2932  * MDSRank after calling it (we could have gone into shutdown): just
2933  * send your result back to the calling client and finish.
2934  */
2935 void MDSRankDispatcher::evict_clients(
2936   const SessionFilter &filter,
2937   std::function<void(int,const std::string&,bufferlist&)> on_finish)
2938 {
2939   bufferlist outbl;
2940   if (is_any_replay()) {
2941     on_finish(-CEPHFS_EAGAIN, "MDS is replaying log", outbl);
2942     return;
2943   }
2944
2945   std::vector<Session*> victims;
2946   const auto& sessions = sessionmap.get_sessions();
2947   for (const auto& p : sessions)  {
2948     if (!p.first.is_client()) {
2949       continue;
2950     }
2951
2952     Session *s = p.second;
2953
2954     if (filter.match(*s, std::bind(&Server::waiting_for_reconnect, server,
2955                                    std::placeholders::_1))) {
2956       victims.push_back(s);
2957     }
2958   }
2959
2960   dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
2961
2962   if (victims.empty()) {
2963     on_finish(0, {}, outbl);
2964     return;
2965   }
2966
2967   C_GatherBuilder gather(g_ceph_context,
2968                          new LambdaContext([on_finish](int r) {
2969                                              bufferlist bl;
2970                                              on_finish(r, {}, bl);
2971                                            }));
2972   for (const auto s : victims) {
2973     CachedStackStringStream css;
2974     evict_client(s->get_client().v, false,
2975                  g_conf()->mds_session_blocklist_on_evict, *css, gather.new_sub());
2976   }
2977   gather.activate();
2978 }
2979
2980 void MDSRankDispatcher::dump_sessions(const SessionFilter &filter, Formatter *f, bool cap_dump) const
2981 {
2982   // Dump sessions, decorated with recovery/replay status
2983   f->open_array_section("sessions");
2984   for (auto& [name, s] : sessionmap.get_sessions()) {
2985     if (!name.is_client()) {
2986       continue;
2987     }
2988
2989     if (!filter.match(*s, std::bind(&Server::waiting_for_reconnect, server, std::placeholders::_1))) {
2990       continue;
2991     }
2992
2993     f->open_object_section("session");
2994     s->dump(f, cap_dump);
2995     f->close_section();
2996   }
2997   f->close_section(); // sessions
2998 }
2999
3000 void MDSRank::command_scrub_start(Formatter *f,
3001                                   std::string_view path, std::string_view tag,
3002                                   const vector<string>& scrubop_vec, Context *on_finish)
3003 {
3004   bool force = false;
3005   bool recursive = false;
3006   bool repair = false;
3007   bool scrub_mdsdir = false;
3008   for (auto &op : scrubop_vec) {
3009     if (op == "force")
3010       force = true;
3011     else if (op == "recursive")
3012       recursive = true;
3013     else if (op == "repair")
3014       repair = true;
3015     else if (op == "scrub_mdsdir" && path == "/")
3016       scrub_mdsdir = true;
3017   }
3018
3019   std::lock_guard l(mds_lock);
3020   mdcache->enqueue_scrub(path, tag, force, recursive, repair, scrub_mdsdir,
3021                          f, on_finish);
3022   // scrub_dentry() finishers will dump the data for us; we're done!
3023 }
3024
3025 void MDSRank::command_tag_path(Formatter *f,
3026     std::string_view path, std::string_view tag)
3027 {
3028   C_SaferCond scond;
3029   {
3030     std::lock_guard l(mds_lock);
3031     mdcache->enqueue_scrub(path, tag, true, true, false, false, f, &scond);
3032   }
3033   scond.wait();
3034 }
3035
3036 void MDSRank::command_scrub_abort(Formatter *f, Context *on_finish) {
3037   std::lock_guard l(mds_lock);
3038   scrubstack->scrub_abort(on_finish);
3039 }
3040
3041 void MDSRank::command_scrub_pause(Formatter *f, Context *on_finish) {
3042   std::lock_guard l(mds_lock);
3043   scrubstack->scrub_pause(on_finish);
3044 }
3045
3046 void MDSRank::command_scrub_resume(Formatter *f) {
3047   std::lock_guard l(mds_lock);
3048   int r = scrubstack->scrub_resume();
3049
3050   f->open_object_section("result");
3051   f->dump_int("return_code", r);
3052   f->close_section();
3053 }
3054
3055 void MDSRank::command_scrub_status(Formatter *f) {
3056   std::lock_guard l(mds_lock);
3057   scrubstack->scrub_status(f);
3058 }
3059
3060 void MDSRank::command_flush_path(Formatter *f, std::string_view path)
3061 {
3062   C_SaferCond scond;
3063   {
3064     std::lock_guard l(mds_lock);
3065     mdcache->flush_dentry(path, &scond);
3066   }
3067   int r = scond.wait();
3068   f->open_object_section("results");
3069   f->dump_int("return_code", r);
3070   f->close_section(); // results
3071 }
3072
3073 // synchronous wrapper around "journal flush" asynchronous context
3074 // execution.
3075 void MDSRank::command_flush_journal(Formatter *f) {
3076   ceph_assert(f != NULL);
3077
3078   C_SaferCond cond;
3079   CachedStackStringStream css;
3080   {
3081     std::lock_guard locker(mds_lock);
3082     C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, this, css.get(), &cond);
3083     flush_journal->send();
3084   }
3085   int r = cond.wait();
3086
3087   f->open_object_section("result");
3088   f->dump_string("message", css->strv());
3089   f->dump_int("return_code", r);
3090   f->close_section();
3091 }
3092
3093 void MDSRank::command_get_subtrees(Formatter *f)
3094 {
3095   ceph_assert(f != NULL);
3096   std::lock_guard l(mds_lock);
3097
3098   std::vector<CDir*> subtrees;
3099   mdcache->get_subtrees(subtrees);
3100
3101   f->open_array_section("subtrees");
3102   for (const auto& dir : subtrees) {
3103     f->open_object_section("subtree");
3104     {
3105       f->dump_bool("is_auth", dir->is_auth());
3106       f->dump_int("auth_first", dir->get_dir_auth().first);
3107       f->dump_int("auth_second", dir->get_dir_auth().second); {
3108         mds_rank_t export_pin = dir->inode->get_export_pin(false);
3109         f->dump_int("export_pin", export_pin >= 0 ? export_pin : -1);
3110         f->dump_bool("distributed_ephemeral_pin", export_pin == MDS_RANK_EPHEMERAL_DIST);
3111         f->dump_bool("random_ephemeral_pin", export_pin == MDS_RANK_EPHEMERAL_RAND);
3112       }
3113       f->dump_int("export_pin_target", dir->get_export_pin(false));
3114       f->open_object_section("dir");
3115       dir->dump(f);
3116       f->close_section();
3117     }
3118     f->close_section();
3119   }
3120   f->close_section();
3121 }
3122
3123
3124 void MDSRank::command_export_dir(Formatter *f,
3125     std::string_view path,
3126     mds_rank_t target)
3127 {
3128   int r = _command_export_dir(path, target);
3129   f->open_object_section("results");
3130   f->dump_int("return_code", r);
3131   f->close_section(); // results
3132 }
3133
3134 int MDSRank::_command_export_dir(
3135     std::string_view path,
3136     mds_rank_t target)
3137 {
3138   std::lock_guard l(mds_lock);
3139   filepath fp(path);
3140
3141   if (target == whoami || !mdsmap->is_up(target) || !mdsmap->is_in(target)) {
3142     derr << "bad MDS target " << target << dendl;
3143     return -CEPHFS_ENOENT;
3144   }
3145
3146   CInode *in = mdcache->cache_traverse(fp);
3147   if (!in) {
3148     derr << "bad path '" << path << "'" << dendl;
3149     return -CEPHFS_ENOENT;
3150   }
3151   CDir *dir = in->get_dirfrag(frag_t());
3152   if (!dir || !(dir->is_auth())) {
3153     derr << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl;
3154     return -CEPHFS_EINVAL;
3155   }
3156
3157   mdcache->migrator->export_dir(dir, target);
3158   return 0;
3159 }
3160
3161 void MDSRank::command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f)
3162 {
3163   std::string root;
3164   int64_t depth;
3165   cmd_getval(cmdmap, "root", root);
3166   if (root.empty()) {
3167     root = "/";
3168   }
3169   if (!cmd_getval(cmdmap, "depth", depth))
3170     depth = -1;
3171   std::lock_guard l(mds_lock);
3172   CInode *in = mdcache->cache_traverse(filepath(root.c_str()));
3173   if (!in) {
3174     ss << "root inode is not in cache";
3175     return;
3176   }
3177   f->open_array_section("inodes");
3178   mdcache->dump_tree(in, 0, depth, f);
3179   f->close_section();
3180 }
3181
3182 CDir *MDSRank::_command_dirfrag_get(
3183     const cmdmap_t &cmdmap,
3184     std::ostream &ss)
3185 {
3186   std::string path;
3187   bool got = cmd_getval(cmdmap, "path", path);
3188   if (!got) {
3189     ss << "missing path argument";
3190     return NULL;
3191   }
3192
3193   std::string frag_str;
3194   if (!cmd_getval(cmdmap, "frag", frag_str)) {
3195     ss << "missing frag argument";
3196     return NULL;
3197   }
3198
3199   CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
3200   if (!in) {
3201     // TODO really we should load something in if it's not in cache,
3202     // but the infrastructure is harder, and we might still be unable
3203     // to act on it if someone else is auth.
3204     ss << "directory '" << path << "' inode not in cache";
3205     return NULL;
3206   }
3207
3208   frag_t fg;
3209
3210   if (!fg.parse(frag_str.c_str())) {
3211     ss << "frag " << frag_str << " failed to parse";
3212     return NULL;
3213   }
3214
3215   CDir *dir = in->get_dirfrag(fg);
3216   if (!dir) {
3217     ss << "frag " << in->ino() << "/" << fg << " not in cache ("
3218           "use `dirfrag ls` to see if it should exist)";
3219     return NULL;
3220   }
3221
3222   if (!dir->is_auth()) {
3223     ss << "frag " << dir->dirfrag() << " not auth (auth = "
3224        << dir->authority() << ")";
3225     return NULL;
3226   }
3227
3228   return dir;
3229 }
3230
3231 bool MDSRank::command_dirfrag_split(
3232     cmdmap_t cmdmap,
3233     std::ostream &ss)
3234 {
3235   std::lock_guard l(mds_lock);
3236   int64_t by = 0;
3237   if (!cmd_getval(cmdmap, "bits", by)) {
3238     ss << "missing bits argument";
3239     return false;
3240   }
3241
3242   if (by <= 0) {
3243     ss << "must split by >0 bits";
3244     return false;
3245   }
3246
3247   CDir *dir = _command_dirfrag_get(cmdmap, ss);
3248   if (!dir) {
3249     return false;
3250   }
3251
3252   mdcache->split_dir(dir, by);
3253
3254   return true;
3255 }
3256
3257 bool MDSRank::command_dirfrag_merge(
3258     cmdmap_t cmdmap,
3259     std::ostream &ss)
3260 {
3261   std::lock_guard l(mds_lock);
3262   std::string path;
3263   bool got = cmd_getval(cmdmap, "path", path);
3264   if (!got) {
3265     ss << "missing path argument";
3266     return false;
3267   }
3268
3269   std::string frag_str;
3270   if (!cmd_getval(cmdmap, "frag", frag_str)) {
3271     ss << "missing frag argument";
3272     return false;
3273   }
3274
3275   CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
3276   if (!in) {
3277     ss << "directory '" << path << "' inode not in cache";
3278     return false;
3279   }
3280
3281   frag_t fg;
3282   if (!fg.parse(frag_str.c_str())) {
3283     ss << "frag " << frag_str << " failed to parse";
3284     return false;
3285   }
3286
3287   mdcache->merge_dir(in, fg);
3288
3289   return true;
3290 }
3291
3292 bool MDSRank::command_dirfrag_ls(
3293     cmdmap_t cmdmap,
3294     std::ostream &ss,
3295     Formatter *f)
3296 {
3297   std::lock_guard l(mds_lock);
3298   std::string path;
3299   bool got = cmd_getval(cmdmap, "path", path);
3300   if (!got) {
3301     ss << "missing path argument";
3302     return false;
3303   }
3304
3305   CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
3306   if (!in) {
3307     ss << "directory inode not in cache";
3308     return false;
3309   }
3310
3311   f->open_array_section("frags");
3312   frag_vec_t leaves;
3313   // NB using get_leaves_under instead of get_dirfrags to give
3314   // you the list of what dirfrags may exist, not which are in cache
3315   in->dirfragtree.get_leaves_under(frag_t(), leaves);
3316   for (const auto& leaf : leaves) {
3317     f->open_object_section("frag");
3318     f->dump_int("value", leaf.value());
3319     f->dump_int("bits", leaf.bits());
3320     CachedStackStringStream css;
3321     *css << std::hex << leaf.value() << "/" << std::dec << leaf.bits();
3322     f->dump_string("str", css->strv());
3323     f->close_section();
3324   }
3325   f->close_section();
3326
3327   return true;
3328 }
3329
3330 void MDSRank::command_openfiles_ls(Formatter *f)
3331 {
3332   std::lock_guard l(mds_lock);
3333   mdcache->dump_openfiles(f);
3334 }
3335
3336 void MDSRank::command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss)
3337 {
3338   std::lock_guard l(mds_lock);
3339   int64_t number;
3340   bool got = cmd_getval(cmdmap, "number", number);
3341   if (!got) {
3342     ss << "missing inode number";
3343     return;
3344   }
3345
3346   bool success = mdcache->dump_inode(f, number);
3347   if (!success) {
3348     ss << "dump inode failed, wrong inode number or the inode is not cached";
3349   }
3350 }
3351
3352 void MDSRank::dump_status(Formatter *f) const
3353 {
3354   f->dump_string("fs_name", std::string(mdsmap->get_fs_name()));
3355   if (state == MDSMap::STATE_REPLAY ||
3356       state == MDSMap::STATE_STANDBY_REPLAY) {
3357     mdlog->dump_replay_status(f);
3358   } else if (state == MDSMap::STATE_RESOLVE) {
3359     mdcache->dump_resolve_status(f);
3360   } else if (state == MDSMap::STATE_RECONNECT) {
3361     server->dump_reconnect_status(f);
3362   } else if (state == MDSMap::STATE_REJOIN) {
3363     mdcache->dump_rejoin_status(f);
3364   } else if (state == MDSMap::STATE_CLIENTREPLAY) {
3365     dump_clientreplay_status(f);
3366   }
3367   f->dump_float("rank_uptime", get_uptime().count());
3368 }
3369
3370 void MDSRank::dump_clientreplay_status(Formatter *f) const
3371 {
3372   f->open_object_section("clientreplay_status");
3373   f->dump_unsigned("clientreplay_queue", replay_queue.size());
3374   f->dump_unsigned("active_replay", mdcache->get_num_client_requests());
3375   f->close_section();
3376 }
3377
3378 void MDSRankDispatcher::update_log_config()
3379 {
3380   auto parsed_options = clog->parse_client_options(g_ceph_context);
3381   dout(10) << __func__ << " log_to_monitors " << parsed_options.log_to_monitors << dendl;
3382 }
3383
3384 void MDSRank::create_logger()
3385 {
3386   dout(10) << "create_logger" << dendl;
3387   {
3388     PerfCountersBuilder mds_plb(g_ceph_context, "mds", l_mds_first, l_mds_last);
3389
3390     // super useful (high prio) perf stats
3391     mds_plb.add_u64_counter(l_mds_request, "request", "Requests", "req",
3392                             PerfCountersBuilder::PRIO_CRITICAL);
3393     mds_plb.add_time_avg(l_mds_reply_latency, "reply_latency", "Reply latency", "rlat",
3394                          PerfCountersBuilder::PRIO_CRITICAL);
3395     mds_plb.add_u64(l_mds_inodes, "inodes", "Inodes", "inos",
3396                     PerfCountersBuilder::PRIO_CRITICAL);
3397     mds_plb.add_u64_counter(l_mds_forward, "forward", "Forwarding request", "fwd",
3398                             PerfCountersBuilder::PRIO_INTERESTING);
3399     mds_plb.add_u64(l_mds_caps, "caps", "Capabilities", "caps",
3400                     PerfCountersBuilder::PRIO_INTERESTING);
3401     mds_plb.add_u64_counter(l_mds_exported_inodes, "exported_inodes", "Exported inodes",
3402                             "exi", PerfCountersBuilder::PRIO_INTERESTING);
3403     mds_plb.add_u64_counter(l_mds_imported_inodes, "imported_inodes", "Imported inodes",
3404                             "imi", PerfCountersBuilder::PRIO_INTERESTING);
3405     mds_plb.add_u64_counter(l_mds_slow_reply, "slow_reply", "Slow replies", "slr",
3406                               PerfCountersBuilder::PRIO_INTERESTING);
3407
3408     // caps msg stats
3409     mds_plb.add_u64_counter(l_mdss_handle_client_caps, "handle_client_caps",
3410                            "Client caps msg", "hcc", PerfCountersBuilder::PRIO_INTERESTING);
3411     mds_plb.add_u64_counter(l_mdss_handle_client_caps_dirty, "handle_client_caps_dirty",
3412                            "Client dirty caps msg", "hccd", PerfCountersBuilder::PRIO_INTERESTING);
3413     mds_plb.add_u64_counter(l_mdss_handle_client_cap_release, "handle_client_cap_release",
3414                            "Client cap release msg", "hccr", PerfCountersBuilder::PRIO_INTERESTING);
3415     mds_plb.add_u64_counter(l_mdss_process_request_cap_release, "process_request_cap_release",
3416                            "Process request cap release", "prcr", PerfCountersBuilder::PRIO_INTERESTING);
3417     mds_plb.add_u64_counter(l_mdss_ceph_cap_op_revoke, "ceph_cap_op_revoke",
3418                            "Revoke caps", "crev", PerfCountersBuilder::PRIO_INTERESTING);
3419     mds_plb.add_u64_counter(l_mdss_ceph_cap_op_grant, "ceph_cap_op_grant",
3420                            "Grant caps", "cgra", PerfCountersBuilder::PRIO_INTERESTING);
3421     mds_plb.add_u64_counter(l_mdss_ceph_cap_op_trunc, "ceph_cap_op_trunc",
3422                            "caps truncate notify", "ctru", PerfCountersBuilder::PRIO_INTERESTING);
3423     mds_plb.add_u64_counter(l_mdss_ceph_cap_op_flushsnap_ack, "ceph_cap_op_flushsnap_ack",
3424                            "caps truncate notify", "cfsa", PerfCountersBuilder::PRIO_INTERESTING);
3425     mds_plb.add_u64_counter(l_mdss_ceph_cap_op_flush_ack, "ceph_cap_op_flush_ack",
3426                            "caps truncate notify", "cfa", PerfCountersBuilder::PRIO_INTERESTING);
3427     mds_plb.add_u64_counter(l_mdss_handle_inode_file_caps, "handle_inode_file_caps",
3428                            "Inter mds caps msg", "hifc", PerfCountersBuilder::PRIO_INTERESTING);
3429
3430     // useful dir/inode/subtree stats
3431     mds_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3432     mds_plb.add_u64(l_mds_root_rfiles, "root_rfiles", "root inode rfiles");
3433     mds_plb.add_u64(l_mds_root_rbytes, "root_rbytes", "root inode rbytes");
3434     mds_plb.add_u64(l_mds_root_rsnaps, "root_rsnaps", "root inode rsnaps");
3435     mds_plb.add_u64_counter(l_mds_dir_fetch_complete,
3436                             "dir_fetch_complete", "Fetch complete dirfrag");
3437     mds_plb.add_u64_counter(l_mds_dir_fetch_keys,
3438                             "dir_fetch_keys", "Fetch keys from dirfrag");
3439     mds_plb.add_u64_counter(l_mds_dir_commit, "dir_commit", "Directory commit");
3440     mds_plb.add_u64_counter(l_mds_dir_split, "dir_split", "Directory split");
3441     mds_plb.add_u64_counter(l_mds_dir_merge, "dir_merge", "Directory merge");
3442     mds_plb.add_u64(l_mds_inodes_pinned, "inodes_pinned", "Inodes pinned");
3443     mds_plb.add_u64(l_mds_inodes_expired, "inodes_expired", "Inodes expired");
3444     mds_plb.add_u64(l_mds_inodes_with_caps, "inodes_with_caps",
3445                     "Inodes with capabilities");
3446     mds_plb.add_u64(l_mds_subtrees, "subtrees", "Subtrees");
3447     mds_plb.add_u64(l_mds_load_cent, "load_cent", "Load per cent");
3448     mds_plb.add_u64_counter(l_mds_openino_dir_fetch, "openino_dir_fetch",
3449                             "OpenIno incomplete directory fetchings");
3450
3451     // low prio stats
3452     mds_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3453     mds_plb.add_u64_counter(l_mds_reply, "reply", "Replies");
3454     mds_plb.add_u64(l_mds_inodes_top, "inodes_top", "Inodes on top");
3455     mds_plb.add_u64(l_mds_inodes_bottom, "inodes_bottom", "Inodes on bottom");
3456     mds_plb.add_u64(
3457       l_mds_inodes_pin_tail, "inodes_pin_tail", "Inodes on pin tail");
3458     mds_plb.add_u64_counter(l_mds_traverse, "traverse", "Traverses");
3459     mds_plb.add_u64_counter(l_mds_traverse_hit, "traverse_hit", "Traverse hits");
3460     mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward",
3461                             "Traverse forwards");
3462     mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover",
3463                             "Traverse directory discovers");
3464     mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch",
3465                             "Traverse incomplete directory content fetchings");
3466     mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino",
3467                             "Traverse remote dentries");
3468     mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock",
3469                             "Traverse locks");
3470     mds_plb.add_u64(l_mds_dispatch_queue_len, "q", "Dispatch queue length");
3471     mds_plb.add_u64_counter(l_mds_exported, "exported", "Exports");
3472     mds_plb.add_u64_counter(l_mds_imported, "imported", "Imports");
3473     mds_plb.add_u64_counter(l_mds_openino_backtrace_fetch, "openino_backtrace_fetch",
3474                             "OpenIno backtrace fetchings");
3475     mds_plb.add_u64_counter(l_mds_openino_peer_discover, "openino_peer_discover",
3476                             "OpenIno peer inode discovers");
3477
3478     // scrub stats
3479     mds_plb.add_u64(l_mds_scrub_backtrace_fetch, "scrub_backtrace_fetch",
3480                     "Scrub backtrace fetchings");
3481     mds_plb.add_u64(l_mds_scrub_set_tag, "scrub_set_tag",
3482                     "Scrub set tags");
3483     mds_plb.add_u64(l_mds_scrub_backtrace_repaired, "scrub_backtrace_repaired",
3484                     "Scrub backtraces repaired");
3485     mds_plb.add_u64(l_mds_scrub_inotable_repaired, "scrub_inotable_repaired",
3486                     "Scrub inotable repaired");
3487     mds_plb.add_u64(l_mds_scrub_dir_inodes, "scrub_dir_inodes",
3488                     "Scrub directory inodes");
3489     mds_plb.add_u64(l_mds_scrub_dir_base_inodes, "scrub_dir_base_inodes",
3490                     "Scrub directory base inodes");
3491     mds_plb.add_u64(l_mds_scrub_dirfrag_rstats, "scrub_dirfrag_rstats",
3492                     "Scrub dirfrags rstates");
3493     mds_plb.add_u64(l_mds_scrub_file_inodes, "scrub_file_inodes",
3494                     "Scrub file inodes");
3495
3496     logger = mds_plb.create_perf_counters();
3497     g_ceph_context->get_perfcounters_collection()->add(logger);
3498   }
3499
3500   {
3501     PerfCountersBuilder mdm_plb(g_ceph_context, "mds_mem", l_mdm_first, l_mdm_last);
3502     mdm_plb.add_u64(l_mdm_ino, "ino", "Inodes", "ino",
3503                     PerfCountersBuilder::PRIO_INTERESTING);
3504     mdm_plb.add_u64(l_mdm_dn, "dn", "Dentries", "dn",
3505                     PerfCountersBuilder::PRIO_INTERESTING);
3506
3507     mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3508     mdm_plb.add_u64_counter(l_mdm_inoa, "ino+", "Inodes opened");
3509     mdm_plb.add_u64_counter(l_mdm_inos, "ino-", "Inodes closed");
3510     mdm_plb.add_u64(l_mdm_dir, "dir", "Directories");
3511     mdm_plb.add_u64_counter(l_mdm_dira, "dir+", "Directories opened");
3512     mdm_plb.add_u64_counter(l_mdm_dirs, "dir-", "Directories closed");
3513     mdm_plb.add_u64_counter(l_mdm_dna, "dn+", "Dentries opened");
3514     mdm_plb.add_u64_counter(l_mdm_dns, "dn-", "Dentries closed");
3515     mdm_plb.add_u64(l_mdm_cap, "cap", "Capabilities");
3516     mdm_plb.add_u64_counter(l_mdm_capa, "cap+", "Capabilities added");
3517     mdm_plb.add_u64_counter(l_mdm_caps, "cap-", "Capabilities removed");
3518     mdm_plb.add_u64(l_mdm_heap, "heap", "Heap size");
3519
3520     mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3521     mdm_plb.add_u64(l_mdm_rss, "rss", "RSS");
3522
3523     mlogger = mdm_plb.create_perf_counters();
3524     g_ceph_context->get_perfcounters_collection()->add(mlogger);
3525   }
3526
3527   mdlog->create_logger();
3528   server->create_logger();
3529   purge_queue.create_logger();
3530   sessionmap.register_perfcounters();
3531   mdcache->register_perfcounters();
3532 }
3533
3534 void MDSRank::check_ops_in_flight()
3535 {
3536   string summary;
3537   vector<string> warnings;
3538   int slow = 0;
3539   if (op_tracker.check_ops_in_flight(&summary, warnings, &slow)) {
3540     clog->warn() << summary;
3541     for (const auto& warning : warnings) {
3542       clog->warn() << warning;
3543     }
3544   }
3545
3546   // set mds slow request count
3547   mds_slow_req_count = slow;
3548   return;
3549 }
3550
3551 void MDSRankDispatcher::handle_osd_map()
3552 {
3553   if (is_active() &&
3554       mdsmap->get_tableserver() == whoami) {
3555     snapserver->check_osd_map(true);
3556   }
3557
3558   server->handle_osd_map();
3559
3560   purge_queue.update_op_limit(*mdsmap);
3561
3562   // it's ok if replay state is reached via standby-replay, the
3563   // reconnect state will journal blocklisted clients (journal
3564   // is opened for writing in `replay_done` before moving to
3565   // up:resolve).
3566   if (!is_any_replay()) {
3567     std::set<entity_addr_t> newly_blocklisted;
3568     objecter->consume_blocklist_events(&newly_blocklisted);
3569     auto epoch = objecter->with_osdmap([](const OSDMap &o){return o.get_epoch();});
3570     apply_blocklist(newly_blocklisted, epoch);
3571   }
3572
3573   // By default the objecter only requests OSDMap updates on use,
3574   // we would like to always receive the latest maps in order to
3575   // apply policy based on the FULL flag.
3576   objecter->maybe_request_map();
3577 }
3578
3579 int MDSRank::config_client(int64_t session_id, bool remove,
3580                            const std::string& option, const std::string& value,
3581                            std::ostream& ss)
3582 {
3583   Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
3584   if (!session) {
3585     ss << "session " << session_id << " not in sessionmap!";
3586     return -CEPHFS_ENOENT;
3587   }
3588
3589   if (option == "timeout") {
3590     if (remove) {
3591       auto it = session->info.client_metadata.find("timeout");
3592       if (it == session->info.client_metadata.end()) {
3593         ss << "Nonexistent config: " << option;
3594         return -CEPHFS_ENODATA;
3595       }
3596       session->info.client_metadata.erase(it);
3597     } else {
3598       char *end;
3599       strtoul(value.c_str(), &end, 0);
3600       if (*end) {
3601         ss << "Invalid config for timeout: " << value;
3602         return -CEPHFS_EINVAL;
3603       }
3604       session->info.client_metadata[option] = value;
3605     }
3606     //sessionmap._mark_dirty(session, true);
3607   } else {
3608     ss << "Invalid config option: " << option;
3609     return -CEPHFS_EINVAL;
3610   }
3611
3612   return 0;
3613 }
3614
3615 bool MDSRank::evict_client(int64_t session_id,
3616     bool wait, bool blocklist, std::ostream& err_ss,
3617     Context *on_killed)
3618 {
3619   ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
3620
3621   // Mutually exclusive args
3622   ceph_assert(!(wait && on_killed != nullptr));
3623
3624   if (is_any_replay()) {
3625     err_ss << "MDS is replaying log";
3626     return false;
3627   }
3628
3629   Session *session = sessionmap.get_session(
3630       entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
3631   if (!session) {
3632     err_ss << "session " << session_id << " not in sessionmap!";
3633     return false;
3634   }
3635
3636   auto& addr = session->info.inst.addr;
3637   {
3638     CachedStackStringStream css;
3639     *css << "Evicting " << (blocklist ? "(and blocklisting) " : "")
3640          << "client session " << session_id << " (" << addr << ")";
3641     dout(1) << css->strv() << dendl;
3642     clog->info() << css->strv();
3643   }
3644
3645   dout(4) << "Preparing blocklist command... (wait=" << wait << ")" << dendl;
3646   CachedStackStringStream css;
3647   *css << "{\"prefix\":\"osd blocklist\", \"blocklistop\":\"add\",";
3648   *css << "\"addr\":\"";
3649   *css << addr;
3650   *css << "\"}";
3651   std::vector<std::string> cmd = {css->str()};
3652
3653   auto kill_client_session = [this, session_id, wait, on_killed](){
3654     ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
3655     Session *session = sessionmap.get_session(
3656         entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
3657     if (session) {
3658       if (on_killed || !wait) {
3659         server->kill_session(session, on_killed);
3660       } else {
3661         C_SaferCond on_safe;
3662         server->kill_session(session, &on_safe);
3663
3664         mds_lock.unlock();
3665         on_safe.wait();
3666         mds_lock.lock();
3667       }
3668     } else {
3669       dout(1) << "session " << session_id << " was removed while we waited "
3670       "for blocklist" << dendl;
3671
3672       // Even though it wasn't us that removed it, kick our completion
3673       // as the session has been removed.
3674       if (on_killed) {
3675         on_killed->complete(0);
3676       }
3677     }
3678   };
3679
3680   auto apply_blocklist = [this, cmd](std::function<void ()> fn){
3681     ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
3682
3683     Context *on_blocklist_done = new LambdaContext([this, fn](int r) {
3684       objecter->wait_for_latest_osdmap(
3685       lambdafy((new C_OnFinisher(
3686          new LambdaContext([this, fn](int r) {
3687               std::lock_guard l(mds_lock);
3688               auto epoch = objecter->with_osdmap([](const OSDMap &o){
3689                   return o.get_epoch();
3690               });
3691
3692               set_osd_epoch_barrier(epoch);
3693
3694               fn();
3695             }), finisher)
3696       )));
3697     });
3698
3699     dout(4) << "Sending mon blocklist command: " << cmd[0] << dendl;
3700     monc->start_mon_command(cmd, {}, nullptr, nullptr, on_blocklist_done);
3701   };
3702
3703   if (wait) {
3704     if (blocklist) {
3705       C_SaferCond inline_ctx;
3706       apply_blocklist([&inline_ctx](){inline_ctx.complete(0);});
3707       mds_lock.unlock();
3708       inline_ctx.wait();
3709       mds_lock.lock();
3710     }
3711
3712     // We dropped mds_lock, so check that session still exists
3713     session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
3714           session_id));
3715     if (!session) {
3716       dout(1) << "session " << session_id << " was removed while we waited "
3717                  "for blocklist" << dendl;
3718       client_eviction_dump = true;
3719       return true;
3720     }
3721     kill_client_session();
3722   } else {
3723     if (blocklist) {
3724       apply_blocklist(kill_client_session);
3725     } else {
3726       kill_client_session();
3727     }
3728   }
3729
3730   client_eviction_dump = true;
3731   return true;
3732 }
3733
3734 MDSRankDispatcher::MDSRankDispatcher(
3735     mds_rank_t whoami_,
3736     ceph::fair_mutex &mds_lock_,
3737     LogChannelRef &clog_,
3738     CommonSafeTimer<ceph::fair_mutex> &timer_,
3739     Beacon &beacon_,
3740     std::unique_ptr<MDSMap> &mdsmap_,
3741     Messenger *msgr,
3742     MonClient *monc_,
3743     MgrClient *mgrc,
3744     Context *respawn_hook_,
3745     Context *suicide_hook_,
3746     boost::asio::io_context& ioc)
3747   : MDSRank(whoami_, mds_lock_, clog_, timer_, beacon_, mdsmap_,
3748             msgr, monc_, mgrc, respawn_hook_, suicide_hook_, ioc)
3749 {
3750     g_conf().add_observer(this);
3751 }
3752
3753 void MDSRank::command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish) {
3754   dout(20) << __func__ << dendl;
3755
3756   std::lock_guard locker(mds_lock);
3757   C_Drop_Cache *request = new C_Drop_Cache(server, mdcache, mdlog, this,
3758                                            timeout, f, on_finish);
3759   request->send();
3760 }
3761
3762 epoch_t MDSRank::get_osd_epoch() const
3763 {
3764   return objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
3765 }
3766
3767 const char** MDSRankDispatcher::get_tracked_conf_keys() const
3768 {
3769   static const char* KEYS[] = {
3770     "clog_to_graylog",
3771     "clog_to_graylog_host",
3772     "clog_to_graylog_port",
3773     "clog_to_monitors",
3774     "clog_to_syslog",
3775     "clog_to_syslog_facility",
3776     "clog_to_syslog_level",
3777     "fsid",
3778     "host",
3779     "mds_bal_fragment_dirs",
3780     "mds_bal_fragment_interval",
3781     "mds_bal_fragment_size_max",
3782     "mds_cache_memory_limit",
3783     "mds_cache_mid",
3784     "mds_cache_reservation",
3785     "mds_cache_trim_decay_rate",
3786     "mds_cap_revoke_eviction_timeout",
3787     "mds_dump_cache_threshold_file",
3788     "mds_dump_cache_threshold_formatter",
3789     "mds_enable_op_tracker",
3790     "mds_export_ephemeral_random",
3791     "mds_export_ephemeral_random_max",
3792     "mds_export_ephemeral_distributed",
3793     "mds_health_cache_threshold",
3794     "mds_inject_migrator_session_race",
3795     "mds_log_pause",
3796     "mds_max_export_size",
3797     "mds_max_purge_files",
3798     "mds_forward_all_requests_to_auth",
3799     "mds_max_purge_ops",
3800     "mds_max_purge_ops_per_pg",
3801     "mds_max_snaps_per_dir",
3802     "mds_op_complaint_time",
3803     "mds_op_history_duration",
3804     "mds_op_history_size",
3805     "mds_op_log_threshold",
3806     "mds_recall_max_decay_rate",
3807     "mds_recall_warning_decay_rate",
3808     "mds_request_load_average_decay_rate",
3809     "mds_session_cache_liveness_decay_rate",
3810     "mds_heartbeat_reset_grace",
3811     "mds_heartbeat_grace",
3812     "mds_session_cap_acquisition_decay_rate",
3813     "mds_max_caps_per_client",
3814     "mds_session_cap_acquisition_throttle",
3815     "mds_session_max_caps_throttle_ratio",
3816     "mds_cap_acquisition_throttle_retry_request_time",
3817     "mds_alternate_name_max",
3818     "mds_dir_max_entries",
3819     "mds_symlink_recovery",
3820     "mds_extraordinary_events_dump_interval",
3821     "mds_inject_rename_corrupt_dentry_first",
3822     "mds_inject_journal_corrupt_dentry_first",
3823     "mds_session_metadata_threshold",
3824     NULL
3825   };
3826   return KEYS;
3827 }
3828
3829 void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed)
3830 {
3831   // XXX with or without mds_lock!
3832
3833   if (changed.count("mds_heartbeat_reset_grace")) {
3834     _heartbeat_reset_grace = conf.get_val<uint64_t>("mds_heartbeat_reset_grace");
3835   }
3836   if (changed.count("mds_heartbeat_grace")) {
3837     heartbeat_grace = conf.get_val<double>("mds_heartbeat_grace");
3838   }
3839   if (changed.count("mds_op_complaint_time") || changed.count("mds_op_log_threshold")) {
3840     op_tracker.set_complaint_and_threshold(conf->mds_op_complaint_time, conf->mds_op_log_threshold);
3841   }
3842   if (changed.count("mds_op_history_size") || changed.count("mds_op_history_duration")) {
3843     op_tracker.set_history_size_and_duration(conf->mds_op_history_size, conf->mds_op_history_duration);
3844   }
3845   if (changed.count("mds_enable_op_tracker")) {
3846     op_tracker.set_tracking(conf->mds_enable_op_tracker);
3847   }
3848   if (changed.count("mds_extraordinary_events_dump_interval")) {
3849     reset_event_flags();
3850     extraordinary_events_dump_interval = conf.get_val<std::chrono::seconds>
3851       ("mds_extraordinary_events_dump_interval").count();
3852
3853     //Enable the logging only during low level debugging
3854     if (extraordinary_events_dump_interval) {
3855       uint64_t log_level, gather_level;
3856       std::string debug_mds = g_conf().get_val<std::string>("debug_mds");
3857       auto delim = debug_mds.find("/");
3858       std::istringstream(debug_mds.substr(0, delim)) >> log_level;
3859       std::istringstream(debug_mds.substr(delim + 1)) >> gather_level;
3860
3861       if (log_level < 10 && gather_level >= 10) {
3862         dout(0) << __func__ << " Enabling in-memory log dump..." << dendl;
3863         std::scoped_lock lock(mds_lock);
3864         schedule_inmemory_logger();
3865       }
3866       else {
3867         dout(0) << __func__ << " Enabling in-memory log dump failed. debug_mds=" << log_level
3868                 << "/" << gather_level << dendl;
3869         extraordinary_events_dump_interval = 0;
3870       }
3871     }
3872     else {
3873       //The user set mds_extraordinary_events_dump_interval = 0
3874       dout(0) << __func__ << " In-memory log dump disabled" << dendl;
3875     }
3876   }
3877   if (changed.count("clog_to_monitors") ||
3878       changed.count("clog_to_syslog") ||
3879       changed.count("clog_to_syslog_level") ||
3880       changed.count("clog_to_syslog_facility") ||
3881       changed.count("clog_to_graylog") ||
3882       changed.count("clog_to_graylog_host") ||
3883       changed.count("clog_to_graylog_port") ||
3884       changed.count("host") ||
3885       changed.count("fsid")) {
3886     update_log_config();
3887   }
3888   if (changed.count("mds_inject_journal_corrupt_dentry_first")) {
3889     inject_journal_corrupt_dentry_first = g_conf().get_val<double>("mds_inject_journal_corrupt_dentry_first");
3890   }
3891
3892   finisher->queue(new LambdaContext([this, changed](int) {
3893     std::scoped_lock lock(mds_lock);
3894
3895     dout(10) << "flushing conf change to components: " << changed << dendl;
3896
3897     if (changed.count("mds_log_pause") && !g_conf()->mds_log_pause) {
3898       mdlog->kick_submitter();
3899     }
3900     sessionmap.handle_conf_change(changed);
3901     server->handle_conf_change(changed);
3902     mdcache->handle_conf_change(changed, *mdsmap);
3903     purge_queue.handle_conf_change(changed, *mdsmap);
3904   }));
3905 }
3906
3907 void MDSRank::get_task_status(std::map<std::string, std::string> *status) {
3908   dout(20) << __func__ << dendl;
3909
3910   // scrub summary for now..
3911   std::string_view scrub_summary = scrubstack->scrub_summary();
3912   if (!ScrubStack::is_idle(scrub_summary)) {
3913     send_status = true;
3914     status->emplace(SCRUB_STATUS_KEY, std::move(scrub_summary));
3915   }
3916 }
3917
3918 void MDSRank::schedule_update_timer_task() {
3919   dout(20) << __func__ << dendl;
3920
3921   timer.add_event_after(g_conf().get_val<double>("mds_task_status_update_interval"),
3922                         new LambdaContext([this](int) {
3923                             send_task_status();
3924                           }));
3925 }
3926
3927 void MDSRank::send_task_status() {
3928   std::map<std::string, std::string> status;
3929   get_task_status(&status);
3930
3931   if (send_status) {
3932     if (status.empty()) {
3933       send_status = false;
3934     }
3935
3936     dout(20) << __func__ << ": updating " << status.size() << " status keys" << dendl;
3937     int r = mgrc->service_daemon_update_task_status(std::move(status));
3938     if (r < 0) {
3939       derr << ": failed to update service daemon status: " << cpp_strerror(r) << dendl;
3940     }
3941
3942   }
3943
3944   schedule_update_timer_task();
3945 }
3946
3947 void MDSRank::schedule_inmemory_logger() {
3948   dout(20) << __func__ << dendl;
3949   timer.add_event_after(extraordinary_events_dump_interval,
3950                         new LambdaContext([this]() {
3951                           inmemory_logger();
3952                         }));
3953 }
3954
3955 void MDSRank::inmemory_logger() {
3956   if (client_eviction_dump ||
3957       beacon.missed_beacon_ack_dump ||
3958       beacon.missed_internal_heartbeat_dump) {
3959     //dump the in-memory logs if any of these events occured recently
3960     dout(0) << __func__ << " client_eviction_dump "<< client_eviction_dump
3961             << ", missed_beacon_ack_dump " << beacon.missed_beacon_ack_dump
3962             << ", missed_internal_heartbeat_dump " << beacon.missed_internal_heartbeat_dump
3963             << dendl;
3964     reset_event_flags();
3965     g_ceph_context->_log->dump_recent();
3966   }
3967
3968   //reschedule if it's enabled
3969   if (extraordinary_events_dump_interval) {
3970     schedule_inmemory_logger();
3971   }
3972 }
3973
3974 void MDSRank::reset_event_flags() {
3975   client_eviction_dump = false;
3976   beacon.missed_beacon_ack_dump = false;
3977   beacon.missed_internal_heartbeat_dump = false;
3978 }