ceph/src/mds/MDSRank.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2015 Red Hat
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #include <string_view>
  16
  17 #include "common/debug.h"
  18 #include "common/errno.h"
  19
  20 #include "messages/MClientRequestForward.h"
  21 #include "messages/MMDSLoadTargets.h"
  22 #include "messages/MMDSTableRequest.h"
  23
  24 #include "mgr/MgrClient.h"
  25
  26 #include "MDSDaemon.h"
  27 #include "MDSMap.h"
  28 #include "SnapClient.h"
  29 #include "SnapServer.h"
  30 #include "MDBalancer.h"
  31 #include "Migrator.h"
  32 #include "Locker.h"
  33 #include "InoTable.h"
  34 #include "mon/MonClient.h"
  35 #include "common/HeartbeatMap.h"
  36 #include "ScrubStack.h"
  37
  38
  39 #include "MDSRank.h"
  40
  41 #define dout_context g_ceph_context
  42 #define dout_subsys ceph_subsys_mds
  43 #undef dout_prefix
  44 #define dout_prefix *_dout << "mds." << whoami << '.' << incarnation << ' '
  45 using TOPNSPC::common::cmd_getval;
  46 class C_Flush_Journal : public MDSInternalContext {
  47 public:
  48   C_Flush_Journal(MDCache *mdcache, MDLog *mdlog, MDSRank *mds,
  49                   std::ostream *ss, Context *on_finish)
  50     : MDSInternalContext(mds),
  51       mdcache(mdcache), mdlog(mdlog), ss(ss), on_finish(on_finish),
  52       whoami(mds->whoami), incarnation(mds->incarnation) {
  53   }
  54
  55   void send() {
  56     assert(ceph_mutex_is_locked(mds->mds_lock));
  57
  58     dout(20) << __func__ << dendl;
  59
  60     if (mdcache->is_readonly()) {
  61       dout(5) << __func__ << ": read-only FS" << dendl;
  62       complete(-EROFS);
  63       return;
  64     }
  65
  66     if (!mds->is_active()) {
  67       dout(5) << __func__ << ": MDS not active, no-op" << dendl;
  68       complete(0);
  69       return;
  70     }
  71
  72     flush_mdlog();
  73   }
  74
  75 private:
  76
  77   void flush_mdlog() {
  78     dout(20) << __func__ << dendl;
  79
  80     // I need to seal off the current segment, and then mark all
  81     // previous segments for expiry
  82     mdlog->start_new_segment();
  83
  84     Context *ctx = new LambdaContext([this](int r) {
  85         handle_flush_mdlog(r);
  86       });
  87
  88     // Flush initially so that all the segments older than our new one
  89     // will be elegible for expiry
  90     mdlog->flush();
  91     mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
  92   }
  93
  94   void handle_flush_mdlog(int r) {
  95     dout(20) << __func__ << ": r=" << r << dendl;
  96
  97     if (r != 0) {
  98       *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
  99       complete(r);
 100       return;
 101     }
 102
 103     clear_mdlog();
 104   }
 105
 106   void clear_mdlog() {
 107     dout(20) << __func__ << dendl;
 108
 109     Context *ctx = new LambdaContext([this](int r) {
 110         handle_clear_mdlog(r);
 111       });
 112
 113     // Because we may not be the last wait_for_safe context on MDLog,
 114     // and subsequent contexts might wake up in the middle of our
 115     // later trim_all and interfere with expiry (by e.g. marking
 116     // dirs/dentries dirty on previous log segments), we run a second
 117     // wait_for_safe here. See #10368
 118     mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
 119   }
 120
 121   void handle_clear_mdlog(int r) {
 122     dout(20) << __func__ << ": r=" << r << dendl;
 123
 124     if (r != 0) {
 125       *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
 126       complete(r);
 127       return;
 128     }
 129
 130     trim_mdlog();
 131   }
 132
 133   void trim_mdlog() {
 134     // Put all the old log segments into expiring or expired state
 135     dout(5) << __func__ << ": beginning segment expiry" << dendl;
 136
 137     int ret = mdlog->trim_all();
 138     if (ret != 0) {
 139       *ss << "Error " << ret << " (" << cpp_strerror(ret) << ") while trimming log";
 140       complete(ret);
 141       return;
 142     }
 143
 144     expire_segments();
 145   }
 146
 147   void expire_segments() {
 148     dout(20) << __func__ << dendl;
 149
 150     // Attach contexts to wait for all expiring segments to expire
 151     MDSGatherBuilder *expiry_gather = new MDSGatherBuilder(g_ceph_context);
 152
 153     const auto &expiring_segments = mdlog->get_expiring_segments();
 154     for (auto p : expiring_segments) {
 155       p->wait_for_expiry(expiry_gather->new_sub());
 156     }
 157     dout(5) << __func__ << ": waiting for " << expiry_gather->num_subs_created()
 158             << " segments to expire" << dendl;
 159
 160     if (!expiry_gather->has_subs()) {
 161       trim_segments();
 162       delete expiry_gather;
 163       return;
 164     }
 165
 166     Context *ctx = new LambdaContext([this](int r) {
 167         handle_expire_segments(r);
 168       });
 169     expiry_gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
 170     expiry_gather->activate();
 171   }
 172
 173   void handle_expire_segments(int r) {
 174     dout(20) << __func__ << ": r=" << r << dendl;
 175
 176     ceph_assert(r == 0); // MDLog is not allowed to raise errors via
 177                          // wait_for_expiry
 178     trim_segments();
 179   }
 180
 181   void trim_segments() {
 182     dout(20) << __func__ << dendl;
 183
 184     Context *ctx = new C_OnFinisher(new LambdaContext([this](int) {
 185           std::lock_guard locker(mds->mds_lock);
 186           trim_expired_segments();
 187         }), mds->finisher);
 188     ctx->complete(0);
 189   }
 190
 191   void trim_expired_segments() {
 192     dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now "
 193             << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
 194             << mdlog->get_journaler()->get_trimmed_pos() << dendl;
 195
 196     // Now everyone I'm interested in is expired
 197     mdlog->trim_expired_segments();
 198
 199     dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now "
 200             << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
 201             << mdlog->get_journaler()->get_trimmed_pos() << dendl;
 202
 203     write_journal_head();
 204   }
 205
 206   void write_journal_head() {
 207     dout(20) << __func__ << dendl;
 208
 209     Context *ctx = new LambdaContext([this](int r) {
 210         std::lock_guard locker(mds->mds_lock);
 211         handle_write_head(r);
 212       });
 213     // Flush the journal header so that readers will start from after
 214     // the flushed region
 215     mdlog->get_journaler()->write_head(ctx);
 216   }
 217
 218   void handle_write_head(int r) {
 219     if (r != 0) {
 220       *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
 221     } else {
 222       dout(5) << __func__ << ": write_head complete, all done!" << dendl;
 223     }
 224
 225     complete(r);
 226   }
 227
 228   void finish(int r) override {
 229     dout(20) << __func__ << ": r=" << r << dendl;
 230     on_finish->complete(r);
 231   }
 232
 233   MDCache *mdcache;
 234   MDLog *mdlog;
 235   std::ostream *ss;
 236   Context *on_finish;
 237
 238   // so as to use dout
 239   mds_rank_t whoami;
 240   int incarnation;
 241 };
 242
 243 class C_Drop_Cache : public MDSInternalContext {
 244 public:
 245   C_Drop_Cache(Server *server, MDCache *mdcache, MDLog *mdlog,
 246                MDSRank *mds, uint64_t recall_timeout,
 247                Formatter *f, Context *on_finish)
 248     : MDSInternalContext(mds),
 249       server(server), mdcache(mdcache), mdlog(mdlog),
 250       recall_timeout(recall_timeout), recall_start(mono_clock::now()),
 251       f(f), on_finish(on_finish),
 252       whoami(mds->whoami), incarnation(mds->incarnation) {
 253   }
 254
 255   void send() {
 256     // not really a hard requirement here, but lets ensure this in
 257     // case we change the logic here.
 258     assert(ceph_mutex_is_locked(mds->mds_lock));
 259
 260     dout(20) << __func__ << dendl;
 261     f->open_object_section("result");
 262     recall_client_state();
 263   }
 264
 265 private:
 266   // context which completes itself (with -ETIMEDOUT) after a specified
 267   // timeout or when explicitly completed, whichever comes first. Note
 268   // that the context does not detroy itself after completion -- it
 269   // needs to be explicitly freed.
 270   class C_ContextTimeout : public MDSInternalContext {
 271   public:
 272     C_ContextTimeout(MDSRank *mds, uint64_t timeout, Context *on_finish)
 273       : MDSInternalContext(mds),
 274         timeout(timeout),
 275         on_finish(on_finish) {
 276     }
 277     ~C_ContextTimeout() {
 278       ceph_assert(timer_task == nullptr);
 279     }
 280
 281     void start_timer() {
 282       if (!timeout) {
 283         return;
 284       }
 285
 286       timer_task = new LambdaContext([this](int) {
 287           timer_task = nullptr;
 288           complete(-ETIMEDOUT);
 289         });
 290       mds->timer.add_event_after(timeout, timer_task);
 291     }
 292
 293     void finish(int r) override {
 294       Context *ctx = nullptr;
 295       {
 296         std::lock_guard locker(lock);
 297         std::swap(on_finish, ctx);
 298       }
 299       if (ctx != nullptr) {
 300         ctx->complete(r);
 301       }
 302     }
 303     void complete(int r) override {
 304       if (timer_task != nullptr) {
 305         mds->timer.cancel_event(timer_task);
 306       }
 307
 308       finish(r);
 309     }
 310
 311     uint64_t timeout;
 312     ceph::mutex lock = ceph::make_mutex("mds::context::timeout");
 313     Context *on_finish = nullptr;
 314     Context *timer_task = nullptr;
 315   };
 316
 317   auto do_trim() {
 318     auto [throttled, count] = mdcache->trim(UINT64_MAX);
 319     dout(10) << __func__
 320              << (throttled ? " (throttled)" : "")
 321              << " trimmed " << count << " caps" << dendl;
 322     dentries_trimmed += count;
 323     return std::make_pair(throttled, count);
 324   }
 325
 326   void recall_client_state() {
 327     dout(20) << __func__ << dendl;
 328     auto now = mono_clock::now();
 329     auto duration = std::chrono::duration<double>(now-recall_start).count();
 330
 331     MDSGatherBuilder *gather = new MDSGatherBuilder(g_ceph_context);
 332     auto flags = Server::RecallFlags::STEADY|Server::RecallFlags::TRIM;
 333     auto [throttled, count] = server->recall_client_state(gather, flags);
 334     dout(10) << __func__
 335              << (throttled ? " (throttled)" : "")
 336              << " recalled " << count << " caps" << dendl;
 337
 338     caps_recalled += count;
 339     if ((throttled || count > 0) && (recall_timeout == 0 || duration < recall_timeout)) {
 340       C_ContextTimeout *ctx = new C_ContextTimeout(
 341         mds, 1, new LambdaContext([this](int r) {
 342           recall_client_state();
 343       }));
 344       ctx->start_timer();
 345       gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
 346       gather->activate();
 347       mdlog->flush(); /* use down-time to incrementally flush log */
 348       do_trim(); /* use down-time to incrementally trim cache */
 349     } else {
 350       if (!gather->has_subs()) {
 351         delete gather;
 352         return handle_recall_client_state(0);
 353       } else if (recall_timeout > 0 && duration > recall_timeout) {
 354         gather->set_finisher(new C_MDSInternalNoop);
 355         gather->activate();
 356         return handle_recall_client_state(-ETIMEDOUT);
 357       } else {
 358         uint64_t remaining = (recall_timeout == 0 ? 0 : recall_timeout-duration);
 359         C_ContextTimeout *ctx = new C_ContextTimeout(
 360           mds, remaining, new LambdaContext([this](int r) {
 361               handle_recall_client_state(r);
 362             }));
 363
 364         ctx->start_timer();
 365         gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
 366         gather->activate();
 367       }
 368     }
 369   }
 370
 371   void handle_recall_client_state(int r) {
 372     dout(20) << __func__ << ": r=" << r << dendl;
 373
 374     // client recall section
 375     f->open_object_section("client_recall");
 376     f->dump_int("return_code", r);
 377     f->dump_string("message", cpp_strerror(r));
 378     f->dump_int("recalled", caps_recalled);
 379     f->close_section();
 380
 381     // we can still continue after recall timeout
 382     flush_journal();
 383   }
 384
 385   void flush_journal() {
 386     dout(20) << __func__ << dendl;
 387
 388     Context *ctx = new LambdaContext([this](int r) {
 389         handle_flush_journal(r);
 390       });
 391
 392     C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, mds, &ss, ctx);
 393     flush_journal->send();
 394   }
 395
 396   void handle_flush_journal(int r) {
 397     dout(20) << __func__ << ": r=" << r << dendl;
 398
 399     if (r != 0) {
 400       cmd_err(f, ss.str());
 401       complete(r);
 402       return;
 403     }
 404
 405     // journal flush section
 406     f->open_object_section("flush_journal");
 407     f->dump_int("return_code", r);
 408     f->dump_string("message", ss.str());
 409     f->close_section();
 410
 411     trim_cache();
 412   }
 413
 414   void trim_cache() {
 415     dout(20) << __func__ << dendl;
 416
 417     auto [throttled, count] = do_trim();
 418     if (throttled && count > 0) {
 419       auto timer = new LambdaContext([this](int) {
 420         trim_cache();
 421       });
 422       mds->timer.add_event_after(1.0, timer);
 423     } else {
 424       cache_status();
 425     }
 426   }
 427
 428   void cache_status() {
 429     dout(20) << __func__ << dendl;
 430
 431     f->open_object_section("trim_cache");
 432     f->dump_int("trimmed", dentries_trimmed);
 433     f->close_section();
 434
 435     // cache status section
 436     mdcache->cache_status(f);
 437
 438     complete(0);
 439   }
 440
 441   void finish(int r) override {
 442     dout(20) << __func__ << ": r=" << r << dendl;
 443
 444     auto d = std::chrono::duration<double>(mono_clock::now()-recall_start);
 445     f->dump_float("duration", d.count());
 446
 447     f->close_section();
 448     on_finish->complete(r);
 449   }
 450
 451   Server *server;
 452   MDCache *mdcache;
 453   MDLog *mdlog;
 454   uint64_t recall_timeout;
 455   mono_time recall_start;
 456   Formatter *f;
 457   Context *on_finish;
 458
 459   int retval = 0;
 460   std::stringstream ss;
 461   uint64_t caps_recalled = 0;
 462   uint64_t dentries_trimmed = 0;
 463
 464   // so as to use dout
 465   mds_rank_t whoami;
 466   int incarnation;
 467
 468   void cmd_err(Formatter *f, std::string_view err) {
 469     f->reset();
 470     f->open_object_section("result");
 471     f->dump_string("error", err);
 472     f->close_section();
 473   }
 474 };
 475
/**
 * Construct the rank and all of its sub-components.
 *
 * The references and pointers passed in (lock, clog, timer, beacon, mdsmap,
 * messenger, mon/mgr clients) are stored for later use. The objecter, the
 * finisher and the cache/log/balancer/table/server/locker objects are
 * allocated here. The two hooks are kept in respawn_hook / suicide_hook.
 */
MDSRank::MDSRank(
    mds_rank_t whoami_,
    ceph::mutex &mds_lock_,
    LogChannelRef &clog_,
    SafeTimer &timer_,
    Beacon &beacon_,
    std::unique_ptr<MDSMap>& mdsmap_,
    Messenger *msgr,
    MonClient *monc_,
    MgrClient *mgrc,
    Context *respawn_hook_,
    Context *suicide_hook_) :
    cct(msgr->cct), mds_lock(mds_lock_), clog(clog_),
    timer(timer_), mdsmap(mdsmap_),
    objecter(new Objecter(g_ceph_context, msgr, monc_, nullptr, 0, 0)),
    damage_table(whoami_), sessionmap(this),
    op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker,
               g_conf()->osd_num_op_tracker_shard),
    progress_thread(this), whoami(whoami_),
    // the purge queue reports write errors back to us; the callback takes
    // mds_lock itself before calling handle_write_error()
    purge_queue(g_ceph_context, whoami_,
      mdsmap_->get_metadata_pool(), objecter,
      new LambdaContext([this](int r) {
          std::lock_guard l(mds_lock);
          handle_write_error(r);
        }
      )
    ),
    beacon(beacon_),
    messenger(msgr), monc(monc_), mgrc(mgrc),
    respawn_hook(respawn_hook_),
    suicide_hook(suicide_hook_),
    starttime(mono_clock::now())
{
  // register the constructing thread with the heartbeat map
  hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self());

  purge_queue.update_op_limit(*mdsmap);

  objecter->unset_honor_pool_full();

  finisher = new Finisher(cct, "MDSRank", "MR_Finisher");

  // mdcache needs the purge queue; scrubstack needs mdcache and finisher
  mdcache = new MDCache(this, purge_queue);
  mdlog = new MDLog(this);
  balancer = new MDBalancer(this, messenger, monc);

  scrubstack = new ScrubStack(mdcache, clog, finisher);

  inotable = new InoTable(this);
  snapserver = new SnapServer(this, monc);
  snapclient = new SnapClient(this);

  server = new Server(this);
  locker = new Locker(this, mdcache);

  // op tracker thresholds come from the mds_op_* configuration options
  op_tracker.set_complaint_and_threshold(cct->_conf->mds_op_complaint_time,
                                         cct->_conf->mds_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->mds_op_history_size,
                                           cct->_conf->mds_op_history_duration);

  schedule_update_timer_task();
}
 537
 538 MDSRank::~MDSRank()
 539 {
 540   if (hb) {
 541     g_ceph_context->get_heartbeat_map()->remove_worker(hb);
 542   }
 543
 544   if (scrubstack) { delete scrubstack; scrubstack = NULL; }
 545   if (mdcache) { delete mdcache; mdcache = NULL; }
 546   if (mdlog) { delete mdlog; mdlog = NULL; }
 547   if (balancer) { delete balancer; balancer = NULL; }
 548   if (inotable) { delete inotable; inotable = NULL; }
 549   if (snapserver) { delete snapserver; snapserver = NULL; }
 550   if (snapclient) { delete snapclient; snapclient = NULL; }
 551
 552   if (server) { delete server; server = 0; }
 553   if (locker) { delete locker; locker = 0; }
 554
 555   if (logger) {
 556     g_ceph_context->get_perfcounters_collection()->remove(logger);
 557     delete logger;
 558     logger = 0;
 559   }
 560   if (mlogger) {
 561     g_ceph_context->get_perfcounters_collection()->remove(mlogger);
 562     delete mlogger;
 563     mlogger = 0;
 564   }
 565
 566   delete finisher;
 567   finisher = NULL;
 568
 569   delete suicide_hook;
 570   suicide_hook = NULL;
 571
 572   delete respawn_hook;
 573   respawn_hook = NULL;
 574
 575   delete objecter;
 576   objecter = nullptr;
 577 }
 578
/**
 * Bring the rank's helpers up: initialise and start the objecter (and
 * register it as a dispatcher), set up logging and perf counters, publish
 * the current OSDMap, then start the progress thread, the purge queue and
 * the finisher.
 */
void MDSRankDispatcher::init()
{
  objecter->init();
  // the objecter is added at the head of the messenger's dispatcher list
  messenger->add_dispatcher_head(objecter);

  objecter->start();

  update_log_config();
  create_logger();

  // Expose the OSDMap (already populated during MDS::init) to anyone
  // who is interested in it.
  handle_osd_map();

  progress_thread.create("mds_rank_progr");

  purge_queue.init();

  finisher->start();
}
 599
 600 void MDSRank::update_targets()
 601 {
 602   // get MonMap's idea of my export_targets
 603   const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
 604
 605   dout(20) << "updating export targets, currently " << map_targets.size() << " ranks are targets" << dendl;
 606
 607   bool send = false;
 608   set<mds_rank_t> new_map_targets;
 609
 610   auto it = export_targets.begin();
 611   while (it != export_targets.end()) {
 612     mds_rank_t rank = it->first;
 613     auto &counter = it->second;
 614     dout(20) << "export target mds." << rank << " is " << counter << dendl;
 615
 616     double val = counter.get();
 617     if (val <= 0.01) {
 618       dout(15) << "export target mds." << rank << " is no longer an export target" << dendl;
 619       export_targets.erase(it++);
 620       send = true;
 621       continue;
 622     }
 623     if (!map_targets.count(rank)) {
 624       dout(15) << "export target mds." << rank << " not in map's export_targets" << dendl;
 625       send = true;
 626     }
 627     new_map_targets.insert(rank);
 628     it++;
 629   }
 630   if (new_map_targets.size() < map_targets.size()) {
 631     dout(15) << "export target map holds stale targets, sending update" << dendl;
 632     send = true;
 633   }
 634
 635   if (send) {
 636     dout(15) << "updating export_targets, now " << new_map_targets.size() << " ranks are targets" << dendl;
 637     auto m = make_message<MMDSLoadTargets>(mds_gid_t(monc->get_global_id()), new_map_targets);
 638     monc->send_mon_message(m.detach());
 639   }
 640 }
 641
 642 void MDSRank::hit_export_target(mds_rank_t rank, double amount)
 643 {
 644   double rate = g_conf()->mds_bal_target_decay;
 645   if (amount < 0.0) {
 646     amount = 100.0/g_conf()->mds_bal_target_decay; /* a good default for "i am trying to keep this export_target active" */
 647   }
 648   auto em = export_targets.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple(DecayRate(rate)));
 649   auto &counter = em.first->second;
 650   counter.hit(amount);
 651   if (em.second) {
 652     dout(15) << "hit export target (new) is " << counter << dendl;
 653   } else {
 654     dout(15) << "hit export target is " << counter << dendl;
 655   }
 656 }
 657
 658 class C_MDS_MonCommand : public MDSInternalContext {
 659   std::string cmd;
 660 public:
 661   std::string outs;
 662   C_MDS_MonCommand(MDSRank *m, std::string_view c)
 663     : MDSInternalContext(m), cmd(c) {}
 664   void finish(int r) override {
 665     mds->_mon_command_finish(r, cmd, outs);
 666   }
 667 };
 668
 669 void MDSRank::_mon_command_finish(int r, std::string_view cmd, std::string_view outs)
 670 {
 671   if (r < 0) {
 672     dout(0) << __func__ << ": mon command " << cmd << " failed with errno " << r
 673             << " (" << outs << ")" << dendl;
 674   } else {
 675     dout(1) << __func__ << ": mon command " << cmd << " succeed" << dendl;
 676   }
 677 }
 678
 679 void MDSRank::set_mdsmap_multimds_snaps_allowed()
 680 {
 681   static bool already_sent = false;
 682   if (already_sent)
 683     return;
 684
 685   stringstream ss;
 686   ss << "{\"prefix\":\"fs set\", \"fs_name\":\"" <<  mdsmap->get_fs_name() << "\", ";
 687   ss << "\"var\":\"allow_multimds_snaps\", \"val\":\"true\", ";
 688   ss << "\"confirm\":\"--yes-i-am-really-a-mds\"}";
 689   std::vector<std::string> cmd = {ss.str()};
 690
 691   dout(0) << __func__ << ": sending mon command: " << cmd[0] << dendl;
 692
 693   C_MDS_MonCommand *fin = new C_MDS_MonCommand(this, cmd[0]);
 694   monc->start_mon_command(cmd, {}, nullptr, &fin->outs, new C_IO_Wrapper(this, fin));
 695
 696   already_sent = true;
 697 }
 698
 699 void MDSRank::mark_base_recursively_scrubbed(inodeno_t ino)
 700 {
 701   if (mdsmap->get_tableserver() == whoami)
 702     snapserver->mark_base_recursively_scrubbed(ino);
 703 }
 704
/**
 * Periodic upkeep for the rank.
 *
 * Resets the heartbeat, then -- unless the beacon reports us as laggy --
 * flushes and trims the journal, drives session/locker/balancer
 * housekeeping appropriate to the current state, updates export targets,
 * makes progress on shutdown when stopping, and finally reports health to
 * the beacon.
 */
void MDSRankDispatcher::tick()
{
  heartbeat_reset();

  if (beacon.is_laggy()) {
    dout(1) << "skipping upkeep work because connection to Monitors appears laggy" << dendl;
    return;
  }

  check_ops_in_flight();

  // Wake up thread in case we used to be laggy and have waiting_for_nolaggy
  // messages to progress.
  progress_thread.signal();

  // make sure mds log flushes, trims periodically
  mdlog->flush();

  // update average session uptime
  sessionmap.update_average_session_age();

  if (is_active() || is_stopping()) {
    mdlog->trim();  // NOT during recovery!
  }

  // session and lock housekeeping, only while the cache may be trimmed
  if (is_cache_trimmable()) {
    server->find_idle_sessions();
    server->evict_cap_revoke_non_responders();
    locker->tick();
  }

  // log
  if (logger) {
    logger->set(l_mds_subtrees, mdcache->num_subtrees());
    mdcache->log_stat();
  }

  if (is_reconnect())
    server->reconnect_tick();

  if (is_active()) {
    balancer->tick();
    mdcache->find_stale_fragment_freeze();
    mdcache->migrator->find_stale_export_freeze();

    // extra duties when this rank is the table server
    if (mdsmap->get_tableserver() == whoami) {
      snapserver->check_osd_map(false);
      // Filesystem was created by pre-mimic mds. Allow multi-active mds after
      // all old snapshots are deleted.
      if (!mdsmap->allows_multimds_snaps() &&
          snapserver->can_allow_multimds_snaps()) {
        set_mdsmap_multimds_snaps_allowed();
      }
    }
  }

  if (is_active() || is_stopping()) {
    update_targets();
  }

  // shut down?
  if (is_stopping()) {
    // note: this is a second trim in the same tick when stopping
    mdlog->trim();
    if (mdcache->shutdown_pass()) {
      uint64_t pq_progress = 0 ;
      uint64_t pq_total = 0;
      size_t pq_in_flight = 0;
      // the cache is done; the purge queue must drain before we can stop
      if (!purge_queue.drain(&pq_progress, &pq_total, &pq_in_flight)) {
        dout(7) << "shutdown_pass=true, but still waiting for purge queue"
                << dendl;
        // This takes unbounded time, so we must indicate progress
        // to the administrator: we do it in a slightly imperfect way
        // by sending periodic (tick frequency) clog messages while
        // in this state.
        clog->info() << "MDS rank " << whoami << " waiting for purge queue ("
          << std::dec << pq_progress << "/" << pq_total << " " << pq_in_flight
          << " files purging" << ")";
      } else {
        dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to "
                   "down:stopped" << dendl;
        stopping_done();
      }
    }
    else {
      dout(7) << "shutdown_pass=false" << dendl;
    }
  }

  // Expose ourselves to Beacon to update health indicators
  beacon.notify_health(this);
}
 797
/**
 * Shut the rank down.
 *
 * Marks the rank as stopping and then stops its components in a fixed
 * order. mds_lock is released temporarily around the finisher stop and the
 * messenger shutdown and is held again on return, so the caller is
 * expected to hold mds_lock on entry.
 */
void MDSRankDispatcher::shutdown()
{
  // It should never be possible for shutdown to get called twice, because
  // anyone picking up mds_lock checks if stopping is true and drops
  // out if it is.
  ceph_assert(stopping == false);
  stopping = true;

  dout(1) << __func__ << ": shutting down rank " << whoami << dendl;

  g_conf().remove_observer(this);

  timer.shutdown();

  // MDLog has to shut down before the finisher, because some of its
  // threads block on IOs that require finisher to complete.
  mdlog->shutdown();

  // shut down cache
  mdcache->shutdown();

  purge_queue.shutdown();

  // stop the finisher without holding mds_lock
  // NOTE(review): presumably because queued contexts may take mds_lock
  // themselves -- confirm.
  mds_lock.unlock();
  finisher->stop(); // no flushing
  mds_lock.lock();

  if (objecter->initialized)
    objecter->shutdown();

  monc->shutdown();

  op_tracker.on_shutdown();

  progress_thread.shutdown();

  // release mds_lock for finisher/messenger threads (e.g.
  // MDSDaemon::ms_handle_reset called from Messenger).
  mds_lock.unlock();

  // shut down messenger
  messenger->shutdown();

  mds_lock.lock();

  // Workaround unclean shutdown: HeartbeatMap will assert if
  // worker is not removed (as we do in ~MDS), but ~MDS is not
  // always called after suicide.
  if (hb) {
    g_ceph_context->get_heartbeat_map()->remove_worker(hb);
    hb = NULL;
  }
}
 851
 852 /**
 853  * Helper for simple callbacks that call a void fn with no args.
 854  */
 855 class C_MDS_VoidFn : public MDSInternalContext
 856 {
 857   typedef void (MDSRank::*fn_ptr)();
 858   protected:
 859    fn_ptr fn;
 860   public:
 861   C_MDS_VoidFn(MDSRank *mds_, fn_ptr fn_)
 862     : MDSInternalContext(mds_), fn(fn_)
 863   {
 864     ceph_assert(mds_);
 865     ceph_assert(fn_);
 866   }
 867
 868   void finish(int r) override
 869   {
 870     (mds->*fn)();
 871   }
 872 };
 873
 874 int64_t MDSRank::get_metadata_pool()
 875 {
 876     return mdsmap->get_metadata_pool();
 877 }
 878
 879 MDSTableClient *MDSRank::get_table_client(int t)
 880 {
 881   switch (t) {
 882   case TABLE_ANCHOR: return NULL;
 883   case TABLE_SNAP: return snapclient;
 884   default: ceph_abort();
 885   }
 886 }
 887
 888 MDSTableServer *MDSRank::get_table_server(int t)
 889 {
 890   switch (t) {
 891   case TABLE_ANCHOR: return NULL;
 892   case TABLE_SNAP: return snapserver;
 893   default: ceph_abort();
 894   }
 895 }
 896
 897 void MDSRank::suicide()
 898 {
 899   if (suicide_hook) {
 900     suicide_hook->complete(0);
 901     suicide_hook = NULL;
 902   }
 903 }
 904
 905 void MDSRank::respawn()
 906 {
 907   if (respawn_hook) {
 908     respawn_hook->complete(0);
 909     respawn_hook = NULL;
 910   }
 911 }
 912
/**
 * Report this rank as damaged and respawn.
 *
 * Requires a valid rank and mds_lock held by the calling thread. Sets the
 * beacon's wanted state to STATE_DAMAGED, flushes the cluster log, sends
 * the beacon and waits for up to mds_mon_shutdown_timeout, then respawns.
 */
void MDSRank::damaged()
{
  ceph_assert(whoami != MDS_RANK_NONE);
  ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));

  beacon.set_want_state(*mdsmap, MDSMap::STATE_DAMAGED);
  monc->flush_log();  // Flush any clog error from before we were called
  beacon.notify_health(this);  // Include latest status in our swan song
  beacon.send_and_wait(g_conf()->mds_mon_shutdown_timeout);

  // It's okay if we timed out and the mon didn't get our beacon, because
  // another daemon (or ourselves after respawn) will eventually take the
  // rank and report DAMAGED again when it hits same problem we did.

  respawn();  // Respawn into standby in case mon has other work for us
}
 929
 930 void MDSRank::damaged_unlocked()
 931 {
 932   std::lock_guard l(mds_lock);
 933   damaged();
 934 }
 935
 936 void MDSRank::handle_write_error(int err)
 937 {
 938   if (err == -EBLACKLISTED) {
 939     derr << "we have been blacklisted (fenced), respawning..." << dendl;
 940     respawn();
 941     return;
 942   }
 943
 944   if (g_conf()->mds_action_on_write_error >= 2) {
 945     derr << "unhandled write error " << cpp_strerror(err) << ", suicide..." << dendl;
 946     respawn();
 947   } else if (g_conf()->mds_action_on_write_error == 1) {
 948     derr << "unhandled write error " << cpp_strerror(err) << ", force readonly..." << dendl;
 949     mdcache->force_readonly();
 950   } else {
 951     // ignore;
 952     derr << "unhandled write error " << cpp_strerror(err) << ", ignore..." << dendl;
 953   }
 954 }
 955
 956 void *MDSRank::ProgressThread::entry()
 957 {
 958   std::unique_lock l(mds->mds_lock);
 959   while (true) {
 960     cond.wait(l, [this] {
 961       return (mds->stopping ||
 962               !mds->finished_queue.empty() ||
 963               (!mds->waiting_for_nolaggy.empty() && !mds->beacon.is_laggy()));
 964     });
 965
 966     if (mds->stopping) {
 967       break;
 968     }
 969
 970     mds->_advance_queues();
 971   }
 972
 973   return NULL;
 974 }
 975
 976
 977 void MDSRank::ProgressThread::shutdown()
 978 {
 979   ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
 980   ceph_assert(mds->stopping);
 981
 982   if (am_self()) {
 983     // Stopping is set, we will fall out of our main loop naturally
 984   } else {
 985     // Kick the thread to notice mds->stopping, and join it
 986     cond.notify_all();
 987     mds->mds_lock.unlock();
 988     if (is_started())
 989       join();
 990     mds->mds_lock.lock();
 991   }
 992 }
 993
 994 bool MDSRankDispatcher::ms_dispatch(const cref_t<Message> &m)
 995 {
 996   if (m->get_source().is_client()) {
 997     Session *session = static_cast<Session*>(m->get_connection()->get_priv().get());
 998     if (session)
 999       session->last_seen = Session::clock::now();
1000   }
1001
1002   inc_dispatch_depth();
1003   bool ret = _dispatch(m, true);
1004   dec_dispatch_depth();
1005   return ret;
1006 }
1007
/**
 * Dispatch one message, deferring it if the beacon reports us laggy.
 *
 * After the message itself has been handled (or queued), the outermost
 * dispatch call also drains the finished/deferred queues, runs the
 * optional export/fragment thrashing hacks and refreshes the memory
 * logger.
 *
 * @param m        message to process
 * @param new_msg  true for a message seen for the first time; such a
 *                 message is queued behind already-deferred messages
 *                 rather than being handled ahead of them
 * @return false only when handle_deferrable_message() returns false for
 *         the message; true otherwise (including when the message is
 *         dropped as stale or deferred)
 */
bool MDSRank::_dispatch(const cref_t<Message> &m, bool new_msg)
{
  // stale messages are silently consumed
  if (is_stale_message(m)) {
    return true;
  }

  if (beacon.is_laggy()) {
    dout(5) << " laggy, deferring " << *m << dendl;
    waiting_for_nolaggy.push_back(m);
  } else if (new_msg && !waiting_for_nolaggy.empty()) {
    // keep arrival order: a new message must not overtake deferred ones
    dout(5) << " there are deferred messages, deferring " << *m << dendl;
    waiting_for_nolaggy.push_back(m);
  } else {
    if (!handle_deferrable_message(m)) {
      return false;
    }

    heartbeat_reset();
  }

  // nested dispatch: the housekeeping below is left to the outermost call
  if (dispatch_depth > 1)
    return true;

  // finish any triggered contexts
  _advance_queues();

  if (beacon.is_laggy()) {
    // We've gone laggy during dispatch, don't do any
    // more housekeeping
    return true;
  }

  // hack: thrash exports
  // NOTE(review): 'start' and 'now' are only consumed by the commented-out
  // time window check below.
  static utime_t start;
  utime_t now = ceph_clock_now();
  if (start == utime_t())
    start = now;
  /*double el = now - start;
  if (el > 30.0 &&
    el < 60.0)*/
  for (int i=0; i<g_conf()->mds_thrash_exports; i++) {
    set<mds_rank_t> s;
    if (!is_active()) break;
    mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE);
    if (s.size() < 2 || CInode::count() < 10)
      break;  // need peers for this to work.
    // back off while the migrator already has plenty of exports in flight/queued
    if (mdcache->migrator->get_num_exporting() > g_conf()->mds_thrash_exports * 5 ||
        mdcache->migrator->get_export_queue_size() > g_conf()->mds_thrash_exports * 10)
      break;

    dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf()->mds_thrash_exports << dendl;

    // pick a random dir inode
    CInode *in = mdcache->hack_pick_random_inode();

    auto&& ls = in->get_dirfrags();
    if (!ls.empty()) {  // must be an open dir.
      const auto& dir = ls[rand() % ls.size()];
      if (!dir->get_parent_dir()) continue;    // must be linked.
      if (!dir->is_auth()) continue;           // must be auth.

      // pick a random active rank other than ourselves (s.size() >= 2 here)
      mds_rank_t dest;
      do {
        int k = rand() % s.size();
        set<mds_rank_t>::iterator p = s.begin();
        while (k--) ++p;
        dest = *p;
      } while (dest == whoami);
      mdcache->migrator->export_dir_nicely(dir,dest);
    }
  }
  // hack: thrash fragments
  for (int i=0; i<g_conf()->mds_thrash_fragments; i++) {
    if (!is_active()) break;
    if (mdcache->get_num_fragmenting_dirs() > 5 * g_conf()->mds_thrash_fragments) break;
    dout(7) << "mds thrashing fragments pass " << (i+1) << "/" << g_conf()->mds_thrash_fragments << dendl;

    // pick a random dir inode
    CInode *in = mdcache->hack_pick_random_inode();

    auto&& ls = in->get_dirfrags();
    if (ls.empty()) continue;                // must be an open dir.
    CDir *dir = ls.front();
    if (!dir->get_parent_dir()) continue;    // must be linked.
    if (!dir->is_auth()) continue;           // must be auth.
    frag_t fg = dir->get_frag();
    // always split the root frag; otherwise split with probability
    // 1/2^bits and queue a merge the rest of the time
    if ((fg == frag_t() || (rand() % (1 << fg.bits()) == 0))) {
      mdcache->split_dir(dir, 1);
    } else {
      balancer->queue_merge(dir);
    }
  }

  // hack: force hash root?
  /*
  if (false &&
      mdcache->get_root() &&
      mdcache->get_root()->dir &&
      !(mdcache->get_root()->dir->is_hashed() ||
        mdcache->get_root()->dir->is_hashing())) {
    dout(0) << "hashing root" << dendl;
    mdcache->migrator->hash_dir(mdcache->get_root()->dir);
  }
  */

  update_mlogger();
  return true;
}
1116
1117 void MDSRank::update_mlogger()
1118 {
1119   if (mlogger) {
1120     mlogger->set(l_mdm_ino, CInode::count());
1121     mlogger->set(l_mdm_dir, CDir::count());
1122     mlogger->set(l_mdm_dn, CDentry::count());
1123     mlogger->set(l_mdm_cap, Capability::count());
1124     mlogger->set(l_mdm_inoa, CInode::increments());
1125     mlogger->set(l_mdm_inos, CInode::decrements());
1126     mlogger->set(l_mdm_dira, CDir::increments());
1127     mlogger->set(l_mdm_dirs, CDir::decrements());
1128     mlogger->set(l_mdm_dna, CDentry::increments());
1129     mlogger->set(l_mdm_dns, CDentry::decrements());
1130     mlogger->set(l_mdm_capa, Capability::increments());
1131     mlogger->set(l_mdm_caps, Capability::decrements());
1132   }
1133 }
1134
1135 /*
1136  * lower priority messages we defer if we seem laggy
1137  */
1138 bool MDSRank::handle_deferrable_message(const cref_t<Message> &m)
1139 {
1140   int port = m->get_type() & 0xff00;
1141
1142   switch (port) {
1143   case MDS_PORT_CACHE:
1144     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1145     mdcache->dispatch(m);
1146     break;
1147
1148   case MDS_PORT_MIGRATOR:
1149     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1150     mdcache->migrator->dispatch(m);
1151     break;
1152
1153   default:
1154     switch (m->get_type()) {
1155       // SERVER
1156     case CEPH_MSG_CLIENT_SESSION:
1157     case CEPH_MSG_CLIENT_RECONNECT:
1158     case CEPH_MSG_CLIENT_RECLAIM:
1159       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
1160       // fall-thru
1161     case CEPH_MSG_CLIENT_REQUEST:
1162       server->dispatch(m);
1163       break;
1164     case MSG_MDS_SLAVE_REQUEST:
1165       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1166       server->dispatch(m);
1167       break;
1168
1169     case MSG_MDS_HEARTBEAT:
1170       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1171       balancer->proc_message(m);
1172       break;
1173
1174     case MSG_MDS_TABLE_REQUEST:
1175       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1176       {
1177         const cref_t<MMDSTableRequest> &req = ref_cast<MMDSTableRequest>(m);
1178         if (req->op < 0) {
1179           MDSTableClient *client = get_table_client(req->table);
1180           client->handle_request(req);
1181         } else {
1182            MDSTableServer *server = get_table_server(req->table);
1183            server->handle_request(req);
1184         }
1185       }
1186       break;
1187
1188     case MSG_MDS_LOCK:
1189     case MSG_MDS_INODEFILECAPS:
1190       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
1191       locker->dispatch(m);
1192       break;
1193
1194     case CEPH_MSG_CLIENT_CAPS:
1195     case CEPH_MSG_CLIENT_CAPRELEASE:
1196     case CEPH_MSG_CLIENT_LEASE:
1197       ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
1198       locker->dispatch(m);
1199       break;
1200
1201     default:
1202       return false;
1203     }
1204   }
1205
1206   return true;
1207 }
1208
1209 /**
1210  * Advance finished_queue and waiting_for_nolaggy.
1211  *
1212  * Usually drain both queues, but may not drain waiting_for_nolaggy
1213  * if beacon is currently laggy.
1214  */
1215 void MDSRank::_advance_queues()
1216 {
1217   ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
1218
1219   if (!finished_queue.empty()) {
1220     dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl;
1221     while (!finished_queue.empty()) {
1222       auto fin = finished_queue.front();
1223       finished_queue.pop_front();
1224
1225       dout(10) << " finish " << fin << dendl;
1226       fin->complete(0);
1227
1228       heartbeat_reset();
1229     }
1230   }
1231
1232   while (!waiting_for_nolaggy.empty()) {
1233     // stop if we're laggy now!
1234     if (beacon.is_laggy())
1235       break;
1236
1237     cref_t<Message> old = waiting_for_nolaggy.front();
1238     waiting_for_nolaggy.pop_front();
1239
1240     if (!is_stale_message(old)) {
1241       dout(7) << " processing laggy deferred " << *old << dendl;
1242       if (!handle_deferrable_message(old)) {
1243         dout(0) << "unrecognized message " << *old << dendl;
1244       }
1245     }
1246
1247     heartbeat_reset();
1248   }
1249 }
1250
1251 /**
1252  * Call this when you take mds_lock, or periodically if you're going to
1253  * hold the lock for a long time (e.g. iterating over clients/inodes)
1254  */
1255 void MDSRank::heartbeat_reset()
1256 {
1257   // Any thread might jump into mds_lock and call us immediately
1258   // after a call to suicide() completes, in which case MDSRank::hb
1259   // has been freed and we are a no-op.
1260   if (!hb) {
1261       ceph_assert(stopping);
1262       return;
1263   }
1264
1265   // NB not enabling suicide grace, because the mon takes care of killing us
1266   // (by blacklisting us) when we fail to send beacons, and it's simpler to
1267   // only have one way of dying.
1268   auto grace = g_conf().get_val<double>("mds_heartbeat_grace");
1269   g_ceph_context->get_heartbeat_map()->reset_timeout(hb, grace, 0);
1270 }
1271
1272 bool MDSRank::is_stale_message(const cref_t<Message> &m) const
1273 {
1274   // from bad mds?
1275   if (m->get_source().is_mds()) {
1276     mds_rank_t from = mds_rank_t(m->get_source().num());
1277     bool bad = false;
1278     if (mdsmap->is_down(from)) {
1279       bad = true;
1280     } else {
1281       // FIXME: this is a convoluted check.  we should be maintaining a nice
1282       // clean map of current ConnectionRefs for current mdses!!!
1283       auto c = messenger->connect_to(CEPH_ENTITY_TYPE_MDS,
1284                                      mdsmap->get_addrs(from));
1285       if (c != m->get_connection()) {
1286         bad = true;
1287         dout(5) << " mds." << from << " should be " << c << " "
1288                 << c->get_peer_addrs() << " but this message is "
1289                 << m->get_connection() << " " << m->get_source_addrs()
1290                 << dendl;
1291       }
1292     }
1293     if (bad) {
1294       // bogus mds?
1295       if (m->get_type() == CEPH_MSG_MDS_MAP) {
1296         dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
1297                 << ", but it's an mdsmap, looking at it" << dendl;
1298       } else if (m->get_type() == MSG_MDS_CACHEEXPIRE &&
1299                  mdsmap->get_addrs(from) == m->get_source_addrs()) {
1300         dout(5) << "got " << *m << " from down mds " << m->get_source()
1301                 << ", but it's a cache_expire, looking at it" << dendl;
1302       } else {
1303         dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source()
1304                 << ", dropping" << dendl;
1305         return true;
1306       }
1307     }
1308   }
1309   return false;
1310 }
1311
1312 Session *MDSRank::get_session(const cref_t<Message> &m)
1313 {
1314   // do not carry ref
1315   auto session = static_cast<Session *>(m->get_connection()->get_priv().get());
1316   if (session) {
1317     dout(20) << "get_session have " << session << " " << session->info.inst
1318              << " state " << session->get_state_name() << dendl;
1319     // Check if we've imported an open session since (new sessions start closed)
1320     if (session->is_closed()) {
1321       Session *imported_session = sessionmap.get_session(session->info.inst.name);
1322       if (imported_session && imported_session != session) {
1323         dout(10) << __func__ << " replacing connection bootstrap session "
1324                  << session << " with imported session " << imported_session
1325                  << dendl;
1326         imported_session->info.auth_name = session->info.auth_name;
1327         //assert(session->info.auth_name == imported_session->info.auth_name);
1328         ceph_assert(session->info.inst == imported_session->info.inst);
1329         imported_session->set_connection(session->get_connection().get());
1330         // send out any queued messages
1331         while (!session->preopen_out_queue.empty()) {
1332           imported_session->get_connection()->send_message2(std::move(session->preopen_out_queue.front()));
1333           session->preopen_out_queue.pop_front();
1334         }
1335         imported_session->auth_caps = session->auth_caps;
1336         imported_session->last_seen = session->last_seen;
1337         ceph_assert(session->get_nref() == 1);
1338         imported_session->get_connection()->set_priv(imported_session->get());
1339         session = imported_session;
1340       }
1341     }
1342   } else {
1343     dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
1344   }
1345   return session;
1346 }
1347
1348 void MDSRank::send_message(const ref_t<Message>& m, const ConnectionRef& c)
1349 {
1350   ceph_assert(c);
1351   c->send_message2(m);
1352 }
1353
1354
1355 void MDSRank::send_message_mds(const ref_t<Message>& m, mds_rank_t mds)
1356 {
1357   if (!mdsmap->is_up(mds)) {
1358     dout(10) << "send_message_mds mds." << mds << " not up, dropping " << *m << dendl;
1359     return;
1360   }
1361
1362   // send mdsmap first?
1363   if (mds != whoami && peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
1364     auto _m = make_message<MMDSMap>(monc->get_fsid(), *mdsmap);
1365     messenger->send_to_mds(_m.detach(), mdsmap->get_addrs(mds));
1366     peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
1367   }
1368
1369   // send message
1370   messenger->send_to_mds(ref_t<Message>(m).detach(), mdsmap->get_addrs(mds));
1371 }
1372
1373 void MDSRank::forward_message_mds(const cref_t<MClientRequest>& m, mds_rank_t mds)
1374 {
1375   ceph_assert(mds != whoami);
1376
1377   /*
1378    * don't actually forward if non-idempotent!
1379    * client has to do it.  although the MDS will ignore duplicate requests,
1380    * the affected metadata may migrate, in which case the new authority
1381    * won't have the metareq_id in the completed request map.
1382    */
1383   // NEW: always make the client resend!
1384   bool client_must_resend = true;  //!creq->can_forward();
1385
1386   // tell the client where it should go
1387   auto session = get_session(m);
1388   auto f = make_message<MClientRequestForward>(m->get_tid(), mds, m->get_num_fwd()+1, client_must_resend);
1389   send_message_client(f, session);
1390 }
1391
1392 void MDSRank::send_message_client_counted(const ref_t<Message>& m, client_t client)
1393 {
1394   Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v));
1395   if (session) {
1396     send_message_client_counted(m, session);
1397   } else {
1398     dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl;
1399   }
1400 }
1401
1402 void MDSRank::send_message_client_counted(const ref_t<Message>& m, const ConnectionRef& connection)
1403 {
1404   // do not carry ref
1405   auto session = static_cast<Session *>(connection->get_priv().get());
1406   if (session) {
1407     send_message_client_counted(m, session);
1408   } else {
1409     dout(10) << "send_message_client_counted has no session for " << m->get_source_inst() << dendl;
1410     // another Connection took over the Session
1411   }
1412 }
1413
1414 void MDSRank::send_message_client_counted(const ref_t<Message>& m, Session* session)
1415 {
1416   version_t seq = session->inc_push_seq();
1417   dout(10) << "send_message_client_counted " << session->info.inst.name << " seq "
1418            << seq << " " << *m << dendl;
1419   if (session->get_connection()) {
1420     session->get_connection()->send_message2(m);
1421   } else {
1422     session->preopen_out_queue.push_back(m);
1423   }
1424 }
1425
1426 void MDSRank::send_message_client(const ref_t<Message>& m, Session* session)
1427 {
1428   dout(10) << "send_message_client " << session->info.inst << " " << *m << dendl;
1429   if (session->get_connection()) {
1430     session->get_connection()->send_message2(m);
1431   } else {
1432     session->preopen_out_queue.push_back(m);
1433   }
1434 }
1435
1436 /**
1437  * This is used whenever a RADOS operation has been cancelled
1438  * or a RADOS client has been blacklisted, to cause the MDS and
1439  * any clients to wait for this OSD epoch before using any new caps.
1440  *
1441  * See doc/cephfs/eviction
1442  */
1443 void MDSRank::set_osd_epoch_barrier(epoch_t e)
1444 {
1445   dout(4) << __func__ << ": epoch=" << e << dendl;
1446   osd_epoch_barrier = e;
1447 }
1448
1449 void MDSRank::retry_dispatch(const cref_t<Message> &m)
1450 {
1451   inc_dispatch_depth();
1452   _dispatch(m, false);
1453   dec_dispatch_depth();
1454 }
1455
1456 double MDSRank::get_dispatch_queue_max_age(utime_t now) const
1457 {
1458   return messenger->get_dispatch_queue_max_age(now);
1459 }
1460
1461 bool MDSRank::is_daemon_stopping() const
1462 {
1463   return stopping;
1464 }
1465
1466 void MDSRank::request_state(MDSMap::DaemonState s)
1467 {
1468   dout(3) << "request_state " << ceph_mds_state_name(s) << dendl;
1469   beacon.set_want_state(*mdsmap, s);
1470   beacon.send();
1471 }
1472
1473
1474 class C_MDS_BootStart : public MDSInternalContext {
1475   MDSRank::BootStep nextstep;
1476 public:
1477   C_MDS_BootStart(MDSRank *m, MDSRank::BootStep n)
1478     : MDSInternalContext(m), nextstep(n) {}
1479   void finish(int r) override {
1480     mds->boot_start(nextstep, r);
1481   }
1482 };
1483
1484
/**
 * Drive the boot/replay sequence one step at a time.
 *
 * Each step kicks off asynchronous work and arranges (via C_MDS_BootStart)
 * for this function to be called again with the next step and the result
 * of the work.  The steps chain as
 * INITIAL -> OPEN_ROOT -> PREPARE_LOG -> REPLAY_DONE; PREPARE_LOG ends the
 * sequence with starting_done() when we are not replaying.
 *
 * @param step the step to execute now
 * @param r    result of the previous step (negative errno on failure)
 */
void MDSRank::boot_start(BootStep step, int r)
{
  // Handle errors from previous step
  if (r < 0) {
    if (is_standby_replay() && (r == -EAGAIN)) {
      dout(0) << "boot_start encountered an error EAGAIN"
              << ", respawning since we fell behind journal" << dendl;
      respawn();
    } else if (r == -EINVAL || r == -ENOENT) {
      // Invalid or absent data, indicates damaged on-disk structures
      clog->error() << "Error loading MDS rank " << whoami << ": "
        << cpp_strerror(r);
      damaged();
      ceph_assert(r == 0);  // Unreachable, damaged() calls respawn()
    } else if (r == -EROFS) {
      // not fatal: fall through and keep booting
      dout(0) << "boot error forcing transition to read-only; MDS will try to continue" << dendl;
    } else {
      // Completely unexpected error, give up and die
      dout(0) << "boot_start encountered an error, failing" << dendl;
      suicide();
      return;
    }
  }

  ceph_assert(is_starting() || is_any_replay());

  switch(step) {
    case MDS_BOOT_INITIAL:
      {
        mdcache->init_layouts();

        // everything registered with this gather must complete before the
        // OPEN_ROOT step runs
        MDSGatherBuilder gather(g_ceph_context,
            new C_MDS_BootStart(this, MDS_BOOT_OPEN_ROOT));
        dout(2) << "Booting: " << step << ": opening inotable" << dendl;
        inotable->set_rank(whoami);
        inotable->load(gather.new_sub());

        dout(2) << "Booting: " << step << ": opening sessionmap" << dendl;
        sessionmap.set_rank(whoami);
        sessionmap.load(gather.new_sub());

        dout(2) << "Booting: " << step << ": opening mds log" << dendl;
        mdlog->open(gather.new_sub());

        if (is_starting()) {
          // starting: the purge queue open is part of the gather
          dout(2) << "Booting: " << step << ": opening purge queue" << dendl;
          purge_queue.open(new C_IO_Wrapper(this, gather.new_sub()));
        } else if (!standby_replaying) {
          // replay (not standby): opened without a completion, i.e. not
          // waited for here
          dout(2) << "Booting: " << step << ": opening purge queue (async)" << dendl;
          purge_queue.open(NULL);
          dout(2) << "Booting: " << step << ": loading open file table (async)" << dendl;
          mdcache->open_file_table.load(nullptr);
        }

        // only the rank acting as table server loads the snap table
        if (mdsmap->get_tableserver() == whoami) {
          dout(2) << "Booting: " << step << ": opening snap table" << dendl;
          snapserver->set_rank(whoami);
          snapserver->load(gather.new_sub());
        }

        gather.activate();
      }
      break;
    case MDS_BOOT_OPEN_ROOT:
      {
        dout(2) << "Booting: " << step << ": loading/discovering base inodes" << dendl;

        MDSGatherBuilder gather(g_ceph_context,
            new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));

        if (is_starting()) {
          // load mydir frag for the first log segment (creating subtree map)
          mdcache->open_mydir_frag(gather.new_sub());
        } else {
          mdcache->open_mydir_inode(gather.new_sub());
        }

        mdcache->create_global_snaprealm();

        if (whoami == mdsmap->get_root()) {  // load root inode off disk if we are auth
          mdcache->open_root_inode(gather.new_sub());
        } else if (is_any_replay()) {
          // replay.  make up fake root inode to start with
          mdcache->create_root_inode();
        }
        gather.activate();
      }
      break;
    case MDS_BOOT_PREPARE_LOG:
      if (is_any_replay()) {
        dout(2) << "Booting: " << step << ": replaying mds log" << dendl;
        MDSGatherBuilder gather(g_ceph_context,
            new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));

        if (!standby_replaying) {
          dout(2) << "Booting: " << step << ": waiting for purge queue recovered" << dendl;
          purge_queue.wait_for_recovery(new C_IO_Wrapper(this, gather.new_sub()));
        }

        mdlog->replay(gather.new_sub());
        gather.activate();
      } else {
        // starting: nothing to replay, boot sequence ends here
        dout(2) << "Booting: " << step << ": positioning at end of old mds log" << dendl;
        mdlog->append();
        starting_done();
      }
      break;
    case MDS_BOOT_REPLAY_DONE:
      ceph_assert(is_any_replay());

      // Sessiontable and inotable should be in sync after replay, validate
      // that they are consistent.
      validate_sessions();

      replay_done();
      break;
  }
}
1603
1604 void MDSRank::validate_sessions()
1605 {
1606   ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
1607   bool valid = true;
1608
1609   // Identify any sessions which have state inconsistent with other,
1610   // after they have been loaded from rados during startup.
1611   // Mitigate bugs like: http://tracker.ceph.com/issues/16842
1612   for (const auto &i : sessionmap.get_sessions()) {
1613     Session *session = i.second;
1614     interval_set<inodeno_t> badones;
1615     if (inotable->intersects_free(session->info.prealloc_inos, &badones)) {
1616       clog->error() << "client " << *session
1617                     << "loaded with preallocated inodes that are inconsistent with inotable";
1618       valid = false;
1619     }
1620   }
1621
1622   if (!valid) {
1623     damaged();
1624     ceph_assert(valid);
1625   }
1626 }
1627
1628 void MDSRank::starting_done()
1629 {
1630   dout(3) << "starting_done" << dendl;
1631   ceph_assert(is_starting());
1632   request_state(MDSMap::STATE_ACTIVE);
1633
1634   mdlog->start_new_segment();
1635
1636   // sync snaptable cache
1637   snapclient->sync(new C_MDSInternalNoop);
1638 }
1639
1640
1641 void MDSRank::calc_recovery_set()
1642 {
1643   // initialize gather sets
1644   set<mds_rank_t> rs;
1645   mdsmap->get_recovery_mds_set(rs);
1646   rs.erase(whoami);
1647   mdcache->set_recovery_set(rs);
1648
1649   dout(1) << " recovery set is " << rs << dendl;
1650 }
1651
1652
1653 void MDSRank::replay_start()
1654 {
1655   dout(1) << "replay_start" << dendl;
1656
1657   if (is_standby_replay())
1658     standby_replaying = true;
1659
1660   calc_recovery_set();
1661
1662   // Check if we need to wait for a newer OSD map before starting
1663   Context *fin = new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_INITIAL));
1664   bool const ready = objecter->wait_for_map(
1665       mdsmap->get_last_failure_osd_epoch(),
1666       fin);
1667
1668   if (ready) {
1669     delete fin;
1670     boot_start();
1671   } else {
1672     dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
1673             << " (which blacklists prior instance)" << dendl;
1674   }
1675 }
1676
1677
1678 class MDSRank::C_MDS_StandbyReplayRestartFinish : public MDSIOContext {
1679   uint64_t old_read_pos;
1680 public:
1681   C_MDS_StandbyReplayRestartFinish(MDSRank *mds_, uint64_t old_read_pos_) :
1682     MDSIOContext(mds_), old_read_pos(old_read_pos_) {}
1683   void finish(int r) override {
1684     mds->_standby_replay_restart_finish(r, old_read_pos);
1685   }
1686   void print(ostream& out) const override {
1687     out << "standby_replay_restart";
1688   }
1689 };
1690
1691 void MDSRank::_standby_replay_restart_finish(int r, uint64_t old_read_pos)
1692 {
1693   if (old_read_pos < mdlog->get_journaler()->get_trimmed_pos()) {
1694     dout(0) << "standby MDS fell behind active MDS journal's expire_pos, restarting" << dendl;
1695     respawn(); /* we're too far back, and this is easier than
1696                   trying to reset everything in the cache, etc */
1697   } else {
1698     mdlog->standby_trim_segments();
1699     boot_start(MDS_BOOT_PREPARE_LOG, r);
1700   }
1701 }
1702
1703 class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
1704 public:
1705   explicit C_MDS_StandbyReplayRestart(MDSRank *m) : MDSInternalContext(m) {}
1706   void finish(int r) override {
1707     ceph_assert(!r);
1708     mds->standby_replay_restart();
1709   }
1710 };
1711
1712 void MDSRank::standby_replay_restart()
1713 {
1714   if (standby_replaying) {
1715     /* Go around for another pass of replaying in standby */
1716     dout(5) << "Restarting replay as standby-replay" << dendl;
1717     mdlog->get_journaler()->reread_head_and_probe(
1718       new C_MDS_StandbyReplayRestartFinish(
1719         this,
1720         mdlog->get_journaler()->get_read_pos()));
1721   } else {
1722     /* We are transitioning out of standby: wait for OSD map update
1723        before making final pass */
1724     dout(1) << "standby_replay_restart (final takeover pass)" << dendl;
1725     Context *fin = new C_IO_Wrapper(this, new C_MDS_StandbyReplayRestart(this));
1726     bool ready = objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), fin);
1727     if (ready) {
1728       delete fin;
1729       mdlog->get_journaler()->reread_head_and_probe(
1730         new C_MDS_StandbyReplayRestartFinish(
1731           this,
1732           mdlog->get_journaler()->get_read_pos()));
1733
1734       dout(1) << " opening purge_queue (async)" << dendl;
1735       purge_queue.open(NULL);
1736       dout(1) << " opening open_file_table (async)" << dendl;
1737       mdcache->open_file_table.load(nullptr);
1738     } else {
1739       dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
1740               << " (which blacklists prior instance)" << dendl;
1741     }
1742   }
1743 }
1744
/**
 * Called when a journal replay pass has finished.
 *
 * Depending on state this either schedules the next standby-replay pass,
 * starts the final takeover pass, triggers a journal reformat, or -- once
 * replay is really complete -- makes the journal writeable, applies the
 * optional wipe/skip debugging options and requests the next state
 * (reconnect when we are the only rank, resolve otherwise).
 */
void MDSRank::replay_done()
{
  if (!standby_replaying) {
    dout(1) << "Finished replaying journal" << dendl;
  } else {
    dout(5) << "Finished replaying journal as standby-replay" << dendl;
  }

  if (is_standby_replay()) {
    // The replay was done in standby state, and we are still in that state
    ceph_assert(standby_replaying);
    dout(10) << "setting replay timer" << dendl;
    // go around again after mds_replay_interval
    timer.add_event_after(g_conf()->mds_replay_interval,
                          new C_MDS_StandbyReplayRestart(this));
    return;
  } else if (standby_replaying) {
    // The replay was done in standby state, we have now _left_ that state
    dout(10) << " last replay pass was as a standby; making final pass" << dendl;
    standby_replaying = false;
    standby_replay_restart();
    return;
  } else {
    // Replay is complete, journal read should be up to date
    ceph_assert(mdlog->get_journaler()->get_read_pos() == mdlog->get_journaler()->get_write_pos());
    ceph_assert(!is_standby_replay());

    // Reformat and come back here
    if (mdlog->get_journaler()->get_stream_format() < g_conf()->mds_journal_format) {
        dout(4) << "reformatting journal on standby-replay->replay transition" << dendl;
        // the REPLAY_DONE boot step calls replay_done() again afterwards
        mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
        return;
    }
  }

  dout(1) << "making mds journal writeable" << dendl;
  mdlog->get_journaler()->set_writeable();
  mdlog->get_journaler()->trim_tail();

  // only the table server rank owns the snap table
  if (mdsmap->get_tableserver() == whoami &&
      snapserver->upgrade_format()) {
    dout(1) << "upgrading snaptable format" << dendl;
    snapserver->save(new C_MDSInternalNoop);
  }

  // optional actions driven by the mds_wipe_* / mds_skip_ino options
  if (g_conf()->mds_wipe_sessions) {
    dout(1) << "wiping out client sessions" << dendl;
    sessionmap.wipe();
    sessionmap.save(new C_MDSInternalNoop);
  }
  if (g_conf()->mds_wipe_ino_prealloc) {
    dout(1) << "wiping out ino prealloc from sessions" << dendl;
    sessionmap.wipe_ino_prealloc();
    sessionmap.save(new C_MDSInternalNoop);
  }
  if (g_conf()->mds_skip_ino) {
    inodeno_t i = g_conf()->mds_skip_ino;
    dout(1) << "skipping " << i << " inodes" << dendl;
    inotable->skip_inos(i);
    inotable->save(new C_MDSInternalNoop);
  }

  if (mdsmap->get_num_in_mds() == 1 &&
      mdsmap->get_num_failed_mds() == 0) { // just me!
    dout(2) << "i am alone, moving to state reconnect" << dendl;
    request_state(MDSMap::STATE_RECONNECT);
    // sync snaptable cache
    snapclient->sync(new C_MDSInternalNoop);
  } else {
    dout(2) << "i am not alone, moving to state resolve" << dendl;
    request_state(MDSMap::STATE_RESOLVE);
  }
}
1817
1818 void MDSRank::reopen_log()
1819 {
1820   dout(1) << "reopen_log" << dendl;
1821   mdcache->rollback_uncommitted_fragments();
1822 }
1823
1824 void MDSRank::resolve_start()
1825 {
1826   dout(1) << "resolve_start" << dendl;
1827
1828   reopen_log();
1829
1830   mdcache->resolve_start(new C_MDS_VoidFn(this, &MDSRank::resolve_done));
1831   finish_contexts(g_ceph_context, waiting_for_resolve);
1832 }
1833
1834 void MDSRank::resolve_done()
1835 {
1836   dout(1) << "resolve_done" << dendl;
1837   request_state(MDSMap::STATE_RECONNECT);
1838   // sync snaptable cache
1839   snapclient->sync(new C_MDSInternalNoop);
1840 }
1841
1842 void MDSRank::reconnect_start()
1843 {
1844   dout(1) << "reconnect_start" << dendl;
1845
1846   if (last_state == MDSMap::STATE_REPLAY) {
1847     reopen_log();
1848   }
1849
1850   // Drop any blacklisted clients from the SessionMap before going
1851   // into reconnect, so that we don't wait for them.
1852   objecter->enable_blacklist_events();
1853   std::set<entity_addr_t> blacklist;
1854   epoch_t epoch = 0;
1855   objecter->with_osdmap([&blacklist, &epoch](const OSDMap& o) {
1856       o.get_blacklist(&blacklist);
1857       epoch = o.get_epoch();
1858   });
1859   auto killed = server->apply_blacklist(blacklist);
1860   dout(4) << "reconnect_start: killed " << killed << " blacklisted sessions ("
1861           << blacklist.size() << " blacklist entries, "
1862           << sessionmap.get_sessions().size() << ")" << dendl;
1863   if (killed) {
1864     set_osd_epoch_barrier(epoch);
1865   }
1866
1867   server->reconnect_clients(new C_MDS_VoidFn(this, &MDSRank::reconnect_done));
1868   finish_contexts(g_ceph_context, waiting_for_reconnect);
1869 }
1870 void MDSRank::reconnect_done()
1871 {
1872   dout(1) << "reconnect_done" << dendl;
1873   request_state(MDSMap::STATE_REJOIN);    // move to rejoin state
1874 }
1875
1876 void MDSRank::rejoin_joint_start()
1877 {
1878   dout(1) << "rejoin_joint_start" << dendl;
1879   mdcache->rejoin_send_rejoins();
1880 }
1881 void MDSRank::rejoin_start()
1882 {
1883   dout(1) << "rejoin_start" << dendl;
1884   mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
1885   finish_contexts(g_ceph_context, waiting_for_rejoin);
1886 }
1887 void MDSRank::rejoin_done()
1888 {
1889   dout(1) << "rejoin_done" << dendl;
1890   mdcache->show_subtrees();
1891   mdcache->show_cache();
1892
1893   if (mdcache->is_any_uncommitted_fragment()) {
1894     dout(1) << " waiting for uncommitted fragments" << dendl;
1895     MDSGatherBuilder gather(g_ceph_context, new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
1896     mdcache->wait_for_uncommitted_fragments(gather.get());
1897     gather.activate();
1898     return;
1899   }
1900
1901   // funny case: is our cache empty?  no subtrees?
1902   if (!mdcache->is_subtrees()) {
1903     if (whoami == 0) {
1904       // The root should always have a subtree!
1905       clog->error() << "No subtrees found for root MDS rank!";
1906       damaged();
1907       ceph_assert(mdcache->is_subtrees());
1908     } else {
1909       dout(1) << " empty cache, no subtrees, leaving cluster" << dendl;
1910       request_state(MDSMap::STATE_STOPPED);
1911     }
1912     return;
1913   }
1914
1915   if (replay_queue.empty() && !server->get_num_pending_reclaim()) {
1916     request_state(MDSMap::STATE_ACTIVE);
1917   } else {
1918     replaying_requests_done = replay_queue.empty();
1919     request_state(MDSMap::STATE_CLIENTREPLAY);
1920   }
1921 }
1922
1923 void MDSRank::clientreplay_start()
1924 {
1925   dout(1) << "clientreplay_start" << dendl;
1926   finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
1927   mdcache->start_files_to_recover();
1928   queue_one_replay();
1929 }
1930
1931 bool MDSRank::queue_one_replay()
1932 {
1933   if (!replay_queue.empty()) {
1934     queue_waiter(replay_queue.front());
1935     replay_queue.pop_front();
1936     return true;
1937   }
1938   if (!replaying_requests_done) {
1939     replaying_requests_done = true;
1940     mdlog->flush();
1941   }
1942   maybe_clientreplay_done();
1943   return false;
1944 }
1945
1946 void MDSRank::maybe_clientreplay_done()
1947 {
1948   if (is_clientreplay() && get_want_state() == MDSMap::STATE_CLIENTREPLAY) {
1949
1950     // don't go to active if there are session waiting for being reclaimed
1951     if (replaying_requests_done && !server->get_num_pending_reclaim()) {
1952       mdlog->wait_for_safe(new C_MDS_VoidFn(this, &MDSRank::clientreplay_done));
1953       return;
1954     }
1955
1956     dout(1) << " still have " << replay_queue.size() + (int)!replaying_requests_done
1957             << " requests need to be replayed, " << server->get_num_pending_reclaim()
1958             << " sessions need to be reclaimed" << dendl;
1959   }
1960 }
1961
1962 void MDSRank::clientreplay_done()
1963 {
1964   dout(1) << "clientreplay_done" << dendl;
1965   request_state(MDSMap::STATE_ACTIVE);
1966 }
1967
1968 void MDSRank::active_start()
1969 {
1970   dout(1) << "active_start" << dendl;
1971
1972   if (last_state == MDSMap::STATE_CREATING ||
1973       last_state == MDSMap::STATE_STARTING) {
1974     mdcache->open_root();
1975   }
1976
1977   mdcache->clean_open_file_lists();
1978   mdcache->export_remaining_imported_caps();
1979   finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
1980   mdcache->start_files_to_recover();
1981
1982   mdcache->reissue_all_caps();
1983
1984   finish_contexts(g_ceph_context, waiting_for_active);  // kick waiters
1985 }
1986
1987 void MDSRank::recovery_done(int oldstate)
1988 {
1989   dout(1) << "recovery_done -- successful recovery!" << dendl;
1990   ceph_assert(is_clientreplay() || is_active());
1991
1992   if (oldstate == MDSMap::STATE_CREATING)
1993     return;
1994
1995   mdcache->start_recovered_truncates();
1996   mdcache->start_purge_inodes();
1997   mdcache->do_file_recover();
1998
1999   // tell connected clients
2000   //bcast_mds_map();     // not anymore, they get this from the monitor
2001
2002   mdcache->populate_mydir();
2003 }
2004
2005 void MDSRank::creating_done()
2006 {
2007   dout(1)<< "creating_done" << dendl;
2008   request_state(MDSMap::STATE_ACTIVE);
2009   // sync snaptable cache
2010   snapclient->sync(new C_MDSInternalNoop);
2011 }
2012
2013 void MDSRank::boot_create()
2014 {
2015   dout(3) << "boot_create" << dendl;
2016
2017   MDSGatherBuilder fin(g_ceph_context, new C_MDS_VoidFn(this, &MDSRank::creating_done));
2018
2019   mdcache->init_layouts();
2020
2021   inotable->set_rank(whoami);
2022   sessionmap.set_rank(whoami);
2023
2024   // start with a fresh journal
2025   dout(10) << "boot_create creating fresh journal" << dendl;
2026   mdlog->create(fin.new_sub());
2027
2028   // open new journal segment, but do not journal subtree map (yet)
2029   mdlog->prepare_new_segment();
2030
2031   if (whoami == mdsmap->get_root()) {
2032     dout(3) << "boot_create creating fresh hierarchy" << dendl;
2033     mdcache->create_empty_hierarchy(fin.get());
2034   }
2035
2036   dout(3) << "boot_create creating mydir hierarchy" << dendl;
2037   mdcache->create_mydir_hierarchy(fin.get());
2038
2039   dout(3) << "boot_create creating global snaprealm" << dendl;
2040   mdcache->create_global_snaprealm();
2041
2042   // fixme: fake out inotable (reset, pretend loaded)
2043   dout(10) << "boot_create creating fresh inotable table" << dendl;
2044   inotable->reset();
2045   inotable->save(fin.new_sub());
2046
2047   // write empty sessionmap
2048   sessionmap.save(fin.new_sub());
2049
2050   // Create empty purge queue
2051   purge_queue.create(new C_IO_Wrapper(this, fin.new_sub()));
2052
2053   // initialize tables
2054   if (mdsmap->get_tableserver() == whoami) {
2055     dout(10) << "boot_create creating fresh snaptable" << dendl;
2056     snapserver->set_rank(whoami);
2057     snapserver->reset();
2058     snapserver->save(fin.new_sub());
2059   }
2060
2061   ceph_assert(g_conf()->mds_kill_create_at != 1);
2062
2063   // ok now journal it
2064   mdlog->journal_segment_subtree_map(fin.new_sub());
2065   mdlog->flush();
2066
2067   // Usually we do this during reconnect, but creation skips that.
2068   objecter->enable_blacklist_events();
2069
2070   fin.activate();
2071 }
2072
2073 void MDSRank::stopping_start()
2074 {
2075   dout(2) << "Stopping..." << dendl;
2076
2077   if (mdsmap->get_num_in_mds() == 1 && !sessionmap.empty()) {
2078     std::vector<Session*> victims;
2079     const auto& sessions = sessionmap.get_sessions();
2080     for (const auto& p : sessions)  {
2081       if (!p.first.is_client()) {
2082         continue;
2083       }
2084
2085       Session *s = p.second;
2086       victims.push_back(s);
2087     }
2088
2089     dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
2090     ceph_assert(!victims.empty());
2091
2092     C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
2093     for (const auto &s : victims) {
2094       std::stringstream ss;
2095       evict_client(s->get_client().v, false,
2096                    g_conf()->mds_session_blacklist_on_evict, ss, gather.new_sub());
2097     }
2098     gather.activate();
2099   }
2100
2101   mdcache->shutdown_start();
2102 }
2103
2104 void MDSRank::stopping_done()
2105 {
2106   dout(2) << "Finished stopping..." << dendl;
2107
2108   // tell monitor we shut down cleanly.
2109   request_state(MDSMap::STATE_STOPPED);
2110 }
2111
2112 void MDSRankDispatcher::handle_mds_map(
2113     const cref_t<MMDSMap> &m,
2114     const MDSMap &oldmap)
2115 {
2116   // I am only to be passed MDSMaps in which I hold a rank
2117   ceph_assert(whoami != MDS_RANK_NONE);
2118
2119   MDSMap::DaemonState oldstate = state;
2120   mds_gid_t mds_gid = mds_gid_t(monc->get_global_id());
2121   state = mdsmap->get_state_gid(mds_gid);
2122   if (state != oldstate) {
2123     last_state = oldstate;
2124     incarnation = mdsmap->get_inc_gid(mds_gid);
2125   }
2126
2127   version_t epoch = m->get_epoch();
2128
2129   // note source's map version
2130   if (m->get_source().is_mds() &&
2131       peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] < epoch) {
2132     dout(15) << " peer " << m->get_source()
2133              << " has mdsmap epoch >= " << epoch
2134              << dendl;
2135     peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] = epoch;
2136   }
2137
2138   // Validate state transitions while I hold a rank
2139   if (!MDSMap::state_transition_valid(oldstate, state)) {
2140     derr << "Invalid state transition " << ceph_mds_state_name(oldstate)
2141       << "->" << ceph_mds_state_name(state) << dendl;
2142     respawn();
2143   }
2144
2145   if (oldstate != state) {
2146     // update messenger.
2147     if (state == MDSMap::STATE_STANDBY_REPLAY) {
2148       dout(1) << "handle_mds_map i am now mds." << mds_gid << "." << incarnation
2149               << " replaying mds." << whoami << "." << incarnation << dendl;
2150       messenger->set_myname(entity_name_t::MDS(mds_gid));
2151     } else {
2152       dout(1) << "handle_mds_map i am now mds." << whoami << "." << incarnation << dendl;
2153       messenger->set_myname(entity_name_t::MDS(whoami));
2154     }
2155   }
2156
2157   // tell objecter my incarnation
2158   if (objecter->get_client_incarnation() != incarnation)
2159     objecter->set_client_incarnation(incarnation);
2160
2161   if (mdsmap->get_min_compat_client() < ceph_release_t::max &&
2162       oldmap.get_min_compat_client() != mdsmap->get_min_compat_client())
2163     server->update_required_client_features();
2164
2165   // for debug
2166   if (g_conf()->mds_dump_cache_on_map)
2167     mdcache->dump_cache();
2168
2169   cluster_degraded = mdsmap->is_degraded();
2170
2171   // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap.
2172   // the 'restart' set tracks ranks that have restarted since the old mdsmap
2173   set<mds_rank_t> restart;
2174   // replaying mds does not communicate with other ranks
2175   if (state >= MDSMap::STATE_RESOLVE) {
2176     // did someone fail?
2177     //   new down?
2178     set<mds_rank_t> olddown, down;
2179     oldmap.get_down_mds_set(&olddown);
2180     mdsmap->get_down_mds_set(&down);
2181     for (const auto& r : down) {
2182       if (oldmap.have_inst(r) && olddown.count(r) == 0) {
2183         messenger->mark_down_addrs(oldmap.get_addrs(r));
2184         handle_mds_failure(r);
2185       }
2186     }
2187
2188     // did someone fail?
2189     //   did their addr/inst change?
2190     set<mds_rank_t> up;
2191     mdsmap->get_up_mds_set(up);
2192     for (const auto& r : up) {
2193       auto& info = mdsmap->get_info(r);
2194       if (oldmap.have_inst(r)) {
2195         auto& oldinfo = oldmap.get_info(r);
2196         if (info.inc != oldinfo.inc) {
2197           messenger->mark_down_addrs(oldinfo.get_addrs());
2198           if (info.state == MDSMap::STATE_REPLAY ||
2199               info.state == MDSMap::STATE_RESOLVE) {
2200             restart.insert(r);
2201             handle_mds_failure(r);
2202           } else {
2203             ceph_assert(info.state == MDSMap::STATE_STARTING ||
2204                    info.state == MDSMap::STATE_ACTIVE);
2205             // -> stopped (missing) -> starting -> active
2206             restart.insert(r);
2207             mdcache->migrator->handle_mds_failure_or_stop(r);
2208             if (mdsmap->get_tableserver() == whoami)
2209               snapserver->handle_mds_failure_or_stop(r);
2210           }
2211         }
2212       } else {
2213         if (info.state == MDSMap::STATE_REPLAY ||
2214             info.state == MDSMap::STATE_RESOLVE) {
2215           // -> starting/creating (missing) -> active (missing) -> replay -> resolve
2216           restart.insert(r);
2217           handle_mds_failure(r);
2218         } else {
2219           ceph_assert(info.state == MDSMap::STATE_CREATING ||
2220                  info.state == MDSMap::STATE_STARTING ||
2221                  info.state == MDSMap::STATE_ACTIVE);
2222         }
2223       }
2224     }
2225   }
2226
2227   // did it change?
2228   if (oldstate != state) {
2229     dout(1) << "handle_mds_map state change "
2230             << ceph_mds_state_name(oldstate) << " --> "
2231             << ceph_mds_state_name(state) << dendl;
2232     beacon.set_want_state(*mdsmap, state);
2233
2234     if (oldstate == MDSMap::STATE_STANDBY_REPLAY) {
2235         dout(10) << "Monitor activated us! Deactivating replay loop" << dendl;
2236         assert (state == MDSMap::STATE_REPLAY);
2237     } else {
2238       // did i just recover?
2239       if ((is_active() || is_clientreplay()) &&
2240           (oldstate == MDSMap::STATE_CREATING ||
2241            oldstate == MDSMap::STATE_REJOIN ||
2242            oldstate == MDSMap::STATE_RECONNECT))
2243         recovery_done(oldstate);
2244
2245       if (is_active()) {
2246         active_start();
2247       } else if (is_any_replay()) {
2248         replay_start();
2249       } else if (is_resolve()) {
2250         resolve_start();
2251       } else if (is_reconnect()) {
2252         reconnect_start();
2253       } else if (is_rejoin()) {
2254         rejoin_start();
2255       } else if (is_clientreplay()) {
2256         clientreplay_start();
2257       } else if (is_creating()) {
2258         boot_create();
2259       } else if (is_starting()) {
2260         boot_start();
2261       } else if (is_stopping()) {
2262         ceph_assert(oldstate == MDSMap::STATE_ACTIVE);
2263         stopping_start();
2264       }
2265     }
2266   }
2267
2268   // RESOLVE
2269   // is someone else newly resolving?
2270   if (state >= MDSMap::STATE_RESOLVE) {
2271     // recover snaptable
2272     if (mdsmap->get_tableserver() == whoami) {
2273       if (oldstate < MDSMap::STATE_RESOLVE) {
2274         set<mds_rank_t> s;
2275         mdsmap->get_mds_set_lower_bound(s, MDSMap::STATE_RESOLVE);
2276         snapserver->finish_recovery(s);
2277       } else {
2278         set<mds_rank_t> old_set, new_set;
2279         oldmap.get_mds_set_lower_bound(old_set, MDSMap::STATE_RESOLVE);
2280         mdsmap->get_mds_set_lower_bound(new_set, MDSMap::STATE_RESOLVE);
2281         for (const auto& r : new_set) {
2282           if (r == whoami)
2283             continue; // not me
2284           if (!old_set.count(r) || restart.count(r)) {  // newly so?
2285             snapserver->handle_mds_recovery(r);
2286           }
2287         }
2288       }
2289     }
2290
2291     if ((!oldmap.is_resolving() || !restart.empty()) && mdsmap->is_resolving()) {
2292       set<mds_rank_t> resolve;
2293       mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
2294       dout(10) << " resolve set is " << resolve << dendl;
2295       calc_recovery_set();
2296       mdcache->send_resolves();
2297     }
2298   }
2299
2300   // REJOIN
2301   // is everybody finally rejoining?
2302   if (state >= MDSMap::STATE_REJOIN) {
2303     // did we start?
2304     if (!oldmap.is_rejoining() && mdsmap->is_rejoining())
2305       rejoin_joint_start();
2306
2307     // did we finish?
2308     if (g_conf()->mds_dump_cache_after_rejoin &&
2309         oldmap.is_rejoining() && !mdsmap->is_rejoining())
2310       mdcache->dump_cache();      // for DEBUG only
2311
2312     if (oldstate >= MDSMap::STATE_REJOIN ||
2313         oldstate == MDSMap::STATE_STARTING) {
2314       // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them.
2315       set<mds_rank_t> olddis, dis;
2316       oldmap.get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN);
2317       mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN);
2318       for (const auto& r : dis) {
2319         if (r == whoami)
2320           continue; // not me
2321         if (!olddis.count(r) || restart.count(r)) {  // newly so?
2322           mdcache->kick_discovers(r);
2323           mdcache->kick_open_ino_peers(r);
2324         }
2325       }
2326     }
2327   }
2328
2329   if (oldmap.is_degraded() && !cluster_degraded && state >= MDSMap::STATE_ACTIVE) {
2330     dout(1) << "cluster recovered." << dendl;
2331     auto it = waiting_for_active_peer.find(MDS_RANK_NONE);
2332     if (it != waiting_for_active_peer.end()) {
2333       queue_waiters(it->second);
2334       waiting_for_active_peer.erase(it);
2335     }
2336   }
2337
2338   // did someone go active?
2339   if (state >= MDSMap::STATE_CLIENTREPLAY &&
2340       oldstate >= MDSMap::STATE_CLIENTREPLAY) {
2341     set<mds_rank_t> oldactive, active;
2342     oldmap.get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY);
2343     mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
2344     for (const auto& r : active) {
2345       if (r == whoami)
2346         continue; // not me
2347       if (!oldactive.count(r) || restart.count(r))  // newly so?
2348         handle_mds_recovery(r);
2349     }
2350   }
2351
2352   if (is_clientreplay() || is_active() || is_stopping()) {
2353     // did anyone stop?
2354     set<mds_rank_t> oldstopped, stopped;
2355     oldmap.get_stopped_mds_set(oldstopped);
2356     mdsmap->get_stopped_mds_set(stopped);
2357     for (const auto& r : stopped)
2358       if (oldstopped.count(r) == 0) {     // newly so?
2359         mdcache->migrator->handle_mds_failure_or_stop(r);
2360         if (mdsmap->get_tableserver() == whoami)
2361           snapserver->handle_mds_failure_or_stop(r);
2362       }
2363   }
2364
2365   {
2366     map<epoch_t,MDSContext::vec >::iterator p = waiting_for_mdsmap.begin();
2367     while (p != waiting_for_mdsmap.end() && p->first <= mdsmap->get_epoch()) {
2368       MDSContext::vec ls;
2369       ls.swap(p->second);
2370       waiting_for_mdsmap.erase(p++);
2371       queue_waiters(ls);
2372     }
2373   }
2374
2375   if (is_active()) {
2376     // Before going active, set OSD epoch barrier to latest (so that
2377     // we don't risk handing out caps to clients with old OSD maps that
2378     // might not include barriers from the previous incarnation of this MDS)
2379     set_osd_epoch_barrier(objecter->with_osdmap(
2380                             std::mem_fn(&OSDMap::get_epoch)));
2381
2382     /* Now check if we should hint to the OSD that a read may follow */
2383     if (mdsmap->has_standby_replay(whoami))
2384       mdlog->set_write_iohint(0);
2385     else
2386       mdlog->set_write_iohint(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
2387   }
2388
2389   if (oldmap.get_max_mds() != mdsmap->get_max_mds()) {
2390     purge_queue.update_op_limit(*mdsmap);
2391   }
2392
2393   if (mdsmap->get_inline_data_enabled() && !oldmap.get_inline_data_enabled())
2394     dout(0) << "WARNING: inline_data support has been deprecated and will be removed in a future release" << dendl;
2395
2396   if (scrubstack->is_scrubbing()) {
2397     if (mdsmap->get_max_mds() > 1) {
2398       auto c = new C_MDSInternalNoop;
2399       scrubstack->scrub_abort(c);
2400     }
2401   }
2402   mdcache->handle_mdsmap(*mdsmap);
2403 }
2404
2405 void MDSRank::handle_mds_recovery(mds_rank_t who)
2406 {
2407   dout(5) << "handle_mds_recovery mds." << who << dendl;
2408
2409   mdcache->handle_mds_recovery(who);
2410
2411   queue_waiters(waiting_for_active_peer[who]);
2412   waiting_for_active_peer.erase(who);
2413 }
2414
2415 void MDSRank::handle_mds_failure(mds_rank_t who)
2416 {
2417   if (who == whoami) {
2418     dout(5) << "handle_mds_failure for myself; not doing anything" << dendl;
2419     return;
2420   }
2421   dout(5) << "handle_mds_failure mds." << who << dendl;
2422
2423   mdcache->handle_mds_failure(who);
2424
2425   if (mdsmap->get_tableserver() == whoami)
2426     snapserver->handle_mds_failure_or_stop(who);
2427
2428   snapclient->handle_mds_failure(who);
2429 }
2430
2431 void MDSRankDispatcher::handle_asok_command(
2432   std::string_view command,
2433   const cmdmap_t& cmdmap,
2434   Formatter *f,
2435   const bufferlist &inbl,
2436   std::function<void(int,const std::string&,bufferlist&)> on_finish)
2437 {
2438   int r = 0;
2439   stringstream ss;
2440   bufferlist outbl;
2441   if (command == "dump_ops_in_flight" ||
2442       command == "ops") {
2443     if (!op_tracker.dump_ops_in_flight(f)) {
2444       ss << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2445     }
2446   } else if (command == "dump_blocked_ops") {
2447     if (!op_tracker.dump_ops_in_flight(f, true)) {
2448       ss << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2449     }
2450   } else if (command == "dump_historic_ops") {
2451     if (!op_tracker.dump_historic_ops(f)) {
2452       ss << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2453     }
2454   } else if (command == "dump_historic_ops_by_duration") {
2455     if (!op_tracker.dump_historic_ops(f, true)) {
2456       ss << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2457     }
2458   } else if (command == "osdmap barrier") {
2459     int64_t target_epoch = 0;
2460     bool got_val = cmd_getval(cmdmap, "target_epoch", target_epoch);
2461
2462     if (!got_val) {
2463       ss << "no target epoch given";
2464       r = -EINVAL;
2465       goto out;
2466     }
2467     {
2468       std::lock_guard l(mds_lock);
2469       set_osd_epoch_barrier(target_epoch);
2470     }
2471     C_SaferCond cond;
2472     bool already_got = objecter->wait_for_map(target_epoch, &cond);
2473     if (!already_got) {
2474       dout(4) << __func__ << ": waiting for OSD epoch " << target_epoch << dendl;
2475       cond.wait();
2476     }
2477   } else if (command == "session ls" ||
2478              command == "client ls") {
2479     std::lock_guard l(mds_lock);
2480     std::vector<std::string> filter_args;
2481     cmd_getval(cmdmap, "filters", filter_args);
2482     SessionFilter filter;
2483     r = filter.parse(filter_args, &ss);
2484     if (r != 0) {
2485       goto out;
2486     }
2487     dump_sessions(filter, f);
2488   } else if (command == "session evict" ||
2489              command == "client evict") {
2490     std::lock_guard l(mds_lock);
2491     std::vector<std::string> filter_args;
2492     cmd_getval(cmdmap, "filters", filter_args);
2493
2494     SessionFilter filter;
2495     r = filter.parse(filter_args, &ss);
2496     if (r != 0) {
2497       r = -EINVAL;
2498       goto out;
2499     }
2500     evict_clients(filter, on_finish);
2501     return;
2502   } else if (command == "session kill") {
2503     std::string client_id;
2504     if (!cmd_getval(cmdmap, "client_id", client_id)) {
2505       ss << "Invalid client_id specified";
2506       r = -ENOENT;
2507       goto out;
2508     }
2509     std::lock_guard l(mds_lock);
2510     bool evicted = evict_client(strtol(client_id.c_str(), 0, 10), true,
2511         g_conf()->mds_session_blacklist_on_evict, ss);
2512     if (!evicted) {
2513       dout(15) << ss.str() << dendl;
2514       r = -ENOENT;
2515     }
2516   } else if (command == "session config" ||
2517              command == "client config") {
2518     int64_t client_id;
2519     std::string option;
2520     std::string value;
2521
2522     cmd_getval(cmdmap, "client_id", client_id);
2523     cmd_getval(cmdmap, "option", option);
2524     bool got_value = cmd_getval(cmdmap, "value", value);
2525
2526     std::lock_guard l(mds_lock);
2527     r = config_client(client_id, !got_value, option, value, ss);
2528   } else if (command == "scrub start" ||
2529              command == "scrub_start") {
2530     string path;
2531     string tag;
2532     vector<string> scrubop_vec;
2533     cmd_getval(cmdmap, "scrubops", scrubop_vec);
2534     cmd_getval(cmdmap, "path", path);
2535     cmd_getval(cmdmap, "tag", tag);
2536
2537     /* Multiple MDS scrub is not currently supported. See also: https://tracker.ceph.com/issues/12274 */
2538     if (mdsmap->get_max_mds() > 1) {
2539       ss << "Scrub is not currently supported for multiple active MDS. Please reduce max_mds to 1 and then scrub.";
2540       r = -EINVAL;
2541       goto out;
2542     }
2543
2544     finisher->queue(
2545       new LambdaContext(
2546         [this, on_finish, f, path, tag, scrubop_vec](int r) {
2547           command_scrub_start(
2548             f, path, tag, scrubop_vec,
2549             new LambdaContext(
2550               [on_finish](int r) {
2551                 bufferlist outbl;
2552                 on_finish(r, {}, outbl);
2553               }));
2554         }));
2555     return;
2556   } else if (command == "scrub abort") {
2557     finisher->queue(
2558       new LambdaContext(
2559         [this, on_finish, f](int r) {
2560           command_scrub_abort(
2561             f,
2562             new LambdaContext(
2563               [on_finish, f](int r) {
2564                 bufferlist outbl;
2565                 f->open_object_section("result");
2566                 f->dump_int("return_code", r);
2567                 f->close_section();
2568                 on_finish(r, {}, outbl);
2569               }));
2570         }));
2571     return;
2572   } else if (command == "scrub pause") {
2573     finisher->queue(
2574       new LambdaContext(
2575         [this, on_finish, f](int r) {
2576           command_scrub_pause(
2577             f,
2578             new LambdaContext(
2579               [on_finish, f](int r) {
2580                 bufferlist outbl;
2581                 f->open_object_section("result");
2582                 f->dump_int("return_code", r);
2583                 f->close_section();
2584                 on_finish(r, {}, outbl);
2585               }));
2586         }));
2587     return;
2588   } else if (command == "scrub resume") {
2589     command_scrub_resume(f);
2590   } else if (command == "scrub status") {
2591     command_scrub_status(f);
2592   } else if (command == "tag path") {
2593     string path;
2594     cmd_getval(cmdmap, "path", path);
2595     string tag;
2596     cmd_getval(cmdmap, "tag", tag);
2597     command_tag_path(f, path, tag);
2598   } else if (command == "flush_path") {
2599     string path;
2600     cmd_getval(cmdmap, "path", path);
2601     command_flush_path(f, path);
2602   } else if (command == "flush journal") {
2603     command_flush_journal(f);
2604   } else if (command == "get subtrees") {
2605     command_get_subtrees(f);
2606   } else if (command == "export dir") {
2607     string path;
2608     if(!cmd_getval(cmdmap, "path", path)) {
2609       ss << "malformed path";
2610       r = -EINVAL;
2611       goto out;
2612     }
2613     int64_t rank;
2614     if(!cmd_getval(cmdmap, "rank", rank)) {
2615       ss << "malformed rank";
2616       r = -EINVAL;
2617       goto out;
2618     }
2619     command_export_dir(f, path, (mds_rank_t)rank);
2620   } else if (command == "dump cache") {
2621     std::lock_guard l(mds_lock);
2622     string path;
2623     if (!cmd_getval(cmdmap, "path", path)) {
2624       r = mdcache->dump_cache(f);
2625     } else {
2626       r = mdcache->dump_cache(path);
2627     }
2628   } else if (command == "cache drop") {
2629     int64_t timeout = 0;
2630     cmd_getval(cmdmap, "timeout", timeout);
2631     finisher->queue(
2632       new LambdaContext(
2633         [this, on_finish, f, timeout](int r) {
2634           command_cache_drop(
2635             timeout, f,
2636             new LambdaContext(
2637               [on_finish](int r) {
2638                 bufferlist outbl;
2639                 on_finish(r, {}, outbl);
2640               }));
2641         }));
2642     return;
2643   } else if (command == "cache status") {
2644     std::lock_guard l(mds_lock);
2645     mdcache->cache_status(f);
2646   } else if (command == "dump tree") {
2647     command_dump_tree(cmdmap, ss, f);
2648   } else if (command == "dump loads") {
2649     std::lock_guard l(mds_lock);
2650     r = balancer->dump_loads(f);
2651   } else if (command == "dump snaps") {
2652     std::lock_guard l(mds_lock);
2653     string server;
2654     cmd_getval(cmdmap, "server", server);
2655     if (server == "--server") {
2656       if (mdsmap->get_tableserver() == whoami) {
2657         snapserver->dump(f);
2658       } else {
2659         r = -EXDEV;
2660         ss << "Not snapserver";
2661       }
2662     } else {
2663       r = snapclient->dump_cache(f);
2664     }
2665   } else if (command == "force_readonly") {
2666     std::lock_guard l(mds_lock);
2667     mdcache->force_readonly();
2668   } else if (command == "dirfrag split") {
2669     command_dirfrag_split(cmdmap, ss);
2670   } else if (command == "dirfrag merge") {
2671     command_dirfrag_merge(cmdmap, ss);
2672   } else if (command == "dirfrag ls") {
2673     command_dirfrag_ls(cmdmap, ss, f);
2674   } else if (command == "openfiles ls") {
2675     command_openfiles_ls(f);
2676   } else if (command == "dump inode") {
2677     command_dump_inode(f, cmdmap, ss);
2678   } else if (command == "damage ls") {
2679     std::lock_guard l(mds_lock);
2680     damage_table.dump(f);
2681   } else if (command == "damage rm") {
2682     std::lock_guard l(mds_lock);
2683     damage_entry_id_t id = 0;
2684     if (!cmd_getval(cmdmap, "damage_id", (int64_t&)id)) {
2685       r = -EINVAL;
2686       goto out;
2687     }
2688     damage_table.erase(id);
2689   } else {
2690     r = -ENOSYS;
2691   }
2692 out:
2693   on_finish(r, ss.str(), outbl);
2694 }
2695
2696 /**
2697  * This function drops the mds_lock, so don't do anything with
2698  * MDSRank after calling it (we could have gone into shutdown): just
2699  * send your result back to the calling client and finish.
2700  */
2701 void MDSRankDispatcher::evict_clients(
2702   const SessionFilter &filter,
2703   std::function<void(int,const std::string&,bufferlist&)> on_finish)
2704 {
2705   bufferlist outbl;
2706   if (is_any_replay()) {
2707     on_finish(-EAGAIN, "MDS is replaying log", outbl);
2708     return;
2709   }
2710
2711   std::vector<Session*> victims;
2712   const auto& sessions = sessionmap.get_sessions();
2713   for (const auto& p : sessions)  {
2714     if (!p.first.is_client()) {
2715       continue;
2716     }
2717
2718     Session *s = p.second;
2719
2720     if (filter.match(*s, std::bind(&Server::waiting_for_reconnect, server,
2721                                    std::placeholders::_1))) {
2722       victims.push_back(s);
2723     }
2724   }
2725
2726   dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
2727
2728   if (victims.empty()) {
2729     on_finish(0, {}, outbl);
2730     return;
2731   }
2732
2733   C_GatherBuilder gather(g_ceph_context,
2734                          new LambdaContext([on_finish](int r) {
2735                                              bufferlist bl;
2736                                              on_finish(r, {}, bl);
2737                                            }));
2738   for (const auto s : victims) {
2739     std::stringstream ss;
2740     evict_client(s->get_client().v, false,
2741                  g_conf()->mds_session_blacklist_on_evict, ss, gather.new_sub());
2742   }
2743   gather.activate();
2744 }
2745
2746 void MDSRankDispatcher::dump_sessions(const SessionFilter &filter, Formatter *f) const
2747 {
2748   // Dump sessions, decorated with recovery/replay status
2749   f->open_array_section("sessions");
2750   for (auto& [name, s] : sessionmap.get_sessions()) {
2751     if (!name.is_client()) {
2752       continue;
2753     }
2754
2755     if (!filter.match(*s, std::bind(&Server::waiting_for_reconnect, server, std::placeholders::_1))) {
2756       continue;
2757     }
2758
2759     f->dump_object("session", *s);
2760   }
2761   f->close_section(); // sessions
2762 }
2763
2764 void MDSRank::command_scrub_start(Formatter *f,
2765                                   std::string_view path, std::string_view tag,
2766                                   const vector<string>& scrubop_vec, Context *on_finish)
2767 {
2768   bool force = false;
2769   bool recursive = false;
2770   bool repair = false;
2771   for (auto &op : scrubop_vec) {
2772     if (op == "force")
2773       force = true;
2774     else if (op == "recursive")
2775       recursive = true;
2776     else if (op == "repair")
2777       repair = true;
2778   }
2779
2780   std::lock_guard l(mds_lock);
2781   mdcache->enqueue_scrub(path, tag, force, recursive, repair, f, on_finish);
2782   // scrub_dentry() finishers will dump the data for us; we're done!
2783 }
2784
2785 void MDSRank::command_tag_path(Formatter *f,
2786     std::string_view path, std::string_view tag)
2787 {
2788   C_SaferCond scond;
2789   {
2790     std::lock_guard l(mds_lock);
2791     mdcache->enqueue_scrub(path, tag, true, true, false, f, &scond);
2792   }
2793   scond.wait();
2794 }
2795
2796 void MDSRank::command_scrub_abort(Formatter *f, Context *on_finish) {
2797   std::lock_guard l(mds_lock);
2798   scrubstack->scrub_abort(on_finish);
2799 }
2800
2801 void MDSRank::command_scrub_pause(Formatter *f, Context *on_finish) {
2802   std::lock_guard l(mds_lock);
2803   scrubstack->scrub_pause(on_finish);
2804 }
2805
2806 void MDSRank::command_scrub_resume(Formatter *f) {
2807   std::lock_guard l(mds_lock);
2808   int r = scrubstack->scrub_resume();
2809
2810   f->open_object_section("result");
2811   f->dump_int("return_code", r);
2812   f->close_section();
2813 }
2814
2815 void MDSRank::command_scrub_status(Formatter *f) {
2816   std::lock_guard l(mds_lock);
2817   scrubstack->scrub_status(f);
2818 }
2819
2820 void MDSRank::command_flush_path(Formatter *f, std::string_view path)
2821 {
2822   C_SaferCond scond;
2823   {
2824     std::lock_guard l(mds_lock);
2825     mdcache->flush_dentry(path, &scond);
2826   }
2827   int r = scond.wait();
2828   f->open_object_section("results");
2829   f->dump_int("return_code", r);
2830   f->close_section(); // results
2831 }
2832
2833 // synchronous wrapper around "journal flush" asynchronous context
2834 // execution.
2835 void MDSRank::command_flush_journal(Formatter *f) {
2836   ceph_assert(f != NULL);
2837
2838   C_SaferCond cond;
2839   std::stringstream ss;
2840   {
2841     std::lock_guard locker(mds_lock);
2842     C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, this, &ss, &cond);
2843     flush_journal->send();
2844   }
2845   int r = cond.wait();
2846
2847   f->open_object_section("result");
2848   f->dump_string("message", ss.str());
2849   f->dump_int("return_code", r);
2850   f->close_section();
2851 }
2852
2853 void MDSRank::command_get_subtrees(Formatter *f)
2854 {
2855   ceph_assert(f != NULL);
2856   std::lock_guard l(mds_lock);
2857
2858   std::vector<CDir*> subtrees;
2859   mdcache->get_subtrees(subtrees);
2860
2861   f->open_array_section("subtrees");
2862   for (const auto& dir : subtrees) {
2863     f->open_object_section("subtree");
2864     {
2865       f->dump_bool("is_auth", dir->is_auth());
2866       f->dump_int("auth_first", dir->get_dir_auth().first);
2867       f->dump_int("auth_second", dir->get_dir_auth().second);
2868       f->dump_int("export_pin", dir->inode->get_export_pin());
2869       f->open_object_section("dir");
2870       dir->dump(f);
2871       f->close_section();
2872     }
2873     f->close_section();
2874   }
2875   f->close_section();
2876 }
2877
2878
2879 void MDSRank::command_export_dir(Formatter *f,
2880     std::string_view path,
2881     mds_rank_t target)
2882 {
2883   int r = _command_export_dir(path, target);
2884   f->open_object_section("results");
2885   f->dump_int("return_code", r);
2886   f->close_section(); // results
2887 }
2888
2889 int MDSRank::_command_export_dir(
2890     std::string_view path,
2891     mds_rank_t target)
2892 {
2893   std::lock_guard l(mds_lock);
2894   filepath fp(path);
2895
2896   if (target == whoami || !mdsmap->is_up(target) || !mdsmap->is_in(target)) {
2897     derr << "bad MDS target " << target << dendl;
2898     return -ENOENT;
2899   }
2900
2901   CInode *in = mdcache->cache_traverse(fp);
2902   if (!in) {
2903     derr << "Bath path '" << path << "'" << dendl;
2904     return -ENOENT;
2905   }
2906   CDir *dir = in->get_dirfrag(frag_t());
2907   if (!dir || !(dir->is_auth())) {
2908     derr << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl;
2909     return -EINVAL;
2910   }
2911
2912   mdcache->migrator->export_dir(dir, target);
2913   return 0;
2914 }
2915
2916 void MDSRank::command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f)
2917 {
2918   std::string root;
2919   int64_t depth;
2920   cmd_getval(cmdmap, "root", root);
2921   if (!cmd_getval(cmdmap, "depth", depth))
2922     depth = -1;
2923   std::lock_guard l(mds_lock);
2924   CInode *in = mdcache->cache_traverse(filepath(root.c_str()));
2925   if (!in) {
2926     ss << "root inode is not in cache";
2927     return;
2928   }
2929   f->open_array_section("inodes");
2930   mdcache->dump_tree(in, 0, depth, f);
2931   f->close_section();
2932 }
2933
2934 CDir *MDSRank::_command_dirfrag_get(
2935     const cmdmap_t &cmdmap,
2936     std::ostream &ss)
2937 {
2938   std::string path;
2939   bool got = cmd_getval(cmdmap, "path", path);
2940   if (!got) {
2941     ss << "missing path argument";
2942     return NULL;
2943   }
2944
2945   std::string frag_str;
2946   if (!cmd_getval(cmdmap, "frag", frag_str)) {
2947     ss << "missing frag argument";
2948     return NULL;
2949   }
2950
2951   CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
2952   if (!in) {
2953     // TODO really we should load something in if it's not in cache,
2954     // but the infrastructure is harder, and we might still be unable
2955     // to act on it if someone else is auth.
2956     ss << "directory '" << path << "' inode not in cache";
2957     return NULL;
2958   }
2959
2960   frag_t fg;
2961
2962   if (!fg.parse(frag_str.c_str())) {
2963     ss << "frag " << frag_str << " failed to parse";
2964     return NULL;
2965   }
2966
2967   CDir *dir = in->get_dirfrag(fg);
2968   if (!dir) {
2969     ss << "frag " << in->ino() << "/" << fg << " not in cache ("
2970           "use `dirfrag ls` to see if it should exist)";
2971     return NULL;
2972   }
2973
2974   if (!dir->is_auth()) {
2975     ss << "frag " << dir->dirfrag() << " not auth (auth = "
2976        << dir->authority() << ")";
2977     return NULL;
2978   }
2979
2980   return dir;
2981 }
2982
2983 bool MDSRank::command_dirfrag_split(
2984     cmdmap_t cmdmap,
2985     std::ostream &ss)
2986 {
2987   std::lock_guard l(mds_lock);
2988   int64_t by = 0;
2989   if (!cmd_getval(cmdmap, "bits", by)) {
2990     ss << "missing bits argument";
2991     return false;
2992   }
2993
2994   if (by <= 0) {
2995     ss << "must split by >0 bits";
2996     return false;
2997   }
2998
2999   CDir *dir = _command_dirfrag_get(cmdmap, ss);
3000   if (!dir) {
3001     return false;
3002   }
3003
3004   mdcache->split_dir(dir, by);
3005
3006   return true;
3007 }
3008
3009 bool MDSRank::command_dirfrag_merge(
3010     cmdmap_t cmdmap,
3011     std::ostream &ss)
3012 {
3013   std::lock_guard l(mds_lock);
3014   std::string path;
3015   bool got = cmd_getval(cmdmap, "path", path);
3016   if (!got) {
3017     ss << "missing path argument";
3018     return false;
3019   }
3020
3021   std::string frag_str;
3022   if (!cmd_getval(cmdmap, "frag", frag_str)) {
3023     ss << "missing frag argument";
3024     return false;
3025   }
3026
3027   CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
3028   if (!in) {
3029     ss << "directory '" << path << "' inode not in cache";
3030     return false;
3031   }
3032
3033   frag_t fg;
3034   if (!fg.parse(frag_str.c_str())) {
3035     ss << "frag " << frag_str << " failed to parse";
3036     return false;
3037   }
3038
3039   mdcache->merge_dir(in, fg);
3040
3041   return true;
3042 }
3043
3044 bool MDSRank::command_dirfrag_ls(
3045     cmdmap_t cmdmap,
3046     std::ostream &ss,
3047     Formatter *f)
3048 {
3049   std::lock_guard l(mds_lock);
3050   std::string path;
3051   bool got = cmd_getval(cmdmap, "path", path);
3052   if (!got) {
3053     ss << "missing path argument";
3054     return false;
3055   }
3056
3057   CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
3058   if (!in) {
3059     ss << "directory inode not in cache";
3060     return false;
3061   }
3062
3063   f->open_array_section("frags");
3064   frag_vec_t leaves;
3065   // NB using get_leaves_under instead of get_dirfrags to give
3066   // you the list of what dirfrags may exist, not which are in cache
3067   in->dirfragtree.get_leaves_under(frag_t(), leaves);
3068   for (const auto& leaf : leaves) {
3069     f->open_object_section("frag");
3070     f->dump_int("value", leaf.value());
3071     f->dump_int("bits", leaf.bits());
3072     CachedStackStringStream css;
3073     *css << std::hex << leaf.value() << "/" << std::dec << leaf.bits();
3074     f->dump_string("str", css->strv());
3075     f->close_section();
3076   }
3077   f->close_section();
3078
3079   return true;
3080 }
3081
3082 void MDSRank::command_openfiles_ls(Formatter *f)
3083 {
3084   std::lock_guard l(mds_lock);
3085   mdcache->dump_openfiles(f);
3086 }
3087
3088 void MDSRank::command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss)
3089 {
3090   std::lock_guard l(mds_lock);
3091   int64_t number;
3092   bool got = cmd_getval(cmdmap, "number", number);
3093   if (!got) {
3094     ss << "missing inode number";
3095     return;
3096   }
3097
3098   bool success = mdcache->dump_inode(f, number);
3099   if (!success) {
3100     ss << "dump inode failed, wrong inode number or the inode is not cached";
3101   }
3102 }
3103
3104 void MDSRank::dump_status(Formatter *f) const
3105 {
3106   if (state == MDSMap::STATE_REPLAY ||
3107       state == MDSMap::STATE_STANDBY_REPLAY) {
3108     mdlog->dump_replay_status(f);
3109   } else if (state == MDSMap::STATE_RESOLVE) {
3110     mdcache->dump_resolve_status(f);
3111   } else if (state == MDSMap::STATE_RECONNECT) {
3112     server->dump_reconnect_status(f);
3113   } else if (state == MDSMap::STATE_REJOIN) {
3114     mdcache->dump_rejoin_status(f);
3115   } else if (state == MDSMap::STATE_CLIENTREPLAY) {
3116     dump_clientreplay_status(f);
3117   }
3118   f->dump_float("rank_uptime", get_uptime().count());
3119 }
3120
3121 void MDSRank::dump_clientreplay_status(Formatter *f) const
3122 {
3123   f->open_object_section("clientreplay_status");
3124   f->dump_unsigned("clientreplay_queue", replay_queue.size());
3125   f->dump_unsigned("active_replay", mdcache->get_num_client_requests());
3126   f->close_section();
3127 }
3128
3129 void MDSRankDispatcher::update_log_config()
3130 {
3131   map<string,string> log_to_monitors;
3132   map<string,string> log_to_syslog;
3133   map<string,string> log_channel;
3134   map<string,string> log_prio;
3135   map<string,string> log_to_graylog;
3136   map<string,string> log_to_graylog_host;
3137   map<string,string> log_to_graylog_port;
3138   uuid_d fsid;
3139   string host;
3140
3141   if (parse_log_client_options(g_ceph_context, log_to_monitors, log_to_syslog,
3142                                log_channel, log_prio, log_to_graylog,
3143                                log_to_graylog_host, log_to_graylog_port,
3144                                fsid, host) == 0)
3145     clog->update_config(log_to_monitors, log_to_syslog,
3146                         log_channel, log_prio, log_to_graylog,
3147                         log_to_graylog_host, log_to_graylog_port,
3148                         fsid, host);
3149   dout(10) << __func__ << " log_to_monitors " << log_to_monitors << dendl;
3150 }
3151
3152 void MDSRank::create_logger()
3153 {
3154   dout(10) << "create_logger" << dendl;
3155   {
3156     PerfCountersBuilder mds_plb(g_ceph_context, "mds", l_mds_first, l_mds_last);
3157
3158     // super useful (high prio) perf stats
3159     mds_plb.add_u64_counter(l_mds_request, "request", "Requests", "req",
3160                             PerfCountersBuilder::PRIO_CRITICAL);
3161     mds_plb.add_time_avg(l_mds_reply_latency, "reply_latency", "Reply latency", "rlat",
3162                          PerfCountersBuilder::PRIO_CRITICAL);
3163     mds_plb.add_u64(l_mds_inodes, "inodes", "Inodes", "inos",
3164                     PerfCountersBuilder::PRIO_CRITICAL);
3165     mds_plb.add_u64_counter(l_mds_forward, "forward", "Forwarding request", "fwd",
3166                             PerfCountersBuilder::PRIO_INTERESTING);
3167     mds_plb.add_u64(l_mds_caps, "caps", "Capabilities", "caps",
3168                     PerfCountersBuilder::PRIO_INTERESTING);
3169     mds_plb.add_u64_counter(l_mds_exported_inodes, "exported_inodes", "Exported inodes",
3170                             "exi", PerfCountersBuilder::PRIO_INTERESTING);
3171     mds_plb.add_u64_counter(l_mds_imported_inodes, "imported_inodes", "Imported inodes",
3172                             "imi", PerfCountersBuilder::PRIO_INTERESTING);
3173
3174     // useful dir/inode/subtree stats
3175     mds_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3176     mds_plb.add_u64(l_mds_root_rfiles, "root_rfiles", "root inode rfiles");
3177     mds_plb.add_u64(l_mds_root_rbytes, "root_rbytes", "root inode rbytes");
3178     mds_plb.add_u64(l_mds_root_rsnaps, "root_rsnaps", "root inode rsnaps");
3179     mds_plb.add_u64_counter(l_mds_dir_fetch, "dir_fetch", "Directory fetch");
3180     mds_plb.add_u64_counter(l_mds_dir_commit, "dir_commit", "Directory commit");
3181     mds_plb.add_u64_counter(l_mds_dir_split, "dir_split", "Directory split");
3182     mds_plb.add_u64_counter(l_mds_dir_merge, "dir_merge", "Directory merge");
3183     mds_plb.add_u64(l_mds_inodes_pinned, "inodes_pinned", "Inodes pinned");
3184     mds_plb.add_u64(l_mds_inodes_expired, "inodes_expired", "Inodes expired");
3185     mds_plb.add_u64(l_mds_inodes_with_caps, "inodes_with_caps",
3186                     "Inodes with capabilities");
3187     mds_plb.add_u64(l_mds_subtrees, "subtrees", "Subtrees");
3188     mds_plb.add_u64(l_mds_load_cent, "load_cent", "Load per cent");
3189     mds_plb.add_u64_counter(l_mds_openino_dir_fetch, "openino_dir_fetch",
3190                             "OpenIno incomplete directory fetchings");
3191
3192     // low prio stats
3193     mds_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3194     mds_plb.add_u64_counter(l_mds_reply, "reply", "Replies");
3195     mds_plb.add_u64(l_mds_inodes_top, "inodes_top", "Inodes on top");
3196     mds_plb.add_u64(l_mds_inodes_bottom, "inodes_bottom", "Inodes on bottom");
3197     mds_plb.add_u64(
3198       l_mds_inodes_pin_tail, "inodes_pin_tail", "Inodes on pin tail");
3199     mds_plb.add_u64_counter(l_mds_traverse, "traverse", "Traverses");
3200     mds_plb.add_u64_counter(l_mds_traverse_hit, "traverse_hit", "Traverse hits");
3201     mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward",
3202                             "Traverse forwards");
3203     mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover",
3204                             "Traverse directory discovers");
3205     mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch",
3206                             "Traverse incomplete directory content fetchings");
3207     mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino",
3208                             "Traverse remote dentries");
3209     mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock",
3210                             "Traverse locks");
3211     mds_plb.add_u64(l_mds_dispatch_queue_len, "q", "Dispatch queue length");
3212     mds_plb.add_u64_counter(l_mds_exported, "exported", "Exports");
3213     mds_plb.add_u64_counter(l_mds_imported, "imported", "Imports");
3214     mds_plb.add_u64_counter(l_mds_openino_backtrace_fetch, "openino_backtrace_fetch",
3215                             "OpenIno backtrace fetchings");
3216     mds_plb.add_u64_counter(l_mds_openino_peer_discover, "openino_peer_discover",
3217                             "OpenIno peer inode discovers");
3218
3219     logger = mds_plb.create_perf_counters();
3220     g_ceph_context->get_perfcounters_collection()->add(logger);
3221   }
3222
3223   {
3224     PerfCountersBuilder mdm_plb(g_ceph_context, "mds_mem", l_mdm_first, l_mdm_last);
3225     mdm_plb.add_u64(l_mdm_ino, "ino", "Inodes", "ino",
3226                     PerfCountersBuilder::PRIO_INTERESTING);
3227     mdm_plb.add_u64(l_mdm_dn, "dn", "Dentries", "dn",
3228                     PerfCountersBuilder::PRIO_INTERESTING);
3229
3230     mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3231     mdm_plb.add_u64_counter(l_mdm_inoa, "ino+", "Inodes opened");
3232     mdm_plb.add_u64_counter(l_mdm_inos, "ino-", "Inodes closed");
3233     mdm_plb.add_u64(l_mdm_dir, "dir", "Directories");
3234     mdm_plb.add_u64_counter(l_mdm_dira, "dir+", "Directories opened");
3235     mdm_plb.add_u64_counter(l_mdm_dirs, "dir-", "Directories closed");
3236     mdm_plb.add_u64_counter(l_mdm_dna, "dn+", "Dentries opened");
3237     mdm_plb.add_u64_counter(l_mdm_dns, "dn-", "Dentries closed");
3238     mdm_plb.add_u64(l_mdm_cap, "cap", "Capabilities");
3239     mdm_plb.add_u64_counter(l_mdm_capa, "cap+", "Capabilities added");
3240     mdm_plb.add_u64_counter(l_mdm_caps, "cap-", "Capabilities removed");
3241     mdm_plb.add_u64(l_mdm_heap, "heap", "Heap size");
3242
3243     mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3244     mdm_plb.add_u64(l_mdm_rss, "rss", "RSS");
3245
3246     mlogger = mdm_plb.create_perf_counters();
3247     g_ceph_context->get_perfcounters_collection()->add(mlogger);
3248   }
3249
3250   mdlog->create_logger();
3251   server->create_logger();
3252   purge_queue.create_logger();
3253   sessionmap.register_perfcounters();
3254   mdcache->register_perfcounters();
3255 }
3256
3257 void MDSRank::check_ops_in_flight()
3258 {
3259   string summary;
3260   vector<string> warnings;
3261   int slow = 0;
3262   if (op_tracker.check_ops_in_flight(&summary, warnings, &slow)) {
3263     clog->warn() << summary;
3264     for (const auto& warning : warnings) {
3265       clog->warn() << warning;
3266     }
3267   }
3268
3269   // set mds slow request count
3270   mds_slow_req_count = slow;
3271   return;
3272 }
3273
3274 void MDSRankDispatcher::handle_osd_map()
3275 {
3276   if (is_active() &&
3277       mdsmap->get_tableserver() == whoami) {
3278     snapserver->check_osd_map(true);
3279   }
3280
3281   server->handle_osd_map();
3282
3283   purge_queue.update_op_limit(*mdsmap);
3284
3285   std::set<entity_addr_t> newly_blacklisted;
3286   objecter->consume_blacklist_events(&newly_blacklisted);
3287   auto epoch = objecter->with_osdmap([](const OSDMap &o){return o.get_epoch();});
3288   dout(4) << "handle_osd_map epoch " << epoch << ", "
3289           << newly_blacklisted.size() << " new blacklist entries" << dendl;
3290   auto victims = server->apply_blacklist(newly_blacklisted);
3291   if (victims) {
3292     set_osd_epoch_barrier(epoch);
3293   }
3294
3295
3296   // By default the objecter only requests OSDMap updates on use,
3297   // we would like to always receive the latest maps in order to
3298   // apply policy based on the FULL flag.
3299   objecter->maybe_request_map();
3300 }
3301
3302 int MDSRank::config_client(int64_t session_id, bool remove,
3303                            const std::string& option, const std::string& value,
3304                            std::ostream& ss)
3305 {
3306   Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
3307   if (!session) {
3308     ss << "session " << session_id << " not in sessionmap!";
3309     return -ENOENT;
3310   }
3311
3312   if (option == "timeout") {
3313     if (remove) {
3314       auto it = session->info.client_metadata.find("timeout");
3315       if (it == session->info.client_metadata.end()) {
3316         ss << "Nonexistent config: " << option;
3317         return -ENODATA;
3318       }
3319       session->info.client_metadata.erase(it);
3320     } else {
3321       char *end;
3322       strtoul(value.c_str(), &end, 0);
3323       if (*end) {
3324         ss << "Invalid config for timeout: " << value;
3325         return -EINVAL;
3326       }
3327       session->info.client_metadata[option] = value;
3328     }
3329     //sessionmap._mark_dirty(session, true);
3330   } else {
3331     ss << "Invalid config option: " << option;
3332     return -EINVAL;
3333   }
3334
3335   return 0;
3336 }
3337
3338 bool MDSRank::evict_client(int64_t session_id,
3339     bool wait, bool blacklist, std::ostream& err_ss,
3340     Context *on_killed)
3341 {
3342   ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
3343
3344   // Mutually exclusive args
3345   ceph_assert(!(wait && on_killed != nullptr));
3346
3347   if (is_any_replay()) {
3348     err_ss << "MDS is replaying log";
3349     return false;
3350   }
3351
3352   Session *session = sessionmap.get_session(
3353       entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
3354   if (!session) {
3355     err_ss << "session " << session_id << " not in sessionmap!";
3356     return false;
3357   }
3358
3359   auto& addr = session->info.inst.addr;
3360   {
3361     CachedStackStringStream css;
3362     *css << "Evicting " << (blacklist ? "(and blacklisting) " : "")
3363          << "client session " << session_id << " (" << addr << ")";
3364     dout(1) << css->strv() << dendl;
3365     clog->info() << css->strv();
3366   }
3367
3368   dout(4) << "Preparing blacklist command... (wait=" << wait << ")" << dendl;
3369   stringstream ss;
3370   ss << "{\"prefix\":\"osd blacklist\", \"blacklistop\":\"add\",";
3371   ss << "\"addr\":\"";
3372   ss << addr;
3373   ss << "\"}";
3374   std::string tmp = ss.str();
3375   std::vector<std::string> cmd = {tmp};
3376
3377   auto kill_client_session = [this, session_id, wait, on_killed](){
3378     ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
3379     Session *session = sessionmap.get_session(
3380         entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
3381     if (session) {
3382       if (on_killed || !wait) {
3383         server->kill_session(session, on_killed);
3384       } else {
3385         C_SaferCond on_safe;
3386         server->kill_session(session, &on_safe);
3387
3388         mds_lock.unlock();
3389         on_safe.wait();
3390         mds_lock.lock();
3391       }
3392     } else {
3393       dout(1) << "session " << session_id << " was removed while we waited "
3394       "for blacklist" << dendl;
3395
3396       // Even though it wasn't us that removed it, kick our completion
3397       // as the session has been removed.
3398       if (on_killed) {
3399         on_killed->complete(0);
3400       }
3401     }
3402   };
3403
3404   auto apply_blacklist = [this, cmd](std::function<void ()> fn){
3405     ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
3406
3407     Context *on_blacklist_done = new LambdaContext([this, fn](int r) {
3408       objecter->wait_for_latest_osdmap(
3409        new C_OnFinisher(
3410          new LambdaContext([this, fn](int r) {
3411               std::lock_guard l(mds_lock);
3412               auto epoch = objecter->with_osdmap([](const OSDMap &o){
3413                   return o.get_epoch();
3414               });
3415
3416               set_osd_epoch_barrier(epoch);
3417
3418               fn();
3419             }), finisher)
3420        );
3421     });
3422
3423     dout(4) << "Sending mon blacklist command: " << cmd[0] << dendl;
3424     monc->start_mon_command(cmd, {}, nullptr, nullptr, on_blacklist_done);
3425   };
3426
3427   if (wait) {
3428     if (blacklist) {
3429       C_SaferCond inline_ctx;
3430       apply_blacklist([&inline_ctx](){inline_ctx.complete(0);});
3431       mds_lock.unlock();
3432       inline_ctx.wait();
3433       mds_lock.lock();
3434     }
3435
3436     // We dropped mds_lock, so check that session still exists
3437     session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
3438           session_id));
3439     if (!session) {
3440       dout(1) << "session " << session_id << " was removed while we waited "
3441                  "for blacklist" << dendl;
3442       return true;
3443     }
3444     kill_client_session();
3445   } else {
3446     if (blacklist) {
3447       apply_blacklist(kill_client_session);
3448     } else {
3449       kill_client_session();
3450     }
3451   }
3452
3453   return true;
3454 }
3455
3456 void MDSRank::bcast_mds_map()
3457 {
3458   dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << dendl;
3459
3460   // share the map with mounted clients
3461   set<Session*> clients;
3462   sessionmap.get_client_session_set(clients);
3463   for (const auto &session : clients) {
3464     auto m = make_message<MMDSMap>(monc->get_fsid(), *mdsmap);
3465     session->get_connection()->send_message2(std::move(m));
3466   }
3467   last_client_mdsmap_bcast = mdsmap->get_epoch();
3468 }
3469
3470 MDSRankDispatcher::MDSRankDispatcher(
3471     mds_rank_t whoami_,
3472     ceph::mutex &mds_lock_,
3473     LogChannelRef &clog_,
3474     SafeTimer &timer_,
3475     Beacon &beacon_,
3476     std::unique_ptr<MDSMap> &mdsmap_,
3477     Messenger *msgr,
3478     MonClient *monc_,
3479     MgrClient *mgrc,
3480     Context *respawn_hook_,
3481     Context *suicide_hook_)
3482   : MDSRank(whoami_, mds_lock_, clog_, timer_, beacon_, mdsmap_,
3483             msgr, monc_, mgrc, respawn_hook_, suicide_hook_)
3484 {
3485     g_conf().add_observer(this);
3486 }
3487
3488 void MDSRank::command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish) {
3489   dout(20) << __func__ << dendl;
3490
3491   std::lock_guard locker(mds_lock);
3492   C_Drop_Cache *request = new C_Drop_Cache(server, mdcache, mdlog, this,
3493                                            timeout, f, on_finish);
3494   request->send();
3495 }
3496
3497 epoch_t MDSRank::get_osd_epoch() const
3498 {
3499   return objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
3500 }
3501
3502 const char** MDSRankDispatcher::get_tracked_conf_keys() const
3503 {
3504   static const char* KEYS[] = {
3505     "clog_to_graylog",
3506     "clog_to_graylog_host",
3507     "clog_to_graylog_port",
3508     "clog_to_monitors",
3509     "clog_to_syslog",
3510     "clog_to_syslog_facility",
3511     "clog_to_syslog_level",
3512     "fsid",
3513     "host",
3514     "mds_bal_fragment_dirs",
3515     "mds_bal_fragment_interval",
3516     "mds_cache_memory_limit",
3517     "mds_cache_mid",
3518     "mds_cache_reservation",
3519     "mds_cache_trim_decay_rate",
3520     "mds_cap_revoke_eviction_timeout",
3521     "mds_dump_cache_threshold_file",
3522     "mds_dump_cache_threshold_formatter",
3523     "mds_enable_op_tracker",
3524     "mds_health_cache_threshold",
3525     "mds_inject_migrator_session_race",
3526     "mds_log_pause",
3527     "mds_max_export_size",
3528     "mds_max_purge_files",
3529     "mds_forward_all_requests_to_auth",
3530     "mds_max_purge_ops",
3531     "mds_max_purge_ops_per_pg",
3532     "mds_max_snaps_per_dir",
3533     "mds_op_complaint_time",
3534     "mds_op_history_duration",
3535     "mds_op_history_size",
3536     "mds_op_log_threshold",
3537     "mds_recall_max_decay_rate",
3538     "mds_recall_warning_decay_rate",
3539     "mds_request_load_average_decay_rate",
3540     "mds_session_cache_liveness_decay_rate",
3541     "mds_replay_unsafe_with_closed_session",
3542     NULL
3543   };
3544   return KEYS;
3545 }
3546
3547 void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed)
3548 {
3549   // XXX with or without mds_lock!
3550
3551   if (changed.count("mds_op_complaint_time") || changed.count("mds_op_log_threshold")) {
3552     op_tracker.set_complaint_and_threshold(conf->mds_op_complaint_time, conf->mds_op_log_threshold);
3553   }
3554   if (changed.count("mds_op_history_size") || changed.count("mds_op_history_duration")) {
3555     op_tracker.set_history_size_and_duration(conf->mds_op_history_size, conf->mds_op_history_duration);
3556   }
3557   if (changed.count("mds_enable_op_tracker")) {
3558     op_tracker.set_tracking(conf->mds_enable_op_tracker);
3559   }
3560   if (changed.count("clog_to_monitors") ||
3561       changed.count("clog_to_syslog") ||
3562       changed.count("clog_to_syslog_level") ||
3563       changed.count("clog_to_syslog_facility") ||
3564       changed.count("clog_to_graylog") ||
3565       changed.count("clog_to_graylog_host") ||
3566       changed.count("clog_to_graylog_port") ||
3567       changed.count("host") ||
3568       changed.count("fsid")) {
3569     update_log_config();
3570   }
3571
3572   finisher->queue(new LambdaContext([this, changed](int) {
3573     std::scoped_lock lock(mds_lock);
3574
3575     if (changed.count("mds_log_pause") && !g_conf()->mds_log_pause) {
3576       mdlog->kick_submitter();
3577     }
3578     sessionmap.handle_conf_change(changed);
3579     server->handle_conf_change(changed);
3580     mdcache->handle_conf_change(changed, *mdsmap);
3581     purge_queue.handle_conf_change(changed, *mdsmap);
3582   }));
3583 }
3584
3585 void MDSRank::get_task_status(std::map<std::string, std::string> *status) {
3586   dout(20) << __func__ << dendl;
3587
3588   // scrub summary for now..
3589   std::string_view scrub_summary = scrubstack->scrub_summary();
3590   status->emplace(SCRUB_STATUS_KEY, std::move(scrub_summary));
3591 }
3592
3593 void MDSRank::schedule_update_timer_task() {
3594   dout(20) << __func__ << dendl;
3595
3596   timer.add_event_after(g_conf().get_val<double>("mds_task_status_update_interval"),
3597                         new LambdaContext([this](int) {
3598                             send_task_status();
3599                           }));
3600 }
3601
3602 void MDSRank::send_task_status() {
3603   std::map<std::string, std::string> status;
3604   get_task_status(&status);
3605
3606   if (!status.empty()) {
3607     dout(20) << __func__ << ": updating " << status.size() << " status keys" << dendl;
3608
3609     int r = mgrc->service_daemon_update_task_status(std::move(status));
3610     if (r < 0) {
3611       derr << ": failed to update service daemon status: " << cpp_strerror(r) << dendl;
3612     }
3613   }
3614
3615   schedule_update_timer_task();
3616 }