ceph/src/mds/MDSRank.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2015 Red Hat
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #include <string_view>
  16
  17 #include "common/debug.h"
  18 #include "common/errno.h"
  19
  20 #include "messages/MClientRequestForward.h"
  21 #include "messages/MMDSLoadTargets.h"
  22 #include "messages/MMDSTableRequest.h"
  23
  24 #include "mgr/MgrClient.h"
  25
  26 #include "MDSDaemon.h"
  27 #include "MDSMap.h"
  28 #include "SnapClient.h"
  29 #include "SnapServer.h"
  30 #include "MDBalancer.h"
  31 #include "Migrator.h"
  32 #include "Locker.h"
  33 #include "InoTable.h"
  34 #include "mon/MonClient.h"
  35 #include "common/HeartbeatMap.h"
  36 #include "ScrubStack.h"
  37
  38
  39 #include "MDSRank.h"
  40
// Logging configuration for this translation unit: log statements use the
// global context and the ceph_subsys_mds subsystem.
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
// Each log line starts with "mds.<whoami>.<incarnation> ". The macro expands
// textually, so `whoami` and `incarnation` must be visible wherever a log
// statement appears (the helper classes below keep their own copies for
// exactly this reason).
#define dout_prefix *_dout << "mds." << whoami << '.' << incarnation << ' '
// Allow unqualified use of cmd_getval in this file.
using TOPNSPC::common::cmd_getval;
  46 class C_Flush_Journal : public MDSInternalContext {
  47 public:
  48   C_Flush_Journal(MDCache *mdcache, MDLog *mdlog, MDSRank *mds,
  49                   std::ostream *ss, Context *on_finish)
  50     : MDSInternalContext(mds),
  51       mdcache(mdcache), mdlog(mdlog), ss(ss), on_finish(on_finish),
  52       whoami(mds->whoami), incarnation(mds->incarnation) {
  53   }
  54
  55   void send() {
  56     assert(ceph_mutex_is_locked(mds->mds_lock));
  57
  58     dout(20) << __func__ << dendl;
  59
  60     if (mdcache->is_readonly()) {
  61       dout(5) << __func__ << ": read-only FS" << dendl;
  62       complete(-EROFS);
  63       return;
  64     }
  65
  66     if (!mds->is_active()) {
  67       dout(5) << __func__ << ": MDS not active, no-op" << dendl;
  68       complete(0);
  69       return;
  70     }
  71
  72     flush_mdlog();
  73   }
  74
  75 private:
  76
  77   void flush_mdlog() {
  78     dout(20) << __func__ << dendl;
  79
  80     // I need to seal off the current segment, and then mark all
  81     // previous segments for expiry
  82     mdlog->start_new_segment();
  83
  84     Context *ctx = new LambdaContext([this](int r) {
  85         handle_flush_mdlog(r);
  86       });
  87
  88     // Flush initially so that all the segments older than our new one
  89     // will be elegible for expiry
  90     mdlog->flush();
  91     mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
  92   }
  93
  94   void handle_flush_mdlog(int r) {
  95     dout(20) << __func__ << ": r=" << r << dendl;
  96
  97     if (r != 0) {
  98       *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
  99       complete(r);
 100       return;
 101     }
 102
 103     clear_mdlog();
 104   }
 105
 106   void clear_mdlog() {
 107     dout(20) << __func__ << dendl;
 108
 109     Context *ctx = new LambdaContext([this](int r) {
 110         handle_clear_mdlog(r);
 111       });
 112
 113     // Because we may not be the last wait_for_safe context on MDLog,
 114     // and subsequent contexts might wake up in the middle of our
 115     // later trim_all and interfere with expiry (by e.g. marking
 116     // dirs/dentries dirty on previous log segments), we run a second
 117     // wait_for_safe here. See #10368
 118     mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
 119   }
 120
 121   void handle_clear_mdlog(int r) {
 122     dout(20) << __func__ << ": r=" << r << dendl;
 123
 124     if (r != 0) {
 125       *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
 126       complete(r);
 127       return;
 128     }
 129
 130     trim_mdlog();
 131   }
 132
 133   void trim_mdlog() {
 134     // Put all the old log segments into expiring or expired state
 135     dout(5) << __func__ << ": beginning segment expiry" << dendl;
 136
 137     int ret = mdlog->trim_all();
 138     if (ret != 0) {
 139       *ss << "Error " << ret << " (" << cpp_strerror(ret) << ") while trimming log";
 140       complete(ret);
 141       return;
 142     }
 143
 144     expire_segments();
 145   }
 146
 147   void expire_segments() {
 148     dout(20) << __func__ << dendl;
 149
 150     // Attach contexts to wait for all expiring segments to expire
 151     MDSGatherBuilder *expiry_gather = new MDSGatherBuilder(g_ceph_context);
 152
 153     const auto &expiring_segments = mdlog->get_expiring_segments();
 154     for (auto p : expiring_segments) {
 155       p->wait_for_expiry(expiry_gather->new_sub());
 156     }
 157     dout(5) << __func__ << ": waiting for " << expiry_gather->num_subs_created()
 158             << " segments to expire" << dendl;
 159
 160     if (!expiry_gather->has_subs()) {
 161       trim_segments();
 162       delete expiry_gather;
 163       return;
 164     }
 165
 166     Context *ctx = new LambdaContext([this](int r) {
 167         handle_expire_segments(r);
 168       });
 169     expiry_gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
 170     expiry_gather->activate();
 171   }
 172
 173   void handle_expire_segments(int r) {
 174     dout(20) << __func__ << ": r=" << r << dendl;
 175
 176     ceph_assert(r == 0); // MDLog is not allowed to raise errors via
 177                          // wait_for_expiry
 178     trim_segments();
 179   }
 180
 181   void trim_segments() {
 182     dout(20) << __func__ << dendl;
 183
 184     Context *ctx = new C_OnFinisher(new LambdaContext([this](int) {
 185           std::lock_guard locker(mds->mds_lock);
 186           trim_expired_segments();
 187         }), mds->finisher);
 188     ctx->complete(0);
 189   }
 190
 191   void trim_expired_segments() {
 192     dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now "
 193             << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
 194             << mdlog->get_journaler()->get_trimmed_pos() << dendl;
 195
 196     // Now everyone I'm interested in is expired
 197     mdlog->trim_expired_segments();
 198
 199     dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now "
 200             << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
 201             << mdlog->get_journaler()->get_trimmed_pos() << dendl;
 202
 203     write_journal_head();
 204   }
 205
 206   void write_journal_head() {
 207     dout(20) << __func__ << dendl;
 208
 209     Context *ctx = new LambdaContext([this](int r) {
 210         std::lock_guard locker(mds->mds_lock);
 211         handle_write_head(r);
 212       });
 213     // Flush the journal header so that readers will start from after
 214     // the flushed region
 215     mdlog->get_journaler()->write_head(ctx);
 216   }
 217
 218   void handle_write_head(int r) {
 219     if (r != 0) {
 220       *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
 221     } else {
 222       dout(5) << __func__ << ": write_head complete, all done!" << dendl;
 223     }
 224
 225     complete(r);
 226   }
 227
 228   void finish(int r) override {
 229     dout(20) << __func__ << ": r=" << r << dendl;
 230     on_finish->complete(r);
 231   }
 232
 233   MDCache *mdcache;
 234   MDLog *mdlog;
 235   std::ostream *ss;
 236   Context *on_finish;
 237
 238   // so as to use dout
 239   mds_rank_t whoami;
 240   int incarnation;
 241 };
 242
 243 class C_Drop_Cache : public MDSInternalContext {
 244 public:
 245   C_Drop_Cache(Server *server, MDCache *mdcache, MDLog *mdlog,
 246                MDSRank *mds, uint64_t recall_timeout,
 247                Formatter *f, Context *on_finish)
 248     : MDSInternalContext(mds),
 249       server(server), mdcache(mdcache), mdlog(mdlog),
 250       recall_timeout(recall_timeout), recall_start(mono_clock::now()),
 251       f(f), on_finish(on_finish),
 252       whoami(mds->whoami), incarnation(mds->incarnation) {
 253   }
 254
 255   void send() {
 256     // not really a hard requirement here, but lets ensure this in
 257     // case we change the logic here.
 258     assert(ceph_mutex_is_locked(mds->mds_lock));
 259
 260     dout(20) << __func__ << dendl;
 261     f->open_object_section("result");
 262     recall_client_state();
 263   }
 264
 265 private:
 266   // context which completes itself (with -ETIMEDOUT) after a specified
 267   // timeout or when explicitly completed, whichever comes first. Note
 268   // that the context does not detroy itself after completion -- it
 269   // needs to be explicitly freed.
 270   class C_ContextTimeout : public MDSInternalContext {
 271   public:
 272     C_ContextTimeout(MDSRank *mds, uint64_t timeout, Context *on_finish)
 273       : MDSInternalContext(mds),
 274         timeout(timeout),
 275         on_finish(on_finish) {
 276     }
 277     ~C_ContextTimeout() {
 278       ceph_assert(timer_task == nullptr);
 279     }
 280
 281     void start_timer() {
 282       if (!timeout) {
 283         return;
 284       }
 285
 286       timer_task = new LambdaContext([this](int) {
 287           timer_task = nullptr;
 288           complete(-ETIMEDOUT);
 289         });
 290       mds->timer.add_event_after(timeout, timer_task);
 291     }
 292
 293     void finish(int r) override {
 294       Context *ctx = nullptr;
 295       {
 296         std::lock_guard locker(lock);
 297         std::swap(on_finish, ctx);
 298       }
 299       if (ctx != nullptr) {
 300         ctx->complete(r);
 301       }
 302     }
 303     void complete(int r) override {
 304       if (timer_task != nullptr) {
 305         mds->timer.cancel_event(timer_task);
 306       }
 307
 308       finish(r);
 309     }
 310
 311     uint64_t timeout;
 312     ceph::mutex lock = ceph::make_mutex("mds::context::timeout");
 313     Context *on_finish = nullptr;
 314     Context *timer_task = nullptr;
 315   };
 316
 317   auto do_trim() {
 318     auto [throttled, count] = mdcache->trim(UINT64_MAX);
 319     dout(10) << __func__
 320              << (throttled ? " (throttled)" : "")
 321              << " trimmed " << count << " caps" << dendl;
 322     dentries_trimmed += count;
 323     return std::make_pair(throttled, count);
 324   }
 325
 326   void recall_client_state() {
 327     dout(20) << __func__ << dendl;
 328     auto now = mono_clock::now();
 329     auto duration = std::chrono::duration<double>(now-recall_start).count();
 330
 331     MDSGatherBuilder *gather = new MDSGatherBuilder(g_ceph_context);
 332     auto flags = Server::RecallFlags::STEADY|Server::RecallFlags::TRIM;
 333     auto [throttled, count] = server->recall_client_state(gather, flags);
 334     dout(10) << __func__
 335              << (throttled ? " (throttled)" : "")
 336              << " recalled " << count << " caps" << dendl;
 337
 338     caps_recalled += count;
 339     if ((throttled || count > 0) && (recall_timeout == 0 || duration < recall_timeout)) {
 340       C_ContextTimeout *ctx = new C_ContextTimeout(
 341         mds, 1, new LambdaContext([this](int r) {
 342           recall_client_state();
 343       }));
 344       ctx->start_timer();
 345       gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
 346       gather->activate();
 347       mdlog->flush(); /* use down-time to incrementally flush log */
 348       do_trim(); /* use down-time to incrementally trim cache */
 349     } else {
 350       if (!gather->has_subs()) {
 351         delete gather;
 352         return handle_recall_client_state(0);
 353       } else if (recall_timeout > 0 && duration > recall_timeout) {
 354         gather->set_finisher(new C_MDSInternalNoop);
 355         gather->activate();
 356         return handle_recall_client_state(-ETIMEDOUT);
 357       } else {
 358         uint64_t remaining = (recall_timeout == 0 ? 0 : recall_timeout-duration);
 359         C_ContextTimeout *ctx = new C_ContextTimeout(
 360           mds, remaining, new LambdaContext([this](int r) {
 361               handle_recall_client_state(r);
 362             }));
 363
 364         ctx->start_timer();
 365         gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
 366         gather->activate();
 367       }
 368     }
 369   }
 370
 371   void handle_recall_client_state(int r) {
 372     dout(20) << __func__ << ": r=" << r << dendl;
 373
 374     // client recall section
 375     f->open_object_section("client_recall");
 376     f->dump_int("return_code", r);
 377     f->dump_string("message", cpp_strerror(r));
 378     f->dump_int("recalled", caps_recalled);
 379     f->close_section();
 380
 381     // we can still continue after recall timeout
 382     flush_journal();
 383   }
 384
 385   void flush_journal() {
 386     dout(20) << __func__ << dendl;
 387
 388     Context *ctx = new LambdaContext([this](int r) {
 389         handle_flush_journal(r);
 390       });
 391
 392     C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, mds, &ss, ctx);
 393     flush_journal->send();
 394   }
 395
 396   void handle_flush_journal(int r) {
 397     dout(20) << __func__ << ": r=" << r << dendl;
 398
 399     if (r != 0) {
 400       cmd_err(f, ss.str());
 401       complete(r);
 402       return;
 403     }
 404
 405     // journal flush section
 406     f->open_object_section("flush_journal");
 407     f->dump_int("return_code", r);
 408     f->dump_string("message", ss.str());
 409     f->close_section();
 410
 411     trim_cache();
 412   }
 413
 414   void trim_cache() {
 415     dout(20) << __func__ << dendl;
 416
 417     auto [throttled, count] = do_trim();
 418     if (throttled && count > 0) {
 419       auto timer = new LambdaContext([this](int) {
 420         trim_cache();
 421       });
 422       mds->timer.add_event_after(1.0, timer);
 423     } else {
 424       cache_status();
 425     }
 426   }
 427
 428   void cache_status() {
 429     dout(20) << __func__ << dendl;
 430
 431     f->open_object_section("trim_cache");
 432     f->dump_int("trimmed", dentries_trimmed);
 433     f->close_section();
 434
 435     // cache status section
 436     mdcache->cache_status(f);
 437
 438     complete(0);
 439   }
 440
 441   void finish(int r) override {
 442     dout(20) << __func__ << ": r=" << r << dendl;
 443
 444     auto d = std::chrono::duration<double>(mono_clock::now()-recall_start);
 445     f->dump_float("duration", d.count());
 446
 447     f->close_section();
 448     on_finish->complete(r);
 449   }
 450
 451   Server *server;
 452   MDCache *mdcache;
 453   MDLog *mdlog;
 454   uint64_t recall_timeout;
 455   mono_time recall_start;
 456   Formatter *f;
 457   Context *on_finish;
 458
 459   int retval = 0;
 460   std::stringstream ss;
 461   uint64_t caps_recalled = 0;
 462   uint64_t dentries_trimmed = 0;
 463
 464   // so as to use dout
 465   mds_rank_t whoami;
 466   int incarnation;
 467
 468   void cmd_err(Formatter *f, std::string_view err) {
 469     f->reset();
 470     f->open_object_section("result");
 471     f->dump_string("error", err);
 472     f->close_section();
 473   }
 474 };
 475
// Construct the rank: store the references and pointers handed in by the
// caller, create the Objecter and purge queue in the initializer list, then
// build the per-rank subsystems (finisher, cache, log, balancer, scrub stack,
// tables, server, locker) in the body.
//
// respawn_hook_ / suicide_hook_ are the contexts completed by respawn() and
// suicide().
MDSRank::MDSRank(
    mds_rank_t whoami_,
    ceph::mutex &mds_lock_,
    LogChannelRef &clog_,
    SafeTimer &timer_,
    Beacon &beacon_,
    std::unique_ptr<MDSMap>& mdsmap_,
    Messenger *msgr,
    MonClient *monc_,
    MgrClient *mgrc,
    Context *respawn_hook_,
    Context *suicide_hook_) :
    cct(msgr->cct), mds_lock(mds_lock_), clog(clog_),
    timer(timer_), mdsmap(mdsmap_),
    objecter(new Objecter(g_ceph_context, msgr, monc_, nullptr, 0, 0)),
    damage_table(whoami_), sessionmap(this),
    op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker,
               g_conf()->osd_num_op_tracker_shard),
    progress_thread(this), whoami(whoami_),
    // The context handed to the purge queue takes mds_lock and forwards the
    // code it is completed with to handle_write_error().
    purge_queue(g_ceph_context, whoami_,
      mdsmap_->get_metadata_pool(), objecter,
      new LambdaContext([this](int r) {
          std::lock_guard l(mds_lock);
          handle_write_error(r);
        }
      )
    ),
    beacon(beacon_),
    messenger(msgr), monc(monc_), mgrc(mgrc),
    respawn_hook(respawn_hook_),
    suicide_hook(suicide_hook_),
    starttime(mono_clock::now())
{
  // Register the constructing thread with the heartbeat map; the handle is
  // removed again in shutdown() or the destructor.
  hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self());

  purge_queue.update_op_limit(*mdsmap);

  objecter->unset_honor_pool_full();

  // Created here, started later in MDSRankDispatcher::init().
  finisher = new Finisher(cct, "MDSRank", "MR_Finisher");

  mdcache = new MDCache(this, purge_queue);
  mdlog = new MDLog(this);
  balancer = new MDBalancer(this, messenger, monc);

  scrubstack = new ScrubStack(mdcache, clog, finisher);

  inotable = new InoTable(this);
  snapserver = new SnapServer(this, monc);
  snapclient = new SnapClient(this);

  server = new Server(this);
  locker = new Locker(this, mdcache);

  // Apply the configured op-tracker complaint and history settings.
  op_tracker.set_complaint_and_threshold(cct->_conf->mds_op_complaint_time,
                                         cct->_conf->mds_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->mds_op_history_size,
                                           cct->_conf->mds_op_history_duration);

  schedule_update_timer_task();
}
 537
 538 MDSRank::~MDSRank()
 539 {
 540   if (hb) {
 541     g_ceph_context->get_heartbeat_map()->remove_worker(hb);
 542   }
 543
 544   if (scrubstack) { delete scrubstack; scrubstack = NULL; }
 545   if (mdcache) { delete mdcache; mdcache = NULL; }
 546   if (mdlog) { delete mdlog; mdlog = NULL; }
 547   if (balancer) { delete balancer; balancer = NULL; }
 548   if (inotable) { delete inotable; inotable = NULL; }
 549   if (snapserver) { delete snapserver; snapserver = NULL; }
 550   if (snapclient) { delete snapclient; snapclient = NULL; }
 551
 552   if (server) { delete server; server = 0; }
 553   if (locker) { delete locker; locker = 0; }
 554
 555   if (logger) {
 556     g_ceph_context->get_perfcounters_collection()->remove(logger);
 557     delete logger;
 558     logger = 0;
 559   }
 560   if (mlogger) {
 561     g_ceph_context->get_perfcounters_collection()->remove(mlogger);
 562     delete mlogger;
 563     mlogger = 0;
 564   }
 565
 566   delete finisher;
 567   finisher = NULL;
 568
 569   delete suicide_hook;
 570   suicide_hook = NULL;
 571
 572   delete respawn_hook;
 573   respawn_hook = NULL;
 574
 575   delete objecter;
 576   objecter = nullptr;
 577 }
 578
// Bring the rank's background machinery up: objecter, logging, the OSDMap
// hook, the progress thread, the purge queue and finally the finisher.
void MDSRankDispatcher::init()
{
  // Initialise the objecter and put it at the head of the messenger's
  // dispatcher list before starting it.
  objecter->init();
  messenger->add_dispatcher_head(objecter);

  objecter->start();

  update_log_config();
  create_logger();

  // Expose the OSDMap (already populated during MDS::init) to anyone
  // who is interested in it.
  handle_osd_map();

  progress_thread.create("mds_rank_progr");

  purge_queue.init();

  finisher->start();
}
 599
 600 void MDSRank::update_targets()
 601 {
 602   // get MonMap's idea of my export_targets
 603   const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
 604
 605   dout(20) << "updating export targets, currently " << map_targets.size() << " ranks are targets" << dendl;
 606
 607   bool send = false;
 608   set<mds_rank_t> new_map_targets;
 609
 610   auto it = export_targets.begin();
 611   while (it != export_targets.end()) {
 612     mds_rank_t rank = it->first;
 613     auto &counter = it->second;
 614     dout(20) << "export target mds." << rank << " is " << counter << dendl;
 615
 616     double val = counter.get();
 617     if (val <= 0.01) {
 618       dout(15) << "export target mds." << rank << " is no longer an export target" << dendl;
 619       export_targets.erase(it++);
 620       send = true;
 621       continue;
 622     }
 623     if (!map_targets.count(rank)) {
 624       dout(15) << "export target mds." << rank << " not in map's export_targets" << dendl;
 625       send = true;
 626     }
 627     new_map_targets.insert(rank);
 628     it++;
 629   }
 630   if (new_map_targets.size() < map_targets.size()) {
 631     dout(15) << "export target map holds stale targets, sending update" << dendl;
 632     send = true;
 633   }
 634
 635   if (send) {
 636     dout(15) << "updating export_targets, now " << new_map_targets.size() << " ranks are targets" << dendl;
 637     auto m = make_message<MMDSLoadTargets>(mds_gid_t(monc->get_global_id()), new_map_targets);
 638     monc->send_mon_message(m.detach());
 639   }
 640 }
 641
 642 void MDSRank::hit_export_target(mds_rank_t rank, double amount)
 643 {
 644   double rate = g_conf()->mds_bal_target_decay;
 645   if (amount < 0.0) {
 646     amount = 100.0/g_conf()->mds_bal_target_decay; /* a good default for "i am trying to keep this export_target active" */
 647   }
 648   auto em = export_targets.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple(DecayRate(rate)));
 649   auto &counter = em.first->second;
 650   counter.hit(amount);
 651   if (em.second) {
 652     dout(15) << "hit export target (new) is " << counter << dendl;
 653   } else {
 654     dout(15) << "hit export target is " << counter << dendl;
 655   }
 656 }
 657
 658 class C_MDS_MonCommand : public MDSInternalContext {
 659   std::string cmd;
 660 public:
 661   std::string outs;
 662   C_MDS_MonCommand(MDSRank *m, std::string_view c)
 663     : MDSInternalContext(m), cmd(c) {}
 664   void finish(int r) override {
 665     mds->_mon_command_finish(r, cmd, outs);
 666   }
 667 };
 668
 669 void MDSRank::_mon_command_finish(int r, std::string_view cmd, std::string_view outs)
 670 {
 671   if (r < 0) {
 672     dout(0) << __func__ << ": mon command " << cmd << " failed with errno " << r
 673             << " (" << outs << ")" << dendl;
 674   } else {
 675     dout(1) << __func__ << ": mon command " << cmd << " succeed" << dendl;
 676   }
 677 }
 678
 679 void MDSRank::set_mdsmap_multimds_snaps_allowed()
 680 {
 681   static bool already_sent = false;
 682   if (already_sent)
 683     return;
 684
 685   stringstream ss;
 686   ss << "{\"prefix\":\"fs set\", \"fs_name\":\"" <<  mdsmap->get_fs_name() << "\", ";
 687   ss << "\"var\":\"allow_multimds_snaps\", \"val\":\"true\", ";
 688   ss << "\"confirm\":\"--yes-i-am-really-a-mds\"}";
 689   std::vector<std::string> cmd = {ss.str()};
 690
 691   dout(0) << __func__ << ": sending mon command: " << cmd[0] << dendl;
 692
 693   C_MDS_MonCommand *fin = new C_MDS_MonCommand(this, cmd[0]);
 694   monc->start_mon_command(cmd, {}, nullptr, &fin->outs, new C_IO_Wrapper(this, fin));
 695
 696   already_sent = true;
 697 }
 698
 699 void MDSRank::mark_base_recursively_scrubbed(inodeno_t ino)
 700 {
 701   if (mdsmap->get_tableserver() == whoami)
 702     snapserver->mark_base_recursively_scrubbed(ino);
 703 }
 704
// Periodic upkeep for the rank. After resetting the heartbeat it returns
// early if the beacon reports us laggy; otherwise it flushes and trims the
// journal, runs per-state housekeeping (session, balancer, snap server,
// export targets), advances shutdown when stopping, and finally reports
// health to the beacon.
void MDSRankDispatcher::tick()
{
  heartbeat_reset();

  if (beacon.is_laggy()) {
    dout(1) << "skipping upkeep work because connection to Monitors appears laggy" << dendl;
    return;
  }

  check_ops_in_flight();

  // Wake up the progress thread in case we used to be laggy and have
  // waiting_for_nolaggy messages to progress.
  progress_thread.signal();

  // make sure mds log flushes, trims periodically
  mdlog->flush();

  // update average session uptime
  sessionmap.update_average_session_age();

  if (is_active() || is_stopping()) {
    mdlog->trim();  // NOT during recovery!
  }

  // Session and lock upkeep, only while the cache may be trimmed.
  if (is_cache_trimmable()) {
    server->find_idle_sessions();
    server->evict_cap_revoke_non_responders();
    locker->tick();
  }

  // log
  if (logger) {
    logger->set(l_mds_subtrees, mdcache->num_subtrees());
    mdcache->log_stat();
  }

  if (is_reconnect())
    server->reconnect_tick();

  if (is_active()) {
    balancer->tick();
    mdcache->find_stale_fragment_freeze();
    mdcache->migrator->find_stale_export_freeze();

    // Work that only the table server rank performs.
    if (mdsmap->get_tableserver() == whoami) {
      snapserver->check_osd_map(false);
      // Filesystem was created by pre-mimic mds. Allow multi-active mds after
      // all old snapshots are deleted.
      if (!mdsmap->allows_multimds_snaps() &&
          snapserver->can_allow_multimds_snaps()) {
        set_mdsmap_multimds_snaps_allowed();
      }
    }
  }

  if (is_active() || is_stopping()) {
    update_targets();
  }

  // shut down?
  if (is_stopping()) {
    mdlog->trim();
    if (mdcache->shutdown_pass()) {
      // The cache reports it is done; now wait for the purge queue to drain.
      uint64_t pq_progress = 0 ;
      uint64_t pq_total = 0;
      size_t pq_in_flight = 0;
      if (!purge_queue.drain(&pq_progress, &pq_total, &pq_in_flight)) {
        dout(7) << "shutdown_pass=true, but still waiting for purge queue"
                << dendl;
        // This takes unbounded time, so we must indicate progress
        // to the administrator: we do it in a slightly imperfect way
        // by sending periodic (tick frequency) clog messages while
        // in this state.
        clog->info() << "MDS rank " << whoami << " waiting for purge queue ("
          << std::dec << pq_progress << "/" << pq_total << " " << pq_in_flight
          << " files purging" << ")";
      } else {
        dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to "
                   "down:stopped" << dendl;
        stopping_done();
      }
    }
    else {
      dout(7) << "shutdown_pass=false" << dendl;
    }
  }

  // Expose ourselves to Beacon to update health indicators
  beacon.notify_health(this);
}
 797
// Stop the rank's subsystems in dependency order. mds_lock is released and
// re-taken around the finisher and messenger shutdowns, so the function is
// presumably entered with mds_lock held -- confirm against callers.
void MDSRankDispatcher::shutdown()
{
  // It should never be possible for shutdown to get called twice, because
  // anyone picking up mds_lock checks if stopping is true and drops
  // out if it is.
  ceph_assert(stopping == false);
  stopping = true;

  dout(1) << __func__ << ": shutting down rank " << whoami << dendl;

  g_conf().remove_observer(this);

  timer.shutdown();

  // MDLog has to shut down before the finisher, because some of its
  // threads block on IOs that require finisher to complete.
  mdlog->shutdown();

  // shut down cache
  mdcache->shutdown();

  purge_queue.shutdown();

  // Stop the finisher without holding mds_lock.
  mds_lock.unlock();
  finisher->stop(); // no flushing
  mds_lock.lock();

  if (objecter->initialized)
    objecter->shutdown();

  monc->shutdown();

  op_tracker.on_shutdown();

  // `stopping` is already set, which ProgressThread::shutdown() asserts.
  progress_thread.shutdown();

  // release mds_lock for finisher/messenger threads (e.g.
  // MDSDaemon::ms_handle_reset called from Messenger).
  mds_lock.unlock();

  // shut down messenger
  messenger->shutdown();

  mds_lock.lock();

  // Workaround unclean shutdown: HeartbeatMap will assert if
  // worker is not removed (as we do in ~MDS), but ~MDS is not
  // always called after suicide.
  if (hb) {
    g_ceph_context->get_heartbeat_map()->remove_worker(hb);
    hb = NULL;
  }
}
 851
 852 /**
 853  * Helper for simple callbacks that call a void fn with no args.
 854  */
 855 class C_MDS_VoidFn : public MDSInternalContext
 856 {
 857   typedef void (MDSRank::*fn_ptr)();
 858   protected:
 859    fn_ptr fn;
 860   public:
 861   C_MDS_VoidFn(MDSRank *mds_, fn_ptr fn_)
 862     : MDSInternalContext(mds_), fn(fn_)
 863   {
 864     ceph_assert(mds_);
 865     ceph_assert(fn_);
 866   }
 867
 868   void finish(int r) override
 869   {
 870     (mds->*fn)();
 871   }
 872 };
 873
 874 int64_t MDSRank::get_metadata_pool()
 875 {
 876     return mdsmap->get_metadata_pool();
 877 }
 878
 879 MDSTableClient *MDSRank::get_table_client(int t)
 880 {
 881   switch (t) {
 882   case TABLE_ANCHOR: return NULL;
 883   case TABLE_SNAP: return snapclient;
 884   default: ceph_abort();
 885   }
 886 }
 887
 888 MDSTableServer *MDSRank::get_table_server(int t)
 889 {
 890   switch (t) {
 891   case TABLE_ANCHOR: return NULL;
 892   case TABLE_SNAP: return snapserver;
 893   default: ceph_abort();
 894   }
 895 }
 896
 897 void MDSRank::suicide()
 898 {
 899   if (suicide_hook) {
 900     suicide_hook->complete(0);
 901     suicide_hook = NULL;
 902   }
 903 }
 904
 905 void MDSRank::respawn()
 906 {
 907   if (respawn_hook) {
 908     respawn_hook->complete(0);
 909     respawn_hook = NULL;
 910   }
 911 }
 912
// Report this rank as DAMAGED to the monitors and then respawn.
// Requires a valid rank and that the calling thread holds mds_lock.
void MDSRank::damaged()
{
  ceph_assert(whoami != MDS_RANK_NONE);
  ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));

  beacon.set_want_state(*mdsmap, MDSMap::STATE_DAMAGED);
  monc->flush_log();  // Flush any clog error from before we were called
  beacon.notify_health(this);  // Include latest status in our swan song
  // Send the beacon and wait, bounded by mds_mon_shutdown_timeout.
  beacon.send_and_wait(g_conf()->mds_mon_shutdown_timeout);

  // It's okay if we timed out and the mon didn't get our beacon, because
  // another daemon (or ourselves after respawn) will eventually take the
  // rank and report DAMAGED again when it hits same problem we did.

  respawn();  // Respawn into standby in case mon has other work for us
}
 929
 930 void MDSRank::damaged_unlocked()
 931 {
 932   std::lock_guard l(mds_lock);
 933   damaged();
 934 }
 935
 936 void MDSRank::handle_write_error(int err)
 937 {
 938   if (err == -EBLACKLISTED) {
 939     derr << "we have been blacklisted (fenced), respawning..." << dendl;
 940     respawn();
 941     return;
 942   }
 943
 944   if (g_conf()->mds_action_on_write_error >= 2) {
 945     derr << "unhandled write error " << cpp_strerror(err) << ", suicide..." << dendl;
 946     respawn();
 947   } else if (g_conf()->mds_action_on_write_error == 1) {
 948     derr << "unhandled write error " << cpp_strerror(err) << ", force readonly..." << dendl;
 949     mdcache->force_readonly();
 950   } else {
 951     // ignore;
 952     derr << "unhandled write error " << cpp_strerror(err) << ", ignore..." << dendl;
 953   }
 954 }
 955
 956 void *MDSRank::ProgressThread::entry()
 957 {
 958   std::unique_lock l(mds->mds_lock);
 959   while (true) {
 960     cond.wait(l, [this] {
 961       return (mds->stopping ||
 962               !mds->finished_queue.empty() ||
 963               (!mds->waiting_for_nolaggy.empty() && !mds->beacon.is_laggy()));
 964     });
 965
 966     if (mds->stopping) {
 967       break;
 968     }
 969
 970     mds->_advance_queues();
 971   }
 972
 973   return NULL;
 974 }
 975
 976
 977 void MDSRank::ProgressThread::shutdown()
 978 {
 979   ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
 980   ceph_assert(mds->stopping);
 981
 982   if (am_self()) {
 983     // Stopping is set, we will fall out of our main loop naturally
 984   } else {
 985     // Kick the thread to notice mds->stopping, and join it
 986     cond.notify_all();
 987     mds->mds_lock.unlock();
 988     if (is_started())
 989       join();
 990     mds->mds_lock.lock();
 991   }
 992 }
 993
 994 bool MDSRankDispatcher::ms_dispatch(const cref_t<Message> &m)
 995 {
 996   if (m->get_source().is_client()) {
 997     Session *session = static_cast<Session*>(m->get_connection()->get_priv().get());
 998     if (session)
 999       session->last_seen = Session::clock::now();
1000   }
1001
1002   inc_dispatch_depth();
1003   bool ret = _dispatch(m, true);
1004   dec_dispatch_depth();
1005   return ret;
1006 }
1007
/**
 * Dispatch one message and then run post-dispatch housekeeping.
 *
 * Stale messages are dropped and unsupported message types are rejected.
 * Otherwise the message is either handled immediately or appended to
 * waiting_for_nolaggy (when the beacon is laggy, or when this is a new
 * message and other messages are already deferred).
 *
 * Unless dispatch_depth > 1 or the beacon is laggy, the call then advances
 * the queues, runs the optional export/fragment "thrash" hacks controlled
 * by mds_thrash_exports / mds_thrash_fragments, and updates the memory
 * logger.
 *
 * @param m        message to dispatch
 * @param new_msg  true for a freshly received message; false for a retry,
 *                 which is not queued behind already-deferred messages
 * @return false if is_valid_message() rejects the message, true otherwise
 *         (including when the message was stale and dropped)
 */
bool MDSRank::_dispatch(const cref_t<Message> &m, bool new_msg)
{
  if (is_stale_message(m)) {
    return true;
  }
  // do not proceed if this message cannot be handled
  if (!is_valid_message(m)) {
    return false;
  }

  if (beacon.is_laggy()) {
    dout(5) << " laggy, deferring " << *m << dendl;
    waiting_for_nolaggy.push_back(m);
  } else if (new_msg && !waiting_for_nolaggy.empty()) {
    // keep ordering: new messages queue up behind already-deferred ones
    dout(5) << " there are deferred messages, deferring " << *m << dendl;
    waiting_for_nolaggy.push_back(m);
  } else {
    handle_message(m);
    heartbeat_reset();
  }

  // presumably a nested dispatch: skip the housekeeping below
  if (dispatch_depth > 1)
    return true;

  // finish any triggered contexts
  _advance_queues();

  if (beacon.is_laggy()) {
    // We've gone laggy during dispatch, don't do any
    // more housekeeping
    return true;
  }

  // hack: thrash exports
  // NOTE(review): `start`/`now` are only consumed by the commented-out
  // elapsed-time check below.
  static utime_t start;
  utime_t now = ceph_clock_now();
  if (start == utime_t())
    start = now;
  /*double el = now - start;
  if (el > 30.0 &&
    el < 60.0)*/
  for (int i=0; i<g_conf()->mds_thrash_exports; i++) {
    set<mds_rank_t> s;
    if (!is_active()) break;
    mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE);
    if (s.size() < 2 || CInode::count() < 10)
      break;  // need peers for this to work.
    // back off while the migrator already has plenty of exports in flight
    if (mdcache->migrator->get_num_exporting() > g_conf()->mds_thrash_exports * 5 ||
        mdcache->migrator->get_export_queue_size() > g_conf()->mds_thrash_exports * 10)
      break;

    dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf()->mds_thrash_exports << dendl;

    // pick a random dir inode
    CInode *in = mdcache->hack_pick_random_inode();

    auto&& ls = in->get_dirfrags();
    if (!ls.empty()) {  // must be an open dir.
      const auto& dir = ls[rand() % ls.size()];
      if (!dir->get_parent_dir()) continue;    // must be linked.
      if (!dir->is_auth()) continue;           // must be auth.

      // pick a random rank from the active set, retrying until it is not us
      mds_rank_t dest;
      do {
        int k = rand() % s.size();
        set<mds_rank_t>::iterator p = s.begin();
        while (k--) ++p;
        dest = *p;
      } while (dest == whoami);
      mdcache->migrator->export_dir_nicely(dir,dest);
    }
  }
  // hack: thrash fragments
  for (int i=0; i<g_conf()->mds_thrash_fragments; i++) {
    if (!is_active()) break;
    if (mdcache->get_num_fragmenting_dirs() > 5 * g_conf()->mds_thrash_fragments) break;
    dout(7) << "mds thrashing fragments pass " << (i+1) << "/" << g_conf()->mds_thrash_fragments << dendl;

    // pick a random dir inode
    CInode *in = mdcache->hack_pick_random_inode();

    auto&& ls = in->get_dirfrags();
    if (ls.empty()) continue;                // must be an open dir.
    CDir *dir = ls.front();
    if (!dir->get_parent_dir()) continue;    // must be linked.
    if (!dir->is_auth()) continue;           // must be auth.
    frag_t fg = dir->get_frag();
    // always split the default frag; otherwise split with probability
    // 1 / 2^bits and queue a merge the rest of the time
    if ((fg == frag_t() || (rand() % (1 << fg.bits()) == 0))) {
      mdcache->split_dir(dir, 1);
    } else {
      balancer->queue_merge(dir);
    }
  }

  // hack: force hash root?
  /*
  if (false &&
      mdcache->get_root() &&
      mdcache->get_root()->dir &&
      !(mdcache->get_root()->dir->is_hashed() ||
        mdcache->get_root()->dir->is_hashing())) {
    dout(0) << "hashing root" << dendl;
    mdcache->migrator->hash_dir(mdcache->get_root()->dir);
  }
  */

  update_mlogger();
  return true;
}
1117
1118 void MDSRank::update_mlogger()
1119 {
1120   if (mlogger) {
1121     mlogger->set(l_mdm_ino, CInode::count());
1122     mlogger->set(l_mdm_dir, CDir::count());
1123     mlogger->set(l_mdm_dn, CDentry::count());
1124     mlogger->set(l_mdm_cap, Capability::count());
1125     mlogger->set(l_mdm_inoa, CInode::increments());
1126     mlogger->set(l_mdm_inos, CInode::decrements());
1127     mlogger->set(l_mdm_dira, CDir::increments());
1128     mlogger->set(l_mdm_dirs, CDir::decrements());
1129     mlogger->set(l_mdm_dna, CDentry::increments());
1130     mlogger->set(l_mdm_dns, CDentry::decrements());
1131     mlogger->set(l_mdm_capa, Capability::increments());
1132     mlogger->set(l_mdm_caps, Capability::decrements());
1133   }
1134 }
1135
1136 // message types that the mds can handle
1137 bool MDSRank::is_valid_message(const cref_t<Message> &m) {
1138   int port = m->get_type() & 0xff00;
1139   int type = m->get_type();
1140
1141   if (port == MDS_PORT_CACHE ||
1142       port == MDS_PORT_MIGRATOR ||
1143       type == CEPH_MSG_CLIENT_SESSION ||
1144       type == CEPH_MSG_CLIENT_RECONNECT ||
1145       type == CEPH_MSG_CLIENT_RECLAIM ||
1146       type == CEPH_MSG_CLIENT_REQUEST ||
1147       type == MSG_MDS_SLAVE_REQUEST ||
1148       type == MSG_MDS_HEARTBEAT ||
1149       type == MSG_MDS_TABLE_REQUEST ||
1150       type == MSG_MDS_LOCK ||
1151       type == MSG_MDS_INODEFILECAPS ||
1152       type == CEPH_MSG_CLIENT_CAPS ||
1153       type == CEPH_MSG_CLIENT_CAPRELEASE ||
1154       type == CEPH_MSG_CLIENT_LEASE) {
1155     return true;
1156   }
1157
1158   return false;
1159 }
1160
/*
 * Peer-type filter used while dispatching messages.
 *
 * If the message `m` has a connection and that connection's peer type has
 * no bit in common with `peers`, log the message at level 0 and return
 * from the enclosing function without dispatching it.  Messages without a
 * connection pass through unfiltered.
 *
 * NOTE: this expands to a bare `return;` and refers to a variable named
 * `m`, so it is only usable inside void functions that have `m` in scope.
 */

#define ALLOW_MESSAGES_FROM(peers)                                      \
  do {                                                                  \
    if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
      dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
              << " allowing=" << #peers << " message=" << *m << dendl;  \
      return;                                                           \
    }                                                                   \
  } while (0)
1173
/**
 * Route a message to the subsystem that handles it.
 *
 * Messages are first classified by port (type & 0xff00): cache and
 * migrator ports go to mdcache / mdcache->migrator.  Everything else is
 * routed by exact message type to the server, balancer, table
 * client/server or locker.  Most cases first apply ALLOW_MESSAGES_FROM,
 * which logs and returns early when the peer type does not match.
 * Unrecognized types are only logged.
 *
 * @param m message to handle
 */
void MDSRank::handle_message(const cref_t<Message> &m)
{
  int port = m->get_type() & 0xff00;

  switch (port) {
  case MDS_PORT_CACHE:
    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
    mdcache->dispatch(m);
    break;

  case MDS_PORT_MIGRATOR:
    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
    mdcache->migrator->dispatch(m);
    break;

  default:
    switch (m->get_type()) {
      // SERVER
    case CEPH_MSG_CLIENT_SESSION:
    case CEPH_MSG_CLIENT_RECONNECT:
    case CEPH_MSG_CLIENT_RECLAIM:
      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
      // fall-thru
      // (CEPH_MSG_CLIENT_REQUEST itself is dispatched without a peer-type
      // check)
    case CEPH_MSG_CLIENT_REQUEST:
      server->dispatch(m);
      break;
    case MSG_MDS_SLAVE_REQUEST:
      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
      server->dispatch(m);
      break;

    case MSG_MDS_HEARTBEAT:
      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
      balancer->proc_message(m);
      break;

    case MSG_MDS_TABLE_REQUEST:
      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
      {
        const cref_t<MMDSTableRequest> &req = ref_cast<MMDSTableRequest>(m);
        // negative ops are routed to the table client, the rest to the
        // table server for req->table
        if (req->op < 0) {
          MDSTableClient *client = get_table_client(req->table);
          client->handle_request(req);
        } else {
           // NOTE: this local shadows the `server` used by the cases above
           MDSTableServer *server = get_table_server(req->table);
           server->handle_request(req);
        }
      }
      break;

    case MSG_MDS_LOCK:
    case MSG_MDS_INODEFILECAPS:
      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
      locker->dispatch(m);
      break;

    case CEPH_MSG_CLIENT_CAPS:
    case CEPH_MSG_CLIENT_CAPRELEASE:
    case CEPH_MSG_CLIENT_LEASE:
      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
      locker->dispatch(m);
      break;

    default:
      derr << "unrecognized message " << *m << dendl;
    }
  }
}
1242
1243 /**
1244  * Advance finished_queue and waiting_for_nolaggy.
1245  *
1246  * Usually drain both queues, but may not drain waiting_for_nolaggy
1247  * if beacon is currently laggy.
1248  */
1249 void MDSRank::_advance_queues()
1250 {
1251   ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
1252
1253   if (!finished_queue.empty()) {
1254     dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl;
1255     while (!finished_queue.empty()) {
1256       auto fin = finished_queue.front();
1257       finished_queue.pop_front();
1258
1259       dout(10) << " finish " << fin << dendl;
1260       fin->complete(0);
1261
1262       heartbeat_reset();
1263     }
1264   }
1265
1266   while (!waiting_for_nolaggy.empty()) {
1267     // stop if we're laggy now!
1268     if (beacon.is_laggy())
1269       break;
1270
1271     cref_t<Message> old = waiting_for_nolaggy.front();
1272     waiting_for_nolaggy.pop_front();
1273
1274     if (!is_stale_message(old)) {
1275       dout(7) << " processing laggy deferred " << *old << dendl;
1276       ceph_assert(is_valid_message(old));
1277       handle_message(old);
1278     }
1279
1280     heartbeat_reset();
1281   }
1282 }
1283
1284 /**
1285  * Call this when you take mds_lock, or periodically if you're going to
1286  * hold the lock for a long time (e.g. iterating over clients/inodes)
1287  */
1288 void MDSRank::heartbeat_reset()
1289 {
1290   // Any thread might jump into mds_lock and call us immediately
1291   // after a call to suicide() completes, in which case MDSRank::hb
1292   // has been freed and we are a no-op.
1293   if (!hb) {
1294       ceph_assert(stopping);
1295       return;
1296   }
1297
1298   // NB not enabling suicide grace, because the mon takes care of killing us
1299   // (by blacklisting us) when we fail to send beacons, and it's simpler to
1300   // only have one way of dying.
1301   auto grace = g_conf().get_val<double>("mds_heartbeat_grace");
1302   g_ceph_context->get_heartbeat_map()->reset_timeout(hb, grace, 0);
1303 }
1304
1305 bool MDSRank::is_stale_message(const cref_t<Message> &m) const
1306 {
1307   // from bad mds?
1308   if (m->get_source().is_mds()) {
1309     mds_rank_t from = mds_rank_t(m->get_source().num());
1310     bool bad = false;
1311     if (mdsmap->is_down(from)) {
1312       bad = true;
1313     } else {
1314       // FIXME: this is a convoluted check.  we should be maintaining a nice
1315       // clean map of current ConnectionRefs for current mdses!!!
1316       auto c = messenger->connect_to(CEPH_ENTITY_TYPE_MDS,
1317                                      mdsmap->get_addrs(from));
1318       if (c != m->get_connection()) {
1319         bad = true;
1320         dout(5) << " mds." << from << " should be " << c << " "
1321                 << c->get_peer_addrs() << " but this message is "
1322                 << m->get_connection() << " " << m->get_source_addrs()
1323                 << dendl;
1324       }
1325     }
1326     if (bad) {
1327       // bogus mds?
1328       if (m->get_type() == CEPH_MSG_MDS_MAP) {
1329         dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
1330                 << ", but it's an mdsmap, looking at it" << dendl;
1331       } else if (m->get_type() == MSG_MDS_CACHEEXPIRE &&
1332                  mdsmap->get_addrs(from) == m->get_source_addrs()) {
1333         dout(5) << "got " << *m << " from down mds " << m->get_source()
1334                 << ", but it's a cache_expire, looking at it" << dendl;
1335       } else {
1336         dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source()
1337                 << ", dropping" << dendl;
1338         return true;
1339       }
1340     }
1341   }
1342   return false;
1343 }
1344
/**
 * Look up the session attached to a message's connection.
 *
 * If the attached session is closed and the session map holds a different
 * session for the same entity name, that session replaces the attached
 * one: it takes over the auth name, connection, queued outgoing messages,
 * auth caps and last_seen, and is installed as the connection's private
 * data.
 *
 * @param m message whose connection is inspected
 * @return the (possibly replaced) session, or nullptr if the connection
 *         has no session attached; the pointer carries no reference
 */
Session *MDSRank::get_session(const cref_t<Message> &m)
{
  // do not carry ref
  auto session = static_cast<Session *>(m->get_connection()->get_priv().get());
  if (session) {
    dout(20) << "get_session have " << session << " " << session->info.inst
             << " state " << session->get_state_name() << dendl;
    // Check if we've imported an open session since (new sessions start closed)
    if (session->is_closed()) {
      Session *imported_session = sessionmap.get_session(session->info.inst.name);
      if (imported_session && imported_session != session) {
        dout(10) << __func__ << " replacing connection bootstrap session "
                 << session << " with imported session " << imported_session
                 << dendl;
        imported_session->info.auth_name = session->info.auth_name;
        //assert(session->info.auth_name == imported_session->info.auth_name);
        ceph_assert(session->info.inst == imported_session->info.inst);
        imported_session->set_connection(session->get_connection().get());
        // send out any queued messages
        while (!session->preopen_out_queue.empty()) {
          imported_session->get_connection()->send_message2(std::move(session->preopen_out_queue.front()));
          session->preopen_out_queue.pop_front();
        }
        imported_session->auth_caps = session->auth_caps;
        imported_session->last_seen = session->last_seen;
        // the bootstrap session is expected to have exactly one reference
        // left at this point
        ceph_assert(session->get_nref() == 1);
        // from here on the connection's priv is the imported session
        imported_session->get_connection()->set_priv(imported_session->get());
        session = imported_session;
      }
    }
  } else {
    dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
  }
  return session;
}
1380
/**
 * Send a message over the given connection.
 *
 * @param m message to send
 * @param c connection to send on; must not be null (asserted)
 */
void MDSRank::send_message(const ref_t<Message>& m, const ConnectionRef& c)
{
  ceph_assert(c);
  c->send_message2(m);
}
1386
1387
1388 void MDSRank::send_message_mds(const ref_t<Message>& m, mds_rank_t mds)
1389 {
1390   if (!mdsmap->is_up(mds)) {
1391     dout(10) << "send_message_mds mds." << mds << " not up, dropping " << *m << dendl;
1392     return;
1393   }
1394
1395   // send mdsmap first?
1396   if (mds != whoami && peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
1397     auto _m = make_message<MMDSMap>(monc->get_fsid(), *mdsmap);
1398     messenger->send_to_mds(_m.detach(), mdsmap->get_addrs(mds));
1399     peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
1400   }
1401
1402   // send message
1403   messenger->send_to_mds(ref_t<Message>(m).detach(), mdsmap->get_addrs(mds));
1404 }
1405
1406 void MDSRank::forward_message_mds(const cref_t<MClientRequest>& m, mds_rank_t mds)
1407 {
1408   ceph_assert(mds != whoami);
1409
1410   /*
1411    * don't actually forward if non-idempotent!
1412    * client has to do it.  although the MDS will ignore duplicate requests,
1413    * the affected metadata may migrate, in which case the new authority
1414    * won't have the metareq_id in the completed request map.
1415    */
1416   // NEW: always make the client resend!
1417   bool client_must_resend = true;  //!creq->can_forward();
1418
1419   // tell the client where it should go
1420   auto session = get_session(m);
1421   auto f = make_message<MClientRequestForward>(m->get_tid(), mds, m->get_num_fwd()+1, client_must_resend);
1422   send_message_client(f, session);
1423 }
1424
1425 void MDSRank::send_message_client_counted(const ref_t<Message>& m, client_t client)
1426 {
1427   Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v));
1428   if (session) {
1429     send_message_client_counted(m, session);
1430   } else {
1431     dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl;
1432   }
1433 }
1434
1435 void MDSRank::send_message_client_counted(const ref_t<Message>& m, const ConnectionRef& connection)
1436 {
1437   // do not carry ref
1438   auto session = static_cast<Session *>(connection->get_priv().get());
1439   if (session) {
1440     send_message_client_counted(m, session);
1441   } else {
1442     dout(10) << "send_message_client_counted has no session for " << m->get_source_inst() << dendl;
1443     // another Connection took over the Session
1444   }
1445 }
1446
1447 void MDSRank::send_message_client_counted(const ref_t<Message>& m, Session* session)
1448 {
1449   version_t seq = session->inc_push_seq();
1450   dout(10) << "send_message_client_counted " << session->info.inst.name << " seq "
1451            << seq << " " << *m << dendl;
1452   if (session->get_connection()) {
1453     session->get_connection()->send_message2(m);
1454   } else {
1455     session->preopen_out_queue.push_back(m);
1456   }
1457 }
1458
1459 void MDSRank::send_message_client(const ref_t<Message>& m, Session* session)
1460 {
1461   dout(10) << "send_message_client " << session->info.inst << " " << *m << dendl;
1462   if (session->get_connection()) {
1463     session->get_connection()->send_message2(m);
1464   } else {
1465     session->preopen_out_queue.push_back(m);
1466   }
1467 }
1468
/**
 * Record a new OSD epoch barrier.
 *
 * This is used whenever a RADOS operation has been cancelled
 * or a RADOS client has been blacklisted, to cause the MDS and
 * any clients to wait for this OSD epoch before using any new caps.
 *
 * See doc/cephfs/eviction
 *
 * @param e OSD map epoch to store in osd_epoch_barrier; the value is
 *          assigned unconditionally
 */
void MDSRank::set_osd_epoch_barrier(epoch_t e)
{
  dout(4) << __func__ << ": epoch=" << e << dendl;
  osd_epoch_barrier = e;
}
1481
/**
 * Dispatch a message again.
 *
 * Calls _dispatch() with new_msg=false, so the message is not queued
 * behind other deferred messages just because waiting_for_nolaggy is
 * non-empty.  The dispatch depth is incremented around the call and the
 * result of _dispatch() is discarded.
 *
 * @param m message to re-dispatch
 */
void MDSRank::retry_dispatch(const cref_t<Message> &m)
{
  inc_dispatch_depth();
  _dispatch(m, false);
  dec_dispatch_depth();
}
1488
/**
 * Forward to the messenger's get_dispatch_queue_max_age().
 *
 * @param now reference time passed through to the messenger
 * @return the value reported by the messenger
 */
double MDSRank::get_dispatch_queue_max_age(utime_t now) const
{
  return messenger->get_dispatch_queue_max_age(now);
}
1493
/**
 * @return the current value of the `stopping` flag
 */
bool MDSRank::is_daemon_stopping() const
{
  return stopping;
}
1498
/**
 * Ask to move to a new daemon state.
 *
 * Logs the requested state, records it as the beacon's wanted state
 * (together with the current mdsmap) and calls beacon.send().
 *
 * @param s state being requested
 */
void MDSRank::request_state(MDSMap::DaemonState s)
{
  dout(3) << "request_state " << ceph_mds_state_name(s) << dendl;
  beacon.set_want_state(*mdsmap, s);
  beacon.send();
}
1505
1506
1507 class C_MDS_BootStart : public MDSInternalContext {
1508   MDSRank::BootStep nextstep;
1509 public:
1510   C_MDS_BootStart(MDSRank *m, MDSRank::BootStep n)
1511     : MDSInternalContext(m), nextstep(n) {}
1512   void finish(int r) override {
1513     mds->boot_start(nextstep, r);
1514   }
1515 };
1516
1517
/**
 * Run one step of the boot/replay sequence.
 *
 * Each step starts asynchronous work whose completion re-enters this
 * function (through C_MDS_BootStart) with the next step and the result of
 * the previous one:
 *
 *   MDS_BOOT_INITIAL     -> load inotable, sessionmap, mds log, purge
 *                           queue and (on the table server) the snap table
 *   MDS_BOOT_OPEN_ROOT   -> open/create the base inodes
 *   MDS_BOOT_PREPARE_LOG -> replay the log, or when not replaying position
 *                           at its end and call starting_done()
 *   MDS_BOOT_REPLAY_DONE -> validate sessions and call replay_done()
 *
 * @param step step to execute
 * @param r    result of the previous step; negative values are handled at
 *             the top of the function before any step runs
 */
void MDSRank::boot_start(BootStep step, int r)
{
  // Handle errors from previous step
  if (r < 0) {
    if (is_standby_replay() && (r == -EAGAIN)) {
      dout(0) << "boot_start encountered an error EAGAIN"
              << ", respawning since we fell behind journal" << dendl;
      respawn();
    } else if (r == -EINVAL || r == -ENOENT) {
      // Invalid or absent data, indicates damaged on-disk structures
      clog->error() << "Error loading MDS rank " << whoami << ": "
        << cpp_strerror(r);
      damaged();
      ceph_assert(r == 0);  // Unreachable, damaged() calls respawn()
    } else if (r == -EROFS) {
      // not fatal: fall through and continue with the requested step
      dout(0) << "boot error forcing transition to read-only; MDS will try to continue" << dendl;
    } else {
      // Completely unexpected error, give up and die
      dout(0) << "boot_start encountered an error, failing" << dendl;
      suicide();
      return;
    }
  }

  ceph_assert(is_starting() || is_any_replay());

  switch(step) {
    case MDS_BOOT_INITIAL:
      {
        mdcache->init_layouts();

        // the gather completes (and moves on to MDS_BOOT_OPEN_ROOT) once
        // every sub-context created below has finished
        MDSGatherBuilder gather(g_ceph_context,
            new C_MDS_BootStart(this, MDS_BOOT_OPEN_ROOT));
        dout(2) << "Booting: " << step << ": opening inotable" << dendl;
        inotable->set_rank(whoami);
        inotable->load(gather.new_sub());

        dout(2) << "Booting: " << step << ": opening sessionmap" << dendl;
        sessionmap.set_rank(whoami);
        sessionmap.load(gather.new_sub());

        dout(2) << "Booting: " << step << ": opening mds log" << dendl;
        mdlog->open(gather.new_sub());

        if (is_starting()) {
          dout(2) << "Booting: " << step << ": opening purge queue" << dendl;
          purge_queue.open(new C_IO_Wrapper(this, gather.new_sub()));
        } else if (!standby_replaying) {
          // no completion is passed here, so the gather does not wait for
          // these two
          dout(2) << "Booting: " << step << ": opening purge queue (async)" << dendl;
          purge_queue.open(NULL);
          dout(2) << "Booting: " << step << ": loading open file table (async)" << dendl;
          mdcache->open_file_table.load(nullptr);
        }

        if (mdsmap->get_tableserver() == whoami) {
          dout(2) << "Booting: " << step << ": opening snap table" << dendl;
          snapserver->set_rank(whoami);
          snapserver->load(gather.new_sub());
        }

        gather.activate();
      }
      break;
    case MDS_BOOT_OPEN_ROOT:
      {
        dout(2) << "Booting: " << step << ": loading/discovering base inodes" << dendl;

        MDSGatherBuilder gather(g_ceph_context,
            new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));

        if (is_starting()) {
          // load mydir frag for the first log segment (creating subtree map)
          mdcache->open_mydir_frag(gather.new_sub());
        } else {
          mdcache->open_mydir_inode(gather.new_sub());
        }

        mdcache->create_global_snaprealm();

        if (whoami == mdsmap->get_root()) {  // load root inode off disk if we are auth
          mdcache->open_root_inode(gather.new_sub());
        } else if (is_any_replay()) {
          // replay.  make up fake root inode to start with
          mdcache->create_root_inode();
        }
        gather.activate();
      }
      break;
    case MDS_BOOT_PREPARE_LOG:
      if (is_any_replay()) {
        dout(2) << "Booting: " << step << ": replaying mds log" << dendl;
        MDSGatherBuilder gather(g_ceph_context,
            new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));

        if (!standby_replaying) {
          dout(2) << "Booting: " << step << ": waiting for purge queue recovered" << dendl;
          purge_queue.wait_for_recovery(new C_IO_Wrapper(this, gather.new_sub()));
        }

        mdlog->replay(gather.new_sub());
        gather.activate();
      } else {
        dout(2) << "Booting: " << step << ": positioning at end of old mds log" << dendl;
        mdlog->append();
        starting_done();
      }
      break;
    case MDS_BOOT_REPLAY_DONE:
      ceph_assert(is_any_replay());

      // Sessiontable and inotable should be in sync after replay, validate
      // that they are consistent.
      validate_sessions();

      replay_done();
      break;
  }
}
1636
1637 void MDSRank::validate_sessions()
1638 {
1639   ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
1640   bool valid = true;
1641
1642   // Identify any sessions which have state inconsistent with other,
1643   // after they have been loaded from rados during startup.
1644   // Mitigate bugs like: http://tracker.ceph.com/issues/16842
1645   for (const auto &i : sessionmap.get_sessions()) {
1646     Session *session = i.second;
1647     interval_set<inodeno_t> badones;
1648     if (inotable->intersects_free(session->info.prealloc_inos, &badones)) {
1649       clog->error() << "client " << *session
1650                     << "loaded with preallocated inodes that are inconsistent with inotable";
1651       valid = false;
1652     }
1653   }
1654
1655   if (!valid) {
1656     damaged();
1657     ceph_assert(valid);
1658   }
1659 }
1660
/**
 * Finish the "starting" path: request STATE_ACTIVE, start a new log
 * segment and sync the snap client.  Asserts that the rank is currently
 * starting.
 */
void MDSRank::starting_done()
{
  dout(3) << "starting_done" << dendl;
  ceph_assert(is_starting());
  request_state(MDSMap::STATE_ACTIVE);

  mdlog->start_new_segment();

  // sync snaptable cache
  snapclient->sync(new C_MDSInternalNoop);
}
1672
1673
1674 void MDSRank::calc_recovery_set()
1675 {
1676   // initialize gather sets
1677   set<mds_rank_t> rs;
1678   mdsmap->get_recovery_mds_set(rs);
1679   rs.erase(whoami);
1680   mdcache->set_recovery_set(rs);
1681
1682   dout(1) << " recovery set is " << rs << dendl;
1683 }
1684
1685
1686 void MDSRank::replay_start()
1687 {
1688   dout(1) << "replay_start" << dendl;
1689
1690   if (is_standby_replay())
1691     standby_replaying = true;
1692
1693   calc_recovery_set();
1694
1695   // Check if we need to wait for a newer OSD map before starting
1696   Context *fin = new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_INITIAL));
1697   bool const ready = objecter->wait_for_map(
1698       mdsmap->get_last_failure_osd_epoch(),
1699       fin);
1700
1701   if (ready) {
1702     delete fin;
1703     boot_start();
1704   } else {
1705     dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
1706             << " (which blacklists prior instance)" << dendl;
1707   }
1708 }
1709
1710
1711 class MDSRank::C_MDS_StandbyReplayRestartFinish : public MDSIOContext {
1712   uint64_t old_read_pos;
1713 public:
1714   C_MDS_StandbyReplayRestartFinish(MDSRank *mds_, uint64_t old_read_pos_) :
1715     MDSIOContext(mds_), old_read_pos(old_read_pos_) {}
1716   void finish(int r) override {
1717     mds->_standby_replay_restart_finish(r, old_read_pos);
1718   }
1719   void print(ostream& out) const override {
1720     out << "standby_replay_restart";
1721   }
1722 };
1723
1724 void MDSRank::_standby_replay_restart_finish(int r, uint64_t old_read_pos)
1725 {
1726   if (old_read_pos < mdlog->get_journaler()->get_trimmed_pos()) {
1727     dout(0) << "standby MDS fell behind active MDS journal's expire_pos, restarting" << dendl;
1728     respawn(); /* we're too far back, and this is easier than
1729                   trying to reset everything in the cache, etc */
1730   } else {
1731     mdlog->standby_trim_segments();
1732     boot_start(MDS_BOOT_PREPARE_LOG, r);
1733   }
1734 }
1735
1736 class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
1737 public:
1738   explicit C_MDS_StandbyReplayRestart(MDSRank *m) : MDSInternalContext(m) {}
1739   void finish(int r) override {
1740     ceph_assert(!r);
1741     mds->standby_replay_restart();
1742   }
1743 };
1744
1745 void MDSRank::standby_replay_restart()
1746 {
1747   if (standby_replaying) {
1748     /* Go around for another pass of replaying in standby */
1749     dout(5) << "Restarting replay as standby-replay" << dendl;
1750     mdlog->get_journaler()->reread_head_and_probe(
1751       new C_MDS_StandbyReplayRestartFinish(
1752         this,
1753         mdlog->get_journaler()->get_read_pos()));
1754   } else {
1755     /* We are transitioning out of standby: wait for OSD map update
1756        before making final pass */
1757     dout(1) << "standby_replay_restart (final takeover pass)" << dendl;
1758     Context *fin = new C_IO_Wrapper(this, new C_MDS_StandbyReplayRestart(this));
1759     bool ready = objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), fin);
1760     if (ready) {
1761       delete fin;
1762       mdlog->get_journaler()->reread_head_and_probe(
1763         new C_MDS_StandbyReplayRestartFinish(
1764           this,
1765           mdlog->get_journaler()->get_read_pos()));
1766
1767       dout(1) << " opening purge_queue (async)" << dendl;
1768       purge_queue.open(NULL);
1769       dout(1) << " opening open_file_table (async)" << dendl;
1770       mdcache->open_file_table.load(nullptr);
1771     } else {
1772       dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
1773               << " (which blacklists prior instance)" << dendl;
1774     }
1775   }
1776 }
1777
/**
 * Called when a journal replay pass has finished.
 *
 * Three cases are distinguished:
 *  - still in standby-replay: schedule the next pass on the timer and
 *    return;
 *  - the pass ran as standby but the rank has since left that state:
 *    clear standby_replaying and run one final pass;
 *  - a regular replay: the journal must be fully read; reformat it first
 *    if its stream format is older than mds_journal_format.
 *
 * After a regular replay the journal is made writeable and trimmed, the
 * optional wipe/skip maintenance options are applied, and the rank
 * requests STATE_RECONNECT (when it is the only rank in the map and none
 * have failed) or STATE_RESOLVE.
 */
void MDSRank::replay_done()
{
  if (!standby_replaying) {
    dout(1) << "Finished replaying journal" << dendl;
  } else {
    dout(5) << "Finished replaying journal as standby-replay" << dendl;
  }

  if (is_standby_replay()) {
    // The replay was done in standby state, and we are still in that state
    ceph_assert(standby_replaying);
    dout(10) << "setting replay timer" << dendl;
    timer.add_event_after(g_conf()->mds_replay_interval,
                          new C_MDS_StandbyReplayRestart(this));
    return;
  } else if (standby_replaying) {
    // The replay was done in standby state, we have now _left_ that state
    dout(10) << " last replay pass was as a standby; making final pass" << dendl;
    standby_replaying = false;
    standby_replay_restart();
    return;
  } else {
    // Replay is complete, journal read should be up to date
    ceph_assert(mdlog->get_journaler()->get_read_pos() == mdlog->get_journaler()->get_write_pos());
    ceph_assert(!is_standby_replay());

    // Reformat and come back here
    if (mdlog->get_journaler()->get_stream_format() < g_conf()->mds_journal_format) {
        dout(4) << "reformatting journal on standby-replay->replay transition" << dendl;
        mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
        return;
    }
  }

  dout(1) << "making mds journal writeable" << dendl;
  mdlog->get_journaler()->set_writeable();
  mdlog->get_journaler()->trim_tail();

  // only the table server upgrades and saves the snap table
  if (mdsmap->get_tableserver() == whoami &&
      snapserver->upgrade_format()) {
    dout(1) << "upgrading snaptable format" << dendl;
    snapserver->save(new C_MDSInternalNoop);
  }

  // optional maintenance actions driven by config options
  if (g_conf()->mds_wipe_sessions) {
    dout(1) << "wiping out client sessions" << dendl;
    sessionmap.wipe();
    sessionmap.save(new C_MDSInternalNoop);
  }
  if (g_conf()->mds_wipe_ino_prealloc) {
    dout(1) << "wiping out ino prealloc from sessions" << dendl;
    sessionmap.wipe_ino_prealloc();
    sessionmap.save(new C_MDSInternalNoop);
  }
  if (g_conf()->mds_skip_ino) {
    inodeno_t i = g_conf()->mds_skip_ino;
    dout(1) << "skipping " << i << " inodes" << dendl;
    inotable->skip_inos(i);
    inotable->save(new C_MDSInternalNoop);
  }

  if (mdsmap->get_num_in_mds() == 1 &&
      mdsmap->get_num_failed_mds() == 0) { // just me!
    dout(2) << "i am alone, moving to state reconnect" << dendl;
    request_state(MDSMap::STATE_RECONNECT);
    // sync snaptable cache
    snapclient->sync(new C_MDSInternalNoop);
  } else {
    dout(2) << "i am not alone, moving to state resolve" << dendl;
    request_state(MDSMap::STATE_RESOLVE);
  }
}
1850
/**
 * Logs "reopen_log" and rolls back uncommitted fragments in the cache.
 * NOTE: despite the name, the body does not itself touch the MDS log.
 */
void MDSRank::reopen_log()
{
  dout(1) << "reopen_log" << dendl;
  mdcache->rollback_uncommitted_fragments();
}
1856
/**
 * Enter the resolve stage: call reopen_log(), start cache resolve with
 * resolve_done() as its completion, and finish every context queued in
 * waiting_for_resolve.
 */
void MDSRank::resolve_start()
{
  dout(1) << "resolve_start" << dendl;

  reopen_log();

  mdcache->resolve_start(new C_MDS_VoidFn(this, &MDSRank::resolve_done));
  finish_contexts(g_ceph_context, waiting_for_resolve);
}
1866
/**
 * Completion of the resolve stage: request STATE_RECONNECT and sync the
 * snap client.
 */
void MDSRank::resolve_done()
{
  dout(1) << "resolve_done" << dendl;
  request_state(MDSMap::STATE_RECONNECT);
  // sync snaptable cache
  snapclient->sync(new C_MDSInternalNoop);
}
1874
1875 void MDSRank::reconnect_start()
1876 {
1877   dout(1) << "reconnect_start" << dendl;
1878
1879   if (last_state == MDSMap::STATE_REPLAY) {
1880     reopen_log();
1881   }
1882
1883   // Drop any blacklisted clients from the SessionMap before going
1884   // into reconnect, so that we don't wait for them.
1885   objecter->enable_blacklist_events();
1886   std::set<entity_addr_t> blacklist;
1887   epoch_t epoch = 0;
1888   objecter->with_osdmap([&blacklist, &epoch](const OSDMap& o) {
1889       o.get_blacklist(&blacklist);
1890       epoch = o.get_epoch();
1891   });
1892   auto killed = server->apply_blacklist(blacklist);
1893   dout(4) << "reconnect_start: killed " << killed << " blacklisted sessions ("
1894           << blacklist.size() << " blacklist entries, "
1895           << sessionmap.get_sessions().size() << ")" << dendl;
1896   if (killed) {
1897     set_osd_epoch_barrier(epoch);
1898   }
1899
1900   server->reconnect_clients(new C_MDS_VoidFn(this, &MDSRank::reconnect_done));
1901   finish_contexts(g_ceph_context, waiting_for_reconnect);
1902 }
1903 void MDSRank::reconnect_done()
1904 {
1905   dout(1) << "reconnect_done" << dendl;
1906   request_state(MDSMap::STATE_REJOIN);    // move to rejoin state
1907 }
1908
1909 void MDSRank::rejoin_joint_start()
1910 {
1911   dout(1) << "rejoin_joint_start" << dendl;
1912   mdcache->rejoin_send_rejoins();
1913 }
1914 void MDSRank::rejoin_start()
1915 {
1916   dout(1) << "rejoin_start" << dendl;
1917   mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
1918   finish_contexts(g_ceph_context, waiting_for_rejoin);
1919 }
1920 void MDSRank::rejoin_done()
1921 {
1922   dout(1) << "rejoin_done" << dendl;
1923   mdcache->show_subtrees();
1924   mdcache->show_cache();
1925
1926   if (mdcache->is_any_uncommitted_fragment()) {
1927     dout(1) << " waiting for uncommitted fragments" << dendl;
1928     MDSGatherBuilder gather(g_ceph_context, new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
1929     mdcache->wait_for_uncommitted_fragments(gather.get());
1930     gather.activate();
1931     return;
1932   }
1933
1934   // funny case: is our cache empty?  no subtrees?
1935   if (!mdcache->is_subtrees()) {
1936     if (whoami == 0) {
1937       // The root should always have a subtree!
1938       clog->error() << "No subtrees found for root MDS rank!";
1939       damaged();
1940       ceph_assert(mdcache->is_subtrees());
1941     } else {
1942       dout(1) << " empty cache, no subtrees, leaving cluster" << dendl;
1943       request_state(MDSMap::STATE_STOPPED);
1944     }
1945     return;
1946   }
1947
1948   if (replay_queue.empty() && !server->get_num_pending_reclaim()) {
1949     request_state(MDSMap::STATE_ACTIVE);
1950   } else {
1951     replaying_requests_done = replay_queue.empty();
1952     request_state(MDSMap::STATE_CLIENTREPLAY);
1953   }
1954 }
1955
1956 void MDSRank::clientreplay_start()
1957 {
1958   dout(1) << "clientreplay_start" << dendl;
1959   finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
1960   mdcache->start_files_to_recover();
1961   queue_one_replay();
1962 }
1963
1964 bool MDSRank::queue_one_replay()
1965 {
1966   if (!replay_queue.empty()) {
1967     queue_waiter(replay_queue.front());
1968     replay_queue.pop_front();
1969     return true;
1970   }
1971   if (!replaying_requests_done) {
1972     replaying_requests_done = true;
1973     mdlog->flush();
1974   }
1975   maybe_clientreplay_done();
1976   return false;
1977 }
1978
1979 void MDSRank::maybe_clientreplay_done()
1980 {
1981   if (is_clientreplay() && get_want_state() == MDSMap::STATE_CLIENTREPLAY) {
1982
1983     // don't go to active if there are session waiting for being reclaimed
1984     if (replaying_requests_done && !server->get_num_pending_reclaim()) {
1985       mdlog->wait_for_safe(new C_MDS_VoidFn(this, &MDSRank::clientreplay_done));
1986       return;
1987     }
1988
1989     dout(1) << " still have " << replay_queue.size() + (int)!replaying_requests_done
1990             << " requests need to be replayed, " << server->get_num_pending_reclaim()
1991             << " sessions need to be reclaimed" << dendl;
1992   }
1993 }
1994
1995 void MDSRank::clientreplay_done()
1996 {
1997   dout(1) << "clientreplay_done" << dendl;
1998   request_state(MDSMap::STATE_ACTIVE);
1999 }
2000
2001 void MDSRank::active_start()
2002 {
2003   dout(1) << "active_start" << dendl;
2004
2005   if (last_state == MDSMap::STATE_CREATING ||
2006       last_state == MDSMap::STATE_STARTING) {
2007     mdcache->open_root();
2008   }
2009
2010   mdcache->clean_open_file_lists();
2011   mdcache->export_remaining_imported_caps();
2012   finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
2013   mdcache->start_files_to_recover();
2014
2015   mdcache->reissue_all_caps();
2016
2017   finish_contexts(g_ceph_context, waiting_for_active);  // kick waiters
2018 }
2019
2020 void MDSRank::recovery_done(int oldstate)
2021 {
2022   dout(1) << "recovery_done -- successful recovery!" << dendl;
2023   ceph_assert(is_clientreplay() || is_active());
2024
2025   if (oldstate == MDSMap::STATE_CREATING)
2026     return;
2027
2028   mdcache->start_recovered_truncates();
2029   mdcache->start_purge_inodes();
2030   mdcache->do_file_recover();
2031
2032   // tell connected clients
2033   //bcast_mds_map();     // not anymore, they get this from the monitor
2034
2035   mdcache->populate_mydir();
2036 }
2037
2038 void MDSRank::creating_done()
2039 {
2040   dout(1)<< "creating_done" << dendl;
2041   request_state(MDSMap::STATE_ACTIVE);
2042   // sync snaptable cache
2043   snapclient->sync(new C_MDSInternalNoop);
2044 }
2045
2046 void MDSRank::boot_create()
2047 {
2048   dout(3) << "boot_create" << dendl;
2049
2050   MDSGatherBuilder fin(g_ceph_context, new C_MDS_VoidFn(this, &MDSRank::creating_done));
2051
2052   mdcache->init_layouts();
2053
2054   inotable->set_rank(whoami);
2055   sessionmap.set_rank(whoami);
2056
2057   // start with a fresh journal
2058   dout(10) << "boot_create creating fresh journal" << dendl;
2059   mdlog->create(fin.new_sub());
2060
2061   // open new journal segment, but do not journal subtree map (yet)
2062   mdlog->prepare_new_segment();
2063
2064   if (whoami == mdsmap->get_root()) {
2065     dout(3) << "boot_create creating fresh hierarchy" << dendl;
2066     mdcache->create_empty_hierarchy(fin.get());
2067   }
2068
2069   dout(3) << "boot_create creating mydir hierarchy" << dendl;
2070   mdcache->create_mydir_hierarchy(fin.get());
2071
2072   dout(3) << "boot_create creating global snaprealm" << dendl;
2073   mdcache->create_global_snaprealm();
2074
2075   // fixme: fake out inotable (reset, pretend loaded)
2076   dout(10) << "boot_create creating fresh inotable table" << dendl;
2077   inotable->reset();
2078   inotable->save(fin.new_sub());
2079
2080   // write empty sessionmap
2081   sessionmap.save(fin.new_sub());
2082
2083   // Create empty purge queue
2084   purge_queue.create(new C_IO_Wrapper(this, fin.new_sub()));
2085
2086   // initialize tables
2087   if (mdsmap->get_tableserver() == whoami) {
2088     dout(10) << "boot_create creating fresh snaptable" << dendl;
2089     snapserver->set_rank(whoami);
2090     snapserver->reset();
2091     snapserver->save(fin.new_sub());
2092   }
2093
2094   ceph_assert(g_conf()->mds_kill_create_at != 1);
2095
2096   // ok now journal it
2097   mdlog->journal_segment_subtree_map(fin.new_sub());
2098   mdlog->flush();
2099
2100   // Usually we do this during reconnect, but creation skips that.
2101   objecter->enable_blacklist_events();
2102
2103   fin.activate();
2104 }
2105
2106 void MDSRank::stopping_start()
2107 {
2108   dout(2) << "Stopping..." << dendl;
2109
2110   if (mdsmap->get_num_in_mds() == 1 && !sessionmap.empty()) {
2111     std::vector<Session*> victims;
2112     const auto& sessions = sessionmap.get_sessions();
2113     for (const auto& p : sessions)  {
2114       if (!p.first.is_client()) {
2115         continue;
2116       }
2117
2118       Session *s = p.second;
2119       victims.push_back(s);
2120     }
2121
2122     dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
2123     ceph_assert(!victims.empty());
2124
2125     C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
2126     for (const auto &s : victims) {
2127       std::stringstream ss;
2128       evict_client(s->get_client().v, false,
2129                    g_conf()->mds_session_blacklist_on_evict, ss, gather.new_sub());
2130     }
2131     gather.activate();
2132   }
2133
2134   mdcache->shutdown_start();
2135 }
2136
2137 void MDSRank::stopping_done()
2138 {
2139   dout(2) << "Finished stopping..." << dendl;
2140
2141   // tell monitor we shut down cleanly.
2142   request_state(MDSMap::STATE_STOPPED);
2143 }
2144
/**
 * React to a new MDSMap while this daemon holds a rank.
 *
 * Compares the current map (the `mdsmap` member) against @p oldmap and, in
 * this order:
 *  - refreshes this rank's state (and, on a state change, last_state and
 *    incarnation) from the map;
 *  - records the sender's map epoch when the sender is an MDS;
 *  - respawns on an invalid state transition;
 *  - detects peers that went down or restarted and notifies the relevant
 *    subsystems;
 *  - on a state change of this rank, runs the matching *_start() handler;
 *  - performs the resolve / rejoin / "peer went active" / "peer stopped"
 *    follow-ups;
 *  - queues contexts waiting for a map epoch <= the new epoch;
 *  - applies map-driven settings (OSD epoch barrier, journal write hint,
 *    purge queue limits, scrub abort) and finally forwards both maps to the
 *    MDCache.
 *
 * @param m       message that delivered the map; only its epoch and source
 *                are read here
 * @param oldmap  the map that was in effect before this update
 */
void MDSRankDispatcher::handle_mds_map(
    const cref_t<MMDSMap> &m,
    const MDSMap &oldmap)
{
  // I am only to be passed MDSMaps in which I hold a rank
  ceph_assert(whoami != MDS_RANK_NONE);

  // Pick up our state from the new map, keyed by our global id. last_state
  // and incarnation are only refreshed when the state actually changed.
  MDSMap::DaemonState oldstate = state;
  mds_gid_t mds_gid = mds_gid_t(monc->get_global_id());
  state = mdsmap->get_state_gid(mds_gid);
  if (state != oldstate) {
    last_state = oldstate;
    incarnation = mdsmap->get_inc_gid(mds_gid);
  }

  version_t epoch = m->get_epoch();

  // note source's map version
  if (m->get_source().is_mds() &&
      peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] < epoch) {
    dout(15) << " peer " << m->get_source()
             << " has mdsmap epoch >= " << epoch
             << dendl;
    peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] = epoch;
  }

  // Validate state transitions while I hold a rank
  if (!MDSMap::state_transition_valid(oldstate, state)) {
    derr << "Invalid state transition " << ceph_mds_state_name(oldstate)
      << "->" << ceph_mds_state_name(state) << dendl;
    // NOTE(review): presumably respawn() does not return — confirm
    respawn();
  }

  if (oldstate != state) {
    // update messenger.
    // In standby-replay the messenger name is derived from our gid;
    // in every other state it is derived from our rank.
    if (state == MDSMap::STATE_STANDBY_REPLAY) {
      dout(1) << "handle_mds_map i am now mds." << mds_gid << "." << incarnation
              << " replaying mds." << whoami << "." << incarnation << dendl;
      messenger->set_myname(entity_name_t::MDS(mds_gid));
    } else {
      dout(1) << "handle_mds_map i am now mds." << whoami << "." << incarnation << dendl;
      messenger->set_myname(entity_name_t::MDS(whoami));
    }
  }

  // tell objecter my incarnation
  if (objecter->get_client_incarnation() != incarnation)
    objecter->set_client_incarnation(incarnation);

  // min_compat_client changed (and is below the max release): let the
  // server recompute the required client features
  if (mdsmap->get_min_compat_client() < ceph_release_t::max &&
      oldmap.get_min_compat_client() != mdsmap->get_min_compat_client())
    server->update_required_client_features();

  // for debug
  if (g_conf()->mds_dump_cache_on_map)
    mdcache->dump_cache();

  cluster_degraded = mdsmap->is_degraded();

  // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap.
  // the 'restart' set tracks ranks that have restarted since the old mdsmap
  set<mds_rank_t> restart;
  // replaying mds does not communicate with other ranks
  if (state >= MDSMap::STATE_RESOLVE) {
    // did someone fail?
    //   new down?
    set<mds_rank_t> olddown, down;
    oldmap.get_down_mds_set(&olddown);
    mdsmap->get_down_mds_set(&down);
    for (const auto& r : down) {
      // down now, but had an instance and was not down in the old map
      if (oldmap.have_inst(r) && olddown.count(r) == 0) {
        messenger->mark_down_addrs(oldmap.get_addrs(r));
        handle_mds_failure(r);
      }
    }

    // did someone fail?
    //   did their addr/inst change?
    set<mds_rank_t> up;
    mdsmap->get_up_mds_set(up);
    for (const auto& r : up) {
      auto& info = mdsmap->get_info(r);
      if (oldmap.have_inst(r)) {
        auto& oldinfo = oldmap.get_info(r);
        // a different incarnation number means the rank restarted
        if (info.inc != oldinfo.inc) {
          messenger->mark_down_addrs(oldinfo.get_addrs());
          if (info.state == MDSMap::STATE_REPLAY ||
              info.state == MDSMap::STATE_RESOLVE) {
            restart.insert(r);
            handle_mds_failure(r);
          } else {
            ceph_assert(info.state == MDSMap::STATE_STARTING ||
                   info.state == MDSMap::STATE_ACTIVE);
            // -> stopped (missing) -> starting -> active
            restart.insert(r);
            mdcache->migrator->handle_mds_failure_or_stop(r);
            if (mdsmap->get_tableserver() == whoami)
              snapserver->handle_mds_failure_or_stop(r);
          }
        }
      } else {
        // rank had no instance in the old map
        if (info.state == MDSMap::STATE_REPLAY ||
            info.state == MDSMap::STATE_RESOLVE) {
          // -> starting/creating (missing) -> active (missing) -> replay -> resolve
          restart.insert(r);
          handle_mds_failure(r);
        } else {
          ceph_assert(info.state == MDSMap::STATE_CREATING ||
                 info.state == MDSMap::STATE_STARTING ||
                 info.state == MDSMap::STATE_ACTIVE);
        }
      }
    }
  }

  // did it change?
  if (oldstate != state) {
    dout(1) << "handle_mds_map state change "
            << ceph_mds_state_name(oldstate) << " --> "
            << ceph_mds_state_name(state) << dendl;
    beacon.set_want_state(*mdsmap, state);

    if (oldstate == MDSMap::STATE_STANDBY_REPLAY) {
        dout(10) << "Monitor activated us! Deactivating replay loop" << dendl;
        // NOTE(review): plain assert() here, unlike the ceph_assert() used
        // everywhere else in this function
        assert (state == MDSMap::STATE_REPLAY);
    } else {
      // did i just recover?
      if ((is_active() || is_clientreplay()) &&
          (oldstate == MDSMap::STATE_CREATING ||
           oldstate == MDSMap::STATE_REJOIN ||
           oldstate == MDSMap::STATE_RECONNECT))
        recovery_done(oldstate);

      // dispatch to the entry handler of the state we are now in
      if (is_active()) {
        active_start();
      } else if (is_any_replay()) {
        replay_start();
      } else if (is_resolve()) {
        resolve_start();
      } else if (is_reconnect()) {
        reconnect_start();
      } else if (is_rejoin()) {
        rejoin_start();
      } else if (is_clientreplay()) {
        clientreplay_start();
      } else if (is_creating()) {
        boot_create();
      } else if (is_starting()) {
        boot_start();
      } else if (is_stopping()) {
        ceph_assert(oldstate == MDSMap::STATE_ACTIVE);
        stopping_start();
      }
    }
  }

  // RESOLVE
  // is someone else newly resolving?
  if (state >= MDSMap::STATE_RESOLVE) {
    // recover snaptable
    if (mdsmap->get_tableserver() == whoami) {
      if (oldstate < MDSMap::STATE_RESOLVE) {
        // we ourselves just reached resolve (or later): finish snapserver
        // recovery with every rank that is at least resolving
        set<mds_rank_t> s;
        mdsmap->get_mds_set_lower_bound(s, MDSMap::STATE_RESOLVE);
        snapserver->finish_recovery(s);
      } else {
        set<mds_rank_t> old_set, new_set;
        oldmap.get_mds_set_lower_bound(old_set, MDSMap::STATE_RESOLVE);
        mdsmap->get_mds_set_lower_bound(new_set, MDSMap::STATE_RESOLVE);
        for (const auto& r : new_set) {
          if (r == whoami)
            continue; // not me
          if (!old_set.count(r) || restart.count(r)) {  // newly so?
            snapserver->handle_mds_recovery(r);
          }
        }
      }
    }

    // the cluster newly has resolving ranks, or someone restarted while it
    // still has them: recompute the recovery set and (re)send resolves
    if ((!oldmap.is_resolving() || !restart.empty()) && mdsmap->is_resolving()) {
      set<mds_rank_t> resolve;
      mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
      dout(10) << " resolve set is " << resolve << dendl;
      calc_recovery_set();
      mdcache->send_resolves();
    }
  }

  // REJOIN
  // is everybody finally rejoining?
  if (state >= MDSMap::STATE_REJOIN) {
    // did we start?
    if (!oldmap.is_rejoining() && mdsmap->is_rejoining())
      rejoin_joint_start();

    // did we finish?
    if (g_conf()->mds_dump_cache_after_rejoin &&
        oldmap.is_rejoining() && !mdsmap->is_rejoining())
      mdcache->dump_cache();      // for DEBUG only

    if (oldstate >= MDSMap::STATE_REJOIN ||
        oldstate == MDSMap::STATE_STARTING) {
      // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them.
      set<mds_rank_t> olddis, dis;
      oldmap.get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN);
      mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN);
      for (const auto& r : dis) {
        if (r == whoami)
          continue; // not me
        if (!olddis.count(r) || restart.count(r)) {  // newly so?
          mdcache->kick_discovers(r);
          mdcache->kick_open_ino_peers(r);
        }
      }
    }
  }

  // cluster left the degraded state: wake the waiters registered under
  // MDS_RANK_NONE in waiting_for_active_peer
  if (oldmap.is_degraded() && !cluster_degraded && state >= MDSMap::STATE_ACTIVE) {
    dout(1) << "cluster recovered." << dendl;
    auto it = waiting_for_active_peer.find(MDS_RANK_NONE);
    if (it != waiting_for_active_peer.end()) {
      queue_waiters(it->second);
      waiting_for_active_peer.erase(it);
    }
  }

  // did someone go active?
  if (state >= MDSMap::STATE_CLIENTREPLAY &&
      oldstate >= MDSMap::STATE_CLIENTREPLAY) {
    set<mds_rank_t> oldactive, active;
    oldmap.get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY);
    mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
    for (const auto& r : active) {
      if (r == whoami)
        continue; // not me
      if (!oldactive.count(r) || restart.count(r))  // newly so?
        handle_mds_recovery(r);
    }
  }

  if (is_clientreplay() || is_active() || is_stopping()) {
    // did anyone stop?
    set<mds_rank_t> oldstopped, stopped;
    oldmap.get_stopped_mds_set(oldstopped);
    mdsmap->get_stopped_mds_set(stopped);
    for (const auto& r : stopped)
      if (oldstopped.count(r) == 0) {     // newly so?
        mdcache->migrator->handle_mds_failure_or_stop(r);
        if (mdsmap->get_tableserver() == whoami)
          snapserver->handle_mds_failure_or_stop(r);
      }
  }

  {
    // queue every context that was waiting for an epoch we have now reached;
    // the list is swapped out before the map entry is erased
    map<epoch_t,MDSContext::vec >::iterator p = waiting_for_mdsmap.begin();
    while (p != waiting_for_mdsmap.end() && p->first <= mdsmap->get_epoch()) {
      MDSContext::vec ls;
      ls.swap(p->second);
      waiting_for_mdsmap.erase(p++);
      queue_waiters(ls);
    }
  }

  if (is_active()) {
    // Before going active, set OSD epoch barrier to latest (so that
    // we don't risk handing out caps to clients with old OSD maps that
    // might not include barriers from the previous incarnation of this MDS)
    set_osd_epoch_barrier(objecter->with_osdmap(
                            std::mem_fn(&OSDMap::get_epoch)));

    /* Now check if we should hint to the OSD that a read may follow */
    if (mdsmap->has_standby_replay(whoami))
      mdlog->set_write_iohint(0);
    else
      mdlog->set_write_iohint(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
  }

  if (oldmap.get_max_mds() != mdsmap->get_max_mds()) {
    purge_queue.update_op_limit(*mdsmap);
  }

  if (mdsmap->get_inline_data_enabled() && !oldmap.get_inline_data_enabled())
    dout(0) << "WARNING: inline_data support has been deprecated and will be removed in a future release" << dendl;

  // a running scrub is aborted as soon as max_mds exceeds 1
  if (scrubstack->is_scrubbing()) {
    if (mdsmap->get_max_mds() > 1) {
      auto c = new C_MDSInternalNoop;
      scrubstack->scrub_abort(c);
    }
  }
  mdcache->handle_mdsmap(*mdsmap, oldmap);
}
2437
2438 void MDSRank::handle_mds_recovery(mds_rank_t who)
2439 {
2440   dout(5) << "handle_mds_recovery mds." << who << dendl;
2441
2442   mdcache->handle_mds_recovery(who);
2443
2444   queue_waiters(waiting_for_active_peer[who]);
2445   waiting_for_active_peer.erase(who);
2446 }
2447
2448 void MDSRank::handle_mds_failure(mds_rank_t who)
2449 {
2450   if (who == whoami) {
2451     dout(5) << "handle_mds_failure for myself; not doing anything" << dendl;
2452     return;
2453   }
2454   dout(5) << "handle_mds_failure mds." << who << dendl;
2455
2456   mdcache->handle_mds_failure(who);
2457
2458   if (mdsmap->get_tableserver() == whoami)
2459     snapserver->handle_mds_failure_or_stop(who);
2460
2461   snapclient->handle_mds_failure(who);
2462 }
2463
2464 void MDSRankDispatcher::handle_asok_command(
2465   std::string_view command,
2466   const cmdmap_t& cmdmap,
2467   Formatter *f,
2468   const bufferlist &inbl,
2469   std::function<void(int,const std::string&,bufferlist&)> on_finish)
2470 {
2471   int r = 0;
2472   stringstream ss;
2473   bufferlist outbl;
2474   if (command == "dump_ops_in_flight" ||
2475       command == "ops") {
2476     if (!op_tracker.dump_ops_in_flight(f)) {
2477       ss << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2478     }
2479   } else if (command == "dump_blocked_ops") {
2480     if (!op_tracker.dump_ops_in_flight(f, true)) {
2481       ss << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2482     }
2483   } else if (command == "dump_historic_ops") {
2484     if (!op_tracker.dump_historic_ops(f)) {
2485       ss << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2486     }
2487   } else if (command == "dump_historic_ops_by_duration") {
2488     if (!op_tracker.dump_historic_ops(f, true)) {
2489       ss << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
2490     }
2491   } else if (command == "osdmap barrier") {
2492     int64_t target_epoch = 0;
2493     bool got_val = cmd_getval(cmdmap, "target_epoch", target_epoch);
2494
2495     if (!got_val) {
2496       ss << "no target epoch given";
2497       r = -EINVAL;
2498       goto out;
2499     }
2500     {
2501       std::lock_guard l(mds_lock);
2502       set_osd_epoch_barrier(target_epoch);
2503     }
2504     C_SaferCond cond;
2505     bool already_got = objecter->wait_for_map(target_epoch, &cond);
2506     if (!already_got) {
2507       dout(4) << __func__ << ": waiting for OSD epoch " << target_epoch << dendl;
2508       cond.wait();
2509     }
2510   } else if (command == "session ls" ||
2511              command == "client ls") {
2512     std::lock_guard l(mds_lock);
2513     std::vector<std::string> filter_args;
2514     cmd_getval(cmdmap, "filters", filter_args);
2515     SessionFilter filter;
2516     r = filter.parse(filter_args, &ss);
2517     if (r != 0) {
2518       goto out;
2519     }
2520     dump_sessions(filter, f);
2521   } else if (command == "session evict" ||
2522              command == "client evict") {
2523     std::lock_guard l(mds_lock);
2524     std::vector<std::string> filter_args;
2525     cmd_getval(cmdmap, "filters", filter_args);
2526
2527     SessionFilter filter;
2528     r = filter.parse(filter_args, &ss);
2529     if (r != 0) {
2530       r = -EINVAL;
2531       goto out;
2532     }
2533     evict_clients(filter, on_finish);
2534     return;
2535   } else if (command == "session kill") {
2536     std::string client_id;
2537     if (!cmd_getval(cmdmap, "client_id", client_id)) {
2538       ss << "Invalid client_id specified";
2539       r = -ENOENT;
2540       goto out;
2541     }
2542     std::lock_guard l(mds_lock);
2543     bool evicted = evict_client(strtol(client_id.c_str(), 0, 10), true,
2544         g_conf()->mds_session_blacklist_on_evict, ss);
2545     if (!evicted) {
2546       dout(15) << ss.str() << dendl;
2547       r = -ENOENT;
2548     }
2549   } else if (command == "session config" ||
2550              command == "client config") {
2551     int64_t client_id;
2552     std::string option;
2553     std::string value;
2554
2555     cmd_getval(cmdmap, "client_id", client_id);
2556     cmd_getval(cmdmap, "option", option);
2557     bool got_value = cmd_getval(cmdmap, "value", value);
2558
2559     std::lock_guard l(mds_lock);
2560     r = config_client(client_id, !got_value, option, value, ss);
2561   } else if (command == "scrub start" ||
2562              command == "scrub_start") {
2563     string path;
2564     string tag;
2565     vector<string> scrubop_vec;
2566     cmd_getval(cmdmap, "scrubops", scrubop_vec);
2567     cmd_getval(cmdmap, "path", path);
2568     cmd_getval(cmdmap, "tag", tag);
2569
2570     /* Multiple MDS scrub is not currently supported. See also: https://tracker.ceph.com/issues/12274 */
2571     if (mdsmap->get_max_mds() > 1) {
2572       ss << "Scrub is not currently supported for multiple active MDS. Please reduce max_mds to 1 and then scrub.";
2573       r = -EINVAL;
2574       goto out;
2575     }
2576
2577     finisher->queue(
2578       new LambdaContext(
2579         [this, on_finish, f, path, tag, scrubop_vec](int r) {
2580           command_scrub_start(
2581             f, path, tag, scrubop_vec,
2582             new LambdaContext(
2583               [on_finish](int r) {
2584                 bufferlist outbl;
2585                 on_finish(r, {}, outbl);
2586               }));
2587         }));
2588     return;
2589   } else if (command == "scrub abort") {
2590     finisher->queue(
2591       new LambdaContext(
2592         [this, on_finish, f](int r) {
2593           command_scrub_abort(
2594             f,
2595             new LambdaContext(
2596               [on_finish, f](int r) {
2597                 bufferlist outbl;
2598                 f->open_object_section("result");
2599                 f->dump_int("return_code", r);
2600                 f->close_section();
2601                 on_finish(r, {}, outbl);
2602               }));
2603         }));
2604     return;
2605   } else if (command == "scrub pause") {
2606     finisher->queue(
2607       new LambdaContext(
2608         [this, on_finish, f](int r) {
2609           command_scrub_pause(
2610             f,
2611             new LambdaContext(
2612               [on_finish, f](int r) {
2613                 bufferlist outbl;
2614                 f->open_object_section("result");
2615                 f->dump_int("return_code", r);
2616                 f->close_section();
2617                 on_finish(r, {}, outbl);
2618               }));
2619         }));
2620     return;
2621   } else if (command == "scrub resume") {
2622     command_scrub_resume(f);
2623   } else if (command == "scrub status") {
2624     command_scrub_status(f);
2625   } else if (command == "tag path") {
2626     string path;
2627     cmd_getval(cmdmap, "path", path);
2628     string tag;
2629     cmd_getval(cmdmap, "tag", tag);
2630     command_tag_path(f, path, tag);
2631   } else if (command == "flush_path") {
2632     string path;
2633     cmd_getval(cmdmap, "path", path);
2634     command_flush_path(f, path);
2635   } else if (command == "flush journal") {
2636     command_flush_journal(f);
2637   } else if (command == "get subtrees") {
2638     command_get_subtrees(f);
2639   } else if (command == "export dir") {
2640     string path;
2641     if(!cmd_getval(cmdmap, "path", path)) {
2642       ss << "malformed path";
2643       r = -EINVAL;
2644       goto out;
2645     }
2646     int64_t rank;
2647     if(!cmd_getval(cmdmap, "rank", rank)) {
2648       ss << "malformed rank";
2649       r = -EINVAL;
2650       goto out;
2651     }
2652     command_export_dir(f, path, (mds_rank_t)rank);
2653   } else if (command == "dump cache") {
2654     std::lock_guard l(mds_lock);
2655     string path;
2656     if (!cmd_getval(cmdmap, "path", path)) {
2657       r = mdcache->dump_cache(f);
2658     } else {
2659       r = mdcache->dump_cache(path);
2660     }
2661   } else if (command == "cache drop") {
2662     int64_t timeout = 0;
2663     cmd_getval(cmdmap, "timeout", timeout);
2664     finisher->queue(
2665       new LambdaContext(
2666         [this, on_finish, f, timeout](int r) {
2667           command_cache_drop(
2668             timeout, f,
2669             new LambdaContext(
2670               [on_finish](int r) {
2671                 bufferlist outbl;
2672                 on_finish(r, {}, outbl);
2673               }));
2674         }));
2675     return;
2676   } else if (command == "cache status") {
2677     std::lock_guard l(mds_lock);
2678     mdcache->cache_status(f);
2679   } else if (command == "dump tree") {
2680     command_dump_tree(cmdmap, ss, f);
2681   } else if (command == "dump loads") {
2682     std::lock_guard l(mds_lock);
2683     r = balancer->dump_loads(f);
2684   } else if (command == "dump snaps") {
2685     std::lock_guard l(mds_lock);
2686     string server;
2687     cmd_getval(cmdmap, "server", server);
2688     if (server == "--server") {
2689       if (mdsmap->get_tableserver() == whoami) {
2690         snapserver->dump(f);
2691       } else {
2692         r = -EXDEV;
2693         ss << "Not snapserver";
2694       }
2695     } else {
2696       r = snapclient->dump_cache(f);
2697     }
2698   } else if (command == "force_readonly") {
2699     std::lock_guard l(mds_lock);
2700     mdcache->force_readonly();
2701   } else if (command == "dirfrag split") {
2702     command_dirfrag_split(cmdmap, ss);
2703   } else if (command == "dirfrag merge") {
2704     command_dirfrag_merge(cmdmap, ss);
2705   } else if (command == "dirfrag ls") {
2706     command_dirfrag_ls(cmdmap, ss, f);
2707   } else if (command == "openfiles ls") {
2708     command_openfiles_ls(f);
2709   } else if (command == "dump inode") {
2710     command_dump_inode(f, cmdmap, ss);
2711   } else if (command == "damage ls") {
2712     std::lock_guard l(mds_lock);
2713     damage_table.dump(f);
2714   } else if (command == "damage rm") {
2715     std::lock_guard l(mds_lock);
2716     damage_entry_id_t id = 0;
2717     if (!cmd_getval(cmdmap, "damage_id", (int64_t&)id)) {
2718       r = -EINVAL;
2719       goto out;
2720     }
2721     damage_table.erase(id);
2722   } else {
2723     r = -ENOSYS;
2724   }
2725 out:
2726   on_finish(r, ss.str(), outbl);
2727 }
2728
2729 /**
2730  * This function drops the mds_lock, so don't do anything with
2731  * MDSRank after calling it (we could have gone into shutdown): just
2732  * send your result back to the calling client and finish.
2733  */
2734 void MDSRankDispatcher::evict_clients(
2735   const SessionFilter &filter,
2736   std::function<void(int,const std::string&,bufferlist&)> on_finish)
2737 {
2738   bufferlist outbl;
2739   if (is_any_replay()) {
2740     on_finish(-EAGAIN, "MDS is replaying log", outbl);
2741     return;
2742   }
2743
2744   std::vector<Session*> victims;
2745   const auto& sessions = sessionmap.get_sessions();
2746   for (const auto& p : sessions)  {
2747     if (!p.first.is_client()) {
2748       continue;
2749     }
2750
2751     Session *s = p.second;
2752
2753     if (filter.match(*s, std::bind(&Server::waiting_for_reconnect, server,
2754                                    std::placeholders::_1))) {
2755       victims.push_back(s);
2756     }
2757   }
2758
2759   dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
2760
2761   if (victims.empty()) {
2762     on_finish(0, {}, outbl);
2763     return;
2764   }
2765
2766   C_GatherBuilder gather(g_ceph_context,
2767                          new LambdaContext([on_finish](int r) {
2768                                              bufferlist bl;
2769                                              on_finish(r, {}, bl);
2770                                            }));
2771   for (const auto s : victims) {
2772     std::stringstream ss;
2773     evict_client(s->get_client().v, false,
2774                  g_conf()->mds_session_blacklist_on_evict, ss, gather.new_sub());
2775   }
2776   gather.activate();
2777 }
2778
2779 void MDSRankDispatcher::dump_sessions(const SessionFilter &filter, Formatter *f) const
2780 {
2781   // Dump sessions, decorated with recovery/replay status
2782   f->open_array_section("sessions");
2783   for (auto& [name, s] : sessionmap.get_sessions()) {
2784     if (!name.is_client()) {
2785       continue;
2786     }
2787
2788     if (!filter.match(*s, std::bind(&Server::waiting_for_reconnect, server, std::placeholders::_1))) {
2789       continue;
2790     }
2791
2792     f->dump_object("session", *s);
2793   }
2794   f->close_section(); // sessions
2795 }
2796
2797 void MDSRank::command_scrub_start(Formatter *f,
2798                                   std::string_view path, std::string_view tag,
2799                                   const vector<string>& scrubop_vec, Context *on_finish)
2800 {
2801   bool force = false;
2802   bool recursive = false;
2803   bool repair = false;
2804   for (auto &op : scrubop_vec) {
2805     if (op == "force")
2806       force = true;
2807     else if (op == "recursive")
2808       recursive = true;
2809     else if (op == "repair")
2810       repair = true;
2811   }
2812
2813   std::lock_guard l(mds_lock);
2814   mdcache->enqueue_scrub(path, tag, force, recursive, repair, f, on_finish);
2815   // scrub_dentry() finishers will dump the data for us; we're done!
2816 }
2817
2818 void MDSRank::command_tag_path(Formatter *f,
2819     std::string_view path, std::string_view tag)
2820 {
2821   C_SaferCond scond;
2822   {
2823     std::lock_guard l(mds_lock);
2824     mdcache->enqueue_scrub(path, tag, true, true, false, f, &scond);
2825   }
2826   scond.wait();
2827 }
2828
2829 void MDSRank::command_scrub_abort(Formatter *f, Context *on_finish) {
2830   std::lock_guard l(mds_lock);
2831   scrubstack->scrub_abort(on_finish);
2832 }
2833
2834 void MDSRank::command_scrub_pause(Formatter *f, Context *on_finish) {
2835   std::lock_guard l(mds_lock);
2836   scrubstack->scrub_pause(on_finish);
2837 }
2838
2839 void MDSRank::command_scrub_resume(Formatter *f) {
2840   std::lock_guard l(mds_lock);
2841   int r = scrubstack->scrub_resume();
2842
2843   f->open_object_section("result");
2844   f->dump_int("return_code", r);
2845   f->close_section();
2846 }
2847
2848 void MDSRank::command_scrub_status(Formatter *f) {
2849   std::lock_guard l(mds_lock);
2850   scrubstack->scrub_status(f);
2851 }
2852
2853 void MDSRank::command_flush_path(Formatter *f, std::string_view path)
2854 {
2855   C_SaferCond scond;
2856   {
2857     std::lock_guard l(mds_lock);
2858     mdcache->flush_dentry(path, &scond);
2859   }
2860   int r = scond.wait();
2861   f->open_object_section("results");
2862   f->dump_int("return_code", r);
2863   f->close_section(); // results
2864 }
2865
2866 // synchronous wrapper around "journal flush" asynchronous context
2867 // execution.
2868 void MDSRank::command_flush_journal(Formatter *f) {
2869   ceph_assert(f != NULL);
2870
2871   C_SaferCond cond;
2872   std::stringstream ss;
2873   {
2874     std::lock_guard locker(mds_lock);
2875     C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, this, &ss, &cond);
2876     flush_journal->send();
2877   }
2878   int r = cond.wait();
2879
2880   f->open_object_section("result");
2881   f->dump_string("message", ss.str());
2882   f->dump_int("return_code", r);
2883   f->close_section();
2884 }
2885
2886 void MDSRank::command_get_subtrees(Formatter *f)
2887 {
2888   ceph_assert(f != NULL);
2889   std::lock_guard l(mds_lock);
2890
2891   std::vector<CDir*> subtrees;
2892   mdcache->get_subtrees(subtrees);
2893
2894   f->open_array_section("subtrees");
2895   for (const auto& dir : subtrees) {
2896     f->open_object_section("subtree");
2897     {
2898       f->dump_bool("is_auth", dir->is_auth());
2899       f->dump_int("auth_first", dir->get_dir_auth().first);
2900       f->dump_int("auth_second", dir->get_dir_auth().second);
2901       f->dump_int("export_pin", dir->inode->get_export_pin(false, false));
2902       f->dump_bool("distributed_ephemeral_pin", dir->inode->is_ephemeral_dist());
2903       f->dump_bool("random_ephemeral_pin", dir->inode->is_ephemeral_rand());
2904       f->dump_int("ephemeral_pin", mdcache->hash_into_rank_bucket(dir->inode->ino()));
2905       f->open_object_section("dir");
2906       dir->dump(f);
2907       f->close_section();
2908     }
2909     f->close_section();
2910   }
2911   f->close_section();
2912 }
2913
2914
2915 void MDSRank::command_export_dir(Formatter *f,
2916     std::string_view path,
2917     mds_rank_t target)
2918 {
2919   int r = _command_export_dir(path, target);
2920   f->open_object_section("results");
2921   f->dump_int("return_code", r);
2922   f->close_section(); // results
2923 }
2924
2925 int MDSRank::_command_export_dir(
2926     std::string_view path,
2927     mds_rank_t target)
2928 {
2929   std::lock_guard l(mds_lock);
2930   filepath fp(path);
2931
2932   if (target == whoami || !mdsmap->is_up(target) || !mdsmap->is_in(target)) {
2933     derr << "bad MDS target " << target << dendl;
2934     return -ENOENT;
2935   }
2936
2937   CInode *in = mdcache->cache_traverse(fp);
2938   if (!in) {
2939     derr << "Bath path '" << path << "'" << dendl;
2940     return -ENOENT;
2941   }
2942   CDir *dir = in->get_dirfrag(frag_t());
2943   if (!dir || !(dir->is_auth())) {
2944     derr << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl;
2945     return -EINVAL;
2946   }
2947
2948   mdcache->migrator->export_dir(dir, target);
2949   return 0;
2950 }
2951
2952 void MDSRank::command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f)
2953 {
2954   std::string root;
2955   int64_t depth;
2956   cmd_getval(cmdmap, "root", root);
2957   if (!cmd_getval(cmdmap, "depth", depth))
2958     depth = -1;
2959   std::lock_guard l(mds_lock);
2960   CInode *in = mdcache->cache_traverse(filepath(root.c_str()));
2961   if (!in) {
2962     ss << "root inode is not in cache";
2963     return;
2964   }
2965   f->open_array_section("inodes");
2966   mdcache->dump_tree(in, 0, depth, f);
2967   f->close_section();
2968 }
2969
2970 CDir *MDSRank::_command_dirfrag_get(
2971     const cmdmap_t &cmdmap,
2972     std::ostream &ss)
2973 {
2974   std::string path;
2975   bool got = cmd_getval(cmdmap, "path", path);
2976   if (!got) {
2977     ss << "missing path argument";
2978     return NULL;
2979   }
2980
2981   std::string frag_str;
2982   if (!cmd_getval(cmdmap, "frag", frag_str)) {
2983     ss << "missing frag argument";
2984     return NULL;
2985   }
2986
2987   CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
2988   if (!in) {
2989     // TODO really we should load something in if it's not in cache,
2990     // but the infrastructure is harder, and we might still be unable
2991     // to act on it if someone else is auth.
2992     ss << "directory '" << path << "' inode not in cache";
2993     return NULL;
2994   }
2995
2996   frag_t fg;
2997
2998   if (!fg.parse(frag_str.c_str())) {
2999     ss << "frag " << frag_str << " failed to parse";
3000     return NULL;
3001   }
3002
3003   CDir *dir = in->get_dirfrag(fg);
3004   if (!dir) {
3005     ss << "frag " << in->ino() << "/" << fg << " not in cache ("
3006           "use `dirfrag ls` to see if it should exist)";
3007     return NULL;
3008   }
3009
3010   if (!dir->is_auth()) {
3011     ss << "frag " << dir->dirfrag() << " not auth (auth = "
3012        << dir->authority() << ")";
3013     return NULL;
3014   }
3015
3016   return dir;
3017 }
3018
3019 bool MDSRank::command_dirfrag_split(
3020     cmdmap_t cmdmap,
3021     std::ostream &ss)
3022 {
3023   std::lock_guard l(mds_lock);
3024   int64_t by = 0;
3025   if (!cmd_getval(cmdmap, "bits", by)) {
3026     ss << "missing bits argument";
3027     return false;
3028   }
3029
3030   if (by <= 0) {
3031     ss << "must split by >0 bits";
3032     return false;
3033   }
3034
3035   CDir *dir = _command_dirfrag_get(cmdmap, ss);
3036   if (!dir) {
3037     return false;
3038   }
3039
3040   mdcache->split_dir(dir, by);
3041
3042   return true;
3043 }
3044
3045 bool MDSRank::command_dirfrag_merge(
3046     cmdmap_t cmdmap,
3047     std::ostream &ss)
3048 {
3049   std::lock_guard l(mds_lock);
3050   std::string path;
3051   bool got = cmd_getval(cmdmap, "path", path);
3052   if (!got) {
3053     ss << "missing path argument";
3054     return false;
3055   }
3056
3057   std::string frag_str;
3058   if (!cmd_getval(cmdmap, "frag", frag_str)) {
3059     ss << "missing frag argument";
3060     return false;
3061   }
3062
3063   CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
3064   if (!in) {
3065     ss << "directory '" << path << "' inode not in cache";
3066     return false;
3067   }
3068
3069   frag_t fg;
3070   if (!fg.parse(frag_str.c_str())) {
3071     ss << "frag " << frag_str << " failed to parse";
3072     return false;
3073   }
3074
3075   mdcache->merge_dir(in, fg);
3076
3077   return true;
3078 }
3079
3080 bool MDSRank::command_dirfrag_ls(
3081     cmdmap_t cmdmap,
3082     std::ostream &ss,
3083     Formatter *f)
3084 {
3085   std::lock_guard l(mds_lock);
3086   std::string path;
3087   bool got = cmd_getval(cmdmap, "path", path);
3088   if (!got) {
3089     ss << "missing path argument";
3090     return false;
3091   }
3092
3093   CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
3094   if (!in) {
3095     ss << "directory inode not in cache";
3096     return false;
3097   }
3098
3099   f->open_array_section("frags");
3100   frag_vec_t leaves;
3101   // NB using get_leaves_under instead of get_dirfrags to give
3102   // you the list of what dirfrags may exist, not which are in cache
3103   in->dirfragtree.get_leaves_under(frag_t(), leaves);
3104   for (const auto& leaf : leaves) {
3105     f->open_object_section("frag");
3106     f->dump_int("value", leaf.value());
3107     f->dump_int("bits", leaf.bits());
3108     CachedStackStringStream css;
3109     *css << std::hex << leaf.value() << "/" << std::dec << leaf.bits();
3110     f->dump_string("str", css->strv());
3111     f->close_section();
3112   }
3113   f->close_section();
3114
3115   return true;
3116 }
3117
3118 void MDSRank::command_openfiles_ls(Formatter *f)
3119 {
3120   std::lock_guard l(mds_lock);
3121   mdcache->dump_openfiles(f);
3122 }
3123
3124 void MDSRank::command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss)
3125 {
3126   std::lock_guard l(mds_lock);
3127   int64_t number;
3128   bool got = cmd_getval(cmdmap, "number", number);
3129   if (!got) {
3130     ss << "missing inode number";
3131     return;
3132   }
3133
3134   bool success = mdcache->dump_inode(f, number);
3135   if (!success) {
3136     ss << "dump inode failed, wrong inode number or the inode is not cached";
3137   }
3138 }
3139
3140 void MDSRank::dump_status(Formatter *f) const
3141 {
3142   if (state == MDSMap::STATE_REPLAY ||
3143       state == MDSMap::STATE_STANDBY_REPLAY) {
3144     mdlog->dump_replay_status(f);
3145   } else if (state == MDSMap::STATE_RESOLVE) {
3146     mdcache->dump_resolve_status(f);
3147   } else if (state == MDSMap::STATE_RECONNECT) {
3148     server->dump_reconnect_status(f);
3149   } else if (state == MDSMap::STATE_REJOIN) {
3150     mdcache->dump_rejoin_status(f);
3151   } else if (state == MDSMap::STATE_CLIENTREPLAY) {
3152     dump_clientreplay_status(f);
3153   }
3154   f->dump_float("rank_uptime", get_uptime().count());
3155 }
3156
3157 void MDSRank::dump_clientreplay_status(Formatter *f) const
3158 {
3159   f->open_object_section("clientreplay_status");
3160   f->dump_unsigned("clientreplay_queue", replay_queue.size());
3161   f->dump_unsigned("active_replay", mdcache->get_num_client_requests());
3162   f->close_section();
3163 }
3164
3165 void MDSRankDispatcher::update_log_config()
3166 {
3167   map<string,string> log_to_monitors;
3168   map<string,string> log_to_syslog;
3169   map<string,string> log_channel;
3170   map<string,string> log_prio;
3171   map<string,string> log_to_graylog;
3172   map<string,string> log_to_graylog_host;
3173   map<string,string> log_to_graylog_port;
3174   uuid_d fsid;
3175   string host;
3176
3177   if (parse_log_client_options(g_ceph_context, log_to_monitors, log_to_syslog,
3178                                log_channel, log_prio, log_to_graylog,
3179                                log_to_graylog_host, log_to_graylog_port,
3180                                fsid, host) == 0)
3181     clog->update_config(log_to_monitors, log_to_syslog,
3182                         log_channel, log_prio, log_to_graylog,
3183                         log_to_graylog_host, log_to_graylog_port,
3184                         fsid, host);
3185   dout(10) << __func__ << " log_to_monitors " << log_to_monitors << dendl;
3186 }
3187
// Build and register the perf counter sets owned by this rank:
//   - "mds"     (indices l_mds_first..l_mds_last), stored in `logger`
//   - "mds_mem" (indices l_mdm_first..l_mdm_last), stored in `mlogger`
// and then ask the log, server, purge queue, session map and cache to set
// up their own counters.
void MDSRank::create_logger()
{
  dout(10) << "create_logger" << dendl;
  {
    PerfCountersBuilder mds_plb(g_ceph_context, "mds", l_mds_first, l_mds_last);

    // super useful (high prio) perf stats
    // (these entries pass a short nickname and an explicit priority)
    mds_plb.add_u64_counter(l_mds_request, "request", "Requests", "req",
                            PerfCountersBuilder::PRIO_CRITICAL);
    mds_plb.add_time_avg(l_mds_reply_latency, "reply_latency", "Reply latency", "rlat",
                         PerfCountersBuilder::PRIO_CRITICAL);
    mds_plb.add_u64(l_mds_inodes, "inodes", "Inodes", "inos",
                    PerfCountersBuilder::PRIO_CRITICAL);
    mds_plb.add_u64_counter(l_mds_forward, "forward", "Forwarding request", "fwd",
                            PerfCountersBuilder::PRIO_INTERESTING);
    mds_plb.add_u64(l_mds_caps, "caps", "Capabilities", "caps",
                    PerfCountersBuilder::PRIO_INTERESTING);
    mds_plb.add_u64_counter(l_mds_exported_inodes, "exported_inodes", "Exported inodes",
                            "exi", PerfCountersBuilder::PRIO_INTERESTING);
    mds_plb.add_u64_counter(l_mds_imported_inodes, "imported_inodes", "Imported inodes",
                            "imi", PerfCountersBuilder::PRIO_INTERESTING);

    // useful dir/inode/subtree stats
    // (entries below rely on the default priority set here)
    mds_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
    mds_plb.add_u64(l_mds_root_rfiles, "root_rfiles", "root inode rfiles");
    mds_plb.add_u64(l_mds_root_rbytes, "root_rbytes", "root inode rbytes");
    mds_plb.add_u64(l_mds_root_rsnaps, "root_rsnaps", "root inode rsnaps");
    mds_plb.add_u64_counter(l_mds_dir_fetch, "dir_fetch", "Directory fetch");
    mds_plb.add_u64_counter(l_mds_dir_commit, "dir_commit", "Directory commit");
    mds_plb.add_u64_counter(l_mds_dir_split, "dir_split", "Directory split");
    mds_plb.add_u64_counter(l_mds_dir_merge, "dir_merge", "Directory merge");
    mds_plb.add_u64(l_mds_inodes_pinned, "inodes_pinned", "Inodes pinned");
    mds_plb.add_u64(l_mds_inodes_expired, "inodes_expired", "Inodes expired");
    mds_plb.add_u64(l_mds_inodes_with_caps, "inodes_with_caps",
                    "Inodes with capabilities");
    mds_plb.add_u64(l_mds_subtrees, "subtrees", "Subtrees");
    mds_plb.add_u64(l_mds_load_cent, "load_cent", "Load per cent");
    mds_plb.add_u64_counter(l_mds_openino_dir_fetch, "openino_dir_fetch",
                            "OpenIno incomplete directory fetchings");

    // low prio stats
    mds_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
    mds_plb.add_u64_counter(l_mds_reply, "reply", "Replies");
    mds_plb.add_u64(l_mds_inodes_top, "inodes_top", "Inodes on top");
    mds_plb.add_u64(l_mds_inodes_bottom, "inodes_bottom", "Inodes on bottom");
    mds_plb.add_u64(
      l_mds_inodes_pin_tail, "inodes_pin_tail", "Inodes on pin tail");
    mds_plb.add_u64_counter(l_mds_traverse, "traverse", "Traverses");
    mds_plb.add_u64_counter(l_mds_traverse_hit, "traverse_hit", "Traverse hits");
    mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward",
                            "Traverse forwards");
    mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover",
                            "Traverse directory discovers");
    mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch",
                            "Traverse incomplete directory content fetchings");
    mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino",
                            "Traverse remote dentries");
    mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock",
                            "Traverse locks");
    mds_plb.add_u64(l_mds_dispatch_queue_len, "q", "Dispatch queue length");
    mds_plb.add_u64_counter(l_mds_exported, "exported", "Exports");
    mds_plb.add_u64_counter(l_mds_imported, "imported", "Imports");
    mds_plb.add_u64_counter(l_mds_openino_backtrace_fetch, "openino_backtrace_fetch",
                            "OpenIno backtrace fetchings");
    mds_plb.add_u64_counter(l_mds_openino_peer_discover, "openino_peer_discover",
                            "OpenIno peer inode discovers");

    // Materialize the "mds" counters and register them with the context's
    // perf counter collection.
    logger = mds_plb.create_perf_counters();
    g_ceph_context->get_perfcounters_collection()->add(logger);
  }

  {
    // "mds_mem" counter set: object counts plus heap/RSS figures.
    PerfCountersBuilder mdm_plb(g_ceph_context, "mds_mem", l_mdm_first, l_mdm_last);
    mdm_plb.add_u64(l_mdm_ino, "ino", "Inodes", "ino",
                    PerfCountersBuilder::PRIO_INTERESTING);
    mdm_plb.add_u64(l_mdm_dn, "dn", "Dentries", "dn",
                    PerfCountersBuilder::PRIO_INTERESTING);

    mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
    mdm_plb.add_u64_counter(l_mdm_inoa, "ino+", "Inodes opened");
    mdm_plb.add_u64_counter(l_mdm_inos, "ino-", "Inodes closed");
    mdm_plb.add_u64(l_mdm_dir, "dir", "Directories");
    mdm_plb.add_u64_counter(l_mdm_dira, "dir+", "Directories opened");
    mdm_plb.add_u64_counter(l_mdm_dirs, "dir-", "Directories closed");
    mdm_plb.add_u64_counter(l_mdm_dna, "dn+", "Dentries opened");
    mdm_plb.add_u64_counter(l_mdm_dns, "dn-", "Dentries closed");
    mdm_plb.add_u64(l_mdm_cap, "cap", "Capabilities");
    mdm_plb.add_u64_counter(l_mdm_capa, "cap+", "Capabilities added");
    mdm_plb.add_u64_counter(l_mdm_caps, "cap-", "Capabilities removed");
    mdm_plb.add_u64(l_mdm_heap, "heap", "Heap size");

    mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
    mdm_plb.add_u64(l_mdm_rss, "rss", "RSS");

    mlogger = mdm_plb.create_perf_counters();
    g_ceph_context->get_perfcounters_collection()->add(mlogger);
  }

  // Let the sub-components create/register their own counters.
  mdlog->create_logger();
  server->create_logger();
  purge_queue.create_logger();
  sessionmap.register_perfcounters();
  mdcache->register_perfcounters();
}
3292
3293 void MDSRank::check_ops_in_flight()
3294 {
3295   string summary;
3296   vector<string> warnings;
3297   int slow = 0;
3298   if (op_tracker.check_ops_in_flight(&summary, warnings, &slow)) {
3299     clog->warn() << summary;
3300     for (const auto& warning : warnings) {
3301       clog->warn() << warning;
3302     }
3303   }
3304
3305   // set mds slow request count
3306   mds_slow_req_count = slow;
3307   return;
3308 }
3309
3310 void MDSRankDispatcher::handle_osd_map()
3311 {
3312   if (is_active() &&
3313       mdsmap->get_tableserver() == whoami) {
3314     snapserver->check_osd_map(true);
3315   }
3316
3317   server->handle_osd_map();
3318
3319   purge_queue.update_op_limit(*mdsmap);
3320
3321   std::set<entity_addr_t> newly_blacklisted;
3322   objecter->consume_blacklist_events(&newly_blacklisted);
3323   auto epoch = objecter->with_osdmap([](const OSDMap &o){return o.get_epoch();});
3324   dout(4) << "handle_osd_map epoch " << epoch << ", "
3325           << newly_blacklisted.size() << " new blacklist entries" << dendl;
3326   auto victims = server->apply_blacklist(newly_blacklisted);
3327   if (victims) {
3328     set_osd_epoch_barrier(epoch);
3329   }
3330
3331
3332   // By default the objecter only requests OSDMap updates on use,
3333   // we would like to always receive the latest maps in order to
3334   // apply policy based on the FULL flag.
3335   objecter->maybe_request_map();
3336 }
3337
3338 int MDSRank::config_client(int64_t session_id, bool remove,
3339                            const std::string& option, const std::string& value,
3340                            std::ostream& ss)
3341 {
3342   Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
3343   if (!session) {
3344     ss << "session " << session_id << " not in sessionmap!";
3345     return -ENOENT;
3346   }
3347
3348   if (option == "timeout") {
3349     if (remove) {
3350       auto it = session->info.client_metadata.find("timeout");
3351       if (it == session->info.client_metadata.end()) {
3352         ss << "Nonexistent config: " << option;
3353         return -ENODATA;
3354       }
3355       session->info.client_metadata.erase(it);
3356     } else {
3357       char *end;
3358       strtoul(value.c_str(), &end, 0);
3359       if (*end) {
3360         ss << "Invalid config for timeout: " << value;
3361         return -EINVAL;
3362       }
3363       session->info.client_metadata[option] = value;
3364     }
3365     //sessionmap._mark_dirty(session, true);
3366   } else {
3367     ss << "Invalid config option: " << option;
3368     return -EINVAL;
3369   }
3370
3371   return 0;
3372 }
3373
/**
 * Evict a single client session, optionally blacklisting its address
 * first via the "osd blacklist add" mon command.
 *
 * Must be called with mds_lock held by the calling thread (asserted).
 *
 * @param session_id client session to evict
 * @param wait       if true, block until the blacklist (when requested)
 *                   and the session kill have completed; mds_lock is
 *                   released while blocking and re-taken afterwards
 * @param blacklist  if true, blacklist the client's address before killing
 *                   the session
 * @param err_ss     receives the reason when false is returned
 * @param on_killed  optional completion passed to Server::kill_session();
 *                   mutually exclusive with `wait` (asserted)
 * @return false if the MDS is replaying or the session does not exist;
 *         true otherwise (including when the session disappeared while
 *         waiting for the blacklist).
 */
bool MDSRank::evict_client(int64_t session_id,
    bool wait, bool blacklist, std::ostream& err_ss,
    Context *on_killed)
{
  ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));

  // Mutually exclusive args
  ceph_assert(!(wait && on_killed != nullptr));

  if (is_any_replay()) {
    err_ss << "MDS is replaying log";
    return false;
  }

  Session *session = sessionmap.get_session(
      entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
  if (!session) {
    err_ss << "session " << session_id << " not in sessionmap!";
    return false;
  }

  auto& addr = session->info.inst.addr;
  {
    // Announce the eviction both in the local debug log and the cluster log.
    CachedStackStringStream css;
    *css << "Evicting " << (blacklist ? "(and blacklisting) " : "")
         << "client session " << session_id << " (" << addr << ")";
    dout(1) << css->strv() << dendl;
    clog->info() << css->strv();
  }

  // The command text is built unconditionally; it is only sent when
  // `blacklist` is true (see apply_blacklist below).
  dout(4) << "Preparing blacklist command... (wait=" << wait << ")" << dendl;
  stringstream ss;
  ss << "{\"prefix\":\"osd blacklist\", \"blacklistop\":\"add\",";
  ss << "\"addr\":\"";
  ss << addr;
  ss << "\"}";
  std::string tmp = ss.str();
  std::vector<std::string> cmd = {tmp};

  // Kills the session if it still exists (it is looked up again by id
  // because time may have passed since the lookup above). In the
  // wait-without-callback case this blocks for the kill with mds_lock
  // released.
  auto kill_client_session = [this, session_id, wait, on_killed](){
    ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
    Session *session = sessionmap.get_session(
        entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
    if (session) {
      if (on_killed || !wait) {
        server->kill_session(session, on_killed);
      } else {
        C_SaferCond on_safe;
        server->kill_session(session, &on_safe);

        mds_lock.unlock();
        on_safe.wait();
        mds_lock.lock();
      }
    } else {
      dout(1) << "session " << session_id << " was removed while we waited "
      "for blacklist" << dendl;

      // Even though it wasn't us that removed it, kick our completion
      // as the session has been removed.
      if (on_killed) {
        on_killed->complete(0);
      }
    }
  };

  // Sends the blacklist command to the mon. When that finishes, waits for
  // the latest OSDMap; then, on the finisher and under mds_lock, sets the
  // OSD epoch barrier to that map's epoch and finally runs `fn`.
  // NOTE(review): the result codes `r` of both stages are not inspected.
  auto apply_blacklist = [this, cmd](std::function<void ()> fn){
    ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));

    Context *on_blacklist_done = new LambdaContext([this, fn](int r) {
      objecter->wait_for_latest_osdmap(
       new C_OnFinisher(
         new LambdaContext([this, fn](int r) {
              std::lock_guard l(mds_lock);
              auto epoch = objecter->with_osdmap([](const OSDMap &o){
                  return o.get_epoch();
              });

              set_osd_epoch_barrier(epoch);

              fn();
            }), finisher)
       );
    });

    dout(4) << "Sending mon blacklist command: " << cmd[0] << dendl;
    monc->start_mon_command(cmd, {}, nullptr, nullptr, on_blacklist_done);
  };

  if (wait) {
    if (blacklist) {
      // Synchronous path: block here (lock released) until the blacklist
      // chain above has run to completion.
      C_SaferCond inline_ctx;
      apply_blacklist([&inline_ctx](){inline_ctx.complete(0);});
      mds_lock.unlock();
      inline_ctx.wait();
      mds_lock.lock();
    }

    // We dropped mds_lock, so check that session still exists
    session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
          session_id));
    if (!session) {
      dout(1) << "session " << session_id << " was removed while we waited "
                 "for blacklist" << dendl;
      return true;
    }
    kill_client_session();
  } else {
    // Asynchronous path: either chain the kill after the blacklist, or
    // kill right away.
    if (blacklist) {
      apply_blacklist(kill_client_session);
    } else {
      kill_client_session();
    }
  }

  return true;
}
3491
3492 void MDSRank::bcast_mds_map()
3493 {
3494   dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << dendl;
3495
3496   // share the map with mounted clients
3497   set<Session*> clients;
3498   sessionmap.get_client_session_set(clients);
3499   for (const auto &session : clients) {
3500     auto m = make_message<MMDSMap>(monc->get_fsid(), *mdsmap);
3501     session->get_connection()->send_message2(std::move(m));
3502   }
3503   last_client_mdsmap_bcast = mdsmap->get_epoch();
3504 }
3505
3506 MDSRankDispatcher::MDSRankDispatcher(
3507     mds_rank_t whoami_,
3508     ceph::mutex &mds_lock_,
3509     LogChannelRef &clog_,
3510     SafeTimer &timer_,
3511     Beacon &beacon_,
3512     std::unique_ptr<MDSMap> &mdsmap_,
3513     Messenger *msgr,
3514     MonClient *monc_,
3515     MgrClient *mgrc,
3516     Context *respawn_hook_,
3517     Context *suicide_hook_)
3518   : MDSRank(whoami_, mds_lock_, clog_, timer_, beacon_, mdsmap_,
3519             msgr, monc_, mgrc, respawn_hook_, suicide_hook_)
3520 {
3521     g_conf().add_observer(this);
3522 }
3523
3524 void MDSRank::command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish) {
3525   dout(20) << __func__ << dendl;
3526
3527   std::lock_guard locker(mds_lock);
3528   C_Drop_Cache *request = new C_Drop_Cache(server, mdcache, mdlog, this,
3529                                            timeout, f, on_finish);
3530   request->send();
3531 }
3532
3533 epoch_t MDSRank::get_osd_epoch() const
3534 {
3535   return objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
3536 }
3537
// Return the NULL-terminated list of configuration option names this
// dispatcher declares as tracked. The array has static storage, so the
// returned pointer remains valid after the call.
const char** MDSRankDispatcher::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    // cluster log (clog) routing, plus identity options
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "fsid",
    "host",
    // MDS options
    "mds_bal_fragment_dirs",
    "mds_bal_fragment_interval",
    "mds_cache_memory_limit",
    "mds_cache_mid",
    "mds_cache_reservation",
    "mds_cache_trim_decay_rate",
    "mds_cap_revoke_eviction_timeout",
    "mds_dump_cache_threshold_file",
    "mds_dump_cache_threshold_formatter",
    "mds_enable_op_tracker",
    "mds_export_ephemeral_random",
    "mds_export_ephemeral_random_max",
    "mds_export_ephemeral_distributed",
    "mds_health_cache_threshold",
    "mds_inject_migrator_session_race",
    "mds_log_pause",
    "mds_max_export_size",
    "mds_max_purge_files",
    "mds_forward_all_requests_to_auth",
    "mds_max_purge_ops",
    "mds_max_purge_ops_per_pg",
    "mds_max_snaps_per_dir",
    "mds_op_complaint_time",
    "mds_op_history_duration",
    "mds_op_history_size",
    "mds_op_log_threshold",
    "mds_recall_max_decay_rate",
    "mds_recall_warning_decay_rate",
    "mds_request_load_average_decay_rate",
    "mds_session_cache_liveness_decay_rate",
    "mds_replay_unsafe_with_closed_session",
    NULL  // terminator
  };
  return KEYS;
}
3585
3586 void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed)
3587 {
3588   // XXX with or without mds_lock!
3589
3590   if (changed.count("mds_op_complaint_time") || changed.count("mds_op_log_threshold")) {
3591     op_tracker.set_complaint_and_threshold(conf->mds_op_complaint_time, conf->mds_op_log_threshold);
3592   }
3593   if (changed.count("mds_op_history_size") || changed.count("mds_op_history_duration")) {
3594     op_tracker.set_history_size_and_duration(conf->mds_op_history_size, conf->mds_op_history_duration);
3595   }
3596   if (changed.count("mds_enable_op_tracker")) {
3597     op_tracker.set_tracking(conf->mds_enable_op_tracker);
3598   }
3599   if (changed.count("clog_to_monitors") ||
3600       changed.count("clog_to_syslog") ||
3601       changed.count("clog_to_syslog_level") ||
3602       changed.count("clog_to_syslog_facility") ||
3603       changed.count("clog_to_graylog") ||
3604       changed.count("clog_to_graylog_host") ||
3605       changed.count("clog_to_graylog_port") ||
3606       changed.count("host") ||
3607       changed.count("fsid")) {
3608     update_log_config();
3609   }
3610
3611   finisher->queue(new LambdaContext([this, changed](int) {
3612     std::scoped_lock lock(mds_lock);
3613
3614     dout(10) << "flushing conf change to components: " << changed << dendl;
3615
3616     if (changed.count("mds_log_pause") && !g_conf()->mds_log_pause) {
3617       mdlog->kick_submitter();
3618     }
3619     sessionmap.handle_conf_change(changed);
3620     server->handle_conf_change(changed);
3621     mdcache->handle_conf_change(changed, *mdsmap);
3622     purge_queue.handle_conf_change(changed, *mdsmap);
3623   }));
3624 }
3625
3626 void MDSRank::get_task_status(std::map<std::string, std::string> *status) {
3627   dout(20) << __func__ << dendl;
3628
3629   // scrub summary for now..
3630   std::string_view scrub_summary = scrubstack->scrub_summary();
3631   status->emplace(SCRUB_STATUS_KEY, std::move(scrub_summary));
3632 }
3633
3634 void MDSRank::schedule_update_timer_task() {
3635   dout(20) << __func__ << dendl;
3636
3637   timer.add_event_after(g_conf().get_val<double>("mds_task_status_update_interval"),
3638                         new LambdaContext([this](int) {
3639                             send_task_status();
3640                           }));
3641 }
3642
3643 void MDSRank::send_task_status() {
3644   std::map<std::string, std::string> status;
3645   get_task_status(&status);
3646
3647   if (!status.empty()) {
3648     dout(20) << __func__ << ": updating " << status.size() << " status keys" << dendl;
3649
3650     int r = mgrc->service_daemon_update_task_status(std::move(status));
3651     if (r < 0) {
3652       derr << ": failed to update service daemon status: " << cpp_strerror(r) << dendl;
3653     }
3654   }
3655
3656   schedule_update_timer_task();
3657 }