ceph/src/mds/Beacon.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2012 Red Hat
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15
  16 #include "common/dout.h"
  17 #include "common/HeartbeatMap.h"
  18 #include "include/stringify.h"
  19 #include "include/util.h"
  20
  21 #include "messages/MMDSBeacon.h"
  22 #include "mon/MonClient.h"
  23 #include "mds/MDLog.h"
  24 #include "mds/MDSRank.h"
  25 #include "mds/MDSMap.h"
  26 #include "mds/Locker.h"
  27
  28 #include "Beacon.h"
  29
  30 #define dout_context g_ceph_context
  31 #define dout_subsys ceph_subsys_mds
  32 #undef dout_prefix
  33 #define dout_prefix *_dout << "mds.beacon." << name << ' '
  34
  35
  36 class Beacon::C_MDS_BeaconSender : public Context {
  37 public:
  38   explicit C_MDS_BeaconSender(Beacon *beacon_) : beacon(beacon_) {}
  39   void finish(int r) override {
  40     assert(beacon->lock.is_locked_by_me());
  41     beacon->sender = NULL;
  42     beacon->_send();
  43   }
  44 private:
  45   Beacon *beacon;
  46 };
  47
  48 Beacon::Beacon(CephContext *cct_, MonClient *monc_, std::string name_) :
  49   Dispatcher(cct_), lock("Beacon"), monc(monc_), timer(g_ceph_context, lock),
  50   name(name_), standby_for_rank(MDS_RANK_NONE),
  51   standby_for_fscid(FS_CLUSTER_ID_NONE), want_state(MDSMap::STATE_BOOT),
  52   awaiting_seq(-1)
  53 {
  54   last_seq = 0;
  55   sender = NULL;
  56   was_laggy = false;
  57
  58   epoch = 0;
  59 }
  60
  61
  62 Beacon::~Beacon()
  63 {
  64 }
  65
  66
  67 void Beacon::init(MDSMap const *mdsmap)
  68 {
  69   Mutex::Locker l(lock);
  70   assert(mdsmap != NULL);
  71
  72   _notify_mdsmap(mdsmap);
  73   standby_for_rank = mds_rank_t(g_conf->mds_standby_for_rank);
  74   standby_for_name = g_conf->mds_standby_for_name;
  75   standby_for_fscid = fs_cluster_id_t(g_conf->mds_standby_for_fscid);
  76   standby_replay = g_conf->mds_standby_replay;
  77
  78   // Spawn threads and start messaging
  79   timer.init();
  80   _send();
  81 }
  82
  83
  84 void Beacon::shutdown()
  85 {
  86   Mutex::Locker l(lock);
  87   if (sender) {
  88     timer.cancel_event(sender);
  89     sender = NULL;
  90   }
  91   timer.shutdown();
  92 }
  93
  94
  95 bool Beacon::ms_dispatch(Message *m)
  96 {
  97   if (m->get_type() == MSG_MDS_BEACON) {
  98     if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
  99       handle_mds_beacon(static_cast<MMDSBeacon*>(m));
 100     }
 101     return true;
 102   }
 103
 104   return false;
 105 }
 106
 107
 108 /**
 109  * Update lagginess state based on response from remote MDSMonitor
 110  *
 111  * This function puts the passed message before returning
 112  */
 113 void Beacon::handle_mds_beacon(MMDSBeacon *m)
 114 {
 115   Mutex::Locker l(lock);
 116   assert(m != NULL);
 117
 118   version_t seq = m->get_seq();
 119
 120   // update lab
 121   if (seq_stamp.count(seq)) {
 122     utime_t now = ceph_clock_now();
 123     if (seq_stamp[seq] > last_acked_stamp) {
 124       last_acked_stamp = seq_stamp[seq];
 125       utime_t rtt = now - last_acked_stamp;
 126
 127       dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
 128                << " seq " << m->get_seq() << " rtt " << rtt << dendl;
 129
 130       if (was_laggy && rtt < g_conf->mds_beacon_grace) {
 131         dout(0) << "handle_mds_beacon no longer laggy" << dendl;
 132         was_laggy = false;
 133         laggy_until = now;
 134       }
 135     } else {
 136       // Mark myself laggy if system clock goes backwards. Hopping
 137       // later beacons will clear it.
 138       dout(1) << "handle_mds_beacon system clock goes backwards, "
 139               << "mark myself laggy" << dendl;
 140       last_acked_stamp = now - utime_t(g_conf->mds_beacon_grace + 1, 0);
 141       was_laggy = true;
 142     }
 143
 144     // clean up seq_stamp map
 145     while (!seq_stamp.empty() &&
 146            seq_stamp.begin()->first <= seq)
 147       seq_stamp.erase(seq_stamp.begin());
 148
 149     // Wake a waiter up if present
 150     if (awaiting_seq == seq) {
 151       waiting_cond.Signal();
 152     }
 153   } else {
 154     dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
 155              << " seq " << m->get_seq() << " dne" << dendl;
 156   }
 157 }
 158
 159
 160 void Beacon::send()
 161 {
 162   Mutex::Locker l(lock);
 163   _send();
 164 }
 165
 166
 167 void Beacon::send_and_wait(const double duration)
 168 {
 169   Mutex::Locker l(lock);
 170   _send();
 171   awaiting_seq = last_seq;
 172   dout(20) << __func__ << ": awaiting " << awaiting_seq
 173            << " for up to " << duration << "s" << dendl;
 174
 175   utime_t timeout;
 176   timeout.set_from_double(ceph_clock_now() + duration);
 177   while ((!seq_stamp.empty() && seq_stamp.begin()->first <= awaiting_seq)
 178          && ceph_clock_now() < timeout) {
 179     waiting_cond.WaitUntil(lock, timeout);
 180   }
 181
 182   awaiting_seq = -1;
 183 }
 184
 185
 186 /**
 187  * Call periodically, or when you have updated the desired state
 188  */
 189 void Beacon::_send()
 190 {
 191   if (sender) {
 192     timer.cancel_event(sender);
 193   }
 194   sender = new C_MDS_BeaconSender(this);
 195   timer.add_event_after(g_conf->mds_beacon_interval, sender);
 196
 197   if (!cct->get_heartbeat_map()->is_healthy()) {
 198     /* If anything isn't progressing, let avoid sending a beacon so that
 199      * the MDS will consider us laggy */
 200     dout(1) << __func__ << " skipping beacon, heartbeat map not healthy" << dendl;
 201     return;
 202   }
 203
 204   ++last_seq;
 205   dout(10) << __func__ << " " << ceph_mds_state_name(want_state)
 206            << " seq " << last_seq
 207            << dendl;
 208
 209   seq_stamp[last_seq] = ceph_clock_now();
 210
 211   assert(want_state != MDSMap::STATE_NULL);
 212
 213   MMDSBeacon *beacon = new MMDSBeacon(
 214       monc->get_fsid(), mds_gid_t(monc->get_global_id()),
 215       name,
 216       epoch,
 217       want_state,
 218       last_seq,
 219       CEPH_FEATURES_SUPPORTED_DEFAULT);
 220
 221   beacon->set_standby_for_rank(standby_for_rank);
 222   beacon->set_standby_for_name(standby_for_name);
 223   beacon->set_standby_for_fscid(standby_for_fscid);
 224   beacon->set_standby_replay(standby_replay);
 225   beacon->set_health(health);
 226   beacon->set_compat(compat);
 227   // piggyback the sys info on beacon msg
 228   if (want_state == MDSMap::STATE_BOOT) {
 229     map<string, string> sys_info;
 230     collect_sys_info(&sys_info, cct);
 231     sys_info["addr"] = stringify(monc->get_myaddr());
 232     beacon->set_sys_info(sys_info);
 233   }
 234   monc->send_mon_message(beacon);
 235 }
 236
 237 /**
 238  * Call this when there is a new MDSMap available
 239  */
 240 void Beacon::notify_mdsmap(MDSMap const *mdsmap)
 241 {
 242   Mutex::Locker l(lock);
 243   assert(mdsmap != NULL);
 244
 245   _notify_mdsmap(mdsmap);
 246 }
 247
 248 void Beacon::_notify_mdsmap(MDSMap const *mdsmap)
 249 {
 250   assert(mdsmap != NULL);
 251   assert(mdsmap->get_epoch() >= epoch);
 252
 253   if (mdsmap->get_epoch() != epoch) {
 254     epoch = mdsmap->get_epoch();
 255     compat = get_mdsmap_compat_set_default();
 256     compat.merge(mdsmap->compat);
 257   }
 258 }
 259
 260
 261 bool Beacon::is_laggy()
 262 {
 263   Mutex::Locker l(lock);
 264
 265   if (last_acked_stamp == utime_t())
 266     return false;
 267
 268   utime_t now = ceph_clock_now();
 269   utime_t since = now - last_acked_stamp;
 270   if (since > g_conf->mds_beacon_grace) {
 271     dout(5) << "is_laggy " << since << " > " << g_conf->mds_beacon_grace
 272             << " since last acked beacon" << dendl;
 273     was_laggy = true;
 274     if (since > (g_conf->mds_beacon_grace*2) &&
 275         now > last_mon_reconnect + g_conf->mds_beacon_interval) {
 276       // maybe it's not us?
 277       dout(5) << "initiating monitor reconnect; maybe we're not the slow one"
 278               << dendl;
 279       last_mon_reconnect = now;
 280       monc->reopen_session();
 281     }
 282     return true;
 283   }
 284   return false;
 285 }
 286
 287 utime_t Beacon::get_laggy_until() const
 288 {
 289   Mutex::Locker l(lock);
 290
 291   return laggy_until;
 292 }
 293
 294 void Beacon::set_want_state(MDSMap const *mdsmap, MDSMap::DaemonState const newstate)
 295 {
 296   Mutex::Locker l(lock);
 297
 298   // Update mdsmap epoch atomically with updating want_state, so that when
 299   // we send a beacon with the new want state it has the latest epoch, and
 300   // once we have updated to the latest epoch, we are not sending out
 301   // a stale want_state (i.e. one from before making it through MDSMap
 302   // handling)
 303   _notify_mdsmap(mdsmap);
 304
 305   if (want_state != newstate) {
 306     dout(10) << __func__ << ": "
 307       << ceph_mds_state_name(want_state) << " -> "
 308       << ceph_mds_state_name(newstate) << dendl;
 309     want_state = newstate;
 310   }
 311 }
 312
 313
 314 /**
 315  * We are 'shown' an MDS briefly in order to update
 316  * some health metrics that we will send in the next
 317  * beacon.
 318  */
 319 void Beacon::notify_health(MDSRank const *mds)
 320 {
 321   Mutex::Locker l(lock);
 322   if (!mds) {
 323     // No MDS rank held
 324     return;
 325   }
 326
 327   // I'm going to touch this MDS, so it must be locked
 328   assert(mds->mds_lock.is_locked_by_me());
 329
 330   health.metrics.clear();
 331
 332   // Detect presence of entries in DamageTable
 333   if (!mds->damage_table.empty()) {
 334     MDSHealthMetric m(MDS_HEALTH_DAMAGE, HEALTH_ERR, std::string(
 335           "Metadata damage detected"));
 336     health.metrics.push_back(m);
 337   }
 338
 339   // Detect MDS_HEALTH_TRIM condition
 340   // Arbitrary factor of 2, indicates MDS is not trimming promptly
 341   {
 342     if (mds->mdlog->get_num_segments() > (size_t)(g_conf->mds_log_max_segments * 2)) {
 343       std::ostringstream oss;
 344       oss << "Behind on trimming (" << mds->mdlog->get_num_segments()
 345         << "/" << g_conf->mds_log_max_segments << ")";
 346
 347       MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str());
 348       m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
 349       m.metadata["max_segments"] = stringify(g_conf->mds_log_max_segments);
 350       health.metrics.push_back(m);
 351     }
 352   }
 353
 354   // Detect clients failing to respond to modifications to capabilities in
 355   // CLIENT_CAPS messages.
 356   {
 357     std::list<client_t> late_clients;
 358     mds->locker->get_late_revoking_clients(&late_clients);
 359     std::list<MDSHealthMetric> late_cap_metrics;
 360
 361     for (std::list<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) {
 362
 363       // client_t is equivalent to session.info.inst.name.num
 364       // Construct an entity_name_t to lookup into SessionMap
 365       entity_name_t ename(CEPH_ENTITY_TYPE_CLIENT, i->v);
 366       Session const *s = mds->sessionmap.get_session(ename);
 367       if (s == NULL) {
 368         // Shouldn't happen, but not worth crashing if it does as this is
 369         // just health-reporting code.
 370         derr << "Client ID without session: " << i->v << dendl;
 371         continue;
 372       }
 373
 374       std::ostringstream oss;
 375       oss << "Client " << s->get_human_name() << " failing to respond to capability release";
 376       MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str());
 377       m.metadata["client_id"] = stringify(i->v);
 378       late_cap_metrics.push_back(m);
 379     }
 380
 381     if (late_cap_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
 382       health.metrics.splice(health.metrics.end(), late_cap_metrics);
 383     } else {
 384       std::ostringstream oss;
 385       oss << "Many clients (" << late_cap_metrics.size()
 386           << ") failing to respond to capability release";
 387       MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str());
 388       m.metadata["client_count"] = stringify(late_cap_metrics.size());
 389       health.metrics.push_back(m);
 390       late_cap_metrics.clear();
 391     }
 392   }
 393
 394   // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
 395   // messages. May be due to buggy client or resource-hogging application.
 396   //
 397   // Detect clients failing to advance their old_client_tid
 398   {
 399     set<Session*> sessions;
 400     mds->sessionmap.get_client_session_set(sessions);
 401
 402     utime_t cutoff = ceph_clock_now();
 403     cutoff -= g_conf->mds_recall_state_timeout;
 404     utime_t last_recall = mds->mdcache->last_recall_state;
 405
 406     std::list<MDSHealthMetric> late_recall_metrics;
 407     std::list<MDSHealthMetric> large_completed_requests_metrics;
 408     for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
 409       Session *session = *i;
 410       if (!session->recalled_at.is_zero()) {
 411         dout(20) << "Session servicing RECALL " << session->info.inst
 412           << ": " << session->recalled_at << " " << session->recall_release_count
 413           << "/" << session->recall_count << dendl;
 414         if (last_recall < cutoff || session->last_recall_sent < last_recall) {
 415           dout(20) << "  no longer recall" << dendl;
 416           session->clear_recalled_at();
 417         } else if (session->recalled_at < cutoff) {
 418           dout(20) << "  exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
 419           std::ostringstream oss;
 420           oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
 421           MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
 422           m.metadata["client_id"] = stringify(session->info.inst.name.num());
 423           late_recall_metrics.push_back(m);
 424         } else {
 425           dout(20) << "  within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
 426         }
 427       }
 428       if ((session->get_num_trim_requests_warnings() > 0 &&
 429            session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) ||
 430           (session->get_num_trim_flushes_warnings() > 0 &&
 431            session->get_num_completed_flushes() >= g_conf->mds_max_completed_flushes)) {
 432         std::ostringstream oss;
 433         oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid";
 434         MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
 435         m.metadata["client_id"] = stringify(session->info.inst.name.num());
 436         large_completed_requests_metrics.push_back(m);
 437       }
 438     }
 439
 440     if (late_recall_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
 441       health.metrics.splice(health.metrics.end(), late_recall_metrics);
 442     } else {
 443       std::ostringstream oss;
 444       oss << "Many clients (" << late_recall_metrics.size()
 445           << ") failing to respond to cache pressure";
 446       MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str());
 447       m.metadata["client_count"] = stringify(late_recall_metrics.size());
 448       health.metrics.push_back(m);
 449       late_recall_metrics.clear();
 450     }
 451
 452     if (large_completed_requests_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
 453       health.metrics.splice(health.metrics.end(), large_completed_requests_metrics);
 454     } else {
 455       std::ostringstream oss;
 456       oss << "Many clients (" << large_completed_requests_metrics.size()
 457         << ") failing to advance their oldest client/flush tid";
 458       MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str());
 459       m.metadata["client_count"] = stringify(large_completed_requests_metrics.size());
 460       health.metrics.push_back(m);
 461       large_completed_requests_metrics.clear();
 462     }
 463   }
 464
 465   // Detect MDS_HEALTH_SLOW_REQUEST condition
 466   {
 467     int slow = mds->get_mds_slow_req_count();
 468     dout(20) << slow << " slow request found" << dendl;
 469     if (slow) {
 470       std::ostringstream oss;
 471       oss << slow << " slow requests are blocked > " << g_conf->mds_op_complaint_time << " sec";
 472
 473       MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, oss.str());
 474       health.metrics.push_back(m);
 475     }
 476   }
 477
 478   // Report a health warning if we are readonly
 479   if (mds->mdcache->is_readonly()) {
 480     MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN,
 481                       "MDS in read-only mode");
 482     health.metrics.push_back(m);
 483   }
 484
 485   // Report if we have significantly exceeded our cache size limit
 486   if (mds->mdcache->get_cache_size() >
 487         g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
 488     std::ostringstream oss;
 489     oss << "Too many inodes in cache (" << mds->mdcache->get_cache_size()
 490         << "/" << g_conf->mds_cache_size << "), "
 491         << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
 492         << mds->mdcache->get_num_strays() << " stray files";
 493
 494     MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, oss.str());
 495     health.metrics.push_back(m);
 496   }
 497 }
 498
 499 MDSMap::DaemonState Beacon::get_want_state() const
 500 {
 501   Mutex::Locker l(lock);
 502   return want_state;
 503 }
 504