]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/Beacon.cc
import ceph quincy 17.2.4
[ceph.git] / ceph / src / mds / Beacon.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2012 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16#include "common/dout.h"
2a845540 17#include "common/likely.h"
7c673cae 18#include "common/HeartbeatMap.h"
181888fb 19
7c673cae
FG
20#include "include/stringify.h"
21#include "include/util.h"
22
7c673cae
FG
23#include "mon/MonClient.h"
24#include "mds/MDLog.h"
25#include "mds/MDSRank.h"
26#include "mds/MDSMap.h"
27#include "mds/Locker.h"
28
29#include "Beacon.h"
30
91327a77
AA
31#include <chrono>
32
7c673cae
FG
33#define dout_context g_ceph_context
34#define dout_subsys ceph_subsys_mds
35#undef dout_prefix
36#define dout_prefix *_dout << "mds.beacon." << name << ' '
37
20effc67
TL
38using std::map;
39using std::string;
40
11fdf7f2
TL
41using namespace std::chrono_literals;
42
43Beacon::Beacon(CephContext *cct, MonClient *monc, std::string_view name)
91327a77
AA
44 :
45 Dispatcher(cct),
11fdf7f2 46 beacon_interval(g_conf()->mds_beacon_interval),
91327a77 47 monc(monc),
522d829b
TL
48 name(name),
49 compat(MDSMap::get_compat_set_all())
7c673cae 50{
7c673cae
FG
51}
52
7c673cae
FG
53Beacon::~Beacon()
54{
91327a77 55 shutdown();
7c673cae
FG
56}
57
91327a77
AA
58void Beacon::shutdown()
59{
60 std::unique_lock<std::mutex> lock(mutex);
61 if (!finished) {
62 finished = true;
63 lock.unlock();
81eedcae
TL
64 if (sender.joinable())
65 sender.join();
91327a77
AA
66 }
67}
7c673cae 68
11fdf7f2 69void Beacon::init(const MDSMap &mdsmap)
7c673cae 70{
11fdf7f2 71 std::unique_lock lock(mutex);
7c673cae
FG
72
73 _notify_mdsmap(mdsmap);
7c673cae 74
91327a77
AA
75 sender = std::thread([this]() {
76 std::unique_lock<std::mutex> lock(mutex);
77 std::condition_variable c; // no one wakes us
78 while (!finished) {
79 auto now = clock::now();
80 auto since = std::chrono::duration<double>(now-last_send).count();
81 auto interval = beacon_interval;
82 if (since >= interval*.90) {
a8e16298
TL
83 if (!_send()) {
84 interval = 0.5; /* 500ms */
85 }
91327a77
AA
86 } else {
87 interval -= since;
88 }
89 dout(20) << "sender thread waiting interval " << interval << "s" << dendl;
11fdf7f2 90 c.wait_for(lock, interval*1s);
91327a77
AA
91 }
92 });
7c673cae
FG
93}
94
9f95a23c 95bool Beacon::ms_can_fast_dispatch2(const cref_t<Message>& m) const
7c673cae 96{
91327a77 97 return m->get_type() == MSG_MDS_BEACON;
7c673cae
FG
98}
99
9f95a23c 100void Beacon::ms_fast_dispatch2(const ref_t<Message>& m)
91327a77 101{
11fdf7f2
TL
102 bool handled = ms_dispatch2(m);
103 ceph_assert(handled);
91327a77 104}
7c673cae 105
9f95a23c 106bool Beacon::ms_dispatch2(const ref_t<Message>& m)
7c673cae
FG
107{
108 if (m->get_type() == MSG_MDS_BEACON) {
109 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
9f95a23c 110 handle_mds_beacon(ref_cast<MMDSBeacon>(m));
7c673cae
FG
111 }
112 return true;
113 }
114
115 return false;
116}
117
118
119/**
120 * Update lagginess state based on response from remote MDSMonitor
121 *
122 * This function puts the passed message before returning
123 */
9f95a23c 124void Beacon::handle_mds_beacon(const cref_t<MMDSBeacon> &m)
7c673cae 125{
11fdf7f2 126 std::unique_lock lock(mutex);
7c673cae
FG
127
128 version_t seq = m->get_seq();
129
130 // update lab
91327a77
AA
131 auto it = seq_stamp.find(seq);
132 if (it != seq_stamp.end()) {
133 auto now = clock::now();
134
135 last_acked_stamp = it->second;
136 auto rtt = std::chrono::duration<double>(now - last_acked_stamp).count();
137
138 dout(5) << "received beacon reply " << ceph_mds_state_name(m->get_state()) << " seq " << m->get_seq() << " rtt " << rtt << dendl;
139
11fdf7f2 140 if (laggy && rtt < g_conf()->mds_beacon_grace) {
91327a77
AA
141 dout(0) << " MDS is no longer laggy" << dendl;
142 laggy = false;
143 last_laggy = now;
7c673cae
FG
144 }
145
146 // clean up seq_stamp map
91327a77 147 seq_stamp.erase(seq_stamp.begin(), ++it);
7c673cae
FG
148
149 // Wake a waiter up if present
91327a77 150 cvar.notify_all();
7c673cae 151 } else {
91327a77
AA
152 dout(1) << "discarding unexpected beacon reply " << ceph_mds_state_name(m->get_state())
153 << " seq " << m->get_seq() << " dne" << dendl;
7c673cae
FG
154 }
155}
156
157
158void Beacon::send()
159{
11fdf7f2 160 std::unique_lock lock(mutex);
7c673cae
FG
161 _send();
162}
163
164
165void Beacon::send_and_wait(const double duration)
166{
11fdf7f2 167 std::unique_lock lock(mutex);
7c673cae 168 _send();
91327a77 169 auto awaiting_seq = last_seq;
7c673cae
FG
170 dout(20) << __func__ << ": awaiting " << awaiting_seq
171 << " for up to " << duration << "s" << dendl;
172
91327a77
AA
173 auto start = clock::now();
174 while (!seq_stamp.empty() && seq_stamp.begin()->first <= awaiting_seq) {
175 auto now = clock::now();
176 auto s = duration*.95-std::chrono::duration<double>(now-start).count();
177 if (s < 0) break;
11fdf7f2 178 cvar.wait_for(lock, s*1s);
7c673cae 179 }
7c673cae
FG
180}
181
182
183/**
184 * Call periodically, or when you have updated the desired state
185 */
a8e16298 186bool Beacon::_send()
7c673cae 187{
91327a77
AA
188 auto now = clock::now();
189 auto since = std::chrono::duration<double>(now-last_acked_stamp).count();
7c673cae
FG
190
191 if (!cct->get_heartbeat_map()->is_healthy()) {
192 /* If anything isn't progressing, let avoid sending a beacon so that
193 * the MDS will consider us laggy */
91327a77 194 dout(0) << "Skipping beacon heartbeat to monitors (last acked " << since << "s ago); MDS internal heartbeat is not healthy!" << dendl;
a8e16298 195 return false;
7c673cae
FG
196 }
197
198 ++last_seq;
91327a77 199 dout(5) << "Sending beacon " << ceph_mds_state_name(want_state) << " seq " << last_seq << dendl;
7c673cae 200
91327a77 201 seq_stamp[last_seq] = now;
7c673cae 202
11fdf7f2 203 ceph_assert(want_state != MDSMap::STATE_NULL);
7c673cae 204
9f95a23c 205 auto beacon = make_message<MMDSBeacon>(
7c673cae
FG
206 monc->get_fsid(), mds_gid_t(monc->get_global_id()),
207 name,
208 epoch,
209 want_state,
210 last_seq,
211 CEPH_FEATURES_SUPPORTED_DEFAULT);
7c673cae
FG
212 beacon->set_health(health);
213 beacon->set_compat(compat);
9f95a23c 214 beacon->set_fs(g_conf().get_val<std::string>("mds_join_fs"));
7c673cae
FG
215 // piggyback the sys info on beacon msg
216 if (want_state == MDSMap::STATE_BOOT) {
217 map<string, string> sys_info;
218 collect_sys_info(&sys_info, cct);
11fdf7f2 219 sys_info["addr"] = stringify(monc->get_myaddrs());
7c673cae
FG
220 beacon->set_sys_info(sys_info);
221 }
11fdf7f2 222 monc->send_mon_message(beacon.detach());
91327a77 223 last_send = now;
a8e16298 224 return true;
7c673cae
FG
225}
226
227/**
228 * Call this when there is a new MDSMap available
229 */
11fdf7f2 230void Beacon::notify_mdsmap(const MDSMap &mdsmap)
7c673cae 231{
11fdf7f2 232 std::unique_lock lock(mutex);
7c673cae
FG
233
234 _notify_mdsmap(mdsmap);
235}
236
11fdf7f2 237void Beacon::_notify_mdsmap(const MDSMap &mdsmap)
7c673cae 238{
11fdf7f2 239 ceph_assert(mdsmap.get_epoch() >= epoch);
7c673cae 240
522d829b 241 if (mdsmap.get_epoch() >= epoch) {
11fdf7f2 242 epoch = mdsmap.get_epoch();
7c673cae
FG
243 }
244}
245
246
247bool Beacon::is_laggy()
248{
11fdf7f2 249 std::unique_lock lock(mutex);
7c673cae 250
91327a77
AA
251 auto now = clock::now();
252 auto since = std::chrono::duration<double>(now-last_acked_stamp).count();
11fdf7f2 253 if (since > g_conf()->mds_beacon_grace) {
91327a77 254 if (!laggy) {
11fdf7f2
TL
255 dout(1) << "MDS connection to Monitors appears to be laggy; " << since
256 << "s since last acked beacon" << dendl;
91327a77
AA
257 }
258 laggy = true;
7c673cae
FG
259 return true;
260 }
261 return false;
262}
263
9f95a23c 264void Beacon::set_want_state(const MDSMap &mdsmap, MDSMap::DaemonState newstate)
7c673cae 265{
11fdf7f2 266 std::unique_lock lock(mutex);
7c673cae
FG
267
268 // Update mdsmap epoch atomically with updating want_state, so that when
269 // we send a beacon with the new want state it has the latest epoch, and
270 // once we have updated to the latest epoch, we are not sending out
271 // a stale want_state (i.e. one from before making it through MDSMap
272 // handling)
273 _notify_mdsmap(mdsmap);
274
275 if (want_state != newstate) {
91327a77 276 dout(5) << __func__ << ": "
7c673cae
FG
277 << ceph_mds_state_name(want_state) << " -> "
278 << ceph_mds_state_name(newstate) << dendl;
279 want_state = newstate;
280 }
281}
282
283
284/**
285 * We are 'shown' an MDS briefly in order to update
286 * some health metrics that we will send in the next
287 * beacon.
288 */
289void Beacon::notify_health(MDSRank const *mds)
290{
11fdf7f2 291 std::unique_lock lock(mutex);
7c673cae
FG
292 if (!mds) {
293 // No MDS rank held
294 return;
295 }
296
297 // I'm going to touch this MDS, so it must be locked
9f95a23c 298 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
7c673cae
FG
299
300 health.metrics.clear();
301
2a845540
TL
302 if (unlikely(g_conf().get_val<bool>("mds_inject_health_dummy"))) {
303 MDSHealthMetric m(MDS_HEALTH_DUMMY, HEALTH_ERR, std::string("dummy"));
304 health.metrics.push_back(m);
305 }
306
7c673cae
FG
307 // Detect presence of entries in DamageTable
308 if (!mds->damage_table.empty()) {
309 MDSHealthMetric m(MDS_HEALTH_DAMAGE, HEALTH_ERR, std::string(
310 "Metadata damage detected"));
311 health.metrics.push_back(m);
312 }
313
314 // Detect MDS_HEALTH_TRIM condition
f91f0fd5 315 // Indicates MDS is not trimming promptly
7c673cae 316 {
f91f0fd5 317 if (mds->mdlog->get_num_segments() > (size_t)(g_conf()->mds_log_max_segments * g_conf().get_val<double>("mds_log_warn_factor"))) {
f67539c2
TL
318 CachedStackStringStream css;
319 *css << "Behind on trimming (" << mds->mdlog->get_num_segments()
11fdf7f2 320 << "/" << g_conf()->mds_log_max_segments << ")";
7c673cae 321
f67539c2 322 MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, css->strv());
7c673cae 323 m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
11fdf7f2 324 m.metadata["max_segments"] = stringify(g_conf()->mds_log_max_segments);
7c673cae
FG
325 health.metrics.push_back(m);
326 }
327 }
328
329 // Detect clients failing to respond to modifications to capabilities in
330 // CLIENT_CAPS messages.
331 {
9f95a23c
TL
332 auto&& late_clients = mds->locker->get_late_revoking_clients(mds->mdsmap->get_session_timeout());
333 std::vector<MDSHealthMetric> late_cap_metrics;
7c673cae 334
9f95a23c 335 for (const auto& client : late_clients) {
7c673cae
FG
336 // client_t is equivalent to session.info.inst.name.num
337 // Construct an entity_name_t to lookup into SessionMap
9f95a23c 338 entity_name_t ename(CEPH_ENTITY_TYPE_CLIENT, client.v);
7c673cae
FG
339 Session const *s = mds->sessionmap.get_session(ename);
340 if (s == NULL) {
341 // Shouldn't happen, but not worth crashing if it does as this is
342 // just health-reporting code.
9f95a23c 343 derr << "Client ID without session: " << client.v << dendl;
7c673cae
FG
344 continue;
345 }
346
f67539c2
TL
347 CachedStackStringStream css;
348 *css << "Client " << s->get_human_name() << " failing to respond to capability release";
349 MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, css->strv());
9f95a23c
TL
350 m.metadata["client_id"] = stringify(client.v);
351 late_cap_metrics.emplace_back(std::move(m));
7c673cae
FG
352 }
353
11fdf7f2 354 if (late_cap_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) {
9f95a23c
TL
355 auto&& m = late_cap_metrics;
356 health.metrics.insert(std::end(health.metrics), std::cbegin(m), std::cend(m));
7c673cae 357 } else {
f67539c2
TL
358 CachedStackStringStream css;
359 *css << "Many clients (" << late_cap_metrics.size()
7c673cae 360 << ") failing to respond to capability release";
f67539c2 361 MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, css->strv());
7c673cae 362 m.metadata["client_count"] = stringify(late_cap_metrics.size());
9f95a23c 363 health.metrics.push_back(std::move(m));
7c673cae
FG
364 }
365 }
366
367 // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
368 // messages. May be due to buggy client or resource-hogging application.
369 //
370 // Detect clients failing to advance their old_client_tid
371 {
20effc67 372 std::set<Session*> sessions;
7c673cae
FG
373 mds->sessionmap.get_client_session_set(sessions);
374
f91f0fd5 375 const auto min_caps_working_set = g_conf().get_val<uint64_t>("mds_min_caps_working_set");
11fdf7f2
TL
376 const auto recall_warning_threshold = g_conf().get_val<Option::size_t>("mds_recall_warning_threshold");
377 const auto max_completed_requests = g_conf()->mds_max_completed_requests;
378 const auto max_completed_flushes = g_conf()->mds_max_completed_flushes;
9f95a23c
TL
379 std::vector<MDSHealthMetric> late_recall_metrics;
380 std::vector<MDSHealthMetric> large_completed_requests_metrics;
91327a77 381 for (auto& session : sessions) {
f91f0fd5 382 const uint64_t num_caps = session->get_num_caps();
11fdf7f2 383 const uint64_t recall_caps = session->get_recall_caps();
f91f0fd5 384 if (recall_caps > recall_warning_threshold && num_caps > min_caps_working_set) {
a8e16298
TL
385 dout(2) << "Session " << *session <<
386 " is not releasing caps fast enough. Recalled caps at " << recall_caps
387 << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl;
f67539c2
TL
388 CachedStackStringStream css;
389 *css << "Client " << session->get_human_name() << " failing to respond to cache pressure";
390 MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, css->strv());
a8e16298 391 m.metadata["client_id"] = stringify(session->get_client());
9f95a23c 392 late_recall_metrics.emplace_back(std::move(m));
7c673cae
FG
393 }
394 if ((session->get_num_trim_requests_warnings() > 0 &&
a8e16298 395 session->get_num_completed_requests() >= max_completed_requests) ||
7c673cae 396 (session->get_num_trim_flushes_warnings() > 0 &&
a8e16298 397 session->get_num_completed_flushes() >= max_completed_flushes)) {
f67539c2
TL
398 CachedStackStringStream css;
399 *css << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid. ";
400 MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, css->strv());
11fdf7f2 401 m.metadata["client_id"] = stringify(session->get_client());
9f95a23c 402 large_completed_requests_metrics.emplace_back(std::move(m));
7c673cae
FG
403 }
404 }
405
11fdf7f2 406 if (late_recall_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) {
9f95a23c
TL
407 auto&& m = late_recall_metrics;
408 health.metrics.insert(std::end(health.metrics), std::cbegin(m), std::cend(m));
7c673cae 409 } else {
f67539c2
TL
410 CachedStackStringStream css;
411 *css << "Many clients (" << late_recall_metrics.size()
7c673cae 412 << ") failing to respond to cache pressure";
f67539c2 413 MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, css->strv());
7c673cae
FG
414 m.metadata["client_count"] = stringify(late_recall_metrics.size());
415 health.metrics.push_back(m);
416 late_recall_metrics.clear();
417 }
418
11fdf7f2 419 if (large_completed_requests_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) {
9f95a23c
TL
420 auto&& m = large_completed_requests_metrics;
421 health.metrics.insert(std::end(health.metrics), std::cbegin(m), std::cend(m));
7c673cae 422 } else {
f67539c2
TL
423 CachedStackStringStream css;
424 *css << "Many clients (" << large_completed_requests_metrics.size()
7c673cae 425 << ") failing to advance their oldest client/flush tid";
f67539c2 426 MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, css->strv());
7c673cae
FG
427 m.metadata["client_count"] = stringify(large_completed_requests_metrics.size());
428 health.metrics.push_back(m);
429 large_completed_requests_metrics.clear();
430 }
431 }
432
433 // Detect MDS_HEALTH_SLOW_REQUEST condition
434 {
435 int slow = mds->get_mds_slow_req_count();
7c673cae 436 if (slow) {
91327a77 437 dout(20) << slow << " slow request found" << dendl;
f67539c2
TL
438 CachedStackStringStream css;
439 *css << slow << " slow requests are blocked > " << g_conf()->mds_op_complaint_time << " secs";
7c673cae 440
f67539c2 441 MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, css->strv());
7c673cae
FG
442 health.metrics.push_back(m);
443 }
444 }
445
91327a77 446 {
11fdf7f2 447 auto complaint_time = g_conf()->osd_op_complaint_time;
91327a77
AA
448 auto now = clock::now();
449 auto cutoff = now - ceph::make_timespan(complaint_time);
450
451 std::string count;
452 ceph::coarse_mono_time oldest;
453 if (MDSIOContextBase::check_ios_in_flight(cutoff, count, oldest)) {
454 dout(20) << count << " slow metadata IOs found" << dendl;
455
456 auto oldest_secs = std::chrono::duration<double>(now - oldest).count();
f67539c2
TL
457 CachedStackStringStream css;
458 *css << count << " slow metadata IOs are blocked > " << complaint_time
91327a77
AA
459 << " secs, oldest blocked for " << (int64_t)oldest_secs << " secs";
460
f67539c2 461 MDSHealthMetric m(MDS_HEALTH_SLOW_METADATA_IO, HEALTH_WARN, css->strv());
91327a77
AA
462 health.metrics.push_back(m);
463 }
464 }
465
7c673cae
FG
466 // Report a health warning if we are readonly
467 if (mds->mdcache->is_readonly()) {
468 MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN,
469 "MDS in read-only mode");
470 health.metrics.push_back(m);
471 }
472
473 // Report if we have significantly exceeded our cache size limit
181888fb 474 if (mds->mdcache->cache_overfull()) {
f67539c2
TL
475 CachedStackStringStream css;
476 *css << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
181888fb 477 << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
7c673cae
FG
478 << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
479 << mds->mdcache->get_num_strays() << " stray files";
480
f67539c2 481 MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, css->strv());
7c673cae
FG
482 health.metrics.push_back(m);
483 }
484}
485
486MDSMap::DaemonState Beacon::get_want_state() const
487{
11fdf7f2 488 std::unique_lock lock(mutex);
7c673cae
FG
489 return want_state;
490}
491