1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2012 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #include "common/dout.h"
17 #include "common/HeartbeatMap.h"
19 #include "include/stringify.h"
20 #include "include/util.h"
22 #include "mon/MonClient.h"
23 #include "mds/MDLog.h"
24 #include "mds/MDSRank.h"
25 #include "mds/MDSMap.h"
26 #include "mds/Locker.h"
32 #define dout_context g_ceph_context
33 #define dout_subsys ceph_subsys_mds
35 #define dout_prefix *_dout << "mds.beacon." << name << ' '
37 using namespace std::chrono_literals
;
// Constructor taking the Ceph context, the monitor client and this
// daemon's name.  The initializer visible here seeds beacon_interval from
// the mds_beacon_interval configuration option.
// NOTE(review): the remaining initializers and the constructor body are
// not visible here -- confirm against the full definition.
39 Beacon::Beacon(CephContext
*cct
, MonClient
*monc
, std::string_view name
)
42 beacon_interval(g_conf()->mds_beacon_interval
),
// Shut the beacon down.  Takes the beacon mutex and then inspects the
// `sender` thread.
// NOTE(review): the statements between the lock and the joinable() check,
// and whatever follows the check, are not visible here -- confirm.
53 void Beacon::shutdown()
55 std::unique_lock
<std::mutex
> lock(mutex
);
// `sender` is only assigned a running thread in init(), so it may not be
// joinable when shutdown() is called.
59 if (sender
.joinable())
// Start beaconing: record the initial MDSMap under the beacon mutex, then
// launch the `sender` thread.
64 void Beacon::init(const MDSMap
&mdsmap
)
66 std::unique_lock
lock(mutex
);
68 _notify_mdsmap(mdsmap
);
// The thread body locks the same member mutex that init() holds here.
70 sender
= std::thread([this]() {
71 std::unique_lock
<std::mutex
> lock(mutex
);
// `c` is local to this thread and is never notified, so the wait_for()
// below only serves as a timed sleep that releases the mutex while
// waiting.
72 std::condition_variable c
; // no one wakes us
// `since` is the number of seconds elapsed since last_send.
74 auto now
= clock::now();
75 auto since
= std::chrono::duration
<double>(now
-last_send
).count();
76 auto interval
= beacon_interval
;
// Checks whether at least 90% of the beacon interval has passed since the
// last send.
// NOTE(review): the statements between this check and the 500ms
// assignment are not visible here -- confirm when the short interval is
// actually chosen.
77 if (since
>= interval
*.90) {
79 interval
= 0.5; /* 500ms */
84 dout(20) << "sender thread waiting interval " << interval
<< "s" << dendl
;
// `interval` is a plain number of seconds; multiplying by 1s turns it into
// a chrono duration.
85 c
.wait_for(lock
, interval
*1s
);
90 bool Beacon::ms_can_fast_dispatch2(const cref_t
<Message
>& m
) const
92 return m
->get_type() == MSG_MDS_BEACON
;
// Fast-dispatch entry point: forwards the message to ms_dispatch2() and
// keeps that call's result in `handled`.
// NOTE(review): what is done with `handled` afterwards is not visible
// here -- confirm.
95 void Beacon::ms_fast_dispatch2(const ref_t
<Message
>& m
)
97 bool handled
= ms_dispatch2(m
);
// Dispatch an incoming message.  A MSG_MDS_BEACON that arrived over a
// connection whose peer type is CEPH_ENTITY_TYPE_MON is cast to MMDSBeacon
// and handed to handle_mds_beacon().
// NOTE(review): the return statements are not visible here -- confirm
// which cases report the message as handled.
101 bool Beacon::ms_dispatch2(const ref_t
<Message
>& m
)
103 if (m
->get_type() == MSG_MDS_BEACON
) {
// Checks that the sender of the beacon message is a monitor.
104 if (m
->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
105 handle_mds_beacon(ref_cast
<MMDSBeacon
>(m
));
115 * Update lagginess state based on response from remote MDSMonitor
117 * This function puts the passed message before returning
// Process a beacon reply while holding the beacon mutex: match its
// sequence number against the beacons recorded in seq_stamp, record the
// acknowledgement time and log the round-trip time.
119 void Beacon::handle_mds_beacon(const cref_t
<MMDSBeacon
> &m
)
121 std::unique_lock
lock(mutex
);
123 version_t seq
= m
->get_seq();
// seq_stamp holds the send time of each beacon keyed by its sequence
// number; look up the beacon this reply acknowledges.
126 auto it
= seq_stamp
.find(seq
);
127 if (it
!= seq_stamp
.end()) {
128 auto now
= clock::now();
// last_acked_stamp becomes the *send* time of the acknowledged beacon, so
// `rtt` below is the round trip in seconds.
130 last_acked_stamp
= it
->second
;
131 auto rtt
= std::chrono::duration
<double>(now
- last_acked_stamp
).count();
133 dout(5) << "received beacon reply " << ceph_mds_state_name(m
->get_state()) << " seq " << m
->get_seq() << " rtt " << rtt
<< dendl
;
// A previously laggy beacon whose round trip is back under
// mds_beacon_grace is reported as recovered.
// NOTE(review): the update of `laggy` itself is not visible here.
135 if (laggy
&& rtt
< g_conf()->mds_beacon_grace
) {
136 dout(0) << " MDS is no longer laggy" << dendl
;
// ++it makes the erased range end one past the acknowledged entry, so the
// acknowledged entry and every entry before it are removed.
141 // clean up seq_stamp map
142 seq_stamp
.erase(seq_stamp
.begin(), ++it
);
144 // Wake a waiter up if present
// Logged for a reply whose sequence number is not in seq_stamp.
// NOTE(review): the branch structure around this message is not visible
// here -- presumably the else of the lookup above; confirm.
147 dout(1) << "discarding unexpected beacon reply " << ceph_mds_state_name(m
->get_state())
148 << " seq " << m
->get_seq() << " dne" << dendl
;
155 std::unique_lock
lock(mutex
);
// Wait, for a bounded time, until every beacon up to the current last_seq
// has been acknowledged.
//
// duration: time budget in seconds; the wait below uses 95% of it.
//
// NOTE(review): the send implied by the function name is not visible
// here -- confirm.
160 void Beacon::send_and_wait(const double duration
)
162 std::unique_lock
lock(mutex
);
164 auto awaiting_seq
= last_seq
;
165 dout(20) << __func__
<< ": awaiting " << awaiting_seq
166 << " for up to " << duration
<< "s" << dendl
;
168 auto start
= clock::now();
// handle_mds_beacon() erases acknowledged entries from seq_stamp, so an
// entry at or below awaiting_seq still being present means its reply is
// outstanding.
169 while (!seq_stamp
.empty() && seq_stamp
.begin()->first
<= awaiting_seq
) {
170 auto now
= clock::now();
// `s` is the remaining budget in seconds: 95% of `duration` minus the time
// elapsed since `start`.
// NOTE(review): no handling of a negative `s` is visible here -- confirm
// the loop exits once the budget is used up.
171 auto s
= duration
*.95-std::chrono::duration
<double>(now
-start
).count();
173 cvar
.wait_for(lock
, s
*1s
);
179 * Call periodically, or when you have updated the desired state
183 auto now
= clock::now();
184 auto since
= std::chrono::duration
<double>(now
-last_acked_stamp
).count();
186 if (!cct
->get_heartbeat_map()->is_healthy()) {
187 /* If anything isn't progressing, let's avoid sending a beacon so that
188 * the monitors will consider us laggy */
189 dout(0) << "Skipping beacon heartbeat to monitors (last acked " << since
<< "s ago); MDS internal heartbeat is not healthy!" << dendl
;
194 dout(5) << "Sending beacon " << ceph_mds_state_name(want_state
) << " seq " << last_seq
<< dendl
;
196 seq_stamp
[last_seq
] = now
;
198 ceph_assert(want_state
!= MDSMap::STATE_NULL
);
200 auto beacon
= make_message
<MMDSBeacon
>(
201 monc
->get_fsid(), mds_gid_t(monc
->get_global_id()),
206 CEPH_FEATURES_SUPPORTED_DEFAULT
);
207 beacon
->set_health(health
);
208 beacon
->set_compat(compat
);
209 beacon
->set_fs(g_conf().get_val
<std::string
>("mds_join_fs"));
210 // piggyback the sys info on beacon msg
211 if (want_state
== MDSMap::STATE_BOOT
) {
212 map
<string
, string
> sys_info
;
213 collect_sys_info(&sys_info
, cct
);
214 sys_info
["addr"] = stringify(monc
->get_myaddrs());
215 beacon
->set_sys_info(sys_info
);
217 monc
->send_mon_message(beacon
.detach());
223 * Call this when there is a new MDSMap available
225 void Beacon::notify_mdsmap(const MDSMap
&mdsmap
)
227 std::unique_lock
lock(mutex
);
229 _notify_mdsmap(mdsmap
);
232 void Beacon::_notify_mdsmap(const MDSMap
&mdsmap
)
234 ceph_assert(mdsmap
.get_epoch() >= epoch
);
236 if (mdsmap
.get_epoch() != epoch
) {
237 epoch
= mdsmap
.get_epoch();
238 compat
= MDSMap::get_compat_set_default();
239 compat
.merge(mdsmap
.compat
);
// Lagginess check based on how long ago the last acknowledged beacon was
// sent: compares the seconds since last_acked_stamp with the
// mds_beacon_grace configuration value and logs when it is exceeded.
// NOTE(review): the update of `laggy` and the return statements are not
// visible here -- confirm.
244 bool Beacon::is_laggy()
246 std::unique_lock
lock(mutex
);
248 auto now
= clock::now();
// handle_mds_beacon() sets last_acked_stamp to the send time of the most
// recently acknowledged beacon.
249 auto since
= std::chrono::duration
<double>(now
-last_acked_stamp
).count();
250 if (since
> g_conf()->mds_beacon_grace
) {
252 dout(1) << "MDS connection to Monitors appears to be laggy; " << since
253 << "s since last acked beacon" << dendl
;
261 void Beacon::set_want_state(const MDSMap
&mdsmap
, MDSMap::DaemonState newstate
)
263 std::unique_lock
lock(mutex
);
265 // Update mdsmap epoch atomically with updating want_state, so that when
266 // we send a beacon with the new want state it has the latest epoch, and
267 // once we have updated to the latest epoch, we are not sending out
268 // a stale want_state (i.e. one from before making it through MDSMap
270 _notify_mdsmap(mdsmap
);
272 if (want_state
!= newstate
) {
273 dout(5) << __func__
<< ": "
274 << ceph_mds_state_name(want_state
) << " -> "
275 << ceph_mds_state_name(newstate
) << dendl
;
276 want_state
= newstate
;
282 * We are 'shown' an MDS briefly in order to update
283 * some health metrics that we will send in the next
// Rebuild health.metrics from the current state of `mds`.  The existing
// metrics are cleared, then MDSHealthMetric entries are appended for:
// damage table entries, journal trimming lag, clients late in releasing
// capabilities, clients not responding to cache pressure, clients not
// advancing their oldest tid, slow requests, slow metadata IO, read-only
// mode and an oversized cache.
//
// The beacon mutex is taken here, and mds->mds_lock must already be held
// by the calling thread (asserted below).
//
// NOTE(review): several short control-flow lines of this function are not
// visible here; comments marked "presumably" depend on them.
286 void Beacon::notify_health(MDSRank
const *mds
)
288 std::unique_lock
lock(mutex
);
294 // I'm going to touch this MDS, so it must be locked
295 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
297 health
.metrics
.clear();
299 // Detect presence of entries in DamageTable
300 if (!mds
->damage_table
.empty()) {
301 MDSHealthMetric
m(MDS_HEALTH_DAMAGE
, HEALTH_ERR
, std::string(
302 "Metadata damage detected"));
303 health
.metrics
.push_back(m
);
306 // Detect MDS_HEALTH_TRIM condition
307 // Indicates MDS is not trimming promptly
// The warning threshold is mds_log_max_segments scaled by
// mds_log_warn_factor.
309 if (mds
->mdlog
->get_num_segments() > (size_t)(g_conf()->mds_log_max_segments
* g_conf().get_val
<double>("mds_log_warn_factor"))) {
310 CachedStackStringStream css
;
311 *css
<< "Behind on trimming (" << mds
->mdlog
->get_num_segments()
312 << "/" << g_conf()->mds_log_max_segments
<< ")";
314 MDSHealthMetric
m(MDS_HEALTH_TRIM
, HEALTH_WARN
, css
->strv());
315 m
.metadata
["num_segments"] = stringify(mds
->mdlog
->get_num_segments());
316 m
.metadata
["max_segments"] = stringify(g_conf()->mds_log_max_segments
);
317 health
.metrics
.push_back(m
);
321 // Detect clients failing to respond to modifications to capabilities in
322 // CLIENT_CAPS messages.
// The locker is queried with the session timeout taken from the MDSMap;
// one metric per returned client is collected in late_cap_metrics.
324 auto&& late_clients
= mds
->locker
->get_late_revoking_clients(mds
->mdsmap
->get_session_timeout());
325 std::vector
<MDSHealthMetric
> late_cap_metrics
;
327 for (const auto& client
: late_clients
) {
328 // client_t is equivalent to session.info.inst.name.num
329 // Construct an entity_name_t to lookup into SessionMap
330 entity_name_t
ename(CEPH_ENTITY_TYPE_CLIENT
, client
.v
);
331 Session
const *s
= mds
->sessionmap
.get_session(ename
);
333 // Shouldn't happen, but not worth crashing if it does as this is
334 // just health-reporting code.
335 derr
<< "Client ID without session: " << client
.v
<< dendl
;
339 CachedStackStringStream css
;
340 *css
<< "Client " << s
->get_human_name() << " failing to respond to capability release";
341 MDSHealthMetric
m(MDS_HEALTH_CLIENT_LATE_RELEASE
, HEALTH_WARN
, css
->strv());
342 m
.metadata
["client_id"] = stringify(client
.v
);
343 late_cap_metrics
.emplace_back(std::move(m
));
// Up to mds_health_summarize_threshold clients are reported individually.
// The single "Many clients" metric below is presumably the alternative
// for larger counts -- the branch line is not visible here.
346 if (late_cap_metrics
.size() <= (size_t)g_conf()->mds_health_summarize_threshold
) {
347 auto&& m
= late_cap_metrics
;
348 health
.metrics
.insert(std::end(health
.metrics
), std::cbegin(m
), std::cend(m
));
350 CachedStackStringStream css
;
351 *css
<< "Many clients (" << late_cap_metrics
.size()
352 << ") failing to respond to capability release";
353 MDSHealthMetric
m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY
, HEALTH_WARN
, css
->strv());
354 m
.metadata
["client_count"] = stringify(late_cap_metrics
.size());
355 health
.metrics
.push_back(std::move(m
));
359 // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
360 // messages. May be due to buggy client or resource-hogging application.
362 // Detect clients failing to advance their old_client_tid
364 set
<Session
*> sessions
;
365 mds
->sessionmap
.get_client_session_set(sessions
);
// Thresholds are read once, before iterating over the sessions.
367 const auto min_caps_working_set
= g_conf().get_val
<uint64_t>("mds_min_caps_working_set");
368 const auto recall_warning_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_warning_threshold");
369 const auto max_completed_requests
= g_conf()->mds_max_completed_requests
;
370 const auto max_completed_flushes
= g_conf()->mds_max_completed_flushes
;
371 std::vector
<MDSHealthMetric
> late_recall_metrics
;
372 std::vector
<MDSHealthMetric
> large_completed_requests_metrics
;
373 for (auto& session
: sessions
) {
374 const uint64_t num_caps
= session
->get_num_caps();
375 const uint64_t recall_caps
= session
->get_recall_caps();
// Flag a session whose recalled-caps value exceeds the warning threshold
// while it still holds more caps than the minimum working set.
376 if (recall_caps
> recall_warning_threshold
&& num_caps
> min_caps_working_set
) {
377 dout(2) << "Session " << *session
<<
378 " is not releasing caps fast enough. Recalled caps at " << recall_caps
379 << " > " << recall_warning_threshold
<< " (mds_recall_warning_threshold)." << dendl
;
380 CachedStackStringStream css
;
381 *css
<< "Client " << session
->get_human_name() << " failing to respond to cache pressure";
382 MDSHealthMetric
m(MDS_HEALTH_CLIENT_RECALL
, HEALTH_WARN
, css
->strv());
383 m
.metadata
["client_id"] = stringify(session
->get_client());
384 late_recall_metrics
.emplace_back(std::move(m
));
// Flag a session that has trim warnings on record and whose completed
// request (or flush) count has reached the configured maximum.
386 if ((session
->get_num_trim_requests_warnings() > 0 &&
387 session
->get_num_completed_requests() >= max_completed_requests
) ||
388 (session
->get_num_trim_flushes_warnings() > 0 &&
389 session
->get_num_completed_flushes() >= max_completed_flushes
)) {
390 CachedStackStringStream css
;
391 *css
<< "Client " << session
->get_human_name() << " failing to advance its oldest client/flush tid. ";
392 MDSHealthMetric
m(MDS_HEALTH_CLIENT_OLDEST_TID
, HEALTH_WARN
, css
->strv());
393 m
.metadata
["client_id"] = stringify(session
->get_client());
394 large_completed_requests_metrics
.emplace_back(std::move(m
));
// Same individual-versus-summary reporting as for late cap releases, here
// applied to the cache-pressure metrics.
398 if (late_recall_metrics
.size() <= (size_t)g_conf()->mds_health_summarize_threshold
) {
399 auto&& m
= late_recall_metrics
;
400 health
.metrics
.insert(std::end(health
.metrics
), std::cbegin(m
), std::cend(m
));
402 CachedStackStringStream css
;
403 *css
<< "Many clients (" << late_recall_metrics
.size()
404 << ") failing to respond to cache pressure";
405 MDSHealthMetric
m(MDS_HEALTH_CLIENT_RECALL_MANY
, HEALTH_WARN
, css
->strv());
406 m
.metadata
["client_count"] = stringify(late_recall_metrics
.size());
407 health
.metrics
.push_back(m
);
408 late_recall_metrics
.clear();
// And once more for the oldest-tid metrics.
411 if (large_completed_requests_metrics
.size() <= (size_t)g_conf()->mds_health_summarize_threshold
) {
412 auto&& m
= large_completed_requests_metrics
;
413 health
.metrics
.insert(std::end(health
.metrics
), std::cbegin(m
), std::cend(m
));
415 CachedStackStringStream css
;
416 *css
<< "Many clients (" << large_completed_requests_metrics
.size()
417 << ") failing to advance their oldest client/flush tid";
418 MDSHealthMetric
m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY
, HEALTH_WARN
, css
->strv());
419 m
.metadata
["client_count"] = stringify(large_completed_requests_metrics
.size());
420 health
.metrics
.push_back(m
);
421 large_completed_requests_metrics
.clear();
425 // Detect MDS_HEALTH_SLOW_REQUEST condition
427 int slow
= mds
->get_mds_slow_req_count();
429 dout(20) << slow
<< " slow request found" << dendl
;
430 CachedStackStringStream css
;
431 *css
<< slow
<< " slow requests are blocked > " << g_conf()->mds_op_complaint_time
<< " secs";
433 MDSHealthMetric
m(MDS_HEALTH_SLOW_REQUEST
, HEALTH_WARN
, css
->strv());
434 health
.metrics
.push_back(m
);
// Slow metadata IO: IOs in flight are checked against a cutoff that lies
// `complaint_time` before now.
// NOTE(review): this check reads osd_op_complaint_time, whereas the
// slow-request message above reports mds_op_complaint_time -- confirm the
// difference is intended.
439 auto complaint_time
= g_conf()->osd_op_complaint_time
;
440 auto now
= clock::now();
441 auto cutoff
= now
- ceph::make_timespan(complaint_time
);
444 ceph::coarse_mono_time oldest
;
445 if (MDSIOContextBase::check_ios_in_flight(cutoff
, count
, oldest
)) {
446 dout(20) << count
<< " slow metadata IOs found" << dendl
;
// The age of the oldest blocked IO in seconds; truncated to a whole number
// in the message below.
448 auto oldest_secs
= std::chrono::duration
<double>(now
- oldest
).count();
449 CachedStackStringStream css
;
450 *css
<< count
<< " slow metadata IOs are blocked > " << complaint_time
451 << " secs, oldest blocked for " << (int64_t)oldest_secs
<< " secs";
453 MDSHealthMetric
m(MDS_HEALTH_SLOW_METADATA_IO
, HEALTH_WARN
, css
->strv());
454 health
.metrics
.push_back(m
);
458 // Report a health warning if we are readonly
459 if (mds
->mdcache
->is_readonly()) {
460 MDSHealthMetric
m(MDS_HEALTH_READ_ONLY
, HEALTH_WARN
,
461 "MDS in read-only mode");
462 health
.metrics
.push_back(m
);
465 // Report if we have significantly exceeded our cache size limit
466 if (mds
->mdcache
->cache_overfull()) {
467 CachedStackStringStream css
;
468 *css
<< "MDS cache is too large (" << bytes2str(mds
->mdcache
->cache_size())
469 << "/" << bytes2str(mds
->mdcache
->cache_limit_memory()) << "); "
470 << mds
->mdcache
->num_inodes_with_caps
<< " inodes in use by clients, "
471 << mds
->mdcache
->get_num_strays() << " stray files";
473 MDSHealthMetric
m(MDS_HEALTH_CACHE_OVERSIZED
, HEALTH_WARN
, css
->strv());
474 health
.metrics
.push_back(m
);
478 MDSMap::DaemonState
Beacon::get_want_state() const
480 std::unique_lock
lock(mutex
);