]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/Beacon.cc
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / mds / Beacon.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2012 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16#include "common/dout.h"
17#include "common/HeartbeatMap.h"
181888fb 18
7c673cae
FG
19#include "include/stringify.h"
20#include "include/util.h"
21
7c673cae
FG
22#include "mon/MonClient.h"
23#include "mds/MDLog.h"
24#include "mds/MDSRank.h"
25#include "mds/MDSMap.h"
26#include "mds/Locker.h"
27
28#include "Beacon.h"
29
91327a77
AA
30#include <chrono>
31
7c673cae
FG
32#define dout_context g_ceph_context
33#define dout_subsys ceph_subsys_mds
34#undef dout_prefix
35#define dout_prefix *_dout << "mds.beacon." << name << ' '
36
11fdf7f2
TL
37using namespace std::chrono_literals;
38
39Beacon::Beacon(CephContext *cct, MonClient *monc, std::string_view name)
91327a77
AA
40 :
41 Dispatcher(cct),
11fdf7f2 42 beacon_interval(g_conf()->mds_beacon_interval),
91327a77
AA
43 monc(monc),
44 name(name)
7c673cae 45{
7c673cae
FG
46}
47
7c673cae
FG
48Beacon::~Beacon()
49{
91327a77 50 shutdown();
7c673cae
FG
51}
52
91327a77
AA
53void Beacon::shutdown()
54{
55 std::unique_lock<std::mutex> lock(mutex);
56 if (!finished) {
57 finished = true;
58 lock.unlock();
59 sender.join();
60 }
61}
7c673cae 62
11fdf7f2 63void Beacon::init(const MDSMap &mdsmap)
7c673cae 64{
11fdf7f2 65 std::unique_lock lock(mutex);
7c673cae
FG
66
67 _notify_mdsmap(mdsmap);
7c673cae 68
91327a77
AA
69 sender = std::thread([this]() {
70 std::unique_lock<std::mutex> lock(mutex);
71 std::condition_variable c; // no one wakes us
72 while (!finished) {
73 auto now = clock::now();
74 auto since = std::chrono::duration<double>(now-last_send).count();
75 auto interval = beacon_interval;
76 if (since >= interval*.90) {
a8e16298
TL
77 if (!_send()) {
78 interval = 0.5; /* 500ms */
79 }
91327a77
AA
80 } else {
81 interval -= since;
82 }
83 dout(20) << "sender thread waiting interval " << interval << "s" << dendl;
11fdf7f2 84 c.wait_for(lock, interval*1s);
91327a77
AA
85 }
86 });
7c673cae
FG
87}
88
11fdf7f2 89bool Beacon::ms_can_fast_dispatch2(const Message::const_ref& m) const
7c673cae 90{
91327a77 91 return m->get_type() == MSG_MDS_BEACON;
7c673cae
FG
92}
93
11fdf7f2 94void Beacon::ms_fast_dispatch2(const Message::ref& m)
91327a77 95{
11fdf7f2
TL
96 bool handled = ms_dispatch2(m);
97 ceph_assert(handled);
91327a77 98}
7c673cae 99
11fdf7f2 100bool Beacon::ms_dispatch2(const Message::ref& m)
7c673cae
FG
101{
102 if (m->get_type() == MSG_MDS_BEACON) {
103 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
11fdf7f2 104 handle_mds_beacon(MMDSBeacon::msgref_cast(m));
7c673cae
FG
105 }
106 return true;
107 }
108
109 return false;
110}
111
112
113/**
114 * Update lagginess state based on response from remote MDSMonitor
115 *
116 * This function puts the passed message before returning
117 */
11fdf7f2 118void Beacon::handle_mds_beacon(const MMDSBeacon::const_ref &m)
7c673cae 119{
11fdf7f2 120 std::unique_lock lock(mutex);
7c673cae
FG
121
122 version_t seq = m->get_seq();
123
124 // update lab
91327a77
AA
125 auto it = seq_stamp.find(seq);
126 if (it != seq_stamp.end()) {
127 auto now = clock::now();
128
129 last_acked_stamp = it->second;
130 auto rtt = std::chrono::duration<double>(now - last_acked_stamp).count();
131
132 dout(5) << "received beacon reply " << ceph_mds_state_name(m->get_state()) << " seq " << m->get_seq() << " rtt " << rtt << dendl;
133
11fdf7f2 134 if (laggy && rtt < g_conf()->mds_beacon_grace) {
91327a77
AA
135 dout(0) << " MDS is no longer laggy" << dendl;
136 laggy = false;
137 last_laggy = now;
7c673cae
FG
138 }
139
140 // clean up seq_stamp map
91327a77 141 seq_stamp.erase(seq_stamp.begin(), ++it);
7c673cae
FG
142
143 // Wake a waiter up if present
91327a77 144 cvar.notify_all();
7c673cae 145 } else {
91327a77
AA
146 dout(1) << "discarding unexpected beacon reply " << ceph_mds_state_name(m->get_state())
147 << " seq " << m->get_seq() << " dne" << dendl;
7c673cae
FG
148 }
149}
150
151
152void Beacon::send()
153{
11fdf7f2 154 std::unique_lock lock(mutex);
7c673cae
FG
155 _send();
156}
157
158
159void Beacon::send_and_wait(const double duration)
160{
11fdf7f2 161 std::unique_lock lock(mutex);
7c673cae 162 _send();
91327a77 163 auto awaiting_seq = last_seq;
7c673cae
FG
164 dout(20) << __func__ << ": awaiting " << awaiting_seq
165 << " for up to " << duration << "s" << dendl;
166
91327a77
AA
167 auto start = clock::now();
168 while (!seq_stamp.empty() && seq_stamp.begin()->first <= awaiting_seq) {
169 auto now = clock::now();
170 auto s = duration*.95-std::chrono::duration<double>(now-start).count();
171 if (s < 0) break;
11fdf7f2 172 cvar.wait_for(lock, s*1s);
7c673cae 173 }
7c673cae
FG
174}
175
176
177/**
178 * Call periodically, or when you have updated the desired state
179 */
a8e16298 180bool Beacon::_send()
7c673cae 181{
91327a77
AA
182 auto now = clock::now();
183 auto since = std::chrono::duration<double>(now-last_acked_stamp).count();
7c673cae
FG
184
185 if (!cct->get_heartbeat_map()->is_healthy()) {
186 /* If anything isn't progressing, let avoid sending a beacon so that
187 * the MDS will consider us laggy */
91327a77 188 dout(0) << "Skipping beacon heartbeat to monitors (last acked " << since << "s ago); MDS internal heartbeat is not healthy!" << dendl;
a8e16298 189 return false;
7c673cae
FG
190 }
191
192 ++last_seq;
91327a77 193 dout(5) << "Sending beacon " << ceph_mds_state_name(want_state) << " seq " << last_seq << dendl;
7c673cae 194
91327a77 195 seq_stamp[last_seq] = now;
7c673cae 196
11fdf7f2 197 ceph_assert(want_state != MDSMap::STATE_NULL);
7c673cae 198
11fdf7f2 199 auto beacon = MMDSBeacon::create(
7c673cae
FG
200 monc->get_fsid(), mds_gid_t(monc->get_global_id()),
201 name,
202 epoch,
203 want_state,
204 last_seq,
205 CEPH_FEATURES_SUPPORTED_DEFAULT);
206
7c673cae
FG
207 beacon->set_health(health);
208 beacon->set_compat(compat);
209 // piggyback the sys info on beacon msg
210 if (want_state == MDSMap::STATE_BOOT) {
211 map<string, string> sys_info;
212 collect_sys_info(&sys_info, cct);
11fdf7f2 213 sys_info["addr"] = stringify(monc->get_myaddrs());
7c673cae
FG
214 beacon->set_sys_info(sys_info);
215 }
11fdf7f2 216 monc->send_mon_message(beacon.detach());
91327a77 217 last_send = now;
a8e16298 218 return true;
7c673cae
FG
219}
220
221/**
222 * Call this when there is a new MDSMap available
223 */
11fdf7f2 224void Beacon::notify_mdsmap(const MDSMap &mdsmap)
7c673cae 225{
11fdf7f2 226 std::unique_lock lock(mutex);
7c673cae
FG
227
228 _notify_mdsmap(mdsmap);
229}
230
11fdf7f2 231void Beacon::_notify_mdsmap(const MDSMap &mdsmap)
7c673cae 232{
11fdf7f2 233 ceph_assert(mdsmap.get_epoch() >= epoch);
7c673cae 234
11fdf7f2
TL
235 if (mdsmap.get_epoch() != epoch) {
236 epoch = mdsmap.get_epoch();
1adf2230 237 compat = MDSMap::get_compat_set_default();
11fdf7f2 238 compat.merge(mdsmap.compat);
7c673cae
FG
239 }
240}
241
242
243bool Beacon::is_laggy()
244{
11fdf7f2 245 std::unique_lock lock(mutex);
7c673cae 246
91327a77
AA
247 auto now = clock::now();
248 auto since = std::chrono::duration<double>(now-last_acked_stamp).count();
11fdf7f2 249 if (since > g_conf()->mds_beacon_grace) {
91327a77 250 if (!laggy) {
11fdf7f2
TL
251 dout(1) << "MDS connection to Monitors appears to be laggy; " << since
252 << "s since last acked beacon" << dendl;
91327a77
AA
253 }
254 laggy = true;
7c673cae
FG
255 return true;
256 }
257 return false;
258}
259
11fdf7f2 260void Beacon::set_want_state(const MDSMap &mdsmap, MDSMap::DaemonState const newstate)
7c673cae 261{
11fdf7f2 262 std::unique_lock lock(mutex);
7c673cae
FG
263
264 // Update mdsmap epoch atomically with updating want_state, so that when
265 // we send a beacon with the new want state it has the latest epoch, and
266 // once we have updated to the latest epoch, we are not sending out
267 // a stale want_state (i.e. one from before making it through MDSMap
268 // handling)
269 _notify_mdsmap(mdsmap);
270
271 if (want_state != newstate) {
91327a77 272 dout(5) << __func__ << ": "
7c673cae
FG
273 << ceph_mds_state_name(want_state) << " -> "
274 << ceph_mds_state_name(newstate) << dendl;
275 want_state = newstate;
276 }
277}
278
279
280/**
281 * We are 'shown' an MDS briefly in order to update
282 * some health metrics that we will send in the next
283 * beacon.
284 */
285void Beacon::notify_health(MDSRank const *mds)
286{
11fdf7f2 287 std::unique_lock lock(mutex);
7c673cae
FG
288 if (!mds) {
289 // No MDS rank held
290 return;
291 }
292
293 // I'm going to touch this MDS, so it must be locked
11fdf7f2 294 ceph_assert(mds->mds_lock.is_locked_by_me());
7c673cae
FG
295
296 health.metrics.clear();
297
298 // Detect presence of entries in DamageTable
299 if (!mds->damage_table.empty()) {
300 MDSHealthMetric m(MDS_HEALTH_DAMAGE, HEALTH_ERR, std::string(
301 "Metadata damage detected"));
302 health.metrics.push_back(m);
303 }
304
305 // Detect MDS_HEALTH_TRIM condition
306 // Arbitrary factor of 2, indicates MDS is not trimming promptly
307 {
11fdf7f2 308 if (mds->mdlog->get_num_segments() > (size_t)(g_conf()->mds_log_max_segments * 2)) {
7c673cae
FG
309 std::ostringstream oss;
310 oss << "Behind on trimming (" << mds->mdlog->get_num_segments()
11fdf7f2 311 << "/" << g_conf()->mds_log_max_segments << ")";
7c673cae
FG
312
313 MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str());
314 m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
11fdf7f2 315 m.metadata["max_segments"] = stringify(g_conf()->mds_log_max_segments);
7c673cae
FG
316 health.metrics.push_back(m);
317 }
318 }
319
320 // Detect clients failing to respond to modifications to capabilities in
321 // CLIENT_CAPS messages.
322 {
323 std::list<client_t> late_clients;
91327a77
AA
324 mds->locker->get_late_revoking_clients(&late_clients,
325 mds->mdsmap->get_session_timeout());
7c673cae
FG
326 std::list<MDSHealthMetric> late_cap_metrics;
327
328 for (std::list<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) {
329
330 // client_t is equivalent to session.info.inst.name.num
331 // Construct an entity_name_t to lookup into SessionMap
332 entity_name_t ename(CEPH_ENTITY_TYPE_CLIENT, i->v);
333 Session const *s = mds->sessionmap.get_session(ename);
334 if (s == NULL) {
335 // Shouldn't happen, but not worth crashing if it does as this is
336 // just health-reporting code.
337 derr << "Client ID without session: " << i->v << dendl;
338 continue;
339 }
340
341 std::ostringstream oss;
342 oss << "Client " << s->get_human_name() << " failing to respond to capability release";
343 MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str());
344 m.metadata["client_id"] = stringify(i->v);
345 late_cap_metrics.push_back(m);
346 }
347
11fdf7f2 348 if (late_cap_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) {
7c673cae
FG
349 health.metrics.splice(health.metrics.end(), late_cap_metrics);
350 } else {
351 std::ostringstream oss;
352 oss << "Many clients (" << late_cap_metrics.size()
353 << ") failing to respond to capability release";
354 MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str());
355 m.metadata["client_count"] = stringify(late_cap_metrics.size());
356 health.metrics.push_back(m);
357 late_cap_metrics.clear();
358 }
359 }
360
361 // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
362 // messages. May be due to buggy client or resource-hogging application.
363 //
364 // Detect clients failing to advance their old_client_tid
365 {
366 set<Session*> sessions;
367 mds->sessionmap.get_client_session_set(sessions);
368
11fdf7f2
TL
369 const auto recall_warning_threshold = g_conf().get_val<Option::size_t>("mds_recall_warning_threshold");
370 const auto max_completed_requests = g_conf()->mds_max_completed_requests;
371 const auto max_completed_flushes = g_conf()->mds_max_completed_flushes;
7c673cae
FG
372 std::list<MDSHealthMetric> late_recall_metrics;
373 std::list<MDSHealthMetric> large_completed_requests_metrics;
91327a77 374 for (auto& session : sessions) {
11fdf7f2 375 const uint64_t recall_caps = session->get_recall_caps();
a8e16298
TL
376 if (recall_caps > recall_warning_threshold) {
377 dout(2) << "Session " << *session <<
378 " is not releasing caps fast enough. Recalled caps at " << recall_caps
379 << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl;
380 std::ostringstream oss;
381 oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
382 MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
383 m.metadata["client_id"] = stringify(session->get_client());
384 late_recall_metrics.push_back(m);
7c673cae
FG
385 }
386 if ((session->get_num_trim_requests_warnings() > 0 &&
a8e16298 387 session->get_num_completed_requests() >= max_completed_requests) ||
7c673cae 388 (session->get_num_trim_flushes_warnings() > 0 &&
a8e16298 389 session->get_num_completed_flushes() >= max_completed_flushes)) {
7c673cae
FG
390 std::ostringstream oss;
391 oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid";
392 MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
11fdf7f2 393 m.metadata["client_id"] = stringify(session->get_client());
7c673cae
FG
394 large_completed_requests_metrics.push_back(m);
395 }
396 }
397
11fdf7f2 398 if (late_recall_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) {
7c673cae
FG
399 health.metrics.splice(health.metrics.end(), late_recall_metrics);
400 } else {
401 std::ostringstream oss;
402 oss << "Many clients (" << late_recall_metrics.size()
403 << ") failing to respond to cache pressure";
404 MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str());
405 m.metadata["client_count"] = stringify(late_recall_metrics.size());
406 health.metrics.push_back(m);
407 late_recall_metrics.clear();
408 }
409
11fdf7f2 410 if (large_completed_requests_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) {
7c673cae
FG
411 health.metrics.splice(health.metrics.end(), large_completed_requests_metrics);
412 } else {
413 std::ostringstream oss;
414 oss << "Many clients (" << large_completed_requests_metrics.size()
415 << ") failing to advance their oldest client/flush tid";
416 MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str());
417 m.metadata["client_count"] = stringify(large_completed_requests_metrics.size());
418 health.metrics.push_back(m);
419 large_completed_requests_metrics.clear();
420 }
421 }
422
423 // Detect MDS_HEALTH_SLOW_REQUEST condition
424 {
425 int slow = mds->get_mds_slow_req_count();
7c673cae 426 if (slow) {
91327a77 427 dout(20) << slow << " slow request found" << dendl;
7c673cae 428 std::ostringstream oss;
11fdf7f2 429 oss << slow << " slow requests are blocked > " << g_conf()->mds_op_complaint_time << " secs";
7c673cae
FG
430
431 MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, oss.str());
432 health.metrics.push_back(m);
433 }
434 }
435
91327a77 436 {
11fdf7f2 437 auto complaint_time = g_conf()->osd_op_complaint_time;
91327a77
AA
438 auto now = clock::now();
439 auto cutoff = now - ceph::make_timespan(complaint_time);
440
441 std::string count;
442 ceph::coarse_mono_time oldest;
443 if (MDSIOContextBase::check_ios_in_flight(cutoff, count, oldest)) {
444 dout(20) << count << " slow metadata IOs found" << dendl;
445
446 auto oldest_secs = std::chrono::duration<double>(now - oldest).count();
447 std::ostringstream oss;
448 oss << count << " slow metadata IOs are blocked > " << complaint_time
449 << " secs, oldest blocked for " << (int64_t)oldest_secs << " secs";
450
451 MDSHealthMetric m(MDS_HEALTH_SLOW_METADATA_IO, HEALTH_WARN, oss.str());
452 health.metrics.push_back(m);
453 }
454 }
455
7c673cae
FG
456 // Report a health warning if we are readonly
457 if (mds->mdcache->is_readonly()) {
458 MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN,
459 "MDS in read-only mode");
460 health.metrics.push_back(m);
461 }
462
463 // Report if we have significantly exceeded our cache size limit
181888fb 464 if (mds->mdcache->cache_overfull()) {
7c673cae 465 std::ostringstream oss;
181888fb
FG
466 oss << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
467 << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
7c673cae
FG
468 << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
469 << mds->mdcache->get_num_strays() << " stray files";
470
471 MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, oss.str());
472 health.metrics.push_back(m);
473 }
474}
475
476MDSMap::DaemonState Beacon::get_want_state() const
477{
11fdf7f2 478 std::unique_lock lock(mutex);
7c673cae
FG
479 return want_state;
480}
481