]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2012 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | #include "common/dout.h" | |
17 | #include "common/HeartbeatMap.h" | |
18 | #include "include/stringify.h" | |
19 | #include "include/util.h" | |
20 | ||
21 | #include "messages/MMDSBeacon.h" | |
22 | #include "mon/MonClient.h" | |
23 | #include "mds/MDLog.h" | |
24 | #include "mds/MDSRank.h" | |
25 | #include "mds/MDSMap.h" | |
26 | #include "mds/Locker.h" | |
27 | ||
28 | #include "Beacon.h" | |
29 | ||
30 | #define dout_context g_ceph_context | |
31 | #define dout_subsys ceph_subsys_mds | |
32 | #undef dout_prefix | |
33 | #define dout_prefix *_dout << "mds.beacon." << name << ' ' | |
34 | ||
35 | ||
36 | class Beacon::C_MDS_BeaconSender : public Context { | |
37 | public: | |
38 | explicit C_MDS_BeaconSender(Beacon *beacon_) : beacon(beacon_) {} | |
39 | void finish(int r) override { | |
40 | assert(beacon->lock.is_locked_by_me()); | |
41 | beacon->sender = NULL; | |
42 | beacon->_send(); | |
43 | } | |
44 | private: | |
45 | Beacon *beacon; | |
46 | }; | |
47 | ||
48 | Beacon::Beacon(CephContext *cct_, MonClient *monc_, std::string name_) : | |
49 | Dispatcher(cct_), lock("Beacon"), monc(monc_), timer(g_ceph_context, lock), | |
50 | name(name_), standby_for_rank(MDS_RANK_NONE), | |
51 | standby_for_fscid(FS_CLUSTER_ID_NONE), want_state(MDSMap::STATE_BOOT), | |
52 | awaiting_seq(-1) | |
53 | { | |
54 | last_seq = 0; | |
55 | sender = NULL; | |
56 | was_laggy = false; | |
57 | ||
58 | epoch = 0; | |
59 | } | |
60 | ||
61 | ||
62 | Beacon::~Beacon() | |
63 | { | |
64 | } | |
65 | ||
66 | ||
67 | void Beacon::init(MDSMap const *mdsmap) | |
68 | { | |
69 | Mutex::Locker l(lock); | |
70 | assert(mdsmap != NULL); | |
71 | ||
72 | _notify_mdsmap(mdsmap); | |
73 | standby_for_rank = mds_rank_t(g_conf->mds_standby_for_rank); | |
74 | standby_for_name = g_conf->mds_standby_for_name; | |
75 | standby_for_fscid = fs_cluster_id_t(g_conf->mds_standby_for_fscid); | |
76 | standby_replay = g_conf->mds_standby_replay; | |
77 | ||
78 | // Spawn threads and start messaging | |
79 | timer.init(); | |
80 | _send(); | |
81 | } | |
82 | ||
83 | ||
84 | void Beacon::shutdown() | |
85 | { | |
86 | Mutex::Locker l(lock); | |
87 | if (sender) { | |
88 | timer.cancel_event(sender); | |
89 | sender = NULL; | |
90 | } | |
91 | timer.shutdown(); | |
92 | } | |
93 | ||
94 | ||
95 | bool Beacon::ms_dispatch(Message *m) | |
96 | { | |
97 | if (m->get_type() == MSG_MDS_BEACON) { | |
98 | if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON) { | |
99 | handle_mds_beacon(static_cast<MMDSBeacon*>(m)); | |
100 | } | |
101 | return true; | |
102 | } | |
103 | ||
104 | return false; | |
105 | } | |
106 | ||
107 | ||
108 | /** | |
109 | * Update lagginess state based on response from remote MDSMonitor | |
110 | * | |
111 | * This function puts the passed message before returning | |
112 | */ | |
113 | void Beacon::handle_mds_beacon(MMDSBeacon *m) | |
114 | { | |
115 | Mutex::Locker l(lock); | |
116 | assert(m != NULL); | |
117 | ||
118 | version_t seq = m->get_seq(); | |
119 | ||
120 | // update lab | |
121 | if (seq_stamp.count(seq)) { | |
122 | utime_t now = ceph_clock_now(); | |
123 | if (seq_stamp[seq] > last_acked_stamp) { | |
124 | last_acked_stamp = seq_stamp[seq]; | |
125 | utime_t rtt = now - last_acked_stamp; | |
126 | ||
127 | dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state()) | |
128 | << " seq " << m->get_seq() << " rtt " << rtt << dendl; | |
129 | ||
130 | if (was_laggy && rtt < g_conf->mds_beacon_grace) { | |
131 | dout(0) << "handle_mds_beacon no longer laggy" << dendl; | |
132 | was_laggy = false; | |
133 | laggy_until = now; | |
134 | } | |
135 | } else { | |
136 | // Mark myself laggy if system clock goes backwards. Hopping | |
137 | // later beacons will clear it. | |
138 | dout(1) << "handle_mds_beacon system clock goes backwards, " | |
139 | << "mark myself laggy" << dendl; | |
140 | last_acked_stamp = now - utime_t(g_conf->mds_beacon_grace + 1, 0); | |
141 | was_laggy = true; | |
142 | } | |
143 | ||
144 | // clean up seq_stamp map | |
145 | while (!seq_stamp.empty() && | |
146 | seq_stamp.begin()->first <= seq) | |
147 | seq_stamp.erase(seq_stamp.begin()); | |
148 | ||
149 | // Wake a waiter up if present | |
150 | if (awaiting_seq == seq) { | |
151 | waiting_cond.Signal(); | |
152 | } | |
153 | } else { | |
154 | dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state()) | |
155 | << " seq " << m->get_seq() << " dne" << dendl; | |
156 | } | |
157 | } | |
158 | ||
159 | ||
160 | void Beacon::send() | |
161 | { | |
162 | Mutex::Locker l(lock); | |
163 | _send(); | |
164 | } | |
165 | ||
166 | ||
167 | void Beacon::send_and_wait(const double duration) | |
168 | { | |
169 | Mutex::Locker l(lock); | |
170 | _send(); | |
171 | awaiting_seq = last_seq; | |
172 | dout(20) << __func__ << ": awaiting " << awaiting_seq | |
173 | << " for up to " << duration << "s" << dendl; | |
174 | ||
175 | utime_t timeout; | |
176 | timeout.set_from_double(ceph_clock_now() + duration); | |
177 | while ((!seq_stamp.empty() && seq_stamp.begin()->first <= awaiting_seq) | |
178 | && ceph_clock_now() < timeout) { | |
179 | waiting_cond.WaitUntil(lock, timeout); | |
180 | } | |
181 | ||
182 | awaiting_seq = -1; | |
183 | } | |
184 | ||
185 | ||
186 | /** | |
187 | * Call periodically, or when you have updated the desired state | |
188 | */ | |
189 | void Beacon::_send() | |
190 | { | |
191 | if (sender) { | |
192 | timer.cancel_event(sender); | |
193 | } | |
194 | sender = new C_MDS_BeaconSender(this); | |
195 | timer.add_event_after(g_conf->mds_beacon_interval, sender); | |
196 | ||
197 | if (!cct->get_heartbeat_map()->is_healthy()) { | |
198 | /* If anything isn't progressing, let avoid sending a beacon so that | |
199 | * the MDS will consider us laggy */ | |
200 | dout(1) << __func__ << " skipping beacon, heartbeat map not healthy" << dendl; | |
201 | return; | |
202 | } | |
203 | ||
204 | ++last_seq; | |
205 | dout(10) << __func__ << " " << ceph_mds_state_name(want_state) | |
206 | << " seq " << last_seq | |
207 | << dendl; | |
208 | ||
209 | seq_stamp[last_seq] = ceph_clock_now(); | |
210 | ||
211 | assert(want_state != MDSMap::STATE_NULL); | |
212 | ||
213 | MMDSBeacon *beacon = new MMDSBeacon( | |
214 | monc->get_fsid(), mds_gid_t(monc->get_global_id()), | |
215 | name, | |
216 | epoch, | |
217 | want_state, | |
218 | last_seq, | |
219 | CEPH_FEATURES_SUPPORTED_DEFAULT); | |
220 | ||
221 | beacon->set_standby_for_rank(standby_for_rank); | |
222 | beacon->set_standby_for_name(standby_for_name); | |
223 | beacon->set_standby_for_fscid(standby_for_fscid); | |
224 | beacon->set_standby_replay(standby_replay); | |
225 | beacon->set_health(health); | |
226 | beacon->set_compat(compat); | |
227 | // piggyback the sys info on beacon msg | |
228 | if (want_state == MDSMap::STATE_BOOT) { | |
229 | map<string, string> sys_info; | |
230 | collect_sys_info(&sys_info, cct); | |
231 | sys_info["addr"] = stringify(monc->get_myaddr()); | |
232 | beacon->set_sys_info(sys_info); | |
233 | } | |
234 | monc->send_mon_message(beacon); | |
235 | } | |
236 | ||
237 | /** | |
238 | * Call this when there is a new MDSMap available | |
239 | */ | |
240 | void Beacon::notify_mdsmap(MDSMap const *mdsmap) | |
241 | { | |
242 | Mutex::Locker l(lock); | |
243 | assert(mdsmap != NULL); | |
244 | ||
245 | _notify_mdsmap(mdsmap); | |
246 | } | |
247 | ||
248 | void Beacon::_notify_mdsmap(MDSMap const *mdsmap) | |
249 | { | |
250 | assert(mdsmap != NULL); | |
251 | assert(mdsmap->get_epoch() >= epoch); | |
252 | ||
253 | if (mdsmap->get_epoch() != epoch) { | |
254 | epoch = mdsmap->get_epoch(); | |
255 | compat = get_mdsmap_compat_set_default(); | |
256 | compat.merge(mdsmap->compat); | |
257 | } | |
258 | } | |
259 | ||
260 | ||
261 | bool Beacon::is_laggy() | |
262 | { | |
263 | Mutex::Locker l(lock); | |
264 | ||
265 | if (last_acked_stamp == utime_t()) | |
266 | return false; | |
267 | ||
268 | utime_t now = ceph_clock_now(); | |
269 | utime_t since = now - last_acked_stamp; | |
270 | if (since > g_conf->mds_beacon_grace) { | |
271 | dout(5) << "is_laggy " << since << " > " << g_conf->mds_beacon_grace | |
272 | << " since last acked beacon" << dendl; | |
273 | was_laggy = true; | |
274 | if (since > (g_conf->mds_beacon_grace*2) && | |
275 | now > last_mon_reconnect + g_conf->mds_beacon_interval) { | |
276 | // maybe it's not us? | |
277 | dout(5) << "initiating monitor reconnect; maybe we're not the slow one" | |
278 | << dendl; | |
279 | last_mon_reconnect = now; | |
280 | monc->reopen_session(); | |
281 | } | |
282 | return true; | |
283 | } | |
284 | return false; | |
285 | } | |
286 | ||
287 | utime_t Beacon::get_laggy_until() const | |
288 | { | |
289 | Mutex::Locker l(lock); | |
290 | ||
291 | return laggy_until; | |
292 | } | |
293 | ||
294 | void Beacon::set_want_state(MDSMap const *mdsmap, MDSMap::DaemonState const newstate) | |
295 | { | |
296 | Mutex::Locker l(lock); | |
297 | ||
298 | // Update mdsmap epoch atomically with updating want_state, so that when | |
299 | // we send a beacon with the new want state it has the latest epoch, and | |
300 | // once we have updated to the latest epoch, we are not sending out | |
301 | // a stale want_state (i.e. one from before making it through MDSMap | |
302 | // handling) | |
303 | _notify_mdsmap(mdsmap); | |
304 | ||
305 | if (want_state != newstate) { | |
306 | dout(10) << __func__ << ": " | |
307 | << ceph_mds_state_name(want_state) << " -> " | |
308 | << ceph_mds_state_name(newstate) << dendl; | |
309 | want_state = newstate; | |
310 | } | |
311 | } | |
312 | ||
313 | ||
314 | /** | |
315 | * We are 'shown' an MDS briefly in order to update | |
316 | * some health metrics that we will send in the next | |
317 | * beacon. | |
318 | */ | |
319 | void Beacon::notify_health(MDSRank const *mds) | |
320 | { | |
321 | Mutex::Locker l(lock); | |
322 | if (!mds) { | |
323 | // No MDS rank held | |
324 | return; | |
325 | } | |
326 | ||
327 | // I'm going to touch this MDS, so it must be locked | |
328 | assert(mds->mds_lock.is_locked_by_me()); | |
329 | ||
330 | health.metrics.clear(); | |
331 | ||
332 | // Detect presence of entries in DamageTable | |
333 | if (!mds->damage_table.empty()) { | |
334 | MDSHealthMetric m(MDS_HEALTH_DAMAGE, HEALTH_ERR, std::string( | |
335 | "Metadata damage detected")); | |
336 | health.metrics.push_back(m); | |
337 | } | |
338 | ||
339 | // Detect MDS_HEALTH_TRIM condition | |
340 | // Arbitrary factor of 2, indicates MDS is not trimming promptly | |
341 | { | |
342 | if (mds->mdlog->get_num_segments() > (size_t)(g_conf->mds_log_max_segments * 2)) { | |
343 | std::ostringstream oss; | |
344 | oss << "Behind on trimming (" << mds->mdlog->get_num_segments() | |
345 | << "/" << g_conf->mds_log_max_segments << ")"; | |
346 | ||
347 | MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str()); | |
348 | m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments()); | |
349 | m.metadata["max_segments"] = stringify(g_conf->mds_log_max_segments); | |
350 | health.metrics.push_back(m); | |
351 | } | |
352 | } | |
353 | ||
354 | // Detect clients failing to respond to modifications to capabilities in | |
355 | // CLIENT_CAPS messages. | |
356 | { | |
357 | std::list<client_t> late_clients; | |
358 | mds->locker->get_late_revoking_clients(&late_clients); | |
359 | std::list<MDSHealthMetric> late_cap_metrics; | |
360 | ||
361 | for (std::list<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) { | |
362 | ||
363 | // client_t is equivalent to session.info.inst.name.num | |
364 | // Construct an entity_name_t to lookup into SessionMap | |
365 | entity_name_t ename(CEPH_ENTITY_TYPE_CLIENT, i->v); | |
366 | Session const *s = mds->sessionmap.get_session(ename); | |
367 | if (s == NULL) { | |
368 | // Shouldn't happen, but not worth crashing if it does as this is | |
369 | // just health-reporting code. | |
370 | derr << "Client ID without session: " << i->v << dendl; | |
371 | continue; | |
372 | } | |
373 | ||
374 | std::ostringstream oss; | |
375 | oss << "Client " << s->get_human_name() << " failing to respond to capability release"; | |
376 | MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str()); | |
377 | m.metadata["client_id"] = stringify(i->v); | |
378 | late_cap_metrics.push_back(m); | |
379 | } | |
380 | ||
381 | if (late_cap_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) { | |
382 | health.metrics.splice(health.metrics.end(), late_cap_metrics); | |
383 | } else { | |
384 | std::ostringstream oss; | |
385 | oss << "Many clients (" << late_cap_metrics.size() | |
386 | << ") failing to respond to capability release"; | |
387 | MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str()); | |
388 | m.metadata["client_count"] = stringify(late_cap_metrics.size()); | |
389 | health.metrics.push_back(m); | |
390 | late_cap_metrics.clear(); | |
391 | } | |
392 | } | |
393 | ||
394 | // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE | |
395 | // messages. May be due to buggy client or resource-hogging application. | |
396 | // | |
397 | // Detect clients failing to advance their old_client_tid | |
398 | { | |
399 | set<Session*> sessions; | |
400 | mds->sessionmap.get_client_session_set(sessions); | |
401 | ||
402 | utime_t cutoff = ceph_clock_now(); | |
403 | cutoff -= g_conf->mds_recall_state_timeout; | |
404 | utime_t last_recall = mds->mdcache->last_recall_state; | |
405 | ||
406 | std::list<MDSHealthMetric> late_recall_metrics; | |
407 | std::list<MDSHealthMetric> large_completed_requests_metrics; | |
408 | for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) { | |
409 | Session *session = *i; | |
410 | if (!session->recalled_at.is_zero()) { | |
411 | dout(20) << "Session servicing RECALL " << session->info.inst | |
412 | << ": " << session->recalled_at << " " << session->recall_release_count | |
413 | << "/" << session->recall_count << dendl; | |
414 | if (last_recall < cutoff || session->last_recall_sent < last_recall) { | |
415 | dout(20) << " no longer recall" << dendl; | |
416 | session->clear_recalled_at(); | |
417 | } else if (session->recalled_at < cutoff) { | |
418 | dout(20) << " exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl; | |
419 | std::ostringstream oss; | |
420 | oss << "Client " << session->get_human_name() << " failing to respond to cache pressure"; | |
421 | MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str()); | |
422 | m.metadata["client_id"] = stringify(session->info.inst.name.num()); | |
423 | late_recall_metrics.push_back(m); | |
424 | } else { | |
425 | dout(20) << " within timeout " << session->recalled_at << " vs. " << cutoff << dendl; | |
426 | } | |
427 | } | |
428 | if ((session->get_num_trim_requests_warnings() > 0 && | |
429 | session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) || | |
430 | (session->get_num_trim_flushes_warnings() > 0 && | |
431 | session->get_num_completed_flushes() >= g_conf->mds_max_completed_flushes)) { | |
432 | std::ostringstream oss; | |
433 | oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid"; | |
434 | MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str()); | |
435 | m.metadata["client_id"] = stringify(session->info.inst.name.num()); | |
436 | large_completed_requests_metrics.push_back(m); | |
437 | } | |
438 | } | |
439 | ||
440 | if (late_recall_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) { | |
441 | health.metrics.splice(health.metrics.end(), late_recall_metrics); | |
442 | } else { | |
443 | std::ostringstream oss; | |
444 | oss << "Many clients (" << late_recall_metrics.size() | |
445 | << ") failing to respond to cache pressure"; | |
446 | MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str()); | |
447 | m.metadata["client_count"] = stringify(late_recall_metrics.size()); | |
448 | health.metrics.push_back(m); | |
449 | late_recall_metrics.clear(); | |
450 | } | |
451 | ||
452 | if (large_completed_requests_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) { | |
453 | health.metrics.splice(health.metrics.end(), large_completed_requests_metrics); | |
454 | } else { | |
455 | std::ostringstream oss; | |
456 | oss << "Many clients (" << large_completed_requests_metrics.size() | |
457 | << ") failing to advance their oldest client/flush tid"; | |
458 | MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str()); | |
459 | m.metadata["client_count"] = stringify(large_completed_requests_metrics.size()); | |
460 | health.metrics.push_back(m); | |
461 | large_completed_requests_metrics.clear(); | |
462 | } | |
463 | } | |
464 | ||
465 | // Detect MDS_HEALTH_SLOW_REQUEST condition | |
466 | { | |
467 | int slow = mds->get_mds_slow_req_count(); | |
468 | dout(20) << slow << " slow request found" << dendl; | |
469 | if (slow) { | |
470 | std::ostringstream oss; | |
471 | oss << slow << " slow requests are blocked > " << g_conf->mds_op_complaint_time << " sec"; | |
472 | ||
473 | MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, oss.str()); | |
474 | health.metrics.push_back(m); | |
475 | } | |
476 | } | |
477 | ||
478 | // Report a health warning if we are readonly | |
479 | if (mds->mdcache->is_readonly()) { | |
480 | MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN, | |
481 | "MDS in read-only mode"); | |
482 | health.metrics.push_back(m); | |
483 | } | |
484 | ||
485 | // Report if we have significantly exceeded our cache size limit | |
486 | if (mds->mdcache->get_cache_size() > | |
487 | g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) { | |
488 | std::ostringstream oss; | |
489 | oss << "Too many inodes in cache (" << mds->mdcache->get_cache_size() | |
490 | << "/" << g_conf->mds_cache_size << "), " | |
491 | << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, " | |
492 | << mds->mdcache->get_num_strays() << " stray files"; | |
493 | ||
494 | MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, oss.str()); | |
495 | health.metrics.push_back(m); | |
496 | } | |
497 | } | |
498 | ||
499 | MDSMap::DaemonState Beacon::get_want_state() const | |
500 | { | |
501 | Mutex::Locker l(lock); | |
502 | return want_state; | |
503 | } | |
504 |