]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 John Spray <john.spray@inktank.com> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | */ | |
13 | ||
14 | #include "messages/MMgrDigest.h" | |
31f18b77 | 15 | #include "messages/MMonMgrReport.h" |
7c673cae FG |
16 | #include "messages/MPGStats.h" |
17 | ||
18 | #include "mgr/ClusterState.h" | |
eafe8130 TL |
19 | #include <time.h> |
20 | #include <boost/range/adaptor/reversed.hpp> | |
7c673cae FG |
21 | |
22 | #define dout_context g_ceph_context | |
23 | #define dout_subsys ceph_subsys_mgr | |
24 | #undef dout_prefix | |
25 | #define dout_prefix *_dout << "mgr " << __func__ << " " | |
26 | ||
224ce89b WB |
27 | ClusterState::ClusterState( |
28 | MonClient *monc_, | |
29 | Objecter *objecter_, | |
30 | const MgrMap& mgrmap) | |
31 | : monc(monc_), | |
32 | objecter(objecter_), | |
eafe8130 TL |
33 | mgr_map(mgrmap), |
34 | asok_hook(NULL) | |
7c673cae FG |
35 | {} |
36 | ||
37 | void ClusterState::set_objecter(Objecter *objecter_) | |
38 | { | |
11fdf7f2 | 39 | std::lock_guard l(lock); |
7c673cae FG |
40 | |
41 | objecter = objecter_; | |
42 | } | |
43 | ||
44 | void ClusterState::set_fsmap(FSMap const &new_fsmap) | |
45 | { | |
11fdf7f2 | 46 | std::lock_guard l(lock); |
7c673cae FG |
47 | |
48 | fsmap = new_fsmap; | |
49 | } | |
50 | ||
224ce89b WB |
51 | void ClusterState::set_mgr_map(MgrMap const &new_mgrmap) |
52 | { | |
11fdf7f2 | 53 | std::lock_guard l(lock); |
224ce89b WB |
54 | mgr_map = new_mgrmap; |
55 | } | |
56 | ||
57 | void ClusterState::set_service_map(ServiceMap const &new_service_map) | |
58 | { | |
11fdf7f2 | 59 | std::lock_guard l(lock); |
224ce89b WB |
60 | servicemap = new_service_map; |
61 | } | |
62 | ||
7c673cae FG |
63 | void ClusterState::load_digest(MMgrDigest *m) |
64 | { | |
e306af50 | 65 | std::lock_guard l(lock); |
7c673cae FG |
66 | health_json = std::move(m->health_json); |
67 | mon_status_json = std::move(m->mon_status_json); | |
68 | } | |
69 | ||
9f95a23c | 70 | void ClusterState::ingest_pgstats(ref_t<MPGStats> stats) |
7c673cae | 71 | { |
11fdf7f2 | 72 | std::lock_guard l(lock); |
7c673cae FG |
73 | |
74 | const int from = stats->get_orig_source().num(); | |
11fdf7f2 | 75 | pending_inc.update_stat(from, std::move(stats->osd_stat)); |
7c673cae FG |
76 | |
77 | for (auto p : stats->pg_stat) { | |
78 | pg_t pgid = p.first; | |
79 | const auto &pg_stats = p.second; | |
80 | ||
81 | // In case we're hearing about a PG that according to last | |
82 | // OSDMap update should not exist | |
11fdf7f2 TL |
83 | auto r = existing_pools.find(pgid.pool()); |
84 | if (r == existing_pools.end()) { | |
31f18b77 FG |
85 | dout(15) << " got " << pgid |
86 | << " reported at " << pg_stats.reported_epoch << ":" | |
7c673cae FG |
87 | << pg_stats.reported_seq |
88 | << " state " << pg_state_string(pg_stats.state) | |
31f18b77 | 89 | << " but pool not in " << existing_pools |
7c673cae FG |
90 | << dendl; |
91 | continue; | |
31f18b77 | 92 | } |
11fdf7f2 TL |
93 | if (pgid.ps() >= r->second) { |
94 | dout(15) << " got " << pgid | |
95 | << " reported at " << pg_stats.reported_epoch << ":" | |
96 | << pg_stats.reported_seq | |
97 | << " state " << pg_state_string(pg_stats.state) | |
98 | << " but > pg_num " << r->second | |
99 | << dendl; | |
100 | continue; | |
101 | } | |
31f18b77 FG |
102 | // In case we already heard about more recent stats from this PG |
103 | // from another OSD | |
224ce89b WB |
104 | const auto q = pg_map.pg_stat.find(pgid); |
105 | if (q != pg_map.pg_stat.end() && | |
106 | q->second.get_version_pair() > pg_stats.get_version_pair()) { | |
31f18b77 | 107 | dout(15) << " had " << pgid << " from " |
224ce89b | 108 | << q->second.reported_epoch << ":" |
11fdf7f2 | 109 | << q->second.reported_seq << dendl; |
7c673cae FG |
110 | continue; |
111 | } | |
112 | ||
113 | pending_inc.pg_stat_updates[pgid] = pg_stats; | |
114 | } | |
11fdf7f2 TL |
115 | for (auto p : stats->pool_stat) { |
116 | pending_inc.pool_statfs_updates[std::make_pair(p.first, from)] = p.second; | |
117 | } | |
31f18b77 FG |
118 | } |
119 | ||
// Fold the accumulated `pending_inc` delta into `pg_map` and start a
// fresh Incremental.  Caller is expected to hold `lock` (not asserted
// here -- TODO confirm against callers).
void ClusterState::update_delta_stats()
{
  pending_inc.stamp = ceph_clock_now();
  pending_inc.version = pg_map.version + 1; // to make apply_incremental happy
  dout(10) << " v" << pending_inc.version << dendl;

  // NOTE(review): the statements between each dout(30) and its matching
  // dendl appear to live inside the scope opened by the dout macro
  // (otherwise the two `jf` locals below would collide) -- confirm
  // against common/dout.h before restructuring this debug output.
  dout(30) << " pg_map before:\n";
  JSONFormatter jf(true);
  jf.dump_object("pg_map", pg_map);
  jf.flush(*_dout);
  *_dout << dendl;
  dout(30) << " incremental:\n";
  JSONFormatter jf(true);
  jf.dump_object("pending_inc", pending_inc);
  jf.flush(*_dout);
  *_dout << dendl;
  pg_map.apply_incremental(g_ceph_context, pending_inc);
  // Reset the delta now that it has been applied.
  pending_inc = PGMap::Incremental();
}
139 | ||
140 | void ClusterState::notify_osdmap(const OSDMap &osd_map) | |
141 | { | |
11fdf7f2 | 142 | assert(ceph_mutex_is_locked(lock)); |
7c673cae | 143 | |
31f18b77 | 144 | pending_inc.stamp = ceph_clock_now(); |
7c673cae | 145 | pending_inc.version = pg_map.version + 1; // to make apply_incremental happy |
31f18b77 | 146 | dout(10) << " v" << pending_inc.version << dendl; |
7c673cae | 147 | |
31f18b77 FG |
148 | PGMapUpdater::check_osd_map(g_ceph_context, osd_map, pg_map, &pending_inc); |
149 | ||
150 | // update our list of pools that exist, so that we can filter pg_map updates | |
151 | // in synchrony with this OSDMap. | |
152 | existing_pools.clear(); | |
153 | for (auto& p : osd_map.get_pools()) { | |
11fdf7f2 | 154 | existing_pools[p.first] = p.second.get_pg_num(); |
31f18b77 | 155 | } |
7c673cae FG |
156 | |
157 | // brute force this for now (don't bother being clever by only | |
158 | // checking osds that went up/down) | |
159 | set<int> need_check_down_pg_osds; | |
160 | PGMapUpdater::check_down_pgs(osd_map, pg_map, true, | |
161 | need_check_down_pg_osds, &pending_inc); | |
162 | ||
31f18b77 FG |
163 | dout(30) << " pg_map before:\n"; |
164 | JSONFormatter jf(true); | |
165 | jf.dump_object("pg_map", pg_map); | |
166 | jf.flush(*_dout); | |
167 | *_dout << dendl; | |
168 | dout(30) << " incremental:\n"; | |
169 | JSONFormatter jf(true); | |
170 | jf.dump_object("pending_inc", pending_inc); | |
171 | jf.flush(*_dout); | |
172 | *_dout << dendl; | |
7c673cae | 173 | |
31f18b77 FG |
174 | pg_map.apply_incremental(g_ceph_context, pending_inc); |
175 | pending_inc = PGMap::Incremental(); | |
7c673cae FG |
176 | // TODO: Complete the separation of PG state handling so |
177 | // that a cut-down set of functionality remains in PGMonitor | |
178 | // while the full-blown PGMap lives only here. | |
179 | } | |
eafe8130 TL |
180 | |
181 | class ClusterSocketHook : public AdminSocketHook { | |
182 | ClusterState *cluster_state; | |
183 | public: | |
184 | explicit ClusterSocketHook(ClusterState *o) : cluster_state(o) {} | |
9f95a23c TL |
185 | int call(std::string_view admin_command, const cmdmap_t& cmdmap, |
186 | Formatter *f, | |
187 | std::ostream& errss, | |
188 | bufferlist& out) override { | |
189 | stringstream outss; | |
190 | int r = 0; | |
eafe8130 | 191 | try { |
9f95a23c TL |
192 | r = cluster_state->asok_command(admin_command, cmdmap, f, outss); |
193 | out.append(outss); | |
194 | } catch (const TOPNSPC::common::bad_cmd_get& e) { | |
195 | errss << e.what(); | |
196 | r = -EINVAL; | |
eafe8130 | 197 | } |
eafe8130 TL |
198 | return r; |
199 | } | |
200 | }; | |
201 | ||
202 | void ClusterState::final_init() | |
203 | { | |
204 | AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); | |
205 | asok_hook = new ClusterSocketHook(this); | |
9f95a23c TL |
206 | int r = admin_socket->register_command( |
207 | "dump_osd_network name=value,type=CephInt,req=false", asok_hook, | |
208 | "Dump osd heartbeat network ping times"); | |
eafe8130 TL |
209 | ceph_assert(r == 0); |
210 | } | |
211 | ||
212 | void ClusterState::shutdown() | |
213 | { | |
214 | // unregister commands | |
215 | g_ceph_context->get_admin_socket()->unregister_commands(asok_hook); | |
216 | delete asok_hook; | |
217 | asok_hook = NULL; | |
218 | } | |
219 | ||
9f95a23c TL |
220 | bool ClusterState::asok_command( |
221 | std::string_view admin_command, | |
222 | const cmdmap_t& cmdmap, | |
223 | Formatter *f, | |
224 | ostream& ss) | |
eafe8130 TL |
225 | { |
226 | std::lock_guard l(lock); | |
eafe8130 TL |
227 | if (admin_command == "dump_osd_network") { |
228 | int64_t value = 0; | |
229 | // Default to health warning level if nothing specified | |
9f95a23c | 230 | if (!(TOPNSPC::common::cmd_getval(cmdmap, "value", value))) { |
eafe8130 TL |
231 | // Convert milliseconds to microseconds |
232 | value = static_cast<int64_t>(g_ceph_context->_conf.get_val<double>("mon_warn_on_slow_ping_time")) * 1000; | |
233 | if (value == 0) { | |
234 | double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio"); | |
235 | value = g_conf().get_val<int64_t>("osd_heartbeat_grace"); | |
236 | value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio | |
237 | } | |
238 | } else { | |
239 | // Convert user input to microseconds | |
240 | value *= 1000; | |
241 | } | |
242 | if (value < 0) | |
243 | value = 0; | |
244 | ||
245 | struct mgr_ping_time_t { | |
246 | uint32_t pingtime; | |
247 | int from; | |
248 | int to; | |
249 | bool back; | |
250 | std::array<uint32_t,3> times; | |
251 | std::array<uint32_t,3> min; | |
252 | std::array<uint32_t,3> max; | |
253 | uint32_t last; | |
254 | uint32_t last_update; | |
255 | ||
256 | bool operator<(const mgr_ping_time_t& rhs) const { | |
257 | if (pingtime < rhs.pingtime) | |
258 | return true; | |
259 | if (pingtime > rhs.pingtime) | |
260 | return false; | |
261 | if (from < rhs.from) | |
262 | return true; | |
263 | if (from > rhs.from) | |
264 | return false; | |
265 | if (to < rhs.to) | |
266 | return true; | |
267 | if (to > rhs.to) | |
268 | return false; | |
269 | return back; | |
270 | } | |
271 | }; | |
272 | ||
273 | set<mgr_ping_time_t> sorted; | |
274 | utime_t now = ceph_clock_now(); | |
275 | for (auto i : pg_map.osd_stat) { | |
276 | for (auto j : i.second.hb_pingtime) { | |
277 | ||
278 | if (j.second.last_update == 0) | |
279 | continue; | |
280 | auto stale_time = g_ceph_context->_conf.get_val<int64_t>("osd_mon_heartbeat_stat_stale"); | |
281 | if (now.sec() - j.second.last_update > stale_time) { | |
282 | dout(20) << __func__ << " time out heartbeat for osd " << i.first | |
283 | << " last_update " << j.second.last_update << dendl; | |
284 | continue; | |
285 | } | |
286 | mgr_ping_time_t item; | |
287 | item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); | |
288 | item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]); | |
289 | if (!value || item.pingtime >= value) { | |
290 | item.from = i.first; | |
291 | item.to = j.first; | |
292 | item.times[0] = j.second.back_pingtime[0]; | |
293 | item.times[1] = j.second.back_pingtime[1]; | |
294 | item.times[2] = j.second.back_pingtime[2]; | |
295 | item.min[0] = j.second.back_min[0]; | |
296 | item.min[1] = j.second.back_min[1]; | |
297 | item.min[2] = j.second.back_min[2]; | |
298 | item.max[0] = j.second.back_max[0]; | |
299 | item.max[1] = j.second.back_max[1]; | |
300 | item.max[2] = j.second.back_max[2]; | |
301 | item.last = j.second.back_last; | |
302 | item.back = true; | |
303 | item.last_update = j.second.last_update; | |
304 | sorted.emplace(item); | |
305 | } | |
306 | ||
307 | if (j.second.front_last == 0) | |
308 | continue; | |
309 | item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]); | |
310 | item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]); | |
311 | if (!value || item.pingtime >= value) { | |
312 | item.from = i.first; | |
313 | item.to = j.first; | |
314 | item.times[0] = j.second.front_pingtime[0]; | |
315 | item.times[1] = j.second.front_pingtime[1]; | |
316 | item.times[2] = j.second.front_pingtime[2]; | |
317 | item.min[0] = j.second.front_min[0]; | |
318 | item.min[1] = j.second.front_min[1]; | |
319 | item.min[2] = j.second.front_min[2]; | |
320 | item.max[0] = j.second.front_max[0]; | |
321 | item.max[1] = j.second.front_max[1]; | |
322 | item.max[2] = j.second.front_max[2]; | |
323 | item.last = j.second.front_last; | |
324 | item.back = false; | |
325 | item.last_update = j.second.last_update; | |
326 | sorted.emplace(item); | |
327 | } | |
328 | } | |
329 | } | |
330 | ||
331 | // Network ping times (1min 5min 15min) | |
332 | f->open_object_section("network_ping_times"); | |
333 | f->dump_int("threshold", value / 1000); | |
334 | f->open_array_section("entries"); | |
335 | for (auto &sitem : boost::adaptors::reverse(sorted)) { | |
336 | ceph_assert(!value || sitem.pingtime >= value); | |
337 | ||
338 | f->open_object_section("entry"); | |
339 | ||
340 | const time_t lu(sitem.last_update); | |
341 | char buffer[26]; | |
342 | string lustr(ctime_r(&lu, buffer)); | |
343 | lustr.pop_back(); // Remove trailing \n | |
344 | auto stale = g_ceph_context->_conf.get_val<int64_t>("osd_heartbeat_stale"); | |
345 | f->dump_string("last update", lustr); | |
346 | f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); | |
347 | f->dump_int("from osd", sitem.from); | |
348 | f->dump_int("to osd", sitem.to); | |
349 | f->dump_string("interface", (sitem.back ? "back" : "front")); | |
350 | f->open_object_section("average"); | |
351 | f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str()); | |
352 | f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str()); | |
353 | f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str()); | |
354 | f->close_section(); // average | |
355 | f->open_object_section("min"); | |
356 | f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str()); | |
357 | f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str()); | |
358 | f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str()); | |
359 | f->close_section(); // min | |
360 | f->open_object_section("max"); | |
361 | f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); | |
362 | f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); | |
363 | f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); | |
364 | f->close_section(); // max | |
365 | f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str()); | |
366 | f->close_section(); // entry | |
367 | } | |
368 | f->close_section(); // entries | |
369 | f->close_section(); // network_ping_times | |
370 | } else { | |
371 | ceph_abort_msg("broken asok registration"); | |
372 | } | |
eafe8130 TL |
373 | return true; |
374 | } |