// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 John Spray <john.spray@inktank.com>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */

#include "messages/MMgrDigest.h"
#include "messages/MMonMgrReport.h"
#include "messages/MPGStats.h"

#include "mgr/ClusterState.h"
#include <time.h>
#include <boost/range/adaptor/reversed.hpp>

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mgr
#undef dout_prefix
#define dout_prefix *_dout << "mgr " << __func__ << " "

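// ClusterState caches the cluster maps the active mgr works from (FSMap,
// MgrMap, ServiceMap, and the authoritative PGMap) and folds incoming OSD
// stat reports into a pending PGMap::Incremental, all under a single lock.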
ClusterState::ClusterState(
  MonClient *monc_,
  Objecter *objecter_,
  const MgrMap& mgrmap)
  : monc(monc_),
    objecter(objecter_),
    mgr_map(mgrmap),
    asok_hook(NULL)
{}

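// Simple setters: each replaces the cached copy of the corresponding map
// under the lock.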
void ClusterState::set_objecter(Objecter *objecter_)
{
  std::lock_guard l(lock);

  objecter = objecter_;
}

void ClusterState::set_fsmap(FSMap const &new_fsmap)
{
  std::lock_guard l(lock);

  fsmap = new_fsmap;
}

void ClusterState::set_mgr_map(MgrMap const &new_mgrmap)
{
  std::lock_guard l(lock);
  mgr_map = new_mgrmap;
}

void ClusterState::set_service_map(ServiceMap const &new_service_map)
{
  std::lock_guard l(lock);
  servicemap = new_service_map;
}

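// Called on receipt of an MMgrDigest from the mon: stash the pre-rendered
// health and mon status JSON for cheap retrieval later.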
void ClusterState::load_digest(MMgrDigest *m)
{
  std::lock_guard l(lock);
  health_json = std::move(m->health_json);
  mon_status_json = std::move(m->mon_status_json);
}

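// Fold one OSD's MPGStats report into pending_inc, dropping stats for PGs
// that the latest OSDMap says should not exist, and stats that are older
// than what we already heard from another OSD.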
void ClusterState::ingest_pgstats(ref_t<MPGStats> stats)
{
  std::lock_guard l(lock);

  const int from = stats->get_orig_source().num();
  pending_inc.update_stat(from, std::move(stats->osd_stat));

  for (const auto& p : stats->pg_stat) {
    pg_t pgid = p.first;
    const auto &pg_stats = p.second;

    // In case we're hearing about a PG that according to last
    // OSDMap update should not exist
    auto r = existing_pools.find(pgid.pool());
    if (r == existing_pools.end()) {
      dout(15) << " got " << pgid
               << " reported at " << pg_stats.reported_epoch << ":"
               << pg_stats.reported_seq
               << " state " << pg_state_string(pg_stats.state)
               << " but pool not in " << existing_pools
               << dendl;
      continue;
    }
    if (pgid.ps() >= r->second) {
      dout(15) << " got " << pgid
               << " reported at " << pg_stats.reported_epoch << ":"
               << pg_stats.reported_seq
               << " state " << pg_state_string(pg_stats.state)
               << " but > pg_num " << r->second
               << dendl;
      continue;
    }
    // In case we already heard about more recent stats from this PG
    // from another OSD
    const auto q = pg_map.pg_stat.find(pgid);
    if (q != pg_map.pg_stat.end() &&
        q->second.get_version_pair() > pg_stats.get_version_pair()) {
      dout(15) << " had " << pgid << " from "
               << q->second.reported_epoch << ":"
               << q->second.reported_seq << dendl;
      continue;
    }

    pending_inc.pg_stat_updates[pgid] = pg_stats;
  }
  for (const auto& p : stats->pool_stat) {
    pending_inc.pool_statfs_updates[std::make_pair(p.first, from)] = p.second;
  }
}

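// Apply the accumulated incremental to pg_map and start a fresh one.
// Note the dout(30) blocks below: a dout() statement without a matching
// dendl intentionally leaves the log-entry scope open, so the JSONFormatter
// output can be streamed into the same entry; "*_dout << dendl" closes it.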
void ClusterState::update_delta_stats()
{
  pending_inc.stamp = ceph_clock_now();
  pending_inc.version = pg_map.version + 1; // to make apply_incremental happy
  dout(10) << " v" << pending_inc.version << dendl;

  dout(30) << " pg_map before:\n";
  JSONFormatter jf(true);
  jf.dump_object("pg_map", pg_map);
  jf.flush(*_dout);
  *_dout << dendl;
  dout(30) << " incremental:\n";
  JSONFormatter jf(true);
  jf.dump_object("pending_inc", pending_inc);
  jf.flush(*_dout);
  *_dout << dendl;
  pg_map.apply_incremental(g_ceph_context, pending_inc);
  pending_inc = PGMap::Incremental();
}

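// Reconcile pg_map with a newly published OSDMap: create/remove PG entries,
// mark PGs on down OSDs stale, and refresh the pool -> pg_num table used by
// ingest_pgstats() to filter incoming reports.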
void ClusterState::notify_osdmap(const OSDMap &osd_map)
{
  assert(ceph_mutex_is_locked(lock));

  pending_inc.stamp = ceph_clock_now();
  pending_inc.version = pg_map.version + 1; // to make apply_incremental happy
  dout(10) << " v" << pending_inc.version << dendl;

  PGMapUpdater::check_osd_map(g_ceph_context, osd_map, pg_map, &pending_inc);

  // update our list of pools that exist, so that we can filter pg_map updates
  // in synchrony with this OSDMap.
  existing_pools.clear();
  for (auto& p : osd_map.get_pools()) {
    existing_pools[p.first] = p.second.get_pg_num();
  }

  // brute force this for now (don't bother being clever by only
  // checking osds that went up/down)
  set<int> need_check_down_pg_osds;
  PGMapUpdater::check_down_pgs(osd_map, pg_map, true,
                               need_check_down_pg_osds, &pending_inc);

  dout(30) << " pg_map before:\n";
  JSONFormatter jf(true);
  jf.dump_object("pg_map", pg_map);
  jf.flush(*_dout);
  *_dout << dendl;
  dout(30) << " incremental:\n";
  JSONFormatter jf(true);
  jf.dump_object("pending_inc", pending_inc);
  jf.flush(*_dout);
  *_dout << dendl;

  pg_map.apply_incremental(g_ceph_context, pending_inc);
  pending_inc = PGMap::Incremental();
  // TODO: Complete the separation of PG state handling so
  // that a cut-down set of functionality remains in PGMonitor
  // while the full-blown PGMap lives only here.
}

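// Admin-socket adapter: forwards the "dump_osd_network" command (registered
// in final_init() below) to ClusterState::asok_command().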
class ClusterSocketHook : public AdminSocketHook {
  ClusterState *cluster_state;
public:
  explicit ClusterSocketHook(ClusterState *o) : cluster_state(o) {}
  int call(std::string_view admin_command, const cmdmap_t& cmdmap,
           Formatter *f,
           std::ostream& errss,
           bufferlist& out) override {
    stringstream outss;
    int r = 0;
    try {
      // asok_command() returns a bool; map it onto the 0 / -errno convention
      // expected of AdminSocketHook::call() rather than letting the bool
      // convert implicitly to 1.
      r = cluster_state->asok_command(admin_command, cmdmap, f, outss) ? 0 : -EINVAL;
      out.append(outss);
    } catch (const TOPNSPC::common::bad_cmd_get& e) {
      errss << e.what();
      r = -EINVAL;
    }
    return r;
  }
};

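// Register the "dump_osd_network" admin socket command.  Once the mgr is
// running it can be invoked via the daemon socket, e.g. (illustrative
// invocation; <id> is the mgr name):
//
//   ceph daemon mgr.<id> dump_osd_network       # default threshold
//   ceph daemon mgr.<id> dump_osd_network 0     # dump all ping times
//
// The optional integer argument is a threshold in milliseconds.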
void ClusterState::final_init()
{
  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
  asok_hook = new ClusterSocketHook(this);
  int r = admin_socket->register_command(
    "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
    "Dump osd heartbeat network ping times");
  ceph_assert(r == 0);
}

void ClusterState::shutdown()
{
  // unregister commands
  g_ceph_context->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;
}

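// Handle "dump_osd_network": emit as JSON every OSD heartbeat ping time at
// or above a threshold.  The threshold comes from the command's optional
// "value" argument (milliseconds); if absent it defaults to the slow-ping
// health warning level, and 0 means dump everything.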
bool ClusterState::asok_command(
  std::string_view admin_command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  ostream& ss)
{
  std::lock_guard l(lock);
  if (admin_command == "dump_osd_network") {
    int64_t value = 0;
    // Default to health warning level if nothing specified
    if (!(TOPNSPC::common::cmd_getval(cmdmap, "value", value))) {
      // Convert milliseconds to microseconds
      value = static_cast<int64_t>(g_ceph_context->_conf.get_val<double>("mon_warn_on_slow_ping_time")) * 1000;
      if (value == 0) {
        double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
        value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
        value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
      }
    } else {
      // Convert user input to microseconds
      value *= 1000;
    }
    if (value < 0)
      value = 0;

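    // Sort key for the report: ascending ping time, so that walking the
    // set in reverse (below) yields the worst ping times first, with
    // (from, to, back) as tie-breakers.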
    struct mgr_ping_time_t {
      uint32_t pingtime;
      int from;
      int to;
      bool back;
      std::array<uint32_t,3> times;
      std::array<uint32_t,3> min;
      std::array<uint32_t,3> max;
      uint32_t last;
      uint32_t last_update;

      bool operator<(const mgr_ping_time_t& rhs) const {
        if (pingtime < rhs.pingtime)
          return true;
        if (pingtime > rhs.pingtime)
          return false;
        if (from < rhs.from)
          return true;
        if (from > rhs.from)
          return false;
        if (to < rhs.to)
          return true;
        if (to > rhs.to)
          return false;
        return back;
      }
    };

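    // Collect one entry per (from, to, interface) pair from the heartbeat
    // stats the OSDs reported, skipping peers whose data has gone stale.
    // times[0..2] hold the 1min/5min/15min decaying averages.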
    set<mgr_ping_time_t> sorted;
    utime_t now = ceph_clock_now();
    for (const auto& i : pg_map.osd_stat) {
      for (const auto& j : i.second.hb_pingtime) {

        if (j.second.last_update == 0)
          continue;
        auto stale_time = g_ceph_context->_conf.get_val<int64_t>("osd_mon_heartbeat_stat_stale");
        if (now.sec() - j.second.last_update > stale_time) {
          dout(20) << __func__ << " time out heartbeat for osd " << i.first
                   << " last_update " << j.second.last_update << dendl;
          continue;
        }
        mgr_ping_time_t item;
        item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
        item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
        if (!value || item.pingtime >= value) {
          item.from = i.first;
          item.to = j.first;
          item.times[0] = j.second.back_pingtime[0];
          item.times[1] = j.second.back_pingtime[1];
          item.times[2] = j.second.back_pingtime[2];
          item.min[0] = j.second.back_min[0];
          item.min[1] = j.second.back_min[1];
          item.min[2] = j.second.back_min[2];
          item.max[0] = j.second.back_max[0];
          item.max[1] = j.second.back_max[1];
          item.max[2] = j.second.back_max[2];
          item.last = j.second.back_last;
          item.back = true;
          item.last_update = j.second.last_update;
          sorted.emplace(item);
        }

        if (j.second.front_last == 0)
          continue;
        item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
        item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
        if (!value || item.pingtime >= value) {
          item.from = i.first;
          item.to = j.first;
          item.times[0] = j.second.front_pingtime[0];
          item.times[1] = j.second.front_pingtime[1];
          item.times[2] = j.second.front_pingtime[2];
          item.min[0] = j.second.front_min[0];
          item.min[1] = j.second.front_min[1];
          item.min[2] = j.second.front_min[2];
          item.max[0] = j.second.front_max[0];
          item.max[1] = j.second.front_max[1];
          item.max[2] = j.second.front_max[2];
          item.last = j.second.front_last;
          item.back = false;
          item.last_update = j.second.last_update;
          sorted.emplace(item);
        }
      }
    }

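    // Emit the collected entries, worst ping time first.  The threshold and
    // all times are stored in microseconds and dumped as milliseconds with
    // three decimal places (fixed_u_to_string(..., 3)).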
    // Network ping times (1min 5min 15min)
    f->open_object_section("network_ping_times");
    f->dump_int("threshold", value / 1000);
    f->open_array_section("entries");
    for (auto &sitem : boost::adaptors::reverse(sorted)) {
      ceph_assert(!value || sitem.pingtime >= value);

      f->open_object_section("entry");

      const time_t lu(sitem.last_update);
      char buffer[26];
      string lustr(ctime_r(&lu, buffer));
      lustr.pop_back();   // Remove trailing \n
      auto stale = g_ceph_context->_conf.get_val<int64_t>("osd_heartbeat_stale");
      f->dump_string("last update", lustr);
      f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
      f->dump_int("from osd", sitem.from);
      f->dump_int("to osd", sitem.to);
      f->dump_string("interface", (sitem.back ? "back" : "front"));
      f->open_object_section("average");
      f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
      f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
      f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
      f->close_section(); // average
      f->open_object_section("min");
      f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
      f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
      f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
      f->close_section(); // min
      f->open_object_section("max");
      f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
      f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
      f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
      f->close_section(); // max
      f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
      f->close_section(); // entry
    }
    f->close_section(); // entries
    f->close_section(); // network_ping_times
  } else {
    ceph_abort_msg("broken asok registration");
  }
  return true;
}