]> git.proxmox.com Git - ceph.git/blame - ceph/src/mgr/DaemonServer.cc
import ceph quincy 17.2.1
[ceph.git] / ceph / src / mgr / DaemonServer.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 John Spray <john.spray@redhat.com>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 */
13
14#include "DaemonServer.h"
f67539c2 15#include <boost/algorithm/string.hpp>
b32b8144 16#include "mgr/Mgr.h"
7c673cae 17
b32b8144 18#include "include/stringify.h"
31f18b77 19#include "include/str_list.h"
7c673cae 20#include "auth/RotatingKeyRing.h"
7c673cae
FG
21#include "json_spirit/json_spirit_writer.h"
22
c07f9fc5 23#include "mgr/mgr_commands.h"
11fdf7f2
TL
24#include "mgr/DaemonHealthMetricCollector.h"
25#include "mgr/OSDPerfMetricCollector.h"
f67539c2 26#include "mgr/MDSPerfMetricCollector.h"
c07f9fc5
FG
27#include "mon/MonCommand.h"
28
7c673cae 29#include "messages/MMgrOpen.h"
11fdf7f2 30#include "messages/MMgrClose.h"
7c673cae 31#include "messages/MMgrConfigure.h"
31f18b77 32#include "messages/MMonMgrReport.h"
7c673cae
FG
33#include "messages/MCommand.h"
34#include "messages/MCommandReply.h"
9f95a23c
TL
35#include "messages/MMgrCommand.h"
36#include "messages/MMgrCommandReply.h"
7c673cae
FG
37#include "messages/MPGStats.h"
38#include "messages/MOSDScrub.h"
11fdf7f2 39#include "messages/MOSDScrub2.h"
c07f9fc5 40#include "messages/MOSDForceRecovery.h"
224ce89b 41#include "common/errno.h"
11fdf7f2 42#include "common/pick_address.h"
7c673cae
FG
43
44#define dout_context g_ceph_context
45#define dout_subsys ceph_subsys_mgr
46#undef dout_prefix
47#define dout_prefix *_dout << "mgr.server " << __func__ << " "
20effc67 48
9f95a23c 49using namespace TOPNSPC::common;
20effc67
TL
50
51using std::list;
52using std::ostringstream;
53using std::string;
54using std::stringstream;
55using std::vector;
56using std::unique_ptr;
57
9f95a23c
TL
namespace {
  /// Compare two map-like containers for equality of both keys and values.
  ///
  /// The size check must short-circuit before std::equal: the 4-argument
  /// std::equal overload assumes the second range is at least as long as
  /// the first, so calling it on a shorter rhs would read past its end.
  template <typename Map>
  bool map_compare(Map const &lhs, Map const &rhs) {
    return lhs.size() == rhs.size()
      && std::equal(lhs.begin(), lhs.end(), rhs.begin(),
		    // take elements by const reference: the original took
		    // them by value, copying every key/value pair compared
		    [] (const auto& a, const auto& b) {
		      return a.first == b.first && a.second == b.second;
		    });
  }
}
c07f9fc5 66
7c673cae
FG
67DaemonServer::DaemonServer(MonClient *monc_,
68 Finisher &finisher_,
69 DaemonStateIndex &daemon_state_,
70 ClusterState &cluster_state_,
3efd9988 71 PyModuleRegistry &py_modules_,
7c673cae
FG
72 LogChannelRef clog_,
73 LogChannelRef audit_clog_)
74 : Dispatcher(g_ceph_context),
75 client_byte_throttler(new Throttle(g_ceph_context, "mgr_client_bytes",
11fdf7f2 76 g_conf().get_val<Option::size_t>("mgr_client_bytes"))),
7c673cae 77 client_msg_throttler(new Throttle(g_ceph_context, "mgr_client_messages",
11fdf7f2 78 g_conf().get_val<uint64_t>("mgr_client_messages"))),
7c673cae 79 osd_byte_throttler(new Throttle(g_ceph_context, "mgr_osd_bytes",
11fdf7f2 80 g_conf().get_val<Option::size_t>("mgr_osd_bytes"))),
7c673cae 81 osd_msg_throttler(new Throttle(g_ceph_context, "mgr_osd_messsages",
11fdf7f2 82 g_conf().get_val<uint64_t>("mgr_osd_messages"))),
7c673cae 83 mds_byte_throttler(new Throttle(g_ceph_context, "mgr_mds_bytes",
11fdf7f2 84 g_conf().get_val<Option::size_t>("mgr_mds_bytes"))),
7c673cae 85 mds_msg_throttler(new Throttle(g_ceph_context, "mgr_mds_messsages",
11fdf7f2 86 g_conf().get_val<uint64_t>("mgr_mds_messages"))),
7c673cae 87 mon_byte_throttler(new Throttle(g_ceph_context, "mgr_mon_bytes",
11fdf7f2 88 g_conf().get_val<Option::size_t>("mgr_mon_bytes"))),
7c673cae 89 mon_msg_throttler(new Throttle(g_ceph_context, "mgr_mon_messsages",
11fdf7f2 90 g_conf().get_val<uint64_t>("mgr_mon_messages"))),
7c673cae
FG
91 msgr(nullptr),
92 monc(monc_),
93 finisher(finisher_),
94 daemon_state(daemon_state_),
95 cluster_state(cluster_state_),
96 py_modules(py_modules_),
97 clog(clog_),
98 audit_clog(audit_clog_),
11fdf7f2
TL
99 pgmap_ready(false),
100 timer(g_ceph_context, lock),
101 shutting_down(false),
102 tick_event(nullptr),
103 osd_perf_metric_collector_listener(this),
f67539c2
TL
104 osd_perf_metric_collector(osd_perf_metric_collector_listener),
105 mds_perf_metric_collector_listener(this),
106 mds_perf_metric_collector(mds_perf_metric_collector_listener)
3efd9988 107{
11fdf7f2 108 g_conf().add_observer(this);
3efd9988 109}
7c673cae
FG
110
// Destructor: free the messenger created in init() and stop observing
// config changes registered in the constructor.
DaemonServer::~DaemonServer() {
  delete msgr;
  g_conf().remove_observer(this);
}
115
// Bring up the mgr's server messenger: create it, attach auth and
// per-peer-type throttles, bind to the public address, start dispatching,
// and schedule the first periodic tick.
// @param gid         the mgr's global id (used for its entity name)
// @param client_addrs addresses learned from the monclient, used to fill
//                     in any addresses left unknown after binding
// @return 0 on success, negative errno on address-pick/bind failure
int DaemonServer::init(uint64_t gid, entity_addrvec_t client_addrs)
{
  // Initialize Messenger; honor ms_public_type if set, else ms_type
  std::string public_msgr_type = g_conf()->ms_public_type.empty() ?
    g_conf().get_val<std::string>("ms_type") : g_conf()->ms_public_type;
  msgr = Messenger::create(g_ceph_context, public_msgr_type,
			   entity_name_t::MGR(gid),
			   "mgr",
			   Messenger::get_pid_nonce());
  // peers (daemons/clients) connect to us; we keep no per-peer state
  msgr->set_default_policy(Messenger::Policy::stateless_server(0));

  msgr->set_auth_client(monc);

  // throttle clients
  msgr->set_policy_throttlers(entity_name_t::TYPE_CLIENT,
			      client_byte_throttler.get(),
			      client_msg_throttler.get());

  // servers
  msgr->set_policy_throttlers(entity_name_t::TYPE_OSD,
			      osd_byte_throttler.get(),
			      osd_msg_throttler.get());
  msgr->set_policy_throttlers(entity_name_t::TYPE_MDS,
			      mds_byte_throttler.get(),
			      mds_msg_throttler.get());
  msgr->set_policy_throttlers(entity_name_t::TYPE_MON,
			      mon_byte_throttler.get(),
			      mon_msg_throttler.get());

  // pick and bind the public address before starting the messenger
  entity_addrvec_t addrs;
  int r = pick_addresses(cct, CEPH_PICK_ADDRESS_PUBLIC, &addrs);
  if (r < 0) {
    return r;
  }
  dout(20) << __func__ << " will bind to " << addrs << dendl;
  r = msgr->bindv(addrs);
  if (r < 0) {
    derr << "unable to bind mgr to " << addrs << dendl;
    return r;
  }

  msgr->set_myname(entity_name_t::MGR(gid));
  msgr->set_addr_unknowns(client_addrs);

  msgr->start();
  msgr->add_dispatcher_tail(this);

  // we authenticate incoming daemons/clients via the monclient
  msgr->set_auth_server(monc);
  monc->set_handle_authentication_dispatcher(this);

  started_at = ceph_clock_now();

  std::lock_guard l(lock);
  timer.init();

  // kick off the periodic report/adjust cycle
  schedule_tick_locked(
    g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count());

  return 0;
}
176
// Addresses the server messenger is bound to (valid after init()).
entity_addrvec_t DaemonServer::get_myaddrs() const
{
  return msgr->get_myaddrs();
}
181
11fdf7f2
TL
// Called by the messenger once a connection has authenticated.
// Builds an MgrSession from the peer's identity/caps and attaches it to
// the connection as its priv data; OSD peers are additionally indexed in
// osd_cons so their connections can later be found by osd id.
// @return 1 to accept the connection, -EACCES if the peer's caps cannot
//         be decoded or parsed
int DaemonServer::ms_handle_authentication(Connection *con)
{
  auto s = ceph::make_ref<MgrSession>(cct);
  con->set_priv(s);
  s->inst.addr = con->get_peer_addr();
  s->entity_name = con->peer_name;
  dout(10) << __func__ << " new session " << s << " con " << con
	   << " entity " << con->peer_name
	   << " addr " << con->get_peer_addrs()
	   << dendl;

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all) {
    dout(10) << " session " << s << " " << s->entity_name
	     << " allow_all" << dendl;
    s->caps.set_allow_all();
  } else if (caps_info.caps.length() > 0) {
    // caps arrive as an encoded string; decode then parse into MgrCap
    auto p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (buffer::error& e) {
      dout(10) << " session " << s << " " << s->entity_name
	       << " failed to decode caps" << dendl;
      return -EACCES;
    }
    if (!s->caps.parse(str)) {
      dout(10) << " session " << s << " " << s->entity_name
	       << " failed to parse caps '" << str << "'" << dendl;
      return -EACCES;
    }
    dout(10) << " session " << s << " " << s->entity_name
	     << " has caps " << s->caps << " '" << str << "'" << dendl;
  }
  // NOTE: a peer with neither allow_all nor caps data ends up with empty
  // caps here and is still accepted; per-command checks happen later.

  if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
    // index this connection by osd id for later lookups (e.g. scrub cmds)
    std::lock_guard l(lock);
    s->osd_id = atoi(s->entity_name.get_id().c_str());
    dout(10) << "registering osd." << s->osd_id << " session "
	     << s << " con " << con << dendl;
    osd_cons[s->osd_id].insert(con);
  }

  return 1;
}
228
31f18b77
FG
// Messenger callback for a reset (dropped) connection.  For OSD peers,
// remove the connection from the osd_cons index and from the set of
// daemon connections that receive stats reconfiguration messages.
// Always returns false (we never replace the connection).
bool DaemonServer::ms_handle_reset(Connection *con)
{
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
    auto priv = con->get_priv();
    auto session = static_cast<MgrSession*>(priv.get());
    if (!session) {
      // connection never completed authentication; nothing to clean up
      return false;
    }
    std::lock_guard l(lock);
    dout(10) << "unregistering osd." << session->osd_id
	     << " session " << session << " con " << con << dendl;
    osd_cons[session->osd_id].erase(con);

    auto iter = daemon_connections.find(con);
    if (iter != daemon_connections.end()) {
      daemon_connections.erase(iter);
    }
  }
  return false;
}
249
7c673cae
FG
// Messenger callback for a refused connection attempt.
bool DaemonServer::ms_handle_refused(Connection *con)
{
  // do nothing for now
  return false;
}
255
// Top-level message dispatch for the mgr server.
// Returns true if the message type was handled, false otherwise.
bool DaemonServer::ms_dispatch2(const ref_t<Message>& m)
{
  // Note that we do *not* take ::lock here, in order to avoid
  // serializing all message handling.  It's up to each handler
  // to take whatever locks it needs.
  switch (m->get_type()) {
    case MSG_PGSTATS:
      cluster_state.ingest_pgstats(ref_cast<MPGStats>(m));
      // first stats from an osd may complete the initial PGMap
      maybe_ready(m->get_source().num());
      return true;
    case MSG_MGR_REPORT:
      return handle_report(ref_cast<MMgrReport>(m));
    case MSG_MGR_OPEN:
      return handle_open(ref_cast<MMgrOpen>(m));
    case MSG_MGR_CLOSE:
      return handle_close(ref_cast<MMgrClose>(m));
    case MSG_COMMAND:
      return handle_command(ref_cast<MCommand>(m));
    case MSG_MGR_COMMAND:
      return handle_command(ref_cast<MMgrCommand>(m));
    default:
      dout(1) << "Unhandled message type " << m->get_type() << dendl;
      return false;
  };
}
281
9f95a23c
TL
// Dump whether the initial PGMap has been assembled (see maybe_ready()).
void DaemonServer::dump_pg_ready(ceph::Formatter *f)
{
  f->dump_bool("pg_ready", pgmap_ready.load());
}
286
224ce89b
WB
// Track the first stats report from each up OSD.  Once every up OSD has
// reported at least once, the PGMap is considered ready: mark it so and
// immediately push a report to the mon rather than waiting for the tick.
void DaemonServer::maybe_ready(int32_t osd_id)
{
  if (pgmap_ready.load()) {
    // Fast path: we don't need to take lock because pgmap_ready
    // is already set
  } else {
    std::lock_guard l(lock);

    if (reported_osds.find(osd_id) == reported_osds.end()) {
      dout(4) << "initial report from osd " << osd_id << dendl;
      reported_osds.insert(osd_id);
      std::set<int32_t> up_osds;

      cluster_state.with_osdmap([&](const OSDMap& osdmap) {
	  osdmap.get_up_osds(up_osds);
	});

      // unreported = up_osds \ reported_osds
      std::set<int32_t> unreported_osds;
      std::set_difference(up_osds.begin(), up_osds.end(),
			  reported_osds.begin(), reported_osds.end(),
			  std::inserter(unreported_osds, unreported_osds.begin()));

      if (unreported_osds.size() == 0) {
	dout(4) << "all osds have reported, sending PG state to mon" << dendl;
	pgmap_ready = true;
	reported_osds.clear();
	// Avoid waiting for next tick
	send_report();
      } else {
	dout(4) << "still waiting for " << unreported_osds.size() << " osds"
		   " to report in before PGMap is ready" << dendl;
      }
    }
  }
}
322
11fdf7f2
TL
// Periodic work: send the report to the mon, run pg autoscaling
// adjustments, and reschedule ourselves at mgr_tick_period.
void DaemonServer::tick()
{
  dout(10) << dendl;
  send_report();
  adjust_pgs();

  schedule_tick_locked(
    g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count());
}
332
// Currently modules do not set health checks in response to events delivered to
// all modules (e.g. notify) so we do not risk a thundering herd situation here.
// if this pattern emerges in the future, this scheduler could be modified to
// fire after all modules have had a chance to set their health checks.
//
// Re-arm the tick timer to fire in delay_sec seconds, replacing any
// pending tick.  Caller must hold `lock`.  No-op once shutting_down is
// set, so python-initiated report requests can't resurrect the timer.
void DaemonServer::schedule_tick_locked(double delay_sec)
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));

  if (tick_event) {
    timer.cancel_event(tick_event);
    tick_event = nullptr;
  }

  // on shutdown start rejecting explicit requests to send reports that may
  // originate from python land which may still be running.
  if (shutting_down)
    return;

  tick_event = timer.add_event_after(delay_sec,
      new LambdaContext([this](int r) {
	  tick();
	}));
}
356
// Public wrapper for schedule_tick_locked(): takes `lock` itself.
void DaemonServer::schedule_tick(double delay_sec)
{
  std::lock_guard l(lock);
  schedule_tick_locked(delay_sec);
}
362
// Called when the set of registered OSD perf metric queries changes.
// Queued through the finisher so we reconfigure OSD connections outside
// the caller's context.
void DaemonServer::handle_osd_perf_metric_query_updated()
{
  dout(10) << dendl;

  // Send a fresh MMgrConfigure to all clients, so that they can follow
  // the new policy for transmitting stats
  finisher.queue(new LambdaContext([this](int r) {
	std::lock_guard l(lock);
	for (auto &c : daemon_connections) {
	  if (c->peer_is_osd()) {
	    _send_configure(c);
	  }
	}
      }));
}
378
f67539c2
TL
// MDS counterpart of handle_osd_perf_metric_query_updated(): push a new
// stats policy to all MDS connections via the finisher.
void DaemonServer::handle_mds_perf_metric_query_updated()
{
  dout(10) << dendl;

  // Send a fresh MMgrConfigure to all clients, so that they can follow
  // the new policy for transmitting stats
  finisher.queue(new LambdaContext([this](int r) {
	std::lock_guard l(lock);
	for (auto &c : daemon_connections) {
	  if (c->peer_is_mds()) {
	    _send_configure(c);
	  }
	}
      }));
}
394
7c673cae
FG
// Shut down the server: stop the messenger first (no new messages can
// arrive), then cluster state; finally, under `lock`, mark shutting_down
// so schedule_tick_locked() becomes a no-op before the timer is stopped.
void DaemonServer::shutdown()
{
  dout(10) << "begin" << dendl;
  msgr->shutdown();
  msgr->wait();
  cluster_state.shutdown();
  dout(10) << "done" << dendl;

  std::lock_guard l(lock);
  shutting_down = true;
  timer.shutdown();
}
407
11fdf7f2
TL
408static DaemonKey key_from_service(
409 const std::string& service_name,
410 int peer_type,
411 const std::string& daemon_name)
412{
413 if (!service_name.empty()) {
9f95a23c 414 return DaemonKey{service_name, daemon_name};
11fdf7f2 415 } else {
9f95a23c 416 return DaemonKey{ceph_entity_type_name(peer_type), daemon_name};
11fdf7f2
TL
417 }
418}
7c673cae 419
1911f103
TL
// Ask the mon (in the background) for the metadata of a daemon we have no
// record for.  Only osd/mds/mon metadata can be fetched this way, and a
// request is only issued if one is not already in flight for this key.
// The MetadataUpdate completion feeds the reply back into daemon_state.
void DaemonServer::fetch_missing_metadata(const DaemonKey& key,
					  const entity_addr_t& addr)
{
  if (!daemon_state.is_updating(key) &&
      (key.type == "osd" || key.type == "mds" || key.type == "mon")) {
    std::ostringstream oss;
    auto c = new MetadataUpdate(daemon_state, key);
    if (key.type == "osd") {
      // osd ids are numeric, hence no quotes around key.name
      oss << "{\"prefix\": \"osd metadata\", \"id\": "
	  << key.name << "}";
    } else if (key.type == "mds") {
      // mds metadata may lack an addr; default it from the connection
      c->set_default("addr", stringify(addr));
      oss << "{\"prefix\": \"mds metadata\", \"who\": \""
	  << key.name << "\"}";
    } else if (key.type == "mon") {
      oss << "{\"prefix\": \"mon metadata\", \"id\": \""
	  << key.name << "\"}";
    } else {
      ceph_abort();
    }
    monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c);
  }
}
443
// Handle MMgrOpen: a daemon (or service-declaring client) announcing
// itself.  Sends the stats configuration back, creates/updates the
// DaemonState, registers service daemons in the pending service map, and
// records the connection for future stats reconfiguration.
// For a normal daemon we have no metadata for yet, the session is closed
// (it will reconnect) and a metadata fetch is kicked off in background.
bool DaemonServer::handle_open(const ref_t<MMgrOpen>& m)
{
  std::unique_lock l(lock);

  DaemonKey key = key_from_service(m->service_name,
				   m->get_connection()->get_peer_type(),
				   m->daemon_name);

  auto con = m->get_connection();
  dout(10) << "from " << key << " " << con->get_peer_addr() << dendl;

  _send_configure(con);

  DaemonStatePtr daemon;
  if (daemon_state.exists(key)) {
    dout(20) << "updating existing DaemonState for " << key << dendl;
    daemon = daemon_state.get(key);
  }
  if (!daemon) {
    if (m->service_daemon) {
      // service daemons carry their own metadata; no mon fetch needed
      dout(4) << "constructing new DaemonState for " << key << dendl;
      daemon = std::make_shared<DaemonState>(daemon_state.types);
      daemon->key = key;
      daemon->service_daemon = true;
      daemon_state.insert(daemon);
    } else {
      /* A normal Ceph daemon has connected but we are or should be waiting on
       * metadata for it. Close the session so that it tries to reconnect.
       */
      dout(2) << "ignoring open from " << key << " " << con->get_peer_addr()
	      << "; not ready for session (expect reconnect)" << dendl;
      con->mark_down();
      // drop our lock before issuing the mon command
      l.unlock();
      fetch_missing_metadata(key, m->get_source_addr());
      return true;
    }
  }
  if (daemon) {
    if (m->service_daemon) {
      // update the metadata through the daemon state index to
      // ensure it's kept up-to-date
      daemon_state.update_metadata(daemon, m->daemon_metadata);
    }

    std::lock_guard l(daemon->lock);
    // a (re)open invalidates any previously received counters
    daemon->perf_counters.clear();

    daemon->service_daemon = m->service_daemon;
    if (m->service_daemon) {
      daemon->service_status = m->daemon_status;

      utime_t now = ceph_clock_now();
      auto [d, added] = pending_service_map.get_daemon(m->service_name,
						       m->daemon_name);
      // register (or re-register after a restart, detected by gid change)
      if (added || d->gid != (uint64_t)m->get_source().num()) {
	dout(10) << "registering " << key << " in pending_service_map" << dendl;
	d->gid = m->get_source().num();
	d->addr = m->get_source_addr();
	d->start_epoch = pending_service_map.epoch;
	d->start_stamp = now;
	d->metadata = m->daemon_metadata;
	pending_service_map_dirty = pending_service_map.epoch;
      }
    }

    // config (and which mon config the daemon ignored), if provided
    auto p = m->config_bl.cbegin();
    if (p != m->config_bl.end()) {
      decode(daemon->config, p);
      decode(daemon->ignored_mon_config, p);
      dout(20) << " got config " << daemon->config
	       << " ignored " << daemon->ignored_mon_config << dendl;
    }
    // defaults are kept encoded and decoded lazily on demand
    daemon->config_defaults_bl = m->config_defaults_bl;
    daemon->config_defaults.clear();
    dout(20) << " got config_defaults_bl " << daemon->config_defaults_bl.length()
	     << " bytes" << dendl;
  }

  if (con->get_peer_type() != entity_name_t::TYPE_CLIENT &&
      m->service_name.empty())
  {
    // Store in set of the daemon/service connections, i.e. those
    // connections that require an update in the event of stats
    // configuration changes.
    daemon_connections.insert(con);
  }

  return true;
}
533
// Handle MMgrClose: a daemon cleanly deregistering.  Drop its
// DaemonState and, for service daemons, remove it from the pending
// service map.  The same message is echoed back as an acknowledgement.
bool DaemonServer::handle_close(const ref_t<MMgrClose>& m)
{
  std::lock_guard l(lock);

  DaemonKey key = key_from_service(m->service_name,
				   m->get_connection()->get_peer_type(),
				   m->daemon_name);
  dout(4) << "from " << m->get_connection() << " " << key << dendl;

  if (daemon_state.exists(key)) {
    DaemonStatePtr daemon = daemon_state.get(key);
    daemon_state.rm(key);
    {
      std::lock_guard l(daemon->lock);
      if (daemon->service_daemon) {
	pending_service_map.rm_daemon(m->service_name, m->daemon_name);
	pending_service_map_dirty = pending_service_map.epoch;
      }
    }
  }

  // send same message back as a reply
  m->get_connection()->send_message2(m);
  return true;
}
559
f91f0fd5
TL
// Record a daemon's task status in the pending service map, marking the
// map dirty only when the status actually changed.
void DaemonServer::update_task_status(
  DaemonKey key,
  const std::map<std::string,std::string>& task_status)
{
  dout(10) << "got task status from " << key << dendl;

  // get_daemon() creates the entry if needed; we only care about the ptr
  [[maybe_unused]] auto [daemon, added] =
    pending_service_map.get_daemon(key.type, key.name);
  if (daemon->task_status != task_status) {
    daemon->task_status = task_status;
    pending_service_map_dirty = pending_service_map.epoch;
  }
}
573
// Handle MMgrReport: periodic stats/status from a daemon.
// Rejects reports from plain clients (no declared service) and from
// daemons we have no DaemonState for (their session is killed and a
// metadata fetch started so the retry succeeds).  Otherwise updates perf
// counters, config, service status/beacons, task status and health
// metrics, then feeds any perf-metric payloads to the collectors.
// Returns true if the report was consumed, false if the session was
// rejected.
bool DaemonServer::handle_report(const ref_t<MMgrReport>& m)
{
  DaemonKey key;
  if (!m->service_name.empty()) {
    key.type = m->service_name;
  } else {
    key.type = ceph_entity_type_name(m->get_connection()->get_peer_type());
  }
  key.name = m->daemon_name;

  dout(10) << "from " << m->get_connection() << " " << key << dendl;

  if (m->get_connection()->get_peer_type() == entity_name_t::TYPE_CLIENT &&
      m->service_name.empty()) {
    // Clients should not be sending us stats unless they are declaring
    // themselves to be a daemon for some service.
    dout(10) << "rejecting report from non-daemon client " << m->daemon_name
	     << dendl;
    clog->warn() << "rejecting report from non-daemon client " << m->daemon_name
		 << " at " << m->get_connection()->get_peer_addrs();
    m->get_connection()->mark_down();
    return true;
  }


  {
    std::unique_lock locker(lock);

    DaemonStatePtr daemon;
    // Look up the DaemonState
    if (daemon_state.exists(key)) {
      dout(20) << "updating existing DaemonState for " << key << dendl;
      daemon = daemon_state.get(key);
    } else {
      // drop our lock while talking to the mon
      locker.unlock();

      // we don't know the hostname at this stage, reject MMgrReport here.
      dout(5) << "rejecting report from " << key << ", since we do not have its metadata now."
	      << dendl;
      // issue metadata request in background
      fetch_missing_metadata(key, m->get_source_addr());

      locker.lock();

      // kill session
      auto priv = m->get_connection()->get_priv();
      auto session = static_cast<MgrSession*>(priv.get());
      if (!session) {
	return false;
      }
      m->get_connection()->mark_down();

      dout(10) << "unregistering osd." << session->osd_id
	       << " session " << session << " con " << m->get_connection() << dendl;

      // mirror the cleanup done in ms_handle_reset()
      if (osd_cons.find(session->osd_id) != osd_cons.end()) {
	osd_cons[session->osd_id].erase(m->get_connection());
      }

      auto iter = daemon_connections.find(m->get_connection());
      if (iter != daemon_connections.end()) {
	daemon_connections.erase(iter);
      }

      return false;
    }

    // Update the DaemonState
    ceph_assert(daemon != nullptr);
    {
      std::lock_guard l(daemon->lock);
      auto &daemon_counters = daemon->perf_counters;
      daemon_counters.update(*m.get());

      // refreshed config, if the daemon included it
      auto p = m->config_bl.cbegin();
      if (p != m->config_bl.end()) {
	decode(daemon->config, p);
	decode(daemon->ignored_mon_config, p);
	dout(20) << " got config " << daemon->config
		 << " ignored " << daemon->ignored_mon_config << dendl;
      }

      utime_t now = ceph_clock_now();
      if (daemon->service_daemon) {
	if (m->daemon_status) {
	  daemon->service_status_stamp = now;
	  daemon->service_status = *m->daemon_status;
	}
	daemon->last_service_beacon = now;
      } else if (m->daemon_status) {
	derr << "got status from non-daemon " << key << dendl;
      }
      // update task status
      if (m->task_status) {
	update_task_status(key, *m->task_status);
	daemon->last_service_beacon = now;
      }
      if (m->get_connection()->peer_is_osd() || m->get_connection()->peer_is_mon()) {
	// only OSD and MON send health_checks to me now
	daemon->daemon_health_metrics = std::move(m->daemon_health_metrics);
	dout(10) << "daemon_health_metrics " << daemon->daemon_health_metrics
		 << dendl;
      }
    }
  }

  // if there are any schema updates, notify the python modules
  /* no users currently
  if (!m->declare_types.empty() || !m->undeclare_types.empty()) {
    py_modules.notify_all("perf_schema_update", ceph::to_string(key));
  }
  */

  if (m->get_connection()->peer_is_osd()) {
    osd_perf_metric_collector.process_reports(m->osd_perf_metric_reports);
  }

  if (m->metric_report_message) {
    const MetricReportMessage &message = *m->metric_report_message;
    boost::apply_visitor(HandlePayloadVisitor(this), message.payload);
  }

  return true;
}
698
7c673cae
FG
699
700void DaemonServer::_generate_command_map(
11fdf7f2 701 cmdmap_t& cmdmap,
7c673cae
FG
702 map<string,string> &param_str_map)
703{
11fdf7f2 704 for (auto p = cmdmap.begin();
7c673cae
FG
705 p != cmdmap.end(); ++p) {
706 if (p->first == "prefix")
707 continue;
708 if (p->first == "caps") {
709 vector<string> cv;
9f95a23c 710 if (cmd_getval(cmdmap, "caps", cv) &&
7c673cae
FG
711 cv.size() % 2 == 0) {
712 for (unsigned i = 0; i < cv.size(); i += 2) {
713 string k = string("caps_") + cv[i];
714 param_str_map[k] = cv[i + 1];
715 }
716 continue;
717 }
718 }
719 param_str_map[p->first] = cmd_vartype_stringify(p->second);
720 }
721}
722
c07f9fc5 723const MonCommand *DaemonServer::_get_mgrcommand(
7c673cae 724 const string &cmd_prefix,
c07f9fc5 725 const std::vector<MonCommand> &cmds)
7c673cae 726{
c07f9fc5
FG
727 const MonCommand *this_cmd = nullptr;
728 for (const auto &cmd : cmds) {
729 if (cmd.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) {
730 this_cmd = &cmd;
7c673cae
FG
731 break;
732 }
733 }
734 return this_cmd;
735}
736
// Check whether the session's caps allow running this command.
// The mon is implicitly trusted; everyone else must be capable of the
// r/w/x perms the command declares, for this service/module/prefix and
// flattened parameter map (see _generate_command_map()).
bool DaemonServer::_allowed_command(
  MgrSession *s,
  const string &service,
  const string &module,
  const string &prefix,
  const cmdmap_t& cmdmap,
  const map<string,string>& param_str_map,
  const MonCommand *this_cmd) {

  if (s->entity_name.is_mon()) {
    // mon is all-powerful. even when it is forwarding commands on behalf of
    // old clients; we expect the mon is validating commands before proxying!
    return true;
  }

  bool cmd_r = this_cmd->requires_perm('r');
  bool cmd_w = this_cmd->requires_perm('w');
  bool cmd_x = this_cmd->requires_perm('x');

  bool capable = s->caps.is_capable(
    g_ceph_context,
    s->entity_name,
    service, module, prefix, param_str_map,
    cmd_r, cmd_w, cmd_x,
    s->get_peer_addr());

  dout(10) << " " << s->entity_name << " "
	   << (capable ? "" : "not ") << "capable" << dendl;
  return capable;
}
767
11fdf7f2
TL
/**
 * The working data for processing an MCommand.  This lives in
 * a class to enable passing it into other threads for processing
 * outside of the thread/locks that called handle_command.
 *
 * Exactly one of m_tell (legacy MCommand) or m_mgr (MMgrCommand) is set;
 * cmd and data are references into whichever one holds the request.
 */
class CommandContext {
public:
  ceph::ref_t<MCommand> m_tell;
  ceph::ref_t<MMgrCommand> m_mgr;
  const std::vector<std::string>& cmd;  ///< ref into m_tell or m_mgr
  const bufferlist& data;               ///< ref into m_tell or m_mgr
  bufferlist odata;                     ///< output payload for the reply
  cmdmap_t cmdmap;                      ///< parsed form of cmd

  explicit CommandContext(ceph::ref_t<MCommand> m)
    : m_tell{std::move(m)},
      cmd(m_tell->cmd),
      data(m_tell->get_data()) {
  }
  explicit CommandContext(ceph::ref_t<MMgrCommand> m)
    : m_mgr{std::move(m)},
      cmd(m_mgr->cmd),
      data(m_mgr->get_data()) {
  }

  // Convenience overload: reply with the accumulated stringstream text.
  void reply(int r, const std::stringstream &ss) {
    reply(r, ss.str());
  }

  // Send the reply (with return code r, status string rs, and odata as
  // the data payload) back over the originating connection.
  void reply(int r, const std::string &rs) {
    // Let the connection drop as soon as we've sent our response
    ConnectionRef con = m_tell ? m_tell->get_connection()
      : m_mgr->get_connection();
    if (con) {
      con->mark_disposable();
    }

    if (r == 0) {
      dout(20) << "success" << dendl;
    } else {
      derr << __func__ << " " << cpp_strerror(r) << " " << rs << dendl;
    }
    if (con) {
      // reply with the message type matching the request
      if (m_tell) {
	MCommandReply *reply = new MCommandReply(r, rs);
	reply->set_tid(m_tell->get_tid());
	reply->set_data(odata);
	con->send_message(reply);
      } else {
	MMgrCommandReply *reply = new MMgrCommandReply(r, rs);
	reply->set_tid(m_mgr->get_tid());
	reply->set_data(odata);
	con->send_message(reply);
      }
    }
  }
};
7c673cae 825
11fdf7f2
TL
/**
 * A context for receiving a bufferlist/error string from a background
 * function and then calling back to a CommandContext when it's done
 */
class ReplyOnFinish : public Context {
  std::shared_ptr<CommandContext> cmdctx;

public:
  bufferlist from_mon;  ///< output data filled in by the async operation
  string outs;          ///< status/error string filled in by the operation

  explicit ReplyOnFinish(const std::shared_ptr<CommandContext> &cmdctx_)
    : cmdctx(cmdctx_)
  {}
  void finish(int r) override {
    // move the background output into the command's reply payload
    cmdctx->odata.claim_append(from_mon);
    cmdctx->reply(r, outs);
  }
};
7c673cae 845
// Entry point for legacy MCommand ("tell") requests.  Wraps the message
// in a CommandContext and dispatches; bad_cmd_get (malformed arguments)
// is translated into an -EINVAL reply instead of propagating.
bool DaemonServer::handle_command(const ref_t<MCommand>& m)
{
  std::lock_guard l(lock);
  auto cmdctx = std::make_shared<CommandContext>(m);
  try {
    return _handle_command(cmdctx);
  } catch (const bad_cmd_get& e) {
    cmdctx->reply(-EINVAL, e.what());
    return true;
  }
}
857
// Entry point for MMgrCommand requests; mirrors the MCommand overload
// above, differing only in the wrapped message type.
bool DaemonServer::handle_command(const ref_t<MMgrCommand>& m)
{
  std::lock_guard l(lock);
  auto cmdctx = std::make_shared<CommandContext>(m);
  try {
    return _handle_command(cmdctx);
  } catch (const bad_cmd_get& e) {
    cmdctx->reply(-EINVAL, e.what());
    return true;
  }
}
7c673cae 869
92f5a8d4
TL
// Record a cap-check failure: audit-log the denied command and fill `ss`
// with a user-facing explanation (sent back to the client by the caller).
void DaemonServer::log_access_denied(
  std::shared_ptr<CommandContext>& cmdctx,
  MgrSession* session, std::stringstream& ss) {
  dout(1) << " access denied" << dendl;
  audit_clog->info() << "from='" << session->inst << "' "
		     << "entity='" << session->entity_name << "' "
		     << "cmd=" << cmdctx->cmd << ":  access denied";
  ss << "access denied: does your client key have mgr caps? "
    "See http://docs.ceph.com/en/latest/mgr/administrator/"
    "#client-authentication";
}
881
f67539c2
TL
882void DaemonServer::_check_offlines_pgs(
883 const set<int>& osds,
884 const OSDMap& osdmap,
885 const PGMap& pgmap,
886 offline_pg_report *report)
887{
888 // reset output
889 *report = offline_pg_report();
890 report->osds = osds;
891
892 for (const auto& q : pgmap.pg_stat) {
893 set<int32_t> pg_acting; // net acting sets (with no missing if degraded)
894 bool found = false;
895 if (q.second.state == 0) {
896 report->unknown.insert(q.first);
897 continue;
898 }
899 if (q.second.state & PG_STATE_DEGRADED) {
900 for (auto& anm : q.second.avail_no_missing) {
901 if (osds.count(anm.osd)) {
902 found = true;
903 continue;
904 }
905 if (anm.osd != CRUSH_ITEM_NONE) {
906 pg_acting.insert(anm.osd);
907 }
908 }
909 } else {
910 for (auto& a : q.second.acting) {
911 if (osds.count(a)) {
912 found = true;
913 continue;
914 }
915 if (a != CRUSH_ITEM_NONE) {
916 pg_acting.insert(a);
917 }
918 }
919 }
920 if (!found) {
921 continue;
922 }
923 const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool());
924 bool dangerous = false;
925 if (!pi) {
926 report->bad_no_pool.insert(q.first); // pool is creating or deleting
927 dangerous = true;
928 }
929 if (!(q.second.state & PG_STATE_ACTIVE)) {
930 report->bad_already_inactive.insert(q.first);
931 dangerous = true;
932 }
933 if (pg_acting.size() < pi->min_size) {
934 report->bad_become_inactive.insert(q.first);
935 dangerous = true;
936 }
937 if (dangerous) {
938 report->not_ok.insert(q.first);
939 } else {
940 report->ok.insert(q.first);
941 if (q.second.state & PG_STATE_DEGRADED) {
942 report->ok_become_more_degraded.insert(q.first);
943 } else {
944 report->ok_become_degraded.insert(q.first);
945 }
946 }
947 }
948 dout(20) << osds << " -> " << report->ok.size() << " ok, "
949 << report->not_ok.size() << " not ok, "
950 << report->unknown.size() << " unknown"
951 << dendl;
952}
953
954void DaemonServer::_maximize_ok_to_stop_set(
955 const set<int>& orig_osds,
956 unsigned max,
957 const OSDMap& osdmap,
958 const PGMap& pgmap,
959 offline_pg_report *out_report)
960{
961 dout(20) << "orig_osds " << orig_osds << " max " << max << dendl;
962 _check_offlines_pgs(orig_osds, osdmap, pgmap, out_report);
963 if (!out_report->ok_to_stop()) {
964 return;
965 }
966 if (orig_osds.size() >= max) {
967 // already at max
968 return;
969 }
970
971 // semi-arbitrarily start with the first osd in the set
972 offline_pg_report report;
973 set<int> osds = orig_osds;
974 int parent = *osds.begin();
975 set<int> children;
976
977 while (true) {
978 // identify the next parent
979 int r = osdmap.crush->get_immediate_parent_id(parent, &parent);
980 if (r < 0) {
981 return; // just go with what we have so far!
982 }
983
984 // get candidate additions that are beneath this point in the tree
985 children.clear();
986 r = osdmap.crush->get_all_children(parent, &children);
987 if (r < 0) {
988 return; // just go with what we have so far!
989 }
990 dout(20) << " parent " << parent << " children " << children << dendl;
991
992 // try adding in more osds
993 int failed = 0; // how many children we failed to add to our set
994 for (auto o : children) {
995 if (o >= 0 && osdmap.is_up(o) && osds.count(o) == 0) {
996 osds.insert(o);
997 _check_offlines_pgs(osds, osdmap, pgmap, &report);
998 if (!report.ok_to_stop()) {
999 osds.erase(o);
1000 ++failed;
1001 continue;
1002 }
1003 *out_report = report;
1004 if (osds.size() == max) {
1005 dout(20) << " hit max" << dendl;
1006 return; // yay, we hit the max
1007 }
1008 }
1009 }
1010
1011 if (failed) {
1012 // we hit some failures; go with what we have
1013 dout(20) << " hit some peer failures" << dendl;
1014 return;
1015 }
1016 }
1017}
1018
11fdf7f2 1019bool DaemonServer::_handle_command(
11fdf7f2
TL
1020 std::shared_ptr<CommandContext>& cmdctx)
1021{
9f95a23c 1022 MessageRef m;
1911f103 1023 bool admin_socket_cmd = false;
9f95a23c
TL
1024 if (cmdctx->m_tell) {
1025 m = cmdctx->m_tell;
1911f103
TL
1026 // a blank fsid in MCommand signals a legacy client sending a "mon-mgr" CLI
1027 // command.
1028 admin_socket_cmd = (cmdctx->m_tell->fsid != uuid_d());
9f95a23c
TL
1029 } else {
1030 m = cmdctx->m_mgr;
1031 }
11fdf7f2
TL
1032 auto priv = m->get_connection()->get_priv();
1033 auto session = static_cast<MgrSession*>(priv.get());
7c673cae
FG
1034 if (!session) {
1035 return true;
1036 }
9f95a23c 1037 if (session->inst.name == entity_name_t()) {
7c673cae 1038 session->inst.name = m->get_source();
9f95a23c 1039 }
7c673cae 1040
7c673cae 1041 map<string,string> param_str_map;
11fdf7f2
TL
1042 std::stringstream ss;
1043 int r = 0;
7c673cae 1044
9f95a23c 1045 if (!cmdmap_from_json(cmdctx->cmd, &(cmdctx->cmdmap), ss)) {
7c673cae
FG
1046 cmdctx->reply(-EINVAL, ss);
1047 return true;
1048 }
1049
11fdf7f2 1050 string prefix;
9f95a23c 1051 cmd_getval(cmdctx->cmdmap, "prefix", prefix);
f67539c2 1052 dout(10) << "decoded-size=" << cmdctx->cmdmap.size() << " prefix=" << prefix << dendl;
7c673cae 1053
f67539c2
TL
1054 boost::scoped_ptr<Formatter> f;
1055 {
1056 std::string format;
1057 if (boost::algorithm::ends_with(prefix, "_json")) {
1058 format = "json";
1059 } else {
20effc67 1060 format = cmd_getval_or<string>(cmdctx->cmdmap, "format", "plain");
f67539c2
TL
1061 }
1062 f.reset(Formatter::create(format));
1063 }
7c673cae 1064
1911f103
TL
1065 // this is just for mgr commands - admin socket commands will fall
1066 // through and use the admin socket version of
1067 // get_command_descriptions
1068 if (prefix == "get_command_descriptions" && !admin_socket_cmd) {
7c673cae 1069 dout(10) << "reading commands from python modules" << dendl;
c07f9fc5 1070 const auto py_commands = py_modules.get_commands();
7c673cae 1071
c07f9fc5 1072 int cmdnum = 0;
7c673cae
FG
1073 JSONFormatter f;
1074 f.open_object_section("command_descriptions");
c07f9fc5 1075
11fdf7f2 1076 auto dump_cmd = [&cmdnum, &f, m](const MonCommand &mc){
7c673cae 1077 ostringstream secname;
20effc67 1078 secname << "cmd" << std::setfill('0') << std::setw(3) << cmdnum;
11fdf7f2
TL
1079 dump_cmddesc_to_json(&f, m->get_connection()->get_features(),
1080 secname.str(), mc.cmdstring, mc.helpstring,
1081 mc.module, mc.req_perms, 0);
7c673cae 1082 cmdnum++;
c07f9fc5
FG
1083 };
1084
1085 for (const auto &pyc : py_commands) {
1086 dump_cmd(pyc);
7c673cae 1087 }
7c673cae 1088
c07f9fc5
FG
1089 for (const auto &mgr_cmd : mgr_commands) {
1090 dump_cmd(mgr_cmd);
7c673cae 1091 }
c07f9fc5 1092
7c673cae
FG
1093 f.close_section(); // command_descriptions
1094 f.flush(cmdctx->odata);
1095 cmdctx->reply(0, ss);
1096 return true;
1097 }
1098
1099 // lookup command
c07f9fc5 1100 const MonCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands);
7c673cae 1101 _generate_command_map(cmdctx->cmdmap, param_str_map);
11fdf7f2 1102
92f5a8d4
TL
1103 bool is_allowed = false;
1104 ModuleCommand py_command;
1911f103
TL
1105 if (admin_socket_cmd) {
1106 // admin socket commands require all capabilities
1107 is_allowed = session->caps.is_allow_all();
1108 } else if (!mgr_cmd) {
92f5a8d4
TL
1109 // Resolve the command to the name of the module that will
1110 // handle it (if the command exists)
1111 auto py_commands = py_modules.get_py_commands();
1112 for (const auto &pyc : py_commands) {
1113 auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring);
1114 if (pyc_prefix == prefix) {
1115 py_command = pyc;
1116 break;
1117 }
1118 }
1119
1120 MonCommand pyc = {"", "", "py", py_command.perm};
1121 is_allowed = _allowed_command(session, "py", py_command.module_name,
1122 prefix, cmdctx->cmdmap, param_str_map,
1123 &pyc);
7c673cae
FG
1124 } else {
1125 // validate user's permissions for requested command
92f5a8d4 1126 is_allowed = _allowed_command(session, mgr_cmd->module, "",
11fdf7f2
TL
1127 prefix, cmdctx->cmdmap, param_str_map, mgr_cmd);
1128 }
92f5a8d4 1129
11fdf7f2 1130 if (!is_allowed) {
92f5a8d4 1131 log_access_denied(cmdctx, session, ss);
7c673cae
FG
1132 cmdctx->reply(-EACCES, ss);
1133 return true;
7c673cae
FG
1134 }
1135
1136 audit_clog->debug()
1137 << "from='" << session->inst << "' "
1138 << "entity='" << session->entity_name << "' "
9f95a23c 1139 << "cmd=" << cmdctx->cmd << ": dispatch";
7c673cae 1140
1911f103
TL
1141 if (admin_socket_cmd) {
1142 cct->get_admin_socket()->queue_tell_command(cmdctx->m_tell);
1143 return true;
1144 }
1145
224ce89b
WB
1146 // ----------------
1147 // service map commands
1148 if (prefix == "service dump") {
1149 if (!f)
1150 f.reset(Formatter::create("json-pretty"));
1151 cluster_state.with_servicemap([&](const ServiceMap &service_map) {
1152 f->dump_object("service_map", service_map);
1153 });
1154 f->flush(cmdctx->odata);
1155 cmdctx->reply(0, ss);
1156 return true;
1157 }
1158 if (prefix == "service status") {
1159 if (!f)
1160 f.reset(Formatter::create("json-pretty"));
1161 // only include state from services that are in the persisted service map
1162 f->open_object_section("service_status");
9f95a23c
TL
1163 for (auto& [type, service] : pending_service_map.services) {
1164 if (ServiceMap::is_normal_ceph_entity(type)) {
1165 continue;
1166 }
1167
1168 f->open_object_section(type.c_str());
1169 for (auto& q : service.daemons) {
224ce89b 1170 f->open_object_section(q.first.c_str());
9f95a23c 1171 DaemonKey key{type, q.first};
11fdf7f2 1172 ceph_assert(daemon_state.exists(key));
224ce89b 1173 auto daemon = daemon_state.get(key);
11fdf7f2 1174 std::lock_guard l(daemon->lock);
224ce89b
WB
1175 f->dump_stream("status_stamp") << daemon->service_status_stamp;
1176 f->dump_stream("last_beacon") << daemon->last_service_beacon;
1177 f->open_object_section("status");
1178 for (auto& r : daemon->service_status) {
1179 f->dump_string(r.first.c_str(), r.second);
1180 }
1181 f->close_section();
1182 f->close_section();
1183 }
1184 f->close_section();
1185 }
1186 f->close_section();
1187 f->flush(cmdctx->odata);
1188 cmdctx->reply(0, ss);
1189 return true;
1190 }
1191
3efd9988
FG
1192 if (prefix == "config set") {
1193 std::string key;
1194 std::string val;
9f95a23c
TL
1195 cmd_getval(cmdctx->cmdmap, "key", key);
1196 cmd_getval(cmdctx->cmdmap, "value", val);
11fdf7f2 1197 r = cct->_conf.set_val(key, val, &ss);
3efd9988 1198 if (r == 0) {
11fdf7f2 1199 cct->_conf.apply_changes(nullptr);
3efd9988
FG
1200 }
1201 cmdctx->reply(0, ss);
1202 return true;
1203 }
1204
7c673cae
FG
1205 // -----------
1206 // PG commands
1207
1208 if (prefix == "pg scrub" ||
1209 prefix == "pg repair" ||
1210 prefix == "pg deep-scrub") {
1211 string scrubop = prefix.substr(3, string::npos);
1212 pg_t pgid;
11fdf7f2 1213 spg_t spgid;
7c673cae 1214 string pgidstr;
9f95a23c 1215 cmd_getval(cmdctx->cmdmap, "pgid", pgidstr);
7c673cae
FG
1216 if (!pgid.parse(pgidstr.c_str())) {
1217 ss << "invalid pgid '" << pgidstr << "'";
1218 cmdctx->reply(-EINVAL, ss);
1219 return true;
1220 }
1221 bool pg_exists = false;
1222 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
1223 pg_exists = osdmap.pg_exists(pgid);
1224 });
1225 if (!pg_exists) {
11fdf7f2 1226 ss << "pg " << pgid << " does not exist";
7c673cae
FG
1227 cmdctx->reply(-ENOENT, ss);
1228 return true;
1229 }
1230 int acting_primary = -1;
11fdf7f2 1231 epoch_t epoch;
7c673cae 1232 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
11fdf7f2
TL
1233 epoch = osdmap.get_epoch();
1234 osdmap.get_primary_shard(pgid, &acting_primary, &spgid);
7c673cae
FG
1235 });
1236 if (acting_primary == -1) {
1237 ss << "pg " << pgid << " has no primary osd";
1238 cmdctx->reply(-EAGAIN, ss);
1239 return true;
1240 }
31f18b77
FG
1241 auto p = osd_cons.find(acting_primary);
1242 if (p == osd_cons.end()) {
1243 ss << "pg " << pgid << " primary osd." << acting_primary
1244 << " is not currently connected";
1245 cmdctx->reply(-EAGAIN, ss);
f64942e4 1246 return true;
31f18b77 1247 }
31f18b77 1248 for (auto& con : p->second) {
20effc67
TL
1249 assert(HAVE_FEATURE(con->get_features(), SERVER_OCTOPUS));
1250 vector<spg_t> pgs = { spgid };
1251 con->send_message(new MOSDScrub2(monc->get_fsid(),
1252 epoch,
1253 pgs,
1254 scrubop == "repair",
1255 scrubop == "deep-scrub"));
31f18b77 1256 }
11fdf7f2 1257 ss << "instructing pg " << spgid << " on osd." << acting_primary
31f18b77
FG
1258 << " to " << scrubop;
1259 cmdctx->reply(0, ss);
1260 return true;
1261 } else if (prefix == "osd scrub" ||
1262 prefix == "osd deep-scrub" ||
1263 prefix == "osd repair") {
1264 string whostr;
9f95a23c 1265 cmd_getval(cmdctx->cmdmap, "who", whostr);
31f18b77
FG
1266 vector<string> pvec;
1267 get_str_vec(prefix, pvec);
1268
1269 set<int> osds;
224ce89b 1270 if (whostr == "*" || whostr == "all" || whostr == "any") {
31f18b77
FG
1271 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
1272 for (int i = 0; i < osdmap.get_max_osd(); i++)
1273 if (osdmap.is_up(i)) {
1274 osds.insert(i);
1275 }
1276 });
1277 } else {
1278 long osd = parse_osd_id(whostr.c_str(), &ss);
1279 if (osd < 0) {
1280 ss << "invalid osd '" << whostr << "'";
1281 cmdctx->reply(-EINVAL, ss);
1282 return true;
1283 }
1284 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
1285 if (osdmap.is_up(osd)) {
1286 osds.insert(osd);
1287 }
1288 });
1289 if (osds.empty()) {
1290 ss << "osd." << osd << " is not up";
1291 cmdctx->reply(-EAGAIN, ss);
1292 return true;
1293 }
1294 }
1295 set<int> sent_osds, failed_osds;
1296 for (auto osd : osds) {
11fdf7f2
TL
1297 vector<spg_t> spgs;
1298 epoch_t epoch;
1299 cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) {
1300 epoch = osdmap.get_epoch();
1301 auto p = pgmap.pg_by_osd.find(osd);
1302 if (p != pgmap.pg_by_osd.end()) {
1303 for (auto pgid : p->second) {
1304 int primary;
1305 spg_t spg;
1306 osdmap.get_primary_shard(pgid, &primary, &spg);
1307 if (primary == osd) {
1308 spgs.push_back(spg);
1309 }
1310 }
1311 }
1312 });
31f18b77
FG
1313 auto p = osd_cons.find(osd);
1314 if (p == osd_cons.end()) {
1315 failed_osds.insert(osd);
1316 } else {
1317 sent_osds.insert(osd);
1318 for (auto& con : p->second) {
11fdf7f2
TL
1319 if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) {
1320 con->send_message(new MOSDScrub2(monc->get_fsid(),
1321 epoch,
1322 spgs,
1323 pvec.back() == "repair",
1324 pvec.back() == "deep-scrub"));
1325 } else {
1326 con->send_message(new MOSDScrub(monc->get_fsid(),
1327 pvec.back() == "repair",
1328 pvec.back() == "deep-scrub"));
1329 }
31f18b77
FG
1330 }
1331 }
1332 }
1333 if (failed_osds.size() == osds.size()) {
1334 ss << "failed to instruct osd(s) " << osds << " to " << pvec.back()
1335 << " (not connected)";
1336 r = -EAGAIN;
1337 } else {
1338 ss << "instructed osd(s) " << sent_osds << " to " << pvec.back();
1339 if (!failed_osds.empty()) {
1340 ss << "; osd(s) " << failed_osds << " were not connected";
1341 }
1342 r = 0;
1343 }
7c673cae
FG
1344 cmdctx->reply(0, ss);
1345 return true;
11fdf7f2
TL
1346 } else if (prefix == "osd pool scrub" ||
1347 prefix == "osd pool deep-scrub" ||
1348 prefix == "osd pool repair") {
1349 vector<string> pool_names;
9f95a23c 1350 cmd_getval(cmdctx->cmdmap, "who", pool_names);
11fdf7f2
TL
1351 if (pool_names.empty()) {
1352 ss << "must specify one or more pool names";
1353 cmdctx->reply(-EINVAL, ss);
1354 return true;
1355 }
1356 epoch_t epoch;
1357 map<int32_t, vector<pg_t>> pgs_by_primary; // legacy
1358 map<int32_t, vector<spg_t>> spgs_by_primary;
1359 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
1360 epoch = osdmap.get_epoch();
1361 for (auto& pool_name : pool_names) {
1362 auto pool_id = osdmap.lookup_pg_pool_name(pool_name);
1363 if (pool_id < 0) {
1364 ss << "unrecognized pool '" << pool_name << "'";
1365 r = -ENOENT;
1366 return;
1367 }
1368 auto pool_pg_num = osdmap.get_pg_num(pool_id);
1369 for (int i = 0; i < pool_pg_num; i++) {
1370 pg_t pg(i, pool_id);
1371 int primary;
1372 spg_t spg;
1373 auto got = osdmap.get_primary_shard(pg, &primary, &spg);
1374 if (!got)
1375 continue;
1376 pgs_by_primary[primary].push_back(pg);
1377 spgs_by_primary[primary].push_back(spg);
1378 }
1379 }
1380 });
1381 if (r < 0) {
1382 cmdctx->reply(r, ss);
1383 return true;
1384 }
1385 for (auto& it : spgs_by_primary) {
1386 auto primary = it.first;
1387 auto p = osd_cons.find(primary);
1388 if (p == osd_cons.end()) {
1389 ss << "osd." << primary << " is not currently connected";
1390 cmdctx->reply(-EAGAIN, ss);
1391 return true;
1392 }
1393 for (auto& con : p->second) {
1394 if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) {
1395 con->send_message(new MOSDScrub2(monc->get_fsid(),
1396 epoch,
1397 it.second,
1398 prefix == "osd pool repair",
1399 prefix == "osd pool deep-scrub"));
1400 } else {
1401 // legacy
1402 auto q = pgs_by_primary.find(primary);
1403 ceph_assert(q != pgs_by_primary.end());
1404 con->send_message(new MOSDScrub(monc->get_fsid(),
1405 q->second,
1406 prefix == "osd pool repair",
1407 prefix == "osd pool deep-scrub"));
1408 }
1409 }
1410 }
1411 cmdctx->reply(0, "");
1412 return true;
7c673cae
FG
1413 } else if (prefix == "osd reweight-by-pg" ||
1414 prefix == "osd reweight-by-utilization" ||
1415 prefix == "osd test-reweight-by-pg" ||
1416 prefix == "osd test-reweight-by-utilization") {
1417 bool by_pg =
1418 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
1419 bool dry_run =
1420 prefix == "osd test-reweight-by-pg" ||
1421 prefix == "osd test-reweight-by-utilization";
20effc67 1422 int64_t oload = cmd_getval_or<int64_t>(cmdctx->cmdmap, "oload", 120);
7c673cae
FG
1423 set<int64_t> pools;
1424 vector<string> poolnames;
9f95a23c 1425 cmd_getval(cmdctx->cmdmap, "pools", poolnames);
7c673cae
FG
1426 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
1427 for (const auto& poolname : poolnames) {
1428 int64_t pool = osdmap.lookup_pg_pool_name(poolname);
1429 if (pool < 0) {
1430 ss << "pool '" << poolname << "' does not exist";
1431 r = -ENOENT;
1432 }
1433 pools.insert(pool);
1434 }
1435 });
1436 if (r) {
1437 cmdctx->reply(r, ss);
1438 return true;
1439 }
11fdf7f2
TL
1440
1441 double max_change = g_conf().get_val<double>("mon_reweight_max_change");
9f95a23c 1442 cmd_getval(cmdctx->cmdmap, "max_change", max_change);
7c673cae
FG
1443 if (max_change <= 0.0) {
1444 ss << "max_change " << max_change << " must be positive";
1445 cmdctx->reply(-EINVAL, ss);
1446 return true;
1447 }
11fdf7f2 1448 int64_t max_osds = g_conf().get_val<int64_t>("mon_reweight_max_osds");
9f95a23c 1449 cmd_getval(cmdctx->cmdmap, "max_osds", max_osds);
7c673cae
FG
1450 if (max_osds <= 0) {
1451 ss << "max_osds " << max_osds << " must be positive";
1452 cmdctx->reply(-EINVAL, ss);
1453 return true;
1454 }
11fdf7f2 1455 bool no_increasing = false;
20effc67 1456 cmd_getval_compat_cephbool(cmdctx->cmdmap, "no_increasing", no_increasing);
7c673cae
FG
1457 string out_str;
1458 mempool::osdmap::map<int32_t, uint32_t> new_weights;
11fdf7f2
TL
1459 r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap &osdmap, const PGMap& pgmap) {
1460 return reweight::by_utilization(osdmap, pgmap,
1461 oload,
1462 max_change,
1463 max_osds,
1464 by_pg,
1465 pools.empty() ? NULL : &pools,
1466 no_increasing,
1467 &new_weights,
1468 &ss, &out_str, f.get());
7c673cae
FG
1469 });
1470 if (r >= 0) {
1471 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
1472 }
1473 if (f) {
1474 f->flush(cmdctx->odata);
1475 } else {
1476 cmdctx->odata.append(out_str);
1477 }
1478 if (r < 0) {
1479 ss << "FAILED reweight-by-pg";
1480 cmdctx->reply(r, ss);
1481 return true;
1482 } else if (r == 0 || dry_run) {
1483 ss << "no change";
1484 cmdctx->reply(r, ss);
1485 return true;
1486 } else {
1487 json_spirit::Object json_object;
1488 for (const auto& osd_weight : new_weights) {
1489 json_spirit::Config::add(json_object,
1490 std::to_string(osd_weight.first),
1491 std::to_string(osd_weight.second));
1492 }
1493 string s = json_spirit::write(json_object);
1494 std::replace(begin(s), end(s), '\"', '\'');
1495 const string cmd =
1496 "{"
1497 "\"prefix\": \"osd reweightn\", "
1498 "\"weights\": \"" + s + "\""
1499 "}";
1500 auto on_finish = new ReplyOnFinish(cmdctx);
1501 monc->start_mon_command({cmd}, {},
1502 &on_finish->from_mon, &on_finish->outs, on_finish);
1503 return true;
1504 }
31f18b77 1505 } else if (prefix == "osd df") {
9f95a23c
TL
1506 string method, filter;
1507 cmd_getval(cmdctx->cmdmap, "output_method", method);
1508 cmd_getval(cmdctx->cmdmap, "filter", filter);
11fdf7f2
TL
1509 stringstream rs;
1510 r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) {
11fdf7f2 1511 // sanity check filter(s)
9f95a23c
TL
1512 if (!filter.empty() &&
1513 osdmap.lookup_pg_pool_name(filter) < 0 &&
1514 !osdmap.crush->class_exists(filter) &&
1515 !osdmap.crush->name_exists(filter)) {
1516 rs << "'" << filter << "' not a pool, crush node or device class name";
1517 return -EINVAL;
11fdf7f2
TL
1518 }
1519 print_osd_utilization(osdmap, pgmap, ss,
9f95a23c 1520 f.get(), method == "tree", filter);
11fdf7f2
TL
1521 cmdctx->odata.append(ss);
1522 return 0;
31f18b77 1523 });
11fdf7f2 1524 cmdctx->reply(r, rs);
31f18b77 1525 return true;
11fdf7f2
TL
1526 } else if (prefix == "osd pool stats") {
1527 string pool_name;
9f95a23c 1528 cmd_getval(cmdctx->cmdmap, "pool_name", pool_name);
11fdf7f2
TL
1529 int64_t poolid = -ENOENT;
1530 bool one_pool = false;
1531 r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
1532 if (!pool_name.empty()) {
1533 poolid = osdmap.lookup_pg_pool_name(pool_name);
1534 if (poolid < 0) {
1535 ceph_assert(poolid == -ENOENT);
1536 ss << "unrecognized pool '" << pool_name << "'";
1537 return -ENOENT;
1538 }
1539 one_pool = true;
1540 }
1541 stringstream rs;
1542 if (f)
1543 f->open_array_section("pool_stats");
1544 else {
1545 if (osdmap.get_pools().empty()) {
1546 ss << "there are no pools!";
1547 goto stats_out;
1548 }
1549 }
1550 for (auto &p : osdmap.get_pools()) {
1551 if (!one_pool) {
1552 poolid = p.first;
1553 }
1554 pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, f.get(), &rs);
1555 if (one_pool) {
1556 break;
1557 }
1558 }
1559 stats_out:
1560 if (f) {
1561 f->close_section();
1562 f->flush(cmdctx->odata);
1563 } else {
1564 cmdctx->odata.append(rs.str());
1565 }
1566 return 0;
35e4c445 1567 });
11fdf7f2
TL
1568 if (r != -EOPNOTSUPP) {
1569 cmdctx->reply(r, ss);
1570 return true;
1571 }
1572 } else if (prefix == "osd safe-to-destroy" ||
1573 prefix == "osd destroy" ||
1574 prefix == "osd purge") {
1575 set<int> osds;
1576 int r = 0;
1577 if (prefix == "osd safe-to-destroy") {
1578 vector<string> ids;
9f95a23c 1579 cmd_getval(cmdctx->cmdmap, "ids", ids);
11fdf7f2
TL
1580 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
1581 r = osdmap.parse_osd_id_list(ids, &osds, &ss);
1582 });
1583 if (!r && osds.empty()) {
1584 ss << "must specify one or more OSDs";
1585 r = -EINVAL;
1586 }
1587 } else {
1588 int64_t id;
9f95a23c 1589 if (!cmd_getval(cmdctx->cmdmap, "id", id)) {
11fdf7f2
TL
1590 r = -EINVAL;
1591 ss << "must specify OSD id";
1592 } else {
1593 osds.insert(id);
1594 }
35e4c445
FG
1595 }
1596 if (r < 0) {
1597 cmdctx->reply(r, ss);
1598 return true;
1599 }
11fdf7f2 1600 set<int> active_osds, missing_stats, stored_pgs, safe_to_destroy;
35e4c445 1601 int affected_pgs = 0;
11fdf7f2 1602 cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
35e4c445
FG
1603 if (pg_map.num_pg_unknown > 0) {
1604 ss << pg_map.num_pg_unknown << " pgs have unknown state; cannot draw"
1605 << " any conclusions";
1606 r = -EAGAIN;
1607 return;
1608 }
1609 int num_active_clean = 0;
1610 for (auto& p : pg_map.num_pg_by_state) {
1611 unsigned want = PG_STATE_ACTIVE|PG_STATE_CLEAN;
1612 if ((p.first & want) == want) {
1613 num_active_clean += p.second;
1614 }
1615 }
11fdf7f2
TL
1616 for (auto osd : osds) {
1617 if (!osdmap.exists(osd)) {
1618 safe_to_destroy.insert(osd);
1619 continue; // clearly safe to destroy
1620 }
1621 auto q = pg_map.num_pg_by_osd.find(osd);
1622 if (q != pg_map.num_pg_by_osd.end()) {
81eedcae 1623 if (q->second.acting > 0 || q->second.up_not_acting > 0) {
11fdf7f2 1624 active_osds.insert(osd);
81eedcae
TL
1625 // XXX: For overlapping PGs, this counts them again
1626 affected_pgs += q->second.acting + q->second.up_not_acting;
11fdf7f2 1627 continue;
35e4c445 1628 }
11fdf7f2
TL
1629 }
1630 if (num_active_clean < pg_map.num_pg) {
1631 // all pgs aren't active+clean; we need to be careful.
1632 auto p = pg_map.osd_stat.find(osd);
81eedcae 1633 if (p == pg_map.osd_stat.end() || !osdmap.is_up(osd)) {
11fdf7f2
TL
1634 missing_stats.insert(osd);
1635 continue;
1636 } else if (p->second.num_pgs > 0) {
1637 stored_pgs.insert(osd);
1638 continue;
1639 }
1640 }
1641 safe_to_destroy.insert(osd);
1642 }
35e4c445 1643 });
11fdf7f2
TL
1644 if (r && prefix == "osd safe-to-destroy") {
1645 cmdctx->reply(r, ss); // regardless of formatter
1646 return true;
1647 }
1648 if (!r && (!active_osds.empty() ||
1649 !missing_stats.empty() || !stored_pgs.empty())) {
1650 if (!safe_to_destroy.empty()) {
1651 ss << "OSD(s) " << safe_to_destroy
1652 << " are safe to destroy without reducing data durability. ";
1653 }
1654 if (!active_osds.empty()) {
1655 ss << "OSD(s) " << active_osds << " have " << affected_pgs
1656 << " pgs currently mapped to them. ";
1657 }
1658 if (!missing_stats.empty()) {
1659 ss << "OSD(s) " << missing_stats << " have no reported stats, and not all"
1660 << " PGs are active+clean; we cannot draw any conclusions. ";
1661 }
1662 if (!stored_pgs.empty()) {
1663 ss << "OSD(s) " << stored_pgs << " last reported they still store some PG"
1664 << " data, and not all PGs are active+clean; we cannot be sure they"
1665 << " aren't still needed.";
1666 }
1667 if (!active_osds.empty() || !stored_pgs.empty()) {
1668 r = -EBUSY;
1669 } else {
1670 r = -EAGAIN;
1671 }
1672 }
1673
1674 if (prefix == "osd safe-to-destroy") {
1675 if (!r) {
1676 ss << "OSD(s) " << osds << " are safe to destroy without reducing data"
1677 << " durability.";
11fdf7f2
TL
1678 }
1679 if (f) {
1680 f->open_object_section("osd_status");
1681 f->open_array_section("safe_to_destroy");
1682 for (auto i : safe_to_destroy)
1683 f->dump_int("osd", i);
1684 f->close_section();
1685 f->open_array_section("active");
1686 for (auto i : active_osds)
1687 f->dump_int("osd", i);
1688 f->close_section();
1689 f->open_array_section("missing_stats");
1690 for (auto i : missing_stats)
1691 f->dump_int("osd", i);
1692 f->close_section();
1693 f->open_array_section("stored_pgs");
1694 for (auto i : stored_pgs)
1695 f->dump_int("osd", i);
1696 f->close_section();
1697 f->close_section(); // osd_status
1698 f->flush(cmdctx->odata);
1699 r = 0;
1700 std::stringstream().swap(ss);
1701 }
1702 cmdctx->reply(r, ss);
1703 return true;
1704 }
1705
1706 if (r) {
1707 bool force = false;
9f95a23c 1708 cmd_getval(cmdctx->cmdmap, "force", force);
11fdf7f2
TL
1709 if (!force) {
1710 // Backward compat
9f95a23c 1711 cmd_getval(cmdctx->cmdmap, "yes_i_really_mean_it", force);
11fdf7f2
TL
1712 }
1713 if (!force) {
1714 ss << "\nYou can proceed by passing --force, but be warned that"
1715 " this will likely mean real, permanent data loss.";
1716 } else {
1717 r = 0;
1718 }
35e4c445
FG
1719 }
1720 if (r) {
1721 cmdctx->reply(r, ss);
1722 return true;
1723 }
11fdf7f2
TL
1724 const string cmd =
1725 "{"
1726 "\"prefix\": \"" + prefix + "-actual\", "
1727 "\"id\": " + stringify(osds) + ", "
1728 "\"yes_i_really_mean_it\": true"
1729 "}";
1730 auto on_finish = new ReplyOnFinish(cmdctx);
1731 monc->start_mon_command({cmd}, {}, nullptr, &on_finish->outs, on_finish);
35e4c445
FG
1732 return true;
1733 } else if (prefix == "osd ok-to-stop") {
1734 vector<string> ids;
9f95a23c 1735 cmd_getval(cmdctx->cmdmap, "ids", ids);
35e4c445 1736 set<int> osds;
f67539c2
TL
1737 int64_t max = 1;
1738 cmd_getval(cmdctx->cmdmap, "max", max);
35e4c445
FG
1739 int r;
1740 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
1741 r = osdmap.parse_osd_id_list(ids, &osds, &ss);
1742 });
1743 if (!r && osds.empty()) {
1744 ss << "must specify one or more OSDs";
1745 r = -EINVAL;
1746 }
f67539c2
TL
1747 if (max < (int)osds.size()) {
1748 max = osds.size();
1749 }
35e4c445
FG
1750 if (r < 0) {
1751 cmdctx->reply(r, ss);
1752 return true;
1753 }
f67539c2 1754 offline_pg_report out_report;
11fdf7f2 1755 cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
f67539c2
TL
1756 _maximize_ok_to_stop_set(
1757 osds, max, osdmap, pg_map,
1758 &out_report);
35e4c445 1759 });
f67539c2
TL
1760 if (!f) {
1761 f.reset(Formatter::create("json"));
35e4c445 1762 }
f67539c2
TL
1763 f->dump_object("ok_to_stop", out_report);
1764 f->flush(cmdctx->odata);
1765 cmdctx->odata.append("\n");
1766 if (!out_report.unknown.empty()) {
1767 ss << out_report.unknown.size() << " pgs have unknown state; "
1768 << "cannot draw any conclusions";
1769 cmdctx->reply(-EAGAIN, ss);
1770 }
1771 if (!out_report.ok_to_stop()) {
1772 ss << "unsafe to stop osd(s) at this time (" << out_report.not_ok.size() << " PGs are or would become offline)";
35e4c445 1773 cmdctx->reply(-EBUSY, ss);
f67539c2
TL
1774 } else {
1775 cmdctx->reply(0, ss);
35e4c445 1776 }
35e4c445 1777 return true;
c07f9fc5 1778 } else if (prefix == "pg force-recovery" ||
11fdf7f2
TL
1779 prefix == "pg force-backfill" ||
1780 prefix == "pg cancel-force-recovery" ||
1781 prefix == "pg cancel-force-backfill" ||
1782 prefix == "osd pool force-recovery" ||
1783 prefix == "osd pool force-backfill" ||
1784 prefix == "osd pool cancel-force-recovery" ||
1785 prefix == "osd pool cancel-force-backfill") {
1786 vector<string> vs;
1787 get_str_vec(prefix, vs);
1788 auto& granularity = vs.front();
1789 auto& forceop = vs.back();
1790 vector<pg_t> pgs;
c07f9fc5
FG
1791
1792 // figure out actual op just once
1793 int actual_op = 0;
1794 if (forceop == "force-recovery") {
1795 actual_op = OFR_RECOVERY;
1796 } else if (forceop == "force-backfill") {
1797 actual_op = OFR_BACKFILL;
1798 } else if (forceop == "cancel-force-backfill") {
1799 actual_op = OFR_BACKFILL | OFR_CANCEL;
1800 } else if (forceop == "cancel-force-recovery") {
1801 actual_op = OFR_RECOVERY | OFR_CANCEL;
1802 }
1803
11fdf7f2
TL
1804 set<pg_t> candidates; // deduped
1805 if (granularity == "pg") {
1806 // covnert pg names to pgs, discard any invalid ones while at it
1807 vector<string> pgids;
9f95a23c 1808 cmd_getval(cmdctx->cmdmap, "pgid", pgids);
11fdf7f2
TL
1809 for (auto& i : pgids) {
1810 pg_t pgid;
1811 if (!pgid.parse(i.c_str())) {
1812 ss << "invlaid pgid '" << i << "'; ";
1813 r = -EINVAL;
1814 continue;
1815 }
1816 candidates.insert(pgid);
c07f9fc5 1817 }
11fdf7f2
TL
1818 } else {
1819 // per pool
1820 vector<string> pool_names;
9f95a23c 1821 cmd_getval(cmdctx->cmdmap, "who", pool_names);
11fdf7f2
TL
1822 if (pool_names.empty()) {
1823 ss << "must specify one or more pool names";
1824 cmdctx->reply(-EINVAL, ss);
1825 return true;
1826 }
1827 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
1828 for (auto& pool_name : pool_names) {
1829 auto pool_id = osdmap.lookup_pg_pool_name(pool_name);
1830 if (pool_id < 0) {
1831 ss << "unrecognized pool '" << pool_name << "'";
1832 r = -ENOENT;
1833 return;
1834 }
1835 auto pool_pg_num = osdmap.get_pg_num(pool_id);
1836 for (int i = 0; i < pool_pg_num; i++)
1837 candidates.insert({(unsigned int)i, (uint64_t)pool_id});
1838 }
c07f9fc5 1839 });
11fdf7f2
TL
1840 if (r < 0) {
1841 cmdctx->reply(r, ss);
1842 return true;
1843 }
c07f9fc5
FG
1844 }
1845
11fdf7f2
TL
1846 cluster_state.with_pgmap([&](const PGMap& pg_map) {
1847 for (auto& i : candidates) {
1848 auto it = pg_map.pg_stat.find(i);
1849 if (it == pg_map.pg_stat.end()) {
1850 ss << "pg " << i << " does not exist; ";
1851 r = -ENOENT;
1852 continue;
1853 }
1854 auto state = it->second.state;
1855 // discard pgs for which user requests are pointless
1856 switch (actual_op) {
1857 case OFR_RECOVERY:
1858 if ((state & (PG_STATE_DEGRADED |
1859 PG_STATE_RECOVERY_WAIT |
1860 PG_STATE_RECOVERING)) == 0) {
1861 // don't return error, user script may be racing with cluster.
1862 // not fatal.
1863 ss << "pg " << i << " doesn't require recovery; ";
1864 continue;
1865 } else if (state & PG_STATE_FORCED_RECOVERY) {
1866 ss << "pg " << i << " recovery already forced; ";
1867 // return error, as it may be a bug in user script
1868 r = -EINVAL;
1869 continue;
1870 }
1871 break;
1872 case OFR_BACKFILL:
1873 if ((state & (PG_STATE_DEGRADED |
1874 PG_STATE_BACKFILL_WAIT |
1875 PG_STATE_BACKFILLING)) == 0) {
1876 ss << "pg " << i << " doesn't require backfilling; ";
1877 continue;
1878 } else if (state & PG_STATE_FORCED_BACKFILL) {
1879 ss << "pg " << i << " backfill already forced; ";
1880 r = -EINVAL;
1881 continue;
1882 }
1883 break;
1884 case OFR_BACKFILL | OFR_CANCEL:
1885 if ((state & PG_STATE_FORCED_BACKFILL) == 0) {
1886 ss << "pg " << i << " backfill not forced; ";
1887 continue;
1888 }
1889 break;
1890 case OFR_RECOVERY | OFR_CANCEL:
1891 if ((state & PG_STATE_FORCED_RECOVERY) == 0) {
1892 ss << "pg " << i << " recovery not forced; ";
1893 continue;
1894 }
1895 break;
1896 default:
1897 ceph_abort_msg("actual_op value is not supported");
1898 }
1899 pgs.push_back(i);
1900 } // for
1901 });
1902
c07f9fc5
FG
1903 // respond with error only when no pgs are correct
1904 // yes, in case of mixed errors, only the last one will be emitted,
1905 // but the message presented will be fine
11fdf7f2 1906 if (pgs.size() != 0) {
c07f9fc5
FG
1907 // clear error to not confuse users/scripts
1908 r = 0;
1909 }
1910
11fdf7f2
TL
1911 // optimize the command -> messages conversion, use only one
1912 // message per distinct OSD
c07f9fc5 1913 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
11fdf7f2
TL
1914 // group pgs to process by osd
1915 map<int, vector<spg_t>> osdpgs;
1916 for (auto& pgid : pgs) {
1917 int primary;
1918 spg_t spg;
1919 if (osdmap.get_primary_shard(pgid, &primary, &spg)) {
1920 osdpgs[primary].push_back(spg);
1921 }
1922 }
1923 for (auto& i : osdpgs) {
1924 if (osdmap.is_up(i.first)) {
1925 auto p = osd_cons.find(i.first);
1926 if (p == osd_cons.end()) {
1927 ss << "osd." << i.first << " is not currently connected";
1928 r = -EAGAIN;
1929 continue;
1930 }
1931 for (auto& con : p->second) {
1932 con->send_message(
1933 new MOSDForceRecovery(monc->get_fsid(), i.second, actual_op));
1934 }
1935 ss << "instructing pg(s) " << i.second << " on osd." << i.first
1936 << " to " << forceop << "; ";
1937 }
1938 }
1939 });
1940 ss << std::endl;
1941 cmdctx->reply(r, ss);
1942 return true;
1943 } else if (prefix == "config show" ||
1944 prefix == "config show-with-defaults") {
1945 string who;
9f95a23c
TL
1946 cmd_getval(cmdctx->cmdmap, "who", who);
1947 auto [key, valid] = DaemonKey::parse(who);
1948 if (!valid) {
1949 ss << "invalid daemon name: use <type>.<id>";
1950 cmdctx->reply(-EINVAL, ss);
1951 return true;
1952 }
11fdf7f2 1953 DaemonStatePtr daemon = daemon_state.get(key);
11fdf7f2
TL
1954 if (!daemon) {
1955 ss << "no config state for daemon " << who;
1956 cmdctx->reply(-ENOENT, ss);
1957 return true;
1958 }
1959
1960 std::lock_guard l(daemon->lock);
1961
9f95a23c
TL
1962 int r = 0;
1963 string name;
1964 if (cmd_getval(cmdctx->cmdmap, "key", name)) {
f6b5b4d7
TL
1965 // handle special options
1966 if (name == "fsid") {
1967 cmdctx->odata.append(stringify(monc->get_fsid()) + "\n");
1968 cmdctx->reply(r, ss);
1969 return true;
1970 }
11fdf7f2
TL
1971 auto p = daemon->config.find(name);
1972 if (p != daemon->config.end() &&
1973 !p->second.empty()) {
1974 cmdctx->odata.append(p->second.rbegin()->second + "\n");
1975 } else {
1976 auto& defaults = daemon->_get_config_defaults();
1977 auto q = defaults.find(name);
1978 if (q != defaults.end()) {
1979 cmdctx->odata.append(q->second + "\n");
1980 } else {
1981 r = -ENOENT;
1982 }
1983 }
1984 } else if (daemon->config_defaults_bl.length() > 0) {
1985 TextTable tbl;
1986 if (f) {
1987 f->open_array_section("config");
1988 } else {
1989 tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
1990 tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT);
1991 tbl.define_column("SOURCE", TextTable::LEFT, TextTable::LEFT);
1992 tbl.define_column("OVERRIDES", TextTable::LEFT, TextTable::LEFT);
1993 tbl.define_column("IGNORES", TextTable::LEFT, TextTable::LEFT);
1994 }
1995 if (prefix == "config show") {
1996 // show
1997 for (auto& i : daemon->config) {
1998 dout(20) << " " << i.first << " -> " << i.second << dendl;
1999 if (i.second.empty()) {
c07f9fc5
FG
2000 continue;
2001 }
11fdf7f2
TL
2002 if (f) {
2003 f->open_object_section("value");
2004 f->dump_string("name", i.first);
2005 f->dump_string("value", i.second.rbegin()->second);
2006 f->dump_string("source", ceph_conf_level_name(
2007 i.second.rbegin()->first));
2008 if (i.second.size() > 1) {
2009 f->open_array_section("overrides");
2010 auto j = i.second.rend();
2011 for (--j; j != i.second.rbegin(); --j) {
2012 f->open_object_section("value");
2013 f->dump_string("source", ceph_conf_level_name(j->first));
2014 f->dump_string("value", j->second);
2015 f->close_section();
2016 }
2017 f->close_section();
2018 }
2019 if (daemon->ignored_mon_config.count(i.first)) {
2020 f->dump_string("ignores", "mon");
2021 }
2022 f->close_section();
2023 } else {
2024 tbl << i.first;
2025 tbl << i.second.rbegin()->second;
2026 tbl << ceph_conf_level_name(i.second.rbegin()->first);
2027 if (i.second.size() > 1) {
2028 list<string> ov;
2029 auto j = i.second.rend();
2030 for (--j; j != i.second.rbegin(); --j) {
2031 if (j->second == i.second.rbegin()->second) {
2032 ov.push_front(string("(") + ceph_conf_level_name(j->first) +
2033 string("[") + j->second + string("]") +
2034 string(")"));
2035 } else {
2036 ov.push_front(ceph_conf_level_name(j->first) +
2037 string("[") + j->second + string("]"));
2038
2039 }
2040 }
2041 tbl << ov;
2042 } else {
2043 tbl << "";
2044 }
2045 tbl << (daemon->ignored_mon_config.count(i.first) ? "mon" : "");
2046 tbl << TextTable::endrow;
2047 }
2048 }
2049 } else {
2050 // show-with-defaults
2051 auto& defaults = daemon->_get_config_defaults();
2052 for (auto& i : defaults) {
2053 if (f) {
2054 f->open_object_section("value");
2055 f->dump_string("name", i.first);
2056 } else {
2057 tbl << i.first;
2058 }
2059 auto j = daemon->config.find(i.first);
2060 if (j != daemon->config.end() && !j->second.empty()) {
2061 // have config
2062 if (f) {
2063 f->dump_string("value", j->second.rbegin()->second);
2064 f->dump_string("source", ceph_conf_level_name(
2065 j->second.rbegin()->first));
2066 if (j->second.size() > 1) {
2067 f->open_array_section("overrides");
2068 auto k = j->second.rend();
2069 for (--k; k != j->second.rbegin(); --k) {
2070 f->open_object_section("value");
2071 f->dump_string("source", ceph_conf_level_name(k->first));
2072 f->dump_string("value", k->second);
2073 f->close_section();
2074 }
2075 f->close_section();
2076 }
2077 if (daemon->ignored_mon_config.count(i.first)) {
2078 f->dump_string("ignores", "mon");
2079 }
2080 f->close_section();
2081 } else {
2082 tbl << j->second.rbegin()->second;
2083 tbl << ceph_conf_level_name(j->second.rbegin()->first);
2084 if (j->second.size() > 1) {
2085 list<string> ov;
2086 auto k = j->second.rend();
2087 for (--k; k != j->second.rbegin(); --k) {
2088 if (k->second == j->second.rbegin()->second) {
2089 ov.push_front(string("(") + ceph_conf_level_name(k->first) +
2090 string("[") + k->second + string("]") +
2091 string(")"));
2092 } else {
2093 ov.push_front(ceph_conf_level_name(k->first) +
2094 string("[") + k->second + string("]"));
2095 }
2096 }
2097 tbl << ov;
2098 } else {
2099 tbl << "";
2100 }
2101 tbl << (daemon->ignored_mon_config.count(i.first) ? "mon" : "");
2102 tbl << TextTable::endrow;
2103 }
2104 } else {
2105 // only have default
2106 if (f) {
2107 f->dump_string("value", i.second);
2108 f->dump_string("source", ceph_conf_level_name(CONF_DEFAULT));
2109 f->close_section();
2110 } else {
2111 tbl << i.second;
2112 tbl << ceph_conf_level_name(CONF_DEFAULT);
2113 tbl << "";
2114 tbl << "";
2115 tbl << TextTable::endrow;
2116 }
c07f9fc5 2117 }
c07f9fc5
FG
2118 }
2119 }
11fdf7f2
TL
2120 if (f) {
2121 f->close_section();
2122 f->flush(cmdctx->odata);
2123 } else {
2124 cmdctx->odata.append(stringify(tbl));
2125 }
2126 }
c07f9fc5
FG
2127 cmdctx->reply(r, ss);
2128 return true;
11fdf7f2
TL
2129 } else if (prefix == "device ls") {
2130 set<string> devids;
2131 TextTable tbl;
2132 if (f) {
2133 f->open_array_section("devices");
2134 daemon_state.with_devices([&f](const DeviceState& dev) {
2135 f->dump_object("device", dev);
2136 });
2137 f->close_section();
2138 f->flush(cmdctx->odata);
2139 } else {
2140 tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT);
2141 tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT);
2142 tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT);
f67539c2 2143 tbl.define_column("WEAR", TextTable::RIGHT, TextTable::RIGHT);
11fdf7f2
TL
2144 tbl.define_column("LIFE EXPECTANCY", TextTable::LEFT, TextTable::LEFT);
2145 auto now = ceph_clock_now();
2146 daemon_state.with_devices([&tbl, now](const DeviceState& dev) {
2147 string h;
9f95a23c 2148 for (auto& i : dev.attachments) {
11fdf7f2
TL
2149 if (h.size()) {
2150 h += " ";
2151 }
9f95a23c 2152 h += std::get<0>(i) + ":" + std::get<1>(i);
11fdf7f2
TL
2153 }
2154 string d;
2155 for (auto& i : dev.daemons) {
2156 if (d.size()) {
2157 d += " ";
2158 }
2159 d += to_string(i);
2160 }
f67539c2
TL
2161 char wear_level_str[16] = {0};
2162 if (dev.wear_level >= 0) {
2163 snprintf(wear_level_str, sizeof(wear_level_str)-1, "%d%%",
2164 (int)(100.1 * dev.wear_level));
2165 }
11fdf7f2
TL
2166 tbl << dev.devid
2167 << h
2168 << d
f67539c2 2169 << wear_level_str
11fdf7f2
TL
2170 << dev.get_life_expectancy_str(now)
2171 << TextTable::endrow;
2172 });
2173 cmdctx->odata.append(stringify(tbl));
2174 }
2175 cmdctx->reply(0, ss);
2176 return true;
2177 } else if (prefix == "device ls-by-daemon") {
2178 string who;
9f95a23c
TL
2179 cmd_getval(cmdctx->cmdmap, "who", who);
2180 if (auto [k, valid] = DaemonKey::parse(who); !valid) {
11fdf7f2
TL
2181 ss << who << " is not a valid daemon name";
2182 r = -EINVAL;
2183 } else {
2184 auto dm = daemon_state.get(k);
2185 if (dm) {
2186 if (f) {
2187 f->open_array_section("devices");
2188 for (auto& i : dm->devices) {
2189 daemon_state.with_device(i.first, [&f] (const DeviceState& dev) {
2190 f->dump_object("device", dev);
2191 });
2192 }
2193 f->close_section();
2194 f->flush(cmdctx->odata);
2195 } else {
2196 TextTable tbl;
2197 tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT);
2198 tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT);
2199 tbl.define_column("EXPECTED FAILURE", TextTable::LEFT,
2200 TextTable::LEFT);
2201 auto now = ceph_clock_now();
2202 for (auto& i : dm->devices) {
2203 daemon_state.with_device(
2204 i.first, [&tbl, now] (const DeviceState& dev) {
2205 string h;
9f95a23c 2206 for (auto& i : dev.attachments) {
11fdf7f2
TL
2207 if (h.size()) {
2208 h += " ";
2209 }
9f95a23c 2210 h += std::get<0>(i) + ":" + std::get<1>(i);
11fdf7f2
TL
2211 }
2212 tbl << dev.devid
2213 << h
2214 << dev.get_life_expectancy_str(now)
2215 << TextTable::endrow;
2216 });
2217 }
2218 cmdctx->odata.append(stringify(tbl));
2219 }
2220 } else {
2221 r = -ENOENT;
2222 ss << "daemon " << who << " not found";
2223 }
2224 cmdctx->reply(r, ss);
2225 }
2226 } else if (prefix == "device ls-by-host") {
2227 string host;
9f95a23c 2228 cmd_getval(cmdctx->cmdmap, "host", host);
11fdf7f2
TL
2229 set<string> devids;
2230 daemon_state.list_devids_by_server(host, &devids);
2231 if (f) {
2232 f->open_array_section("devices");
2233 for (auto& devid : devids) {
2234 daemon_state.with_device(
2235 devid, [&f] (const DeviceState& dev) {
2236 f->dump_object("device", dev);
7c673cae 2237 });
11fdf7f2
TL
2238 }
2239 f->close_section();
2240 f->flush(cmdctx->odata);
2241 } else {
2242 TextTable tbl;
2243 tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT);
2244 tbl.define_column("DEV", TextTable::LEFT, TextTable::LEFT);
2245 tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT);
2246 tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, TextTable::LEFT);
2247 auto now = ceph_clock_now();
2248 for (auto& devid : devids) {
2249 daemon_state.with_device(
2250 devid, [&tbl, &host, now] (const DeviceState& dev) {
2251 string n;
9f95a23c
TL
2252 for (auto& j : dev.attachments) {
2253 if (std::get<0>(j) == host) {
11fdf7f2
TL
2254 if (n.size()) {
2255 n += " ";
2256 }
9f95a23c 2257 n += std::get<1>(j);
11fdf7f2
TL
2258 }
2259 }
2260 string d;
2261 for (auto& i : dev.daemons) {
2262 if (d.size()) {
2263 d += " ";
2264 }
2265 d += to_string(i);
2266 }
2267 tbl << dev.devid
2268 << n
2269 << d
2270 << dev.get_life_expectancy_str(now)
2271 << TextTable::endrow;
2272 });
2273 }
2274 cmdctx->odata.append(stringify(tbl));
2275 }
2276 cmdctx->reply(0, ss);
2277 return true;
2278 } else if (prefix == "device info") {
2279 string devid;
9f95a23c 2280 cmd_getval(cmdctx->cmdmap, "devid", devid);
11fdf7f2
TL
2281 int r = 0;
2282 ostringstream rs;
2283 if (!daemon_state.with_device(devid,
2284 [&f, &rs] (const DeviceState& dev) {
2285 if (f) {
2286 f->dump_object("device", dev);
2287 } else {
2288 dev.print(rs);
2289 }
2290 })) {
2291 ss << "device " << devid << " not found";
2292 r = -ENOENT;
2293 } else {
2294 if (f) {
2295 f->flush(cmdctx->odata);
2296 } else {
2297 cmdctx->odata.append(rs.str());
2298 }
2299 }
2300 cmdctx->reply(r, ss);
2301 return true;
2302 } else if (prefix == "device set-life-expectancy") {
2303 string devid;
9f95a23c 2304 cmd_getval(cmdctx->cmdmap, "devid", devid);
11fdf7f2 2305 string from_str, to_str;
9f95a23c
TL
2306 cmd_getval(cmdctx->cmdmap, "from", from_str);
2307 cmd_getval(cmdctx->cmdmap, "to", to_str);
11fdf7f2
TL
2308 utime_t from, to;
2309 if (!from.parse(from_str)) {
2310 ss << "unable to parse datetime '" << from_str << "'";
2311 r = -EINVAL;
2312 cmdctx->reply(r, ss);
2313 } else if (to_str.size() && !to.parse(to_str)) {
2314 ss << "unable to parse datetime '" << to_str << "'";
2315 r = -EINVAL;
2316 cmdctx->reply(r, ss);
2317 } else {
2318 map<string,string> meta;
2319 daemon_state.with_device_create(
2320 devid,
2321 [from, to, &meta] (DeviceState& dev) {
2322 dev.set_life_expectancy(from, to, ceph_clock_now());
2323 meta = dev.metadata;
2324 });
2325 json_spirit::Object json_object;
2326 for (auto& i : meta) {
2327 json_spirit::Config::add(json_object, i.first, i.second);
2328 }
2329 bufferlist json;
2330 json.append(json_spirit::write(json_object));
2331 const string cmd =
2332 "{"
2333 "\"prefix\": \"config-key set\", "
2334 "\"key\": \"device/" + devid + "\""
2335 "}";
2336 auto on_finish = new ReplyOnFinish(cmdctx);
2337 monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish);
2338 }
2339 return true;
2340 } else if (prefix == "device rm-life-expectancy") {
2341 string devid;
9f95a23c 2342 cmd_getval(cmdctx->cmdmap, "devid", devid);
11fdf7f2
TL
2343 map<string,string> meta;
2344 if (daemon_state.with_device_write(devid, [&meta] (DeviceState& dev) {
2345 dev.rm_life_expectancy();
2346 meta = dev.metadata;
2347 })) {
2348 string cmd;
2349 bufferlist json;
2350 if (meta.empty()) {
2351 cmd =
2352 "{"
2353 "\"prefix\": \"config-key rm\", "
2354 "\"key\": \"device/" + devid + "\""
2355 "}";
2356 } else {
2357 json_spirit::Object json_object;
2358 for (auto& i : meta) {
2359 json_spirit::Config::add(json_object, i.first, i.second);
2360 }
2361 json.append(json_spirit::write(json_object));
2362 cmd =
2363 "{"
2364 "\"prefix\": \"config-key set\", "
2365 "\"key\": \"device/" + devid + "\""
2366 "}";
2367 }
2368 auto on_finish = new ReplyOnFinish(cmdctx);
2369 monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish);
2370 } else {
2371 cmdctx->reply(0, ss);
2372 }
2373 return true;
2374 } else {
2375 if (!pgmap_ready) {
2376 ss << "Warning: due to ceph-mgr restart, some PG states may not be up to date\n";
2377 }
2378 if (f) {
2379 f->open_object_section("pg_info");
2380 f->dump_bool("pg_ready", pgmap_ready);
2381 }
2382
2383 // fall back to feeding command to PGMap
2384 r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
2385 return process_pg_map_command(prefix, cmdctx->cmdmap, pg_map, osdmap,
2386 f.get(), &ss, &cmdctx->odata);
7c673cae
FG
2387 });
2388
11fdf7f2
TL
2389 if (f) {
2390 f->close_section();
2391 }
7c673cae 2392 if (r != -EOPNOTSUPP) {
11fdf7f2
TL
2393 if (f) {
2394 f->flush(cmdctx->odata);
2395 }
7c673cae
FG
2396 cmdctx->reply(r, ss);
2397 return true;
2398 }
2399 }
2400
11fdf7f2 2401 // Was the command unfound?
92f5a8d4 2402 if (py_command.cmdstring.empty()) {
7c673cae
FG
2403 ss << "No handler found for '" << prefix << "'";
2404 dout(4) << "No handler found for '" << prefix << "'" << dendl;
2405 cmdctx->reply(-EINVAL, ss);
2406 return true;
7c673cae 2407 }
11fdf7f2 2408
f67539c2 2409 dout(10) << "passing through command '" << prefix << "' size " << cmdctx->cmdmap.size() << dendl;
9f95a23c
TL
2410 finisher.queue(new LambdaContext([this, cmdctx, session, py_command, prefix]
2411 (int r_) mutable {
11fdf7f2
TL
2412 std::stringstream ss;
2413
f67539c2
TL
2414 dout(10) << "dispatching command '" << prefix << "' size " << cmdctx->cmdmap.size() << dendl;
2415
11fdf7f2 2416 // Validate that the module is enabled
92f5a8d4
TL
2417 auto& py_handler_name = py_command.module_name;
2418 PyModuleRef module = py_modules.get_module(py_handler_name);
11fdf7f2
TL
2419 ceph_assert(module);
2420 if (!module->is_enabled()) {
92f5a8d4 2421 ss << "Module '" << py_handler_name << "' is not enabled (required by "
11fdf7f2 2422 "command '" << prefix << "'): use `ceph mgr module enable "
92f5a8d4 2423 << py_handler_name << "` to enable it";
11fdf7f2
TL
2424 dout(4) << ss.str() << dendl;
2425 cmdctx->reply(-EOPNOTSUPP, ss);
2426 return;
2427 }
2428
2429 // Hack: allow the self-test method to run on unhealthy modules.
2430 // Fix this in future by creating a special path for self test rather
2431 // than having the hook be a normal module command.
92f5a8d4 2432 std::string self_test_prefix = py_handler_name + " " + "self-test";
11fdf7f2
TL
2433
2434 // Validate that the module is healthy
2435 bool accept_command;
2436 if (module->is_loaded()) {
2437 if (module->get_can_run() && !module->is_failed()) {
2438 // Healthy module
2439 accept_command = true;
2440 } else if (self_test_prefix == prefix) {
2441 // Unhealthy, but allow because it's a self test command
2442 accept_command = true;
2443 } else {
2444 accept_command = false;
92f5a8d4 2445 ss << "Module '" << py_handler_name << "' has experienced an error and "
11fdf7f2
TL
2446 "cannot handle commands: " << module->get_error_string();
2447 }
2448 } else {
2449 // Module not loaded
2450 accept_command = false;
92f5a8d4 2451 ss << "Module '" << py_handler_name << "' failed to load and "
11fdf7f2
TL
2452 "cannot handle commands: " << module->get_error_string();
2453 }
2454
2455 if (!accept_command) {
2456 dout(4) << ss.str() << dendl;
2457 cmdctx->reply(-EIO, ss);
2458 return;
2459 }
2460
2461 std::stringstream ds;
9f95a23c 2462 bufferlist inbl = cmdctx->data;
92f5a8d4
TL
2463 int r = py_modules.handle_command(py_command, *session, cmdctx->cmdmap,
2464 inbl, &ds, &ss);
2465 if (r == -EACCES) {
2466 log_access_denied(cmdctx, session, ss);
2467 }
2468
11fdf7f2
TL
2469 cmdctx->odata.append(ds);
2470 cmdctx->reply(r, ss);
f67539c2 2471 dout(10) << " command returned " << r << dendl;
11fdf7f2
TL
2472 }));
2473 return true;
7c673cae 2474}
31f18b77 2475
224ce89b
WB
2476void DaemonServer::_prune_pending_service_map()
2477{
2478 utime_t cutoff = ceph_clock_now();
11fdf7f2 2479 cutoff -= g_conf().get_val<double>("mgr_service_beacon_grace");
224ce89b
WB
2480 auto p = pending_service_map.services.begin();
2481 while (p != pending_service_map.services.end()) {
2482 auto q = p->second.daemons.begin();
2483 while (q != p->second.daemons.end()) {
9f95a23c 2484 DaemonKey key{p->first, q->first};
224ce89b 2485 if (!daemon_state.exists(key)) {
1911f103
TL
2486 if (ServiceMap::is_normal_ceph_entity(p->first)) {
2487 dout(10) << "daemon " << key << " in service map but not in daemon state "
2488 << "index -- force pruning" << dendl;
2489 q = p->second.daemons.erase(q);
2490 pending_service_map_dirty = pending_service_map.epoch;
2491 } else {
2492 derr << "missing key " << key << dendl;
2493 ++q;
2494 }
2495
2496 continue;
224ce89b 2497 }
1911f103 2498
224ce89b 2499 auto daemon = daemon_state.get(key);
11fdf7f2 2500 std::lock_guard l(daemon->lock);
224ce89b
WB
2501 if (daemon->last_service_beacon == utime_t()) {
2502 // we must have just restarted; assume they are alive now.
2503 daemon->last_service_beacon = ceph_clock_now();
2504 ++q;
2505 continue;
2506 }
2507 if (daemon->last_service_beacon < cutoff) {
2508 dout(10) << "pruning stale " << p->first << "." << q->first
2509 << " last_beacon " << daemon->last_service_beacon << dendl;
2510 q = p->second.daemons.erase(q);
2511 pending_service_map_dirty = pending_service_map.epoch;
2512 } else {
2513 ++q;
2514 }
2515 }
2516 if (p->second.daemons.empty()) {
2517 p = pending_service_map.services.erase(p);
2518 pending_service_map_dirty = pending_service_map.epoch;
2519 } else {
2520 ++p;
2521 }
2522 }
2523}
2524
31f18b77
FG
2525void DaemonServer::send_report()
2526{
224ce89b 2527 if (!pgmap_ready) {
11fdf7f2 2528 if (ceph_clock_now() - started_at > g_conf().get_val<int64_t>("mgr_stats_period") * 4.0) {
224ce89b
WB
2529 pgmap_ready = true;
2530 reported_osds.clear();
2531 dout(1) << "Giving up on OSDs that haven't reported yet, sending "
2532 << "potentially incomplete PG state to mon" << dendl;
2533 } else {
2534 dout(1) << "Not sending PG status to monitor yet, waiting for OSDs"
2535 << dendl;
2536 return;
2537 }
2538 }
2539
9f95a23c 2540 auto m = ceph::make_message<MMonMgrReport>();
f67539c2 2541 m->gid = monc->get_global_id();
c07f9fc5 2542 py_modules.get_health_checks(&m->health_checks);
11fdf7f2 2543 py_modules.get_progress_events(&m->progress_events);
c07f9fc5 2544
11fdf7f2 2545 cluster_state.with_mutable_pgmap([&](PGMap& pg_map) {
31f18b77
FG
2546 cluster_state.update_delta_stats();
2547
224ce89b
WB
2548 if (pending_service_map.epoch) {
2549 _prune_pending_service_map();
2550 if (pending_service_map_dirty >= pending_service_map.epoch) {
2551 pending_service_map.modified = ceph_clock_now();
11fdf7f2 2552 encode(pending_service_map, m->service_map_bl, CEPH_FEATURES_ALL);
224ce89b
WB
2553 dout(10) << "sending service_map e" << pending_service_map.epoch
2554 << dendl;
2555 pending_service_map.epoch++;
2556 }
2557 }
2558
31f18b77
FG
2559 cluster_state.with_osdmap([&](const OSDMap& osdmap) {
2560 // FIXME: no easy way to get mon features here. this will do for
2561 // now, though, as long as we don't make a backward-incompat change.
2562 pg_map.encode_digest(osdmap, m->get_data(), CEPH_FEATURES_ALL);
2563 dout(10) << pg_map << dendl;
224ce89b
WB
2564
2565 pg_map.get_health_checks(g_ceph_context, osdmap,
2566 &m->health_checks);
c07f9fc5 2567
224ce89b
WB
2568 dout(10) << m->health_checks.checks.size() << " health checks"
2569 << dendl;
2570 dout(20) << "health checks:\n";
2571 JSONFormatter jf(true);
2572 jf.dump_object("health_checks", m->health_checks);
2573 jf.flush(*_dout);
2574 *_dout << dendl;
9f95a23c 2575 if (osdmap.require_osd_release >= ceph_release_t::luminous) {
a8e16298
TL
2576 clog->debug() << "pgmap v" << pg_map.version << ": " << pg_map;
2577 }
31f18b77
FG
2578 });
2579 });
b32b8144 2580
11fdf7f2
TL
2581 map<daemon_metric, unique_ptr<DaemonHealthMetricCollector>> accumulated;
2582 for (auto service : {"osd", "mon"} ) {
2583 auto daemons = daemon_state.get_by_service(service);
2584 for (const auto& [key,state] : daemons) {
2585 std::lock_guard l{state->lock};
2586 for (const auto& metric : state->daemon_health_metrics) {
2587 auto acc = accumulated.find(metric.get_type());
2588 if (acc == accumulated.end()) {
2589 auto collector = DaemonHealthMetricCollector::create(metric.get_type());
2590 if (!collector) {
9f95a23c 2591 derr << __func__ << " " << key
11fdf7f2
TL
2592 << " sent me an unknown health metric: "
2593 << std::hex << static_cast<uint8_t>(metric.get_type())
2594 << std::dec << dendl;
2595 continue;
2596 }
2597 dout(20) << " + " << state->key << " "
2598 << metric << dendl;
2599 tie(acc, std::ignore) = accumulated.emplace(metric.get_type(),
2600 std::move(collector));
2601 }
2602 acc->second->update(key, metric);
b32b8144 2603 }
b32b8144
FG
2604 }
2605 }
2606 for (const auto& acc : accumulated) {
2607 acc.second->summarize(m->health_checks);
2608 }
31f18b77
FG
2609 // TODO? We currently do not notify the PyModules
2610 // TODO: respect needs_send, so we send the report only if we are asked to do
2611 // so, or the state is updated.
9f95a23c 2612 monc->send_mon_message(std::move(m));
31f18b77 2613}
224ce89b 2614
11fdf7f2
TL
2615void DaemonServer::adjust_pgs()
2616{
2617 dout(20) << dendl;
2618 unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
2619 double max_misplaced = g_conf().get_val<double>("target_max_misplaced_ratio");
2620 bool aggro = g_conf().get_val<bool>("mgr_debug_aggressive_pg_num_changes");
2621
2622 map<string,unsigned> pg_num_to_set;
2623 map<string,unsigned> pgp_num_to_set;
2624 set<pg_t> upmaps_to_clear;
2625 cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
2626 unsigned creating_or_unknown = 0;
2627 for (auto& i : pg_map.num_pg_by_state) {
2628 if ((i.first & (PG_STATE_CREATING)) ||
2629 i.first == 0) {
2630 creating_or_unknown += i.second;
2631 }
2632 }
2633 unsigned left = max;
2634 if (creating_or_unknown >= max) {
2635 return;
2636 }
2637 left -= creating_or_unknown;
2638 dout(10) << "creating_or_unknown " << creating_or_unknown
2639 << " max_creating " << max
2640 << " left " << left
2641 << dendl;
2642
2643 // FIXME: These checks are fundamentally racy given that adjust_pgs()
2644 // can run more frequently than we get updated pg stats from OSDs. We
2645 // may make multiple adjustments with stale informaiton.
2646 double misplaced_ratio, degraded_ratio;
2647 double inactive_pgs_ratio, unknown_pgs_ratio;
2648 pg_map.get_recovery_stats(&misplaced_ratio, &degraded_ratio,
2649 &inactive_pgs_ratio, &unknown_pgs_ratio);
2650 dout(20) << "misplaced_ratio " << misplaced_ratio
2651 << " degraded_ratio " << degraded_ratio
2652 << " inactive_pgs_ratio " << inactive_pgs_ratio
2653 << " unknown_pgs_ratio " << unknown_pgs_ratio
2654 << "; target_max_misplaced_ratio " << max_misplaced
2655 << dendl;
2656
2657 for (auto& i : osdmap.get_pools()) {
2658 const pg_pool_t& p = i.second;
2659
2660 // adjust pg_num?
2661 if (p.get_pg_num_target() != p.get_pg_num()) {
2662 dout(20) << "pool " << i.first
2663 << " pg_num " << p.get_pg_num()
2664 << " target " << p.get_pg_num_target()
2665 << dendl;
2666 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
2667 dout(10) << "pool " << i.first
2668 << " pg_num_target " << p.get_pg_num_target()
2669 << " pg_num " << p.get_pg_num()
2670 << " - still creating initial pgs"
2671 << dendl;
2672 } else if (p.get_pg_num_target() < p.get_pg_num()) {
2673 // pg_num decrease (merge)
2674 pg_t merge_source(p.get_pg_num() - 1, i.first);
2675 pg_t merge_target = merge_source.get_parent();
2676 bool ok = true;
2677
2678 if (p.get_pg_num() != p.get_pg_num_pending()) {
2679 dout(10) << "pool " << i.first
2680 << " pg_num_target " << p.get_pg_num_target()
2681 << " pg_num " << p.get_pg_num()
2682 << " - decrease and pg_num_pending != pg_num, waiting"
2683 << dendl;
2684 ok = false;
2685 } else if (p.get_pg_num() == p.get_pgp_num()) {
2686 dout(10) << "pool " << i.first
2687 << " pg_num_target " << p.get_pg_num_target()
2688 << " pg_num " << p.get_pg_num()
2689 << " - decrease blocked by pgp_num "
2690 << p.get_pgp_num()
2691 << dendl;
2692 ok = false;
2693 }
9f95a23c 2694 vector<int32_t> source_acting;
11fdf7f2
TL
2695 for (auto &merge_participant : {merge_source, merge_target}) {
2696 bool is_merge_source = merge_participant == merge_source;
2697 if (osdmap.have_pg_upmaps(merge_participant)) {
2698 dout(10) << "pool " << i.first
2699 << " pg_num_target " << p.get_pg_num_target()
2700 << " pg_num " << p.get_pg_num()
2701 << (is_merge_source ? " - merge source " : " - merge target ")
2702 << merge_participant
2703 << " has upmap" << dendl;
2704 upmaps_to_clear.insert(merge_participant);
2705 ok = false;
2706 }
2707 auto q = pg_map.pg_stat.find(merge_participant);
2708 if (q == pg_map.pg_stat.end()) {
2709 dout(10) << "pool " << i.first
2710 << " pg_num_target " << p.get_pg_num_target()
2711 << " pg_num " << p.get_pg_num()
2712 << " - no state for " << merge_participant
2713 << (is_merge_source ? " (merge source)" : " (merge target)")
2714 << dendl;
2715 ok = false;
2716 } else if ((q->second.state & (PG_STATE_ACTIVE | PG_STATE_CLEAN)) !=
2717 (PG_STATE_ACTIVE | PG_STATE_CLEAN)) {
2718 dout(10) << "pool " << i.first
2719 << " pg_num_target " << p.get_pg_num_target()
2720 << " pg_num " << p.get_pg_num()
2721 << (is_merge_source ? " - merge source " : " - merge target ")
2722 << merge_participant
2723 << " not clean (" << pg_state_string(q->second.state)
2724 << ")" << dendl;
2725 ok = false;
2726 }
9f95a23c
TL
2727 if (is_merge_source) {
2728 source_acting = q->second.acting;
2729 } else if (ok && q->second.acting != source_acting) {
2730 dout(10) << "pool " << i.first
2731 << " pg_num_target " << p.get_pg_num_target()
2732 << " pg_num " << p.get_pg_num()
2733 << (is_merge_source ? " - merge source " : " - merge target ")
2734 << merge_participant
2735 << " acting does not match (source " << source_acting
2736 << " != target " << q->second.acting
2737 << ")" << dendl;
2738 ok = false;
2739 }
11fdf7f2
TL
2740 }
2741
2742 if (ok) {
2743 unsigned target = p.get_pg_num() - 1;
2744 dout(10) << "pool " << i.first
2745 << " pg_num_target " << p.get_pg_num_target()
2746 << " pg_num " << p.get_pg_num()
2747 << " -> " << target
2748 << " (merging " << merge_source
2749 << " and " << merge_target
2750 << ")" << dendl;
2751 pg_num_to_set[osdmap.get_pool_name(i.first)] = target;
9f95a23c 2752 continue;
11fdf7f2
TL
2753 }
2754 } else if (p.get_pg_num_target() > p.get_pg_num()) {
2755 // pg_num increase (split)
2756 bool active = true;
2757 auto q = pg_map.num_pg_by_pool_state.find(i.first);
2758 if (q != pg_map.num_pg_by_pool_state.end()) {
2759 for (auto& j : q->second) {
2760 if ((j.first & (PG_STATE_ACTIVE|PG_STATE_PEERED)) == 0) {
2761 dout(20) << "pool " << i.first << " has " << j.second
2762 << " pgs in " << pg_state_string(j.first)
2763 << dendl;
2764 active = false;
2765 break;
2766 }
2767 }
2768 } else {
2769 active = false;
2770 }
20effc67
TL
2771 unsigned pg_gap = p.get_pg_num() - p.get_pgp_num();
2772 unsigned max_jump = cct->_conf->mgr_max_pg_num_change;
11fdf7f2
TL
2773 if (!active) {
2774 dout(10) << "pool " << i.first
2775 << " pg_num_target " << p.get_pg_num_target()
2776 << " pg_num " << p.get_pg_num()
2777 << " - not all pgs active"
2778 << dendl;
20effc67
TL
2779 } else if (pg_gap >= max_jump) {
2780 dout(10) << "pool " << i.first
2781 << " pg_num " << p.get_pg_num()
2782 << " - pgp_num " << p.get_pgp_num()
2783 << " gap > max_pg_num_change " << max_jump
2784 << " - must scale pgp_num first"
2785 << dendl;
11fdf7f2
TL
2786 } else {
2787 unsigned add = std::min(
20effc67 2788 std::min(left, max_jump - pg_gap),
11fdf7f2
TL
2789 p.get_pg_num_target() - p.get_pg_num());
2790 unsigned target = p.get_pg_num() + add;
2791 left -= add;
2792 dout(10) << "pool " << i.first
2793 << " pg_num_target " << p.get_pg_num_target()
2794 << " pg_num " << p.get_pg_num()
2795 << " -> " << target << dendl;
2796 pg_num_to_set[osdmap.get_pool_name(i.first)] = target;
2797 }
2798 }
2799 }
2800
2801 // adjust pgp_num?
2802 unsigned target = std::min(p.get_pg_num_pending(),
2803 p.get_pgp_num_target());
2804 if (target != p.get_pgp_num()) {
2805 dout(20) << "pool " << i.first
2806 << " pgp_num_target " << p.get_pgp_num_target()
2807 << " pgp_num " << p.get_pgp_num()
2808 << " -> " << target << dendl;
2809 if (target > p.get_pgp_num() &&
2810 p.get_pgp_num() == p.get_pg_num()) {
2811 dout(10) << "pool " << i.first
2812 << " pgp_num_target " << p.get_pgp_num_target()
2813 << " pgp_num " << p.get_pgp_num()
2814 << " - increase blocked by pg_num " << p.get_pg_num()
2815 << dendl;
2816 } else if (!aggro && (inactive_pgs_ratio > 0 ||
2817 degraded_ratio > 0 ||
2818 unknown_pgs_ratio > 0)) {
2819 dout(10) << "pool " << i.first
2820 << " pgp_num_target " << p.get_pgp_num_target()
2821 << " pgp_num " << p.get_pgp_num()
2822 << " - inactive|degraded|unknown pgs, deferring pgp_num"
2823 << " update" << dendl;
2824 } else if (!aggro && (misplaced_ratio > max_misplaced)) {
2825 dout(10) << "pool " << i.first
2826 << " pgp_num_target " << p.get_pgp_num_target()
2827 << " pgp_num " << p.get_pgp_num()
2828 << " - misplaced_ratio " << misplaced_ratio
2829 << " > max " << max_misplaced
2830 << ", deferring pgp_num update" << dendl;
2831 } else {
2832 // NOTE: this calculation assumes objects are
2833 // basically uniformly distributed across all PGs
2834 // (regardless of pool), which is probably not
2835 // perfectly correct, but it's a start. make no
2836 // single adjustment that's more than half of the
2837 // max_misplaced, to somewhat limit the magnitude of
2838 // our potential error here.
20effc67 2839 unsigned next;
9f95a23c 2840 static constexpr unsigned MAX_NUM_OBJECTS_PER_PG_FOR_LEAP = 1;
81eedcae
TL
2841 pool_stat_t s = pg_map.get_pg_pool_sum_stat(i.first);
2842 if (aggro ||
2843 // pool is (virtually) empty; just jump to final pgp_num?
2844 (p.get_pgp_num_target() > p.get_pgp_num() &&
9f95a23c
TL
2845 s.stats.sum.num_objects <= (MAX_NUM_OBJECTS_PER_PG_FOR_LEAP *
2846 p.get_pgp_num_target()))) {
11fdf7f2
TL
2847 next = target;
2848 } else {
2849 double room =
2850 std::min<double>(max_misplaced - misplaced_ratio,
81eedcae 2851 max_misplaced / 2.0);
11fdf7f2
TL
2852 unsigned estmax = std::max<unsigned>(
2853 (double)p.get_pg_num() * room, 1u);
b3b6e05e
TL
2854 unsigned next_min = 0;
2855 if (p.get_pgp_num() > estmax) {
2856 next_min = p.get_pgp_num() - estmax;
2857 }
9f95a23c 2858 next = std::clamp(target,
b3b6e05e 2859 next_min,
9f95a23c 2860 p.get_pgp_num() + estmax);
11fdf7f2 2861 dout(20) << " room " << room << " estmax " << estmax
9f95a23c
TL
2862 << " delta " << (target-p.get_pgp_num())
2863 << " next " << next << dendl;
81eedcae
TL
2864 if (p.get_pgp_num_target() == p.get_pg_num_target() &&
2865 p.get_pgp_num_target() < p.get_pg_num()) {
11fdf7f2
TL
2866 // since pgp_num is tracking pg_num, ceph is handling
2867 // pgp_num. so, be responsible: don't let pgp_num get
2868 // too far out ahead of merges (if we are merging).
2869 // this avoids moving lots of unmerged pgs onto a
2870 // small number of OSDs where we might blow out the
2871 // per-osd pg max.
2872 unsigned max_outpace_merges =
2873 std::max<unsigned>(8, p.get_pg_num() * max_misplaced);
2874 if (next + max_outpace_merges < p.get_pg_num()) {
2875 next = p.get_pg_num() - max_outpace_merges;
2876 dout(10) << " using next " << next
2877 << " to avoid outpacing merges (max_outpace_merges "
2878 << max_outpace_merges << ")" << dendl;
2879 }
2880 }
2881 }
522d829b
TL
2882 if (next != p.get_pgp_num()) {
2883 dout(10) << "pool " << i.first
2884 << " pgp_num_target " << p.get_pgp_num_target()
2885 << " pgp_num " << p.get_pgp_num()
2886 << " -> " << next << dendl;
2887 pgp_num_to_set[osdmap.get_pool_name(i.first)] = next;
2888 }
11fdf7f2
TL
2889 }
2890 }
2891 if (left == 0) {
2892 return;
2893 }
2894 }
2895 });
2896 for (auto i : pg_num_to_set) {
2897 const string cmd =
2898 "{"
2899 "\"prefix\": \"osd pool set\", "
2900 "\"pool\": \"" + i.first + "\", "
2901 "\"var\": \"pg_num_actual\", "
2902 "\"val\": \"" + stringify(i.second) + "\""
2903 "}";
2904 monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr);
2905 }
2906 for (auto i : pgp_num_to_set) {
2907 const string cmd =
2908 "{"
2909 "\"prefix\": \"osd pool set\", "
2910 "\"pool\": \"" + i.first + "\", "
2911 "\"var\": \"pgp_num_actual\", "
2912 "\"val\": \"" + stringify(i.second) + "\""
2913 "}";
2914 monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr);
2915 }
2916 for (auto pg : upmaps_to_clear) {
2917 const string cmd =
2918 "{"
2919 "\"prefix\": \"osd rm-pg-upmap\", "
2920 "\"pgid\": \"" + stringify(pg) + "\""
2921 "}";
2922 monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr);
2923 const string cmd2 =
2924 "{"
2925 "\"prefix\": \"osd rm-pg-upmap-items\", "
2926 "\"pgid\": \"" + stringify(pg) + "\"" +
2927 "}";
2928 monc->start_mon_command({cmd2}, {}, nullptr, nullptr, nullptr);
2929 }
2930}
2931
224ce89b
WB
2932void DaemonServer::got_service_map()
2933{
11fdf7f2 2934 std::lock_guard l(lock);
224ce89b
WB
2935
2936 cluster_state.with_servicemap([&](const ServiceMap& service_map) {
2937 if (pending_service_map.epoch == 0) {
2938 // we just started up
2939 dout(10) << "got initial map e" << service_map.epoch << dendl;
2940 pending_service_map = service_map;
f91f0fd5 2941 pending_service_map.epoch = service_map.epoch + 1;
224ce89b
WB
2942 } else {
2943 // we we already active and therefore must have persisted it,
2944 // which means ours is the same or newer.
2945 dout(10) << "got updated map e" << service_map.epoch << dendl;
f91f0fd5 2946 ceph_assert(pending_service_map.epoch > service_map.epoch);
224ce89b 2947 }
224ce89b
WB
2948 });
2949
2950 // cull missing daemons, populate new ones
92f5a8d4 2951 std::set<std::string> types;
9f95a23c
TL
2952 for (auto& [type, service] : pending_service_map.services) {
2953 if (ServiceMap::is_normal_ceph_entity(type)) {
2954 continue;
2955 }
2956
2957 types.insert(type);
92f5a8d4 2958
224ce89b 2959 std::set<std::string> names;
9f95a23c 2960 for (auto& q : service.daemons) {
224ce89b 2961 names.insert(q.first);
9f95a23c 2962 DaemonKey key{type, q.first};
224ce89b
WB
2963 if (!daemon_state.exists(key)) {
2964 auto daemon = std::make_shared<DaemonState>(daemon_state.types);
2965 daemon->key = key;
11fdf7f2 2966 daemon->set_metadata(q.second.metadata);
224ce89b
WB
2967 daemon->service_daemon = true;
2968 daemon_state.insert(daemon);
2969 dout(10) << "added missing " << key << dendl;
2970 }
2971 }
9f95a23c 2972 daemon_state.cull(type, names);
224ce89b 2973 }
92f5a8d4 2974 daemon_state.cull_services(types);
224ce89b 2975}
3efd9988 2976
11fdf7f2
TL
2977void DaemonServer::got_mgr_map()
2978{
2979 std::lock_guard l(lock);
2980 set<std::string> have;
2981 cluster_state.with_mgrmap([&](const MgrMap& mgrmap) {
2982 auto md_update = [&] (DaemonKey key) {
2983 std::ostringstream oss;
2984 auto c = new MetadataUpdate(daemon_state, key);
2985 // FIXME remove post-nautilus: include 'id' for luminous mons
2986 oss << "{\"prefix\": \"mgr metadata\", \"who\": \""
9f95a23c 2987 << key.name << "\", \"id\": \"" << key.name << "\"}";
11fdf7f2
TL
2988 monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c);
2989 };
2990 if (mgrmap.active_name.size()) {
9f95a23c 2991 DaemonKey key{"mgr", mgrmap.active_name};
11fdf7f2
TL
2992 have.insert(mgrmap.active_name);
2993 if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) {
2994 md_update(key);
2995 dout(10) << "triggered addition of " << key << " via metadata update" << dendl;
2996 }
2997 }
2998 for (auto& i : mgrmap.standbys) {
9f95a23c 2999 DaemonKey key{"mgr", i.second.name};
11fdf7f2
TL
3000 have.insert(i.second.name);
3001 if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) {
3002 md_update(key);
3003 dout(10) << "triggered addition of " << key << " via metadata update" << dendl;
3004 }
3005 }
3006 });
3007 daemon_state.cull("mgr", have);
3008}
3efd9988
FG
3009
3010const char** DaemonServer::get_tracked_conf_keys() const
3011{
3012 static const char *KEYS[] = {
3013 "mgr_stats_threshold",
3014 "mgr_stats_period",
3015 nullptr
3016 };
3017
3018 return KEYS;
3019}
3020
11fdf7f2
TL
3021void DaemonServer::handle_conf_change(const ConfigProxy& conf,
3022 const std::set <std::string> &changed)
3efd9988 3023{
3efd9988
FG
3024
3025 if (changed.count("mgr_stats_threshold") || changed.count("mgr_stats_period")) {
3026 dout(4) << "Updating stats threshold/period on "
3027 << daemon_connections.size() << " clients" << dendl;
3028 // Send a fresh MMgrConfigure to all clients, so that they can follow
3029 // the new policy for transmitting stats
9f95a23c 3030 finisher.queue(new LambdaContext([this](int r) {
11fdf7f2
TL
3031 std::lock_guard l(lock);
3032 for (auto &c : daemon_connections) {
3033 _send_configure(c);
3034 }
3035 }));
3efd9988
FG
3036 }
3037}
3038
3039void DaemonServer::_send_configure(ConnectionRef c)
3040{
9f95a23c 3041 ceph_assert(ceph_mutex_is_locked_by_me(lock));
3efd9988 3042
9f95a23c 3043 auto configure = make_message<MMgrConfigure>();
11fdf7f2
TL
3044 configure->stats_period = g_conf().get_val<int64_t>("mgr_stats_period");
3045 configure->stats_threshold = g_conf().get_val<int64_t>("mgr_stats_threshold");
3046
3047 if (c->peer_is_osd()) {
3048 configure->osd_perf_metric_queries =
3049 osd_perf_metric_collector.get_queries();
f67539c2
TL
3050 } else if (c->peer_is_mds()) {
3051 configure->metric_config_message =
3052 MetricConfigMessage(MDSConfigPayload(mds_perf_metric_collector.get_queries()));
11fdf7f2
TL
3053 }
3054
9f95a23c 3055 c->send_message2(configure);
3efd9988
FG
3056}
3057
9f95a23c 3058MetricQueryID DaemonServer::add_osd_perf_query(
11fdf7f2
TL
3059 const OSDPerfMetricQuery &query,
3060 const std::optional<OSDPerfMetricLimit> &limit)
3061{
3062 return osd_perf_metric_collector.add_query(query, limit);
3063}
3064
9f95a23c 3065int DaemonServer::remove_osd_perf_query(MetricQueryID query_id)
11fdf7f2
TL
3066{
3067 return osd_perf_metric_collector.remove_query(query_id);
3068}
3069
f67539c2
TL
3070int DaemonServer::get_osd_perf_counters(OSDPerfCollector *collector)
3071{
3072 return osd_perf_metric_collector.get_counters(collector);
3073}
3074
3075MetricQueryID DaemonServer::add_mds_perf_query(
3076 const MDSPerfMetricQuery &query,
3077 const std::optional<MDSPerfMetricLimit> &limit)
3078{
3079 return mds_perf_metric_collector.add_query(query, limit);
3080}
3081
3082int DaemonServer::remove_mds_perf_query(MetricQueryID query_id)
3083{
3084 return mds_perf_metric_collector.remove_query(query_id);
3085}
3086
33c7a0ef
TL
3087void DaemonServer::reregister_mds_perf_queries()
3088{
3089 mds_perf_metric_collector.reregister_queries();
3090}
3091
f67539c2 3092int DaemonServer::get_mds_perf_counters(MDSPerfCollector *collector)
11fdf7f2 3093{
f67539c2 3094 return mds_perf_metric_collector.get_counters(collector);
11fdf7f2 3095}