// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
20 #include <boost/algorithm/string.hpp>
24 #include "mon/OSDMonitor.h"
25 #include "mon/Monitor.h"
26 #include "mon/MDSMonitor.h"
27 #include "mon/PGMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreated.h"
51 #include "messages/MOSDPGTemp.h"
52 #include "messages/MMonCommand.h"
53 #include "messages/MRemoveSnaps.h"
54 #include "messages/MOSDScrub.h"
55 #include "messages/MRoute.h"
57 #include "common/TextTable.h"
58 #include "common/Timer.h"
59 #include "common/ceph_argparse.h"
60 #include "common/perf_counters.h"
61 #include "common/strtol.h"
63 #include "common/config.h"
64 #include "common/errno.h"
66 #include "erasure-code/ErasureCodePlugin.h"
67 #include "compressor/Compressor.h"
68 #include "common/Checksummer.h"
70 #include "include/compat.h"
71 #include "include/assert.h"
72 #include "include/stringify.h"
73 #include "include/util.h"
74 #include "common/cmdparse.h"
75 #include "include/str_list.h"
76 #include "include/str_map.h"
77 #include "include/scope_guard.h"
79 #include "json_spirit/json_spirit_reader.h"
81 #include <boost/algorithm/string/predicate.hpp>
83 #define dout_subsys ceph_subsys_mon
84 #define OSD_PG_CREATING_PREFIX "osd_pg_creating"
namespace {

// Hard limits on per-pool application metadata: how many applications a
// pool may have, how many key/value pairs each application may carry, and
// the maximum length of a key or value string.
// NOTE(review): enforcement presumably happens in the pool-application
// command handlers below this chunk — verify at the call sites.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;

} // anonymous namespace
94 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
96 if (epoch_by_pg
.size() <= ps
) {
97 epoch_by_pg
.resize(ps
+ 1, 0);
99 const auto old_lec
= epoch_by_pg
[ps
];
100 if (old_lec
>= last_epoch_clean
) {
104 epoch_by_pg
[ps
] = last_epoch_clean
;
105 if (last_epoch_clean
< floor
) {
106 floor
= last_epoch_clean
;
107 } else if (last_epoch_clean
> floor
) {
108 if (old_lec
== floor
) {
109 // probably should increase floor?
110 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
111 std::end(epoch_by_pg
));
115 if (ps
!= next_missing
) {
118 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
119 if (epoch_by_pg
[next_missing
] == 0) {
125 void LastEpochClean::remove_pool(uint64_t pool
)
127 report_by_pool
.erase(pool
);
130 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
132 auto& lec
= report_by_pool
[pg
.pool()];
133 return lec
.report(pg
.ps(), last_epoch_clean
);
136 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
138 auto floor
= latest
.get_epoch();
139 for (auto& pool
: latest
.get_pools()) {
140 auto reported
= report_by_pool
.find(pool
.first
);
141 if (reported
== report_by_pool
.end()) {
144 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
147 if (reported
->second
.floor
< floor
) {
148 floor
= reported
->second
.floor
;
155 struct C_UpdateCreatingPGs
: public Context
{
159 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
160 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
161 void finish(int r
) override
{
163 utime_t end
= ceph_clock_now();
164 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
165 << (end
- start
) << " seconds" << dendl
;
166 osdmon
->update_creating_pgs();
167 osdmon
->check_pg_creates_subs();
173 #define dout_prefix _prefix(_dout, mon, osdmap)
174 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
175 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
176 << "(" << mon
->get_state_name()
177 << ").osd e" << osdmap
.get_epoch() << " ";
180 OSDMonitor::OSDMonitor(
184 const string
& service_name
)
185 : PaxosService(mn
, p
, service_name
),
187 inc_osd_cache(g_conf
->mon_osd_cache_size
),
188 full_osd_cache(g_conf
->mon_osd_cache_size
),
189 last_attempted_minwait_time(utime_t()),
190 mapper(mn
->cct
, &mn
->cpu_tp
),
191 op_tracker(cct
, true, 1)
194 bool OSDMonitor::_have_pending_crush()
196 return pending_inc
.crush
.length() > 0;
199 CrushWrapper
&OSDMonitor::_get_stable_crush()
201 return *osdmap
.crush
;
204 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
207 if (pending_inc
.crush
.length())
208 bl
= pending_inc
.crush
;
210 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
212 bufferlist::iterator p
= bl
.begin();
216 void OSDMonitor::create_initial()
218 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
223 mon
->store
->get("mkfs", "osdmap", bl
);
227 newmap
.set_fsid(mon
->monmap
->fsid
);
229 newmap
.build_simple(g_ceph_context
, 0, mon
->monmap
->fsid
, 0);
232 newmap
.created
= newmap
.modified
= ceph_clock_now();
234 // new clusters should sort bitwise by default.
235 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
237 // new cluster should require latest by default
238 if (g_conf
->mon_debug_no_require_luminous
) {
239 newmap
.require_osd_release
= CEPH_RELEASE_KRAKEN
;
240 derr
<< __func__
<< " mon_debug_no_require_luminous=true" << dendl
;
242 newmap
.require_osd_release
= CEPH_RELEASE_LUMINOUS
;
244 CEPH_OSDMAP_RECOVERY_DELETES
|
245 CEPH_OSDMAP_PURGED_SNAPDIRS
;
246 newmap
.full_ratio
= g_conf
->mon_osd_full_ratio
;
247 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
248 newmap
.backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
249 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
250 newmap
.nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
251 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
252 int r
= ceph_release_from_name(
253 g_conf
->mon_osd_initial_require_min_compat_client
.c_str());
255 assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
257 newmap
.require_min_compat_client
= r
;
260 // encode into pending incremental
261 newmap
.encode(pending_inc
.fullmap
,
262 mon
->get_quorum_con_features() | CEPH_FEATURE_RESERVED
);
263 pending_inc
.full_crc
= newmap
.get_crc();
264 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
267 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
)
269 s
.insert(service_name
);
270 s
.insert(OSD_PG_CREATING_PREFIX
);
273 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
275 version_t version
= get_last_committed();
276 if (version
== osdmap
.epoch
)
278 assert(version
> osdmap
.epoch
);
280 dout(15) << "update_from_paxos paxos e " << version
281 << ", my e " << osdmap
.epoch
<< dendl
;
284 if (!mapping_job
->is_done()) {
285 dout(1) << __func__
<< " mapping job "
286 << mapping_job
.get() << " did not complete, "
287 << mapping_job
->shards
<< " left, canceling" << dendl
;
288 mapping_job
->abort();
296 * We will possibly have a stashed latest that *we* wrote, and we will
297 * always be sure to have the oldest full map in the first..last range
298 * due to encode_trim_extra(), which includes the oldest full map in the trim
301 * encode_trim_extra() does not however write the full map's
302 * version to 'full_latest'. This is only done when we are building the
303 * full maps from the incremental versions. But don't panic! We make sure
304 * that the following conditions find whichever full map version is newer.
306 version_t latest_full
= get_version_latest_full();
307 if (latest_full
== 0 && get_first_committed() > 1)
308 latest_full
= get_first_committed();
310 if (get_first_committed() > 1 &&
311 latest_full
< get_first_committed()) {
312 // the monitor could be just sync'ed with its peer, and the latest_full key
313 // is not encoded in the paxos commits in encode_pending(), so we need to
314 // make sure we get it pointing to a proper version.
315 version_t lc
= get_last_committed();
316 version_t fc
= get_first_committed();
318 dout(10) << __func__
<< " looking for valid full map in interval"
319 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
322 for (version_t v
= lc
; v
>= fc
; v
--) {
323 string full_key
= "full_" + stringify(v
);
324 if (mon
->store
->exists(get_service_name(), full_key
)) {
325 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
331 assert(latest_full
> 0);
332 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
333 put_version_latest_full(t
, latest_full
);
334 mon
->store
->apply_transaction(t
);
335 dout(10) << __func__
<< " updated the on-disk full map version to "
336 << latest_full
<< dendl
;
339 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
340 bufferlist latest_bl
;
341 get_version_full(latest_full
, latest_bl
);
342 assert(latest_bl
.length() != 0);
343 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
344 osdmap
.decode(latest_bl
);
347 if (mon
->monmap
->get_required_features().contains_all(
348 ceph::features::mon::FEATURE_LUMINOUS
)) {
350 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
352 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
353 creating_pgs
.decode(p
);
354 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
355 << creating_pgs
.last_scan_epoch
356 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
358 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
363 // make sure we're using the right pg service.. remove me post-luminous!
364 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
365 dout(10) << __func__
<< " pgservice is mgrstat" << dendl
;
366 mon
->pgservice
= mon
->mgrstatmon()->get_pg_stat_service();
368 dout(10) << __func__
<< " pgservice is pg" << dendl
;
369 mon
->pgservice
= mon
->pgmon()->get_pg_stat_service();
372 // walk through incrementals
373 MonitorDBStore::TransactionRef t
;
375 while (version
> osdmap
.epoch
) {
377 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
379 assert(inc_bl
.length());
381 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
383 OSDMap::Incremental
inc(inc_bl
);
384 err
= osdmap
.apply_incremental(inc
);
388 t
.reset(new MonitorDBStore::Transaction
);
390 // Write out the full map for all past epochs. Encode the full
391 // map with the same features as the incremental. If we don't
392 // know, use the quorum features. If we don't know those either,
393 // encode with all features.
394 uint64_t f
= inc
.encode_features
;
396 f
= mon
->get_quorum_con_features();
400 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
401 tx_size
+= full_bl
.length();
403 bufferlist orig_full_bl
;
404 get_version_full(osdmap
.epoch
, orig_full_bl
);
405 if (orig_full_bl
.length()) {
406 // the primary provided the full map
407 assert(inc
.have_crc
);
408 if (inc
.full_crc
!= osdmap
.crc
) {
409 // This will happen if the mons were running mixed versions in
410 // the past or some other circumstance made the full encoded
411 // maps divergent. Reloading here will bring us back into
412 // sync with the primary for this and all future maps. OSDs
413 // will also be brought back into sync when they discover the
414 // crc mismatch and request a full map from a mon.
415 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
418 osdmap
.decode(orig_full_bl
);
421 assert(!inc
.have_crc
);
422 put_version_full(t
, osdmap
.epoch
, full_bl
);
424 put_version_latest_full(t
, osdmap
.epoch
);
427 dout(1) << osdmap
<< dendl
;
429 if (osdmap
.epoch
== 1) {
430 t
->erase("mkfs", "osdmap");
433 // make sure we're using the right pg service.. remove me post-luminous!
434 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
435 dout(10) << __func__
<< " pgservice is mgrstat" << dendl
;
436 mon
->pgservice
= mon
->mgrstatmon()->get_pg_stat_service();
438 dout(10) << __func__
<< " pgservice is pg" << dendl
;
439 mon
->pgservice
= mon
->pgmon()->get_pg_stat_service();
442 if (tx_size
> g_conf
->mon_sync_max_payload_size
*2) {
443 mon
->store
->apply_transaction(t
);
444 t
= MonitorDBStore::TransactionRef();
447 if (mon
->monmap
->get_required_features().contains_all(
448 ceph::features::mon::FEATURE_LUMINOUS
)) {
449 for (const auto &osd_state
: inc
.new_state
) {
450 if (osd_state
.second
& CEPH_OSD_UP
) {
451 // could be marked up *or* down, but we're too lazy to check which
452 last_osd_report
.erase(osd_state
.first
);
454 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
455 // could be created *or* destroyed, but we can safely drop it
456 osd_epochs
.erase(osd_state
.first
);
463 mon
->store
->apply_transaction(t
);
466 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
467 if (osdmap
.is_out(o
))
469 auto found
= down_pending_out
.find(o
);
470 if (osdmap
.is_down(o
)) {
471 // populate down -> out map
472 if (found
== down_pending_out
.end()) {
473 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
474 down_pending_out
[o
] = ceph_clock_now();
477 if (found
!= down_pending_out
.end()) {
478 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
479 down_pending_out
.erase(found
);
483 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
485 if (mon
->is_leader()) {
486 // kick pgmon, make sure it's seen the latest map
487 mon
->pgmon()->check_osd_map(osdmap
.epoch
);
491 check_pg_creates_subs();
493 share_map_with_random_osd();
498 // make sure our feature bits reflect the latest map
499 update_msgr_features();
501 if (!mon
->is_leader()) {
502 // will be called by on_active() on the leader, avoid doing so twice
507 void OSDMonitor::start_mapping()
509 // initiate mapping job
511 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
513 mapping_job
->abort();
515 if (!osdmap
.get_pools().empty()) {
516 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
517 mapping_job
= mapping
.start_update(osdmap
, mapper
,
518 g_conf
->mon_osd_mapping_pgs_per_chunk
);
519 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
520 << " at " << fin
->start
<< dendl
;
521 mapping_job
->set_finish_event(fin
);
523 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
524 mapping_job
= nullptr;
528 void OSDMonitor::update_msgr_features()
531 types
.insert((int)entity_name_t::TYPE_OSD
);
532 types
.insert((int)entity_name_t::TYPE_CLIENT
);
533 types
.insert((int)entity_name_t::TYPE_MDS
);
534 types
.insert((int)entity_name_t::TYPE_MON
);
535 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
537 uint64_t features
= osdmap
.get_features(*q
, &mask
);
538 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
539 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
540 Messenger::Policy p
= mon
->messenger
->get_policy(*q
);
541 p
.features_required
= (p
.features_required
& ~mask
) | features
;
542 mon
->messenger
->set_policy(*q
, p
);
547 void OSDMonitor::on_active()
551 if (mon
->is_leader()) {
552 mon
->clog
->debug() << "osdmap " << osdmap
;
554 list
<MonOpRequestRef
> ls
;
555 take_all_failures(ls
);
556 while (!ls
.empty()) {
557 MonOpRequestRef op
= ls
.front();
558 op
->mark_osdmon_event(__func__
);
566 void OSDMonitor::on_restart()
568 last_osd_report
.clear();
570 if (mon
->is_leader()) {
571 // fix ruleset != ruleid
572 if (osdmap
.crush
->has_legacy_rulesets() &&
573 !osdmap
.crush
->has_multirule_rulesets()) {
574 CrushWrapper newcrush
;
575 _get_pending_crush(newcrush
);
576 int r
= newcrush
.renumber_rules_by_ruleset();
578 dout(1) << __func__
<< " crush map has ruleset != rule id; fixing" << dendl
;
579 pending_inc
.crush
.clear();
580 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
582 dout(10) << __func__
<< " unable to renumber rules by ruleset" << dendl
;
588 void OSDMonitor::on_shutdown()
590 dout(10) << __func__
<< dendl
;
592 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
594 mapping_job
->abort();
597 // discard failure info, waiters
598 list
<MonOpRequestRef
> ls
;
599 take_all_failures(ls
);
603 void OSDMonitor::update_logger()
605 dout(10) << "update_logger" << dendl
;
607 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
608 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
609 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
610 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
613 void OSDMonitor::create_pending()
615 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
616 pending_inc
.fsid
= mon
->monmap
->fsid
;
618 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
620 // clean up pg_temp, primary_temp
621 OSDMap::clean_temps(g_ceph_context
, osdmap
, &pending_inc
);
622 dout(10) << "create_pending did clean_temps" << dendl
;
624 // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
625 // instead of osd_backfill_full_ratio config
626 if (osdmap
.backfillfull_ratio
<= 0) {
627 pending_inc
.new_backfillfull_ratio
= g_conf
->mon_osd_backfillfull_ratio
;
628 if (pending_inc
.new_backfillfull_ratio
> 1.0)
629 pending_inc
.new_backfillfull_ratio
/= 100;
630 dout(1) << __func__
<< " setting backfillfull_ratio = "
631 << pending_inc
.new_backfillfull_ratio
<< dendl
;
633 if (osdmap
.get_epoch() > 0 &&
634 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
635 // transition full ratios from PGMap to OSDMap (on upgrade)
636 float full_ratio
= mon
->pgservice
->get_full_ratio();
637 float nearfull_ratio
= mon
->pgservice
->get_nearfull_ratio();
638 if (osdmap
.full_ratio
!= full_ratio
) {
639 dout(10) << __func__
<< " full_ratio " << osdmap
.full_ratio
640 << " -> " << full_ratio
<< " (from pgmap)" << dendl
;
641 pending_inc
.new_full_ratio
= full_ratio
;
643 if (osdmap
.nearfull_ratio
!= nearfull_ratio
) {
644 dout(10) << __func__
<< " nearfull_ratio " << osdmap
.nearfull_ratio
645 << " -> " << nearfull_ratio
<< " (from pgmap)" << dendl
;
646 pending_inc
.new_nearfull_ratio
= nearfull_ratio
;
649 // safety check (this shouldn't really happen)
650 if (osdmap
.full_ratio
<= 0) {
651 pending_inc
.new_full_ratio
= g_conf
->mon_osd_full_ratio
;
652 if (pending_inc
.new_full_ratio
> 1.0)
653 pending_inc
.new_full_ratio
/= 100;
654 dout(1) << __func__
<< " setting full_ratio = "
655 << pending_inc
.new_full_ratio
<< dendl
;
657 if (osdmap
.nearfull_ratio
<= 0) {
658 pending_inc
.new_nearfull_ratio
= g_conf
->mon_osd_nearfull_ratio
;
659 if (pending_inc
.new_nearfull_ratio
> 1.0)
660 pending_inc
.new_nearfull_ratio
/= 100;
661 dout(1) << __func__
<< " setting nearfull_ratio = "
662 << pending_inc
.new_nearfull_ratio
<< dendl
;
668 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
)
670 dout(10) << __func__
<< dendl
;
671 creating_pgs_t pending_creatings
;
673 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
674 pending_creatings
= creating_pgs
;
676 // check for new or old pools
677 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
678 if (osdmap
.get_epoch() &&
679 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
681 mon
->pgservice
->maybe_add_creating_pgs(creating_pgs
.last_scan_epoch
,
684 dout(7) << __func__
<< " " << added
<< " pgs added from pgmap" << dendl
;
687 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
691 queued
+= scan_for_creating_pgs(inc
.new_pools
,
695 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
696 for (auto deleted_pool
: inc
.old_pools
) {
697 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
698 dout(10) << __func__
<< " " << removed
699 << " pg removed because containing pool deleted: "
700 << deleted_pool
<< dendl
;
701 last_epoch_clean
.remove_pool(deleted_pool
);
703 // pgmon updates its creating_pgs in check_osd_map() which is called by
704 // on_active() and check_osd_map() could be delayed if lease expires, so its
705 // creating_pgs could be stale in comparison with the one of osdmon. let's
706 // trim them here. otherwise, they will be added back after being erased.
707 unsigned removed
= 0;
708 for (auto& pg
: pending_created_pgs
) {
709 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
710 pending_creatings
.created_pools
.insert(pg
.pool());
711 removed
+= pending_creatings
.pgs
.erase(pg
);
713 pending_created_pgs
.clear();
714 dout(10) << __func__
<< " " << removed
715 << " pgs removed because they're created" << dendl
;
716 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
720 unsigned max
= MAX(1, g_conf
->mon_osd_max_creating_pgs
);
721 const auto total
= pending_creatings
.pgs
.size();
722 while (pending_creatings
.pgs
.size() < max
&&
723 !pending_creatings
.queue
.empty()) {
724 auto p
= pending_creatings
.queue
.begin();
725 int64_t poolid
= p
->first
;
726 dout(10) << __func__
<< " pool " << poolid
727 << " created " << p
->second
.created
728 << " modified " << p
->second
.modified
729 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
731 int n
= MIN(max
- pending_creatings
.pgs
.size(),
732 p
->second
.end
- p
->second
.start
);
733 ps_t first
= p
->second
.start
;
734 ps_t end
= first
+ n
;
735 for (ps_t ps
= first
; ps
< end
; ++ps
) {
736 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
737 // NOTE: use the *current* epoch as the PG creation epoch so that the
738 // OSD does not have to generate a long set of PastIntervals.
739 pending_creatings
.pgs
.emplace(pgid
, make_pair(inc
.epoch
,
740 p
->second
.modified
));
741 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
743 p
->second
.start
= end
;
744 if (p
->second
.done()) {
745 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
746 pending_creatings
.queue
.erase(p
);
748 dout(10) << __func__
<< " pool " << poolid
749 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
753 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
754 << " pools" << dendl
;
756 << " " << (pending_creatings
.pgs
.size() - total
)
757 << "/" << pending_creatings
.pgs
.size()
758 << " pgs added from queued pools" << dendl
;
759 return pending_creatings
;
762 void OSDMonitor::maybe_prime_pg_temp()
765 if (pending_inc
.crush
.length()) {
766 dout(10) << __func__
<< " new crush map, all" << dendl
;
770 if (!pending_inc
.new_up_client
.empty()) {
771 dout(10) << __func__
<< " new up osds, all" << dendl
;
775 // check for interesting OSDs
777 for (auto p
= pending_inc
.new_state
.begin();
778 !all
&& p
!= pending_inc
.new_state
.end();
780 if ((p
->second
& CEPH_OSD_UP
) &&
781 osdmap
.is_up(p
->first
)) {
782 osds
.insert(p
->first
);
785 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
786 !all
&& p
!= pending_inc
.new_weight
.end();
788 if (p
->second
< osdmap
.get_weight(p
->first
)) {
790 osds
.insert(p
->first
);
792 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
798 if (!all
&& osds
.empty())
803 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
804 if (estimate
> mapping
.get_num_pgs() *
805 g_conf
->mon_osd_prime_pg_temp_max_estimate
) {
806 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
807 << osds
.size() << " osds >= "
808 << g_conf
->mon_osd_prime_pg_temp_max_estimate
<< " of total "
809 << mapping
.get_num_pgs() << " pgs, all"
813 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
814 << osds
.size() << " osds" << dendl
;
819 next
.deepish_copy_from(osdmap
);
820 next
.apply_incremental(pending_inc
);
822 if (next
.get_pools().empty()) {
823 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
825 PrimeTempJob
job(next
, this);
826 mapper
.queue(&job
, g_conf
->mon_osd_mapping_pgs_per_chunk
);
827 if (job
.wait_for(g_conf
->mon_osd_prime_pg_temp_max_time
)) {
828 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
830 dout(10) << __func__
<< " did not finish in "
831 << g_conf
->mon_osd_prime_pg_temp_max_time
832 << ", stopping" << dendl
;
836 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
837 utime_t stop
= ceph_clock_now();
838 stop
+= g_conf
->mon_osd_prime_pg_temp_max_time
;
839 const int chunk
= 1000;
841 std::unordered_set
<pg_t
> did_pgs
;
842 for (auto osd
: osds
) {
843 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
844 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
845 for (auto pgid
: pgs
) {
846 if (!did_pgs
.insert(pgid
).second
) {
849 prime_pg_temp(next
, pgid
);
852 if (ceph_clock_now() > stop
) {
853 dout(10) << __func__
<< " consumed more than "
854 << g_conf
->mon_osd_prime_pg_temp_max_time
855 << " seconds, stopping"
865 void OSDMonitor::prime_pg_temp(
869 if (mon
->monmap
->get_required_features().contains_all(
870 ceph::features::mon::FEATURE_LUMINOUS
)) {
871 // TODO: remove this creating_pgs direct access?
872 if (creating_pgs
.pgs
.count(pgid
)) {
876 if (mon
->pgservice
->is_creating_pg(pgid
)) {
880 if (!osdmap
.pg_exists(pgid
)) {
884 vector
<int> up
, acting
;
885 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
887 vector
<int> next_up
, next_acting
;
888 int next_up_primary
, next_acting_primary
;
889 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
890 &next_acting
, &next_acting_primary
);
891 if (acting
== next_acting
&& next_up
!= next_acting
)
892 return; // no change since last epoch
895 return; // if previously empty now we can be no worse off
896 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
897 if (pool
&& acting
.size() < pool
->min_size
)
898 return; // can be no worse off than before
900 if (next_up
== next_acting
) {
902 dout(20) << __func__
<< "next_up === next_acting now, clear pg_temp"
906 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
907 << " -> " << next_up
<< "/" << next_acting
908 << ", priming " << acting
911 Mutex::Locker
l(prime_pg_temp_lock
);
912 // do not touch a mapping if a change is pending
913 pending_inc
.new_pg_temp
.emplace(
915 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
920 * @note receiving a transaction in this function gives a fair amount of
921 * freedom to the service implementation if it does need it. It shouldn't.
923 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
925 dout(10) << "encode_pending e " << pending_inc
.epoch
928 // finalize up pending_inc
929 pending_inc
.modified
= ceph_clock_now();
931 int r
= pending_inc
.propagate_snaps_to_tiers(g_ceph_context
, osdmap
);
935 if (!mapping_job
->is_done()) {
936 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
937 << mapping_job
.get() << " did not complete, "
938 << mapping_job
->shards
<< " left" << dendl
;
939 mapping_job
->abort();
940 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
941 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
942 << mapping_job
.get() << " is prior epoch "
943 << mapping
.get_epoch() << dendl
;
945 if (g_conf
->mon_osd_prime_pg_temp
) {
946 maybe_prime_pg_temp();
949 } else if (g_conf
->mon_osd_prime_pg_temp
) {
950 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
955 // ensure we don't have blank new_state updates. these are interrpeted as
956 // CEPH_OSD_UP (and almost certainly not what we want!).
957 auto p
= pending_inc
.new_state
.begin();
958 while (p
!= pending_inc
.new_state
.end()) {
959 if (p
->second
== 0) {
960 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
961 p
= pending_inc
.new_state
.erase(p
);
971 tmp
.deepish_copy_from(osdmap
);
972 tmp
.apply_incremental(pending_inc
);
974 if (tmp
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
975 // set or clear full/nearfull?
976 int full
, backfill
, nearfull
;
977 tmp
.count_full_nearfull_osds(&full
, &backfill
, &nearfull
);
979 if (!tmp
.test_flag(CEPH_OSDMAP_FULL
)) {
980 dout(10) << __func__
<< " setting full flag" << dendl
;
981 add_flag(CEPH_OSDMAP_FULL
);
982 remove_flag(CEPH_OSDMAP_NEARFULL
);
985 if (tmp
.test_flag(CEPH_OSDMAP_FULL
)) {
986 dout(10) << __func__
<< " clearing full flag" << dendl
;
987 remove_flag(CEPH_OSDMAP_FULL
);
990 if (!tmp
.test_flag(CEPH_OSDMAP_NEARFULL
)) {
991 dout(10) << __func__
<< " setting nearfull flag" << dendl
;
992 add_flag(CEPH_OSDMAP_NEARFULL
);
995 if (tmp
.test_flag(CEPH_OSDMAP_NEARFULL
)) {
996 dout(10) << __func__
<< " clearing nearfull flag" << dendl
;
997 remove_flag(CEPH_OSDMAP_NEARFULL
);
1002 // min_compat_client?
1003 if (tmp
.require_min_compat_client
== 0) {
1004 auto mv
= tmp
.get_min_compat_client();
1005 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1006 << "required " << ceph_release_name(mv
) << dendl
;
1007 mon
->clog
->info() << "setting require_min_compat_client to currently "
1008 << "required " << ceph_release_name(mv
);
1009 pending_inc
.new_require_min_compat_client
= mv
;
1012 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1013 // convert ec profile ruleset-* -> crush-*
1014 for (auto& p
: tmp
.erasure_code_profiles
) {
1015 bool changed
= false;
1016 map
<string
,string
> newprofile
;
1017 for (auto& q
: p
.second
) {
1018 if (q
.first
.find("ruleset-") == 0) {
1019 string key
= "crush-";
1020 key
+= q
.first
.substr(8);
1021 newprofile
[key
] = q
.second
;
1023 dout(20) << " updating ec profile " << p
.first
1024 << " key " << q
.first
<< " -> " << key
<< dendl
;
1026 newprofile
[q
.first
] = q
.second
;
1030 dout(10) << " updated ec profile " << p
.first
<< ": "
1031 << newprofile
<< dendl
;
1032 pending_inc
.new_erasure_code_profiles
[p
.first
] = newprofile
;
1036 // auto-enable pool applications upon upgrade
1037 // NOTE: this can be removed post-Luminous assuming upgrades need to
1038 // proceed through Luminous
1039 for (auto &pool_pair
: tmp
.pools
) {
1040 int64_t pool_id
= pool_pair
.first
;
1041 pg_pool_t pg_pool
= pool_pair
.second
;
1042 if (pg_pool
.is_tier()) {
1046 std::string pool_name
= tmp
.get_pool_name(pool_id
);
1047 uint32_t match_count
= 0;
1050 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending();
1051 if (pending_fsmap
.pool_in_use(pool_id
)) {
1052 dout(10) << __func__
<< " auto-enabling CephFS on pool '"
1053 << pool_name
<< "'" << dendl
;
1054 pg_pool
.application_metadata
.insert(
1055 {pg_pool_t::APPLICATION_NAME_CEPHFS
, {}});
1059 // RBD heuristics (default OpenStack pool names from docs and
1061 if (boost::algorithm::contains(pool_name
, "rbd") ||
1062 pool_name
== "images" || pool_name
== "volumes" ||
1063 pool_name
== "backups" || pool_name
== "vms") {
1064 dout(10) << __func__
<< " auto-enabling RBD on pool '"
1065 << pool_name
<< "'" << dendl
;
1066 pg_pool
.application_metadata
.insert(
1067 {pg_pool_t::APPLICATION_NAME_RBD
, {}});
1072 if (boost::algorithm::contains(pool_name
, ".rgw") ||
1073 boost::algorithm::contains(pool_name
, ".log") ||
1074 boost::algorithm::contains(pool_name
, ".intent-log") ||
1075 boost::algorithm::contains(pool_name
, ".usage") ||
1076 boost::algorithm::contains(pool_name
, ".users")) {
1077 dout(10) << __func__
<< " auto-enabling RGW on pool '"
1078 << pool_name
<< "'" << dendl
;
1079 pg_pool
.application_metadata
.insert(
1080 {pg_pool_t::APPLICATION_NAME_RGW
, {}});
1084 // OpenStack gnocchi (from ceph-ansible)
1085 if (pool_name
== "metrics" && match_count
== 0) {
1086 dout(10) << __func__
<< " auto-enabling OpenStack Gnocchi on pool '"
1087 << pool_name
<< "'" << dendl
;
1088 pg_pool
.application_metadata
.insert({"openstack_gnocchi", {}});
1092 if (match_count
== 1) {
1093 pg_pool
.last_change
= pending_inc
.epoch
;
1094 pending_inc
.new_pools
[pool_id
] = pg_pool
;
1095 } else if (match_count
> 1) {
1096 auto pstat
= mon
->pgservice
->get_pool_stat(pool_id
);
1097 if (pstat
!= nullptr && pstat
->stats
.sum
.num_objects
> 0) {
1098 mon
->clog
->info() << "unable to auto-enable application for pool "
1099 << "'" << pool_name
<< "'";
1108 for (auto i
= pending_inc
.new_state
.begin();
1109 i
!= pending_inc
.new_state
.end();
1111 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1112 if (s
& CEPH_OSD_UP
)
1113 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1114 if (s
& CEPH_OSD_EXISTS
)
1115 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1117 for (map
<int32_t,entity_addr_t
>::iterator i
= pending_inc
.new_up_client
.begin();
1118 i
!= pending_inc
.new_up_client
.end();
1120 //FIXME: insert cluster addresses too
1121 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1123 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1124 i
!= pending_inc
.new_weight
.end();
1126 if (i
->second
== CEPH_OSD_OUT
) {
1127 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1128 } else if (i
->second
== CEPH_OSD_IN
) {
1129 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1131 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1135 // features for osdmap and its incremental
1136 uint64_t features
= mon
->get_quorum_con_features();
1138 // encode full map and determine its crc
1141 tmp
.deepish_copy_from(osdmap
);
1142 tmp
.apply_incremental(pending_inc
);
1144 // determine appropriate features
1145 if (tmp
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1146 dout(10) << __func__
<< " encoding without feature SERVER_LUMINOUS"
1148 features
&= ~CEPH_FEATURE_SERVER_LUMINOUS
;
1150 if (tmp
.require_osd_release
< CEPH_RELEASE_KRAKEN
) {
1151 dout(10) << __func__
<< " encoding without feature SERVER_KRAKEN | "
1152 << "MSG_ADDR2" << dendl
;
1153 features
&= ~(CEPH_FEATURE_SERVER_KRAKEN
|
1154 CEPH_FEATURE_MSG_ADDR2
);
1156 if (tmp
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
1157 dout(10) << __func__
<< " encoding without feature SERVER_JEWEL" << dendl
;
1158 features
&= ~CEPH_FEATURE_SERVER_JEWEL
;
1160 dout(10) << __func__
<< " encoding full map with " << features
<< dendl
;
1163 ::encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1164 pending_inc
.full_crc
= tmp
.get_crc();
1166 // include full map in the txn. note that old monitors will
1167 // overwrite this. new ones will now skip the local full map
1168 // encode and reload from this.
1169 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1173 assert(get_last_committed() + 1 == pending_inc
.epoch
);
1174 ::encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1176 dout(20) << " full_crc " << tmp
.get_crc()
1177 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1179 /* put everything in the transaction */
1180 put_version(t
, pending_inc
.epoch
, bl
);
1181 put_last_committed(t
, pending_inc
.epoch
);
1184 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1185 p
!= pending_metadata
.end();
1187 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1188 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1189 p
!= pending_metadata_rm
.end();
1191 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1192 pending_metadata
.clear();
1193 pending_metadata_rm
.clear();
1195 // and pg creating, also!
1196 if (mon
->monmap
->get_required_features().contains_all(
1197 ceph::features::mon::FEATURE_LUMINOUS
)) {
1198 auto pending_creatings
= update_pending_pgs(pending_inc
);
1199 if (osdmap
.get_epoch() &&
1200 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
1201 dout(7) << __func__
<< " in the middle of upgrading, "
1202 << " trimming pending creating_pgs using pgmap" << dendl
;
1203 mon
->pgservice
->maybe_trim_creating_pgs(&pending_creatings
);
1205 bufferlist creatings_bl
;
1206 ::encode(pending_creatings
, creatings_bl
);
1207 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1211 health_check_map_t next
;
1212 tmp
.check_health(&next
);
1213 encode_health(next
, t
);
1216 void OSDMonitor::trim_creating_pgs(creating_pgs_t
* creating_pgs
,
1217 const ceph::unordered_map
<pg_t
,pg_stat_t
>& pg_stat
)
1219 auto p
= creating_pgs
->pgs
.begin();
1220 while (p
!= creating_pgs
->pgs
.end()) {
1221 auto q
= pg_stat
.find(p
->first
);
1222 if (q
!= pg_stat
.end() &&
1223 !(q
->second
.state
& PG_STATE_CREATING
)) {
1224 dout(20) << __func__
<< " pgmap shows " << p
->first
<< " is created"
1226 p
= creating_pgs
->pgs
.erase(p
);
1233 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
1236 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
1240 bufferlist::iterator p
= bl
.begin();
1243 catch (buffer::error
& e
) {
1245 *err
<< "osd." << osd
<< " metadata is corrupt";
1251 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
1253 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
1254 if (osdmap
.is_up(osd
)) {
1255 map
<string
,string
> meta
;
1256 load_metadata(osd
, meta
, nullptr);
1257 auto p
= meta
.find(field
);
1258 if (p
== meta
.end()) {
1259 (*out
)["unknown"]++;
1261 (*out
)[p
->second
]++;
1267 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
1269 map
<string
,int> by_val
;
1270 count_metadata(field
, &by_val
);
1271 f
->open_object_section(field
.c_str());
1272 for (auto& p
: by_val
) {
1273 f
->dump_int(p
.first
.c_str(), p
.second
);
1278 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
1280 map
<string
, string
> metadata
;
1281 int r
= load_metadata(osd
, metadata
, nullptr);
1285 auto it
= metadata
.find("osd_objectstore");
1286 if (it
== metadata
.end())
1292 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
1293 const pg_pool_t
&pool
,
1296 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1297 // since filestore osds could always join the pool later
1298 set
<int> checked_osds
;
1299 for (unsigned ps
= 0; ps
< MIN(8, pool
.get_pg_num()); ++ps
) {
1300 vector
<int> up
, acting
;
1301 pg_t
pgid(ps
, pool_id
, -1);
1302 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
1303 for (int osd
: up
) {
1304 if (checked_osds
.find(osd
) != checked_osds
.end())
1306 string objectstore_type
;
1307 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
1308 // allow with missing metadata, e.g. due to an osd never booting yet
1309 if (r
< 0 || objectstore_type
== "bluestore") {
1310 checked_osds
.insert(osd
);
1313 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
1320 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
1322 map
<string
,string
> m
;
1323 if (int r
= load_metadata(osd
, m
, err
))
1325 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
1326 f
->dump_string(p
->first
.c_str(), p
->second
);
1330 void OSDMonitor::print_nodes(Formatter
*f
)
1332 // group OSDs by their hosts
1333 map
<string
, list
<int> > osds
; // hostname => osd
1334 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
1335 map
<string
, string
> m
;
1336 if (load_metadata(osd
, m
, NULL
)) {
1339 map
<string
, string
>::iterator hostname
= m
.find("hostname");
1340 if (hostname
== m
.end()) {
1341 // not likely though
1344 osds
[hostname
->second
].push_back(osd
);
1347 dump_services(f
, osds
, "osd");
1350 void OSDMonitor::share_map_with_random_osd()
1352 if (osdmap
.get_num_up_osds() == 0) {
1353 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
1357 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
1359 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
1363 dout(10) << "committed, telling random " << s
->inst
<< " all about it" << dendl
;
1364 // whatev, they'll request more if they need it
1365 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch());
1366 s
->con
->send_message(m
);
1367 // NOTE: do *not* record osd has up to this epoch (as we do
1368 // elsewhere) as they may still need to request older values.
1371 version_t
OSDMonitor::get_trim_to()
1373 if (mon
->get_quorum().empty()) {
1374 dout(10) << __func__
<< ": quorum not formed" << dendl
;
1379 if (mon
->monmap
->get_required_features().contains_all(
1380 ceph::features::mon::FEATURE_LUMINOUS
)) {
1382 // TODO: Get this hidden in PGStatService
1383 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1384 if (!creating_pgs
.pgs
.empty()) {
1388 floor
= get_min_last_epoch_clean();
1390 if (!mon
->pgservice
->is_readable())
1392 if (mon
->pgservice
->have_creating_pgs()) {
1395 floor
= mon
->pgservice
->get_min_last_epoch_clean();
1398 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
1399 if (g_conf
->mon_osd_force_trim_to
> 0 &&
1400 g_conf
->mon_osd_force_trim_to
< (int)get_last_committed()) {
1401 floor
= g_conf
->mon_osd_force_trim_to
;
1402 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
1404 unsigned min
= g_conf
->mon_min_osdmap_epochs
;
1405 if (floor
+ min
> get_last_committed()) {
1406 if (min
< get_last_committed())
1407 floor
= get_last_committed() - min
;
1411 if (floor
> get_first_committed())
1417 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
1419 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
1420 // also scan osd epochs
1421 // don't trim past the oldest reported osd epoch
1422 for (auto& osd_epoch
: osd_epochs
) {
1423 if (osd_epoch
.second
< floor
) {
1424 floor
= osd_epoch
.second
;
1430 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
1433 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
1435 get_version_full(first
, bl
);
1436 put_version_full(tx
, first
, bl
);
1441 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
1443 op
->mark_osdmon_event(__func__
);
1444 Message
*m
= op
->get_req();
1445 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1447 switch (m
->get_type()) {
1449 case MSG_MON_COMMAND
:
1450 return preprocess_command(op
);
1451 case CEPH_MSG_MON_GET_OSDMAP
:
1452 return preprocess_get_osdmap(op
);
1455 case MSG_OSD_MARK_ME_DOWN
:
1456 return preprocess_mark_me_down(op
);
1458 return preprocess_full(op
);
1459 case MSG_OSD_FAILURE
:
1460 return preprocess_failure(op
);
1462 return preprocess_boot(op
);
1464 return preprocess_alive(op
);
1465 case MSG_OSD_PG_CREATED
:
1466 return preprocess_pg_created(op
);
1467 case MSG_OSD_PGTEMP
:
1468 return preprocess_pgtemp(op
);
1469 case MSG_OSD_BEACON
:
1470 return preprocess_beacon(op
);
1472 case CEPH_MSG_POOLOP
:
1473 return preprocess_pool_op(op
);
1475 case MSG_REMOVE_SNAPS
:
1476 return preprocess_remove_snaps(op
);
1484 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
1486 op
->mark_osdmon_event(__func__
);
1487 Message
*m
= op
->get_req();
1488 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
1490 switch (m
->get_type()) {
1492 case MSG_OSD_MARK_ME_DOWN
:
1493 return prepare_mark_me_down(op
);
1495 return prepare_full(op
);
1496 case MSG_OSD_FAILURE
:
1497 return prepare_failure(op
);
1499 return prepare_boot(op
);
1501 return prepare_alive(op
);
1502 case MSG_OSD_PG_CREATED
:
1503 return prepare_pg_created(op
);
1504 case MSG_OSD_PGTEMP
:
1505 return prepare_pgtemp(op
);
1506 case MSG_OSD_BEACON
:
1507 return prepare_beacon(op
);
1509 case MSG_MON_COMMAND
:
1510 return prepare_command(op
);
1512 case CEPH_MSG_POOLOP
:
1513 return prepare_pool_op(op
);
1515 case MSG_REMOVE_SNAPS
:
1516 return prepare_remove_snaps(op
);
1526 bool OSDMonitor::should_propose(double& delay
)
1528 dout(10) << "should_propose" << dendl
;
1530 // if full map, propose immediately! any subsequent changes will be clobbered.
1531 if (pending_inc
.fullmap
.length())
1534 // adjust osd weights?
1535 if (!osd_weight
.empty() &&
1536 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
1537 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
1538 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
1544 // propose as fast as possible if updating up_thru or pg_temp
1545 // want to merge OSDMap changes as much as possible
1546 if ((pending_inc
.new_primary_temp
.size() == 1
1547 || pending_inc
.new_up_thru
.size() == 1)
1548 && pending_inc
.new_state
.size() < 2) {
1549 dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl
;
1551 utime_t now
= ceph_clock_now();
1552 if (now
- last_attempted_minwait_time
> g_conf
->paxos_propose_interval
1553 && now
- paxos
->get_last_commit_time() > g_conf
->paxos_min_wait
) {
1554 delay
= g_conf
->paxos_min_wait
;
1555 last_attempted_minwait_time
= now
;
1560 return PaxosService::should_propose(delay
);
1565 // ---------------------------
1568 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
1570 op
->mark_osdmon_event(__func__
);
1571 MMonGetOSDMap
*m
= static_cast<MMonGetOSDMap
*>(op
->get_req());
1572 dout(10) << __func__
<< " " << *m
<< dendl
;
1573 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
);
1574 epoch_t first
= get_first_committed();
1575 epoch_t last
= osdmap
.get_epoch();
1576 int max
= g_conf
->osd_map_message_max
;
1577 for (epoch_t e
= MAX(first
, m
->get_full_first());
1578 e
<= MIN(last
, m
->get_full_last()) && max
> 0;
1580 int r
= get_version_full(e
, reply
->maps
[e
]);
1583 for (epoch_t e
= MAX(first
, m
->get_inc_first());
1584 e
<= MIN(last
, m
->get_inc_last()) && max
> 0;
1586 int r
= get_version(e
, reply
->incremental_maps
[e
]);
1589 reply
->oldest_map
= first
;
1590 reply
->newest_map
= last
;
1591 mon
->send_reply(op
, reply
);
1596 // ---------------------------
1601 bool OSDMonitor::check_source(PaxosServiceMessage
*m
, uuid_d fsid
) {
1602 // check permissions
1603 MonSession
*session
= m
->get_session();
1606 if (!session
->is_capable("osd", MON_CAP_X
)) {
1607 dout(0) << "got MOSDFailure from entity with insufficient caps "
1608 << session
->caps
<< dendl
;
1611 if (fsid
!= mon
->monmap
->fsid
) {
1612 dout(0) << "check_source: on fsid " << fsid
1613 << " != " << mon
->monmap
->fsid
<< dendl
;
1620 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
1622 op
->mark_osdmon_event(__func__
);
1623 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
1624 // who is target_osd
1625 int badboy
= m
->get_target().name
.num();
1627 // check permissions
1628 if (check_source(m
, m
->fsid
))
1631 // first, verify the reporting host is valid
1632 if (m
->get_orig_source().is_osd()) {
1633 int from
= m
->get_orig_source().num();
1634 if (!osdmap
.exists(from
) ||
1635 osdmap
.get_addr(from
) != m
->get_orig_source_inst().addr
||
1636 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
1637 dout(5) << "preprocess_failure from dead osd." << from
<< ", ignoring" << dendl
;
1638 send_incremental(op
, m
->get_epoch()+1);
1645 if (osdmap
.is_down(badboy
)) {
1646 dout(5) << "preprocess_failure dne(/dup?): " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1647 if (m
->get_epoch() < osdmap
.get_epoch())
1648 send_incremental(op
, m
->get_epoch()+1);
1651 if (osdmap
.get_inst(badboy
) != m
->get_target()) {
1652 dout(5) << "preprocess_failure wrong osd: report " << m
->get_target() << " != map's " << osdmap
.get_inst(badboy
)
1653 << ", from " << m
->get_orig_source_inst() << dendl
;
1654 if (m
->get_epoch() < osdmap
.get_epoch())
1655 send_incremental(op
, m
->get_epoch()+1);
1659 // already reported?
1660 if (osdmap
.is_down(badboy
) ||
1661 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
1662 dout(5) << "preprocess_failure dup/old: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1663 if (m
->get_epoch() < osdmap
.get_epoch())
1664 send_incremental(op
, m
->get_epoch()+1);
1668 if (!can_mark_down(badboy
)) {
1669 dout(5) << "preprocess_failure ignoring report of " << m
->get_target() << " from " << m
->get_orig_source_inst() << dendl
;
1673 dout(10) << "preprocess_failure new: " << m
->get_target() << ", from " << m
->get_orig_source_inst() << dendl
;
1680 class C_AckMarkedDown
: public C_MonOp
{
1686 : C_MonOp(op
), osdmon(osdmon
) {}
1688 void _finish(int) override
{
1689 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1690 osdmon
->mon
->send_reply(
1696 false)); // ACK itself does not request an ack
1698 ~C_AckMarkedDown() override
{
1702 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
1704 op
->mark_osdmon_event(__func__
);
1705 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1706 int requesting_down
= m
->get_target().name
.num();
1707 int from
= m
->get_orig_source().num();
1709 // check permissions
1710 if (check_source(m
, m
->fsid
))
1713 // first, verify the reporting host is valid
1714 if (!m
->get_orig_source().is_osd())
1717 if (!osdmap
.exists(from
) ||
1718 osdmap
.is_down(from
) ||
1719 osdmap
.get_addr(from
) != m
->get_target().addr
) {
1720 dout(5) << "preprocess_mark_me_down from dead osd."
1721 << from
<< ", ignoring" << dendl
;
1722 send_incremental(op
, m
->get_epoch()+1);
1726 // no down might be set
1727 if (!can_mark_down(requesting_down
))
1730 dout(10) << "MOSDMarkMeDown for: " << m
->get_target() << dendl
;
1734 if (m
->request_ack
) {
1735 Context
*c(new C_AckMarkedDown(this, op
));
1741 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
1743 op
->mark_osdmon_event(__func__
);
1744 MOSDMarkMeDown
*m
= static_cast<MOSDMarkMeDown
*>(op
->get_req());
1745 int target_osd
= m
->get_target().name
.num();
1747 assert(osdmap
.is_up(target_osd
));
1748 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
1750 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
1751 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1753 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
1757 bool OSDMonitor::can_mark_down(int i
)
1759 if (osdmap
.test_flag(CEPH_OSDMAP_NODOWN
)) {
1760 dout(5) << __func__
<< " NODOWN flag set, will not mark osd." << i
1761 << " down" << dendl
;
1765 if (osdmap
.is_nodown(i
)) {
1766 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
1767 << "will not mark it down" << dendl
;
1771 int num_osds
= osdmap
.get_num_osds();
1772 if (num_osds
== 0) {
1773 dout(5) << __func__
<< " no osds" << dendl
;
1776 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
1777 float up_ratio
= (float)up
/ (float)num_osds
;
1778 if (up_ratio
< g_conf
->mon_osd_min_up_ratio
) {
1779 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
1780 << g_conf
->mon_osd_min_up_ratio
1781 << ", will not mark osd." << i
<< " down" << dendl
;
1787 bool OSDMonitor::can_mark_up(int i
)
1789 if (osdmap
.test_flag(CEPH_OSDMAP_NOUP
)) {
1790 dout(5) << __func__
<< " NOUP flag set, will not mark osd." << i
1795 if (osdmap
.is_noup(i
)) {
1796 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
1797 << "will not mark it up" << dendl
;
1805 * @note the parameter @p i apparently only exists here so we can output the
1806 * osd's id on messages.
1808 bool OSDMonitor::can_mark_out(int i
)
1810 if (osdmap
.test_flag(CEPH_OSDMAP_NOOUT
)) {
1811 dout(5) << __func__
<< " NOOUT flag set, will not mark osds out" << dendl
;
1815 if (osdmap
.is_noout(i
)) {
1816 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
1817 << "will not mark it out" << dendl
;
1821 int num_osds
= osdmap
.get_num_osds();
1822 if (num_osds
== 0) {
1823 dout(5) << __func__
<< " no osds" << dendl
;
1826 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
1827 float in_ratio
= (float)in
/ (float)num_osds
;
1828 if (in_ratio
< g_conf
->mon_osd_min_in_ratio
) {
1830 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
1831 << g_conf
->mon_osd_min_in_ratio
1832 << ", will not mark osd." << i
<< " out" << dendl
;
1834 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
1835 << g_conf
->mon_osd_min_in_ratio
1836 << ", will not mark osds out" << dendl
;
1843 bool OSDMonitor::can_mark_in(int i
)
1845 if (osdmap
.test_flag(CEPH_OSDMAP_NOIN
)) {
1846 dout(5) << __func__
<< " NOIN flag set, will not mark osd." << i
1851 if (osdmap
.is_noin(i
)) {
1852 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
1853 << "will not mark it in" << dendl
;
1860 bool OSDMonitor::check_failures(utime_t now
)
1862 bool found_failure
= false;
1863 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
1864 p
!= failure_info
.end();
1866 if (can_mark_down(p
->first
)) {
1867 found_failure
|= check_failure(now
, p
->first
, p
->second
);
1870 return found_failure
;
1873 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
1875 // already pending failure?
1876 if (pending_inc
.new_state
.count(target_osd
) &&
1877 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
1878 dout(10) << " already pending failure" << dendl
;
1882 set
<string
> reporters_by_subtree
;
1883 string reporter_subtree_level
= g_conf
->mon_osd_reporter_subtree_level
;
1884 utime_t
orig_grace(g_conf
->osd_heartbeat_grace
, 0);
1885 utime_t max_failed_since
= fi
.get_failed_since();
1886 utime_t failed_for
= now
- max_failed_since
;
1888 utime_t grace
= orig_grace
;
1889 double my_grace
= 0, peer_grace
= 0;
1891 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
1892 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
1893 decay_k
= ::log(.5) / halflife
;
1895 // scale grace period based on historical probability of 'lagginess'
1896 // (false positive failures due to slowness).
1897 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
1898 double decay
= exp((double)failed_for
* decay_k
);
1899 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
1900 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
1901 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
1905 // consider the peers reporting a failure a proxy for a potential
1906 // 'subcluster' over the overall cluster that is similarly
1907 // laggy. this is clearly not true in all cases, but will sometimes
1908 // help us localize the grace correction to a subset of the system
1909 // (say, a rack with a bad switch) that is unhappy.
1910 assert(fi
.reporters
.size());
1911 for (map
<int,failure_reporter_t
>::iterator p
= fi
.reporters
.begin();
1912 p
!= fi
.reporters
.end();
1914 // get the parent bucket whose type matches with "reporter_subtree_level".
1915 // fall back to OSD if the level doesn't exist.
1916 map
<string
, string
> reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
1917 map
<string
, string
>::iterator iter
= reporter_loc
.find(reporter_subtree_level
);
1918 if (iter
== reporter_loc
.end()) {
1919 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
1921 reporters_by_subtree
.insert(iter
->second
);
1923 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
1924 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
1925 utime_t elapsed
= now
- xi
.down_stamp
;
1926 double decay
= exp((double)elapsed
* decay_k
);
1927 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
1931 if (g_conf
->mon_osd_adjust_heartbeat_grace
) {
1932 peer_grace
/= (double)fi
.reporters
.size();
1933 grace
+= peer_grace
;
1936 dout(10) << " osd." << target_osd
<< " has "
1937 << fi
.reporters
.size() << " reporters, "
1938 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
1939 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
1942 if (failed_for
>= grace
&&
1943 (int)reporters_by_subtree
.size() >= g_conf
->mon_osd_min_down_reporters
) {
1944 dout(1) << " we have enough reporters to mark osd." << target_osd
1945 << " down" << dendl
;
1946 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1948 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
1949 << osdmap
.crush
->get_full_location_ordered_string(
1952 << (int)reporters_by_subtree
.size()
1953 << " reporters from different "
1954 << reporter_subtree_level
<< " after "
1955 << failed_for
<< " >= grace " << grace
<< ")";
1961 void OSDMonitor::force_failure(int target_osd
, int by
)
1963 // already pending failure?
1964 if (pending_inc
.new_state
.count(target_osd
) &&
1965 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
1966 dout(10) << " already pending failure" << dendl
;
1970 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
1971 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
1973 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
1974 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
1975 << ") (connection refused reported by osd." << by
<< ")";
1979 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
1981 op
->mark_osdmon_event(__func__
);
1982 MOSDFailure
*m
= static_cast<MOSDFailure
*>(op
->get_req());
1983 dout(1) << "prepare_failure " << m
->get_target()
1984 << " from " << m
->get_orig_source_inst()
1985 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
1987 int target_osd
= m
->get_target().name
.num();
1988 int reporter
= m
->get_orig_source().num();
1989 assert(osdmap
.is_up(target_osd
));
1990 assert(osdmap
.get_addr(target_osd
) == m
->get_target().addr
);
1992 if (m
->if_osd_failed()) {
1993 // calculate failure time
1994 utime_t now
= ceph_clock_now();
1995 utime_t failed_since
=
1996 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
1999 if (m
->is_immediate()) {
2000 mon
->clog
->debug() << m
->get_target() << " reported immediately failed by "
2001 << m
->get_orig_source_inst();
2002 force_failure(target_osd
, reporter
);
2005 mon
->clog
->debug() << m
->get_target() << " reported failed by "
2006 << m
->get_orig_source_inst();
2008 failure_info_t
& fi
= failure_info
[target_osd
];
2009 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
2011 mon
->no_reply(old_op
);
2014 return check_failure(now
, target_osd
, fi
);
2016 // remove the report
2017 mon
->clog
->debug() << m
->get_target() << " failure report canceled by "
2018 << m
->get_orig_source_inst();
2019 if (failure_info
.count(target_osd
)) {
2020 failure_info_t
& fi
= failure_info
[target_osd
];
2021 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
2023 mon
->no_reply(report_op
);
2025 if (fi
.reporters
.empty()) {
2026 dout(10) << " removing last failure_info for osd." << target_osd
2028 failure_info
.erase(target_osd
);
2030 dout(10) << " failure_info for osd." << target_osd
<< " now "
2031 << fi
.reporters
.size() << " reporters" << dendl
;
2034 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
2042 void OSDMonitor::process_failures()
2044 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2045 while (p
!= failure_info
.end()) {
2046 if (osdmap
.is_up(p
->first
)) {
2049 dout(10) << "process_failures osd." << p
->first
<< dendl
;
2050 list
<MonOpRequestRef
> ls
;
2051 p
->second
.take_report_messages(ls
);
2052 failure_info
.erase(p
++);
2054 while (!ls
.empty()) {
2055 MonOpRequestRef o
= ls
.front();
2057 o
->mark_event(__func__
);
2058 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
2059 send_latest(o
, m
->get_epoch());
2067 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
2069 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
2071 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
2072 p
!= failure_info
.end();
2074 p
->second
.take_report_messages(ls
);
2076 failure_info
.clear();
2082 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
2084 op
->mark_osdmon_event(__func__
);
2085 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2086 int from
= m
->get_orig_source_inst().name
.num();
2088 // check permissions, ignore if failed (no response expected)
2089 MonSession
*session
= m
->get_session();
2092 if (!session
->is_capable("osd", MON_CAP_X
)) {
2093 dout(0) << "got preprocess_boot message from entity with insufficient caps"
2094 << session
->caps
<< dendl
;
2098 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
2099 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
2100 << " != " << mon
->monmap
->fsid
<< dendl
;
2104 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
2105 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
2109 assert(m
->get_orig_source_inst().name
.is_osd());
2111 // check if osd has required features to boot
2112 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2113 CEPH_FEATURE_OSD_ERASURE_CODES
) &&
2114 !(m
->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES
)) {
2115 dout(0) << __func__
<< " osdmap requires erasure code but osd at "
2116 << m
->get_orig_source_inst()
2117 << " doesn't announce support -- ignore" << dendl
;
2121 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2122 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
) &&
2123 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
)) {
2124 dout(0) << __func__
<< " osdmap requires erasure code plugins v2 but osd at "
2125 << m
->get_orig_source_inst()
2126 << " doesn't announce support -- ignore" << dendl
;
2130 if ((osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
2131 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
) &&
2132 !(m
->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
)) {
2133 dout(0) << __func__
<< " osdmap requires erasure code plugins v3 but osd at "
2134 << m
->get_orig_source_inst()
2135 << " doesn't announce support -- ignore" << dendl
;
2139 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
2140 !HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
2141 mon
->clog
->info() << "disallowing boot of OSD "
2142 << m
->get_orig_source_inst()
2143 << " because the osdmap requires"
2144 << " CEPH_FEATURE_SERVER_LUMINOUS"
2145 << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
2149 if (osdmap
.require_osd_release
>= CEPH_RELEASE_JEWEL
&&
2150 !(m
->osd_features
& CEPH_FEATURE_SERVER_JEWEL
)) {
2151 mon
->clog
->info() << "disallowing boot of OSD "
2152 << m
->get_orig_source_inst()
2153 << " because the osdmap requires"
2154 << " CEPH_FEATURE_SERVER_JEWEL"
2155 << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
2159 if (osdmap
.require_osd_release
>= CEPH_RELEASE_KRAKEN
&&
2160 !HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
2161 mon
->clog
->info() << "disallowing boot of OSD "
2162 << m
->get_orig_source_inst()
2163 << " because the osdmap requires"
2164 << " CEPH_FEATURE_SERVER_KRAKEN"
2165 << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
2169 if (osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
2170 !(m
->osd_features
& CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
2171 mon
->clog
->info() << "disallowing boot of OSD "
2172 << m
->get_orig_source_inst()
2173 << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
2177 if (osdmap
.test_flag(CEPH_OSDMAP_RECOVERY_DELETES
) &&
2178 !(m
->osd_features
& CEPH_FEATURE_OSD_RECOVERY_DELETES
)) {
2179 mon
->clog
->info() << "disallowing boot of OSD "
2180 << m
->get_orig_source_inst()
2181 << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
2185 if (any_of(osdmap
.get_pools().begin(),
2186 osdmap
.get_pools().end(),
2187 [](const std::pair
<int64_t,pg_pool_t
>& pool
)
2188 { return pool
.second
.use_gmt_hitset
; })) {
2189 assert(osdmap
.get_num_up_osds() == 0 ||
2190 osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
);
2191 if (!(m
->osd_features
& CEPH_FEATURE_OSD_HITSET_GMT
)) {
2192 dout(0) << __func__
<< " one or more pools uses GMT hitsets but osd at "
2193 << m
->get_orig_source_inst()
2194 << " doesn't announce support -- ignore" << dendl
;
2199 // make sure upgrades stop at luminous
2200 if (HAVE_FEATURE(m
->osd_features
, SERVER_M
) &&
2201 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
2202 mon
->clog
->info() << "disallowing boot of post-luminous OSD "
2203 << m
->get_orig_source_inst()
2204 << " because require_osd_release < luminous";
2208 // make sure upgrades stop at jewel
2209 if (HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
) &&
2210 osdmap
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
2211 mon
->clog
->info() << "disallowing boot of post-jewel OSD "
2212 << m
->get_orig_source_inst()
2213 << " because require_osd_release < jewel";
2217 // make sure upgrades stop at hammer
2218 // * HAMMER_0_94_4 is the required hammer feature
2219 // * MON_METADATA is the first post-hammer feature
2220 if (osdmap
.get_num_up_osds() > 0) {
2221 if ((m
->osd_features
& CEPH_FEATURE_MON_METADATA
) &&
2222 !(osdmap
.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4
)) {
2223 mon
->clog
->info() << "disallowing boot of post-hammer OSD "
2224 << m
->get_orig_source_inst()
2225 << " because one or more up OSDs is pre-hammer v0.94.4";
2228 if (!(m
->osd_features
& CEPH_FEATURE_HAMMER_0_94_4
) &&
2229 (osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_METADATA
)) {
2230 mon
->clog
->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
2231 << m
->get_orig_source_inst()
2232 << " because all up OSDs are post-hammer";
2238 if (osdmap
.is_up(from
) &&
2239 osdmap
.get_inst(from
) == m
->get_orig_source_inst() &&
2240 osdmap
.get_cluster_addr(from
) == m
->cluster_addr
) {
2242 dout(7) << "preprocess_boot dup from " << m
->get_orig_source_inst()
2243 << " == " << osdmap
.get_inst(from
) << dendl
;
2248 if (osdmap
.exists(from
) &&
2249 !osdmap
.get_uuid(from
).is_zero() &&
2250 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2251 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
2252 << " clashes with existing osd: different fsid"
2253 << " (ours: " << osdmap
.get_uuid(from
)
2254 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
2258 if (osdmap
.exists(from
) &&
2259 osdmap
.get_info(from
).up_from
> m
->version
&&
2260 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) {
2261 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
2262 send_latest(op
, m
->sb
.current_epoch
+1);
2267 if (!can_mark_up(from
)) {
2268 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
2269 send_latest(op
, m
->sb
.current_epoch
+1);
2273 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
2280 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
2282 op
->mark_osdmon_event(__func__
);
2283 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2284 dout(7) << __func__
<< " from " << m
->get_orig_source_inst() << " sb " << m
->sb
2285 << " cluster_addr " << m
->cluster_addr
2286 << " hb_back_addr " << m
->hb_back_addr
2287 << " hb_front_addr " << m
->hb_front_addr
2290 assert(m
->get_orig_source().is_osd());
2291 int from
= m
->get_orig_source().num();
2293 // does this osd exist?
2294 if (from
>= osdmap
.get_max_osd()) {
2295 dout(1) << "boot from osd." << from
<< " >= max_osd "
2296 << osdmap
.get_max_osd() << dendl
;
2300 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
2301 if (pending_inc
.new_state
.count(from
))
2302 oldstate
^= pending_inc
.new_state
[from
];
2304 // already up? mark down first?
2305 if (osdmap
.is_up(from
)) {
2306 dout(7) << __func__
<< " was up, first marking down "
2307 << osdmap
.get_inst(from
) << dendl
;
2308 // preprocess should have caught these; if not, assert.
2309 assert(osdmap
.get_inst(from
) != m
->get_orig_source_inst() ||
2310 osdmap
.get_cluster_addr(from
) != m
->cluster_addr
);
2311 assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
2313 if (pending_inc
.new_state
.count(from
) == 0 ||
2314 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
2315 // mark previous guy down
2316 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
2318 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2319 } else if (pending_inc
.new_up_client
.count(from
)) {
2320 // already prepared, just wait
2321 dout(7) << __func__
<< " already prepared, waiting on "
2322 << m
->get_orig_source_addr() << dendl
;
2323 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
2326 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addr();
2327 if (!m
->cluster_addr
.is_blank_ip())
2328 pending_inc
.new_up_cluster
[from
] = m
->cluster_addr
;
2329 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addr
;
2330 if (!m
->hb_front_addr
.is_blank_ip())
2331 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addr
;
2333 down_pending_out
.erase(from
); // if any
2336 osd_weight
[from
] = m
->sb
.weight
;
2339 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
2341 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
2342 // preprocess should have caught this; if not, assert.
2343 assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
2344 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
2348 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
2349 const osd_info_t
& i
= osdmap
.get_info(from
);
2350 if (i
.up_from
> i
.lost_at
) {
2351 dout(10) << " fresh osd; marking lost_at too" << dendl
;
2352 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
2357 bufferlist osd_metadata
;
2358 ::encode(m
->metadata
, osd_metadata
);
2359 pending_metadata
[from
] = osd_metadata
;
2360 pending_metadata_rm
.erase(from
);
2362 // adjust last clean unmount epoch?
2363 const osd_info_t
& info
= osdmap
.get_info(from
);
2364 dout(10) << " old osd_info: " << info
<< dendl
;
2365 if (m
->sb
.mounted
> info
.last_clean_begin
||
2366 (m
->sb
.mounted
== info
.last_clean_begin
&&
2367 m
->sb
.clean_thru
> info
.last_clean_end
)) {
2368 epoch_t begin
= m
->sb
.mounted
;
2369 epoch_t end
= m
->sb
.clean_thru
;
2371 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
2372 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
2373 << ") -> [" << begin
<< "-" << end
<< ")"
2375 pending_inc
.new_last_clean_interval
[from
] =
2376 pair
<epoch_t
,epoch_t
>(begin
, end
);
2379 osd_xinfo_t xi
= osdmap
.get_xinfo(from
);
2380 if (m
->boot_epoch
== 0) {
2381 xi
.laggy_probability
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2382 xi
.laggy_interval
*= (1.0 - g_conf
->mon_osd_laggy_weight
);
2383 dout(10) << " not laggy, new xi " << xi
<< dendl
;
2385 if (xi
.down_stamp
.sec()) {
2386 int interval
= ceph_clock_now().sec() -
2387 xi
.down_stamp
.sec();
2388 if (g_conf
->mon_osd_laggy_max_interval
&&
2389 (interval
> g_conf
->mon_osd_laggy_max_interval
)) {
2390 interval
= g_conf
->mon_osd_laggy_max_interval
;
2393 interval
* g_conf
->mon_osd_laggy_weight
+
2394 xi
.laggy_interval
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2396 xi
.laggy_probability
=
2397 g_conf
->mon_osd_laggy_weight
+
2398 xi
.laggy_probability
* (1.0 - g_conf
->mon_osd_laggy_weight
);
2399 dout(10) << " laggy, now xi " << xi
<< dendl
;
2402 // set features shared by the osd
2403 if (m
->osd_features
)
2404 xi
.features
= m
->osd_features
;
2406 xi
.features
= m
->get_connection()->get_features();
2409 if ((g_conf
->mon_osd_auto_mark_auto_out_in
&&
2410 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
2411 (g_conf
->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
2412 (g_conf
->mon_osd_auto_mark_in
)) {
2413 if (can_mark_in(from
)) {
2414 if (osdmap
.osd_xinfo
[from
].old_weight
> 0) {
2415 pending_inc
.new_weight
[from
] = osdmap
.osd_xinfo
[from
].old_weight
;
2418 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
2421 dout(7) << __func__
<< " NOIN set, will not mark in "
2422 << m
->get_orig_source_addr() << dendl
;
2426 pending_inc
.new_xinfo
[from
] = xi
;
2429 wait_for_finished_proposal(op
, new C_Booted(this, op
));
2434 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
2436 op
->mark_osdmon_event(__func__
);
2437 MOSDBoot
*m
= static_cast<MOSDBoot
*>(op
->get_req());
2438 dout(7) << "_booted " << m
->get_orig_source_inst()
2439 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
2442 mon
->clog
->info() << m
->get_orig_source_inst() << " boot";
2445 send_latest(op
, m
->sb
.current_epoch
+1);
2452 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
2454 op
->mark_osdmon_event(__func__
);
2455 MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2456 int from
= m
->get_orig_source().num();
2458 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2460 // check permissions, ignore if failed
2461 MonSession
*session
= m
->get_session();
2464 if (!session
->is_capable("osd", MON_CAP_X
)) {
2465 dout(0) << "MOSDFull from entity with insufficient privileges:"
2466 << session
->caps
<< dendl
;
2470 // ignore a full message from the osd instance that already went down
2471 if (!osdmap
.exists(from
)) {
2472 dout(7) << __func__
<< " ignoring full message from nonexistent "
2473 << m
->get_orig_source_inst() << dendl
;
2476 if ((!osdmap
.is_up(from
) &&
2477 osdmap
.get_most_recent_inst(from
) == m
->get_orig_source_inst()) ||
2478 (osdmap
.is_up(from
) &&
2479 osdmap
.get_inst(from
) != m
->get_orig_source_inst())) {
2480 dout(7) << __func__
<< " ignoring full message from down "
2481 << m
->get_orig_source_inst() << dendl
;
2485 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
2487 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
2488 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
2489 << " " << m
->get_orig_source_inst() << dendl
;
2490 _reply_map(op
, m
->version
);
2494 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
2495 << " " << m
->get_orig_source_inst() << dendl
;
2502 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
2504 op
->mark_osdmon_event(__func__
);
2505 const MOSDFull
*m
= static_cast<MOSDFull
*>(op
->get_req());
2506 const int from
= m
->get_orig_source().num();
2508 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
2509 const unsigned want_state
= m
->state
& mask
; // safety first
2511 unsigned cur_state
= osdmap
.get_state(from
);
2512 auto p
= pending_inc
.new_state
.find(from
);
2513 if (p
!= pending_inc
.new_state
.end()) {
2514 cur_state
^= p
->second
;
2518 set
<string
> want_state_set
, cur_state_set
;
2519 OSDMap::calc_state_set(want_state
, want_state_set
);
2520 OSDMap::calc_state_set(cur_state
, cur_state_set
);
2522 if (cur_state
!= want_state
) {
2523 if (p
!= pending_inc
.new_state
.end()) {
2526 pending_inc
.new_state
[from
] = 0;
2528 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
2529 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2530 << " -> " << want_state_set
<< dendl
;
2532 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
2533 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
2536 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2543 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
2545 op
->mark_osdmon_event(__func__
);
2546 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2547 int from
= m
->get_orig_source().num();
2549 // check permissions, ignore if failed
2550 MonSession
*session
= m
->get_session();
2553 if (!session
->is_capable("osd", MON_CAP_X
)) {
2554 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
2555 << session
->caps
<< dendl
;
2559 if (!osdmap
.is_up(from
) ||
2560 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2561 dout(7) << "preprocess_alive ignoring alive message from down " << m
->get_orig_source_inst() << dendl
;
2565 if (osdmap
.get_up_thru(from
) >= m
->want
) {
2567 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
2568 _reply_map(op
, m
->version
);
2572 dout(10) << "preprocess_alive want up_thru " << m
->want
2573 << " from " << m
->get_orig_source_inst() << dendl
;
2580 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
2582 op
->mark_osdmon_event(__func__
);
2583 MOSDAlive
*m
= static_cast<MOSDAlive
*>(op
->get_req());
2584 int from
= m
->get_orig_source().num();
2586 if (0) { // we probably don't care much about these
2587 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
2590 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
2591 << " from " << m
->get_orig_source_inst() << dendl
;
2593 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
2594 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
2598 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
2600 op
->mark_osdmon_event(__func__
);
2601 dout(7) << "_reply_map " << e
2602 << " from " << op
->get_req()->get_orig_source_inst()
2608 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
2610 op
->mark_osdmon_event(__func__
);
2611 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2612 dout(10) << __func__
<< " " << *m
<< dendl
;
2613 auto session
= m
->get_session();
2615 dout(10) << __func__
<< ": no monitor session!" << dendl
;
2618 if (!session
->is_capable("osd", MON_CAP_X
)) {
2619 derr
<< __func__
<< " received from entity "
2620 << "with insufficient privileges " << session
->caps
<< dendl
;
2623 // always forward the "created!" to the leader
2627 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
2629 op
->mark_osdmon_event(__func__
);
2630 auto m
= static_cast<MOSDPGCreated
*>(op
->get_req());
2631 dout(10) << __func__
<< " " << *m
<< dendl
;
2632 auto src
= m
->get_orig_source();
2633 auto from
= src
.num();
2634 if (!src
.is_osd() ||
2635 !mon
->osdmon()->osdmap
.is_up(from
) ||
2636 m
->get_orig_source_inst() != mon
->osdmon()->osdmap
.get_inst(from
)) {
2637 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
2640 pending_created_pgs
.push_back(m
->pgid
);
2647 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
2649 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
2650 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
2651 mempool::osdmap::vector
<int> empty
;
2652 int from
= m
->get_orig_source().num();
2653 size_t ignore_cnt
= 0;
2656 MonSession
*session
= m
->get_session();
2659 if (!session
->is_capable("osd", MON_CAP_X
)) {
2660 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
2661 << session
->caps
<< dendl
;
2665 if (!osdmap
.is_up(from
) ||
2666 osdmap
.get_inst(from
) != m
->get_orig_source_inst()) {
2667 dout(7) << "ignoring pgtemp message from down " << m
->get_orig_source_inst() << dendl
;
2671 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
2672 dout(20) << " " << p
->first
2673 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
2674 << " -> " << p
->second
<< dendl
;
2676 // does the pool exist?
2677 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
2679 * 1. If the osdmap does not have the pool, it means the pool has been
2680 * removed in-between the osd sending this message and us handling it.
2681 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
2682 * not exist in the pending either, as the osds would not send a
2683 * message about a pool they know nothing about (yet).
2684 * 3. However, if the pool does exist in the pending, then it must be a
2685 * new pool, and not relevant to this message (see 1).
2687 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2688 << ": pool has been removed" << dendl
;
2693 int acting_primary
= -1;
2694 osdmap
.pg_to_up_acting_osds(
2695 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
2696 if (acting_primary
!= from
) {
2697 /* If the source isn't the primary based on the current osdmap, we know
2698 * that the interval changed and that we can discard this message.
2699 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
2700 * which of two pg temp mappings on the same pg is more recent.
2702 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2703 << ": primary has changed" << dendl
;
2709 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
2710 osdmap
.primary_temp
->count(p
->first
)))
2713 // NOTE: we assume that this will clear pg_primary, so consider
2714 // an existing pg_primary field to imply a change
2715 if (p
->second
.size() &&
2716 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
2717 !vectors_equal(osdmap
.pg_temp
->get(p
->first
), p
->second
) ||
2718 osdmap
.primary_temp
->count(p
->first
)))
2722 // should we ignore all the pgs?
2723 if (ignore_cnt
== m
->pg_temp
.size())
2726 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
2727 _reply_map(op
, m
->map_epoch
);
2734 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
2736 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
2737 auto ut
= pending_inc
.new_up_thru
.find(from
);
2738 if (ut
!= pending_inc
.new_up_thru
.end()) {
2739 old_up_thru
= ut
->second
;
2741 if (up_thru
> old_up_thru
) {
2742 // set up_thru too, so the osd doesn't have to ask again
2743 pending_inc
.new_up_thru
[from
] = up_thru
;
2747 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
2749 op
->mark_osdmon_event(__func__
);
2750 MOSDPGTemp
*m
= static_cast<MOSDPGTemp
*>(op
->get_req());
2751 int from
= m
->get_orig_source().num();
2752 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
2753 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
2754 uint64_t pool
= p
->first
.pool();
2755 if (pending_inc
.old_pools
.count(pool
)) {
2756 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2757 << ": pool pending removal" << dendl
;
2760 if (!osdmap
.have_pg_pool(pool
)) {
2761 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
2762 << ": pool has been removed" << dendl
;
2765 pending_inc
.new_pg_temp
[p
->first
] =
2766 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
2768 // unconditionally clear pg_primary (until this message can encode
2769 // a change for that, too.. at which point we need to also fix
2770 // preprocess_pg_temp)
2771 if (osdmap
.primary_temp
->count(p
->first
) ||
2772 pending_inc
.new_primary_temp
.count(p
->first
))
2773 pending_inc
.new_primary_temp
[p
->first
] = -1;
2776 // set up_thru too, so the osd doesn't have to ask again
2777 update_up_thru(from
, m
->map_epoch
);
2779 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
2786 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
2788 op
->mark_osdmon_event(__func__
);
2789 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
2790 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
2792 // check privilege, ignore if failed
2793 MonSession
*session
= m
->get_session();
2796 if (!session
->caps
.is_capable(
2798 CEPH_ENTITY_TYPE_MON
,
2799 session
->entity_name
,
2800 "osd", "osd pool rmsnap", {}, true, true, false)) {
2801 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
2802 << session
->caps
<< dendl
;
2806 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
2807 q
!= m
->snaps
.end();
2809 if (!osdmap
.have_pg_pool(q
->first
)) {
2810 dout(10) << " ignoring removed_snaps " << q
->second
<< " on non-existent pool " << q
->first
<< dendl
;
2813 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
2814 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
2815 p
!= q
->second
.end();
2817 if (*p
> pi
->get_snap_seq() ||
2818 !pi
->removed_snaps
.contains(*p
))
2827 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
2829 op
->mark_osdmon_event(__func__
);
2830 MRemoveSnaps
*m
= static_cast<MRemoveSnaps
*>(op
->get_req());
2831 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
2833 for (map
<int, vector
<snapid_t
> >::iterator p
= m
->snaps
.begin();
2834 p
!= m
->snaps
.end();
2837 if (!osdmap
.have_pg_pool(p
->first
)) {
2838 dout(10) << " ignoring removed_snaps " << p
->second
<< " on non-existent pool " << p
->first
<< dendl
;
2842 pg_pool_t
& pi
= osdmap
.pools
[p
->first
];
2843 for (vector
<snapid_t
>::iterator q
= p
->second
.begin();
2844 q
!= p
->second
.end();
2846 if (!pi
.removed_snaps
.contains(*q
) &&
2847 (!pending_inc
.new_pools
.count(p
->first
) ||
2848 !pending_inc
.new_pools
[p
->first
].removed_snaps
.contains(*q
))) {
2849 pg_pool_t
*newpi
= pending_inc
.get_new_pool(p
->first
, &pi
);
2850 newpi
->removed_snaps
.insert(*q
);
2851 dout(10) << " pool " << p
->first
<< " removed_snaps added " << *q
2852 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
2853 if (*q
> newpi
->get_snap_seq()) {
2854 dout(10) << " pool " << p
->first
<< " snap_seq " << newpi
->get_snap_seq() << " -> " << *q
<< dendl
;
2855 newpi
->set_snap_seq(*q
);
2857 newpi
->set_snap_epoch(pending_inc
.epoch
);
2865 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
2867 op
->mark_osdmon_event(__func__
);
2868 auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
2870 auto session
= beacon
->get_session();
2872 dout(10) << __func__
<< " no monitor session!" << dendl
;
2875 if (!session
->is_capable("osd", MON_CAP_X
)) {
2876 derr
<< __func__
<< " received from entity "
2877 << "with insufficient privileges " << session
->caps
<< dendl
;
2880 // Always forward the beacon to the leader, even if they are the same as
2881 // the old one. The leader will mark as down osds that haven't sent
2882 // beacon for a few minutes.
2886 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
2888 op
->mark_osdmon_event(__func__
);
2889 const auto beacon
= static_cast<MOSDBeacon
*>(op
->get_req());
2890 const auto src
= beacon
->get_orig_source();
2891 dout(10) << __func__
<< " " << *beacon
2892 << " from " << src
<< dendl
;
2893 int from
= src
.num();
2895 if (!src
.is_osd() ||
2896 !osdmap
.is_up(from
) ||
2897 beacon
->get_orig_source_inst() != osdmap
.get_inst(from
)) {
2898 dout(1) << " ignoring beacon from non-active osd." << dendl
;
2902 last_osd_report
[from
] = ceph_clock_now();
2903 osd_epochs
[from
] = beacon
->version
;
2905 for (const auto& pg
: beacon
->pgs
) {
2906 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
2914 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
2916 op
->mark_osdmon_event(__func__
);
2917 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
2918 << " start " << start
<< dendl
;
2922 send_incremental(op
, start
);
2926 MOSDMap
*OSDMonitor::build_latest_full()
2928 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
);
2929 get_version_full(osdmap
.get_epoch(), r
->maps
[osdmap
.get_epoch()]);
2930 r
->oldest_map
= get_first_committed();
2931 r
->newest_map
= osdmap
.get_epoch();
2935 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
)
2937 dout(10) << "build_incremental [" << from
<< ".." << to
<< "]" << dendl
;
2938 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
);
2939 m
->oldest_map
= get_first_committed();
2940 m
->newest_map
= osdmap
.get_epoch();
2942 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
2944 int err
= get_version(e
, bl
);
2946 assert(bl
.length());
2947 // if (get_version(e, bl) > 0) {
2948 dout(20) << "build_incremental inc " << e
<< " "
2949 << bl
.length() << " bytes" << dendl
;
2950 m
->incremental_maps
[e
] = bl
;
2952 assert(err
== -ENOENT
);
2953 assert(!bl
.length());
2954 get_version_full(e
, bl
);
2955 if (bl
.length() > 0) {
2956 //else if (get_version("full", e, bl) > 0) {
2957 dout(20) << "build_incremental full " << e
<< " "
2958 << bl
.length() << " bytes" << dendl
;
2961 ceph_abort(); // we should have all maps.
2968 void OSDMonitor::send_full(MonOpRequestRef op
)
2970 op
->mark_osdmon_event(__func__
);
2971 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
2972 mon
->send_reply(op
, build_latest_full());
2975 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
2977 op
->mark_osdmon_event(__func__
);
2979 MonSession
*s
= op
->get_session();
2983 s
->proxy_con
->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP
)) {
2984 // oh, we can tell the other mon to do it
2985 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
2987 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
2988 r
->send_osdmap_first
= first
;
2989 s
->proxy_con
->send_message(r
);
2990 op
->mark_event("reply: send routed send_osdmap_first reply");
2993 send_incremental(first
, s
, false, op
);
2997 void OSDMonitor::send_incremental(epoch_t first
,
2998 MonSession
*session
,
3000 MonOpRequestRef req
)
3002 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
3003 << " to " << session
->inst
<< dendl
;
3005 if (first
<= session
->osd_epoch
) {
3006 dout(10) << __func__
<< " " << session
->inst
<< " should already have epoch "
3007 << session
->osd_epoch
<< dendl
;
3008 first
= session
->osd_epoch
+ 1;
3011 if (first
< get_first_committed()) {
3012 first
= get_first_committed();
3014 int err
= get_version_full(first
, bl
);
3016 assert(bl
.length());
3018 dout(20) << "send_incremental starting with base full "
3019 << first
<< " " << bl
.length() << " bytes" << dendl
;
3021 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid());
3022 m
->oldest_map
= get_first_committed();
3023 m
->newest_map
= osdmap
.get_epoch();
3024 m
->maps
[first
] = bl
;
3027 mon
->send_reply(req
, m
);
3028 session
->osd_epoch
= first
;
3031 session
->con
->send_message(m
);
3032 session
->osd_epoch
= first
;
3037 while (first
<= osdmap
.get_epoch()) {
3038 epoch_t last
= MIN(first
+ g_conf
->osd_map_message_max
- 1,
3039 osdmap
.get_epoch());
3040 MOSDMap
*m
= build_incremental(first
, last
);
3043 // send some maps. it may not be all of them, but it will get them
3045 mon
->send_reply(req
, m
);
3047 session
->con
->send_message(m
);
3050 session
->osd_epoch
= last
;
3056 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
3058 if (inc_osd_cache
.lookup(ver
, &bl
)) {
3061 int ret
= PaxosService::get_version(ver
, bl
);
3063 inc_osd_cache
.add(ver
, bl
);
3068 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
3070 if (full_osd_cache
.lookup(ver
, &bl
)) {
3073 int ret
= PaxosService::get_version_full(ver
, bl
);
3075 full_osd_cache
.add(ver
, bl
);
3080 epoch_t
OSDMonitor::blacklist(const entity_addr_t
& a
, utime_t until
)
3082 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
3083 pending_inc
.new_blacklist
[a
] = until
;
3084 return pending_inc
.epoch
;
3088 void OSDMonitor::check_osdmap_subs()
3090 dout(10) << __func__
<< dendl
;
3091 if (!osdmap
.get_epoch()) {
3094 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
3095 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
3098 auto p
= osdmap_subs
->second
->begin();
3102 check_osdmap_sub(sub
);
3106 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
3108 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
3109 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
3110 if (sub
->next
<= osdmap
.get_epoch()) {
3112 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
3114 sub
->session
->con
->send_message(build_latest_full());
3116 mon
->session_map
.remove_sub(sub
);
3118 sub
->next
= osdmap
.get_epoch() + 1;
3122 void OSDMonitor::check_pg_creates_subs()
3124 if (!mon
->monmap
->get_required_features().contains_all(
3125 ceph::features::mon::FEATURE_LUMINOUS
)) {
3126 // PGMonitor takes care of this in pre-luminous era.
3129 if (!osdmap
.get_num_up_osds()) {
3132 assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
3133 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
3134 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
3135 if (pg_creates_subs
== session_map
.subs
.end()) {
3138 for (auto sub
: *pg_creates_subs
->second
) {
3139 check_pg_creates_sub(sub
);
3144 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
3146 dout(20) << __func__
<< " .. " << sub
->session
->inst
<< dendl
;
3147 assert(sub
->type
== "osd_pg_creates");
3148 // only send these if the OSD is up. we will check_subs() when they do
3149 // come up so they will get the creates then.
3150 if (sub
->session
->inst
.name
.is_osd() &&
3151 mon
->osdmon()->osdmap
.is_up(sub
->session
->inst
.name
.num())) {
3152 sub
->next
= send_pg_creates(sub
->session
->inst
.name
.num(),
3153 sub
->session
->con
.get(),
3158 void OSDMonitor::do_application_enable(int64_t pool_id
,
3159 const std::string
&app_name
)
3161 assert(paxos
->is_plugged() && is_writeable());
3163 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
3166 assert(osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
||
3167 pending_inc
.new_require_osd_release
>= CEPH_RELEASE_LUMINOUS
);
3169 auto pp
= osdmap
.get_pg_pool(pool_id
);
3170 assert(pp
!= nullptr);
3173 if (pending_inc
.new_pools
.count(pool_id
)) {
3174 p
= pending_inc
.new_pools
[pool_id
];
3177 p
.application_metadata
.insert({app_name
, {}});
3178 p
.last_change
= pending_inc
.epoch
;
3179 pending_inc
.new_pools
[pool_id
] = p
;
3182 unsigned OSDMonitor::scan_for_creating_pgs(
3183 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
3184 const mempool::osdmap::set
<int64_t>& removed_pools
,
3186 creating_pgs_t
* creating_pgs
) const
3188 unsigned queued
= 0;
3189 for (auto& p
: pools
) {
3190 int64_t poolid
= p
.first
;
3191 const pg_pool_t
& pool
= p
.second
;
3192 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
3193 pool
.get_type(), pool
.get_size());
3194 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
3197 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
3198 const auto created
= pool
.get_last_change();
3199 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
3200 dout(10) << __func__
<< " no change in pool " << poolid
3201 << " " << pool
<< dendl
;
3204 if (removed_pools
.count(poolid
)) {
3205 dout(10) << __func__
<< " pool is being removed: " << poolid
3206 << " " << pool
<< dendl
;
3209 dout(10) << __func__
<< " queueing pool create for " << poolid
3210 << " " << pool
<< dendl
;
3211 if (creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
3212 created
, modified
)) {
// Rebuild creating_pgs_by_osd_epoch: for every pg still being created,
// look up its current acting primary in 'mapping' and bucket the pg under
// (acting_primary, epoch).  If a pg's create target moved to a different
// primary, the epoch is bumped to the current mapping epoch so the new
// target gets a fresh create message.
// Holds creating_pgs_lock for the whole rebuild.
//
// NOTE(review): this block is a garbled extraction — original lines
// 3238-3239 and 3245-3252 (braces / break / else arm) were dropped, so the
// exact nesting around the two mapping.get_epoch() assignments below must
// be confirmed against the original file.
3219 void OSDMonitor::update_creating_pgs()
3221 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
3222 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
// fresh map to replace creating_pgs_by_osd_epoch at the end
3223 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
// serialize against readers of creating_pgs_by_osd_epoch (e.g. send_pg_creates)
3224 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3225 for (const auto& pg
: creating_pgs
.pgs
) {
3226 int acting_primary
= -1;
3227 auto pgid
= pg
.first
;
// epoch at which this pg's create was queued
3228 auto mapped
= pg
.second
.first
;
3229 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
// current acting primary for this pg
3230 mapping
.get(pgid
, nullptr, nullptr, nullptr, &acting_primary
);
3231 // check the previous creating_pgs, look for the target to whom the pg was
3232 // previously mapped
3233 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
3234 const auto last_acting_primary
= pgs_by_epoch
.first
;
3235 for (auto& pgs
: pgs_by_epoch
.second
) {
3236 if (pgs
.second
.count(pgid
)) {
3237 if (last_acting_primary
== acting_primary
) {
3240 dout(20) << __func__
<< " " << pgid
<< " "
3241 << " acting_primary:" << last_acting_primary
3242 << " -> " << acting_primary
<< dendl
;
3243 // note epoch if the target of the create message changed.
3244 mapped
= mapping
.get_epoch();
3249 mapped
= mapping
.get_epoch();
// record where (and as of which epoch) the create will be sent
3253 dout(10) << __func__
<< " will instruct osd." << acting_primary
3254 << " to create " << pgid
<< "@" << mapped
<< dendl
;
3255 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(pgid
);
// atomically swap in the rebuilt index and remember the mapping epoch
3257 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
3258 creating_pgs_epoch
= mapping
.get_epoch();
3261 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
3263 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
3264 << " " << creating_pgs_by_osd_epoch
<< dendl
;
3265 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
3266 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
3267 dout(20) << __func__
3268 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
3269 // the subscribers will be updated when the mapping is completed anyway
3272 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
3273 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
3275 assert(!creating_pgs_by_epoch
->second
.empty());
3277 MOSDPGCreate
*m
= nullptr;
3279 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
3280 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
3281 auto epoch
= epoch_pgs
->first
;
3282 auto& pgs
= epoch_pgs
->second
;
3283 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3284 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
3286 for (auto& pg
: pgs
) {
3288 m
= new MOSDPGCreate(creating_pgs_epoch
);
3289 // Need the create time from the monitor using its clock to set
3290 // last_scrub_stamp upon pg creation.
3291 auto create
= creating_pgs
.pgs
.find(pg
);
3292 assert(create
!= creating_pgs
.pgs
.end());
3293 m
->mkpg
.emplace(pg
, pg_create_t
{create
->second
.first
, pg
, 0});
3294 m
->ctimes
.emplace(pg
, create
->second
.second
);
3295 dout(20) << __func__
<< " will create " << pg
3296 << " at " << create
->second
.first
<< dendl
;
3300 dout(20) << __func__
<< " osd." << osd
<< " from " << next
3301 << " has nothing to send" << dendl
;
3304 con
->send_message(m
);
3305 // sub is current through last + 1
3312 void OSDMonitor::tick()
3314 if (!is_active()) return;
3316 dout(10) << osdmap
<< dendl
;
3318 if (!mon
->is_leader()) return;
3320 bool do_propose
= false;
3321 utime_t now
= ceph_clock_now();
3323 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
3324 mon
->monmap
->get_required_features().contains_all(
3325 ceph::features::mon::FEATURE_LUMINOUS
)) {
3326 if (handle_osd_timeouts(now
, last_osd_report
)) {
3330 if (!osdmap
.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS
) &&
3331 osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
3332 mon
->mgrstatmon()->is_readable() &&
3333 mon
->mgrstatmon()->definitely_converted_snapsets()) {
3334 dout(1) << __func__
<< " all snapsets converted, setting purged_snapdirs"
3336 add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS
);
3341 if (check_failures(now
))
3344 // mark down osds out?
3346 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
3347 * influence at all. The decision is made based on the ratio of "in" osds,
3348 * and the function returns false if this ratio is lower that the minimum
3349 * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
3351 if (can_mark_out(-1)) {
3352 set
<int> down_cache
; // quick cache of down subtrees
3354 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
3355 while (i
!= down_pending_out
.end()) {
3361 if (osdmap
.is_down(o
) &&
3364 utime_t
orig_grace(g_conf
->mon_osd_down_out_interval
, 0);
3365 utime_t grace
= orig_grace
;
3366 double my_grace
= 0.0;
3368 if (g_conf
->mon_osd_adjust_down_out_interval
) {
3369 // scale grace period the same way we do the heartbeat grace.
3370 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
3371 double halflife
= (double)g_conf
->mon_osd_laggy_halflife
;
3372 double decay_k
= ::log(.5) / halflife
;
3373 double decay
= exp((double)down
* decay_k
);
3374 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
3375 << " down for " << down
<< " decay " << decay
<< dendl
;
3376 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3380 // is this an entire large subtree down?
3381 if (g_conf
->mon_osd_down_out_subtree_limit
.length()) {
3382 int type
= osdmap
.crush
->get_type_id(g_conf
->mon_osd_down_out_subtree_limit
);
3384 if (osdmap
.containing_subtree_is_down(g_ceph_context
, o
, type
, &down_cache
)) {
3385 dout(10) << "tick entire containing " << g_conf
->mon_osd_down_out_subtree_limit
3386 << " subtree for osd." << o
<< " is down; resetting timer" << dendl
;
3387 // reset timer, too.
3388 down_pending_out
[o
] = now
;
3394 bool down_out
= !osdmap
.is_destroyed(o
) &&
3395 g_conf
->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
3396 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
3397 g_conf
->mon_osd_destroyed_out_interval
> 0 &&
3398 // this is not precise enough as we did not make a note when this osd
3399 // was marked as destroyed, but let's not bother with that
3400 // complexity for now.
3401 down
.sec() >= g_conf
->mon_osd_destroyed_out_interval
;
3402 if (down_out
|| destroyed_out
) {
3403 dout(10) << "tick marking osd." << o
<< " OUT after " << down
3404 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
3405 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
3407 // set the AUTOOUT bit.
3408 if (pending_inc
.new_state
.count(o
) == 0)
3409 pending_inc
.new_state
[o
] = 0;
3410 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
3412 // remember previous weight
3413 if (pending_inc
.new_xinfo
.count(o
) == 0)
3414 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
3415 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
3419 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
3420 << int(down
.sec()) << " seconds)";
3425 down_pending_out
.erase(o
);
3428 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
3431 // expire blacklisted items?
3432 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
3433 p
!= osdmap
.blacklist
.end();
3435 if (p
->second
< now
) {
3436 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
3437 pending_inc
.old_blacklist
.push_back(p
->first
);
3442 // if map full setting has changed, get that info out there!
3443 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
&&
3444 mon
->pgservice
->is_readable()) {
3445 // for pre-luminous compat only!
3446 if (mon
->pgservice
->have_full_osds()) {
3447 dout(5) << "There are full osds, setting full flag" << dendl
;
3448 add_flag(CEPH_OSDMAP_FULL
);
3449 } else if (osdmap
.test_flag(CEPH_OSDMAP_FULL
)){
3450 dout(10) << "No full osds, removing full flag" << dendl
;
3451 remove_flag(CEPH_OSDMAP_FULL
);
3454 if (mon
->pgservice
->have_nearfull_osds()) {
3455 dout(5) << "There are near full osds, setting nearfull flag" << dendl
;
3456 add_flag(CEPH_OSDMAP_NEARFULL
);
3457 } else if (osdmap
.test_flag(CEPH_OSDMAP_NEARFULL
)){
3458 dout(10) << "No near full osds, removing nearfull flag" << dendl
;
3459 remove_flag(CEPH_OSDMAP_NEARFULL
);
3461 if (pending_inc
.new_flags
!= -1 &&
3462 (pending_inc
.new_flags
^ osdmap
.flags
) & (CEPH_OSDMAP_FULL
| CEPH_OSDMAP_NEARFULL
)) {
3463 dout(1) << "New setting for" <<
3464 (pending_inc
.new_flags
& CEPH_OSDMAP_FULL
? " CEPH_OSDMAP_FULL" : "") <<
3465 (pending_inc
.new_flags
& CEPH_OSDMAP_NEARFULL
? " CEPH_OSDMAP_NEARFULL" : "")
3466 << " -- doing propose" << dendl
;
3471 if (update_pools_status())
3475 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
3479 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
3480 std::map
<int,utime_t
> &last_osd_report
)
3482 utime_t
timeo(g_conf
->mon_osd_report_timeout
, 0);
3483 if (now
- mon
->get_leader_since() < timeo
) {
3484 // We haven't been the leader for long enough to consider OSD timeouts
3488 int max_osd
= osdmap
.get_max_osd();
3489 bool new_down
= false;
3491 for (int i
=0; i
< max_osd
; ++i
) {
3492 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
3493 if (!osdmap
.exists(i
)) {
3494 last_osd_report
.erase(i
); // if any
3497 if (!osdmap
.is_up(i
))
3499 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
3500 if (t
== last_osd_report
.end()) {
3501 // it wasn't in the map; start the timer.
3502 last_osd_report
[i
] = now
;
3503 } else if (can_mark_down(i
)) {
3504 utime_t diff
= now
- t
->second
;
3506 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
3507 << diff
<< " seconds";
3508 derr
<< "no beacon from osd." << i
<< " since " << t
->second
3509 << ", " << diff
<< " seconds ago. marking down" << dendl
;
3510 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
3518 void OSDMonitor::get_health(list
<pair
<health_status_t
,string
> >& summary
,
3519 list
<pair
<health_status_t
,string
> > *detail
,
3520 CephContext
*cct
) const
3522 int num_osds
= osdmap
.get_num_osds();
3524 if (num_osds
== 0) {
3525 summary
.push_back(make_pair(HEALTH_ERR
, "no osds"));
3527 int num_in_osds
= 0;
3528 int num_down_in_osds
= 0;
3530 set
<int> down_in_osds
;
3531 set
<int> up_in_osds
;
3532 set
<int> subtree_up
;
3533 unordered_map
<int, set
<int> > subtree_type_down
;
3534 unordered_map
<int, int> num_osds_subtree
;
3535 int max_type
= osdmap
.crush
->get_max_type_id();
3537 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3538 if (!osdmap
.exists(i
)) {
3539 if (osdmap
.crush
->item_exists(i
)) {
3544 if (osdmap
.is_out(i
))
3547 if (down_in_osds
.count(i
) || up_in_osds
.count(i
))
3549 if (!osdmap
.is_up(i
)) {
3550 down_in_osds
.insert(i
);
3553 for (int type
= 0; type
<= max_type
; type
++) {
3554 if (!osdmap
.crush
->get_type_name(type
))
3556 int r
= osdmap
.crush
->get_immediate_parent_id(current
, &parent_id
);
3559 // break early if this parent is already marked as up
3560 if (subtree_up
.count(parent_id
))
3562 type
= osdmap
.crush
->get_bucket_type(parent_id
);
3563 if (!osdmap
.subtree_type_is_down(
3564 g_ceph_context
, parent_id
, type
,
3565 &down_in_osds
, &up_in_osds
, &subtree_up
, &subtree_type_down
))
3567 current
= parent_id
;
3572 // calculate the number of down osds in each down subtree and
3573 // store it in num_osds_subtree
3574 for (int type
= 1; type
<= max_type
; type
++) {
3575 if (!osdmap
.crush
->get_type_name(type
))
3577 for (auto j
= subtree_type_down
[type
].begin();
3578 j
!= subtree_type_down
[type
].end();
3582 int num
= osdmap
.crush
->get_children(*j
, &children
);
3583 num_osds_subtree
[*j
] = num
;
3587 int num_children
= osdmap
.crush
->get_children(*j
, &children
);
3588 if (num_children
== 0)
3590 for (auto l
= children
.begin(); l
!= children
.end(); ++l
) {
3591 if (num_osds_subtree
[*l
] > 0) {
3592 num
= num
+ num_osds_subtree
[*l
];
3595 num_osds_subtree
[*j
] = num
;
3599 num_down_in_osds
= down_in_osds
.size();
3600 assert(num_down_in_osds
<= num_in_osds
);
3601 if (num_down_in_osds
> 0) {
3602 // summary of down subtree types and osds
3603 for (int type
= max_type
; type
> 0; type
--) {
3604 if (!osdmap
.crush
->get_type_name(type
))
3606 if (subtree_type_down
[type
].size() > 0) {
3608 ss
<< subtree_type_down
[type
].size() << " "
3609 << osdmap
.crush
->get_type_name(type
);
3610 if (subtree_type_down
[type
].size() > 1) {
3613 int sum_down_osds
= 0;
3614 for (auto j
= subtree_type_down
[type
].begin();
3615 j
!= subtree_type_down
[type
].end();
3617 sum_down_osds
= sum_down_osds
+ num_osds_subtree
[*j
];
3619 ss
<< " (" << sum_down_osds
<< " osds) down";
3620 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3624 ss
<< down_in_osds
.size() << " osds down";
3625 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3628 // details of down subtree types
3629 for (int type
= max_type
; type
> 0; type
--) {
3630 if (!osdmap
.crush
->get_type_name(type
))
3632 for (auto j
= subtree_type_down
[type
].rbegin();
3633 j
!= subtree_type_down
[type
].rend();
3636 ss
<< osdmap
.crush
->get_type_name(type
);
3638 ss
<< osdmap
.crush
->get_item_name(*j
);
3639 // at the top level, do not print location
3640 if (type
!= max_type
) {
3642 ss
<< osdmap
.crush
->get_full_location_ordered_string(*j
);
3645 int num
= num_osds_subtree
[*j
];
3646 ss
<< " (" << num
<< " osds)";
3648 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3651 // details of down osds
3652 for (auto it
= down_in_osds
.begin(); it
!= down_in_osds
.end(); ++it
) {
3654 ss
<< "osd." << *it
<< " (";
3655 ss
<< osdmap
.crush
->get_full_location_ordered_string(*it
);
3657 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3662 if (!osds
.empty()) {
3664 ss
<< osds
.size() << " osds exist in the crush map but not in the osdmap";
3665 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3667 ss
<< " (osds: " << osds
<< ")";
3668 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3672 // note: we leave it to ceph-mgr to generate details health warnings
3673 // with actual osd utilizations
3676 uint64_t warn_flags
=
3678 CEPH_OSDMAP_PAUSERD
|
3679 CEPH_OSDMAP_PAUSEWR
|
3680 CEPH_OSDMAP_PAUSEREC
|
3682 CEPH_OSDMAP_NODOWN
|
3685 CEPH_OSDMAP_NOBACKFILL
|
3686 CEPH_OSDMAP_NORECOVER
|
3687 CEPH_OSDMAP_NOSCRUB
|
3688 CEPH_OSDMAP_NODEEP_SCRUB
|
3689 CEPH_OSDMAP_NOTIERAGENT
|
3690 CEPH_OSDMAP_NOREBALANCE
;
3691 if (osdmap
.test_flag(warn_flags
)) {
3693 ss
<< osdmap
.get_flag_string(osdmap
.get_flags() & warn_flags
)
3695 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3697 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3700 // old crush tunables?
3701 if (g_conf
->mon_warn_on_legacy_crush_tunables
) {
3702 string min
= osdmap
.crush
->get_min_required_version();
3703 if (min
< g_conf
->mon_crush_min_required_version
) {
3705 ss
<< "crush map has legacy tunables (require " << min
3706 << ", min is " << g_conf
->mon_crush_min_required_version
<< ")";
3707 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3709 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3710 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3714 if (g_conf
->mon_warn_on_crush_straw_calc_version_zero
) {
3715 if (osdmap
.crush
->get_straw_calc_version() == 0) {
3717 ss
<< "crush map has straw_calc_version=0";
3718 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3720 ss
<< "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3721 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3726 // hit_set-less cache_mode?
3727 if (g_conf
->mon_warn_on_cache_pools_without_hit_sets
) {
3728 int problem_cache_pools
= 0;
3729 for (map
<int64_t, pg_pool_t
>::const_iterator p
= osdmap
.pools
.begin();
3730 p
!= osdmap
.pools
.end();
3732 const pg_pool_t
& info
= p
->second
;
3733 if (info
.cache_mode_requires_hit_set() &&
3734 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
3735 ++problem_cache_pools
;
3738 ss
<< "pool '" << osdmap
.get_pool_name(p
->first
)
3739 << "' with cache_mode " << info
.get_cache_mode_name()
3740 << " needs hit_set_type to be set but it is not";
3741 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3745 if (problem_cache_pools
) {
3747 ss
<< problem_cache_pools
<< " cache pools are missing hit_sets";
3748 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3752 if (osdmap
.crush
->has_multirule_rulesets()) {
3754 ss
<< "CRUSH map contains multirule rulesets";
3755 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3757 ss
<< "; please manually fix the map";
3758 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3762 // Not using 'sortbitwise' and should be?
3763 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
3764 (osdmap
.get_up_osd_features() &
3765 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
3767 ss
<< "no legacy OSD present but 'sortbitwise' flag is not set";
3768 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3771 // Warn if 'mon_osd_down_out_interval' is set to zero.
3772 // Having this option set to zero on the leader acts much like the
3773 // 'noout' flag. It's hard to figure out what's going wrong with clusters
3774 // without the 'noout' flag set but acting like that just the same, so
3775 // we report a HEALTH_WARN in case this option is set to zero.
3776 // This is an ugly hack to get the warning out, but until we find a way
3777 // to spread global options throughout the mon cluster and have all mons
3778 // using a base set of the same options, we need to work around this sort
3780 // There's also the obvious drawback that if this is set on a single
3781 // monitor on a 3-monitor cluster, this warning will only be shown every
3782 // third monitor connection.
3783 if (g_conf
->mon_warn_on_osd_down_out_interval_zero
&&
3784 g_conf
->mon_osd_down_out_interval
== 0) {
3786 ss
<< "mon." << mon
->name
<< " has mon_osd_down_out_interval set to 0";
3787 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3789 ss
<< "; this has the same effect as the 'noout' flag";
3790 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3794 // warn about upgrade flags that can be set but are not.
3795 if (g_conf
->mon_debug_no_require_luminous
) {
3796 // ignore these checks
3797 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
) &&
3798 osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
3799 string msg
= "all OSDs are running luminous or later but"
3800 " require_osd_release < luminous";
3801 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
3803 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
3805 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
) &&
3806 osdmap
.require_osd_release
< CEPH_RELEASE_KRAKEN
) {
3807 string msg
= "all OSDs are running kraken or later but"
3808 " require_osd_release < kraken";
3809 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
3811 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
3813 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
) &&
3814 osdmap
.require_osd_release
< CEPH_RELEASE_JEWEL
) {
3815 string msg
= "all OSDs are running jewel or later but"
3816 " require_osd_release < jewel";
3817 summary
.push_back(make_pair(HEALTH_WARN
, msg
));
3819 detail
->push_back(make_pair(HEALTH_WARN
, msg
));
3823 for (auto it
: osdmap
.get_pools()) {
3824 const pg_pool_t
&pool
= it
.second
;
3825 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
3826 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
3828 ss
<< "pool '" << pool_name
<< "' is full";
3829 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3831 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3837 void OSDMonitor::dump_info(Formatter
*f
)
3839 f
->open_object_section("osdmap");
3843 f
->open_array_section("osd_metadata");
3844 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
3845 if (osdmap
.exists(i
)) {
3846 f
->open_object_section("osd");
3847 f
->dump_unsigned("id", i
);
3848 dump_osd_metadata(i
, f
, NULL
);
3854 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
3855 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
3857 f
->open_object_section("crushmap");
3858 osdmap
.crush
->dump(f
);
// Keys accepted by "osd pool get"/"osd pool set".  The enumerator order is
// part of the implicit contract with the option-name lookup tables below,
// so append new choices at the end rather than reordering.
enum osd_pool_get_choices {
  SIZE,
  MIN_SIZE,
  CRASH_REPLAY_INTERVAL,
  PG_NUM,
  PGP_NUM,
  CRUSH_RULE,
  HASHPSPOOL,
  NODELETE,
  NOPGCHANGE,
  NOSIZECHANGE,
  WRITE_FADVISE_DONTNEED,
  NOSCRUB,
  NODEEP_SCRUB,
  HIT_SET_TYPE,
  HIT_SET_PERIOD,
  HIT_SET_COUNT,
  HIT_SET_FPP,
  USE_GMT_HITSET,
  AUID,
  TARGET_MAX_OBJECTS,
  TARGET_MAX_BYTES,
  CACHE_TARGET_DIRTY_RATIO,
  CACHE_TARGET_DIRTY_HIGH_RATIO,
  CACHE_TARGET_FULL_RATIO,
  CACHE_MIN_FLUSH_AGE,
  CACHE_MIN_EVICT_AGE,
  ERASURE_CODE_PROFILE,
  MIN_READ_RECENCY_FOR_PROMOTE,
  MIN_WRITE_RECENCY_FOR_PROMOTE,
  FAST_READ,
  HIT_SET_GRADE_DECAY_RATE,
  HIT_SET_SEARCH_LAST_N,
  SCRUB_MIN_INTERVAL,
  SCRUB_MAX_INTERVAL,
  DEEP_SCRUB_INTERVAL,
  RECOVERY_PRIORITY,
  RECOVERY_OP_PRIORITY,
  SCRUB_PRIORITY,
  COMPRESSION_MODE,
  COMPRESSION_ALGORITHM,
  COMPRESSION_REQUIRED_RATIO,
  COMPRESSION_MAX_BLOB_SIZE,
  COMPRESSION_MIN_BLOB_SIZE,
  CSUM_TYPE,
  CSUM_MAX_BLOCK,
  CSUM_MIN_BLOCK
};
3882 std::set
<osd_pool_get_choices
>
3883 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
3884 const std::set
<osd_pool_get_choices
>& second
)
3886 std::set
<osd_pool_get_choices
> result
;
3887 std::set_difference(first
.begin(), first
.end(),
3888 second
.begin(), second
.end(),
3889 std::inserter(result
, result
.end()));
3895 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
3897 op
->mark_osdmon_event(__func__
);
3898 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
3901 stringstream ss
, ds
;
3903 map
<string
, cmd_vartype
> cmdmap
;
3904 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
3905 string rs
= ss
.str();
3906 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
3910 MonSession
*session
= m
->get_session();
3912 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
3917 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
3920 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
3921 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
3923 if (prefix
== "osd stat") {
3924 osdmap
.print_summary(f
.get(), ds
, "");
3930 else if (prefix
== "osd perf" ||
3931 prefix
== "osd blocked-by") {
3932 r
= mon
->pgservice
->process_pg_command(prefix
, cmdmap
,
3933 osdmap
, f
.get(), &ss
, &rdata
);
3935 else if (prefix
== "osd dump" ||
3936 prefix
== "osd tree" ||
3937 prefix
== "osd ls" ||
3938 prefix
== "osd getmap" ||
3939 prefix
== "osd getcrushmap" ||
3940 prefix
== "osd ls-tree") {
3945 cmd_getval(g_ceph_context
, cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
3948 bufferlist osdmap_bl
;
3949 int err
= get_version_full(epoch
, osdmap_bl
);
3950 if (err
== -ENOENT
) {
3952 ss
<< "there is no map for epoch " << epoch
;
3956 assert(osdmap_bl
.length());
3959 if (epoch
== osdmap
.get_epoch()) {
3963 p
->decode(osdmap_bl
);
3966 auto sg
= make_scope_guard([&] {
3972 if (prefix
== "osd dump") {
3975 f
->open_object_section("osdmap");
3985 } else if (prefix
== "osd ls") {
3987 f
->open_array_section("osds");
3988 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3989 if (osdmap
.exists(i
)) {
3990 f
->dump_int("osd", i
);
3997 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
3998 if (osdmap
.exists(i
)) {
4007 } else if (prefix
== "osd tree") {
4008 vector
<string
> states
;
4009 cmd_getval(g_ceph_context
, cmdmap
, "states", states
);
4010 unsigned filter
= 0;
4011 for (auto& s
: states
) {
4013 filter
|= OSDMap::DUMP_UP
;
4014 } else if (s
== "down") {
4015 filter
|= OSDMap::DUMP_DOWN
;
4016 } else if (s
== "in") {
4017 filter
|= OSDMap::DUMP_IN
;
4018 } else if (s
== "out") {
4019 filter
|= OSDMap::DUMP_OUT
;
4020 } else if (s
== "destroyed") {
4021 filter
|= OSDMap::DUMP_DESTROYED
;
4023 ss
<< "unrecognized state '" << s
<< "'";
4028 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
4029 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
4030 ss
<< "cannot specify both 'in' and 'out'";
4034 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
4035 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
4036 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
4037 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
4038 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
4039 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
4040 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
4045 f
->open_object_section("tree");
4046 p
->print_tree(f
.get(), NULL
, filter
);
4050 p
->print_tree(NULL
, &ds
, filter
);
4053 } else if (prefix
== "osd getmap") {
4054 rdata
.append(osdmap_bl
);
4055 ss
<< "got osdmap epoch " << p
->get_epoch();
4056 } else if (prefix
== "osd getcrushmap") {
4057 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
4058 ss
<< p
->get_crush_version();
4059 } else if (prefix
== "osd ls-tree") {
4061 cmd_getval(g_ceph_context
, cmdmap
, "name", bucket_name
);
4063 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
4065 ss
<< "\"" << bucket_name
<< "\" does not exist";
4068 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
4073 f
->open_array_section("osds");
4074 for (auto &i
: osds
) {
4075 if (osdmap
.exists(i
)) {
4076 f
->dump_int("osd", i
);
4083 for (auto &i
: osds
) {
4084 if (osdmap
.exists(i
)) {
4095 } else if (prefix
== "osd df") {
4097 cmd_getval(g_ceph_context
, cmdmap
, "output_method", method
);
4098 print_osd_utilization(osdmap
, mon
->pgservice
, ds
,
4099 f
.get(), method
== "tree");
4101 } else if (prefix
== "osd getmaxosd") {
4103 f
->open_object_section("getmaxosd");
4104 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4105 f
->dump_int("max_osd", osdmap
.get_max_osd());
4109 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
4112 } else if (prefix
== "osd utilization") {
4114 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
4121 } else if (prefix
== "osd find") {
4123 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
4124 ss
<< "unable to parse osd id value '"
4125 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4129 if (!osdmap
.exists(osd
)) {
4130 ss
<< "osd." << osd
<< " does not exist";
4135 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4136 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4137 f
->open_object_section("osd_location");
4138 f
->dump_int("osd", osd
);
4139 f
->dump_stream("ip") << osdmap
.get_addr(osd
);
4140 f
->open_object_section("crush_location");
4141 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
4142 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
4143 f
->dump_string(p
->first
.c_str(), p
->second
);
4147 } else if (prefix
== "osd metadata") {
4149 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
4150 !cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
4151 ss
<< "unable to parse osd id value '"
4152 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
4156 if (osd
>= 0 && !osdmap
.exists(osd
)) {
4157 ss
<< "osd." << osd
<< " does not exist";
4162 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4163 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4165 f
->open_object_section("osd_metadata");
4166 f
->dump_unsigned("id", osd
);
4167 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
4173 f
->open_array_section("osd_metadata");
4174 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
4175 if (osdmap
.exists(i
)) {
4176 f
->open_object_section("osd");
4177 f
->dump_unsigned("id", i
);
4178 r
= dump_osd_metadata(i
, f
.get(), NULL
);
4179 if (r
== -EINVAL
|| r
== -ENOENT
) {
4180 // Drop error, continue to get other daemons' metadata
4181 dout(4) << "No metadata for osd." << i
<< dendl
;
4193 } else if (prefix
== "osd versions") {
4195 f
.reset(Formatter::create("json-pretty"));
4196 count_metadata("ceph_version", f
.get());
4199 } else if (prefix
== "osd count-metadata") {
4201 f
.reset(Formatter::create("json-pretty"));
4203 cmd_getval(g_ceph_context
, cmdmap
, "property", field
);
4204 count_metadata(field
, f
.get());
4207 } else if (prefix
== "osd map") {
4208 string poolstr
, objstr
, namespacestr
;
4209 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
4210 cmd_getval(g_ceph_context
, cmdmap
, "object", objstr
);
4211 cmd_getval(g_ceph_context
, cmdmap
, "nspace", namespacestr
);
4213 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4215 ss
<< "pool " << poolstr
<< " does not exist";
4219 object_locator_t
oloc(pool
, namespacestr
);
4220 object_t
oid(objstr
);
4221 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
4222 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4223 vector
<int> up
, acting
;
4225 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
4228 if (!namespacestr
.empty())
4229 fullobjname
= namespacestr
+ string("/") + oid
.name
;
4231 fullobjname
= oid
.name
;
4233 f
->open_object_section("osd_map");
4234 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4235 f
->dump_string("pool", poolstr
);
4236 f
->dump_int("pool_id", pool
);
4237 f
->dump_stream("objname") << fullobjname
;
4238 f
->dump_stream("raw_pgid") << pgid
;
4239 f
->dump_stream("pgid") << mpgid
;
4240 f
->open_array_section("up");
4241 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
4242 f
->dump_int("osd", *p
);
4244 f
->dump_int("up_primary", up_p
);
4245 f
->open_array_section("acting");
4246 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
4247 f
->dump_int("osd", *p
);
4249 f
->dump_int("acting_primary", acting_p
);
4250 f
->close_section(); // osd_map
4253 ds
<< "osdmap e" << osdmap
.get_epoch()
4254 << " pool '" << poolstr
<< "' (" << pool
<< ")"
4255 << " object '" << fullobjname
<< "' ->"
4256 << " pg " << pgid
<< " (" << mpgid
<< ")"
4257 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
4258 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
4262 } else if (prefix
== "pg map") {
4265 cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
);
4266 if (!pgid
.parse(pgidstr
.c_str())) {
4267 ss
<< "invalid pgid '" << pgidstr
<< "'";
4271 vector
<int> up
, acting
;
4272 if (!osdmap
.have_pg_pool(pgid
.pool())) {
4273 ss
<< "pg '" << pgidstr
<< "' does not exist";
4277 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
4278 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
4280 f
->open_object_section("pg_map");
4281 f
->dump_unsigned("epoch", osdmap
.get_epoch());
4282 f
->dump_stream("raw_pgid") << pgid
;
4283 f
->dump_stream("pgid") << mpgid
;
4284 f
->open_array_section("up");
4285 for (auto osd
: up
) {
4286 f
->dump_int("up_osd", osd
);
4289 f
->open_array_section("acting");
4290 for (auto osd
: acting
) {
4291 f
->dump_int("acting_osd", osd
);
4297 ds
<< "osdmap e" << osdmap
.get_epoch()
4298 << " pg " << pgid
<< " (" << mpgid
<< ")"
4299 << " -> up " << up
<< " acting " << acting
;
4304 } else if (prefix
== "osd scrub" ||
4305 prefix
== "osd deep-scrub" ||
4306 prefix
== "osd repair") {
4308 cmd_getval(g_ceph_context
, cmdmap
, "who", whostr
);
4309 vector
<string
> pvec
;
4310 get_str_vec(prefix
, pvec
);
4312 if (whostr
== "*" || whostr
== "all" || whostr
== "any") {
4315 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++)
4316 if (osdmap
.is_up(i
)) {
4317 ss
<< (c
++ ? "," : "") << i
;
4318 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4319 pvec
.back() == "repair",
4320 pvec
.back() == "deep-scrub"),
4321 osdmap
.get_inst(i
));
4324 ss
<< " instructed to " << pvec
.back();
4326 long osd
= parse_osd_id(whostr
.c_str(), &ss
);
4329 } else if (osdmap
.is_up(osd
)) {
4330 mon
->try_send_message(new MOSDScrub(osdmap
.get_fsid(),
4331 pvec
.back() == "repair",
4332 pvec
.back() == "deep-scrub"),
4333 osdmap
.get_inst(osd
));
4334 ss
<< "osd." << osd
<< " instructed to " << pvec
.back();
4336 ss
<< "osd." << osd
<< " is not up";
4340 } else if (prefix
== "osd lspools") {
4342 cmd_getval(g_ceph_context
, cmdmap
, "auid", auid
, int64_t(0));
4344 f
->open_array_section("pools");
4345 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
4346 p
!= osdmap
.pools
.end();
4348 if (!auid
|| p
->second
.auid
== (uint64_t)auid
) {
4350 f
->open_object_section("pool");
4351 f
->dump_int("poolnum", p
->first
);
4352 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
4355 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
] << ',';
4364 } else if (prefix
== "osd blacklist ls") {
4366 f
->open_array_section("blacklist");
4368 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
4369 p
!= osdmap
.blacklist
.end();
4372 f
->open_object_section("entry");
4373 f
->dump_stream("addr") << p
->first
;
4374 f
->dump_stream("until") << p
->second
;
4379 ss
<< p
->first
<< " " << p
->second
;
4389 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
4391 } else if (prefix
== "osd pool ls") {
4393 cmd_getval(g_ceph_context
, cmdmap
, "detail", detail
);
4394 if (!f
&& detail
== "detail") {
4396 osdmap
.print_pools(ss
);
4397 rdata
.append(ss
.str());
4400 f
->open_array_section("pools");
4401 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
4402 it
!= osdmap
.get_pools().end();
4405 if (detail
== "detail") {
4406 f
->open_object_section("pool");
4407 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4408 it
->second
.dump(f
.get());
4411 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
4414 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
4423 } else if (prefix
== "osd crush get-tunable") {
4425 cmd_getval(g_ceph_context
, cmdmap
, "tunable", tunable
);
4428 f
->open_object_section("tunable");
4429 if (tunable
== "straw_calc_version") {
4431 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
4433 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
4442 rdata
.append(rss
.str());
4446 } else if (prefix
== "osd pool get") {
4448 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
4449 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
4451 ss
<< "unrecognized pool '" << poolstr
<< "'";
4456 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
4458 cmd_getval(g_ceph_context
, cmdmap
, "var", var
);
4460 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
4461 const choices_map_t ALL_CHOICES
= {
4463 {"min_size", MIN_SIZE
},
4464 {"crash_replay_interval", CRASH_REPLAY_INTERVAL
},
4465 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
4466 {"crush_rule", CRUSH_RULE
},
4467 {"hashpspool", HASHPSPOOL
}, {"nodelete", NODELETE
},
4468 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
4469 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
4470 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
4471 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
4472 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
4473 {"use_gmt_hitset", USE_GMT_HITSET
},
4474 {"auid", AUID
}, {"target_max_objects", TARGET_MAX_OBJECTS
},
4475 {"target_max_bytes", TARGET_MAX_BYTES
},
4476 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
4477 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
4478 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
4479 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
4480 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
4481 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
4482 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
4483 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
4484 {"fast_read", FAST_READ
},
4485 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
4486 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
4487 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
4488 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
4489 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
4490 {"recovery_priority", RECOVERY_PRIORITY
},
4491 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
4492 {"scrub_priority", SCRUB_PRIORITY
},
4493 {"compression_mode", COMPRESSION_MODE
},
4494 {"compression_algorithm", COMPRESSION_ALGORITHM
},
4495 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
4496 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
4497 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
4498 {"csum_type", CSUM_TYPE
},
4499 {"csum_max_block", CSUM_MAX_BLOCK
},
4500 {"csum_min_block", CSUM_MIN_BLOCK
},
4503 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
4505 const choices_set_t ONLY_TIER_CHOICES
= {
4506 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
4507 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
4508 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
4509 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
4510 MIN_READ_RECENCY_FOR_PROMOTE
,
4511 MIN_WRITE_RECENCY_FOR_PROMOTE
,
4512 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
4514 const choices_set_t ONLY_ERASURE_CHOICES
= {
4515 ERASURE_CODE_PROFILE
4518 choices_set_t selected_choices
;
4520 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
4521 it
!= ALL_CHOICES
.end(); ++it
) {
4522 selected_choices
.insert(it
->second
);
4526 selected_choices
= subtract_second_from_first(selected_choices
,
4530 if(!p
->is_erasure()) {
4531 selected_choices
= subtract_second_from_first(selected_choices
,
4532 ONLY_ERASURE_CHOICES
);
4534 } else /* var != "all" */ {
4535 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
4536 osd_pool_get_choices selected
= found
->second
;
4538 if (!p
->is_tier() &&
4539 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
4540 ss
<< "pool '" << poolstr
4541 << "' is not a tier pool: variable not applicable";
4546 if (!p
->is_erasure() &&
4547 ONLY_ERASURE_CHOICES
.find(selected
)
4548 != ONLY_ERASURE_CHOICES
.end()) {
4549 ss
<< "pool '" << poolstr
4550 << "' is not a erasure pool: variable not applicable";
4555 selected_choices
.insert(selected
);
4559 for(choices_set_t::const_iterator it
= selected_choices
.begin();
4560 it
!= selected_choices
.end(); ++it
) {
4561 choices_map_t::const_iterator i
;
4562 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4563 if (i
->second
== *it
) {
4567 assert(i
!= ALL_CHOICES
.end());
4568 bool pool_opt
= pool_opts_t::is_opt_name(i
->first
);
4570 f
->open_object_section("pool");
4571 f
->dump_string("pool", poolstr
);
4572 f
->dump_int("pool_id", pool
);
4576 f
->dump_int("pg_num", p
->get_pg_num());
4579 f
->dump_int("pgp_num", p
->get_pgp_num());
4582 f
->dump_int("auid", p
->get_auid());
4585 f
->dump_int("size", p
->get_size());
4588 f
->dump_int("min_size", p
->get_min_size());
4590 case CRASH_REPLAY_INTERVAL
:
4591 f
->dump_int("crash_replay_interval",
4592 p
->get_crash_replay_interval());
4595 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
4596 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
4597 p
->get_crush_rule()));
4599 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
4606 case WRITE_FADVISE_DONTNEED
:
4609 f
->dump_string(i
->first
.c_str(),
4610 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
4613 case HIT_SET_PERIOD
:
4614 f
->dump_int("hit_set_period", p
->hit_set_period
);
4617 f
->dump_int("hit_set_count", p
->hit_set_count
);
4620 f
->dump_string("hit_set_type",
4621 HitSet::get_type_name(p
->hit_set_params
.get_type()));
4625 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
4626 BloomHitSet::Params
*bloomp
=
4627 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
4628 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
4629 } else if(var
!= "all") {
4631 ss
<< "hit set is not of type Bloom; " <<
4632 "invalid to get a false positive rate!";
4638 case USE_GMT_HITSET
:
4639 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
4641 case TARGET_MAX_OBJECTS
:
4642 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
4644 case TARGET_MAX_BYTES
:
4645 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
4647 case CACHE_TARGET_DIRTY_RATIO
:
4648 f
->dump_unsigned("cache_target_dirty_ratio_micro",
4649 p
->cache_target_dirty_ratio_micro
);
4650 f
->dump_float("cache_target_dirty_ratio",
4651 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
4653 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
4654 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
4655 p
->cache_target_dirty_high_ratio_micro
);
4656 f
->dump_float("cache_target_dirty_high_ratio",
4657 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
4659 case CACHE_TARGET_FULL_RATIO
:
4660 f
->dump_unsigned("cache_target_full_ratio_micro",
4661 p
->cache_target_full_ratio_micro
);
4662 f
->dump_float("cache_target_full_ratio",
4663 ((float)p
->cache_target_full_ratio_micro
/1000000));
4665 case CACHE_MIN_FLUSH_AGE
:
4666 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
4668 case CACHE_MIN_EVICT_AGE
:
4669 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
4671 case ERASURE_CODE_PROFILE
:
4672 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
4674 case MIN_READ_RECENCY_FOR_PROMOTE
:
4675 f
->dump_int("min_read_recency_for_promote",
4676 p
->min_read_recency_for_promote
);
4678 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
4679 f
->dump_int("min_write_recency_for_promote",
4680 p
->min_write_recency_for_promote
);
4683 f
->dump_int("fast_read", p
->fast_read
);
4685 case HIT_SET_GRADE_DECAY_RATE
:
4686 f
->dump_int("hit_set_grade_decay_rate",
4687 p
->hit_set_grade_decay_rate
);
4689 case HIT_SET_SEARCH_LAST_N
:
4690 f
->dump_int("hit_set_search_last_n",
4691 p
->hit_set_search_last_n
);
4693 case SCRUB_MIN_INTERVAL
:
4694 case SCRUB_MAX_INTERVAL
:
4695 case DEEP_SCRUB_INTERVAL
:
4696 case RECOVERY_PRIORITY
:
4697 case RECOVERY_OP_PRIORITY
:
4698 case SCRUB_PRIORITY
:
4699 case COMPRESSION_MODE
:
4700 case COMPRESSION_ALGORITHM
:
4701 case COMPRESSION_REQUIRED_RATIO
:
4702 case COMPRESSION_MAX_BLOB_SIZE
:
4703 case COMPRESSION_MIN_BLOB_SIZE
:
4705 case CSUM_MAX_BLOCK
:
4706 case CSUM_MIN_BLOCK
:
4707 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
4708 if (p
->opts
.is_set(key
)) {
4709 f
->open_object_section("pool");
4710 f
->dump_string("pool", poolstr
);
4711 f
->dump_int("pool_id", pool
);
4712 if(*it
== CSUM_TYPE
) {
4714 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
4715 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
4717 p
->opts
.dump(i
->first
, f
.get());
4731 for(choices_set_t::const_iterator it
= selected_choices
.begin();
4732 it
!= selected_choices
.end(); ++it
) {
4733 choices_map_t::const_iterator i
;
4736 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
4739 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
4742 ss
<< "auid: " << p
->get_auid() << "\n";
4745 ss
<< "size: " << p
->get_size() << "\n";
4748 ss
<< "min_size: " << p
->get_min_size() << "\n";
4750 case CRASH_REPLAY_INTERVAL
:
4751 ss
<< "crash_replay_interval: " <<
4752 p
->get_crash_replay_interval() << "\n";
4755 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
4756 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
4757 p
->get_crush_rule()) << "\n";
4759 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
4762 case HIT_SET_PERIOD
:
4763 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
4766 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
4769 ss
<< "hit_set_type: " <<
4770 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
4774 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
4775 BloomHitSet::Params
*bloomp
=
4776 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
4777 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
4778 } else if(var
!= "all") {
4779 ss
<< "hit set is not of type Bloom; " <<
4780 "invalid to get a false positive rate!";
4786 case USE_GMT_HITSET
:
4787 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
4789 case TARGET_MAX_OBJECTS
:
4790 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
4792 case TARGET_MAX_BYTES
:
4793 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
4795 case CACHE_TARGET_DIRTY_RATIO
:
4796 ss
<< "cache_target_dirty_ratio: "
4797 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
4799 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
4800 ss
<< "cache_target_dirty_high_ratio: "
4801 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
4803 case CACHE_TARGET_FULL_RATIO
:
4804 ss
<< "cache_target_full_ratio: "
4805 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
4807 case CACHE_MIN_FLUSH_AGE
:
4808 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
4810 case CACHE_MIN_EVICT_AGE
:
4811 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
4813 case ERASURE_CODE_PROFILE
:
4814 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
4816 case MIN_READ_RECENCY_FOR_PROMOTE
:
4817 ss
<< "min_read_recency_for_promote: " <<
4818 p
->min_read_recency_for_promote
<< "\n";
4820 case HIT_SET_GRADE_DECAY_RATE
:
4821 ss
<< "hit_set_grade_decay_rate: " <<
4822 p
->hit_set_grade_decay_rate
<< "\n";
4824 case HIT_SET_SEARCH_LAST_N
:
4825 ss
<< "hit_set_search_last_n: " <<
4826 p
->hit_set_search_last_n
<< "\n";
4832 case WRITE_FADVISE_DONTNEED
:
4835 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4836 if (i
->second
== *it
)
4839 assert(i
!= ALL_CHOICES
.end());
4840 ss
<< i
->first
<< ": " <<
4841 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
4842 "true" : "false") << "\n";
4844 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
4845 ss
<< "min_write_recency_for_promote: " <<
4846 p
->min_write_recency_for_promote
<< "\n";
4849 ss
<< "fast_read: " << p
->fast_read
<< "\n";
4851 case SCRUB_MIN_INTERVAL
:
4852 case SCRUB_MAX_INTERVAL
:
4853 case DEEP_SCRUB_INTERVAL
:
4854 case RECOVERY_PRIORITY
:
4855 case RECOVERY_OP_PRIORITY
:
4856 case SCRUB_PRIORITY
:
4857 case COMPRESSION_MODE
:
4858 case COMPRESSION_ALGORITHM
:
4859 case COMPRESSION_REQUIRED_RATIO
:
4860 case COMPRESSION_MAX_BLOB_SIZE
:
4861 case COMPRESSION_MIN_BLOB_SIZE
:
4863 case CSUM_MAX_BLOCK
:
4864 case CSUM_MIN_BLOCK
:
4865 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
4866 if (i
->second
== *it
)
4869 assert(i
!= ALL_CHOICES
.end());
4871 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
4872 if (p
->opts
.is_set(key
)) {
4873 if(key
== pool_opts_t::CSUM_TYPE
) {
4875 p
->opts
.get(key
, &val
);
4876 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
4878 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
4884 rdata
.append(ss
.str());
4889 } else if (prefix
== "osd pool stats") {
4890 r
= mon
->pgservice
->process_pg_command(prefix
, cmdmap
,
4891 osdmap
, f
.get(), &ss
, &rdata
);
4892 } else if (prefix
== "osd pool get-quota") {
4894 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool_name
);
4896 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
4898 assert(poolid
== -ENOENT
);
4899 ss
<< "unrecognized pool '" << pool_name
<< "'";
4903 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
4906 f
->open_object_section("pool_quotas");
4907 f
->dump_string("pool_name", pool_name
);
4908 f
->dump_unsigned("pool_id", poolid
);
4909 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
4910 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
4915 rs
<< "quotas for pool '" << pool_name
<< "':\n"
4916 << " max objects: ";
4917 if (p
->quota_max_objects
== 0)
4920 rs
<< si_t(p
->quota_max_objects
) << " objects";
4923 if (p
->quota_max_bytes
== 0)
4926 rs
<< si_t(p
->quota_max_bytes
) << "B";
4927 rdata
.append(rs
.str());
4931 } else if (prefix
== "osd crush rule list" ||
4932 prefix
== "osd crush rule ls") {
4934 f
->open_array_section("rules");
4935 osdmap
.crush
->list_rules(f
.get());
4940 osdmap
.crush
->list_rules(&ss
);
4941 rdata
.append(ss
.str());
4943 } else if (prefix
== "osd crush rule ls-by-class") {
4945 cmd_getval(g_ceph_context
, cmdmap
, "class", class_name
);
4946 if (class_name
.empty()) {
4947 ss
<< "no class specified";
4952 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
4954 ss
<< "failed to get rules by class '" << class_name
<< "'";
4958 f
->open_array_section("rules");
4959 for (auto &rule
: rules
) {
4960 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
4966 for (auto &rule
: rules
) {
4967 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
4969 rdata
.append(rs
.str());
4971 } else if (prefix
== "osd crush rule dump") {
4973 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
4975 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4976 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4978 f
->open_array_section("rules");
4979 osdmap
.crush
->dump_rules(f
.get());
4982 int ruleno
= osdmap
.crush
->get_rule_id(name
);
4984 ss
<< "unknown crush rule '" << name
<< "'";
4988 osdmap
.crush
->dump_rule(ruleno
, f
.get());
4993 rdata
.append(rs
.str());
4994 } else if (prefix
== "osd crush dump") {
4996 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
4997 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
4998 f
->open_object_section("crush_map");
4999 osdmap
.crush
->dump(f
.get());
5004 rdata
.append(rs
.str());
5005 } else if (prefix
== "osd crush show-tunables") {
5007 cmd_getval(g_ceph_context
, cmdmap
, "format", format
);
5008 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5009 f
->open_object_section("crush_map_tunables");
5010 osdmap
.crush
->dump_tunables(f
.get());
5015 rdata
.append(rs
.str());
5016 } else if (prefix
== "osd crush tree") {
5018 cmd_getval(g_ceph_context
, cmdmap
, "shadow", shadow
);
5019 bool show_shadow
= shadow
== "--show-shadow";
5020 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5022 osdmap
.crush
->dump_tree(nullptr,
5024 osdmap
.get_pool_names(),
5029 osdmap
.crush
->dump_tree(&ss
,
5031 osdmap
.get_pool_names(),
5033 rdata
.append(ss
.str());
5035 } else if (prefix
== "osd crush ls") {
5037 if (!cmd_getval(g_ceph_context
, cmdmap
, "node", name
)) {
5038 ss
<< "no node specified";
5042 if (!osdmap
.crush
->name_exists(name
)) {
5043 ss
<< "node '" << name
<< "' does not exist";
5047 int id
= osdmap
.crush
->get_item_id(name
);
5050 result
.push_back(id
);
5052 int num
= osdmap
.crush
->get_bucket_size(id
);
5053 for (int i
= 0; i
< num
; ++i
) {
5054 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
5058 f
->open_array_section("items");
5059 for (auto i
: result
) {
5060 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
5066 for (auto i
: result
) {
5067 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
5069 rdata
.append(ss
.str());
5072 } else if (prefix
== "osd crush class ls") {
5073 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5074 f
->open_array_section("crush_classes");
5075 for (auto i
: osdmap
.crush
->class_name
)
5076 f
->dump_string("class", i
.second
);
5079 } else if (prefix
== "osd crush class ls-osd") {
5081 cmd_getval(g_ceph_context
, cmdmap
, "class", name
);
5083 osdmap
.crush
->get_devices_by_class(name
, &osds
);
5085 f
->open_array_section("osds");
5086 for (auto &osd
: osds
)
5087 f
->dump_int("osd", osd
);
5092 for (auto &osd
: osds
) {
5100 } else if (prefix
== "osd erasure-code-profile ls") {
5101 const auto &profiles
= osdmap
.get_erasure_code_profiles();
5103 f
->open_array_section("erasure-code-profiles");
5104 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
5106 f
->dump_string("profile", i
->first
.c_str());
5108 rdata
.append(i
->first
+ "\n");
5115 rdata
.append(rs
.str());
5117 } else if (prefix
== "osd crush weight-set ls") {
5118 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5120 f
->open_array_section("weight_sets");
5121 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5122 f
->dump_string("pool", "(compat)");
5124 for (auto& i
: osdmap
.crush
->choose_args
) {
5126 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
5133 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
5136 for (auto& i
: osdmap
.crush
->choose_args
) {
5138 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
5141 rdata
.append(rs
.str());
5143 } else if (prefix
== "osd crush weight-set dump") {
5144 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5146 osdmap
.crush
->dump_choose_args(f
.get());
5148 } else if (prefix
== "osd erasure-code-profile get") {
5150 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
5151 if (!osdmap
.has_erasure_code_profile(name
)) {
5152 ss
<< "unknown erasure code profile '" << name
<< "'";
5156 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
5158 f
->open_object_section("profile");
5159 for (map
<string
,string
>::const_iterator i
= profile
.begin();
5163 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
5165 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
5172 rdata
.append(rs
.str());
5174 } else if (prefix
== "osd pool application get") {
5175 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
5178 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool_name
);
5180 cmd_getval(g_ceph_context
, cmdmap
, "app", app
);
5182 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
5184 if (pool_name
.empty()) {
5186 f
->open_object_section("pools");
5187 for (const auto &pool
: osdmap
.pools
) {
5188 std::string
name("<unknown>");
5189 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
5190 if (pni
!= osdmap
.pool_name
.end())
5192 f
->open_object_section(name
.c_str());
5193 for (auto &app_pair
: pool
.second
.application_metadata
) {
5194 f
->open_object_section(app_pair
.first
.c_str());
5195 for (auto &kv_pair
: app_pair
.second
) {
5196 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5200 f
->close_section(); // name
5202 f
->close_section(); // pools
5205 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
5207 ss
<< "unrecognized pool '" << pool_name
<< "'";
5211 auto p
= osdmap
.get_pg_pool(pool
);
5214 f
->open_object_section(pool_name
.c_str());
5215 for (auto &app_pair
: p
->application_metadata
) {
5216 f
->open_object_section(app_pair
.first
.c_str());
5217 for (auto &kv_pair
: app_pair
.second
) {
5218 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5220 f
->close_section(); // application
5222 f
->close_section(); // pool_name
5227 auto app_it
= p
->application_metadata
.find(app
);
5228 if (app_it
== p
->application_metadata
.end()) {
5229 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
5233 // filter by pool + app
5235 f
->open_object_section(app_it
->first
.c_str());
5236 for (auto &kv_pair
: app_it
->second
) {
5237 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
5239 f
->close_section(); // application
5243 // filter by pool + app + key
5244 auto key_it
= app_it
->second
.find(key
);
5245 if (key_it
== app_it
->second
.end()) {
5246 ss
<< "application '" << app
<< "' on pool '" << pool_name
5247 << "' does not have key '" << key
<< "'";
5251 ss
<< key_it
->second
<< "\n";
5252 rdata
.append(ss
.str());
5256 // try prepare update
5263 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
5267 void OSDMonitor::update_pool_flags(int64_t pool_id
, uint64_t flags
)
5269 const pg_pool_t
*pool
= osdmap
.get_pg_pool(pool_id
);
5270 pending_inc
.get_new_pool(pool_id
, pool
)->flags
= flags
;
5273 bool OSDMonitor::update_pools_status()
5275 if (!mon
->pgservice
->is_readable())
5280 auto& pools
= osdmap
.get_pools();
5281 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
5282 const pool_stat_t
*pstat
= mon
->pgservice
->get_pool_stat(it
->first
);
5285 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
5286 const pg_pool_t
&pool
= it
->second
;
5287 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
5290 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
5291 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
5293 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
5297 mon
->clog
->info() << "pool '" << pool_name
5298 << "' no longer full; removing FULL flag";
5300 update_pool_flags(it
->first
, pool
.get_flags() & ~pg_pool_t::FLAG_FULL
);
5306 if (pool
.quota_max_bytes
> 0 &&
5307 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
5308 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
5309 << " (reached quota's max_bytes: "
5310 << si_t(pool
.quota_max_bytes
) << ")";
5312 if (pool
.quota_max_objects
> 0 &&
5313 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
5314 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
5315 << " (reached quota's max_objects: "
5316 << pool
.quota_max_objects
<< ")";
5318 update_pool_flags(it
->first
, pool
.get_flags() | pg_pool_t::FLAG_FULL
);
5325 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
5327 op
->mark_osdmon_event(__func__
);
5328 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
5329 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
5330 MonSession
*session
= m
->get_session();
5333 string erasure_code_profile
;
5337 return prepare_new_pool(m
->name
, m
->auid
, m
->crush_rule
, rule_name
,
5339 erasure_code_profile
,
5340 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
5342 return prepare_new_pool(m
->name
, session
->auid
, m
->crush_rule
, rule_name
,
5344 erasure_code_profile
,
5345 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, &ss
);
5348 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
5349 const string
& dstname
,
5354 // Avoid creating a pending crush if it does not already exists and
5355 // the rename would fail.
5357 if (!_have_pending_crush()) {
5358 ret
= _get_stable_crush().can_rename_bucket(srcname
,
5365 CrushWrapper newcrush
;
5366 _get_pending_crush(newcrush
);
5368 ret
= newcrush
.rename_bucket(srcname
,
5374 pending_inc
.crush
.clear();
5375 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5376 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
5380 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
5382 string replacement
= "";
5384 if (plugin
== "jerasure_generic" ||
5385 plugin
== "jerasure_sse3" ||
5386 plugin
== "jerasure_sse4" ||
5387 plugin
== "jerasure_neon") {
5388 replacement
= "jerasure";
5389 } else if (plugin
== "shec_generic" ||
5390 plugin
== "shec_sse3" ||
5391 plugin
== "shec_sse4" ||
5392 plugin
== "shec_neon") {
5393 replacement
= "shec";
5396 if (replacement
!= "") {
5397 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
5398 << plugin
<< " that has been deprecated. Please use "
5399 << replacement
<< " instead." << dendl
;
5403 int OSDMonitor::normalize_profile(const string
& profilename
,
5404 ErasureCodeProfile
&profile
,
5408 ErasureCodeInterfaceRef erasure_code
;
5409 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5410 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
5411 check_legacy_ec_plugin(plugin
->second
, profilename
);
5412 int err
= instance
.factory(plugin
->second
,
5413 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5414 profile
, &erasure_code
, ss
);
5419 err
= erasure_code
->init(profile
, ss
);
5424 auto it
= profile
.find("stripe_unit");
5425 if (it
!= profile
.end()) {
5427 uint32_t stripe_unit
= strict_si_cast
<uint32_t>(it
->second
.c_str(), &err_str
);
5428 if (!err_str
.empty()) {
5429 *ss
<< "could not parse stripe_unit '" << it
->second
5430 << "': " << err_str
<< std::endl
;
5433 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
5434 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
5435 if (chunk_size
!= stripe_unit
) {
5436 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
5437 << "alignment. Would be padded to " << chunk_size
5441 if ((stripe_unit
% 4096) != 0 && !force
) {
5442 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
5443 << "use --force to override this check" << std::endl
;
5450 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
5451 const string
&profile
,
5455 int ruleid
= osdmap
.crush
->get_rule_id(name
);
5456 if (ruleid
!= -ENOENT
) {
5457 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
5461 CrushWrapper newcrush
;
5462 _get_pending_crush(newcrush
);
5464 ruleid
= newcrush
.get_rule_id(name
);
5465 if (ruleid
!= -ENOENT
) {
5466 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
5469 ErasureCodeInterfaceRef erasure_code
;
5470 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
5472 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
5476 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
5477 erasure_code
.reset();
5481 pending_inc
.crush
.clear();
5482 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
5487 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
5488 ErasureCodeInterfaceRef
*erasure_code
,
5491 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
5493 ErasureCodeProfile profile
=
5494 osdmap
.get_erasure_code_profile(erasure_code_profile
);
5495 ErasureCodeProfile::const_iterator plugin
=
5496 profile
.find("plugin");
5497 if (plugin
== profile
.end()) {
5498 *ss
<< "cannot determine the erasure code plugin"
5499 << " because there is no 'plugin' entry in the erasure_code_profile "
5500 << profile
<< std::endl
;
5503 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
5504 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
5505 return instance
.factory(plugin
->second
,
5506 g_conf
->get_val
<std::string
>("erasure_code_dir"),
5507 profile
, erasure_code
, ss
);
5510 int OSDMonitor::check_cluster_features(uint64_t features
,
5513 stringstream unsupported_ss
;
5514 int unsupported_count
= 0;
5515 if ((mon
->get_quorum_con_features() & features
) != features
) {
5516 unsupported_ss
<< "the monitor cluster";
5517 ++unsupported_count
;
5520 set
<int32_t> up_osds
;
5521 osdmap
.get_up_osds(up_osds
);
5522 for (set
<int32_t>::iterator it
= up_osds
.begin();
5523 it
!= up_osds
.end(); ++it
) {
5524 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
5525 if ((xi
.features
& features
) != features
) {
5526 if (unsupported_count
> 0)
5527 unsupported_ss
<< ", ";
5528 unsupported_ss
<< "osd." << *it
;
5529 unsupported_count
++;
5533 if (unsupported_count
> 0) {
5534 ss
<< "features " << features
<< " unsupported by: "
5535 << unsupported_ss
.str();
5539 // check pending osd state, too!
5540 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
5541 pending_inc
.new_xinfo
.begin();
5542 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
5543 const osd_xinfo_t
&xi
= p
->second
;
5544 if ((xi
.features
& features
) != features
) {
5545 dout(10) << __func__
<< " pending osd." << p
->first
5546 << " features are insufficient; retry" << dendl
;
5554 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
5557 OSDMap::Incremental new_pending
= pending_inc
;
5558 ::encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
5560 newmap
.deepish_copy_from(osdmap
);
5561 newmap
.apply_incremental(new_pending
);
5564 if (newmap
.require_min_compat_client
> 0) {
5565 auto mv
= newmap
.get_min_compat_client();
5566 if (mv
> newmap
.require_min_compat_client
) {
5567 ss
<< "new crush map requires client version " << ceph_release_name(mv
)
5568 << " but require_min_compat_client is "
5569 << ceph_release_name(newmap
.require_min_compat_client
);
5576 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
5577 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
5578 stringstream features_ss
;
5579 int r
= check_cluster_features(features
, features_ss
);
5581 ss
<< "Could not change CRUSH: " << features_ss
.str();
5588 bool OSDMonitor::erasure_code_profile_in_use(
5589 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
5590 const string
&profile
,
5594 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
5597 if (p
->second
.erasure_code_profile
== profile
) {
5598 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
5603 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
5608 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
5609 map
<string
,string
> *erasure_code_profile_map
,
5612 int r
= get_json_str_map(g_conf
->osd_pool_default_erasure_code_profile
,
5614 erasure_code_profile_map
);
5617 assert((*erasure_code_profile_map
).count("plugin"));
5618 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
5619 map
<string
,string
> user_map
;
5620 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
5621 i
!= erasure_code_profile
.end();
5623 size_t equal
= i
->find('=');
5624 if (equal
== string::npos
) {
5625 user_map
[*i
] = string();
5626 (*erasure_code_profile_map
)[*i
] = string();
5628 const string key
= i
->substr(0, equal
);
5630 const string value
= i
->substr(equal
);
5631 user_map
[key
] = value
;
5632 (*erasure_code_profile_map
)[key
] = value
;
5636 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
5637 (*erasure_code_profile_map
) = user_map
;
5642 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
5643 const string
&erasure_code_profile
,
5644 unsigned *size
, unsigned *min_size
,
5648 switch (pool_type
) {
5649 case pg_pool_t::TYPE_REPLICATED
:
5650 *size
= g_conf
->osd_pool_default_size
;
5651 *min_size
= g_conf
->get_osd_pool_default_min_size();
5653 case pg_pool_t::TYPE_ERASURE
:
5655 ErasureCodeInterfaceRef erasure_code
;
5656 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
5658 *size
= erasure_code
->get_chunk_count();
5659 *min_size
= MIN(erasure_code
->get_data_chunk_count() + 1, *size
);
5664 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
5671 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
5672 const string
&erasure_code_profile
,
5673 uint32_t *stripe_width
,
5677 switch (pool_type
) {
5678 case pg_pool_t::TYPE_REPLICATED
:
5681 case pg_pool_t::TYPE_ERASURE
:
5683 ErasureCodeProfile profile
=
5684 osdmap
.get_erasure_code_profile(erasure_code_profile
);
5685 ErasureCodeInterfaceRef erasure_code
;
5686 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
5689 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
5690 uint32_t stripe_unit
= g_conf
->osd_pool_erasure_code_stripe_unit
;
5691 auto it
= profile
.find("stripe_unit");
5692 if (it
!= profile
.end()) {
5694 stripe_unit
= strict_si_cast
<uint32_t>(it
->second
.c_str(), &err_str
);
5695 assert(err_str
.empty());
5697 *stripe_width
= data_chunks
*
5698 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
5702 *ss
<< "prepare_pool_stripe_width: "
5703 << pool_type
<< " is not a known pool type";
5710 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
5711 const string
&erasure_code_profile
,
5712 const string
&rule_name
,
5717 if (*crush_rule
< 0) {
5718 switch (pool_type
) {
5719 case pg_pool_t::TYPE_REPLICATED
:
5721 if (rule_name
== "") {
5723 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context
);
5724 if (*crush_rule
< 0) {
5725 // Errors may happen e.g. if no valid rule is available
5726 *ss
<< "No suitable CRUSH rule exists, check "
5727 << "'osd pool default crush *' config options";
5731 return get_crush_rule(rule_name
, crush_rule
, ss
);
5735 case pg_pool_t::TYPE_ERASURE
:
5737 int err
= crush_rule_create_erasure(rule_name
,
5738 erasure_code_profile
,
5742 dout(20) << "prepare_pool_crush_rule: rule "
5743 << rule_name
<< " try again" << dendl
;
5746 // need to wait for the crush rule to be proposed before proceeding
5757 *ss
<< "prepare_pool_crush_rule: " << pool_type
5758 << " is not a known pool type";
5763 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
5764 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
5772 int OSDMonitor::get_crush_rule(const string
&rule_name
,
5777 ret
= osdmap
.crush
->get_rule_id(rule_name
);
5778 if (ret
!= -ENOENT
) {
5782 CrushWrapper newcrush
;
5783 _get_pending_crush(newcrush
);
5785 ret
= newcrush
.get_rule_id(rule_name
);
5786 if (ret
!= -ENOENT
) {
5787 // found it, wait for it to be proposed
5788 dout(20) << __func__
<< ": rule " << rule_name
5789 << " try again" << dendl
;
5792 // Cannot find it , return error
5793 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
5801 * @param name The name of the new pool
5802 * @param auid The auid of the pool owner. Can be -1
5803 * @param crush_rule The crush rule to use. If <0, will use the system default
5804 * @param crush_rule_name The crush rule to use, if crush_ruleset <0
5805 * @param pg_num The pg_num to use. If set to 0, will use the system default
5806 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
5807 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
5808 * @param pool_type TYPE_ERASURE, or TYPE_REP
5809 * @param expected_num_objects expected number of objects on the pool
5810 * @param fast_read fast read type.
5811 * @param ss human readable error message, if any.
5813 * @return 0 on success, negative errno on failure.
5815 int OSDMonitor::prepare_new_pool(string
& name
, uint64_t auid
,
5817 const string
&crush_rule_name
,
5818 unsigned pg_num
, unsigned pgp_num
,
5819 const string
&erasure_code_profile
,
5820 const unsigned pool_type
,
5821 const uint64_t expected_num_objects
,
5822 FastReadType fast_read
,
5825 if (name
.length() == 0)
5828 pg_num
= g_conf
->osd_pool_default_pg_num
;
5830 pgp_num
= g_conf
->osd_pool_default_pgp_num
;
5831 if (pg_num
> (unsigned)g_conf
->mon_max_pool_pg_num
) {
5832 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
5833 << g_conf
->mon_max_pool_pg_num
5834 << " (you may adjust 'mon max pool pg num' for higher values)";
5837 if (pgp_num
> pg_num
) {
5838 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
5839 << ", which in this case is " << pg_num
;
5842 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
5843 *ss
<< "'fast_read' can only apply to erasure coding pool";
5847 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
5848 crush_rule_name
, &crush_rule
, ss
);
5850 dout(10) << " prepare_pool_crush_rule returns " << r
<< dendl
;
5853 if (g_conf
->mon_osd_crush_smoke_test
) {
5854 CrushWrapper newcrush
;
5855 _get_pending_crush(newcrush
);
5857 CrushTester
tester(newcrush
, err
);
5858 tester
.set_min_x(0);
5859 tester
.set_max_x(50);
5860 tester
.set_rule(crush_rule
);
5861 auto start
= ceph::coarse_mono_clock::now();
5862 r
= tester
.test_with_fork(g_conf
->mon_lease
);
5863 auto duration
= ceph::coarse_mono_clock::now() - start
;
5865 dout(10) << " tester.test_with_fork returns " << r
5866 << ": " << err
.str() << dendl
;
5867 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
5870 dout(10) << __func__
<< " crush smoke test duration: "
5871 << duration
<< dendl
;
5873 unsigned size
, min_size
;
5874 r
= prepare_pool_size(pool_type
, erasure_code_profile
, &size
, &min_size
, ss
);
5876 dout(10) << " prepare_pool_size returns " << r
<< dendl
;
5880 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
5884 uint32_t stripe_width
= 0;
5885 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
5887 dout(10) << " prepare_pool_stripe_width returns " << r
<< dendl
;
5892 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
5893 switch (fast_read
) {
5900 case FAST_READ_DEFAULT
:
5901 fread
= g_conf
->mon_osd_pool_ec_fast_read
;
5904 *ss
<< "invalid fast_read setting: " << fast_read
;
5909 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
5910 p
!= pending_inc
.new_pool_names
.end();
5912 if (p
->second
== name
)
5916 if (-1 == pending_inc
.new_pool_max
)
5917 pending_inc
.new_pool_max
= osdmap
.pool_max
;
5918 int64_t pool
= ++pending_inc
.new_pool_max
;
5920 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
5921 pi
->type
= pool_type
;
5922 pi
->fast_read
= fread
;
5923 pi
->flags
= g_conf
->osd_pool_default_flags
;
5924 if (g_conf
->osd_pool_default_flag_hashpspool
)
5925 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
5926 if (g_conf
->osd_pool_default_flag_nodelete
)
5927 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
5928 if (g_conf
->osd_pool_default_flag_nopgchange
)
5929 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
5930 if (g_conf
->osd_pool_default_flag_nosizechange
)
5931 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
5932 if (g_conf
->osd_pool_use_gmt_hitset
&&
5933 (osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
))
5934 pi
->use_gmt_hitset
= true;
5936 pi
->use_gmt_hitset
= false;
5939 pi
->min_size
= min_size
;
5940 pi
->crush_rule
= crush_rule
;
5941 pi
->expected_num_objects
= expected_num_objects
;
5942 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
5943 pi
->set_pg_num(pg_num
);
5944 pi
->set_pgp_num(pgp_num
);
5945 pi
->last_change
= pending_inc
.epoch
;
5947 pi
->erasure_code_profile
= erasure_code_profile
;
5948 pi
->stripe_width
= stripe_width
;
5949 pi
->cache_target_dirty_ratio_micro
=
5950 g_conf
->osd_pool_default_cache_target_dirty_ratio
* 1000000;
5951 pi
->cache_target_dirty_high_ratio_micro
=
5952 g_conf
->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
5953 pi
->cache_target_full_ratio_micro
=
5954 g_conf
->osd_pool_default_cache_target_full_ratio
* 1000000;
5955 pi
->cache_min_flush_age
= g_conf
->osd_pool_default_cache_min_flush_age
;
5956 pi
->cache_min_evict_age
= g_conf
->osd_pool_default_cache_min_evict_age
;
5957 pending_inc
.new_pool_names
[pool
] = name
;
5961 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
5963 op
->mark_osdmon_event(__func__
);
5965 if (pending_inc
.new_flags
< 0)
5966 pending_inc
.new_flags
= osdmap
.get_flags();
5967 pending_inc
.new_flags
|= flag
;
5968 ss
<< OSDMap::get_flag_string(flag
) << " is set";
5969 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
5970 get_last_committed() + 1));
5974 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
5976 op
->mark_osdmon_event(__func__
);
5978 if (pending_inc
.new_flags
< 0)
5979 pending_inc
.new_flags
= osdmap
.get_flags();
5980 pending_inc
.new_flags
&= ~flag
;
5981 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
5982 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
5983 get_last_committed() + 1));
5987 int OSDMonitor::prepare_command_pool_set(map
<string
,cmd_vartype
> &cmdmap
,
5991 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
5992 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5994 ss
<< "unrecognized pool '" << poolstr
<< "'";
5998 cmd_getval(g_ceph_context
, cmdmap
, "var", var
);
6000 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
6001 if (pending_inc
.new_pools
.count(pool
))
6002 p
= pending_inc
.new_pools
[pool
];
6004 // accept val as a json string in the normal case (current
6005 // generation monitor). parse out int or float values from the
6006 // string as needed. however, if it is not a string, try to pull
6007 // out an int, in case an older monitor with an older json schema is
6008 // forwarding a request.
6010 string interr
, floaterr
;
6013 int64_t uf
= 0; // micro-f
6014 if (!cmd_getval(g_ceph_context
, cmdmap
, "val", val
)) {
6015 // wasn't a string; maybe an older mon forwarded json with an int?
6016 if (!cmd_getval(g_ceph_context
, cmdmap
, "val", n
))
6017 return -EINVAL
; // no value!
6019 // we got a string. see if it contains an int.
6020 n
= strict_strtoll(val
.c_str(), 10, &interr
);
6022 f
= strict_strtod(val
.c_str(), &floaterr
);
6023 uf
= llrintl(f
* (double)1000000.0);
6027 (var
== "hit_set_type" || var
== "hit_set_period" ||
6028 var
== "hit_set_count" || var
== "hit_set_fpp" ||
6029 var
== "target_max_objects" || var
== "target_max_bytes" ||
6030 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
6031 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
6032 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
6033 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
6034 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
6038 if (var
== "size") {
6039 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
6040 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
6043 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
6044 ss
<< "can not change the size of an erasure-coded pool";
6047 if (interr
.length()) {
6048 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6051 if (n
<= 0 || n
> 10) {
6052 ss
<< "pool size must be between 1 and 10";
6058 } else if (var
== "min_size") {
6059 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
6060 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
6063 if (interr
.length()) {
6064 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6068 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
6069 if (n
< 1 || n
> p
.size
) {
6070 ss
<< "pool min_size must be between 1 and " << (int)p
.size
;
6074 ErasureCodeInterfaceRef erasure_code
;
6077 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
6079 k
= erasure_code
->get_data_chunk_count();
6081 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.rdbuf();
6085 if (n
< k
|| n
> p
.size
) {
6086 ss
<< "pool min_size must be between " << k
<< " and " << (int)p
.size
;
6091 } else if (var
== "auid") {
6092 if (interr
.length()) {
6093 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6097 } else if (var
== "crash_replay_interval") {
6098 if (interr
.length()) {
6099 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6102 p
.crash_replay_interval
= n
;
6103 } else if (var
== "pg_num") {
6104 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
6105 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
6108 if (interr
.length()) {
6109 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6112 if (n
<= (int)p
.get_pg_num()) {
6113 ss
<< "specified pg_num " << n
<< " <= current " << p
.get_pg_num();
6114 if (n
< (int)p
.get_pg_num())
6118 if (n
> (unsigned)g_conf
->mon_max_pool_pg_num
) {
6119 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
6120 << g_conf
->mon_max_pool_pg_num
6121 << " (you may adjust 'mon max pool pg num' for higher values)";
6125 cmd_getval(g_ceph_context
,cmdmap
, "force", force
);
6126 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&&
6127 force
!= "--yes-i-really-mean-it") {
6128 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
6131 int expected_osds
= MIN(p
.get_pg_num(), osdmap
.get_num_osds());
6132 int64_t new_pgs
= n
- p
.get_pg_num();
6133 if (new_pgs
> g_conf
->mon_osd_max_split_count
* expected_osds
) {
6134 ss
<< "specified pg_num " << n
<< " is too large (creating "
6135 << new_pgs
<< " new PGs on ~" << expected_osds
6136 << " OSDs exceeds per-OSD max of " << g_conf
->mon_osd_max_split_count
6141 // force pre-luminous clients to resend their ops, since they
6142 // don't understand that split PGs now form a new interval.
6143 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
6144 } else if (var
== "pgp_num") {
6145 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
6146 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
6149 if (interr
.length()) {
6150 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6154 ss
<< "specified pgp_num must > 0, but you set to " << n
;
6157 if (n
> (int)p
.get_pg_num()) {
6158 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
6162 } else if (var
== "crush_rule") {
6163 int id
= osdmap
.crush
->get_rule_id(val
);
6164 if (id
== -ENOENT
) {
6165 ss
<< "crush rule " << val
<< " does not exist";
6169 ss
<< cpp_strerror(id
);
6172 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
6176 } else if (var
== "nodelete" || var
== "nopgchange" ||
6177 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
6178 var
== "noscrub" || var
== "nodeep-scrub") {
6179 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
6180 // make sure we only compare against 'n' if we didn't receive a string
6181 if (val
== "true" || (interr
.empty() && n
== 1)) {
6183 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6186 ss
<< "expecting value 'true', 'false', '0', or '1'";
6189 } else if (var
== "hashpspool") {
6190 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
6192 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
6193 if (force
!= "--yes-i-really-mean-it") {
6194 ss
<< "are you SURE? this will remap all placement groups in this pool,"
6195 " this triggers large data movement,"
6196 " pass --yes-i-really-mean-it if you really do.";
6199 // make sure we only compare against 'n' if we didn't receive a string
6200 if (val
== "true" || (interr
.empty() && n
== 1)) {
6202 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6205 ss
<< "expecting value 'true', 'false', '0', or '1'";
6208 } else if (var
== "hit_set_type") {
6210 p
.hit_set_params
= HitSet::Params();
6212 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
6215 if (val
== "bloom") {
6216 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
6217 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
6218 p
.hit_set_params
= HitSet::Params(bsp
);
6219 } else if (val
== "explicit_hash")
6220 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
6221 else if (val
== "explicit_object")
6222 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
6224 ss
<< "unrecognized hit_set type '" << val
<< "'";
6228 } else if (var
== "hit_set_period") {
6229 if (interr
.length()) {
6230 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6233 p
.hit_set_period
= n
;
6234 } else if (var
== "hit_set_count") {
6235 if (interr
.length()) {
6236 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6239 p
.hit_set_count
= n
;
6240 } else if (var
== "hit_set_fpp") {
6241 if (floaterr
.length()) {
6242 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
6245 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
6246 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
6249 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
6251 } else if (var
== "use_gmt_hitset") {
6252 if (val
== "true" || (interr
.empty() && n
== 1)) {
6253 if (!(osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT
)) {
6254 ss
<< "not all OSDs support GMT hit set.";
6257 p
.use_gmt_hitset
= true;
6259 ss
<< "expecting value 'true' or '1'";
6262 } else if (var
== "allow_ec_overwrites") {
6263 if (!p
.is_erasure()) {
6264 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
6268 if (!g_conf
->mon_debug_no_require_bluestore_for_ec_overwrites
&&
6269 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
6270 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
6273 if (val
== "true" || (interr
.empty() && n
== 1)) {
6274 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
6275 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6276 ss
<< "ec overwrites cannot be disabled once enabled";
6279 ss
<< "expecting value 'true', 'false', '0', or '1'";
6282 } else if (var
== "target_max_objects") {
6283 if (interr
.length()) {
6284 ss
<< "error parsing int '" << val
<< "': " << interr
;
6287 p
.target_max_objects
= n
;
6288 } else if (var
== "target_max_bytes") {
6289 if (interr
.length()) {
6290 ss
<< "error parsing int '" << val
<< "': " << interr
;
6293 p
.target_max_bytes
= n
;
6294 } else if (var
== "cache_target_dirty_ratio") {
6295 if (floaterr
.length()) {
6296 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6299 if (f
< 0 || f
> 1.0) {
6300 ss
<< "value must be in the range 0..1";
6303 p
.cache_target_dirty_ratio_micro
= uf
;
6304 } else if (var
== "cache_target_dirty_high_ratio") {
6305 if (floaterr
.length()) {
6306 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6309 if (f
< 0 || f
> 1.0) {
6310 ss
<< "value must be in the range 0..1";
6313 p
.cache_target_dirty_high_ratio_micro
= uf
;
6314 } else if (var
== "cache_target_full_ratio") {
6315 if (floaterr
.length()) {
6316 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
6319 if (f
< 0 || f
> 1.0) {
6320 ss
<< "value must be in the range 0..1";
6323 p
.cache_target_full_ratio_micro
= uf
;
6324 } else if (var
== "cache_min_flush_age") {
6325 if (interr
.length()) {
6326 ss
<< "error parsing int '" << val
<< "': " << interr
;
6329 p
.cache_min_flush_age
= n
;
6330 } else if (var
== "cache_min_evict_age") {
6331 if (interr
.length()) {
6332 ss
<< "error parsing int '" << val
<< "': " << interr
;
6335 p
.cache_min_evict_age
= n
;
6336 } else if (var
== "min_read_recency_for_promote") {
6337 if (interr
.length()) {
6338 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6341 p
.min_read_recency_for_promote
= n
;
6342 } else if (var
== "hit_set_grade_decay_rate") {
6343 if (interr
.length()) {
6344 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6347 if (n
> 100 || n
< 0) {
6348 ss
<< "value out of range,valid range is 0 - 100";
6351 p
.hit_set_grade_decay_rate
= n
;
6352 } else if (var
== "hit_set_search_last_n") {
6353 if (interr
.length()) {
6354 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6357 if (n
> p
.hit_set_count
|| n
< 0) {
6358 ss
<< "value out of range,valid range is 0 - hit_set_count";
6361 p
.hit_set_search_last_n
= n
;
6362 } else if (var
== "min_write_recency_for_promote") {
6363 if (interr
.length()) {
6364 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6367 p
.min_write_recency_for_promote
= n
;
6368 } else if (var
== "fast_read") {
6369 if (p
.is_replicated()) {
6370 ss
<< "fast read is not supported in replication pool";
6373 if (val
== "true" || (interr
.empty() && n
== 1)) {
6375 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
6376 p
.fast_read
= false;
6378 ss
<< "expecting value 'true', 'false', '0', or '1'";
6381 } else if (pool_opts_t::is_opt_name(var
)) {
6382 bool unset
= val
== "unset";
6383 if (var
== "compression_mode") {
6385 auto cmode
= Compressor::get_comp_mode_type(val
);
6387 ss
<< "unrecognized compression mode '" << val
<< "'";
6391 } else if (var
== "compression_algorithm") {
6393 auto alg
= Compressor::get_comp_alg_type(val
);
6395 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
6399 } else if (var
== "compression_required_ratio") {
6400 if (floaterr
.length()) {
6401 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
6404 if (f
< 0 || f
> 1) {
6405 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
6408 } else if (var
== "csum_type") {
6409 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
6411 ss
<< "unrecognized csum_type '" << val
<< "'";
6414 //preserve csum_type numeric value
6417 } else if (var
== "compression_max_blob_size" ||
6418 var
== "compression_min_blob_size" ||
6419 var
== "csum_max_block" ||
6420 var
== "csum_min_block") {
6421 if (interr
.length()) {
6422 ss
<< "error parsing int value '" << val
<< "': " << interr
;
6427 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
6428 switch (desc
.type
) {
6429 case pool_opts_t::STR
:
6431 p
.opts
.unset(desc
.key
);
6433 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
6436 case pool_opts_t::INT
:
6437 if (interr
.length()) {
6438 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
6442 p
.opts
.unset(desc
.key
);
6444 p
.opts
.set(desc
.key
, static_cast<int>(n
));
6447 case pool_opts_t::DOUBLE
:
6448 if (floaterr
.length()) {
6449 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
6453 p
.opts
.unset(desc
.key
);
6455 p
.opts
.set(desc
.key
, static_cast<double>(f
));
6459 assert(!"unknown type");
6462 ss
<< "unrecognized variable '" << var
<< "'";
6465 if (val
!= "unset") {
6466 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
6468 ss
<< "unset pool " << pool
<< " " << var
;
6470 p
.last_change
= pending_inc
.epoch
;
6471 pending_inc
.new_pools
[pool
] = p
;
6475 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
6476 map
<string
,cmd_vartype
> &cmdmap
,
6480 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool_name
);
6481 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6483 ss
<< "unrecognized pool '" << pool_name
<< "'";
6487 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
6488 if (pending_inc
.new_pools
.count(pool
)) {
6489 p
= pending_inc
.new_pools
[pool
];
6493 cmd_getval(g_ceph_context
, cmdmap
, "app", app
);
6494 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
6496 if (boost::algorithm::ends_with(prefix
, "enable")) {
6498 ss
<< "application name must be provided";
6503 ss
<< "application must be enabled on base tier";
6508 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
6510 if (!app_exists
&& !p
.application_metadata
.empty() &&
6511 force
!= "--yes-i-really-mean-it") {
6512 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
6513 << "application; pass --yes-i-really-mean-it to proceed anyway";
6517 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
6518 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
6519 << "max " << MAX_POOL_APPLICATIONS
;
6523 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
6524 ss
<< "application name '" << app
<< "' too long; max length "
6525 << MAX_POOL_APPLICATION_LENGTH
;
6530 p
.application_metadata
[app
] = {};
6532 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
6534 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
6536 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
6538 if (force
!= "--yes-i-really-mean-it") {
6539 ss
<< "Are you SURE? Disabling an application within a pool might result "
6540 << "in loss of application functionality; pass "
6541 << "--yes-i-really-mean-it to proceed anyway";
6546 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
6548 return 0; // idempotent
6551 p
.application_metadata
.erase(app
);
6552 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
6554 } else if (boost::algorithm::ends_with(prefix
, "set")) {
6556 ss
<< "application metadata must be set on base tier";
6561 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
6567 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
6570 ss
<< "key must be provided";
6574 auto &app_keys
= p
.application_metadata
[app
];
6575 if (app_keys
.count(key
) == 0 &&
6576 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
6577 ss
<< "too many keys set for application '" << app
<< "' on pool '"
6578 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
6582 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
6583 ss
<< "key '" << app
<< "' too long; max length "
6584 << MAX_POOL_APPLICATION_LENGTH
;
6589 cmd_getval(g_ceph_context
, cmdmap
, "value", value
);
6590 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
6591 ss
<< "value '" << value
<< "' too long; max length "
6592 << MAX_POOL_APPLICATION_LENGTH
;
6596 p
.application_metadata
[app
][key
] = value
;
6597 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
6598 << value
<< "' on pool '" << pool_name
<< "'";
6599 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
6601 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
6607 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
6608 auto it
= p
.application_metadata
[app
].find(key
);
6609 if (it
== p
.application_metadata
[app
].end()) {
6610 ss
<< "application '" << app
<< "' on pool '" << pool_name
6611 << "' does not have key '" << key
<< "'";
6612 return 0; // idempotent
6615 p
.application_metadata
[app
].erase(it
);
6616 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
6617 << pool_name
<< "'";
6622 p
.last_change
= pending_inc
.epoch
;
6623 pending_inc
.new_pools
[pool
] = p
;
6627 int OSDMonitor::_prepare_command_osd_crush_remove(
6628 CrushWrapper
&newcrush
,
6637 err
= newcrush
.remove_item_under(g_ceph_context
, id
, ancestor
,
6640 err
= newcrush
.remove_item(g_ceph_context
, id
, unlink_only
);
6645 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
6647 pending_inc
.crush
.clear();
6648 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
6651 int OSDMonitor::prepare_command_osd_crush_remove(
6652 CrushWrapper
&newcrush
,
6658 int err
= _prepare_command_osd_crush_remove(
6659 newcrush
, id
, ancestor
,
6660 has_ancestor
, unlink_only
);
6666 do_osd_crush_remove(newcrush
);
6671 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
6673 if (osdmap
.is_up(id
)) {
6677 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
6678 pending_inc
.new_uuid
[id
] = uuid_d();
6679 pending_metadata_rm
.insert(id
);
6680 pending_metadata
.erase(id
);
6685 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
6687 assert(existing_id
);
6690 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
6691 if (!osdmap
.exists(i
) &&
6692 pending_inc
.new_up_client
.count(i
) == 0 &&
6693 (pending_inc
.new_state
.count(i
) == 0 ||
6694 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
6700 if (pending_inc
.new_max_osd
< 0) {
6701 return osdmap
.get_max_osd();
6703 return pending_inc
.new_max_osd
;
6706 void OSDMonitor::do_osd_create(
6711 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
6714 // We presume validation has been performed prior to calling this
6715 // function. We assert with prejudice.
6717 int32_t allocated_id
= -1; // declare here so we can jump
6718 int32_t existing_id
= -1;
6719 if (!uuid
.is_zero()) {
6720 existing_id
= osdmap
.identify_osd(uuid
);
6721 if (existing_id
>= 0) {
6722 assert(id
< 0 || id
== existing_id
);
6723 *new_id
= existing_id
;
6725 } else if (id
>= 0) {
6726 // uuid does not exist, and id has been provided, so just create
6733 // allocate a new id
6734 allocated_id
= _allocate_osd_id(&existing_id
);
6735 dout(10) << __func__
<< " allocated id " << allocated_id
6736 << " existing id " << existing_id
<< dendl
;
6737 if (existing_id
>= 0) {
6738 assert(existing_id
< osdmap
.get_max_osd());
6739 assert(allocated_id
< 0);
6740 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
6741 *new_id
= existing_id
;
6743 } else if (allocated_id
>= 0) {
6744 assert(existing_id
< 0);
6746 if (pending_inc
.new_max_osd
< 0) {
6747 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
6749 ++pending_inc
.new_max_osd
;
6751 *new_id
= pending_inc
.new_max_osd
- 1;
6752 assert(*new_id
== allocated_id
);
6754 assert(0 == "unexpected condition");
6758 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
6759 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
6760 pending_inc
.new_max_osd
= *new_id
+ 1;
6763 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
6764 if (!uuid
.is_zero())
6765 pending_inc
.new_uuid
[*new_id
] = uuid
;
6768 int OSDMonitor::validate_osd_create(
6771 const bool check_osd_exists
,
6772 int32_t* existing_id
,
6776 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
6777 << " check_osd_exists " << check_osd_exists
<< dendl
;
6779 assert(existing_id
);
6781 if (id
< 0 && uuid
.is_zero()) {
6782 // we have nothing to validate
6785 } else if (uuid
.is_zero()) {
6786 // we have an id but we will ignore it - because that's what
6787 // `osd create` does.
6792 * This function will be used to validate whether we are able to
6793 * create a new osd when the `uuid` is specified.
6795 * It will be used by both `osd create` and `osd new`, as the checks
6796 * are basically the same when it pertains to osd id and uuid validation.
6797 * However, `osd create` presumes an `uuid` is optional, for legacy
6798 * reasons, while `osd new` requires the `uuid` to be provided. This
6799 * means that `osd create` will not be idempotent if an `uuid` is not
6800 * provided, but we will always guarantee the idempotency of `osd new`.
6803 assert(!uuid
.is_zero());
6804 if (pending_inc
.identify_osd(uuid
) >= 0) {
6805 // osd is about to exist
6809 int32_t i
= osdmap
.identify_osd(uuid
);
6811 // osd already exists
6812 if (id
>= 0 && i
!= id
) {
6813 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
6816 // return a positive errno to distinguish between a blocking error
6817 // and an error we consider to not be a problem (i.e., this would be
6818 // an idempotent operation).
6824 if (pending_inc
.new_state
.count(id
)) {
6825 // osd is about to exist
6828 // we may not care if an osd exists if we are recreating a previously
6830 if (check_osd_exists
&& osdmap
.exists(id
)) {
6831 ss
<< "id " << id
<< " already in use and does not match uuid "
6839 int OSDMonitor::prepare_command_osd_create(
6842 int32_t* existing_id
,
6845 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
6846 assert(existing_id
);
6847 if (osdmap
.is_destroyed(id
)) {
6848 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
6853 if (uuid
.is_zero()) {
6854 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
6857 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
6860 int OSDMonitor::prepare_command_osd_new(
6862 const map
<string
,cmd_vartype
>& cmdmap
,
6863 const map
<string
,string
>& secrets
,
6871 assert(paxos
->is_plugged());
6873 dout(10) << __func__
<< " " << op
<< dendl
;
6875 /* validate command. abort now if something's wrong. */
6877 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
6879 * If `id` is not specified, we will identify any existing osd based
6880 * on `uuid`. Operation will be idempotent iff secrets match.
6882 * If `id` is specified, we will identify any existing osd based on
6883 * `uuid` and match against `id`. If they match, operation will be
6884 * idempotent iff secrets match.
6886 * `-i secrets.json` will be optional. If supplied, will be used
6887 * to check for idempotency when `id` and `uuid` match.
6889 * If `id` is not specified, and `uuid` does not exist, an id will
6890 * be found or allocated for the osd.
6892 * If `id` is specified, and the osd has been previously marked
6893 * as destroyed, then the `id` will be reused.
6895 if (!cmd_getval(g_ceph_context
, cmdmap
, "uuid", uuidstr
)) {
6896 ss
<< "requires the OSD's UUID to be specified.";
6898 } else if (!uuid
.parse(uuidstr
.c_str())) {
6899 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
6903 if (cmd_getval(g_ceph_context
, cmdmap
, "id", id
) &&
6905 ss
<< "invalid OSD id; must be greater or equal than zero.";
6909 // are we running an `osd create`-like command, or recreating
6910 // a previously destroyed osd?
6912 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
6914 // we will care about `id` to assess whether osd is `destroyed`, or
6915 // to create a new osd.
6916 // we will need an `id` by the time we reach auth.
6918 int32_t existing_id
= -1;
6919 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
6922 bool may_be_idempotent
= false;
6923 if (err
== EEXIST
) {
6924 // this is idempotent from the osdmon's point-of-view
6925 may_be_idempotent
= true;
6926 assert(existing_id
>= 0);
6928 } else if (err
< 0) {
6932 if (!may_be_idempotent
) {
6933 // idempotency is out of the window. We are either creating a new
6934 // osd or recreating a destroyed osd.
6936 // We now need to figure out if we have an `id` (and if it's valid),
6937 // of find an `id` if we don't have one.
6939 // NOTE: we need to consider the case where the `id` is specified for
6940 // `osd create`, and we must honor it. So this means checking if
6941 // the `id` is destroyed, and if so assume the destroy; otherwise,
6942 // check if it `exists` - in which case we complain about not being
6943 // `destroyed`. In the end, if nothing fails, we must allow the
6944 // creation, so that we are compatible with `create`.
6945 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
6946 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
6947 ss
<< "OSD " << id
<< " has not yet been destroyed";
6949 } else if (id
< 0) {
6951 id
= _allocate_osd_id(&existing_id
);
6953 assert(existing_id
>= 0);
6956 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
6957 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
6958 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
6960 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
6964 assert(osdmap
.exists(id
));
6967 // we are now able to either create a brand new osd or reuse an existing
6968 // osd that has been previously destroyed.
6970 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
6972 if (may_be_idempotent
&& secrets
.empty()) {
6973 // nothing to do, really.
6974 dout(10) << __func__
<< " idempotent and no secrets -- no op." << dendl
;
6977 f
->open_object_section("created_osd");
6978 f
->dump_int("osdid", id
);
6986 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
6987 bool has_lockbox
= false;
6988 bool has_secrets
= (!secrets
.empty());
6990 ConfigKeyService
*svc
= nullptr;
6991 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
6994 if (secrets
.count("cephx_secret") == 0) {
6995 ss
<< "requires a cephx secret.";
6998 cephx_secret
= secrets
.at("cephx_secret");
7000 bool has_lockbox_secret
= (secrets
.count("cephx_lockbox_secret") > 0);
7001 bool has_dmcrypt_key
= (secrets
.count("dmcrypt_key") > 0);
7003 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
7004 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
7006 if (has_lockbox_secret
&& has_dmcrypt_key
) {
7008 lockbox_secret
= secrets
.at("cephx_lockbox_secret");
7009 dmcrypt_key
= secrets
.at("dmcrypt_key");
7010 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
7011 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
7015 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
7017 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
7025 } else if (may_be_idempotent
&& err
!= EEXIST
) {
7026 // for this to be idempotent, `id` should already be >= 0; no need
7027 // to use validate_id.
7029 ss
<< "osd." << id
<< " exists but secrets do not match";
7034 svc
= (ConfigKeyService
*)mon
->config_key_service
;
7035 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
7038 } else if (may_be_idempotent
&& err
!= EEXIST
) {
7040 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
7045 assert(!has_secrets
|| !cephx_secret
.empty());
7046 assert(!has_lockbox
|| !lockbox_secret
.empty());
7048 if (may_be_idempotent
) {
7049 // we have nothing to do for either the osdmon or the authmon,
7050 // and we have no lockbox - so the config key service will not be
7051 // touched. This is therefore an idempotent operation, and we can
7052 // just return right away.
7053 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
7056 f
->open_object_section("created_osd");
7057 f
->dump_int("osdid", id
);
7064 assert(!may_be_idempotent
);
7068 assert(!cephx_secret
.empty());
7069 assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
7070 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
7072 err
= mon
->authmon()->do_osd_new(cephx_entity
,
7078 assert(nullptr != svc
);
7079 svc
->do_osd_new(uuid
, dmcrypt_key
);
7083 if (is_recreate_destroyed
) {
7085 assert(osdmap
.is_destroyed(id
));
7086 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
7087 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
| CEPH_OSD_NEW
;
7088 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
7089 // due to http://tracker.ceph.com/issues/20751 some clusters may
7090 // have UP set for non-existent OSDs; make sure it is cleared
7091 // for a newly created osd.
7092 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
7094 pending_inc
.new_uuid
[id
] = uuid
;
7097 int32_t new_id
= -1;
7098 do_osd_create(id
, uuid
, &new_id
);
7099 assert(new_id
>= 0);
7100 assert(id
== new_id
);
7104 f
->open_object_section("created_osd");
7105 f
->dump_int("osdid", id
);
7114 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
7116 op
->mark_osdmon_event(__func__
);
7117 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
7119 map
<string
, cmd_vartype
> cmdmap
;
7120 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
7121 string rs
= ss
.str();
7122 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
7126 MonSession
*session
= m
->get_session();
7128 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
7132 return prepare_command_impl(op
, cmdmap
);
7135 static int parse_reweights(CephContext
*cct
,
7136 const map
<string
,cmd_vartype
> &cmdmap
,
7137 const OSDMap
& osdmap
,
7138 map
<int32_t, uint32_t>* weights
)
7141 if (!cmd_getval(g_ceph_context
, cmdmap
, "weights", weights_str
)) {
7144 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
7145 json_spirit::mValue json_value
;
7146 if (!json_spirit::read(weights_str
, json_value
)) {
7149 if (json_value
.type() != json_spirit::obj_type
) {
7152 const auto obj
= json_value
.get_obj();
7154 for (auto& osd_weight
: obj
) {
7155 auto osd_id
= std::stoi(osd_weight
.first
);
7156 if (!osdmap
.exists(osd_id
)) {
7159 if (osd_weight
.second
.type() != json_spirit::str_type
) {
7162 auto weight
= std::stoul(osd_weight
.second
.get_str());
7163 weights
->insert({osd_id
, weight
});
7165 } catch (const std::logic_error
& e
) {
7171 int OSDMonitor::prepare_command_osd_destroy(
7175 assert(paxos
->is_plugged());
7177 // we check if the osd exists for the benefit of `osd purge`, which may
7178 // have previously removed the osd. If the osd does not exist, return
7179 // -ENOENT to convey this, and let the caller deal with it.
7181 // we presume that all auth secrets and config keys were removed prior
7182 // to this command being called. if they exist by now, we also assume
7183 // they must have been created by some other command and do not pertain
7184 // to this non-existent osd.
7185 if (!osdmap
.exists(id
)) {
7186 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
7190 uuid_d uuid
= osdmap
.get_uuid(id
);
7191 dout(10) << __func__
<< " destroying osd." << id
7192 << " uuid " << uuid
<< dendl
;
7194 // if it has been destroyed, we assume our work here is done.
7195 if (osdmap
.is_destroyed(id
)) {
7196 ss
<< "destroyed osd." << id
;
7200 EntityName cephx_entity
, lockbox_entity
;
7201 bool idempotent_auth
= false, idempotent_cks
= false;
7203 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
7208 if (err
== -ENOENT
) {
7209 idempotent_auth
= true;
7215 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
7216 err
= svc
->validate_osd_destroy(id
, uuid
);
7218 assert(err
== -ENOENT
);
7220 idempotent_cks
= true;
7223 if (!idempotent_auth
) {
7224 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
7228 if (!idempotent_cks
) {
7229 svc
->do_osd_destroy(id
, uuid
);
7232 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
7233 pending_inc
.new_uuid
[id
] = uuid_d();
7235 // we can only propose_pending() once per service, otherwise we'll be
7236 // defying PaxosService and all laws of nature. Therefore, as we may
7237 // be used during 'osd purge', let's keep the caller responsible for
7243 int OSDMonitor::prepare_command_osd_purge(
7247 assert(paxos
->is_plugged());
7248 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
7250 assert(!osdmap
.is_up(id
));
7253 * This may look a bit weird, but this is what's going to happen:
7255 * 1. we make sure that removing from crush works
7256 * 2. we call `prepare_command_osd_destroy()`. If it returns an
7257 * error, then we abort the whole operation, as no updates
7258 * have been made. However, we this function will have
7259 * side-effects, thus we need to make sure that all operations
7260 * performed henceforth will *always* succeed.
7261 * 3. we call `prepare_command_osd_remove()`. Although this
7262 * function can return an error, it currently only checks if the
7263 * osd is up - and we have made sure that it is not so, so there
7264 * is no conflict, and it is effectively an update.
7265 * 4. finally, we call `do_osd_crush_remove()`, which will perform
7266 * the crush update we delayed from before.
7269 CrushWrapper newcrush
;
7270 _get_pending_crush(newcrush
);
7272 bool may_be_idempotent
= false;
7274 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
7275 if (err
== -ENOENT
) {
7277 may_be_idempotent
= true;
7278 } else if (err
< 0) {
7279 ss
<< "error removing osd." << id
<< " from crush";
7283 // no point destroying the osd again if it has already been marked destroyed
7284 if (!osdmap
.is_destroyed(id
)) {
7285 err
= prepare_command_osd_destroy(id
, ss
);
7287 if (err
== -ENOENT
) {
7293 may_be_idempotent
= false;
7298 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
7299 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
7300 << "we are idempotent." << dendl
;
7304 err
= prepare_command_osd_remove(id
);
7305 // we should not be busy, as we should have made sure this id is not up.
7308 do_osd_crush_remove(newcrush
);
7312 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
7313 map
<string
,cmd_vartype
> &cmdmap
)
7315 op
->mark_osdmon_event(__func__
);
7316 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
7324 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
7325 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
7328 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
7332 bool osdid_present
= cmd_getval(g_ceph_context
, cmdmap
, "id", osdid
);
7333 if (osdid_present
) {
7335 oss
<< "osd." << osdid
;
7339 // Even if there's a pending state with changes that could affect
7340 // a command, considering that said state isn't yet committed, we
7341 // just don't care about those changes if the command currently being
7342 // handled acts as a no-op against the current committed state.
7343 // In a nutshell, we assume this command happens *before*.
7345 // Let me make this clearer:
7347 // - If we have only one client, and that client issues some
7348 // operation that would conflict with this operation but is
7349 // still on the pending state, then we would be sure that said
7350 // operation wouldn't have returned yet, so the client wouldn't
7351 // issue this operation (unless the client didn't wait for the
7352 // operation to finish, and that would be the client's own fault).
7354 // - If we have more than one client, each client will observe
7355 // whatever is the state at the moment of the commit. So, if we
7356 // have two clients, one issuing an unlink and another issuing a
7357 // link, and if the link happens while the unlink is still on the
7358 // pending state, from the link's point-of-view this is a no-op.
7359 // If different clients are issuing conflicting operations and
7360 // they care about that, then the clients should make sure they
7361 // enforce some kind of concurrency mechanism -- from our
7362 // perspective that's what Douglas Adams would call an SEP.
7364 // This should be used as a general guideline for most commands handled
7365 // in this function. Adapt as you see fit, but please bear in mind that
7366 // this is the expected behavior.
7369 if (prefix
== "osd setcrushmap" ||
7370 (prefix
== "osd crush set" && !osdid_present
)) {
7371 if (pending_inc
.crush
.length()) {
7372 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
7373 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
7376 dout(10) << "prepare_command setting new crush map" << dendl
;
7377 bufferlist
data(m
->get_data());
7380 bufferlist::iterator
bl(data
.begin());
7383 catch (const std::exception
&e
) {
7385 ss
<< "Failed to parse crushmap: " << e
.what();
7389 int64_t prior_version
= 0;
7390 if (cmd_getval(g_ceph_context
, cmdmap
, "prior_version", prior_version
)) {
7391 if (prior_version
== osdmap
.get_crush_version() - 1) {
7392 // see if we are a resend of the last update. this is imperfect
7393 // (multiple racing updaters may not both get reliable success)
7394 // but we expect crush updaters (via this interface) to be rare-ish.
7395 bufferlist current
, proposed
;
7396 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
7397 crush
.encode(proposed
, mon
->get_quorum_con_features());
7398 if (current
.contents_equal(proposed
)) {
7399 dout(10) << __func__
7400 << " proposed matches current and version equals previous"
7403 ss
<< osdmap
.get_crush_version();
7407 if (prior_version
!= osdmap
.get_crush_version()) {
7409 ss
<< "prior_version " << prior_version
<< " != crush version "
7410 << osdmap
.get_crush_version();
7415 if (crush
.has_legacy_rulesets()) {
7417 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
7420 if (!validate_crush_against_features(&crush
, ss
)) {
7425 const auto& osdmap_pools
= osdmap
.get_pools();
7426 for (auto pit
= osdmap_pools
.begin(); pit
!= osdmap_pools
.end(); ++pit
) {
7427 const int64_t pool_id
= pit
->first
;
7428 const pg_pool_t
&pool
= pit
->second
;
7429 int ruleno
= pool
.get_crush_rule();
7430 if (!crush
.rule_exists(ruleno
)) {
7431 ss
<< " the crush rule no "<< ruleno
<< " for pool id " << pool_id
<< " is in use";
7437 if (g_conf
->mon_osd_crush_smoke_test
) {
7438 // sanity check: test some inputs to make sure this map isn't
7440 dout(10) << " testing map" << dendl
;
7442 CrushTester
tester(crush
, ess
);
7443 tester
.set_min_x(0);
7444 tester
.set_max_x(50);
7445 auto start
= ceph::coarse_mono_clock::now();
7446 int r
= tester
.test_with_fork(g_conf
->mon_lease
);
7447 auto duration
= ceph::coarse_mono_clock::now() - start
;
7449 dout(10) << " tester.test_with_fork returns " << r
7450 << ": " << ess
.str() << dendl
;
7451 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
7455 dout(10) << __func__
<< " crush somke test duration: "
7456 << duration
<< ", result: " << ess
.str() << dendl
;
7459 pending_inc
.crush
= data
;
7460 ss
<< osdmap
.get_crush_version() + 1;
7463 } else if (prefix
== "osd crush set-device-class") {
7464 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7465 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
7466 << "luminous' before using crush device classes";
7471 string device_class
;
7472 if (!cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
)) {
7473 err
= -EINVAL
; // no value!
7478 vector
<string
> idvec
;
7479 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
7480 CrushWrapper newcrush
;
7481 _get_pending_crush(newcrush
);
7483 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
7487 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
7488 osdmap
.get_all_osds(osds
);
7491 // try traditional single osd way
7492 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
7494 // ss has reason for failure
7495 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
7502 for (auto &osd
: osds
) {
7503 if (!osdmap
.exists(osd
)) {
7504 ss
<< "osd." << osd
<< " does not exist. ";
7509 oss
<< "osd." << osd
;
7510 string name
= oss
.str();
7513 if (newcrush
.item_exists(osd
)) {
7514 action
= "updating";
7516 action
= "creating";
7517 newcrush
.set_item_name(osd
, name
);
7520 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
7521 << "' device_class '" << device_class
<< "'"
7523 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
7527 if (err
== 0 && !_have_pending_crush()) {
7529 // for single osd only, wildcard makes too much noise
7530 ss
<< "set-device-class item id " << osd
<< " name '" << name
7531 << "' device_class '" << device_class
<< "': no change";
7534 updated
.insert(osd
);
7539 if (!updated
.empty()) {
7540 pending_inc
.crush
.clear();
7541 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7542 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
7544 wait_for_finished_proposal(op
,
7545 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
7549 } else if (prefix
== "osd crush rm-device-class") {
7551 vector
<string
> idvec
;
7552 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
7553 CrushWrapper newcrush
;
7554 _get_pending_crush(newcrush
);
7557 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
7562 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
7563 osdmap
.get_all_osds(osds
);
7566 // try traditional single osd way
7567 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
7569 // ss has reason for failure
7570 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
7577 for (auto &osd
: osds
) {
7578 if (!osdmap
.exists(osd
)) {
7579 ss
<< "osd." << osd
<< " does not exist. ";
7583 auto class_name
= newcrush
.get_item_class(osd
);
7585 ss
<< "osd." << osd
<< " belongs to no class, ";
7588 // note that we do not verify if class_is_in_use here
7589 // in case the device is misclassified and user wants
7590 // to overridely reset...
7592 err
= newcrush
.remove_device_class(g_ceph_context
, osd
, &ss
);
7594 // ss has reason for failure
7597 updated
.insert(osd
);
7601 if (!updated
.empty()) {
7602 pending_inc
.crush
.clear();
7603 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7604 ss
<< "done removing class of osd(s): " << updated
;
7606 wait_for_finished_proposal(op
,
7607 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
7610 } else if (prefix
== "osd crush class rename") {
7611 string srcname
, dstname
;
7612 if (!cmd_getval(g_ceph_context
, cmdmap
, "srcname", srcname
)) {
7616 if (!cmd_getval(g_ceph_context
, cmdmap
, "dstname", dstname
)) {
7621 CrushWrapper newcrush
;
7622 _get_pending_crush(newcrush
);
7623 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
7624 // suppose this is a replay and return success
7625 // so command is idempotent
7626 ss
<< "already renamed to '" << dstname
<< "'";
7631 err
= newcrush
.rename_class(srcname
, dstname
);
7633 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
7634 << cpp_strerror(err
);
7638 pending_inc
.crush
.clear();
7639 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7640 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
7642 } else if (prefix
== "osd crush add-bucket") {
7643 // os crush add-bucket <name> <type>
7644 string name
, typestr
;
7645 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7646 cmd_getval(g_ceph_context
, cmdmap
, "type", typestr
);
7648 if (!_have_pending_crush() &&
7649 _get_stable_crush().name_exists(name
)) {
7650 ss
<< "bucket '" << name
<< "' already exists";
7654 CrushWrapper newcrush
;
7655 _get_pending_crush(newcrush
);
7657 if (newcrush
.name_exists(name
)) {
7658 ss
<< "bucket '" << name
<< "' already exists";
7661 int type
= newcrush
.get_type_id(typestr
);
7663 ss
<< "type '" << typestr
<< "' does not exist";
7668 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
7673 err
= newcrush
.add_bucket(0, 0,
7674 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
7677 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
7680 err
= newcrush
.set_item_name(bucketno
, name
);
7682 ss
<< "error setting bucket name to '" << name
<< "'";
7686 pending_inc
.crush
.clear();
7687 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7688 ss
<< "added bucket " << name
<< " type " << typestr
7691 } else if (prefix
== "osd crush rename-bucket") {
7692 string srcname
, dstname
;
7693 cmd_getval(g_ceph_context
, cmdmap
, "srcname", srcname
);
7694 cmd_getval(g_ceph_context
, cmdmap
, "dstname", dstname
);
7696 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
7697 if (err
== -EALREADY
) // equivalent to success for idempotency
7703 } else if (prefix
== "osd crush weight-set create" ||
7704 prefix
== "osd crush weight-set create-compat") {
7705 CrushWrapper newcrush
;
7706 _get_pending_crush(newcrush
);
7709 if (newcrush
.has_non_straw2_buckets()) {
7710 ss
<< "crush map contains one or more bucket(s) that are not straw2";
7714 if (prefix
== "osd crush weight-set create") {
7715 if (osdmap
.require_min_compat_client
> 0 &&
7716 osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
7717 ss
<< "require_min_compat_client "
7718 << ceph_release_name(osdmap
.require_min_compat_client
)
7719 << " < luminous, which is required for per-pool weight-sets. "
7720 << "Try 'ceph osd set-require-min-compat-client luminous' "
7721 << "before using the new interface";
7725 string poolname
, mode
;
7726 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolname
);
7727 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
7729 ss
<< "pool '" << poolname
<< "' not found";
7733 cmd_getval(g_ceph_context
, cmdmap
, "mode", mode
);
7734 if (mode
!= "flat" && mode
!= "positional") {
7735 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
7739 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
7741 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
7744 newcrush
.create_choose_args(pool
, positions
);
7745 pending_inc
.crush
.clear();
7746 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7749 } else if (prefix
== "osd crush weight-set rm" ||
7750 prefix
== "osd crush weight-set rm-compat") {
7751 CrushWrapper newcrush
;
7752 _get_pending_crush(newcrush
);
7754 if (prefix
== "osd crush weight-set rm") {
7756 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolname
);
7757 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
7759 ss
<< "pool '" << poolname
<< "' not found";
7764 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
7766 newcrush
.rm_choose_args(pool
);
7767 pending_inc
.crush
.clear();
7768 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7771 } else if (prefix
== "osd crush weight-set reweight" ||
7772 prefix
== "osd crush weight-set reweight-compat") {
7773 string poolname
, item
;
7774 vector
<double> weight
;
7775 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolname
);
7776 cmd_getval(g_ceph_context
, cmdmap
, "item", item
);
7777 cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
);
7778 CrushWrapper newcrush
;
7779 _get_pending_crush(newcrush
);
7781 if (prefix
== "osd crush weight-set reweight") {
7782 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
7784 ss
<< "pool '" << poolname
<< "' not found";
7788 if (!newcrush
.have_choose_args(pool
)) {
7789 ss
<< "no weight-set for pool '" << poolname
<< "'";
7793 auto arg_map
= newcrush
.choose_args_get(pool
);
7794 int positions
= newcrush
.get_choose_args_positions(arg_map
);
7795 if (weight
.size() != (size_t)positions
) {
7796 ss
<< "must specify exact " << positions
<< " weight values";
7801 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
7802 if (!newcrush
.have_choose_args(pool
)) {
7803 ss
<< "no backward-compatible weight-set";
7808 if (!newcrush
.name_exists(item
)) {
7809 ss
<< "item '" << item
<< "' does not exist";
7813 err
= newcrush
.choose_args_adjust_item_weightf(
7815 newcrush
.choose_args_get(pool
),
7816 newcrush
.get_item_id(item
),
7823 pending_inc
.crush
.clear();
7824 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7826 } else if (osdid_present
&&
7827 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
7828 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
7829 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
7830 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
7832 if (!osdmap
.exists(osdid
)) {
7834 ss
<< name
<< " does not exist. Create it before updating the crush map";
7839 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
)) {
7840 ss
<< "unable to parse weight value '"
7841 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
7847 vector
<string
> argvec
;
7848 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
7849 map
<string
,string
> loc
;
7850 CrushWrapper::parse_loc_map(argvec
, &loc
);
7852 if (prefix
== "osd crush set"
7853 && !_get_stable_crush().item_exists(osdid
)) {
7855 ss
<< "unable to set item id " << osdid
<< " name '" << name
7856 << "' weight " << weight
<< " at location " << loc
7857 << ": does not exist";
7861 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
7862 << name
<< "' weight " << weight
<< " at location "
7864 CrushWrapper newcrush
;
7865 _get_pending_crush(newcrush
);
7868 if (prefix
== "osd crush set" ||
7869 newcrush
.check_item_loc(g_ceph_context
, osdid
, loc
, (int *)NULL
)) {
7871 err
= newcrush
.update_item(g_ceph_context
, osdid
, weight
, name
, loc
);
7874 err
= newcrush
.insert_item(g_ceph_context
, osdid
, weight
, name
, loc
);
7882 if (err
== 0 && !_have_pending_crush()) {
7883 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
7884 << weight
<< " at location " << loc
<< ": no change";
7888 pending_inc
.crush
.clear();
7889 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7890 ss
<< action
<< " item id " << osdid
<< " name '" << name
<< "' weight "
7891 << weight
<< " at location " << loc
<< " to crush map";
7893 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7894 get_last_committed() + 1));
7897 } else if (prefix
== "osd crush create-or-move") {
7899 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
7900 if (!osdmap
.exists(osdid
)) {
7902 ss
<< name
<< " does not exist. create it before updating the crush map";
7907 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", weight
)) {
7908 ss
<< "unable to parse weight value '"
7909 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
7915 vector
<string
> argvec
;
7916 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
7917 map
<string
,string
> loc
;
7918 CrushWrapper::parse_loc_map(argvec
, &loc
);
7920 dout(0) << "create-or-move crush item name '" << name
<< "' initial_weight " << weight
7921 << " at location " << loc
<< dendl
;
7923 CrushWrapper newcrush
;
7924 _get_pending_crush(newcrush
);
7926 err
= newcrush
.create_or_move_item(g_ceph_context
, osdid
, weight
, name
, loc
);
7928 ss
<< "create-or-move updated item name '" << name
<< "' weight " << weight
7929 << " at location " << loc
<< " to crush map";
7933 pending_inc
.crush
.clear();
7934 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7935 ss
<< "create-or-move updating item name '" << name
<< "' weight " << weight
7936 << " at location " << loc
<< " to crush map";
7938 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7939 get_last_committed() + 1));
7944 } else if (prefix
== "osd crush move") {
7946 // osd crush move <name> <loc1> [<loc2> ...]
7949 vector
<string
> argvec
;
7950 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
7951 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
7952 map
<string
,string
> loc
;
7953 CrushWrapper::parse_loc_map(argvec
, &loc
);
7955 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
7956 CrushWrapper newcrush
;
7957 _get_pending_crush(newcrush
);
7959 if (!newcrush
.name_exists(name
)) {
7961 ss
<< "item " << name
<< " does not exist";
7964 int id
= newcrush
.get_item_id(name
);
7966 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
7968 err
= newcrush
.create_or_move_item(g_ceph_context
, id
, 0, name
, loc
);
7970 err
= newcrush
.move_bucket(g_ceph_context
, id
, loc
);
7973 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
7974 pending_inc
.crush
.clear();
7975 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7977 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
7978 get_last_committed() + 1));
7982 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
7986 } else if (prefix
== "osd crush swap-bucket") {
7987 string source
, dest
, force
;
7988 cmd_getval(g_ceph_context
, cmdmap
, "source", source
);
7989 cmd_getval(g_ceph_context
, cmdmap
, "dest", dest
);
7990 cmd_getval(g_ceph_context
, cmdmap
, "force", force
);
7991 CrushWrapper newcrush
;
7992 _get_pending_crush(newcrush
);
7993 if (!newcrush
.name_exists(source
)) {
7994 ss
<< "source item " << source
<< " does not exist";
7998 if (!newcrush
.name_exists(dest
)) {
7999 ss
<< "dest item " << dest
<< " does not exist";
8003 int sid
= newcrush
.get_item_id(source
);
8004 int did
= newcrush
.get_item_id(dest
);
8006 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 &&
8007 force
!= "--yes-i-really-mean-it") {
8008 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
8012 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
8013 force
!= "--yes-i-really-mean-it") {
8014 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
8015 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
8016 << "; pass --yes-i-really-mean-it to proceed anyway";
8020 int r
= newcrush
.swap_bucket(g_ceph_context
, sid
, did
);
8022 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
8026 ss
<< "swapped bucket of " << source
<< " to " << dest
;
8027 pending_inc
.crush
.clear();
8028 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8029 wait_for_finished_proposal(op
,
8030 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
8031 get_last_committed() + 1));
8033 } else if (prefix
== "osd crush link") {
8034 // osd crush link <name> <loc1> [<loc2> ...]
8036 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8037 vector
<string
> argvec
;
8038 cmd_getval(g_ceph_context
, cmdmap
, "args", argvec
);
8039 map
<string
,string
> loc
;
8040 CrushWrapper::parse_loc_map(argvec
, &loc
);
8042 // Need an explicit check for name_exists because get_item_id returns
8044 int id
= osdmap
.crush
->get_item_id(name
);
8045 if (!osdmap
.crush
->name_exists(name
)) {
8047 ss
<< "item " << name
<< " does not exist";
8050 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
8052 if (osdmap
.crush
->check_item_loc(g_ceph_context
, id
, loc
, (int*) NULL
)) {
8053 ss
<< "no need to move item id " << id
<< " name '" << name
8054 << "' to location " << loc
<< " in crush map";
8059 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
8060 CrushWrapper newcrush
;
8061 _get_pending_crush(newcrush
);
8063 if (!newcrush
.name_exists(name
)) {
8065 ss
<< "item " << name
<< " does not exist";
8068 int id
= newcrush
.get_item_id(name
);
8069 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
8070 err
= newcrush
.link_bucket(g_ceph_context
, id
, loc
);
8072 ss
<< "linked item id " << id
<< " name '" << name
8073 << "' to location " << loc
<< " in crush map";
8074 pending_inc
.crush
.clear();
8075 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8077 ss
<< "cannot link item id " << id
<< " name '" << name
8078 << "' to location " << loc
;
8082 ss
<< "no need to move item id " << id
<< " name '" << name
8083 << "' to location " << loc
<< " in crush map";
8087 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
8088 get_last_committed() + 1));
8090 } else if (prefix
== "osd crush rm" ||
8091 prefix
== "osd crush remove" ||
8092 prefix
== "osd crush unlink") {
8094 // osd crush rm <id> [ancestor]
8095 CrushWrapper newcrush
;
8096 _get_pending_crush(newcrush
);
8099 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8101 if (!osdmap
.crush
->name_exists(name
)) {
8103 ss
<< "device '" << name
<< "' does not appear in the crush map";
8106 if (!newcrush
.name_exists(name
)) {
8108 ss
<< "device '" << name
<< "' does not appear in the crush map";
8110 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8111 get_last_committed() + 1));
8114 int id
= newcrush
.get_item_id(name
);
8117 bool unlink_only
= prefix
== "osd crush unlink";
8118 string ancestor_str
;
8119 if (cmd_getval(g_ceph_context
, cmdmap
, "ancestor", ancestor_str
)) {
8120 if (!newcrush
.name_exists(ancestor_str
)) {
8122 ss
<< "ancestor item '" << ancestor_str
8123 << "' does not appear in the crush map";
8126 ancestor
= newcrush
.get_item_id(ancestor_str
);
8129 err
= prepare_command_osd_crush_remove(
8132 (ancestor
< 0), unlink_only
);
8134 if (err
== -ENOENT
) {
8135 ss
<< "item " << id
<< " does not appear in that position";
8140 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
8142 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8143 get_last_committed() + 1));
8148 } else if (prefix
== "osd crush reweight-all") {
8149 CrushWrapper newcrush
;
8150 _get_pending_crush(newcrush
);
8152 newcrush
.reweight(g_ceph_context
);
8153 pending_inc
.crush
.clear();
8154 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8155 ss
<< "reweighted crush hierarchy";
8157 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8158 get_last_committed() + 1));
8160 } else if (prefix
== "osd crush reweight") {
8161 // osd crush reweight <name> <weight>
8162 CrushWrapper newcrush
;
8163 _get_pending_crush(newcrush
);
8166 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8167 if (!newcrush
.name_exists(name
)) {
8169 ss
<< "device '" << name
<< "' does not appear in the crush map";
8173 int id
= newcrush
.get_item_id(name
);
8175 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
8180 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
8181 ss
<< "unable to parse weight value '"
8182 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8187 err
= newcrush
.adjust_item_weightf(g_ceph_context
, id
, w
);
8190 pending_inc
.crush
.clear();
8191 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8192 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
8195 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8196 get_last_committed() + 1));
8198 } else if (prefix
== "osd crush reweight-subtree") {
8199 // osd crush reweight <name> <weight>
8200 CrushWrapper newcrush
;
8201 _get_pending_crush(newcrush
);
8204 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8205 if (!newcrush
.name_exists(name
)) {
8207 ss
<< "device '" << name
<< "' does not appear in the crush map";
8211 int id
= newcrush
.get_item_id(name
);
8213 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
8218 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
8219 ss
<< "unable to parse weight value '"
8220 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
8225 err
= newcrush
.adjust_subtree_weightf(g_ceph_context
, id
, w
);
8228 pending_inc
.crush
.clear();
8229 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8230 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
8233 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8234 get_last_committed() + 1));
8236 } else if (prefix
== "osd crush tunables") {
8237 CrushWrapper newcrush
;
8238 _get_pending_crush(newcrush
);
8242 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
8243 if (profile
== "legacy" || profile
== "argonaut") {
8244 newcrush
.set_tunables_legacy();
8245 } else if (profile
== "bobtail") {
8246 newcrush
.set_tunables_bobtail();
8247 } else if (profile
== "firefly") {
8248 newcrush
.set_tunables_firefly();
8249 } else if (profile
== "hammer") {
8250 newcrush
.set_tunables_hammer();
8251 } else if (profile
== "jewel") {
8252 newcrush
.set_tunables_jewel();
8253 } else if (profile
== "optimal") {
8254 newcrush
.set_tunables_optimal();
8255 } else if (profile
== "default") {
8256 newcrush
.set_tunables_default();
8258 ss
<< "unrecognized profile '" << profile
<< "'";
8263 if (!validate_crush_against_features(&newcrush
, ss
)) {
8268 pending_inc
.crush
.clear();
8269 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8270 ss
<< "adjusted tunables profile to " << profile
;
8272 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8273 get_last_committed() + 1));
8275 } else if (prefix
== "osd crush set-tunable") {
8276 CrushWrapper newcrush
;
8277 _get_pending_crush(newcrush
);
8281 cmd_getval(g_ceph_context
, cmdmap
, "tunable", tunable
);
8284 if (!cmd_getval(g_ceph_context
, cmdmap
, "value", value
)) {
8286 ss
<< "failed to parse integer value " << cmd_vartype_stringify(cmdmap
["value"]);
8290 if (tunable
== "straw_calc_version") {
8291 if (value
!= 0 && value
!= 1) {
8292 ss
<< "value must be 0 or 1; got " << value
;
8296 newcrush
.set_straw_calc_version(value
);
8298 ss
<< "unrecognized tunable '" << tunable
<< "'";
8303 if (!validate_crush_against_features(&newcrush
, ss
)) {
8308 pending_inc
.crush
.clear();
8309 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8310 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
8312 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8313 get_last_committed() + 1));
8316 } else if (prefix
== "osd crush rule create-simple") {
8317 string name
, root
, type
, mode
;
8318 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8319 cmd_getval(g_ceph_context
, cmdmap
, "root", root
);
8320 cmd_getval(g_ceph_context
, cmdmap
, "type", type
);
8321 cmd_getval(g_ceph_context
, cmdmap
, "mode", mode
);
8325 if (osdmap
.crush
->rule_exists(name
)) {
8326 // The name is uniquely associated to a ruleid and the rule it contains
8327 // From the user point of view, the rule is more meaningfull.
8328 ss
<< "rule " << name
<< " already exists";
8333 CrushWrapper newcrush
;
8334 _get_pending_crush(newcrush
);
8336 if (newcrush
.rule_exists(name
)) {
8337 // The name is uniquely associated to a ruleid and the rule it contains
8338 // From the user point of view, the rule is more meaningfull.
8339 ss
<< "rule " << name
<< " already exists";
8342 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
8343 pg_pool_t::TYPE_REPLICATED
, &ss
);
8349 pending_inc
.crush
.clear();
8350 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8353 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8354 get_last_committed() + 1));
8357 } else if (prefix
== "osd crush rule create-replicated") {
8358 string name
, root
, type
, device_class
;
8359 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8360 cmd_getval(g_ceph_context
, cmdmap
, "root", root
);
8361 cmd_getval(g_ceph_context
, cmdmap
, "type", type
);
8362 cmd_getval(g_ceph_context
, cmdmap
, "class", device_class
);
8364 if (!device_class
.empty()) {
8365 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8366 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8367 << "luminous' before using crush device classes";
8373 if (osdmap
.crush
->rule_exists(name
)) {
8374 // The name is uniquely associated to a ruleid and the rule it contains
8375 // From the user point of view, the rule is more meaningfull.
8376 ss
<< "rule " << name
<< " already exists";
8381 CrushWrapper newcrush
;
8382 _get_pending_crush(newcrush
);
8384 if (newcrush
.rule_exists(name
)) {
8385 // The name is uniquely associated to a ruleid and the rule it contains
8386 // From the user point of view, the rule is more meaningfull.
8387 ss
<< "rule " << name
<< " already exists";
8390 int ruleno
= newcrush
.add_simple_rule(
8391 name
, root
, type
, device_class
,
8392 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
8398 pending_inc
.crush
.clear();
8399 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8402 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8403 get_last_committed() + 1));
8406 } else if (prefix
== "osd erasure-code-profile rm") {
8408 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8410 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
8413 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
8418 if (osdmap
.has_erasure_code_profile(name
) ||
8419 pending_inc
.new_erasure_code_profiles
.count(name
)) {
8420 if (osdmap
.has_erasure_code_profile(name
)) {
8421 pending_inc
.old_erasure_code_profiles
.push_back(name
);
8423 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
8424 pending_inc
.new_erasure_code_profiles
.erase(name
);
8428 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8429 get_last_committed() + 1));
8432 ss
<< "erasure-code-profile " << name
<< " does not exist";
8437 } else if (prefix
== "osd erasure-code-profile set") {
8439 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8440 vector
<string
> profile
;
8441 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
8443 if (profile
.size() > 0 && profile
.back() == "--force") {
8449 map
<string
,string
> profile_map
;
8450 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
8453 if (profile_map
.find("plugin") == profile_map
.end()) {
8454 ss
<< "erasure-code-profile " << profile_map
8455 << " must contain a plugin entry" << std::endl
;
8459 string plugin
= profile_map
["plugin"];
8461 if (pending_inc
.has_erasure_code_profile(name
)) {
8462 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
8465 if (plugin
== "isa" || plugin
== "lrc") {
8466 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
, ss
);
8471 } else if (plugin
== "shec") {
8472 err
= check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
, ss
);
8478 err
= normalize_profile(name
, profile_map
, force
, &ss
);
8482 if (osdmap
.has_erasure_code_profile(name
)) {
8483 ErasureCodeProfile existing_profile_map
=
8484 osdmap
.get_erasure_code_profile(name
);
8485 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
8489 if (existing_profile_map
== profile_map
) {
8495 ss
<< "will not override erasure code profile " << name
8496 << " because the existing profile "
8497 << existing_profile_map
8498 << " is different from the proposed profile "
8504 dout(20) << "erasure code profile set " << name
<< "="
8505 << profile_map
<< dendl
;
8506 pending_inc
.set_erasure_code_profile(name
, profile_map
);
8510 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8511 get_last_committed() + 1));
8514 } else if (prefix
== "osd crush rule create-erasure") {
8515 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
8520 string name
, poolstr
;
8521 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8523 cmd_getval(g_ceph_context
, cmdmap
, "profile", profile
);
8525 profile
= "default";
8526 if (profile
== "default") {
8527 if (!osdmap
.has_erasure_code_profile(profile
)) {
8528 if (pending_inc
.has_erasure_code_profile(profile
)) {
8529 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
8533 map
<string
,string
> profile_map
;
8534 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
8539 err
= normalize_profile(name
, profile_map
, true, &ss
);
8542 dout(20) << "erasure code profile set " << profile
<< "="
8543 << profile_map
<< dendl
;
8544 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
8550 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
8553 case -EEXIST
: // return immediately
8554 ss
<< "rule " << name
<< " already exists";
8558 case -EALREADY
: // wait for pending to be proposed
8559 ss
<< "rule " << name
<< " already exists";
8562 default: // non recoverable error
8567 ss
<< "created rule " << name
<< " at " << rule
;
8571 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8572 get_last_committed() + 1));
8575 } else if (prefix
== "osd crush rule rm") {
8577 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
8579 if (!osdmap
.crush
->rule_exists(name
)) {
8580 ss
<< "rule " << name
<< " does not exist";
8585 CrushWrapper newcrush
;
8586 _get_pending_crush(newcrush
);
8588 if (!newcrush
.rule_exists(name
)) {
8589 ss
<< "rule " << name
<< " does not exist";
8592 int ruleno
= newcrush
.get_rule_id(name
);
8593 assert(ruleno
>= 0);
8595 // make sure it is not in use.
8596 // FIXME: this is ok in some situations, but let's not bother with that
8598 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
8599 if (osdmap
.crush_ruleset_in_use(ruleset
)) {
8600 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
8605 err
= newcrush
.remove_rule(ruleno
);
8610 pending_inc
.crush
.clear();
8611 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8614 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8615 get_last_committed() + 1));
8618 } else if (prefix
== "osd crush rule rename") {
8621 cmd_getval(g_ceph_context
, cmdmap
, "srcname", srcname
);
8622 cmd_getval(g_ceph_context
, cmdmap
, "dstname", dstname
);
8623 if (srcname
.empty() || dstname
.empty()) {
8624 ss
<< "must specify both source rule name and destination rule name";
8628 if (srcname
== dstname
) {
8629 ss
<< "destination rule name is equal to source rule name";
8634 CrushWrapper newcrush
;
8635 _get_pending_crush(newcrush
);
8636 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
8637 // srcname does not exist and dstname already exists
8638 // suppose this is a replay and return success
8639 // (so this command is idempotent)
8640 ss
<< "already renamed to '" << dstname
<< "'";
8645 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
8647 // ss has reason for failure
8650 pending_inc
.crush
.clear();
8651 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8653 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8654 get_last_committed() + 1));
8657 } else if (prefix
== "osd setmaxosd") {
8659 if (!cmd_getval(g_ceph_context
, cmdmap
, "newmax", newmax
)) {
8660 ss
<< "unable to parse 'newmax' value '"
8661 << cmd_vartype_stringify(cmdmap
["newmax"]) << "'";
8666 if (newmax
> g_conf
->mon_max_osd
) {
8668 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
8669 << g_conf
->mon_max_osd
<< ")";
8673 // Don't allow shrinking OSD number as this will cause data loss
8674 // and may cause kernel crashes.
8675 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
8676 if (newmax
< osdmap
.get_max_osd()) {
8677 // Check if the OSDs exist between current max and new value.
8678 // If there are any OSDs exist, then don't allow shrinking number
8680 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
8681 if (osdmap
.exists(i
)) {
8683 ss
<< "cannot shrink max_osd to " << newmax
8684 << " because osd." << i
<< " (and possibly others) still in use";
8690 pending_inc
.new_max_osd
= newmax
;
8691 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
8693 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8694 get_last_committed() + 1));
8697 } else if (prefix
== "osd set-full-ratio" ||
8698 prefix
== "osd set-backfillfull-ratio" ||
8699 prefix
== "osd set-nearfull-ratio") {
8700 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8701 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8702 << "luminous' before using the new interface";
8707 if (!cmd_getval(g_ceph_context
, cmdmap
, "ratio", n
)) {
8708 ss
<< "unable to parse 'ratio' value '"
8709 << cmd_vartype_stringify(cmdmap
["ratio"]) << "'";
8713 if (prefix
== "osd set-full-ratio")
8714 pending_inc
.new_full_ratio
= n
;
8715 else if (prefix
== "osd set-backfillfull-ratio")
8716 pending_inc
.new_backfillfull_ratio
= n
;
8717 else if (prefix
== "osd set-nearfull-ratio")
8718 pending_inc
.new_nearfull_ratio
= n
;
8719 ss
<< prefix
<< " " << n
;
8721 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8722 get_last_committed() + 1));
8724 } else if (prefix
== "osd set-require-min-compat-client") {
8725 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
8726 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
8727 << "luminous' before using the new interface";
8732 cmd_getval(g_ceph_context
, cmdmap
, "version", v
);
8733 int vno
= ceph_release_from_name(v
.c_str());
8735 ss
<< "version " << v
<< " is not recognized";
8740 newmap
.deepish_copy_from(osdmap
);
8741 newmap
.apply_incremental(pending_inc
);
8742 newmap
.require_min_compat_client
= vno
;
8743 auto mvno
= newmap
.get_min_compat_client();
8745 ss
<< "osdmap current utilizes features that require "
8746 << ceph_release_name(mvno
)
8747 << "; cannot set require_min_compat_client below that to "
8748 << ceph_release_name(vno
);
8753 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
8754 if (sure
!= "--yes-i-really-mean-it") {
8756 mon
->get_combined_feature_map(&m
);
8757 uint64_t features
= ceph_release_features(vno
);
8761 CEPH_ENTITY_TYPE_CLIENT
,
8762 CEPH_ENTITY_TYPE_MDS
,
8763 CEPH_ENTITY_TYPE_MGR
}) {
8764 auto p
= m
.m
.find(type
);
8765 if (p
== m
.m
.end()) {
8768 for (auto& q
: p
->second
) {
8769 uint64_t missing
= ~q
.first
& features
;
8772 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
8777 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
8778 << "(s) look like " << ceph_release_name(
8779 ceph_release_from_features(q
.first
))
8780 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
8786 ss
<< "; add --yes-i-really-mean-it to do it anyway";
8791 ss
<< "set require_min_compat_client to " << ceph_release_name(vno
);
8792 pending_inc
.new_require_min_compat_client
= vno
;
8794 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
8795 get_last_committed() + 1));
8797 } else if (prefix
== "osd pause") {
8798 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
8800 } else if (prefix
== "osd unpause") {
8801 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
8803 } else if (prefix
== "osd set") {
8805 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
8807 return prepare_set_flag(op
, CEPH_OSDMAP_FULL
);
8808 else if (key
== "pause")
8809 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
8810 else if (key
== "noup")
8811 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
8812 else if (key
== "nodown")
8813 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
8814 else if (key
== "noout")
8815 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
8816 else if (key
== "noin")
8817 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
8818 else if (key
== "nobackfill")
8819 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
8820 else if (key
== "norebalance")
8821 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
8822 else if (key
== "norecover")
8823 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
8824 else if (key
== "noscrub")
8825 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
8826 else if (key
== "nodeep-scrub")
8827 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
8828 else if (key
== "notieragent")
8829 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
8830 else if (key
== "sortbitwise") {
8831 if (osdmap
.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
) {
8832 return prepare_set_flag(op
, CEPH_OSDMAP_SORTBITWISE
);
8834 ss
<< "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
8838 } else if (key
== "recovery_deletes") {
8839 if (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_RECOVERY_DELETES
)) {
8840 return prepare_set_flag(op
, CEPH_OSDMAP_RECOVERY_DELETES
);
8842 ss
<< "not all up OSDs have OSD_RECOVERY_DELETES feature";
8846 } else if (key
== "require_jewel_osds") {
8847 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
8848 ss
<< "the sortbitwise flag must be set before require_jewel_osds";
8851 } else if (osdmap
.require_osd_release
>= CEPH_RELEASE_JEWEL
) {
8852 ss
<< "require_osd_release is already >= jewel";
8855 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_JEWEL
)) {
8856 return prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_JEWEL
);
8858 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
8861 } else if (key
== "require_kraken_osds") {
8862 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
8863 ss
<< "the sortbitwise flag must be set before require_kraken_osds";
8866 } else if (osdmap
.require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
8867 ss
<< "require_osd_release is already >= kraken";
8870 } else if (HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_KRAKEN
)) {
8871 bool r
= prepare_set_flag(op
, CEPH_OSDMAP_REQUIRE_KRAKEN
);
8872 // ensure JEWEL is also set
8873 pending_inc
.new_flags
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
8876 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
8880 ss
<< "unrecognized flag '" << key
<< "'";
8884 } else if (prefix
== "osd unset") {
8886 cmd_getval(g_ceph_context
, cmdmap
, "key", key
);
8888 return prepare_unset_flag(op
, CEPH_OSDMAP_FULL
);
8889 else if (key
== "pause")
8890 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
8891 else if (key
== "noup")
8892 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
8893 else if (key
== "nodown")
8894 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
8895 else if (key
== "noout")
8896 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
8897 else if (key
== "noin")
8898 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
8899 else if (key
== "nobackfill")
8900 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
8901 else if (key
== "norebalance")
8902 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
8903 else if (key
== "norecover")
8904 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
8905 else if (key
== "noscrub")
8906 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
8907 else if (key
== "nodeep-scrub")
8908 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
8909 else if (key
== "notieragent")
8910 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
8912 ss
<< "unrecognized flag '" << key
<< "'";
8916 } else if (prefix
== "osd require-osd-release") {
8918 cmd_getval(g_ceph_context
, cmdmap
, "release", release
);
8919 if (!osdmap
.test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
8920 ss
<< "the sortbitwise flag must be set first";
8924 int rel
= ceph_release_from_name(release
.c_str());
8926 ss
<< "unrecognized release " << release
;
8930 if (rel
< CEPH_RELEASE_LUMINOUS
) {
8931 ss
<< "use this command only for luminous and later";
8935 if (rel
== osdmap
.require_osd_release
) {
8940 if (rel
== CEPH_RELEASE_LUMINOUS
) {
8941 if (!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_LUMINOUS
)) {
8942 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
8947 ss
<< "not supported for this release yet";
8951 if (rel
< osdmap
.require_osd_release
) {
8952 ss
<< "require_osd_release cannot be lowered once it has been set";
8956 pending_inc
.new_require_osd_release
= rel
;
8957 if (rel
>= CEPH_RELEASE_LUMINOUS
&&
8958 !osdmap
.test_flag(CEPH_OSDMAP_RECOVERY_DELETES
)) {
8959 return prepare_set_flag(op
, CEPH_OSDMAP_RECOVERY_DELETES
);
8962 } else if (prefix
== "osd cluster_snap") {
8963 // ** DISABLE THIS FOR NOW **
8964 ss
<< "cluster snapshot currently disabled (broken implementation)";
8965 // ** DISABLE THIS FOR NOW **
8967 } else if (prefix
== "osd down" ||
8968 prefix
== "osd out" ||
8969 prefix
== "osd in" ||
8970 prefix
== "osd rm") {
8974 bool verbose
= true;
8976 vector
<string
> idvec
;
8977 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
8978 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
8983 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
8984 if (prefix
== "osd in") {
8985 // touch out osds only
8986 osdmap
.get_out_osds(osds
);
8988 osdmap
.get_all_osds(osds
);
8991 verbose
= false; // so the output is less noisy.
8993 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
8995 ss
<< "invalid osd id" << osd
;
8998 } else if (!osdmap
.exists(osd
)) {
8999 ss
<< "osd." << osd
<< " does not exist. ";
9006 for (auto &osd
: osds
) {
9007 if (prefix
== "osd down") {
9008 if (osdmap
.is_down(osd
)) {
9010 ss
<< "osd." << osd
<< " is already down. ";
9012 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
9013 ss
<< "marked down osd." << osd
<< ". ";
9016 } else if (prefix
== "osd out") {
9017 if (osdmap
.is_out(osd
)) {
9019 ss
<< "osd." << osd
<< " is already out. ";
9021 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
9022 if (osdmap
.osd_weight
[osd
]) {
9023 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
9024 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
9026 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
9028 ss
<< "marked out osd." << osd
<< ". ";
9029 std::ostringstream msg
;
9030 msg
<< "Client " << op
->get_session()->entity_name
9031 << " marked osd." << osd
<< " out";
9032 if (osdmap
.is_up(osd
)) {
9033 msg
<< ", while it was still marked up";
9035 msg
<< ", after it was down for " << int(down_pending_out
[osd
].sec())
9039 mon
->clog
->info() << msg
.str();
9042 } else if (prefix
== "osd in") {
9043 if (osdmap
.is_in(osd
)) {
9045 ss
<< "osd." << osd
<< " is already in. ";
9047 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
9048 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
9049 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
9050 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
9052 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
9054 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
9056 ss
<< "marked in osd." << osd
<< ". ";
9059 } else if (prefix
== "osd rm") {
9060 err
= prepare_command_osd_remove(osd
);
9062 if (err
== -EBUSY
) {
9065 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
9069 ss
<< ", osd." << osd
;
9071 ss
<< "removed osd." << osd
;
9080 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
9081 get_last_committed() + 1));
9084 } else if (prefix
== "osd add-noup" ||
9085 prefix
== "osd add-nodown" ||
9086 prefix
== "osd add-noin" ||
9087 prefix
== "osd add-noout") {
9096 if (prefix
== "osd add-noup") {
9098 } else if (prefix
== "osd add-nodown") {
9100 } else if (prefix
== "osd add-noin") {
9109 vector
<string
> idvec
;
9110 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
9111 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9117 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9118 osdmap
.get_all_osds(osds
);
9121 // try traditional single osd way
9123 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9125 // ss has reason for failure
9126 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9134 for (auto &osd
: osds
) {
9136 if (!osdmap
.exists(osd
)) {
9137 ss
<< "osd." << osd
<< " does not exist. ";
9143 if (osdmap
.is_up(osd
)) {
9144 ss
<< "osd." << osd
<< " is already up. ";
9148 if (osdmap
.is_noup(osd
)) {
9149 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
))
9152 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
9159 if (osdmap
.is_down(osd
)) {
9160 ss
<< "osd." << osd
<< " is already down. ";
9164 if (osdmap
.is_nodown(osd
)) {
9165 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
))
9168 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
9175 if (osdmap
.is_in(osd
)) {
9176 ss
<< "osd." << osd
<< " is already in. ";
9180 if (osdmap
.is_noin(osd
)) {
9181 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
))
9184 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
9191 if (osdmap
.is_out(osd
)) {
9192 ss
<< "osd." << osd
<< " is already out. ";
9196 if (osdmap
.is_noout(osd
)) {
9197 if (pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
))
9200 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
9207 assert(0 == "invalid option");
9214 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
9215 get_last_committed() + 1));
9218 } else if (prefix
== "osd rm-noup" ||
9219 prefix
== "osd rm-nodown" ||
9220 prefix
== "osd rm-noin" ||
9221 prefix
== "osd rm-noout") {
9230 if (prefix
== "osd rm-noup") {
9232 } else if (prefix
== "osd rm-nodown") {
9234 } else if (prefix
== "osd rm-noin") {
9243 vector
<string
> idvec
;
9244 cmd_getval(g_ceph_context
, cmdmap
, "ids", idvec
);
9246 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9252 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9254 // touch previous noup/nodown/noin/noout osds only
9257 osdmap
.get_noup_osds(&osds
);
9260 osdmap
.get_nodown_osds(&osds
);
9263 osdmap
.get_noin_osds(&osds
);
9266 osdmap
.get_noout_osds(&osds
);
9269 assert(0 == "invalid option");
9272 // cancel any pending noup/nodown/noin/noout requests too
9273 vector
<int> pending_state_osds
;
9274 (void) pending_inc
.get_pending_state_osds(&pending_state_osds
);
9275 for (auto &p
: pending_state_osds
) {
9279 if (!osdmap
.is_noup(p
) &&
9280 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOUP
)) {
9286 if (!osdmap
.is_nodown(p
) &&
9287 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NODOWN
)) {
9293 if (!osdmap
.is_noin(p
) &&
9294 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOIN
)) {
9300 if (!osdmap
.is_noout(p
) &&
9301 pending_inc
.pending_osd_state_clear(p
, CEPH_OSD_NOOUT
)) {
9307 assert(0 == "invalid option");
9313 // try traditional single osd way
9315 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9317 // ss has reason for failure
9318 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9323 osds
.push_back(osd
);
9326 for (auto &osd
: osds
) {
9328 if (!osdmap
.exists(osd
)) {
9329 ss
<< "osd." << osd
<< " does not exist. ";
9335 if (osdmap
.is_noup(osd
)) {
9336 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
9338 } else if (pending_inc
.pending_osd_state_clear(
9339 osd
, CEPH_OSD_NOUP
)) {
9345 if (osdmap
.is_nodown(osd
)) {
9346 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
9348 } else if (pending_inc
.pending_osd_state_clear(
9349 osd
, CEPH_OSD_NODOWN
)) {
9355 if (osdmap
.is_noin(osd
)) {
9356 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
9358 } else if (pending_inc
.pending_osd_state_clear(
9359 osd
, CEPH_OSD_NOIN
)) {
9365 if (osdmap
.is_noout(osd
)) {
9366 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
9368 } else if (pending_inc
.pending_osd_state_clear(
9369 osd
, CEPH_OSD_NOOUT
)) {
9375 assert(0 == "invalid option");
9382 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
9383 get_last_committed() + 1));
9386 } else if (prefix
== "osd pg-temp") {
9388 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
9389 ss
<< "unable to parse 'pgid' value '"
9390 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
9395 if (!pgid
.parse(pgidstr
.c_str())) {
9396 ss
<< "invalid pgid '" << pgidstr
<< "'";
9400 if (!osdmap
.pg_exists(pgid
)) {
9401 ss
<< "pg " << pgid
<< " does not exist";
9405 if (pending_inc
.new_pg_temp
.count(pgid
)) {
9406 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
9407 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9411 vector
<int64_t> id_vec
;
9412 vector
<int32_t> new_pg_temp
;
9413 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
9414 ss
<< "unable to parse 'id' value(s) '"
9415 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9419 for (auto osd
: id_vec
) {
9420 if (!osdmap
.exists(osd
)) {
9421 ss
<< "osd." << osd
<< " does not exist";
9425 new_pg_temp
.push_back(osd
);
9428 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
9429 if ((int)new_pg_temp
.size() < pool_min_size
) {
9430 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
9431 << pool_min_size
<< ")";
9436 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
9437 if ((int)new_pg_temp
.size() > pool_size
) {
9438 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
9439 << pool_size
<< ")";
9444 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
9445 new_pg_temp
.begin(), new_pg_temp
.end());
9446 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
9448 } else if (prefix
== "osd primary-temp") {
9450 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
9451 ss
<< "unable to parse 'pgid' value '"
9452 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
9457 if (!pgid
.parse(pgidstr
.c_str())) {
9458 ss
<< "invalid pgid '" << pgidstr
<< "'";
9462 if (!osdmap
.pg_exists(pgid
)) {
9463 ss
<< "pg " << pgid
<< " does not exist";
9469 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", osd
)) {
9470 ss
<< "unable to parse 'id' value '"
9471 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9475 if (osd
!= -1 && !osdmap
.exists(osd
)) {
9476 ss
<< "osd." << osd
<< " does not exist";
9481 if (osdmap
.require_min_compat_client
> 0 &&
9482 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
9483 ss
<< "require_min_compat_client "
9484 << ceph_release_name(osdmap
.require_min_compat_client
)
9485 << " < firefly, which is required for primary-temp";
9488 } else if (!g_conf
->mon_osd_allow_primary_temp
) {
9489 ss
<< "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
9494 pending_inc
.new_primary_temp
[pgid
] = osd
;
9495 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
9497 } else if (prefix
== "osd pg-upmap" ||
9498 prefix
== "osd rm-pg-upmap" ||
9499 prefix
== "osd pg-upmap-items" ||
9500 prefix
== "osd rm-pg-upmap-items") {
9501 if (osdmap
.require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
9502 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9503 << "luminous' before using the new interface";
9507 if (osdmap
.require_min_compat_client
< CEPH_RELEASE_LUMINOUS
) {
9508 ss
<< "min_compat_client "
9509 << ceph_release_name(osdmap
.require_min_compat_client
)
9510 << " < luminous, which is required for pg-upmap. "
9511 << "Try 'ceph osd set-require-min-compat-client luminous' "
9512 << "before using the new interface";
9516 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
9522 if (!cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
)) {
9523 ss
<< "unable to parse 'pgid' value '"
9524 << cmd_vartype_stringify(cmdmap
["pgid"]) << "'";
9529 if (!pgid
.parse(pgidstr
.c_str())) {
9530 ss
<< "invalid pgid '" << pgidstr
<< "'";
9534 if (!osdmap
.pg_exists(pgid
)) {
9535 ss
<< "pg " << pgid
<< " does not exist";
9544 OP_RM_PG_UPMAP_ITEMS
,
9547 if (prefix
== "osd pg-upmap") {
9548 option
= OP_PG_UPMAP
;
9549 } else if (prefix
== "osd rm-pg-upmap") {
9550 option
= OP_RM_PG_UPMAP
;
9551 } else if (prefix
== "osd pg-upmap-items") {
9552 option
= OP_PG_UPMAP_ITEMS
;
9554 option
= OP_RM_PG_UPMAP_ITEMS
;
9557 // check pending upmap changes
9559 case OP_PG_UPMAP
: // fall through
9560 case OP_RM_PG_UPMAP
:
9561 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
9562 pending_inc
.old_pg_upmap
.count(pgid
)) {
9563 dout(10) << __func__
<< " waiting for pending update on "
9565 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9570 case OP_PG_UPMAP_ITEMS
: // fall through
9571 case OP_RM_PG_UPMAP_ITEMS
:
9572 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
9573 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
9574 dout(10) << __func__
<< " waiting for pending update on "
9576 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9582 assert(0 == "invalid option");
9588 vector
<int64_t> id_vec
;
9589 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
9590 ss
<< "unable to parse 'id' value(s) '"
9591 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9596 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
9597 if ((int)id_vec
.size() < pool_min_size
) {
9598 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
9599 << pool_min_size
<< ")";
9604 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
9605 if ((int)id_vec
.size() > pool_size
) {
9606 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
9607 << pool_size
<< ")";
9612 vector
<int32_t> new_pg_upmap
;
9613 for (auto osd
: id_vec
) {
9614 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
9615 ss
<< "osd." << osd
<< " does not exist";
9619 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
9620 if (it
!= new_pg_upmap
.end()) {
9621 ss
<< "osd." << osd
<< " already exists, ";
9624 new_pg_upmap
.push_back(osd
);
9627 if (new_pg_upmap
.empty()) {
9628 ss
<< "no valid upmap items(pairs) is specified";
9633 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
9634 new_pg_upmap
.begin(), new_pg_upmap
.end());
9635 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
9639 case OP_RM_PG_UPMAP
:
9641 pending_inc
.old_pg_upmap
.insert(pgid
);
9642 ss
<< "clear " << pgid
<< " pg_upmap mapping";
9646 case OP_PG_UPMAP_ITEMS
:
9648 vector
<int64_t> id_vec
;
9649 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id_vec
)) {
9650 ss
<< "unable to parse 'id' value(s) '"
9651 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9656 if (id_vec
.size() % 2) {
9657 ss
<< "you must specify pairs of osd ids to be remapped";
9662 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
9663 if ((int)(id_vec
.size() / 2) > pool_size
) {
9664 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
9665 << pool_size
<< ")";
9670 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
9671 ostringstream items
;
9673 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
9677 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
9680 if (!osdmap
.exists(from
)) {
9681 ss
<< "osd." << from
<< " does not exist";
9685 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
9686 ss
<< "osd." << to
<< " does not exist";
9690 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
9691 auto it
= std::find(new_pg_upmap_items
.begin(),
9692 new_pg_upmap_items
.end(), entry
);
9693 if (it
!= new_pg_upmap_items
.end()) {
9694 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
9697 new_pg_upmap_items
.push_back(entry
);
9698 items
<< from
<< "->" << to
<< ",";
9700 string
out(items
.str());
9701 out
.resize(out
.size() - 1); // drop last ','
9704 if (new_pg_upmap_items
.empty()) {
9705 ss
<< "no valid upmap items(pairs) is specified";
9710 pending_inc
.new_pg_upmap_items
[pgid
] =
9711 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
9712 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
9713 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
9717 case OP_RM_PG_UPMAP_ITEMS
:
9719 pending_inc
.old_pg_upmap_items
.insert(pgid
);
9720 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
9725 assert(0 == "invalid option");
9729 } else if (prefix
== "osd primary-affinity") {
9731 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
9732 ss
<< "invalid osd id value '"
9733 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9738 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
9739 ss
<< "unable to parse 'weight' value '"
9740 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
9744 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
9746 ss
<< "weight must be >= 0";
9750 if (osdmap
.require_min_compat_client
> 0 &&
9751 osdmap
.require_min_compat_client
< CEPH_RELEASE_FIREFLY
) {
9752 ss
<< "require_min_compat_client "
9753 << ceph_release_name(osdmap
.require_min_compat_client
)
9754 << " < firefly, which is required for primary-affinity";
9757 } else if (!g_conf
->mon_osd_allow_primary_affinity
) {
9758 ss
<< "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
9762 err
= check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY
, ss
);
9767 if (osdmap
.exists(id
)) {
9768 pending_inc
.new_primary_affinity
[id
] = ww
;
9769 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
9771 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9772 get_last_committed() + 1));
9775 ss
<< "osd." << id
<< " does not exist";
9779 } else if (prefix
== "osd reweight") {
9781 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
9782 ss
<< "unable to parse osd id value '"
9783 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9788 if (!cmd_getval(g_ceph_context
, cmdmap
, "weight", w
)) {
9789 ss
<< "unable to parse weight value '"
9790 << cmd_vartype_stringify(cmdmap
["weight"]) << "'";
9794 long ww
= (int)((double)CEPH_OSD_IN
*w
);
9796 ss
<< "weight must be >= 0";
9800 if (osdmap
.exists(id
)) {
9801 pending_inc
.new_weight
[id
] = ww
;
9802 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
9804 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9805 get_last_committed() + 1));
9808 ss
<< "osd." << id
<< " does not exist";
9812 } else if (prefix
== "osd reweightn") {
9813 map
<int32_t, uint32_t> weights
;
9814 err
= parse_reweights(g_ceph_context
, cmdmap
, osdmap
, &weights
);
9816 ss
<< "unable to parse 'weights' value '"
9817 << cmd_vartype_stringify(cmdmap
["weights"]) << "'";
9820 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
9821 wait_for_finished_proposal(
9823 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
9825 } else if (prefix
== "osd lost") {
9827 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
9828 ss
<< "unable to parse osd id value '"
9829 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
9834 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) || sure
!= "--yes-i-really-mean-it") {
9835 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
9836 "--yes-i-really-mean-it if you really do.";
9839 } else if (!osdmap
.exists(id
)) {
9840 ss
<< "osd." << id
<< " does not exist";
9843 } else if (!osdmap
.is_down(id
)) {
9844 ss
<< "osd." << id
<< " is not down";
9848 epoch_t e
= osdmap
.get_info(id
).down_at
;
9849 pending_inc
.new_lost
[id
] = e
;
9850 ss
<< "marked osd lost in epoch " << e
;
9852 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9853 get_last_committed() + 1));
9857 } else if (prefix
== "osd destroy" || prefix
== "osd purge") {
9858 /* Destroying an OSD means that we don't expect to further make use of
9859 * the OSDs data (which may even become unreadable after this operation),
9860 * and that we are okay with scrubbing all its cephx keys and config-key
9861 * data (which may include lockbox keys, thus rendering the osd's data
9864 * The OSD will not be removed. Instead, we will mark it as destroyed,
9865 * such that a subsequent call to `create` will not reuse the osd id.
9866 * This will play into being able to recreate the OSD, at the same
9867 * crush location, with minimal data movement.
9870 // make sure authmon is writeable.
9871 if (!mon
->authmon()->is_writeable()) {
9872 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
9873 << "osd destroy" << dendl
;
9874 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
9879 if (!cmd_getval(g_ceph_context
, cmdmap
, "id", id
)) {
9880 ss
<< "unable to parse osd id value '"
9881 << cmd_vartype_stringify(cmdmap
["id"]) << "";
9886 bool is_destroy
= (prefix
== "osd destroy");
9888 assert("osd purge" == prefix
);
9892 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) ||
9893 sure
!= "--yes-i-really-mean-it") {
9894 ss
<< "Are you SURE? This will mean real, permanent data loss, as well "
9895 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
9899 } else if (!osdmap
.exists(id
)) {
9900 ss
<< "osd." << id
<< " does not exist";
9901 err
= 0; // idempotent
9903 } else if (osdmap
.is_up(id
)) {
9904 ss
<< "osd." << id
<< " is not `down`.";
9907 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
9908 ss
<< "destroyed osd." << id
;
9913 bool goto_reply
= false;
9917 err
= prepare_command_osd_destroy(id
, ss
);
9918 // we checked above that it should exist.
9919 assert(err
!= -ENOENT
);
9921 err
= prepare_command_osd_purge(id
, ss
);
9922 if (err
== -ENOENT
) {
9924 ss
<< "osd." << id
<< " does not exist.";
9930 if (err
< 0 || goto_reply
) {
9935 ss
<< "destroyed osd." << id
;
9937 ss
<< "purged osd." << id
;
9941 wait_for_finished_proposal(op
,
9942 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
9943 force_immediate_propose();
9946 } else if (prefix
== "osd new") {
9948 // make sure authmon is writeable.
9949 if (!mon
->authmon()->is_writeable()) {
9950 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
9951 << "osd new" << dendl
;
9952 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
9956 map
<string
,string
> secrets_map
;
9958 bufferlist bl
= m
->get_data();
9959 string secrets_json
= bl
.to_str();
9960 dout(20) << __func__
<< " osd new json = " << secrets_json
<< dendl
;
9962 err
= get_json_str_map(secrets_json
, ss
, &secrets_map
);
9966 dout(20) << __func__
<< " osd new secrets " << secrets_map
<< dendl
;
9969 err
= prepare_command_osd_new(op
, cmdmap
, secrets_map
, ss
, f
.get());
9982 if (err
== EEXIST
) {
9983 // idempotent operation
9988 wait_for_finished_proposal(op
,
9989 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
9990 get_last_committed() + 1));
9991 force_immediate_propose();
9994 } else if (prefix
== "osd create") {
9996 // optional id provided?
9997 int64_t id
= -1, cmd_id
= -1;
9998 if (cmd_getval(g_ceph_context
, cmdmap
, "id", cmd_id
)) {
10000 ss
<< "invalid osd id value '" << cmd_id
<< "'";
10004 dout(10) << " osd create got id " << cmd_id
<< dendl
;
10009 if (cmd_getval(g_ceph_context
, cmdmap
, "uuid", uuidstr
)) {
10010 if (!uuid
.parse(uuidstr
.c_str())) {
10011 ss
<< "invalid uuid value '" << uuidstr
<< "'";
10015 // we only care about the id if we also have the uuid, to
10016 // ensure the operation's idempotency.
10020 int32_t new_id
= -1;
10021 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
10023 if (err
== -EAGAIN
) {
10024 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10027 // a check has failed; reply to the user.
10030 } else if (err
== EEXIST
) {
10031 // this is an idempotent operation; we can go ahead and reply.
10033 f
->open_object_section("created_osd");
10034 f
->dump_int("osdid", new_id
);
10035 f
->close_section();
10045 do_osd_create(id
, uuid
, &new_id
);
10048 f
->open_object_section("created_osd");
10049 f
->dump_int("osdid", new_id
);
10050 f
->close_section();
10056 wait_for_finished_proposal(op
,
10057 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
10058 get_last_committed() + 1));
10061 } else if (prefix
== "osd blacklist clear") {
10062 pending_inc
.new_blacklist
.clear();
10063 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
10064 osdmap
.get_blacklist(&blacklist
);
10065 for (const auto &entry
: blacklist
) {
10066 pending_inc
.old_blacklist
.push_back(entry
.first
);
10068 ss
<< " removed all blacklist entries";
10070 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10071 get_last_committed() + 1));
10073 } else if (prefix
== "osd blacklist") {
10075 cmd_getval(g_ceph_context
, cmdmap
, "addr", addrstr
);
10076 entity_addr_t addr
;
10077 if (!addr
.parse(addrstr
.c_str(), 0)) {
10078 ss
<< "unable to parse address " << addrstr
;
10083 string blacklistop
;
10084 cmd_getval(g_ceph_context
, cmdmap
, "blacklistop", blacklistop
);
10085 if (blacklistop
== "add") {
10086 utime_t expires
= ceph_clock_now();
10088 // default one hour
10089 cmd_getval(g_ceph_context
, cmdmap
, "expire", d
,
10090 g_conf
->mon_osd_blacklist_default_expire
);
10093 pending_inc
.new_blacklist
[addr
] = expires
;
10096 // cancel any pending un-blacklisting request too
10097 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
10098 pending_inc
.old_blacklist
.end(), addr
);
10099 if (it
!= pending_inc
.old_blacklist
.end()) {
10100 pending_inc
.old_blacklist
.erase(it
);
10104 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
10106 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10107 get_last_committed() + 1));
10109 } else if (blacklistop
== "rm") {
10110 if (osdmap
.is_blacklisted(addr
) ||
10111 pending_inc
.new_blacklist
.count(addr
)) {
10112 if (osdmap
.is_blacklisted(addr
))
10113 pending_inc
.old_blacklist
.push_back(addr
);
10115 pending_inc
.new_blacklist
.erase(addr
);
10116 ss
<< "un-blacklisting " << addr
;
10118 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10119 get_last_committed() + 1));
10122 ss
<< addr
<< " isn't blacklisted";
10127 } else if (prefix
== "osd pool mksnap") {
10129 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10130 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
10132 ss
<< "unrecognized pool '" << poolstr
<< "'";
10137 cmd_getval(g_ceph_context
, cmdmap
, "snap", snapname
);
10138 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
10139 if (p
->is_unmanaged_snaps_mode()) {
10140 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
10143 } else if (p
->snap_exists(snapname
.c_str())) {
10144 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
10147 } else if (p
->is_tier()) {
10148 ss
<< "pool " << poolstr
<< " is a cache tier";
10153 if (pending_inc
.new_pools
.count(pool
))
10154 pp
= &pending_inc
.new_pools
[pool
];
10156 pp
= &pending_inc
.new_pools
[pool
];
10159 if (pp
->snap_exists(snapname
.c_str())) {
10160 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
10162 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
10163 pp
->set_snap_epoch(pending_inc
.epoch
);
10164 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
10167 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10168 get_last_committed() + 1));
10170 } else if (prefix
== "osd pool rmsnap") {
10172 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10173 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
10175 ss
<< "unrecognized pool '" << poolstr
<< "'";
10180 cmd_getval(g_ceph_context
, cmdmap
, "snap", snapname
);
10181 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
10182 if (p
->is_unmanaged_snaps_mode()) {
10183 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
10186 } else if (!p
->snap_exists(snapname
.c_str())) {
10187 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
10192 if (pending_inc
.new_pools
.count(pool
))
10193 pp
= &pending_inc
.new_pools
[pool
];
10195 pp
= &pending_inc
.new_pools
[pool
];
10198 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
10200 pp
->remove_snap(sn
);
10201 pp
->set_snap_epoch(pending_inc
.epoch
);
10202 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
10204 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
10207 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10208 get_last_committed() + 1));
10210 } else if (prefix
== "osd pool create") {
10213 cmd_getval(g_ceph_context
, cmdmap
, "pg_num", pg_num
, int64_t(0));
10214 cmd_getval(g_ceph_context
, cmdmap
, "pgp_num", pgp_num
, pg_num
);
10216 string pool_type_str
;
10217 cmd_getval(g_ceph_context
, cmdmap
, "pool_type", pool_type_str
);
10218 if (pool_type_str
.empty())
10219 pool_type_str
= g_conf
->osd_pool_default_type
;
10222 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10223 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10224 if (pool_id
>= 0) {
10225 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10226 if (pool_type_str
!= p
->get_type_name()) {
10227 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
10230 ss
<< "pool '" << poolstr
<< "' already exists";
10237 if (pool_type_str
== "replicated") {
10238 pool_type
= pg_pool_t::TYPE_REPLICATED
;
10239 } else if (pool_type_str
== "erasure") {
10240 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
|
10241 CEPH_FEATURE_OSD_ERASURE_CODES
,
10243 if (err
== -EAGAIN
)
10247 pool_type
= pg_pool_t::TYPE_ERASURE
;
10249 ss
<< "unknown pool type '" << pool_type_str
<< "'";
10254 bool implicit_rule_creation
= false;
10256 cmd_getval(g_ceph_context
, cmdmap
, "rule", rule_name
);
10257 string erasure_code_profile
;
10258 cmd_getval(g_ceph_context
, cmdmap
, "erasure_code_profile", erasure_code_profile
);
10260 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
10261 if (erasure_code_profile
== "")
10262 erasure_code_profile
= "default";
10263 //handle the erasure code profile
10264 if (erasure_code_profile
== "default") {
10265 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
10266 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
10267 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
10271 map
<string
,string
> profile_map
;
10272 err
= osdmap
.get_erasure_code_profile_default(g_ceph_context
,
10277 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
10278 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
10282 if (rule_name
== "") {
10283 implicit_rule_creation
= true;
10284 if (erasure_code_profile
== "default") {
10285 rule_name
= "erasure-code";
10287 dout(1) << "implicitly use rule named after the pool: "
10288 << poolstr
<< dendl
;
10289 rule_name
= poolstr
;
10293 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
10294 rule_name
= erasure_code_profile
;
10297 if (!implicit_rule_creation
&& rule_name
!= "") {
10299 err
= get_crush_rule(rule_name
, &rule
, &ss
);
10300 if (err
== -EAGAIN
) {
10301 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10308 int64_t expected_num_objects
;
10309 cmd_getval(g_ceph_context
, cmdmap
, "expected_num_objects", expected_num_objects
, int64_t(0));
10310 if (expected_num_objects
< 0) {
10311 ss
<< "'expected_num_objects' must be non-negative";
10316 int64_t fast_read_param
;
10317 cmd_getval(g_ceph_context
, cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
10318 FastReadType fast_read
= FAST_READ_DEFAULT
;
10319 if (fast_read_param
== 0)
10320 fast_read
= FAST_READ_OFF
;
10321 else if (fast_read_param
> 0)
10322 fast_read
= FAST_READ_ON
;
10324 err
= prepare_new_pool(poolstr
, 0, // auid=0 for admin created pool
10325 -1, // default crush rule
10328 erasure_code_profile
, pool_type
,
10329 (uint64_t)expected_num_objects
,
10335 ss
<< "pool '" << poolstr
<< "' already exists";
10338 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10347 ss
<< "pool '" << poolstr
<< "' created";
10350 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10351 get_last_committed() + 1));
10354 } else if (prefix
== "osd pool delete" ||
10355 prefix
== "osd pool rm") {
10356 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
10357 string poolstr
, poolstr2
, sure
;
10358 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10359 cmd_getval(g_ceph_context
, cmdmap
, "pool2", poolstr2
);
10360 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
10361 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
10363 ss
<< "pool '" << poolstr
<< "' does not exist";
10368 bool force_no_fake
= sure
== "--yes-i-really-really-mean-it-not-faking";
10369 if (poolstr2
!= poolstr
||
10370 (sure
!= "--yes-i-really-really-mean-it" && !force_no_fake
)) {
10371 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
10372 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
10373 << "followed by --yes-i-really-really-mean-it.";
10377 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
10378 if (err
== -EAGAIN
) {
10379 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10385 } else if (prefix
== "osd pool rename") {
10386 string srcpoolstr
, destpoolstr
;
10387 cmd_getval(g_ceph_context
, cmdmap
, "srcpool", srcpoolstr
);
10388 cmd_getval(g_ceph_context
, cmdmap
, "destpool", destpoolstr
);
10389 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
10390 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
10392 if (pool_src
< 0) {
10393 if (pool_dst
>= 0) {
10394 // src pool doesn't exist, dst pool does exist: to ensure idempotency
10395 // of operations, assume this rename succeeded, as it is not changing
10396 // the current state. Make sure we output something understandable
10397 // for whoever is issuing the command, if they are paying attention,
10398 // in case it was not intentional; or to avoid a "wtf?" and a bug
10399 // report in case it was intentional, while expecting a failure.
10400 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
10401 << destpoolstr
<< "' does -- assuming successful rename";
10404 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
10408 } else if (pool_dst
>= 0) {
10409 // source pool exists and so does the destination pool
10410 ss
<< "pool '" << destpoolstr
<< "' already exists";
10415 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
10417 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
10419 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
10420 << cpp_strerror(ret
);
10423 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
10424 get_last_committed() + 1));
10427 } else if (prefix
== "osd pool set") {
10428 err
= prepare_command_pool_set(cmdmap
, ss
);
10429 if (err
== -EAGAIN
)
10435 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10436 get_last_committed() + 1));
10438 } else if (prefix
== "osd tier add") {
10439 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
10440 if (err
== -EAGAIN
)
10445 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10446 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10448 ss
<< "unrecognized pool '" << poolstr
<< "'";
10452 string tierpoolstr
;
10453 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
10454 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
10455 if (tierpool_id
< 0) {
10456 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
10460 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10462 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
10465 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
10469 // make sure new tier is empty
10470 string force_nonempty
;
10471 cmd_getval(g_ceph_context
, cmdmap
, "force_nonempty", force_nonempty
);
10472 const pool_stat_t
*pstats
= mon
->pgservice
->get_pool_stat(tierpool_id
);
10473 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
10474 force_nonempty
!= "--force-nonempty") {
10475 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
10479 if (tp
->ec_pool()) {
10480 ss
<< "tier pool '" << tierpoolstr
10481 << "' is an ec pool, which cannot be a tier";
10485 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
10486 ((force_nonempty
!= "--force-nonempty") ||
10487 (!g_conf
->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
10488 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
10493 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10494 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
10495 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
10496 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10499 np
->tiers
.insert(tierpool_id
);
10500 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
10501 ntp
->tier_of
= pool_id
;
10502 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
10503 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10504 get_last_committed() + 1));
10506 } else if (prefix
== "osd tier remove" ||
10507 prefix
== "osd tier rm") {
10509 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10510 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10512 ss
<< "unrecognized pool '" << poolstr
<< "'";
10516 string tierpoolstr
;
10517 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
10518 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
10519 if (tierpool_id
< 0) {
10520 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
10524 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10526 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
10529 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
10533 if (p
->tiers
.count(tierpool_id
) == 0) {
10534 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
10538 if (tp
->tier_of
!= pool_id
) {
10539 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
10540 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
10541 // be scary about it; this is an inconsistency and bells must go off
10542 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
10546 if (p
->read_tier
== tierpool_id
) {
10547 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
10552 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10553 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
10554 if (np
->tiers
.count(tierpool_id
) == 0 ||
10555 ntp
->tier_of
!= pool_id
||
10556 np
->read_tier
== tierpool_id
) {
10557 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10560 np
->tiers
.erase(tierpool_id
);
10562 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
10563 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10564 get_last_committed() + 1));
10566 } else if (prefix
== "osd tier set-overlay") {
10567 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
10568 if (err
== -EAGAIN
)
10573 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10574 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10576 ss
<< "unrecognized pool '" << poolstr
<< "'";
10580 string overlaypoolstr
;
10581 cmd_getval(g_ceph_context
, cmdmap
, "overlaypool", overlaypoolstr
);
10582 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
10583 if (overlaypool_id
< 0) {
10584 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
10588 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10590 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
10592 if (p
->tiers
.count(overlaypool_id
) == 0) {
10593 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
10597 if (p
->read_tier
== overlaypool_id
) {
10599 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
10602 if (p
->has_read_tier()) {
10603 ss
<< "pool '" << poolstr
<< "' has overlay '"
10604 << osdmap
.get_pool_name(p
->read_tier
)
10605 << "'; please remove-overlay first";
10611 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10612 np
->read_tier
= overlaypool_id
;
10613 np
->write_tier
= overlaypool_id
;
10614 np
->set_last_force_op_resend(pending_inc
.epoch
);
10615 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
10616 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
10617 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
10618 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
10619 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
10620 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10621 get_last_committed() + 1));
10623 } else if (prefix
== "osd tier remove-overlay" ||
10624 prefix
== "osd tier rm-overlay") {
10626 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10627 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10629 ss
<< "unrecognized pool '" << poolstr
<< "'";
10633 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10635 if (!p
->has_read_tier()) {
10637 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
10641 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
10646 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10647 if (np
->has_read_tier()) {
10648 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
10649 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
10650 nop
->set_last_force_op_resend(pending_inc
.epoch
);
10652 if (np
->has_write_tier()) {
10653 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
10654 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
10655 nop
->set_last_force_op_resend(pending_inc
.epoch
);
10657 np
->clear_read_tier();
10658 np
->clear_write_tier();
10659 np
->set_last_force_op_resend(pending_inc
.epoch
);
10660 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
10661 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10662 get_last_committed() + 1));
10664 } else if (prefix
== "osd tier cache-mode") {
10665 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
10666 if (err
== -EAGAIN
)
10671 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10672 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10674 ss
<< "unrecognized pool '" << poolstr
<< "'";
10678 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10680 if (!p
->is_tier()) {
10681 ss
<< "pool '" << poolstr
<< "' is not a tier";
10686 cmd_getval(g_ceph_context
, cmdmap
, "mode", modestr
);
10687 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
10689 ss
<< "'" << modestr
<< "' is not a valid cache mode";
10695 cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
);
10696 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10697 mode
!= pg_pool_t::CACHEMODE_NONE
&&
10698 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
10699 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
10700 sure
!= "--yes-i-really-mean-it") {
10701 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
10702 << "corrupt your data. pass --yes-i-really-mean-it to force.";
10707 // pool already has this cache-mode set and there are no pending changes
10708 if (p
->cache_mode
== mode
&&
10709 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
10710 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
10711 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
10712 << " to " << pg_pool_t::get_cache_mode_name(mode
);
10717 /* Mode description:
10719 * none: No cache-mode defined
10720 * forward: Forward all reads and writes to base pool
10721 * writeback: Cache writes, promote reads from base pool
10722 * readonly: Forward writes to base pool
10723 * readforward: Writes are in writeback mode, Reads are in forward mode
10724 * proxy: Proxy all reads and writes to base pool
10725 * readproxy: Writes are in writeback mode, Reads are in proxy mode
10727 * Hence, these are the allowed transitions:
10730 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10731 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10732 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
10733 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
10734 * writeback -> readforward || readproxy || forward || proxy
10738 // We check if the transition is valid against the current pool mode, as
10739 // it is the only committed state thus far. We will blatantly squash
10740 // whatever mode is on the pending state.
10742 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
10743 (mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
10744 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
10745 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
10746 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
10747 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
10748 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
10749 << "' pool; only '"
10750 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD
)
10752 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
10754 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD
)
10756 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
10761 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
10762 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10763 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
10764 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
10765 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
10767 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
10768 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10769 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
10770 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
10771 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
10773 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
10774 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10775 mode
!= pg_pool_t::CACHEMODE_FORWARD
&&
10776 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
10777 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
10779 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
10780 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10781 mode
!= pg_pool_t::CACHEMODE_READFORWARD
&&
10782 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
10783 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
10785 const pool_stat_t
* pstats
=
10786 mon
->pgservice
->get_pool_stat(pool_id
);
10788 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
10789 ss
<< "unable to set cache-mode '"
10790 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
10791 << "': dirty objects found";
10797 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10798 np
->cache_mode
= mode
;
10799 // set this both when moving to and from cache_mode NONE. this is to
10800 // capture legacy pools that were set up before this flag existed.
10801 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
10802 ss
<< "set cache-mode for pool '" << poolstr
10803 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
10804 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
10805 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
10807 if (base_pool
->read_tier
== pool_id
||
10808 base_pool
->write_tier
== pool_id
)
10809 ss
<<" (WARNING: pool is still configured as read or write tier)";
10811 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10812 get_last_committed() + 1));
10814 } else if (prefix
== "osd tier add-cache") {
10815 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
10816 if (err
== -EAGAIN
)
10821 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10822 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10824 ss
<< "unrecognized pool '" << poolstr
<< "'";
10828 string tierpoolstr
;
10829 cmd_getval(g_ceph_context
, cmdmap
, "tierpool", tierpoolstr
);
10830 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
10831 if (tierpool_id
< 0) {
10832 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
10836 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
10838 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
10841 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
10846 if (!cmd_getval(g_ceph_context
, cmdmap
, "size", size
)) {
10847 ss
<< "unable to parse 'size' value '"
10848 << cmd_vartype_stringify(cmdmap
["size"]) << "'";
10852 // make sure new tier is empty
10853 const pool_stat_t
*pstats
=
10854 mon
->pgservice
->get_pool_stat(tierpool_id
);
10855 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
10856 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
10860 string modestr
= g_conf
->osd_tier_default_cache_mode
;
10861 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
10863 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
10867 HitSet::Params hsp
;
10868 if (g_conf
->osd_tier_default_cache_hit_set_type
== "bloom") {
10869 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
10870 bsp
->set_fpp(g_conf
->osd_pool_default_hit_set_bloom_fpp
);
10871 hsp
= HitSet::Params(bsp
);
10872 } else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_hash") {
10873 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
10875 else if (g_conf
->osd_tier_default_cache_hit_set_type
== "explicit_object") {
10876 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
10878 ss
<< "osd tier cache default hit set type '" <<
10879 g_conf
->osd_tier_default_cache_hit_set_type
<< "' is not a known type";
10884 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
10885 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
10886 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
10887 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10890 np
->tiers
.insert(tierpool_id
);
10891 np
->read_tier
= np
->write_tier
= tierpool_id
;
10892 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
10893 np
->set_last_force_op_resend(pending_inc
.epoch
);
10894 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
10895 ntp
->tier_of
= pool_id
;
10896 ntp
->cache_mode
= mode
;
10897 ntp
->hit_set_count
= g_conf
->osd_tier_default_cache_hit_set_count
;
10898 ntp
->hit_set_period
= g_conf
->osd_tier_default_cache_hit_set_period
;
10899 ntp
->min_read_recency_for_promote
= g_conf
->osd_tier_default_cache_min_read_recency_for_promote
;
10900 ntp
->min_write_recency_for_promote
= g_conf
->osd_tier_default_cache_min_write_recency_for_promote
;
10901 ntp
->hit_set_grade_decay_rate
= g_conf
->osd_tier_default_cache_hit_set_grade_decay_rate
;
10902 ntp
->hit_set_search_last_n
= g_conf
->osd_tier_default_cache_hit_set_search_last_n
;
10903 ntp
->hit_set_params
= hsp
;
10904 ntp
->target_max_bytes
= size
;
10905 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
10906 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
10907 get_last_committed() + 1));
10909 } else if (prefix
== "osd pool set-quota") {
10911 cmd_getval(g_ceph_context
, cmdmap
, "pool", poolstr
);
10912 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
10914 ss
<< "unrecognized pool '" << poolstr
<< "'";
10920 cmd_getval(g_ceph_context
, cmdmap
, "field", field
);
10921 if (field
!= "max_objects" && field
!= "max_bytes") {
10922 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
10927 // val could contain unit designations, so we treat as a string
10929 cmd_getval(g_ceph_context
, cmdmap
, "val", val
);
10931 int64_t value
= unit_to_bytesize(val
, &tss
);
10933 ss
<< "error parsing value '" << value
<< "': " << tss
.str();
10938 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
10939 if (field
== "max_objects") {
10940 pi
->quota_max_objects
= value
;
10941 } else if (field
== "max_bytes") {
10942 pi
->quota_max_bytes
= value
;
10944 assert(0 == "unrecognized option");
10946 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
10948 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10949 get_last_committed() + 1));
10951 } else if (prefix
== "osd pool application enable" ||
10952 prefix
== "osd pool application disable" ||
10953 prefix
== "osd pool application set" ||
10954 prefix
== "osd pool application rm") {
10955 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
10956 if (err
== -EAGAIN
)
10962 wait_for_finished_proposal(
10963 op
, new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
10965 } else if (prefix
== "osd reweight-by-pg" ||
10966 prefix
== "osd reweight-by-utilization" ||
10967 prefix
== "osd test-reweight-by-pg" ||
10968 prefix
== "osd test-reweight-by-utilization") {
10970 prefix
== "osd reweight-by-pg" || prefix
== "osd test-reweight-by-pg";
10972 prefix
== "osd test-reweight-by-pg" ||
10973 prefix
== "osd test-reweight-by-utilization";
10975 cmd_getval(g_ceph_context
, cmdmap
, "oload", oload
, int64_t(120));
10976 set
<int64_t> pools
;
10977 vector
<string
> poolnamevec
;
10978 cmd_getval(g_ceph_context
, cmdmap
, "pools", poolnamevec
);
10979 for (unsigned j
= 0; j
< poolnamevec
.size(); j
++) {
10980 int64_t pool
= osdmap
.lookup_pg_pool_name(poolnamevec
[j
]);
10982 ss
<< "pool '" << poolnamevec
[j
] << "' does not exist";
10986 pools
.insert(pool
);
10988 double max_change
= g_conf
->mon_reweight_max_change
;
10989 cmd_getval(g_ceph_context
, cmdmap
, "max_change", max_change
);
10990 if (max_change
<= 0.0) {
10991 ss
<< "max_change " << max_change
<< " must be positive";
10995 int64_t max_osds
= g_conf
->mon_reweight_max_osds
;
10996 cmd_getval(g_ceph_context
, cmdmap
, "max_osds", max_osds
);
10997 if (max_osds
<= 0) {
10998 ss
<< "max_osds " << max_osds
<< " must be positive";
11002 string no_increasing
;
11003 cmd_getval(g_ceph_context
, cmdmap
, "no_increasing", no_increasing
);
11005 mempool::osdmap::map
<int32_t, uint32_t> new_weights
;
11006 err
= mon
->pgservice
->reweight_by_utilization(osdmap
,
11011 pools
.empty() ? NULL
: &pools
,
11012 no_increasing
== "--no-increasing",
11014 &ss
, &out_str
, f
.get());
11016 dout(10) << "reweight::by_utilization: finished with " << out_str
<< dendl
;
11021 rdata
.append(out_str
);
11023 ss
<< "FAILED reweight-by-pg";
11024 } else if (err
== 0 || dry_run
) {
11027 ss
<< "SUCCESSFUL reweight-by-pg";
11028 pending_inc
.new_weight
= std::move(new_weights
);
11029 wait_for_finished_proposal(
11031 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
11034 } else if (prefix
== "osd force-create-pg") {
11037 cmd_getval(g_ceph_context
, cmdmap
, "pgid", pgidstr
);
11038 if (!pgid
.parse(pgidstr
.c_str())) {
11039 ss
<< "invalid pgid '" << pgidstr
<< "'";
11045 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
11046 auto emplaced
= creating_pgs
.pgs
.emplace(pgid
,
11047 make_pair(osdmap
.get_epoch(),
11048 ceph_clock_now()));
11049 creating_now
= emplaced
.second
;
11051 if (creating_now
) {
11052 ss
<< "pg " << pgidstr
<< " now creating, ok";
11056 ss
<< "pg " << pgid
<< " already creating";
11066 if (err
< 0 && rs
.length() == 0)
11067 rs
= cpp_strerror(err
);
11068 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
11073 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11074 get_last_committed() + 1));
11078 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11082 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
11084 op
->mark_osdmon_event(__func__
);
11085 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11087 if (m
->fsid
!= mon
->monmap
->fsid
) {
11088 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
11089 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
11090 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11094 if (m
->op
== POOL_OP_CREATE
)
11095 return preprocess_pool_op_create(op
);
11097 if (!osdmap
.get_pg_pool(m
->pool
)) {
11098 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
11099 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11103 // check if the snap and snapname exist
11104 bool snap_exists
= false;
11105 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
11106 if (p
->snap_exists(m
->name
.c_str()))
11107 snap_exists
= true;
11110 case POOL_OP_CREATE_SNAP
:
11111 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
11112 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11116 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11120 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11121 if (p
->is_pool_snaps_mode()) {
11122 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11126 case POOL_OP_DELETE_SNAP
:
11127 if (p
->is_unmanaged_snaps_mode()) {
11128 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11131 if (!snap_exists
) {
11132 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11136 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11137 if (p
->is_pool_snaps_mode()) {
11138 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11141 if (p
->is_removed_snap(m
->snapid
)) {
11142 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11146 case POOL_OP_DELETE
:
11147 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
11148 _pool_op_reply(op
, 0, osdmap
.get_epoch());
11152 case POOL_OP_AUID_CHANGE
:
// Preprocess a POOL_OP_CREATE request: enforce session capabilities and
// short-circuit idempotent creates (pool name already exists) without
// starting a proposal.
// NOTE(review): several guard lines (e.g. the `if (!session)` check that
// presumably precedes the first -EPERM reply, and the surrounding returns)
// are elided from this extract.
11162 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
11164 op
->mark_osdmon_event(__func__
);
11165 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11166 MonSession
*session
= m
->get_session();
// Reject when no session is available -- TODO(review): the guarding `if`
// is not visible here; confirm.
11168 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Creating a pool requires write capability on the "osd" service.
11171 if (!session
->is_capable("osd", MON_CAP_W
)) {
11172 dout(5) << "attempt to create new pool without sufficient auid privileges!"
11173 << "message: " << *m
<< std::endl
11174 << "caps: " << session
->caps
<< dendl
;
11175 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// If a pool with this name already exists, creation is an idempotent
// success (reply 0); otherwise fall through to the prepare path.
11179 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
11181 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Apply a pool op to the pending (uncommitted) map: dispatch create/delete
// to their dedicated handlers, validate snapshot-mode invariants against
// both the committed pool and the projected pending copy, mutate a local
// pg_pool_t (`pp`), record it in pending_inc.new_pools, and register a
// C_PoolOp callback to reply once the proposal commits.
// NOTE(review): this extract is elided -- both `switch (m->op)` headers,
// many braces, `return` statements and some case bodies are missing; the
// comments below only describe what the visible lines establish.
11188 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
11190 op
->mark_osdmon_event(__func__
);
11191 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11192 dout(10) << "prepare_pool_op " << *m
<< dendl
;
// Create and delete have dedicated prepare paths.
11193 if (m
->op
== POOL_OP_CREATE
) {
11194 return prepare_pool_op_create(op
);
11195 } else if (m
->op
== POOL_OP_DELETE
) {
11196 return prepare_pool_op_delete(op
);
// Tracks whether this op actually mutated the projected pool info.
11200 bool changed
= false;
// The target pool must exist in the committed map.
11202 if (!osdmap
.have_pg_pool(m
->pool
)) {
11203 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
11207 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
// First validation pass against the *committed* pool state (switch header
// not visible in this extract).
// Pool snaps cannot be created on a tier; reply with `ret` (set in the
// elided lines) and bail.
11210 case POOL_OP_CREATE_SNAP
:
11211 if (pool
->is_tier()) {
11213 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
11215 } // else, fall through
// Creating an existing snap or deleting a missing one is idempotent; the
// reply below uses `ret` computed in the elided lines.
11216 case POOL_OP_DELETE_SNAP
:
11217 if (!pool
->is_unmanaged_snaps_mode()) {
11218 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
11219 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
11220 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
11228 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
11231 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11232 // we won't allow removal of an unmanaged snapshot from a pool
11233 // not in unmanaged snaps mode.
11234 if (!pool
->is_unmanaged_snaps_mode()) {
11235 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
11239 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11240 // but we will allow creating an unmanaged snapshot on any pool
11241 // as long as it is not in 'pool' snaps mode.
11242 if (pool
->is_pool_snaps_mode()) {
11243 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
11248 // projected pool info
// `pp` is the projected pool: start from any already-pending change for
// this pool, otherwise copy the committed state.
11250 if (pending_inc
.new_pools
.count(m
->pool
))
11251 pp
= pending_inc
.new_pools
[m
->pool
];
11253 pp
= *osdmap
.get_pg_pool(m
->pool
);
// Payload returned to the client alongside the reply (e.g. the new snapid).
11255 bufferlist reply_data
;
11257 // pool snaps vs unmanaged snaps are mutually exclusive
// Second validation pass, this time against the *projected* pool `pp`
// (switch header elided).
11259 case POOL_OP_CREATE_SNAP
:
11260 case POOL_OP_DELETE_SNAP
:
11261 if (pp
.is_unmanaged_snaps_mode()) {
11267 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11268 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11269 if (pp
.is_pool_snaps_mode()) {
// Mutation pass: actually apply the op to the projected pool (switch
// header elided).
11276 case POOL_OP_CREATE_SNAP
:
11277 if (!pp
.snap_exists(m
->name
.c_str())) {
11278 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
11279 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
<< " seq " << pp
.get_snap_epoch() << dendl
;
11284 case POOL_OP_DELETE_SNAP
:
// Resolve the snap name to its id; removal itself is in the elided lines.
11286 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
// Allocate a new unmanaged snapid (allocation lines elided) and return it
// to the client encoded in reply_data.
11294 case POOL_OP_CREATE_UNMANAGED_SNAP
:
11297 pp
.add_unmanaged_snap(snapid
);
11298 ::encode(snapid
, reply_data
);
11303 case POOL_OP_DELETE_UNMANAGED_SNAP
:
11304 if (!pp
.is_removed_snap(m
->snapid
)) {
11305 pp
.remove_unmanaged_snap(m
->snapid
);
// Change the pool's auid only when it actually differs.
11310 case POOL_OP_AUID_CHANGE
:
11311 if (pp
.auid
!= m
->auid
) {
// Stamp the projected pool with the pending epoch and queue it for the
// next map increment, then reply after the proposal commits.
11323 pp
.set_snap_epoch(pending_inc
.epoch
);
11324 pending_inc
.new_pools
[m
->pool
] = pp
;
11328 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
// Prepare a pool-create op: stage the new pool via prepare_new_pool() and
// register a C_PoolOp callback that replies with `err` (and the pending
// epoch) once the proposal commits.
// NOTE(review): the trailing `return true;` and closing brace are elided
// from this extract.
11332 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
11334 op
->mark_osdmon_event(__func__
);
11335 int err
= prepare_new_pool(op
);
11336 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
// Policy gate for pool deletion: writes a human-readable reason into *ss
// and (in lines elided from this extract) returns a non-zero error when
// deletion must be refused; the final message indicates the pool may be
// removed. Visible refusal reasons: pool in use by CephFS, pool is a tier,
// pool has tiers, mon_allow_pool_delete disabled, FLAG_NODELETE set.
// NOTE(review): the `ostream *ss` parameter line, the return statements and
// the closing brace are all elided from this view.
11340 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
11343 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
11345 // If the pool is in use by CephFS, refuse to delete it.
11346 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending();
11347 if (pending_fsmap
.pool_in_use(pool_id
)) {
11348 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
// A cache tier cannot be deleted while still attached to its base pool.
11352 if (pool
.tier_of
>= 0) {
11353 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
11354 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
// A base pool cannot be deleted while it still has tiers; list them.
11357 if (!pool
.tiers
.empty()) {
11358 *ss
<< "pool '" << poolstr
<< "' has tiers";
11359 for(auto tier
: pool
.tiers
) {
11360 *ss
<< " " << osdmap
.get_pool_name(tier
);
// Global safety switch: deletion is disabled unless the admin has set
// mon_allow_pool_delete = true.
11365 if (!g_conf
->mon_allow_pool_delete
) {
11366 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
// Per-pool safety flag: NODELETE must be unset before deletion.
11370 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
11371 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
// All checks passed: deletion may proceed.
11375 *ss
<< "pool '" << poolstr
<< "' removed";
11380 * Check if it is safe to add a tier to a base pool
11383 * True if the operation should proceed, false if we should abort here
11384 * (abort doesn't necessarily mean error, could be idempotency)
11386 bool OSDMonitor::_check_become_tier(
11387 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
11388 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
11392 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
11393 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
11395 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending();
11396 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
11397 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
11402 if (base_pool
->tiers
.count(tier_pool_id
)) {
11403 assert(tier_pool
->tier_of
== base_pool_id
);
11405 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
11406 << base_pool_name
<< "'";
11410 if (base_pool
->is_tier()) {
11411 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
11412 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
11413 << "multiple tiers are not yet supported.";
11418 if (tier_pool
->has_tiers()) {
11419 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
11420 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
11421 it
!= tier_pool
->tiers
.end(); ++it
)
11422 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
11423 *ss
<< " multiple tiers are not yet supported.";
11428 if (tier_pool
->is_tier()) {
11429 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
11430 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
11441 * Check if it is safe to remove a tier from this base pool
11444 * True if the operation should proceed, false if we should abort here
11445 * (abort doesn't necessarily mean error, could be idempotency)
11447 bool OSDMonitor::_check_remove_tier(
11448 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
11449 const pg_pool_t
*tier_pool
,
11450 int *err
, ostream
*ss
) const
11452 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
11454 // Apply CephFS-specific checks
11455 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending();
11456 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
11457 if (base_pool
->type
!= pg_pool_t::TYPE_REPLICATED
) {
11458 // If the underlying pool is erasure coded, we can't permit the
11459 // removal of the replicated tier that CephFS relies on to access it
11460 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS via its tier";
11465 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
11466 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
11467 "tier is still in use as a writeback cache. Change the cache "
11468 "mode and flush the cache before removing it";
11478 int OSDMonitor::_prepare_remove_pool(
11479 int64_t pool
, ostream
*ss
, bool no_fake
)
11481 dout(10) << __func__
<< " " << pool
<< dendl
;
11482 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
11483 int r
= _check_remove_pool(pool
, *p
, ss
);
11487 auto new_pool
= pending_inc
.new_pools
.find(pool
);
11488 if (new_pool
!= pending_inc
.new_pools
.end()) {
11489 // if there is a problem with the pending info, wait and retry
11491 const auto& p
= new_pool
->second
;
11492 int r
= _check_remove_pool(pool
, p
, ss
);
11497 if (pending_inc
.old_pools
.count(pool
)) {
11498 dout(10) << __func__
<< " " << pool
<< " already pending removal"
11503 if (g_conf
->mon_fake_pool_delete
&& !no_fake
) {
11504 string old_name
= osdmap
.get_pool_name(pool
);
11505 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
11506 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
11507 << old_name
<< " -> " << new_name
<< dendl
;
11508 pending_inc
.new_pool_names
[pool
] = new_name
;
11513 pending_inc
.old_pools
.insert(pool
);
11515 // remove any pg_temp mappings for this pool
11516 for (auto p
= osdmap
.pg_temp
->begin();
11517 p
!= osdmap
.pg_temp
->end();
11519 if (p
->first
.pool() == (uint64_t)pool
) {
11520 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
11521 << p
->first
<< dendl
;
11522 pending_inc
.new_pg_temp
[p
->first
].clear();
11525 // remove any primary_temp mappings for this pool
11526 for (auto p
= osdmap
.primary_temp
->begin();
11527 p
!= osdmap
.primary_temp
->end();
11529 if (p
->first
.pool() == (uint64_t)pool
) {
11530 dout(10) << __func__
<< " " << pool
11531 << " removing obsolete primary_temp" << p
->first
<< dendl
;
11532 pending_inc
.new_primary_temp
[p
->first
] = -1;
11535 // remove any pg_upmap mappings for this pool
11536 for (auto& p
: osdmap
.pg_upmap
) {
11537 if (p
.first
.pool() == (uint64_t)pool
) {
11538 dout(10) << __func__
<< " " << pool
11539 << " removing obsolete pg_upmap "
11540 << p
.first
<< dendl
;
11541 pending_inc
.old_pg_upmap
.insert(p
.first
);
11544 // remove any pg_upmap_items mappings for this pool
11545 for (auto& p
: osdmap
.pg_upmap_items
) {
11546 if (p
.first
.pool() == (uint64_t)pool
) {
11547 dout(10) << __func__
<< " " << pool
11548 << " removing obsolete pg_upmap_items " << p
.first
11550 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
11554 // remove any choose_args for this pool
11555 CrushWrapper newcrush
;
11556 _get_pending_crush(newcrush
);
11557 if (newcrush
.have_choose_args(pool
)) {
11558 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
11559 newcrush
.rm_choose_args(pool
);
11560 pending_inc
.crush
.clear();
11561 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
11566 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
11568 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
11569 if (pending_inc
.old_pools
.count(pool
)) {
11570 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
11573 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
11574 p
!= pending_inc
.new_pool_names
.end();
11576 if (p
->second
== newname
&& p
->first
!= pool
) {
11581 pending_inc
.new_pool_names
[pool
] = newname
;
11585 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
11587 op
->mark_osdmon_event(__func__
);
11588 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11590 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
11591 if (ret
== -EAGAIN
) {
11592 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11596 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
11597 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
11598 pending_inc
.epoch
));
11602 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
11603 int ret
, epoch_t epoch
, bufferlist
*blp
)
11605 op
->mark_osdmon_event(__func__
);
11606 MPoolOp
*m
= static_cast<MPoolOp
*>(op
->get_req());
11607 dout(20) << "_pool_op_reply " << ret
<< dendl
;
11608 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
11609 ret
, epoch
, get_last_committed(), blp
);
11610 mon
->send_reply(op
, reply
);