1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
70 #include "common/config.h"
71 #include "common/errno.h"
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
90 #include "json_spirit/json_spirit_reader.h"
92 #include <boost/algorithm/string/predicate.hpp>
99 using std::ostringstream
;
103 using std::stringstream
;
104 using std::to_string
;
107 using ceph::bufferlist
;
110 using ceph::ErasureCodeInterfaceRef
;
111 using ceph::ErasureCodePluginRegistry
;
112 using ceph::ErasureCodeProfile
;
113 using ceph::Formatter
;
114 using ceph::JSONFormatter
;
115 using ceph::make_message
;
#define dout_subsys ceph_subsys_mon

// MonitorDBStore key prefixes owned by this service; they are also
// registered in OSDMonitor::get_store_prefixes() below.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
/*

OSD snapshot metadata
---------------------

-- starting with mimic, removed in octopus --

"removed_epoch_%llu_%08lx" % (pool, epoch)
 -> interval_set<snapid_t>

"removed_snap_%llu_%016llx" % (pool, last_snap)
 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)


-- starting with mimic --

"purged_snap_%llu_%016llx" % (pool, last_snap)
 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)

- note that the {removed,purged}_snap put the last snap in the key so
  that we can use forward iteration only to search for an epoch in an
  interval. e.g., to test if epoch N is removed/purged, we'll find a key
  >= N that either does or doesn't contain the given snap.


-- starting with octopus --

"purged_epoch_%08lx" % epoch
 -> map<int64_t,interval_set<snapid_t>>

*/
153 using namespace TOPNSPC::common
;
156 struct OSDMemCache
: public PriorityCache::PriCache
{
158 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
159 int64_t committed_bytes
= 0;
160 double cache_ratio
= 0;
162 OSDMemCache(OSDMonitor
*m
) : osdmon(m
) {};
164 virtual uint64_t _get_used_bytes() const = 0;
166 virtual int64_t request_cache_bytes(
167 PriorityCache::Priority pri
, uint64_t total_cache
) const {
168 int64_t assigned
= get_cache_bytes(pri
);
171 // All cache items are currently set to have PRI1 priority
172 case PriorityCache::Priority::PRI1
:
174 int64_t request
= _get_used_bytes();
175 return (request
> assigned
) ? request
- assigned
: 0;
183 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
184 return cache_bytes
[pri
];
187 virtual int64_t get_cache_bytes() const {
190 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
191 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
192 total
+= get_cache_bytes(pri
);
197 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
198 cache_bytes
[pri
] = bytes
;
200 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
201 cache_bytes
[pri
] += bytes
;
203 virtual int64_t commit_cache_size(uint64_t total_cache
) {
204 committed_bytes
= PriorityCache::get_chunk(
205 get_cache_bytes(), total_cache
);
206 return committed_bytes
;
208 virtual int64_t get_committed_size() const {
209 return committed_bytes
;
211 virtual double get_cache_ratio() const {
214 virtual void set_cache_ratio(double ratio
) {
217 virtual void shift_bins() {
219 virtual void import_bins(const std::vector
<uint64_t> &bins
) {
221 virtual void set_bins(PriorityCache::Priority pri
, uint64_t end_bin
) {
223 virtual uint64_t get_bins(PriorityCache::Priority pri
) const {
227 virtual string
get_cache_name() const = 0;
230 struct IncCache
: public OSDMemCache
{
231 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
233 virtual uint64_t _get_used_bytes() const {
234 return osdmon
->inc_osd_cache
.get_bytes();
237 virtual string
get_cache_name() const {
238 return "OSDMap Inc Cache";
241 uint64_t _get_num_osdmaps() const {
242 return osdmon
->inc_osd_cache
.get_size();
246 struct FullCache
: public OSDMemCache
{
247 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
249 virtual uint64_t _get_used_bytes() const {
250 return osdmon
->full_osd_cache
.get_bytes();
253 virtual string
get_cache_name() const {
254 return "OSDMap Full Cache";
257 uint64_t _get_num_osdmaps() const {
258 return osdmon
->full_osd_cache
.get_size();
// Cache instances handed to the PriorityCache manager in
// OSDMonitor::register_cache_with_pcm().
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Presumably upper bounds on per-pool application metadata (count of
// applications, keys per application, key/value length) -- TODO confirm
// against the pool-application command handlers.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
269 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
270 // Note: this doesn't include support for the application tag match
271 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
272 auto& match
= grant
.match
;
273 if (match
.is_match_all()) {
275 } else if (pool_name
!= nullptr &&
276 !match
.pool_namespace
.pool_name
.empty() &&
277 match
.pool_namespace
.pool_name
== *pool_name
) {
284 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
285 const KeyServer
& key_server
,
286 const EntityName
& entity_name
,
287 const MonCap
& mon_caps
,
288 const entity_addr_t
& peer_socket_addr
,
289 const std::string
* pool_name
)
291 typedef std::map
<std::string
, std::string
> CommandArgs
;
293 if (mon_caps
.is_capable(
294 cct
, entity_name
, "osd",
295 "osd pool op unmanaged-snap",
296 (pool_name
== nullptr ?
297 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
298 CommandArgs
{{"poolname", *pool_name
}}),
304 AuthCapsInfo caps_info
;
305 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
307 dout(10) << "unable to locate OSD cap data for " << entity_name
308 << " in auth db" << dendl
;
313 if (caps_info
.caps
.length() > 0) {
314 auto p
= caps_info
.caps
.cbegin();
317 } catch (const ceph::buffer::error
&err
) {
318 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
325 if (!osd_cap
.parse(caps_str
, nullptr)) {
326 dout(10) << "unable to parse OSD cap data for " << entity_name
327 << " in auth db" << dendl
;
331 // if the entity has write permissions in one or all pools, permit
332 // usage of unmanaged-snapshots
333 if (osd_cap
.allow_all()) {
337 for (auto& grant
: osd_cap
.grants
) {
338 if (grant
.profile
.is_valid()) {
339 for (auto& profile_grant
: grant
.profile_grants
) {
340 if (is_osd_writable(profile_grant
, pool_name
)) {
344 } else if (is_osd_writable(grant
, pool_name
)) {
352 } // anonymous namespace
354 void LastEpochClean::Lec::report(unsigned pg_num
, ps_t ps
,
355 epoch_t last_epoch_clean
)
361 epoch_by_pg
.resize(pg_num
, 0);
362 const auto old_lec
= epoch_by_pg
[ps
];
363 if (old_lec
>= last_epoch_clean
) {
367 epoch_by_pg
[ps
] = last_epoch_clean
;
368 if (last_epoch_clean
< floor
) {
369 floor
= last_epoch_clean
;
370 } else if (last_epoch_clean
> floor
) {
371 if (old_lec
== floor
) {
372 // probably should increase floor?
373 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
374 std::end(epoch_by_pg
));
378 if (ps
!= next_missing
) {
381 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
382 if (epoch_by_pg
[next_missing
] == 0) {
388 void LastEpochClean::remove_pool(uint64_t pool
)
390 report_by_pool
.erase(pool
);
393 void LastEpochClean::report(unsigned pg_num
, const pg_t
& pg
,
394 epoch_t last_epoch_clean
)
396 auto& lec
= report_by_pool
[pg
.pool()];
397 return lec
.report(pg_num
, pg
.ps(), last_epoch_clean
);
400 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
402 auto floor
= latest
.get_epoch();
403 for (auto& pool
: latest
.get_pools()) {
404 auto reported
= report_by_pool
.find(pool
.first
);
405 if (reported
== report_by_pool
.end()) {
408 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
411 if (reported
->second
.floor
< floor
) {
412 floor
= reported
->second
.floor
;
418 void LastEpochClean::dump(Formatter
*f
) const
420 f
->open_array_section("per_pool");
422 for (auto& [pool
, lec
] : report_by_pool
) {
423 f
->open_object_section("pool");
424 f
->dump_unsigned("poolid", pool
);
425 f
->dump_unsigned("floor", lec
.floor
);
432 class C_UpdateCreatingPGs
: public Context
{
437 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
438 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
439 void finish(int r
) override
{
441 utime_t end
= ceph_clock_now();
442 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
443 << (end
- start
) << " seconds" << dendl
;
444 osdmon
->update_creating_pgs();
445 osdmon
->check_pg_creates_subs();
451 #define dout_prefix _prefix(_dout, mon, osdmap)
452 static ostream
& _prefix(std::ostream
*_dout
, Monitor
&mon
, const OSDMap
& osdmap
) {
453 return *_dout
<< "mon." << mon
.name
<< "@" << mon
.rank
454 << "(" << mon
.get_state_name()
455 << ").osd e" << osdmap
.get_epoch() << " ";
458 OSDMonitor::OSDMonitor(
462 const string
& service_name
)
463 : PaxosService(mn
, p
, service_name
),
465 inc_osd_cache(g_conf()->mon_osd_cache_size
),
466 full_osd_cache(g_conf()->mon_osd_cache_size
),
467 has_osdmap_manifest(false),
468 mapper(mn
.cct
, &mn
.cpu_tp
)
470 inc_cache
= std::make_shared
<IncCache
>(this);
471 full_cache
= std::make_shared
<FullCache
>(this);
472 cct
->_conf
.add_observer(this);
473 int r
= _set_cache_sizes();
475 derr
<< __func__
<< " using default osd cache size - mon_osd_cache_size ("
476 << g_conf()->mon_osd_cache_size
477 << ") without priority cache management"
482 const char **OSDMonitor::get_tracked_conf_keys() const
484 static const char* KEYS
[] = {
486 "mon_memory_autotune",
487 "rocksdb_cache_size",
493 void OSDMonitor::handle_conf_change(const ConfigProxy
& conf
,
494 const std::set
<std::string
> &changed
)
496 dout(10) << __func__
<< " " << changed
<< dendl
;
498 if (changed
.count("mon_memory_autotune")) {
499 _set_cache_autotuning();
501 if (changed
.count("mon_memory_target") ||
502 changed
.count("rocksdb_cache_size")) {
503 int r
= _update_mon_cache_settings();
505 derr
<< __func__
<< " mon_memory_target:"
506 << g_conf()->mon_memory_target
507 << " rocksdb_cache_size:"
508 << g_conf()->rocksdb_cache_size
509 << ". Unable to update cache size."
515 void OSDMonitor::_set_cache_autotuning()
517 if (!g_conf()->mon_memory_autotune
&& pcm
!= nullptr) {
518 // Disable cache autotuning
519 std::lock_guard
l(balancer_lock
);
523 if (g_conf()->mon_memory_autotune
&& pcm
== nullptr) {
524 int r
= register_cache_with_pcm();
527 << " Error while registering osdmon caches with pcm."
528 << " Cache auto tuning not enabled."
530 mon_memory_autotune
= false;
532 mon_memory_autotune
= true;
537 int OSDMonitor::_update_mon_cache_settings()
539 if (g_conf()->mon_memory_target
<= 0 ||
540 g_conf()->mon_memory_target
< mon_memory_min
||
541 g_conf()->rocksdb_cache_size
<= 0) {
545 if (pcm
== nullptr && rocksdb_binned_kv_cache
== nullptr) {
546 derr
<< __func__
<< " not using pcm and rocksdb" << dendl
;
550 uint64_t old_mon_memory_target
= mon_memory_target
;
551 uint64_t old_rocksdb_cache_size
= rocksdb_cache_size
;
553 // Set the new pcm memory cache sizes
554 mon_memory_target
= g_conf()->mon_memory_target
;
555 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
557 uint64_t base
= mon_memory_base
;
558 double fragmentation
= mon_memory_fragmentation
;
559 uint64_t target
= mon_memory_target
;
560 uint64_t min
= mon_memory_min
;
563 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
564 if (ltarget
> base
+ min
) {
565 max
= ltarget
- base
;
568 int r
= _set_cache_ratios();
570 derr
<< __func__
<< " Cache ratios for pcm could not be set."
571 << " Review the kv (rocksdb) and mon_memory_target sizes."
573 mon_memory_target
= old_mon_memory_target
;
574 rocksdb_cache_size
= old_rocksdb_cache_size
;
578 if (mon_memory_autotune
&& pcm
!= nullptr) {
579 std::lock_guard
l(balancer_lock
);
580 // set pcm cache levels
581 pcm
->set_target_memory(target
);
582 pcm
->set_min_memory(min
);
583 pcm
->set_max_memory(max
);
584 // tune memory based on new values
587 _set_new_cache_sizes();
588 dout(1) << __func__
<< " Updated mon cache setting."
589 << " target: " << target
597 int OSDMonitor::_set_cache_sizes()
599 if (g_conf()->mon_memory_autotune
) {
600 // set the new osdmon cache targets to be managed by pcm
601 mon_osd_cache_size
= g_conf()->mon_osd_cache_size
;
602 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
603 mon_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
604 mon_memory_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
605 mon_memory_target
= g_conf()->mon_memory_target
;
606 mon_memory_min
= g_conf()->mon_osd_cache_size_min
;
607 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
608 derr
<< __func__
<< " mon_memory_target:" << mon_memory_target
609 << " mon_memory_min:" << mon_memory_min
610 << ". Invalid size option(s) provided."
614 // Set the initial inc and full LRU cache sizes
615 inc_osd_cache
.set_bytes(mon_memory_min
);
616 full_osd_cache
.set_bytes(mon_memory_min
);
617 mon_memory_autotune
= g_conf()->mon_memory_autotune
;
622 bool OSDMonitor::_have_pending_crush()
624 return pending_inc
.crush
.length() > 0;
627 CrushWrapper
&OSDMonitor::_get_stable_crush()
629 return *osdmap
.crush
;
632 CrushWrapper
OSDMonitor::_get_pending_crush()
635 if (pending_inc
.crush
.length())
636 bl
= pending_inc
.crush
;
638 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
640 auto p
= bl
.cbegin();
646 void OSDMonitor::create_initial()
648 dout(10) << "create_initial for " << mon
.monmap
->fsid
<< dendl
;
653 mon
.store
->get("mkfs", "osdmap", bl
);
657 newmap
.set_fsid(mon
.monmap
->fsid
);
659 newmap
.build_simple(cct
, 0, mon
.monmap
->fsid
, 0);
662 newmap
.created
= newmap
.modified
= ceph_clock_now();
664 // new clusters should sort bitwise by default.
665 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
668 CEPH_OSDMAP_RECOVERY_DELETES
|
669 CEPH_OSDMAP_PURGED_SNAPDIRS
|
670 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
671 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
672 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
673 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
674 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
675 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
676 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
678 // new cluster should require latest by default
679 if (g_conf().get_val
<bool>("mon_debug_no_require_quincy")) {
680 if (g_conf().get_val
<bool>("mon_debug_no_require_pacific")) {
681 derr
<< __func__
<< " mon_debug_no_require_quincy and pacific=true" << dendl
;
682 newmap
.require_osd_release
= ceph_release_t::nautilus
;
684 derr
<< __func__
<< " mon_debug_no_require_quincy=true" << dendl
;
685 newmap
.require_osd_release
= ceph_release_t::pacific
;
688 newmap
.require_osd_release
= ceph_release_t::quincy
;
691 ceph_release_t r
= ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client
);
693 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
695 newmap
.require_min_compat_client
= r
;
697 // encode into pending incremental
698 uint64_t features
= newmap
.get_encoding_features();
699 newmap
.encode(pending_inc
.fullmap
,
700 features
| CEPH_FEATURE_RESERVED
);
701 pending_inc
.full_crc
= newmap
.get_crc();
702 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
705 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
707 s
.insert(service_name
);
708 s
.insert(OSD_PG_CREATING_PREFIX
);
709 s
.insert(OSD_METADATA_PREFIX
);
710 s
.insert(OSD_SNAP_PREFIX
);
713 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
715 // we really don't care if the version has been updated, because we may
716 // have trimmed without having increased the last committed; yet, we may
717 // need to update the in-memory manifest.
718 load_osdmap_manifest();
720 version_t version
= get_last_committed();
721 if (version
== osdmap
.epoch
)
723 ceph_assert(version
> osdmap
.epoch
);
725 dout(15) << "update_from_paxos paxos e " << version
726 << ", my e " << osdmap
.epoch
<< dendl
;
728 int prev_num_up_osd
= osdmap
.num_up_osd
;
731 if (!mapping_job
->is_done()) {
732 dout(1) << __func__
<< " mapping job "
733 << mapping_job
.get() << " did not complete, "
734 << mapping_job
->shards
<< " left, canceling" << dendl
;
735 mapping_job
->abort();
743 * We will possibly have a stashed latest that *we* wrote, and we will
744 * always be sure to have the oldest full map in the first..last range
745 * due to encode_trim_extra(), which includes the oldest full map in the trim
748 * encode_trim_extra() does not however write the full map's
749 * version to 'full_latest'. This is only done when we are building the
750 * full maps from the incremental versions. But don't panic! We make sure
751 * that the following conditions find whichever full map version is newer.
753 version_t latest_full
= get_version_latest_full();
754 if (latest_full
== 0 && get_first_committed() > 1)
755 latest_full
= get_first_committed();
757 if (get_first_committed() > 1 &&
758 latest_full
< get_first_committed()) {
759 // the monitor could be just sync'ed with its peer, and the latest_full key
760 // is not encoded in the paxos commits in encode_pending(), so we need to
761 // make sure we get it pointing to a proper version.
762 version_t lc
= get_last_committed();
763 version_t fc
= get_first_committed();
765 dout(10) << __func__
<< " looking for valid full map in interval"
766 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
769 for (version_t v
= lc
; v
>= fc
; v
--) {
770 string full_key
= "full_" + stringify(v
);
771 if (mon
.store
->exists(get_service_name(), full_key
)) {
772 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
778 ceph_assert(latest_full
> 0);
779 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
780 put_version_latest_full(t
, latest_full
);
781 mon
.store
->apply_transaction(t
);
782 dout(10) << __func__
<< " updated the on-disk full map version to "
783 << latest_full
<< dendl
;
786 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
787 bufferlist latest_bl
;
788 get_version_full(latest_full
, latest_bl
);
789 ceph_assert(latest_bl
.length() != 0);
790 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
792 osdmap
.decode(latest_bl
);
796 if (!mon
.store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
797 auto p
= bl
.cbegin();
798 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
799 creating_pgs
.decode(p
);
800 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
801 << creating_pgs
.last_scan_epoch
802 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
804 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
808 // walk through incrementals
809 MonitorDBStore::TransactionRef t
;
811 while (version
> osdmap
.epoch
) {
813 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
814 ceph_assert(err
== 0);
815 ceph_assert(inc_bl
.length());
816 // set priority cache manager levels if the osdmap is
817 // being populated for the first time.
818 if (mon_memory_autotune
&& pcm
== nullptr) {
819 int r
= register_cache_with_pcm();
822 << " Error while registering osdmon caches with pcm."
823 << " Proceeding without cache auto tuning."
828 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
830 OSDMap::Incremental
inc(inc_bl
);
831 err
= osdmap
.apply_incremental(inc
);
832 ceph_assert(err
== 0);
835 t
.reset(new MonitorDBStore::Transaction
);
837 // Write out the full map for all past epochs. Encode the full
838 // map with the same features as the incremental. If we don't
839 // know, use the quorum features. If we don't know those either,
840 // encode with all features.
841 uint64_t f
= inc
.encode_features
;
843 f
= mon
.get_quorum_con_features();
847 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
848 tx_size
+= full_bl
.length();
850 bufferlist orig_full_bl
;
851 get_version_full(osdmap
.epoch
, orig_full_bl
);
852 if (orig_full_bl
.length()) {
853 // the primary provided the full map
854 ceph_assert(inc
.have_crc
);
855 if (inc
.full_crc
!= osdmap
.crc
) {
856 // This will happen if the mons were running mixed versions in
857 // the past or some other circumstance made the full encoded
858 // maps divergent. Reloading here will bring us back into
859 // sync with the primary for this and all future maps. OSDs
860 // will also be brought back into sync when they discover the
861 // crc mismatch and request a full map from a mon.
862 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
865 dout(20) << __func__
<< " my (bad) full osdmap:\n";
866 JSONFormatter
jf(true);
867 jf
.dump_object("osdmap", osdmap
);
869 *_dout
<< "\nhexdump:\n";
870 full_bl
.hexdump(*_dout
);
874 osdmap
.decode(orig_full_bl
);
876 dout(20) << __func__
<< " canonical full osdmap:\n";
877 JSONFormatter
jf(true);
878 jf
.dump_object("osdmap", osdmap
);
880 *_dout
<< "\nhexdump:\n";
881 orig_full_bl
.hexdump(*_dout
);
885 ceph_assert(!inc
.have_crc
);
886 put_version_full(t
, osdmap
.epoch
, full_bl
);
888 put_version_latest_full(t
, osdmap
.epoch
);
891 dout(1) << osdmap
<< dendl
;
893 if (osdmap
.epoch
== 1) {
894 t
->erase("mkfs", "osdmap");
897 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
898 mon
.store
->apply_transaction(t
);
899 t
= MonitorDBStore::TransactionRef();
902 for (const auto [osd
, state
] : inc
.new_state
) {
903 if (state
& CEPH_OSD_UP
) {
904 // could be marked up *or* down, but we're too lazy to check which
905 last_osd_report
.erase(osd
);
908 for (const auto [osd
, weight
] : inc
.new_weight
) {
909 if (weight
== CEPH_OSD_OUT
) {
910 // manually marked out, so drop it
911 osd_epochs
.erase(osd
);
917 mon
.store
->apply_transaction(t
);
920 bool marked_osd_down
= false;
921 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
922 if (osdmap
.is_out(o
))
924 auto found
= down_pending_out
.find(o
);
925 if (osdmap
.is_down(o
)) {
926 // populate down -> out map
927 if (found
== down_pending_out
.end()) {
928 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
929 down_pending_out
[o
] = ceph_clock_now();
930 marked_osd_down
= true;
933 if (found
!= down_pending_out
.end()) {
934 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
935 down_pending_out
.erase(found
);
939 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
942 check_pg_creates_subs();
944 share_map_with_random_osd();
948 // make sure our feature bits reflect the latest map
949 update_msgr_features();
951 if (!mon
.is_leader()) {
952 // will be called by on_active() on the leader, avoid doing so twice
955 if (osdmap
.stretch_mode_enabled
) {
956 dout(20) << "Stretch mode enabled in this map" << dendl
;
957 mon
.try_engage_stretch_mode();
958 if (osdmap
.degraded_stretch_mode
) {
959 dout(20) << "Degraded stretch mode set in this map" << dendl
;
960 if (!osdmap
.recovering_stretch_mode
) {
961 mon
.set_degraded_stretch_mode();
962 if (prev_num_up_osd
< osdmap
.num_up_osd
&&
963 (osdmap
.num_up_osd
/ (double)osdmap
.num_osd
) >
964 cct
->_conf
.get_val
<double>("mon_stretch_cluster_recovery_ratio")) {
965 // TODO: This works for 2-site clusters when the OSD maps are appropriately
966 // trimmed and everything is "normal" but not if you have a lot of out OSDs
967 // you're ignoring or in some really degenerate failure cases
968 dout(10) << "Enabling recovery stretch mode in this map" << dendl
;
969 mon
.go_recovery_stretch_mode();
972 mon
.set_recovery_stretch_mode();
975 mon
.set_healthy_stretch_mode();
977 if (marked_osd_down
&&
978 (!osdmap
.degraded_stretch_mode
|| osdmap
.recovering_stretch_mode
)) {
979 dout(20) << "Checking degraded stretch mode due to osd changes" << dendl
;
980 mon
.maybe_go_degraded_stretch_mode();
985 int OSDMonitor::register_cache_with_pcm()
987 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
988 derr
<< __func__
<< " Invalid memory size specified for mon caches."
989 << " Caches will not be auto-tuned."
993 uint64_t base
= mon_memory_base
;
994 double fragmentation
= mon_memory_fragmentation
;
995 // For calculating total target memory, consider rocksdb cache size.
996 uint64_t target
= mon_memory_target
;
997 uint64_t min
= mon_memory_min
;
1000 // Apply the same logic as in bluestore to set the max amount
1001 // of memory to use for cache. Assume base memory for OSDMaps
1002 // and then add in some overhead for fragmentation.
1003 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
1004 if (ltarget
> base
+ min
) {
1005 max
= ltarget
- base
;
1008 rocksdb_binned_kv_cache
= mon
.store
->get_priority_cache();
1009 if (!rocksdb_binned_kv_cache
) {
1010 derr
<< __func__
<< " not using rocksdb" << dendl
;
1014 int r
= _set_cache_ratios();
1016 derr
<< __func__
<< " Cache ratios for pcm could not be set."
1017 << " Review the kv (rocksdb) and mon_memory_target sizes."
1022 pcm
= std::make_shared
<PriorityCache::Manager
>(
1023 cct
, min
, max
, target
, true);
1024 pcm
->insert("kv", rocksdb_binned_kv_cache
, true);
1025 pcm
->insert("inc", inc_cache
, true);
1026 pcm
->insert("full", full_cache
, true);
1027 dout(1) << __func__
<< " pcm target: " << target
1028 << " pcm max: " << max
1029 << " pcm min: " << min
1030 << " inc_osd_cache size: " << inc_osd_cache
.get_size()
1035 int OSDMonitor::_set_cache_ratios()
1037 double old_cache_kv_ratio
= cache_kv_ratio
;
1039 // Set the cache ratios for kv(rocksdb), inc and full caches
1040 cache_kv_ratio
= (double)rocksdb_cache_size
/ (double)mon_memory_target
;
1041 if (cache_kv_ratio
>= 1.0) {
1042 derr
<< __func__
<< " Cache kv ratio (" << cache_kv_ratio
1043 << ") must be in range [0,<1.0]."
1045 cache_kv_ratio
= old_cache_kv_ratio
;
1048 rocksdb_binned_kv_cache
->set_cache_ratio(cache_kv_ratio
);
1049 cache_inc_ratio
= cache_full_ratio
= (1.0 - cache_kv_ratio
) / 2;
1050 inc_cache
->set_cache_ratio(cache_inc_ratio
);
1051 full_cache
->set_cache_ratio(cache_full_ratio
);
1053 dout(1) << __func__
<< " kv ratio " << cache_kv_ratio
1054 << " inc ratio " << cache_inc_ratio
1055 << " full ratio " << cache_full_ratio
1060 void OSDMonitor::start_mapping()
1062 // initiate mapping job
1064 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1066 mapping_job
->abort();
1068 if (!osdmap
.get_pools().empty()) {
1069 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
1070 mapping_job
= mapping
.start_update(osdmap
, mapper
,
1071 g_conf()->mon_osd_mapping_pgs_per_chunk
);
1072 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
1073 << " at " << fin
->start
<< dendl
;
1074 mapping_job
->set_finish_event(fin
);
1076 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
1077 mapping_job
= nullptr;
1081 void OSDMonitor::update_msgr_features()
1083 const int types
[] = {
1084 entity_name_t::TYPE_OSD
,
1085 entity_name_t::TYPE_CLIENT
,
1086 entity_name_t::TYPE_MDS
,
1087 entity_name_t::TYPE_MON
1089 for (int type
: types
) {
1091 uint64_t features
= osdmap
.get_features(type
, &mask
);
1092 if ((mon
.messenger
->get_policy(type
).features_required
& mask
) != features
) {
1093 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1094 ceph::net::Policy p
= mon
.messenger
->get_policy(type
);
1095 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1096 mon
.messenger
->set_policy(type
, p
);
1101 void OSDMonitor::on_active()
1105 if (mon
.is_leader()) {
1106 mon
.clog
->debug() << "osdmap " << osdmap
;
1107 if (!priority_convert
) {
1108 // Only do this once at start-up
1109 convert_pool_priorities();
1110 priority_convert
= true;
1113 list
<MonOpRequestRef
> ls
;
1114 take_all_failures(ls
);
1115 while (!ls
.empty()) {
1116 MonOpRequestRef op
= ls
.front();
1117 op
->mark_osdmon_event(__func__
);
1125 void OSDMonitor::on_restart()
1127 last_osd_report
.clear();
1130 void OSDMonitor::on_shutdown()
1132 dout(10) << __func__
<< dendl
;
1134 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1136 mapping_job
->abort();
1139 // discard failure info, waiters
1140 list
<MonOpRequestRef
> ls
;
1141 take_all_failures(ls
);
1145 void OSDMonitor::update_logger()
1147 dout(10) << "update_logger" << dendl
;
1149 mon
.cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1150 mon
.cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1151 mon
.cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1152 mon
.cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
1155 void OSDMonitor::create_pending()
1157 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
1158 pending_inc
.fsid
= mon
.monmap
->fsid
;
1159 pending_metadata
.clear();
1160 pending_metadata_rm
.clear();
1161 pending_pseudo_purged_snaps
.clear();
1163 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
1165 // safety checks (this shouldn't really happen)
1167 if (osdmap
.backfillfull_ratio
<= 0) {
1168 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
1169 if (pending_inc
.new_backfillfull_ratio
> 1.0)
1170 pending_inc
.new_backfillfull_ratio
/= 100;
1171 dout(1) << __func__
<< " setting backfillfull_ratio = "
1172 << pending_inc
.new_backfillfull_ratio
<< dendl
;
1174 if (osdmap
.full_ratio
<= 0) {
1175 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
1176 if (pending_inc
.new_full_ratio
> 1.0)
1177 pending_inc
.new_full_ratio
/= 100;
1178 dout(1) << __func__
<< " setting full_ratio = "
1179 << pending_inc
.new_full_ratio
<< dendl
;
1181 if (osdmap
.nearfull_ratio
<= 0) {
1182 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
1183 if (pending_inc
.new_nearfull_ratio
> 1.0)
1184 pending_inc
.new_nearfull_ratio
/= 100;
1185 dout(1) << __func__
<< " setting nearfull_ratio = "
1186 << pending_inc
.new_nearfull_ratio
<< dendl
;
1192 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1193 const OSDMap
& nextmap
)
1195 dout(10) << __func__
<< dendl
;
1196 creating_pgs_t pending_creatings
;
1198 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1199 pending_creatings
= creating_pgs
;
1201 // check for new or old pools
1202 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1203 unsigned queued
= 0;
1204 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1207 &pending_creatings
);
1208 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1211 &pending_creatings
);
1212 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1213 for (auto deleted_pool
: inc
.old_pools
) {
1214 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1215 dout(10) << __func__
<< " " << removed
1216 << " pg removed because containing pool deleted: "
1217 << deleted_pool
<< dendl
;
1218 last_epoch_clean
.remove_pool(deleted_pool
);
1220 // pgmon updates its creating_pgs in check_osd_map() which is called by
1221 // on_active() and check_osd_map() could be delayed if lease expires, so its
1222 // creating_pgs could be stale in comparison with the one of osdmon. let's
1223 // trim them here. otherwise, they will be added back after being erased.
1224 unsigned removed
= 0;
1225 for (auto& pg
: pending_created_pgs
) {
1226 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1227 pending_creatings
.created_pools
.insert(pg
.pool());
1228 removed
+= pending_creatings
.pgs
.erase(pg
);
1230 pending_created_pgs
.clear();
1231 dout(10) << __func__
<< " " << removed
1232 << " pgs removed because they're created" << dendl
;
1233 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1236 // filter out any pgs that shouldn't exist.
1238 auto i
= pending_creatings
.pgs
.begin();
1239 while (i
!= pending_creatings
.pgs
.end()) {
1240 if (!nextmap
.pg_exists(i
->first
)) {
1241 dout(10) << __func__
<< " removing pg " << i
->first
1242 << " which should not exist" << dendl
;
1243 i
= pending_creatings
.pgs
.erase(i
);
1251 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1252 const auto total
= pending_creatings
.pgs
.size();
1253 while (pending_creatings
.pgs
.size() < max
&&
1254 !pending_creatings
.queue
.empty()) {
1255 auto p
= pending_creatings
.queue
.begin();
1256 int64_t poolid
= p
->first
;
1257 dout(10) << __func__
<< " pool " << poolid
1258 << " created " << p
->second
.created
1259 << " modified " << p
->second
.modified
1260 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1262 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1263 p
->second
.end
- p
->second
.start
);
1264 ps_t first
= p
->second
.start
;
1265 ps_t end
= first
+ n
;
1266 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1267 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1268 // NOTE: use the *current* epoch as the PG creation epoch so that the
1269 // OSD does not have to generate a long set of PastIntervals.
1270 pending_creatings
.pgs
.emplace(
1272 creating_pgs_t::pg_create_info(inc
.epoch
,
1273 p
->second
.modified
));
1274 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1276 p
->second
.start
= end
;
1277 if (p
->second
.done()) {
1278 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1279 pending_creatings
.queue
.erase(p
);
1281 dout(10) << __func__
<< " pool " << poolid
1282 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1286 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1287 << " pools" << dendl
;
1289 if (mon
.monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1290 // walk creating pgs' history and past_intervals forward
1291 for (auto& i
: pending_creatings
.pgs
) {
1292 // this mirrors PG::start_peering_interval()
1293 pg_t pgid
= i
.first
;
1295 // this is a bit imprecise, but sufficient?
1296 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1297 const pg_pool_t
*pi
;
1298 bool operator()(const set
<pg_shard_t
> &have
) const {
1299 return have
.size() >= pi
->min_size
;
1301 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1302 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1304 vector
<int> up
, acting
;
1305 int up_primary
, acting_primary
;
1306 nextmap
.pg_to_up_acting_osds(
1307 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1308 if (i
.second
.history
.epoch_created
== 0) {
1309 // new pg entry, set it up
1311 i
.second
.acting
= acting
;
1312 i
.second
.up_primary
= up_primary
;
1313 i
.second
.acting_primary
= acting_primary
;
1314 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1315 i
.second
.create_stamp
);
1316 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1317 << " up " << i
.second
.up
1318 << " p " << i
.second
.up_primary
1319 << " acting " << i
.second
.acting
1320 << " p " << i
.second
.acting_primary
1321 << " history " << i
.second
.history
1322 << " past_intervals " << i
.second
.past_intervals
1325 std::stringstream debug
;
1326 if (PastIntervals::check_new_interval(
1327 i
.second
.acting_primary
, acting_primary
,
1328 i
.second
.acting
, acting
,
1329 i
.second
.up_primary
, up_primary
,
1331 i
.second
.history
.same_interval_since
,
1332 i
.second
.history
.last_epoch_clean
,
1337 &i
.second
.past_intervals
,
1339 epoch_t e
= inc
.epoch
;
1340 i
.second
.history
.same_interval_since
= e
;
1341 if (i
.second
.up
!= up
) {
1342 i
.second
.history
.same_up_since
= e
;
1344 if (i
.second
.acting_primary
!= acting_primary
) {
1345 i
.second
.history
.same_primary_since
= e
;
1348 osdmap
.get_pg_num(pgid
.pool()),
1349 nextmap
.get_pg_num(pgid
.pool()),
1351 i
.second
.history
.last_epoch_split
= e
;
1353 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1354 << " up " << i
.second
.up
<< " -> " << up
1355 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1356 << " acting " << i
.second
.acting
<< " -> " << acting
1357 << " p " << i
.second
.acting_primary
<< " -> "
1359 << " history " << i
.second
.history
1360 << " past_intervals " << i
.second
.past_intervals
1362 dout(20) << " debug: " << debug
.str() << dendl
;
1364 i
.second
.acting
= acting
;
1365 i
.second
.up_primary
= up_primary
;
1366 i
.second
.acting_primary
= acting_primary
;
1371 dout(10) << __func__
1372 << " " << (pending_creatings
.pgs
.size() - total
)
1373 << "/" << pending_creatings
.pgs
.size()
1374 << " pgs added from queued pools" << dendl
;
1375 return pending_creatings
;
1378 void OSDMonitor::maybe_prime_pg_temp()
1381 if (pending_inc
.crush
.length()) {
1382 dout(10) << __func__
<< " new crush map, all" << dendl
;
1386 if (!pending_inc
.new_up_client
.empty()) {
1387 dout(10) << __func__
<< " new up osds, all" << dendl
;
1391 // check for interesting OSDs
1393 for (auto p
= pending_inc
.new_state
.begin();
1394 !all
&& p
!= pending_inc
.new_state
.end();
1396 if ((p
->second
& CEPH_OSD_UP
) &&
1397 osdmap
.is_up(p
->first
)) {
1398 osds
.insert(p
->first
);
1401 for (auto p
= pending_inc
.new_weight
.begin();
1402 !all
&& p
!= pending_inc
.new_weight
.end();
1404 if (osdmap
.exists(p
->first
) && p
->second
< osdmap
.get_weight(p
->first
)) {
1406 osds
.insert(p
->first
);
1408 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1414 if (!all
&& osds
.empty())
1419 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1420 if (estimate
> mapping
.get_num_pgs() *
1421 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1422 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1423 << osds
.size() << " osds >= "
1424 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1425 << mapping
.get_num_pgs() << " pgs, all"
1429 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1430 << osds
.size() << " osds" << dendl
;
1435 next
.deepish_copy_from(osdmap
);
1436 next
.apply_incremental(pending_inc
);
1438 if (next
.get_pools().empty()) {
1439 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1441 PrimeTempJob
job(next
, this);
1442 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1443 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1444 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1446 dout(10) << __func__
<< " did not finish in "
1447 << g_conf()->mon_osd_prime_pg_temp_max_time
1448 << ", stopping" << dendl
;
1452 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1453 utime_t stop
= ceph_clock_now();
1454 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1455 const int chunk
= 1000;
1457 std::unordered_set
<pg_t
> did_pgs
;
1458 for (auto osd
: osds
) {
1459 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1460 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1461 for (auto pgid
: pgs
) {
1462 if (!did_pgs
.insert(pgid
).second
) {
1465 prime_pg_temp(next
, pgid
);
1468 if (ceph_clock_now() > stop
) {
1469 dout(10) << __func__
<< " consumed more than "
1470 << g_conf()->mon_osd_prime_pg_temp_max_time
1471 << " seconds, stopping"
1481 void OSDMonitor::prime_pg_temp(
1485 // TODO: remove this creating_pgs direct access?
1486 if (creating_pgs
.pgs
.count(pgid
)) {
1489 if (!osdmap
.pg_exists(pgid
)) {
1493 vector
<int> up
, acting
;
1494 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1496 vector
<int> next_up
, next_acting
;
1497 int next_up_primary
, next_acting_primary
;
1498 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1499 &next_acting
, &next_acting_primary
);
1500 if (acting
== next_acting
&&
1501 !(up
!= acting
&& next_up
== next_acting
))
1502 return; // no change since last epoch
1505 return; // if previously empty now we can be no worse off
1506 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1507 if (pool
&& acting
.size() < pool
->min_size
)
1508 return; // can be no worse off than before
1510 if (next_up
== next_acting
) {
1512 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1516 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1517 << " -> " << next_up
<< "/" << next_acting
1518 << ", priming " << acting
1521 std::lock_guard
l(prime_pg_temp_lock
);
1522 // do not touch a mapping if a change is pending
1523 pending_inc
.new_pg_temp
.emplace(
1525 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1530 * @note receiving a transaction in this function gives a fair amount of
1531 * freedom to the service implementation if it does need it. It shouldn't.
1533 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1535 dout(10) << "encode_pending e " << pending_inc
.epoch
1539 dout(1) << __func__
<< " osdmap full prune encoded e"
1540 << pending_inc
.epoch
<< dendl
;
1543 // finalize up pending_inc
1544 pending_inc
.modified
= ceph_clock_now();
1546 int r
= pending_inc
.propagate_base_properties_to_tiers(cct
, osdmap
);
1547 ceph_assert(r
== 0);
1550 if (!mapping_job
->is_done()) {
1551 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1552 << mapping_job
.get() << " did not complete, "
1553 << mapping_job
->shards
<< " left" << dendl
;
1554 mapping_job
->abort();
1555 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1556 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1557 << mapping_job
.get() << " is prior epoch "
1558 << mapping
.get_epoch() << dendl
;
1560 if (g_conf()->mon_osd_prime_pg_temp
) {
1561 maybe_prime_pg_temp();
1564 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1565 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1568 mapping_job
.reset();
1570 // ensure we don't have blank new_state updates. these are interrpeted as
1571 // CEPH_OSD_UP (and almost certainly not what we want!).
1572 auto p
= pending_inc
.new_state
.begin();
1573 while (p
!= pending_inc
.new_state
.end()) {
1574 if (p
->second
== 0) {
1575 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1576 p
= pending_inc
.new_state
.erase(p
);
1578 if (p
->second
& CEPH_OSD_UP
) {
1579 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1584 if (!pending_inc
.new_up_client
.empty()) {
1585 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1587 for (auto& i
: pending_inc
.new_weight
) {
1588 if (i
.first
>= osdmap
.max_osd
) {
1590 // new osd is already marked in
1591 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1594 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1595 // existing osd marked in or out
1596 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1603 tmp
.deepish_copy_from(osdmap
);
1604 tmp
.apply_incremental(pending_inc
);
1606 // clean pg_temp mappings
1607 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1609 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1611 // check every upmapped pg for now
1612 // until we could reliably identify certain cases to ignore,
1613 // which is obviously the hard part TBD..
1614 vector
<pg_t
> pgs_to_check
;
1615 tmp
.get_upmap_pgs(&pgs_to_check
);
1616 if (pgs_to_check
.size() <
1617 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1618 // not enough pgs, do it inline
1619 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1621 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1622 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1627 // update creating pgs first so that we can remove the created pgid and
1628 // process the pool flag removal below in the same osdmap epoch.
1629 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1630 bufferlist creatings_bl
;
1631 uint64_t features
= CEPH_FEATURES_ALL
;
1632 if (mon
.monmap
->min_mon_release
< ceph_release_t::octopus
) {
1633 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1635 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1637 encode(pending_creatings
, creatings_bl
, features
);
1638 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1640 // remove any old (or incompat) POOL_CREATING flags
1641 for (auto& i
: tmp
.get_pools()) {
1642 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1643 // pre-nautilus OSDMaps shouldn't get this flag.
1644 if (pending_inc
.new_pools
.count(i
.first
)) {
1645 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1648 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1649 !pending_creatings
.still_creating_pool(i
.first
)) {
1650 dout(10) << __func__
<< " done creating pool " << i
.first
1651 << ", clearing CREATING flag" << dendl
;
1652 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1653 pending_inc
.new_pools
[i
.first
] = i
.second
;
1655 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1659 // collect which pools are currently affected by
1660 // the near/backfill/full osd(s),
1661 // and set per-pool near/backfill/full flag instead
1662 set
<int64_t> full_pool_ids
;
1663 set
<int64_t> backfillfull_pool_ids
;
1664 set
<int64_t> nearfull_pool_ids
;
1665 tmp
.get_full_pools(cct
,
1667 &backfillfull_pool_ids
,
1668 &nearfull_pool_ids
);
1669 if (full_pool_ids
.empty() ||
1670 backfillfull_pool_ids
.empty() ||
1671 nearfull_pool_ids
.empty()) {
1672 // normal case - no nearfull, backfillfull or full osds
1673 // try cancel any improper nearfull/backfillfull/full pool
1675 for (auto &pool
: tmp
.get_pools()) {
1676 auto p
= pool
.first
;
1677 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1678 nearfull_pool_ids
.empty()) {
1679 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1680 << "'s nearfull flag" << dendl
;
1681 if (pending_inc
.new_pools
.count(p
) == 0) {
1682 // load original pool info first!
1683 pending_inc
.new_pools
[p
] = pool
.second
;
1685 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1687 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1688 backfillfull_pool_ids
.empty()) {
1689 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1690 << "'s backfillfull flag" << dendl
;
1691 if (pending_inc
.new_pools
.count(p
) == 0) {
1692 pending_inc
.new_pools
[p
] = pool
.second
;
1694 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1696 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1697 full_pool_ids
.empty()) {
1698 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1699 // set by EQUOTA, skipping
1702 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1703 << "'s full flag" << dendl
;
1704 if (pending_inc
.new_pools
.count(p
) == 0) {
1705 pending_inc
.new_pools
[p
] = pool
.second
;
1707 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1711 if (!full_pool_ids
.empty()) {
1712 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1713 << " as full" << dendl
;
1714 for (auto &p
: full_pool_ids
) {
1715 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1718 if (pending_inc
.new_pools
.count(p
) == 0) {
1719 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1721 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1722 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1723 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1725 // cancel FLAG_FULL for pools which are no longer full too
1726 for (auto &pool
: tmp
.get_pools()) {
1727 auto p
= pool
.first
;
1728 if (full_pool_ids
.count(p
)) {
1729 // skip pools we have just marked as full above
1732 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1733 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1734 // don't touch if currently is not full
1735 // or is running out of quota (and hence considered as full)
1738 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1739 << "'s full flag" << dendl
;
1740 if (pending_inc
.new_pools
.count(p
) == 0) {
1741 pending_inc
.new_pools
[p
] = pool
.second
;
1743 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1746 if (!backfillfull_pool_ids
.empty()) {
1747 for (auto &p
: backfillfull_pool_ids
) {
1748 if (full_pool_ids
.count(p
)) {
1749 // skip pools we have already considered as full above
1752 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1753 // make sure FLAG_FULL is truly set, so we are safe not
1754 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1755 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1758 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1759 // don't bother if pool is already marked as backfillfull
1762 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1763 << "'s as backfillfull" << dendl
;
1764 if (pending_inc
.new_pools
.count(p
) == 0) {
1765 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1767 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1768 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1770 // cancel FLAG_BACKFILLFULL for pools
1771 // which are no longer backfillfull too
1772 for (auto &pool
: tmp
.get_pools()) {
1773 auto p
= pool
.first
;
1774 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1775 // skip pools we have just marked as backfillfull/full above
1778 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1779 // and don't touch if currently is not backfillfull
1782 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1783 << "'s backfillfull flag" << dendl
;
1784 if (pending_inc
.new_pools
.count(p
) == 0) {
1785 pending_inc
.new_pools
[p
] = pool
.second
;
1787 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1790 if (!nearfull_pool_ids
.empty()) {
1791 for (auto &p
: nearfull_pool_ids
) {
1792 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1795 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1796 // make sure FLAG_FULL is truly set, so we are safe not
1797 // to set a extra (redundant) FLAG_NEARFULL flag
1798 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1801 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1802 // don't bother if pool is already marked as nearfull
1805 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1806 << "'s as nearfull" << dendl
;
1807 if (pending_inc
.new_pools
.count(p
) == 0) {
1808 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1810 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1812 // cancel FLAG_NEARFULL for pools
1813 // which are no longer nearfull too
1814 for (auto &pool
: tmp
.get_pools()) {
1815 auto p
= pool
.first
;
1816 if (full_pool_ids
.count(p
) ||
1817 backfillfull_pool_ids
.count(p
) ||
1818 nearfull_pool_ids
.count(p
)) {
1819 // skip pools we have just marked as
1820 // nearfull/backfillfull/full above
1823 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1824 // and don't touch if currently is not nearfull
1827 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1828 << "'s nearfull flag" << dendl
;
1829 if (pending_inc
.new_pools
.count(p
) == 0) {
1830 pending_inc
.new_pools
[p
] = pool
.second
;
1832 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1836 // min_compat_client?
1837 if (!tmp
.require_min_compat_client
) {
1838 auto mv
= tmp
.get_min_compat_client();
1839 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1840 << "required " << mv
<< dendl
;
1841 mon
.clog
->info() << "setting require_min_compat_client to currently "
1842 << "required " << mv
;
1843 pending_inc
.new_require_min_compat_client
= mv
;
1846 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1847 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1848 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1849 // add creating flags?
1850 for (auto& i
: tmp
.get_pools()) {
1851 if (pending_creatings
.still_creating_pool(i
.first
)) {
1852 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1854 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1855 pending_inc
.new_pools
[i
.first
] = i
.second
;
1857 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1860 // adjust blocklist items to all be TYPE_ANY
1861 for (auto& i
: tmp
.blocklist
) {
1863 a
.set_type(entity_addr_t::TYPE_ANY
);
1864 pending_inc
.new_blocklist
[a
] = i
.second
;
1865 pending_inc
.old_blocklist
.push_back(i
.first
);
1869 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1870 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1871 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1873 // adjust obsoleted cache modes
1874 for (auto& [poolid
, pi
] : tmp
.pools
) {
1875 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1876 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1877 pending_inc
.new_pools
[poolid
] = pi
;
1879 dout(10) << __func__
<< " switching pool " << poolid
1880 << " cachemode from forward -> proxy" << dendl
;
1881 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1883 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1884 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1885 pending_inc
.new_pools
[poolid
] = pi
;
1887 dout(10) << __func__
<< " switching pool " << poolid
1888 << " cachemode from readforward -> readproxy" << dendl
;
1889 pending_inc
.new_pools
[poolid
].cache_mode
=
1890 pg_pool_t::CACHEMODE_READPROXY
;
1894 // clear removed_snaps for every pool
1895 for (auto& [poolid
, pi
] : tmp
.pools
) {
1896 if (pi
.removed_snaps
.empty()) {
1899 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1900 pending_inc
.new_pools
[poolid
] = pi
;
1902 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1904 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1907 // create a combined purged snap epoch key for all purged snaps
1908 // prior to this epoch, and store it in the current epoch (i.e.,
1909 // the last pre-octopus epoch, just prior to the one we're
1911 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
1912 it
->lower_bound("purged_snap_");
1913 map
<int64_t,snap_interval_set_t
> combined
;
1914 while (it
->valid()) {
1915 if (it
->key().find("purged_snap_") != 0) {
1918 string k
= it
->key();
1919 long long unsigned pool
;
1920 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1922 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1924 bufferlist v
= it
->value();
1925 auto p
= v
.cbegin();
1926 snapid_t begin
, end
;
1927 ceph::decode(begin
, p
);
1928 ceph::decode(end
, p
);
1929 combined
[pool
].insert(begin
, end
- begin
);
1933 if (!combined
.empty()) {
1934 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1936 ceph::encode(combined
, v
);
1937 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1938 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1939 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1942 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1946 // clean out the old removed_snap_ and removed_epoch keys
1947 // ('`' is ASCII '_' + 1)
1948 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1949 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1954 for (auto i
= pending_inc
.new_state
.begin();
1955 i
!= pending_inc
.new_state
.end();
1957 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1958 if (s
& CEPH_OSD_UP
) {
1959 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1960 // Reset laggy parameters if failure interval exceeds a threshold.
1961 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(i
->first
);
1962 if ((xi
.laggy_probability
|| xi
.laggy_interval
) && xi
.down_stamp
.sec()) {
1963 int last_failure_interval
= pending_inc
.modified
.sec() - xi
.down_stamp
.sec();
1964 if (grace_interval_threshold_exceeded(last_failure_interval
)) {
1965 set_default_laggy_params(i
->first
);
1969 if (s
& CEPH_OSD_EXISTS
)
1970 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1972 for (auto i
= pending_inc
.new_up_client
.begin();
1973 i
!= pending_inc
.new_up_client
.end();
1975 //FIXME: insert cluster addresses too
1976 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1978 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1979 i
!= pending_inc
.new_weight
.end();
1981 if (i
->second
== CEPH_OSD_OUT
) {
1982 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1983 } else if (i
->second
== CEPH_OSD_IN
) {
1984 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1986 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1990 // features for osdmap and its incremental
1993 // encode full map and determine its crc
1996 tmp
.deepish_copy_from(osdmap
);
1997 tmp
.apply_incremental(pending_inc
);
1999 // determine appropriate features
2000 features
= tmp
.get_encoding_features();
2001 dout(10) << __func__
<< " encoding full map with "
2002 << tmp
.require_osd_release
2003 << " features " << features
<< dendl
;
2005 // the features should be a subset of the mon quorum's features!
2006 ceph_assert((features
& ~mon
.get_quorum_con_features()) == 0);
2009 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
2010 pending_inc
.full_crc
= tmp
.get_crc();
2012 // include full map in the txn. note that old monitors will
2013 // overwrite this. new ones will now skip the local full map
2014 // encode and reload from this.
2015 put_version_full(t
, pending_inc
.epoch
, fullbl
);
2019 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
2021 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
2023 dout(20) << " full_crc " << tmp
.get_crc()
2024 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
2026 /* put everything in the transaction */
2027 put_version(t
, pending_inc
.epoch
, bl
);
2028 put_last_committed(t
, pending_inc
.epoch
);
2031 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
2032 p
!= pending_metadata
.end();
2035 auto mp
= p
->second
.cbegin();
2037 auto it
= m
.find("osd_objectstore");
2038 if (it
!= m
.end()) {
2039 if (it
->second
== "filestore") {
2040 filestore_osds
.insert(p
->first
);
2042 filestore_osds
.erase(p
->first
);
2045 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
2047 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
2048 p
!= pending_metadata_rm
.end();
2050 filestore_osds
.erase(*p
);
2051 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
2053 pending_metadata
.clear();
2054 pending_metadata_rm
.clear();
2057 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
2058 !pending_inc
.new_purged_snaps
.empty()) {
2059 // all snaps purged this epoch (across all pools)
2060 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
2062 encode(pending_inc
.new_purged_snaps
, v
);
2063 t
->put(OSD_SNAP_PREFIX
, k
, v
);
2065 for (auto& i
: pending_inc
.new_purged_snaps
) {
2066 for (auto q
= i
.second
.begin();
2067 q
!= i
.second
.end();
2069 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
2074 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
2075 for (auto snap
: snaps
) {
2076 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2083 health_check_map_t next
;
2084 tmp
.check_health(cct
, &next
);
2086 check_for_filestore_osds(&next
);
2087 encode_health(next
, t
);
2090 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2093 int r
= mon
.store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2097 auto p
= bl
.cbegin();
2100 catch (ceph::buffer::error
& e
) {
2102 *err
<< "osd." << osd
<< " metadata is corrupt";
2108 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2110 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2111 if (osdmap
.is_up(osd
)) {
2112 map
<string
,string
> meta
;
2113 load_metadata(osd
, meta
, nullptr);
2114 auto p
= meta
.find(field
);
2115 if (p
== meta
.end()) {
2116 (*out
)["unknown"]++;
2118 (*out
)[p
->second
]++;
2124 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2126 map
<string
,int> by_val
;
2127 count_metadata(field
, &by_val
);
2128 f
->open_object_section(field
.c_str());
2129 for (auto& p
: by_val
) {
2130 f
->dump_int(p
.first
.c_str(), p
.second
);
2135 void OSDMonitor::get_versions(std::map
<string
, list
<string
>> &versions
)
2137 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2138 if (osdmap
.is_up(osd
)) {
2139 map
<string
,string
> meta
;
2140 load_metadata(osd
, meta
, nullptr);
2141 auto p
= meta
.find("ceph_version_short");
2142 if (p
== meta
.end()) continue;
2143 versions
[p
->second
].push_back(string("osd.") + stringify(osd
));
2148 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2150 map
<string
, string
> metadata
;
2151 int r
= load_metadata(osd
, metadata
, nullptr);
2155 auto it
= metadata
.find("osd_objectstore");
2156 if (it
== metadata
.end())
2162 void OSDMonitor::get_filestore_osd_list()
2164 for (unsigned osd
= 0; osd
< osdmap
.get_num_osds(); ++osd
) {
2165 string objectstore_type
;
2166 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2167 if (r
== 0 && objectstore_type
== "filestore") {
2168 filestore_osds
.insert(osd
);
2173 void OSDMonitor::check_for_filestore_osds(health_check_map_t
*checks
)
2175 if (g_conf()->mon_warn_on_filestore_osds
&&
2176 filestore_osds
.size() > 0) {
2177 ostringstream ss
, deprecated_tip
;
2178 list
<string
> detail
;
2179 ss
<< filestore_osds
.size()
2181 << (filestore_osds
.size() == 1 ? "is" : "are")
2182 << " running Filestore";
2183 deprecated_tip
<< ss
.str();
2184 ss
<< " [Deprecated]";
2185 auto& d
= checks
->add("OSD_FILESTORE", HEALTH_WARN
, ss
.str(),
2186 filestore_osds
.size());
2187 deprecated_tip
<< ", which has been deprecated and"
2188 << " not been optimized for QoS"
2189 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2190 detail
.push_back(deprecated_tip
.str());
2191 d
.detail
.swap(detail
);
2195 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2196 const pg_pool_t
&pool
,
2199 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2200 // since filestore osds could always join the pool later
2201 set
<int> checked_osds
;
2202 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2203 vector
<int> up
, acting
;
2204 pg_t
pgid(ps
, pool_id
);
2205 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2206 for (int osd
: up
) {
2207 if (checked_osds
.find(osd
) != checked_osds
.end())
2209 string objectstore_type
;
2210 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2211 // allow with missing metadata, e.g. due to an osd never booting yet
2212 if (r
< 0 || objectstore_type
== "bluestore") {
2213 checked_osds
.insert(osd
);
2216 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2223 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2225 map
<string
,string
> m
;
2226 if (int r
= load_metadata(osd
, m
, err
))
2228 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2229 f
->dump_string(p
->first
.c_str(), p
->second
);
2233 void OSDMonitor::print_nodes(Formatter
*f
)
2235 // group OSDs by their hosts
2236 map
<string
, list
<int> > osds
; // hostname => osd
2237 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2238 map
<string
, string
> m
;
2239 if (load_metadata(osd
, m
, NULL
)) {
2242 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2243 if (hostname
== m
.end()) {
2244 // not likely though
2247 osds
[hostname
->second
].push_back(osd
);
2250 dump_services(f
, osds
, "osd");
2253 void OSDMonitor::share_map_with_random_osd()
2255 if (osdmap
.get_num_up_osds() == 0) {
2256 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2260 MonSession
*s
= mon
.session_map
.get_random_osd_session(&osdmap
);
2262 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2266 dout(10) << "committed, telling random " << s
->name
2267 << " all about it" << dendl
;
2269 // get feature of the peer
2270 // use quorum_con_features, if it's an anonymous connection.
2271 uint64_t features
= s
->con_features
? s
->con_features
:
2272 mon
.get_quorum_con_features();
2273 // whatev, they'll request more if they need it
2274 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2275 s
->con
->send_message(m
);
2276 // NOTE: do *not* record osd has up to this epoch (as we do
2277 // elsewhere) as they may still need to request older values.
2280 version_t
OSDMonitor::get_trim_to() const
2282 if (mon
.get_quorum().empty()) {
2283 dout(10) << __func__
<< " quorum not formed, trim_to = 0" << dendl
;
2288 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2289 if (!creating_pgs
.pgs
.empty()) {
2290 dout(10) << __func__
<< " pgs creating, trim_to = 0" << dendl
;
2295 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2297 << " blocking osdmap trim"
2298 << " ('mon_debug_block_osdmap_trim' set to 'true')"
2299 << " trim_to = 0" << dendl
;
2304 epoch_t floor
= get_min_last_epoch_clean();
2305 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2306 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2307 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2308 floor
= g_conf()->mon_osd_force_trim_to
;
2309 dout(10) << __func__
2310 << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2312 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2313 if (floor
+ min
> get_last_committed()) {
2314 if (min
< get_last_committed())
2315 floor
= get_last_committed() - min
;
2319 if (floor
> get_first_committed()) {
2320 dout(10) << __func__
<< " trim_to = " << floor
<< dendl
;
2324 dout(10) << __func__
<< " trim_to = 0" << dendl
;
2328 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2330 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2331 // also scan osd epochs
2332 // don't trim past the oldest reported osd epoch
2333 for (auto [osd
, epoch
] : osd_epochs
) {
2334 if (epoch
< floor
) {
2341 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2344 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2346 get_version_full(first
, bl
);
2347 put_version_full(tx
, first
, bl
);
2349 if (has_osdmap_manifest
&&
2350 first
> osdmap_manifest
.get_first_pinned()) {
2351 _prune_update_trimmed(tx
, first
);
2356 /* full osdmap prune
2358 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2361 void OSDMonitor::load_osdmap_manifest()
2363 bool store_has_manifest
=
2364 mon
.store
->exists(get_service_name(), "osdmap_manifest");
2366 if (!store_has_manifest
) {
2367 if (!has_osdmap_manifest
) {
2371 dout(20) << __func__
2372 << " dropping osdmap manifest from memory." << dendl
;
2373 osdmap_manifest
= osdmap_manifest_t();
2374 has_osdmap_manifest
= false;
2378 dout(20) << __func__
2379 << " osdmap manifest detected in store; reload." << dendl
;
2381 bufferlist manifest_bl
;
2382 int r
= get_value("osdmap_manifest", manifest_bl
);
2384 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2385 ceph_abort_msg("error reading manifest");
2387 osdmap_manifest
.decode(manifest_bl
);
2388 has_osdmap_manifest
= true;
2390 dout(10) << __func__
<< " store osdmap manifest pinned ("
2391 << osdmap_manifest
.get_first_pinned()
2393 << osdmap_manifest
.get_last_pinned()
2398 bool OSDMonitor::should_prune() const
2400 version_t first
= get_first_committed();
2401 version_t last
= get_last_committed();
2402 version_t min_osdmap_epochs
=
2403 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2404 version_t prune_min
=
2405 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2406 version_t prune_interval
=
2407 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2408 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2409 version_t last_to_pin
= last
- min_osdmap_epochs
;
2411 // Make it or break it constraints.
2413 // If any of these conditions fails, we will not prune, regardless of
2414 // whether we have an on-disk manifest with an on-going pruning state.
2416 if ((last
- first
) <= min_osdmap_epochs
) {
2417 // between the first and last committed epochs, we don't have
2418 // enough epochs to trim, much less to prune.
2419 dout(10) << __func__
2420 << " currently holding only " << (last
- first
)
2421 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2422 << "); do not prune."
2426 } else if ((last_to_pin
- first
) < prune_min
) {
2427 // between the first committed epoch and the last epoch we would prune,
2428 // we simply don't have enough versions over the minimum to prune maps.
2429 dout(10) << __func__
2430 << " could only prune " << (last_to_pin
- first
)
2431 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2432 " is less than the required minimum (" << prune_min
<< ")"
2436 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2437 dout(10) << __func__
2438 << " we have pruned as far as we can; do not prune."
2442 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2443 dout(10) << __func__
2444 << " not enough epochs to form an interval (last pinned: "
2445 << last_pinned
<< ", last to pin: "
2446 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2451 dout(15) << __func__
2452 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2453 << " lc (" << first
<< ".." << last
<< ")"
2458 void OSDMonitor::_prune_update_trimmed(
2459 MonitorDBStore::TransactionRef tx
,
2462 dout(10) << __func__
2463 << " first " << first
2464 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2467 osdmap_manifest_t manifest
= osdmap_manifest
;
2469 if (!manifest
.is_pinned(first
)) {
2470 manifest
.pin(first
);
2473 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2474 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2475 manifest
.pinned
.erase(p
, p_end
);
2476 ceph_assert(manifest
.get_first_pinned() == first
);
2478 if (manifest
.get_last_pinned() == first
+1 ||
2479 manifest
.pinned
.size() == 1) {
2480 // we reached the end of the line, as pinned maps go; clean up our
2481 // manifest, and let `should_prune()` decide whether we should prune
2483 tx
->erase(get_service_name(), "osdmap_manifest");
2488 manifest
.encode(bl
);
2489 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2492 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2494 dout(1) << __func__
<< dendl
;
2496 version_t pin_first
;
2498 // verify constrainsts on stable in-memory state
2499 if (!has_osdmap_manifest
) {
2500 // we must have never pruned, OR if we pruned the state must no longer
2501 // be relevant (i.e., the state must have been removed alongside with
2502 // the trim that *must* have removed past the last pinned map in a
2504 ceph_assert(osdmap_manifest
.pinned
.empty());
2505 ceph_assert(!mon
.store
->exists(get_service_name(), "osdmap_manifest"));
2506 pin_first
= get_first_committed();
2509 // we must have pruned in the past AND its state is still relevant
2510 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2511 // and thus we still hold a manifest in the store).
2512 ceph_assert(!osdmap_manifest
.pinned
.empty());
2513 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2514 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2516 dout(10) << __func__
2517 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2518 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2521 pin_first
= osdmap_manifest
.get_last_pinned();
2524 manifest
.pin(pin_first
);
2527 bool OSDMonitor::_prune_sanitize_options() const
2529 uint64_t prune_interval
=
2530 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2531 uint64_t prune_min
=
2532 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2534 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2538 if (prune_interval
== 0) {
2540 << " prune is enabled BUT prune interval is zero; abort."
2543 } else if (prune_interval
== 1) {
2545 << " prune interval is equal to one, which essentially means"
2546 " no pruning; abort."
2550 if (prune_min
== 0) {
2552 << " prune is enabled BUT prune min is zero; abort."
2556 if (prune_interval
> prune_min
) {
2558 << " impossible to ascertain proper prune interval because"
2559 << " it is greater than the minimum prune epochs"
2560 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2565 if (txsize
< prune_interval
- 1) {
2567 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2568 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2569 << "); abort." << dendl
;
2575 bool OSDMonitor::is_prune_enabled() const {
2576 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2579 bool OSDMonitor::is_prune_supported() const {
2580 return mon
.get_required_mon_features().contains_any(
2581 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2586 * @returns true if has side-effects; false otherwise.
2588 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2590 bool enabled
= is_prune_enabled();
2592 dout(1) << __func__
<< " osdmap full prune "
2593 << ( enabled
? "enabled" : "disabled")
2596 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2600 // we are beyond the minimum prune versions, we need to remove maps because
2601 // otherwise the store will grow unbounded and we may end up having issues
2602 // with available disk space or store hangs.
2604 // we will not pin all versions. We will leave a buffer number of versions.
2605 // this allows us the monitor to trim maps without caring too much about
2606 // pinned maps, and then allow us to use another ceph-mon without these
2607 // capabilities, without having to repair the store.
2609 osdmap_manifest_t manifest
= osdmap_manifest
;
2611 version_t first
= get_first_committed();
2612 version_t last
= get_last_committed();
2614 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2615 version_t last_pinned
= manifest
.get_last_pinned();
2616 uint64_t prune_interval
=
2617 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2619 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2621 prune_init(manifest
);
2623 // we need to get rid of some osdmaps
2626 << " lc (" << first
<< " .. " << last
<< ")"
2627 << " last_pinned " << last_pinned
2628 << " interval " << prune_interval
2629 << " last_to_pin " << last_to_pin
2632 // We will be erasing maps as we go.
2634 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2636 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2637 // we stop pruning. We could prune the maps between `next_to_pin` and
2638 // `last_to_pin`, but by not doing it we end up with neater pruned
2639 // intervals, aligned with `prune_interval`. Besides, this should not be a
2640 // problem as long as `prune_interval` is set to a sane value, instead of
2641 // hundreds or thousands of maps.
2643 auto map_exists
= [this](version_t v
) {
2644 string k
= mon
.store
->combine_strings("full", v
);
2645 return mon
.store
->exists(get_service_name(), k
);
2648 // 'interval' represents the number of maps from the last pinned
2649 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2650 // version 11 next; all intermediate versions will be removed.
2652 // 'txsize' represents the maximum number of versions we'll be removing in
2653 // this iteration. If 'txsize' is large enough to perform multiple passes
2654 // pinning and removing maps, we will do so; if not, we'll do at least one
2655 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2656 // ensure that we never go *over* the maximum.
2658 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2659 uint64_t removal_interval
= prune_interval
- 1;
2661 if (txsize
< removal_interval
) {
2663 << " setting txsize to removal interval size ("
2664 << removal_interval
<< " versions"
2666 txsize
= removal_interval
;
2668 ceph_assert(removal_interval
> 0);
2670 uint64_t num_pruned
= 0;
2671 while (num_pruned
+ removal_interval
<= txsize
) {
2672 last_pinned
= manifest
.get_last_pinned();
2674 if (last_pinned
+ prune_interval
> last_to_pin
) {
2677 ceph_assert(last_pinned
< last_to_pin
);
2679 version_t next_pinned
= last_pinned
+ prune_interval
;
2680 ceph_assert(next_pinned
<= last_to_pin
);
2681 manifest
.pin(next_pinned
);
2683 dout(20) << __func__
2684 << " last_pinned " << last_pinned
2685 << " next_pinned " << next_pinned
2686 << " num_pruned " << num_pruned
2687 << " removal interval (" << (last_pinned
+1)
2688 << ".." << (next_pinned
-1) << ")"
2689 << " txsize " << txsize
<< dendl
;
2691 ceph_assert(map_exists(last_pinned
));
2692 ceph_assert(map_exists(next_pinned
));
2694 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2695 ceph_assert(!manifest
.is_pinned(v
));
2697 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2698 string full_key
= mon
.store
->combine_strings("full", v
);
2699 tx
->erase(get_service_name(), full_key
);
2704 ceph_assert(num_pruned
> 0);
2707 manifest
.encode(bl
);
2708 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2716 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2718 op
->mark_osdmon_event(__func__
);
2719 Message
*m
= op
->get_req();
2720 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2722 switch (m
->get_type()) {
2724 case MSG_MON_COMMAND
:
2726 return preprocess_command(op
);
2727 } catch (const bad_cmd_get
& e
) {
2729 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2732 case CEPH_MSG_MON_GET_OSDMAP
:
2733 return preprocess_get_osdmap(op
);
2736 case MSG_OSD_MARK_ME_DOWN
:
2737 return preprocess_mark_me_down(op
);
2738 case MSG_OSD_MARK_ME_DEAD
:
2739 return preprocess_mark_me_dead(op
);
2741 return preprocess_full(op
);
2742 case MSG_OSD_FAILURE
:
2743 return preprocess_failure(op
);
2745 return preprocess_boot(op
);
2747 return preprocess_alive(op
);
2748 case MSG_OSD_PG_CREATED
:
2749 return preprocess_pg_created(op
);
2750 case MSG_OSD_PG_READY_TO_MERGE
:
2751 return preprocess_pg_ready_to_merge(op
);
2752 case MSG_OSD_PGTEMP
:
2753 return preprocess_pgtemp(op
);
2754 case MSG_OSD_BEACON
:
2755 return preprocess_beacon(op
);
2757 case CEPH_MSG_POOLOP
:
2758 return preprocess_pool_op(op
);
2760 case MSG_REMOVE_SNAPS
:
2761 return preprocess_remove_snaps(op
);
2763 case MSG_MON_GET_PURGED_SNAPS
:
2764 return preprocess_get_purged_snaps(op
);
2772 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2774 op
->mark_osdmon_event(__func__
);
2775 Message
*m
= op
->get_req();
2776 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2778 switch (m
->get_type()) {
2780 case MSG_OSD_MARK_ME_DOWN
:
2781 return prepare_mark_me_down(op
);
2782 case MSG_OSD_MARK_ME_DEAD
:
2783 return prepare_mark_me_dead(op
);
2785 return prepare_full(op
);
2786 case MSG_OSD_FAILURE
:
2787 return prepare_failure(op
);
2789 return prepare_boot(op
);
2791 return prepare_alive(op
);
2792 case MSG_OSD_PG_CREATED
:
2793 return prepare_pg_created(op
);
2794 case MSG_OSD_PGTEMP
:
2795 return prepare_pgtemp(op
);
2796 case MSG_OSD_PG_READY_TO_MERGE
:
2797 return prepare_pg_ready_to_merge(op
);
2798 case MSG_OSD_BEACON
:
2799 return prepare_beacon(op
);
2801 case MSG_MON_COMMAND
:
2803 return prepare_command(op
);
2804 } catch (const bad_cmd_get
& e
) {
2806 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2810 case CEPH_MSG_POOLOP
:
2811 return prepare_pool_op(op
);
2813 case MSG_REMOVE_SNAPS
:
2814 return prepare_remove_snaps(op
);
2824 bool OSDMonitor::should_propose(double& delay
)
2826 dout(10) << "should_propose" << dendl
;
2828 // if full map, propose immediately! any subsequent changes will be clobbered.
2829 if (pending_inc
.fullmap
.length())
2832 // adjust osd weights?
2833 if (!osd_weight
.empty() &&
2834 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2835 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2836 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2842 return PaxosService::should_propose(delay
);
2847 // ---------------------------
2850 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2852 op
->mark_osdmon_event(__func__
);
2853 auto m
= op
->get_req
<MMonGetOSDMap
>();
2855 uint64_t features
= mon
.get_quorum_con_features();
2856 if (op
->get_session() && op
->get_session()->con_features
)
2857 features
= op
->get_session()->con_features
;
2859 dout(10) << __func__
<< " " << *m
<< dendl
;
2860 MOSDMap
*reply
= new MOSDMap(mon
.monmap
->fsid
, features
);
2861 epoch_t first
= get_first_committed();
2862 epoch_t last
= osdmap
.get_epoch();
2863 int max
= g_conf()->osd_map_message_max
;
2864 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2865 for (epoch_t e
= std::max(first
, m
->get_full_first());
2866 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2868 bufferlist
& bl
= reply
->maps
[e
];
2869 int r
= get_version_full(e
, features
, bl
);
2870 ceph_assert(r
>= 0);
2871 max_bytes
-= bl
.length();
2873 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2874 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2876 bufferlist
& bl
= reply
->incremental_maps
[e
];
2877 int r
= get_version(e
, features
, bl
);
2878 ceph_assert(r
>= 0);
2879 max_bytes
-= bl
.length();
2881 reply
->oldest_map
= first
;
2882 reply
->newest_map
= last
;
2883 mon
.send_reply(op
, reply
);
2888 // ---------------------------
2893 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2894 // check permissions
2895 MonSession
*session
= op
->get_session();
2898 if (!session
->is_capable("osd", MON_CAP_X
)) {
2899 dout(0) << "got MOSDFailure from entity with insufficient caps "
2900 << session
->caps
<< dendl
;
2903 if (fsid
!= mon
.monmap
->fsid
) {
2904 dout(0) << "check_source: on fsid " << fsid
2905 << " != " << mon
.monmap
->fsid
<< dendl
;
2912 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2914 op
->mark_osdmon_event(__func__
);
2915 auto m
= op
->get_req
<MOSDFailure
>();
2916 // who is target_osd
2917 int badboy
= m
->get_target_osd();
2919 // check permissions
2920 if (check_source(op
, m
->fsid
))
2923 // first, verify the reporting host is valid
2924 if (m
->get_orig_source().is_osd()) {
2925 int from
= m
->get_orig_source().num();
2926 if (!osdmap
.exists(from
) ||
2927 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2928 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2929 dout(5) << "preprocess_failure from dead osd." << from
2930 << ", ignoring" << dendl
;
2931 send_incremental(op
, m
->get_epoch()+1);
2938 if (osdmap
.is_down(badboy
)) {
2939 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2940 << " " << m
->get_target_addrs()
2941 << ", from " << m
->get_orig_source() << dendl
;
2942 if (m
->get_epoch() < osdmap
.get_epoch())
2943 send_incremental(op
, m
->get_epoch()+1);
2946 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2947 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2948 << " " << m
->get_target_addrs()
2949 << " != map's " << osdmap
.get_addrs(badboy
)
2950 << ", from " << m
->get_orig_source() << dendl
;
2951 if (m
->get_epoch() < osdmap
.get_epoch())
2952 send_incremental(op
, m
->get_epoch()+1);
2956 // already reported?
2957 if (osdmap
.is_down(badboy
) ||
2958 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2959 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2960 << " " << m
->get_target_addrs()
2961 << ", from " << m
->get_orig_source() << dendl
;
2962 if (m
->get_epoch() < osdmap
.get_epoch())
2963 send_incremental(op
, m
->get_epoch()+1);
2967 if (!can_mark_down(badboy
)) {
2968 dout(5) << "preprocess_failure ignoring report of osd."
2969 << m
->get_target_osd() << " " << m
->get_target_addrs()
2970 << " from " << m
->get_orig_source() << dendl
;
2974 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2975 << " " << m
->get_target_addrs()
2976 << ", from " << m
->get_orig_source() << dendl
;
2984 class C_AckMarkedDown
: public C_MonOp
{
2990 : C_MonOp(op
), osdmon(osdmon
) {}
2992 void _finish(int r
) override
{
2994 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2995 osdmon
->mon
.send_reply(
3002 false)); // ACK itself does not request an ack
3003 } else if (r
== -EAGAIN
) {
3004 osdmon
->dispatch(op
);
3006 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
3009 ~C_AckMarkedDown() override
{
3013 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
3015 op
->mark_osdmon_event(__func__
);
3016 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3017 int from
= m
->target_osd
;
3019 // check permissions
3020 if (check_source(op
, m
->fsid
))
3023 // first, verify the reporting host is valid
3024 if (!m
->get_orig_source().is_osd())
3027 if (!osdmap
.exists(from
) ||
3028 osdmap
.is_down(from
) ||
3029 osdmap
.get_addrs(from
) != m
->target_addrs
) {
3030 dout(5) << "preprocess_mark_me_down from dead osd."
3031 << from
<< ", ignoring" << dendl
;
3032 send_incremental(op
, m
->get_epoch()+1);
3036 // no down might be set
3037 if (!can_mark_down(from
))
3040 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
3041 << " " << m
->target_addrs
<< dendl
;
3045 if (m
->request_ack
) {
3046 Context
*c(new C_AckMarkedDown(this, op
));
3052 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
3054 op
->mark_osdmon_event(__func__
);
3055 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3056 int target_osd
= m
->target_osd
;
3058 ceph_assert(osdmap
.is_up(target_osd
));
3059 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
3061 mon
.clog
->info() << "osd." << target_osd
<< " marked itself " << ((m
->down_and_dead
) ? "down and dead" : "down");
3062 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3063 if (m
->down_and_dead
) {
3064 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3065 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3067 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3070 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
3074 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
3076 op
->mark_osdmon_event(__func__
);
3077 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3078 int from
= m
->target_osd
;
3080 // check permissions
3081 if (check_source(op
, m
->fsid
)) {
3086 // first, verify the reporting host is valid
3087 if (!m
->get_orig_source().is_osd()) {
3092 if (!osdmap
.exists(from
) ||
3093 !osdmap
.is_down(from
)) {
3094 dout(5) << __func__
<< " from nonexistent or up osd." << from
3095 << ", ignoring" << dendl
;
3096 send_incremental(op
, m
->get_epoch()+1);
3104 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
3106 op
->mark_osdmon_event(__func__
);
3107 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3108 int target_osd
= m
->target_osd
;
3110 ceph_assert(osdmap
.is_down(target_osd
));
3112 mon
.clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
3114 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3115 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3117 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3118 wait_for_finished_proposal(
3121 [op
, this] (int r
) {
3123 mon
.no_reply(op
); // ignore on success
3130 bool OSDMonitor::can_mark_down(int i
)
3132 if (osdmap
.is_nodown(i
)) {
3133 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3134 << "will not mark it down" << dendl
;
3138 int num_osds
= osdmap
.get_num_osds();
3139 if (num_osds
== 0) {
3140 dout(5) << __func__
<< " no osds" << dendl
;
3143 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3144 float up_ratio
= (float)up
/ (float)num_osds
;
3145 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3146 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3147 << g_conf()->mon_osd_min_up_ratio
3148 << ", will not mark osd." << i
<< " down" << dendl
;
3154 bool OSDMonitor::can_mark_up(int i
)
3156 if (osdmap
.is_noup(i
)) {
3157 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3158 << "will not mark it up" << dendl
;
3166 * @note the parameter @p i apparently only exists here so we can output the
3167 * osd's id on messages.
3169 bool OSDMonitor::can_mark_out(int i
)
3171 if (osdmap
.is_noout(i
)) {
3172 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3173 << "will not mark it out" << dendl
;
3177 int num_osds
= osdmap
.get_num_osds();
3178 if (num_osds
== 0) {
3179 dout(5) << __func__
<< " no osds" << dendl
;
3182 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3183 float in_ratio
= (float)in
/ (float)num_osds
;
3184 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3186 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3187 << g_conf()->mon_osd_min_in_ratio
3188 << ", will not mark osd." << i
<< " out" << dendl
;
3190 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3191 << g_conf()->mon_osd_min_in_ratio
3192 << ", will not mark osds out" << dendl
;
3199 bool OSDMonitor::can_mark_in(int i
)
3201 if (osdmap
.is_noin(i
)) {
3202 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3203 << "will not mark it in" << dendl
;
3210 bool OSDMonitor::check_failures(utime_t now
)
3212 bool found_failure
= false;
3213 auto p
= failure_info
.begin();
3214 while (p
!= failure_info
.end()) {
3215 auto& [target_osd
, fi
] = *p
;
3216 if (can_mark_down(target_osd
) &&
3217 check_failure(now
, target_osd
, fi
)) {
3218 found_failure
= true;
3220 } else if (is_failure_stale(now
, fi
)) {
3221 dout(10) << " dropping stale failure_info for osd." << target_osd
3222 << " from " << fi
.reporters
.size() << " reporters"
3224 p
= failure_info
.erase(p
);
3229 return found_failure
;
3232 utime_t
OSDMonitor::get_grace_time(utime_t now
,
3234 failure_info_t
& fi
) const
3236 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3237 if (!g_conf()->mon_osd_adjust_heartbeat_grace
) {
3240 utime_t grace
= orig_grace
;
3241 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3242 double decay_k
= ::log(.5) / halflife
;
3244 // scale grace period based on historical probability of 'lagginess'
3245 // (false positive failures due to slowness).
3246 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3247 const utime_t failed_for
= now
- fi
.get_failed_since();
3248 double decay
= exp((double)failed_for
* decay_k
);
3249 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3250 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3251 double my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3254 // consider the peers reporting a failure a proxy for a potential
3255 // 'subcluster' over the overall cluster that is similarly
3256 // laggy. this is clearly not true in all cases, but will sometimes
3257 // help us localize the grace correction to a subset of the system
3258 // (say, a rack with a bad switch) that is unhappy.
3259 double peer_grace
= 0;
3260 for (auto& [reporter
, report
] : fi
.reporters
) {
3261 if (osdmap
.exists(reporter
)) {
3262 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(reporter
);
3263 utime_t elapsed
= now
- xi
.down_stamp
;
3264 double decay
= exp((double)elapsed
* decay_k
);
3265 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3268 peer_grace
/= (double)fi
.reporters
.size();
3269 grace
+= peer_grace
;
3270 dout(10) << " osd." << target_osd
<< " has "
3271 << fi
.reporters
.size() << " reporters, "
3272 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3273 << " + " << peer_grace
<< "), max_failed_since " << fi
.get_failed_since()
3279 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3281 // already pending failure?
3282 if (pending_inc
.new_state
.count(target_osd
) &&
3283 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3284 dout(10) << " already pending failure" << dendl
;
3288 set
<string
> reporters_by_subtree
;
3289 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3290 ceph_assert(fi
.reporters
.size());
3291 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3292 // get the parent bucket whose type matches with "reporter_subtree_level".
3293 // fall back to OSD if the level doesn't exist.
3294 if (osdmap
.exists(p
->first
)) {
3295 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3296 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3297 iter
== reporter_loc
.end()) {
3298 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3300 reporters_by_subtree
.insert(iter
->second
);
3304 fi
.cancel_report(p
->first
);;
3305 p
= fi
.reporters
.erase(p
);
3308 if (reporters_by_subtree
.size() < g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3311 const utime_t failed_for
= now
- fi
.get_failed_since();
3312 const utime_t grace
= get_grace_time(now
, target_osd
, fi
);
3313 if (failed_for
>= grace
) {
3314 dout(1) << " we have enough reporters to mark osd." << target_osd
3315 << " down" << dendl
;
3316 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3318 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3319 << osdmap
.crush
->get_full_location_ordered_string(
3322 << (int)reporters_by_subtree
.size()
3323 << " reporters from different "
3324 << reporter_subtree_level
<< " after "
3325 << failed_for
<< " >= grace " << grace
<< ")";
3331 bool OSDMonitor::is_failure_stale(utime_t now
, failure_info_t
& fi
) const
3333 // if it takes too long to either cancel the report to mark the osd down,
3334 // some reporters must have failed to cancel their reports. let's just
3335 // forget these reports.
3336 const utime_t failed_for
= now
- fi
.get_failed_since();
3337 auto heartbeat_grace
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_grace");
3338 auto heartbeat_stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3339 return failed_for
>= (heartbeat_grace
+ heartbeat_stale
);
3342 void OSDMonitor::force_failure(int target_osd
, int by
)
3344 // already pending failure?
3345 if (pending_inc
.new_state
.count(target_osd
) &&
3346 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3347 dout(10) << " already pending failure" << dendl
;
3351 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
3352 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3353 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3354 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3356 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3358 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3359 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3360 << ") (connection refused reported by osd." << by
<< ")";
3364 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3366 op
->mark_osdmon_event(__func__
);
3367 auto m
= op
->get_req
<MOSDFailure
>();
3368 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3369 << " " << m
->get_target_addrs()
3370 << " from " << m
->get_orig_source()
3371 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3373 int target_osd
= m
->get_target_osd();
3374 int reporter
= m
->get_orig_source().num();
3375 ceph_assert(osdmap
.is_up(target_osd
));
3376 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3380 if (m
->if_osd_failed()) {
3381 // calculate failure time
3382 utime_t now
= ceph_clock_now();
3383 utime_t failed_since
=
3384 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3387 if (m
->is_immediate()) {
3388 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3389 << " reported immediately failed by "
3390 << m
->get_orig_source();
3391 force_failure(target_osd
, reporter
);
3394 mon
.clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3395 << m
->get_orig_source();
3397 failure_info_t
& fi
= failure_info
[target_osd
];
3398 fi
.add_report(reporter
, failed_since
, op
);
3399 return check_failure(now
, target_osd
, fi
);
3401 // remove the report
3402 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3403 << " failure report canceled by "
3404 << m
->get_orig_source();
3405 if (failure_info
.count(target_osd
)) {
3406 failure_info_t
& fi
= failure_info
[target_osd
];
3407 fi
.cancel_report(reporter
);
3408 if (fi
.reporters
.empty()) {
3409 dout(10) << " removing last failure_info for osd." << target_osd
3411 failure_info
.erase(target_osd
);
3413 dout(10) << " failure_info for osd." << target_osd
<< " now "
3414 << fi
.reporters
.size() << " reporters" << dendl
;
3417 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
3424 void OSDMonitor::process_failures()
3426 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3427 while (p
!= failure_info
.end()) {
3428 if (osdmap
.is_up(p
->first
)) {
3431 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3432 list
<MonOpRequestRef
> ls
;
3433 p
->second
.take_report_messages(ls
);
3434 failure_info
.erase(p
++);
3436 while (!ls
.empty()) {
3437 MonOpRequestRef o
= ls
.front();
3439 o
->mark_event(__func__
);
3440 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3441 send_latest(o
, m
->get_epoch());
3450 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3452 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3454 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3455 p
!= failure_info
.end();
3457 p
->second
.take_report_messages(ls
);
3459 failure_info
.clear();
3462 int OSDMonitor::get_grace_interval_threshold()
3464 int halflife
= g_conf()->mon_osd_laggy_halflife
;
3465 // Scale the halflife period (default: 1_hr) by
3466 // a factor (48) to calculate the threshold.
3467 int grace_threshold_factor
= 48;
3468 return halflife
* grace_threshold_factor
;
3471 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval
)
3473 int grace_interval_threshold_secs
= get_grace_interval_threshold();
3474 if (last_failed_interval
> grace_interval_threshold_secs
) {
3475 dout(1) << " last_failed_interval " << last_failed_interval
3476 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3483 void OSDMonitor::set_default_laggy_params(int target_osd
)
3485 if (pending_inc
.new_xinfo
.count(target_osd
) == 0) {
3486 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3488 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[target_osd
];
3489 xi
.down_stamp
= pending_inc
.modified
;
3490 xi
.laggy_probability
= 0.0;
3491 xi
.laggy_interval
= 0;
3492 dout(20) << __func__
<< " reset laggy, now xi " << xi
<< dendl
;
3498 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3500 op
->mark_osdmon_event(__func__
);
3501 auto m
= op
->get_req
<MOSDBoot
>();
3502 int from
= m
->get_orig_source_inst().name
.num();
3504 // check permissions, ignore if failed (no response expected)
3505 MonSession
*session
= op
->get_session();
3508 if (!session
->is_capable("osd", MON_CAP_X
)) {
3509 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3510 << session
->caps
<< dendl
;
3514 if (m
->sb
.cluster_fsid
!= mon
.monmap
->fsid
) {
3515 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3516 << " != " << mon
.monmap
->fsid
<< dendl
;
3520 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3521 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3525 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3527 // lower bound of N-2
3528 if (!HAVE_FEATURE(m
->osd_features
, SERVER_OCTOPUS
)) {
3529 mon
.clog
->info() << "disallowing boot of OSD "
3530 << m
->get_orig_source_inst()
3531 << " because the osd lacks CEPH_FEATURE_SERVER_OCTOPUS";
3535 // make sure osd versions do not span more than 3 releases
3536 if (HAVE_FEATURE(m
->osd_features
, SERVER_PACIFIC
) &&
3537 osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
3538 mon
.clog
->info() << "disallowing boot of pacific+ OSD "
3539 << m
->get_orig_source_inst()
3540 << " because require_osd_release < nautilus";
3543 if (HAVE_FEATURE(m
->osd_features
, SERVER_QUINCY
) &&
3544 osdmap
.require_osd_release
< ceph_release_t::octopus
) {
3545 mon
.clog
->info() << "disallowing boot of quincy+ OSD "
3546 << m
->get_orig_source_inst()
3547 << " because require_osd_release < octopus";
3551 if (osdmap
.stretch_mode_enabled
&&
3552 !(m
->osd_features
& CEPH_FEATUREMASK_STRETCH_MODE
)) {
3553 mon
.clog
->info() << "disallowing boot of OSD "
3554 << m
->get_orig_source_inst()
3555 << " because stretch mode is on and OSD lacks support";
3560 if (osdmap
.is_up(from
) &&
3561 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3562 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3564 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3565 << " " << m
->get_orig_source_addrs()
3566 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3571 if (osdmap
.exists(from
) &&
3572 !osdmap
.get_uuid(from
).is_zero() &&
3573 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3574 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3575 << " clashes with existing osd: different fsid"
3576 << " (ours: " << osdmap
.get_uuid(from
)
3577 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3581 if (osdmap
.exists(from
) &&
3582 osdmap
.get_info(from
).up_from
> m
->version
&&
3583 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3584 m
->get_orig_source_addrs())) {
3585 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3586 send_latest(op
, m
->sb
.current_epoch
+1);
3591 if (!can_mark_up(from
)) {
3592 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3593 send_latest(op
, m
->sb
.current_epoch
+1);
3597 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
3604 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3606 op
->mark_osdmon_event(__func__
);
3607 auto m
= op
->get_req
<MOSDBoot
>();
3608 dout(7) << __func__
<< " from " << m
->get_source()
3610 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3611 << " cluster_addrs " << m
->cluster_addrs
3612 << " hb_back_addrs " << m
->hb_back_addrs
3613 << " hb_front_addrs " << m
->hb_front_addrs
3616 ceph_assert(m
->get_orig_source().is_osd());
3617 int from
= m
->get_orig_source().num();
3619 // does this osd exist?
3620 if (from
>= osdmap
.get_max_osd()) {
3621 dout(1) << "boot from osd." << from
<< " >= max_osd "
3622 << osdmap
.get_max_osd() << dendl
;
3626 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3627 if (pending_inc
.new_state
.count(from
))
3628 oldstate
^= pending_inc
.new_state
[from
];
3630 // already up? mark down first?
3631 if (osdmap
.is_up(from
)) {
3632 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3633 << osdmap
.get_addrs(from
) << dendl
;
3634 // preprocess should have caught these; if not, assert.
3635 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3636 m
->get_orig_source_addrs()) ||
3637 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3638 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3640 if (pending_inc
.new_state
.count(from
) == 0 ||
3641 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3642 // mark previous guy down
3643 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3645 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3646 } else if (pending_inc
.new_up_client
.count(from
)) {
3647 // already prepared, just wait
3648 dout(7) << __func__
<< " already prepared, waiting on "
3649 << m
->get_orig_source_addr() << dendl
;
3650 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3653 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3654 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3655 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3656 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3658 down_pending_out
.erase(from
); // if any
3661 osd_weight
[from
] = m
->sb
.weight
;
3664 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3666 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3667 // preprocess should have caught this; if not, assert.
3668 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3669 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3673 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3674 const osd_info_t
& i
= osdmap
.get_info(from
);
3675 if (i
.up_from
> i
.lost_at
) {
3676 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3677 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3682 bufferlist osd_metadata
;
3683 encode(m
->metadata
, osd_metadata
);
3684 pending_metadata
[from
] = osd_metadata
;
3685 pending_metadata_rm
.erase(from
);
3687 // adjust last clean unmount epoch?
3688 const osd_info_t
& info
= osdmap
.get_info(from
);
3689 dout(10) << " old osd_info: " << info
<< dendl
;
3690 if (m
->sb
.mounted
> info
.last_clean_begin
||
3691 (m
->sb
.mounted
== info
.last_clean_begin
&&
3692 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3693 epoch_t begin
= m
->sb
.mounted
;
3694 epoch_t end
= m
->sb
.clean_thru
;
3696 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3697 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3698 << ") -> [" << begin
<< "-" << end
<< ")"
3700 pending_inc
.new_last_clean_interval
[from
] =
3701 pair
<epoch_t
,epoch_t
>(begin
, end
);
3704 if (pending_inc
.new_xinfo
.count(from
) == 0)
3705 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3706 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
3707 if (m
->boot_epoch
== 0) {
3708 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3709 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3710 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3712 if (xi
.down_stamp
.sec()) {
3713 int interval
= ceph_clock_now().sec() -
3714 xi
.down_stamp
.sec();
3715 if (g_conf()->mon_osd_laggy_max_interval
&&
3716 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3717 interval
= g_conf()->mon_osd_laggy_max_interval
;
3720 interval
* g_conf()->mon_osd_laggy_weight
+
3721 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3723 xi
.laggy_probability
=
3724 g_conf()->mon_osd_laggy_weight
+
3725 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3726 dout(10) << " laggy, now xi " << xi
<< dendl
;
3729 // set features shared by the osd
3730 if (m
->osd_features
)
3731 xi
.features
= m
->osd_features
;
3733 xi
.features
= m
->get_connection()->get_features();
3736 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3737 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3738 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3739 (g_conf()->mon_osd_auto_mark_in
)) {
3740 if (can_mark_in(from
)) {
3741 if (xi
.old_weight
> 0) {
3742 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3745 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3748 dout(7) << __func__
<< " NOIN set, will not mark in "
3749 << m
->get_orig_source_addr() << dendl
;
3754 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3759 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3761 op
->mark_osdmon_event(__func__
);
3762 auto m
= op
->get_req
<MOSDBoot
>();
3763 dout(7) << "_booted " << m
->get_orig_source_inst()
3764 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3767 mon
.clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3771 send_latest(op
, m
->sb
.current_epoch
+1);
3778 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3780 op
->mark_osdmon_event(__func__
);
3781 auto m
= op
->get_req
<MOSDFull
>();
3782 int from
= m
->get_orig_source().num();
3784 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3786 // check permissions, ignore if failed
3787 MonSession
*session
= op
->get_session();
3790 if (!session
->is_capable("osd", MON_CAP_X
)) {
3791 dout(0) << "MOSDFull from entity with insufficient privileges:"
3792 << session
->caps
<< dendl
;
3796 // ignore a full message from the osd instance that already went down
3797 if (!osdmap
.exists(from
)) {
3798 dout(7) << __func__
<< " ignoring full message from nonexistent "
3799 << m
->get_orig_source_inst() << dendl
;
3802 if ((!osdmap
.is_up(from
) &&
3803 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3804 m
->get_orig_source_addrs())) ||
3805 (osdmap
.is_up(from
) &&
3806 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3807 dout(7) << __func__
<< " ignoring full message from down "
3808 << m
->get_orig_source_inst() << dendl
;
3812 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3814 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3815 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3816 << " " << m
->get_orig_source_inst() << dendl
;
3817 _reply_map(op
, m
->version
);
3821 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3822 << " " << m
->get_orig_source_inst() << dendl
;
3829 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3831 op
->mark_osdmon_event(__func__
);
3832 auto m
= op
->get_req
<MOSDFull
>();
3833 const int from
= m
->get_orig_source().num();
3835 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3836 const unsigned want_state
= m
->state
& mask
; // safety first
3838 unsigned cur_state
= osdmap
.get_state(from
);
3839 auto p
= pending_inc
.new_state
.find(from
);
3840 if (p
!= pending_inc
.new_state
.end()) {
3841 cur_state
^= p
->second
;
3845 set
<string
> want_state_set
, cur_state_set
;
3846 OSDMap::calc_state_set(want_state
, want_state_set
);
3847 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3849 if (cur_state
!= want_state
) {
3850 if (p
!= pending_inc
.new_state
.end()) {
3853 pending_inc
.new_state
[from
] = 0;
3855 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3856 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3857 << " -> " << want_state_set
<< dendl
;
3859 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3860 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3863 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3870 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3872 op
->mark_osdmon_event(__func__
);
3873 auto m
= op
->get_req
<MOSDAlive
>();
3874 int from
= m
->get_orig_source().num();
3876 // check permissions, ignore if failed
3877 MonSession
*session
= op
->get_session();
3880 if (!session
->is_capable("osd", MON_CAP_X
)) {
3881 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3882 << session
->caps
<< dendl
;
3886 if (!osdmap
.is_up(from
) ||
3887 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3888 dout(7) << "preprocess_alive ignoring alive message from down "
3889 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3894 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3896 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3897 _reply_map(op
, m
->version
);
3901 dout(10) << "preprocess_alive want up_thru " << m
->want
3902 << " from " << m
->get_orig_source_inst() << dendl
;
3909 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3911 op
->mark_osdmon_event(__func__
);
3912 auto m
= op
->get_req
<MOSDAlive
>();
3913 int from
= m
->get_orig_source().num();
3915 if (0) { // we probably don't care much about these
3916 mon
.clog
->debug() << m
->get_orig_source_inst() << " alive";
3919 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3920 << " from " << m
->get_orig_source_inst() << dendl
;
3922 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3923 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3927 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3929 op
->mark_osdmon_event(__func__
);
3930 dout(7) << "_reply_map " << e
3931 << " from " << op
->get_req()->get_orig_source_inst()
3937 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3939 op
->mark_osdmon_event(__func__
);
3940 auto m
= op
->get_req
<MOSDPGCreated
>();
3941 dout(10) << __func__
<< " " << *m
<< dendl
;
3942 auto session
= op
->get_session();
3945 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3948 if (!session
->is_capable("osd", MON_CAP_X
)) {
3949 derr
<< __func__
<< " received from entity "
3950 << "with insufficient privileges " << session
->caps
<< dendl
;
3953 // always forward the "created!" to the leader
3957 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3959 op
->mark_osdmon_event(__func__
);
3960 auto m
= op
->get_req
<MOSDPGCreated
>();
3961 dout(10) << __func__
<< " " << *m
<< dendl
;
3962 auto src
= m
->get_orig_source();
3963 auto from
= src
.num();
3964 if (!src
.is_osd() ||
3965 !mon
.osdmon()->osdmap
.is_up(from
) ||
3966 !mon
.osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3967 m
->get_orig_source_addrs())) {
3968 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
3971 pending_created_pgs
.push_back(m
->pgid
);
3975 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3977 op
->mark_osdmon_event(__func__
);
3978 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3979 dout(10) << __func__
<< " " << *m
<< dendl
;
3980 const pg_pool_t
*pi
;
3981 auto session
= op
->get_session();
3983 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3986 if (!session
->is_capable("osd", MON_CAP_X
)) {
3987 derr
<< __func__
<< " received from entity "
3988 << "with insufficient privileges " << session
->caps
<< dendl
;
3991 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3993 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3996 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3997 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
4000 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
4001 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
4004 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
4005 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
4015 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
4017 op
->mark_osdmon_event(__func__
);
4018 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
4019 dout(10) << __func__
<< " " << *m
<< dendl
;
4021 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
4022 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
4024 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
4025 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
4026 p
.get_pg_num_pending() > m
->pgid
.ps()) {
4027 dout(10) << __func__
4028 << " race with concurrent pg_num[_pending] update, will retry"
4030 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
4035 p
.dec_pg_num(m
->pgid
,
4039 m
->last_epoch_started
,
4040 m
->last_epoch_clean
);
4041 p
.last_change
= pending_inc
.epoch
;
4043 // back off the merge attempt!
4044 p
.set_pg_num_pending(p
.get_pg_num());
4047 // force pre-nautilus clients to resend their ops, since they
4048 // don't understand pg_num_pending changes form a new interval
4049 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
4051 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
4053 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
4056 prob
> (double)(rand() % 1000)/1000.0) {
4057 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
4058 auto n
= new MMonCommand(mon
.monmap
->get_fsid());
4059 n
->set_connection(m
->get_connection());
4060 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4061 osdmap
.get_pool_name(m
->pgid
.pool()) +
4062 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4063 stringify(m
->pgid
.ps() + 1) + "\"}" };
4064 MonOpRequestRef nop
= mon
.op_tracker
.create_request
<MonOpRequest
>(n
);
4065 nop
->set_type_service();
4066 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
4068 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
4077 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
4079 auto m
= op
->get_req
<MOSDPGTemp
>();
4080 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
4081 mempool::osdmap::vector
<int> empty
;
4082 int from
= m
->get_orig_source().num();
4083 size_t ignore_cnt
= 0;
4086 MonSession
*session
= op
->get_session();
4089 if (!session
->is_capable("osd", MON_CAP_X
)) {
4090 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4091 << session
->caps
<< dendl
;
4095 if (!osdmap
.is_up(from
) ||
4096 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
4097 dout(7) << "ignoring pgtemp message from down "
4098 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
4107 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4108 dout(20) << " " << p
->first
4109 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
4110 << " -> " << p
->second
<< dendl
;
4112 // does the pool exist?
4113 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
4115 * 1. If the osdmap does not have the pool, it means the pool has been
4116 * removed in-between the osd sending this message and us handling it.
4117 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4118 * not exist in the pending either, as the osds would not send a
4119 * message about a pool they know nothing about (yet).
4120 * 3. However, if the pool does exist in the pending, then it must be a
4121 * new pool, and not relevant to this message (see 1).
4123 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4124 << ": pool has been removed" << dendl
;
4129 int acting_primary
= -1;
4130 osdmap
.pg_to_up_acting_osds(
4131 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
4132 if (acting_primary
!= from
) {
4133 /* If the source isn't the primary based on the current osdmap, we know
4134 * that the interval changed and that we can discard this message.
4135 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4136 * which of two pg temp mappings on the same pg is more recent.
4138 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4139 << ": primary has changed" << dendl
;
4145 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
4146 osdmap
.primary_temp
->count(p
->first
)))
4149 // NOTE: we assume that this will clear pg_primary, so consider
4150 // an existing pg_primary field to imply a change
4151 if (p
->second
.size() &&
4152 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
4153 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
4154 osdmap
.primary_temp
->count(p
->first
)))
4158 // should we ignore all the pgs?
4159 if (ignore_cnt
== m
->pg_temp
.size())
4162 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
4163 _reply_map(op
, m
->map_epoch
);
4171 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4173 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4174 auto ut
= pending_inc
.new_up_thru
.find(from
);
4175 if (ut
!= pending_inc
.new_up_thru
.end()) {
4176 old_up_thru
= ut
->second
;
4178 if (up_thru
> old_up_thru
) {
4179 // set up_thru too, so the osd doesn't have to ask again
4180 pending_inc
.new_up_thru
[from
] = up_thru
;
4184 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4186 op
->mark_osdmon_event(__func__
);
4187 auto m
= op
->get_req
<MOSDPGTemp
>();
4188 int from
= m
->get_orig_source().num();
4189 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4190 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4191 uint64_t pool
= p
->first
.pool();
4192 if (pending_inc
.old_pools
.count(pool
)) {
4193 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4194 << ": pool pending removal" << dendl
;
4197 if (!osdmap
.have_pg_pool(pool
)) {
4198 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4199 << ": pool has been removed" << dendl
;
4202 pending_inc
.new_pg_temp
[p
->first
] =
4203 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4205 // unconditionally clear pg_primary (until this message can encode
4206 // a change for that, too.. at which point we need to also fix
4207 // preprocess_pg_temp)
4208 if (osdmap
.primary_temp
->count(p
->first
) ||
4209 pending_inc
.new_primary_temp
.count(p
->first
))
4210 pending_inc
.new_primary_temp
[p
->first
] = -1;
4213 // set up_thru too, so the osd doesn't have to ask again
4214 update_up_thru(from
, m
->map_epoch
);
4216 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
4223 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4225 op
->mark_osdmon_event(__func__
);
4226 auto m
= op
->get_req
<MRemoveSnaps
>();
4227 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4229 // check privilege, ignore if failed
4230 MonSession
*session
= op
->get_session();
4234 if (!session
->caps
.is_capable(
4236 session
->entity_name
,
4237 "osd", "osd pool rmsnap", {}, true, true, false,
4238 session
->get_peer_socket_addr())) {
4239 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4240 << session
->caps
<< dendl
;
4244 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4245 q
!= m
->snaps
.end();
4247 if (!osdmap
.have_pg_pool(q
->first
)) {
4248 dout(10) << " ignoring removed_snaps " << q
->second
4249 << " on non-existent pool " << q
->first
<< dendl
;
4252 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4253 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4254 p
!= q
->second
.end();
4256 if (*p
> pi
->get_snap_seq() ||
4257 !_is_removed_snap(q
->first
, *p
)) {
4263 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4264 auto reply
= make_message
<MRemoveSnaps
>();
4265 reply
->snaps
= m
->snaps
;
4266 mon
.send_reply(op
, reply
.detach());
4273 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4275 op
->mark_osdmon_event(__func__
);
4276 auto m
= op
->get_req
<MRemoveSnaps
>();
4277 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4279 for (auto& [pool
, snaps
] : m
->snaps
) {
4280 if (!osdmap
.have_pg_pool(pool
)) {
4281 dout(10) << " ignoring removed_snaps " << snaps
4282 << " on non-existent pool " << pool
<< dendl
;
4286 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4287 for (auto s
: snaps
) {
4288 if (!_is_removed_snap(pool
, s
) &&
4289 (!pending_inc
.new_pools
.count(pool
) ||
4290 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4291 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4292 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4293 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4294 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4295 newpi
->removed_snaps
.insert(s
);
4296 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4297 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4299 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4300 if (s
> newpi
->get_snap_seq()) {
4301 dout(10) << " pool " << pool
<< " snap_seq "
4302 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4303 newpi
->set_snap_seq(s
);
4305 newpi
->set_snap_epoch(pending_inc
.epoch
);
4306 dout(10) << " added pool " << pool
<< " snap " << s
4307 << " to removed_snaps queue" << dendl
;
4308 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4313 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4314 auto reply
= make_message
<MRemoveSnaps
>();
4315 reply
->snaps
= m
->snaps
;
4316 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
4322 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4324 op
->mark_osdmon_event(__func__
);
4325 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4326 dout(7) << __func__
<< " " << *m
<< dendl
;
4328 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4330 string k
= make_purged_snap_epoch_key(m
->start
);
4331 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
4333 unsigned long epoch
= m
->last
;
4334 while (it
->valid()) {
4335 if (it
->key().find("purged_epoch_") != 0) {
4338 string k
= it
->key();
4339 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4341 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
4342 } else if (epoch
> m
->last
) {
4345 bufferlist bl
= it
->value();
4346 auto p
= bl
.cbegin();
4350 } catch (ceph::buffer::error
& e
) {
4351 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
4356 n
+= 4 + v
.size() * 16;
4359 // impose a semi-arbitrary limit to message size
4365 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4366 reply
->purged_snaps
.swap(r
);
4367 mon
.send_reply(op
, reply
.detach());
4373 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4375 op
->mark_osdmon_event(__func__
);
4377 auto session
= op
->get_session();
4380 dout(10) << __func__
<< " no monitor session!" << dendl
;
4383 if (!session
->is_capable("osd", MON_CAP_X
)) {
4384 derr
<< __func__
<< " received from entity "
4385 << "with insufficient privileges " << session
->caps
<< dendl
;
4388 // Always forward the beacon to the leader, even if they are the same as
4389 // the old one. The leader will mark as down osds that haven't sent
4390 // beacon for a few minutes.
4394 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4396 op
->mark_osdmon_event(__func__
);
4397 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4398 const auto src
= beacon
->get_orig_source();
4399 dout(10) << __func__
<< " " << *beacon
4400 << " from " << src
<< dendl
;
4401 int from
= src
.num();
4403 if (!src
.is_osd() ||
4404 !osdmap
.is_up(from
) ||
4405 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4406 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4407 // share some new maps with this guy in case it may not be
4408 // aware of its own deadness...
4409 send_latest(op
, beacon
->version
+1);
4411 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4415 last_osd_report
[from
].first
= ceph_clock_now();
4416 last_osd_report
[from
].second
= beacon
->osd_beacon_report_interval
;
4417 osd_epochs
[from
] = beacon
->version
;
4419 for (const auto& pg
: beacon
->pgs
) {
4420 if (auto* pool
= osdmap
.get_pg_pool(pg
.pool()); pool
!= nullptr) {
4421 unsigned pg_num
= pool
->get_pg_num();
4422 last_epoch_clean
.report(pg_num
, pg
, beacon
->min_last_epoch_clean
);
4426 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4427 beacon
->last_purged_snaps_scrub
) {
4428 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4429 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4431 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4432 beacon
->last_purged_snaps_scrub
;
4442 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4444 op
->mark_osdmon_event(__func__
);
4445 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4446 << " start " << start
<< dendl
;
4450 send_incremental(op
, start
);
4454 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
4456 MOSDMap
*r
= new MOSDMap(mon
.monmap
->fsid
, features
);
4457 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4458 r
->oldest_map
= get_first_committed();
4459 r
->newest_map
= osdmap
.get_epoch();
4463 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4465 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4466 << std::hex
<< features
<< std::dec
<< dendl
;
4467 MOSDMap
*m
= new MOSDMap(mon
.monmap
->fsid
, features
);
4468 m
->oldest_map
= get_first_committed();
4469 m
->newest_map
= osdmap
.get_epoch();
4471 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4473 int err
= get_version(e
, features
, bl
);
4475 ceph_assert(bl
.length());
4476 // if (get_version(e, bl) > 0) {
4477 dout(20) << "build_incremental inc " << e
<< " "
4478 << bl
.length() << " bytes" << dendl
;
4479 m
->incremental_maps
[e
] = bl
;
4481 ceph_assert(err
== -ENOENT
);
4482 ceph_assert(!bl
.length());
4483 get_version_full(e
, features
, bl
);
4484 if (bl
.length() > 0) {
4485 //else if (get_version("full", e, bl) > 0) {
4486 dout(20) << "build_incremental full " << e
<< " "
4487 << bl
.length() << " bytes" << dendl
;
4490 ceph_abort(); // we should have all maps.
4497 void OSDMonitor::send_full(MonOpRequestRef op
)
4499 op
->mark_osdmon_event(__func__
);
4500 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
4501 mon
.send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4504 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4506 op
->mark_osdmon_event(__func__
);
4508 MonSession
*s
= op
->get_session();
4512 // oh, we can tell the other mon to do it
4513 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4515 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4516 r
->send_osdmap_first
= first
;
4517 s
->proxy_con
->send_message(r
);
4518 op
->mark_event("reply: send routed send_osdmap_first reply");
4521 send_incremental(first
, s
, false, op
);
4525 void OSDMonitor::send_incremental(epoch_t first
,
4526 MonSession
*session
,
4528 MonOpRequestRef req
)
4530 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4531 << " to " << session
->name
<< dendl
;
4533 // get feature of the peer
4534 // use quorum_con_features, if it's an anonymous connection.
4535 uint64_t features
= session
->con_features
? session
->con_features
:
4536 mon
.get_quorum_con_features();
4538 if (first
<= session
->osd_epoch
) {
4539 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4540 << session
->osd_epoch
<< dendl
;
4541 first
= session
->osd_epoch
+ 1;
4544 if (first
< get_first_committed()) {
4545 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4546 m
->oldest_map
= get_first_committed();
4547 m
->newest_map
= osdmap
.get_epoch();
4549 first
= get_first_committed();
4551 int err
= get_version_full(first
, features
, bl
);
4552 ceph_assert(err
== 0);
4553 ceph_assert(bl
.length());
4554 dout(20) << "send_incremental starting with base full "
4555 << first
<< " " << bl
.length() << " bytes" << dendl
;
4556 m
->maps
[first
] = bl
;
4559 mon
.send_reply(req
, m
);
4560 session
->osd_epoch
= first
;
4563 session
->con
->send_message(m
);
4564 session
->osd_epoch
= first
;
4569 while (first
<= osdmap
.get_epoch()) {
4570 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4571 osdmap
.get_epoch());
4572 MOSDMap
*m
= build_incremental(first
, last
, features
);
4575 // send some maps. it may not be all of them, but it will get them
4577 mon
.send_reply(req
, m
);
4579 session
->con
->send_message(m
);
4582 session
->osd_epoch
= last
;
4588 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4590 return get_version(ver
, mon
.get_quorum_con_features(), bl
);
4593 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4595 OSDMap::Incremental inc
;
4596 auto q
= bl
.cbegin();
4598 // always encode with subset of osdmap's canonical features
4599 uint64_t f
= features
& inc
.encode_features
;
4600 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4603 if (inc
.fullmap
.length()) {
4604 // embedded full map?
4606 m
.decode(inc
.fullmap
);
4607 inc
.fullmap
.clear();
4608 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4610 if (inc
.crush
.length()) {
4611 // embedded crush map
4613 auto p
= inc
.crush
.cbegin();
4616 c
.encode(inc
.crush
, f
);
4618 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4621 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4624 auto q
= bl
.cbegin();
4626 // always encode with subset of osdmap's canonical features
4627 uint64_t f
= features
& m
.get_encoding_features();
4628 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4631 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4634 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
4636 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4637 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4640 int ret
= PaxosService::get_version(ver
, bl
);
4644 // NOTE: this check is imprecise; the OSDMap encoding features may
4645 // be a subset of the latest mon quorum features, but worst case we
4646 // reencode once and then cache the (identical) result under both
4648 if (significant_features
!=
4649 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4650 reencode_incremental_map(bl
, features
);
4652 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4656 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4659 int err
= get_version(ver
, inc_bl
);
4660 ceph_assert(err
== 0);
4661 ceph_assert(inc_bl
.length());
4663 auto p
= inc_bl
.cbegin();
4665 dout(10) << __func__
<< " "
4666 << " epoch " << inc
.epoch
4667 << " inc_crc " << inc
.inc_crc
4668 << " full_crc " << inc
.full_crc
4669 << " encode_features " << inc
.encode_features
<< dendl
;
4673 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4675 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4677 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4678 if (closest_pinned
== 0) {
4681 if (closest_pinned
> ver
) {
4682 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4684 ceph_assert(closest_pinned
<= ver
);
4686 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4688 // get osdmap incremental maps and apply on top of this one.
4690 bool has_cached_osdmap
= false;
4691 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4692 if (full_osd_cache
.lookup({v
, mon
.get_quorum_con_features()},
4694 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4696 has_cached_osdmap
= true;
4701 if (!has_cached_osdmap
) {
4702 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4704 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4705 << " not available! error: " << cpp_strerror(err
) << dendl
;
4707 ceph_assert(err
== 0);
4710 ceph_assert(osdm_bl
.length());
4713 osdm
.decode(osdm_bl
);
4715 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4716 << " e" << osdm
.epoch
4717 << " crc " << osdm
.get_crc()
4718 << " -- applying incremental maps." << dendl
;
4720 uint64_t encode_features
= 0;
4721 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4722 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4724 OSDMap::Incremental inc
;
4725 int err
= get_inc(v
, inc
);
4726 ceph_assert(err
== 0);
4728 encode_features
= inc
.encode_features
;
4730 err
= osdm
.apply_incremental(inc
);
4731 ceph_assert(err
== 0);
4733 // this block performs paranoid checks on map retrieval
4734 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4735 inc
.full_crc
!= 0) {
4737 uint64_t f
= encode_features
;
4739 f
= (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4742 // encode osdmap to force calculating crcs
4744 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4745 // decode osdmap to compare crcs with what's expected by incremental
4749 if (tosdm
.get_crc() != inc
.full_crc
) {
4751 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4752 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4753 ceph_abort_msg("osdmap crc mismatch");
4757 // note: we cannot add the recently computed map to the cache, as is,
4758 // because we have not encoded the map into a bl.
4761 if (!encode_features
) {
4762 dout(10) << __func__
4763 << " last incremental map didn't have features;"
4764 << " defaulting to quorum's or all" << dendl
;
4766 (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4768 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
4773 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4775 return get_version_full(ver
, mon
.get_quorum_con_features(), bl
);
4778 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4781 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4782 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4785 int ret
= PaxosService::get_version_full(ver
, bl
);
4786 if (ret
== -ENOENT
) {
4788 ret
= get_full_from_pinned_map(ver
, bl
);
4793 // NOTE: this check is imprecise; the OSDMap encoding features may
4794 // be a subset of the latest mon quorum features, but worst case we
4795 // reencode once and then cache the (identical) result under both
4797 if (significant_features
!=
4798 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4799 reencode_full_map(bl
, features
);
4801 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4805 epoch_t
OSDMonitor::blocklist(const entity_addrvec_t
& av
, utime_t until
)
4807 dout(10) << "blocklist " << av
<< " until " << until
<< dendl
;
4808 for (auto a
: av
.v
) {
4809 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4810 a
.set_type(entity_addr_t::TYPE_ANY
);
4812 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4814 pending_inc
.new_blocklist
[a
] = until
;
4816 return pending_inc
.epoch
;
4819 epoch_t
OSDMonitor::blocklist(entity_addr_t a
, utime_t until
)
4821 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4822 a
.set_type(entity_addr_t::TYPE_ANY
);
4824 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4826 dout(10) << "blocklist " << a
<< " until " << until
<< dendl
;
4827 pending_inc
.new_blocklist
[a
] = until
;
4828 return pending_inc
.epoch
;
4832 void OSDMonitor::check_osdmap_subs()
4834 dout(10) << __func__
<< dendl
;
4835 if (!osdmap
.get_epoch()) {
4838 auto osdmap_subs
= mon
.session_map
.subs
.find("osdmap");
4839 if (osdmap_subs
== mon
.session_map
.subs
.end()) {
4842 auto p
= osdmap_subs
->second
->begin();
4846 check_osdmap_sub(sub
);
4850 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4852 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4853 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4854 if (sub
->next
<= osdmap
.get_epoch()) {
4856 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4858 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4860 mon
.session_map
.remove_sub(sub
);
4862 sub
->next
= osdmap
.get_epoch() + 1;
4866 void OSDMonitor::check_pg_creates_subs()
4868 if (!osdmap
.get_num_up_osds()) {
4871 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4872 mon
.with_session_map([this](const MonSessionMap
& session_map
) {
4873 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4874 if (pg_creates_subs
== session_map
.subs
.end()) {
4877 for (auto sub
: *pg_creates_subs
->second
) {
4878 check_pg_creates_sub(sub
);
4883 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4885 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4886 ceph_assert(sub
->type
== "osd_pg_creates");
4887 // only send these if the OSD is up. we will check_subs() when they do
4888 // come up so they will get the creates then.
4889 if (sub
->session
->name
.is_osd() &&
4890 mon
.osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4891 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4892 sub
->session
->con
.get(),
4897 void OSDMonitor::do_application_enable(int64_t pool_id
,
4898 const std::string
&app_name
,
4899 const std::string
&app_key
,
4900 const std::string
&app_value
,
4903 ceph_assert(paxos
.is_plugged() && is_writeable());
4905 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4908 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4910 auto pp
= osdmap
.get_pg_pool(pool_id
);
4911 ceph_assert(pp
!= nullptr);
4914 if (pending_inc
.new_pools
.count(pool_id
)) {
4915 p
= pending_inc
.new_pools
[pool_id
];
4918 if (app_key
.empty()) {
4919 p
.application_metadata
.insert({app_name
, {}});
4922 p
.application_metadata
[app_name
][app_key
] = app_value
;
4924 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4927 p
.last_change
= pending_inc
.epoch
;
4928 pending_inc
.new_pools
[pool_id
] = p
;
4931 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4932 pool_opts_t::key_t opt
,
4933 pool_opts_t::value_t val
)
4935 dout(10) << __func__
<< " pool: " << pool_id
<< " option: " << opt
4936 << " val: " << val
<< dendl
;
4937 auto p
= pending_inc
.new_pools
.try_emplace(
4938 pool_id
, *osdmap
.get_pg_pool(pool_id
));
4939 p
.first
->second
.opts
.set(opt
, val
);
4942 unsigned OSDMonitor::scan_for_creating_pgs(
4943 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4944 const mempool::osdmap::set
<int64_t>& removed_pools
,
4946 creating_pgs_t
* creating_pgs
) const
4948 unsigned queued
= 0;
4949 for (auto& p
: pools
) {
4950 int64_t poolid
= p
.first
;
4951 if (creating_pgs
->created_pools
.count(poolid
)) {
4952 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4955 const pg_pool_t
& pool
= p
.second
;
4956 int ruleno
= pool
.get_crush_rule();
4957 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4960 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4961 const auto created
= pool
.get_last_change();
4962 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4963 dout(10) << __func__
<< " no change in pool " << poolid
4964 << " " << pool
<< dendl
;
4967 if (removed_pools
.count(poolid
)) {
4968 dout(10) << __func__
<< " pool is being removed: " << poolid
4969 << " " << pool
<< dendl
;
4972 dout(10) << __func__
<< " queueing pool create for " << poolid
4973 << " " << pool
<< dendl
;
4974 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
4981 void OSDMonitor::update_creating_pgs()
4983 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4984 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4985 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
4986 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4987 for (const auto& pg
: creating_pgs
.pgs
) {
4988 int acting_primary
= -1;
4989 auto pgid
= pg
.first
;
4990 if (!osdmap
.pg_exists(pgid
)) {
4991 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4995 auto mapped
= pg
.second
.create_epoch
;
4996 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4998 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4999 // check the previous creating_pgs, look for the target to whom the pg was
5000 // previously mapped
5001 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
5002 const auto last_acting_primary
= pgs_by_epoch
.first
;
5003 for (auto& pgs
: pgs_by_epoch
.second
) {
5004 if (pgs
.second
.count(spgid
)) {
5005 if (last_acting_primary
== acting_primary
) {
5008 dout(20) << __func__
<< " " << pgid
<< " "
5009 << " acting_primary:" << last_acting_primary
5010 << " -> " << acting_primary
<< dendl
;
5011 // note epoch if the target of the create message changed.
5012 mapped
= mapping
.get_epoch();
5017 mapped
= mapping
.get_epoch();
5021 dout(10) << __func__
<< " will instruct osd." << acting_primary
5022 << " to create " << pgid
<< "@" << mapped
<< dendl
;
5023 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
5025 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
5026 creating_pgs_epoch
= mapping
.get_epoch();
5029 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
5031 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
5032 << " " << creating_pgs_by_osd_epoch
<< dendl
;
5033 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
5034 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
5035 dout(20) << __func__
5036 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
5037 // the subscribers will be updated when the mapping is completed anyway
5040 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
5041 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
5043 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
5045 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
5046 MOSDPGCreate2
*m
= nullptr;
5048 bool old
= osdmap
.require_osd_release
< ceph_release_t::nautilus
;
5051 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
5052 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
5053 auto epoch
= epoch_pgs
->first
;
5054 auto& pgs
= epoch_pgs
->second
;
5055 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5056 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
5058 for (auto& pg
: pgs
) {
5059 // Need the create time from the monitor using its clock to set
5060 // last_scrub_stamp upon pg creation.
5061 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
5062 ceph_assert(create
!= creating_pgs
.pgs
.end());
5065 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
5067 oldm
->mkpg
.emplace(pg
.pgid
,
5068 pg_create_t
{create
->second
.create_epoch
, pg
.pgid
, 0});
5069 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.create_stamp
);
5072 m
= new MOSDPGCreate2(creating_pgs_epoch
);
5074 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
5075 create
->second
.create_stamp
));
5076 if (create
->second
.history
.epoch_created
) {
5077 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
5078 << " " << create
->second
.past_intervals
<< dendl
;
5079 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
5080 create
->second
.past_intervals
));
5083 dout(20) << __func__
<< " will create " << pg
5084 << " at " << create
->second
.create_epoch
<< dendl
;
5088 con
->send_message(m
);
5090 con
->send_message(oldm
);
5092 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5093 << " has nothing to send" << dendl
;
5097 // sub is current through last + 1
5104 void OSDMonitor::tick()
5106 if (!is_active()) return;
5108 dout(10) << osdmap
<< dendl
;
5110 // always update osdmap manifest, regardless of being the leader.
5111 load_osdmap_manifest();
5113 // always tune priority cache manager memory on leader and peons
5114 if (ceph_using_tcmalloc() && mon_memory_autotune
) {
5115 std::lock_guard
l(balancer_lock
);
5116 if (pcm
!= nullptr) {
5119 _set_new_cache_sizes();
5120 dout(10) << "tick balancer "
5121 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
5122 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
5123 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
5124 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
5126 dout(10) << "tick balancer "
5127 << " full cache_bytes: " << full_cache
->get_cache_bytes()
5128 << " full comtd_bytes: " << full_cache
->get_committed_size()
5129 << " full used_bytes: " << full_cache
->_get_used_bytes()
5130 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
5135 if (!mon
.is_leader()) return;
5137 bool do_propose
= false;
5138 utime_t now
= ceph_clock_now();
5140 if (handle_osd_timeouts(now
, last_osd_report
)) {
5145 if (check_failures(now
)) {
5149 // Force a proposal if we need to prune; pruning is performed on
5150 // ``encode_pending()``, hence why we need to regularly trigger a proposal
5151 // even if there's nothing going on.
5152 if (is_prune_enabled() && should_prune()) {
5156 // mark down osds out?
5158 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
5159 * influence at all. The decision is made based on the ratio of "in" osds,
5160 * and the function returns false if this ratio is lower that the minimum
5161 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
5163 if (can_mark_out(-1)) {
5164 string down_out_subtree_limit
= g_conf().get_val
<string
>(
5165 "mon_osd_down_out_subtree_limit");
5166 set
<int> down_cache
; // quick cache of down subtrees
5168 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
5169 while (i
!= down_pending_out
.end()) {
5175 if (osdmap
.is_down(o
) &&
5178 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
5179 utime_t grace
= orig_grace
;
5180 double my_grace
= 0.0;
5182 if (g_conf()->mon_osd_adjust_down_out_interval
) {
5183 // scale grace period the same way we do the heartbeat grace.
5184 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
5185 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
5186 double decay_k
= ::log(.5) / halflife
;
5187 double decay
= exp((double)down
* decay_k
);
5188 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
5189 << " down for " << down
<< " decay " << decay
<< dendl
;
5190 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
5194 // is this an entire large subtree down?
5195 if (down_out_subtree_limit
.length()) {
5196 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
5198 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
5199 dout(10) << "tick entire containing " << down_out_subtree_limit
5200 << " subtree for osd." << o
5201 << " is down; resetting timer" << dendl
;
5202 // reset timer, too.
5203 down_pending_out
[o
] = now
;
5209 bool down_out
= !osdmap
.is_destroyed(o
) &&
5210 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5211 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5212 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5213 // this is not precise enough as we did not make a note when this osd
5214 // was marked as destroyed, but let's not bother with that
5215 // complexity for now.
5216 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5217 if (down_out
|| destroyed_out
) {
5218 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5219 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5220 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5222 // set the AUTOOUT bit.
5223 if (pending_inc
.new_state
.count(o
) == 0)
5224 pending_inc
.new_state
[o
] = 0;
5225 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5227 // remember previous weight
5228 if (pending_inc
.new_xinfo
.count(o
) == 0)
5229 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5230 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5234 mon
.clog
->info() << "Marking osd." << o
<< " out (has been down for "
5235 << int(down
.sec()) << " seconds)";
5240 down_pending_out
.erase(o
);
5243 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5246 // expire blocklisted items?
5247 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
5248 p
!= osdmap
.blocklist
.end();
5250 if (p
->second
< now
) {
5251 dout(10) << "expiring blocklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5252 pending_inc
.old_blocklist
.push_back(p
->first
);
5256 for (auto p
= osdmap
.range_blocklist
.begin();
5257 p
!= osdmap
.range_blocklist
.end();
5259 if (p
->second
< now
) {
5260 dout(10) << "expiring range_blocklist item " << p
->first
5261 << " expired " << p
->second
<< " < now " << now
<< dendl
;
5262 pending_inc
.old_range_blocklist
.push_back(p
->first
);
5267 if (try_prune_purged_snaps()) {
5271 if (update_pools_status())
5275 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
5279 void OSDMonitor::_set_new_cache_sizes()
5281 uint64_t cache_size
= 0;
5282 int64_t inc_alloc
= 0;
5283 int64_t full_alloc
= 0;
5284 int64_t kv_alloc
= 0;
5286 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5287 cache_size
= pcm
->get_tuned_mem();
5288 inc_alloc
= inc_cache
->get_committed_size();
5289 full_alloc
= full_cache
->get_committed_size();
5290 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
5293 inc_osd_cache
.set_bytes(inc_alloc
);
5294 full_osd_cache
.set_bytes(full_alloc
);
5296 dout(1) << __func__
<< " cache_size:" << cache_size
5297 << " inc_alloc: " << inc_alloc
5298 << " full_alloc: " << full_alloc
5299 << " kv_alloc: " << kv_alloc
5303 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5304 std::map
<int, std::pair
<utime_t
, int>> &last_osd_report
)
5306 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
5307 if (now
- mon
.get_leader_since() < timeo
) {
5308 // We haven't been the leader for long enough to consider OSD timeouts
5312 int max_osd
= osdmap
.get_max_osd();
5313 bool new_down
= false;
5315 for (int i
=0; i
< max_osd
; ++i
) {
5316 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5317 if (!osdmap
.exists(i
)) {
5318 last_osd_report
.erase(i
); // if any
5321 if (!osdmap
.is_up(i
))
5323 const std::map
<int, std::pair
<utime_t
, int>>::const_iterator t
= last_osd_report
.find(i
);
5324 if (t
== last_osd_report
.end()) {
5325 // it wasn't in the map; start the timer.
5326 last_osd_report
[i
].first
= now
;
5327 last_osd_report
[i
].second
= 0;
5328 } else if (can_mark_down(i
)) {
5329 utime_t diff
= now
- t
->second
.first
;
5330 // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
5331 // to allow for the osd to miss a beacon.
5332 int mon_osd_report_timeout
= g_conf()->mon_osd_report_timeout
;
5333 utime_t
max_timeout(std::max(mon_osd_report_timeout
, 2 * t
->second
.second
), 0);
5334 if (diff
> max_timeout
) {
5335 mon
.clog
->info() << "osd." << i
<< " marked down after no beacon for "
5336 << diff
<< " seconds";
5337 derr
<< "no beacon from osd." << i
<< " since " << t
->second
.first
5338 << ", " << diff
<< " seconds ago. marking down" << dendl
;
5339 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
5347 static void dump_cpu_list(Formatter
*f
, const char *name
,
5348 const string
& strlist
)
5351 size_t cpu_set_size
;
5352 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
5355 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5356 f
->open_array_section(name
);
5357 for (auto cpu
: cpus
) {
5358 f
->dump_int("cpu", cpu
);
5363 void OSDMonitor::dump_info(Formatter
*f
)
5365 f
->open_object_section("osdmap");
5369 f
->open_array_section("osd_metadata");
5370 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5371 if (osdmap
.exists(i
)) {
5372 f
->open_object_section("osd");
5373 f
->dump_unsigned("id", i
);
5374 dump_osd_metadata(i
, f
, NULL
);
5380 f
->open_object_section("osdmap_clean_epochs");
5381 f
->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5383 f
->open_object_section("last_epoch_clean");
5384 last_epoch_clean
.dump(f
);
5387 f
->open_array_section("osd_epochs");
5388 for (auto& osd_epoch
: osd_epochs
) {
5389 f
->open_object_section("osd");
5390 f
->dump_unsigned("id", osd_epoch
.first
);
5391 f
->dump_unsigned("epoch", osd_epoch
.second
);
5394 f
->close_section(); // osd_epochs
5396 f
->close_section(); // osd_clean_epochs
5398 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5399 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5401 f
->open_object_section("crushmap");
5402 osdmap
.crush
->dump(f
);
5405 if (has_osdmap_manifest
) {
5406 f
->open_object_section("osdmap_manifest");
5407 osdmap_manifest
.dump(f
);
5413 enum osd_pool_get_choices
{
5415 PG_NUM
, PGP_NUM
, CRUSH_RULE
, HASHPSPOOL
, EC_OVERWRITES
,
5416 NODELETE
, NOPGCHANGE
, NOSIZECHANGE
,
5417 WRITE_FADVISE_DONTNEED
, NOSCRUB
, NODEEP_SCRUB
,
5418 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5419 USE_GMT_HITSET
, TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
,
5420 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5421 CACHE_TARGET_FULL_RATIO
,
5422 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5423 ERASURE_CODE_PROFILE
, MIN_READ_RECENCY_FOR_PROMOTE
,
5424 MIN_WRITE_RECENCY_FOR_PROMOTE
, FAST_READ
,
5425 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
,
5426 SCRUB_MIN_INTERVAL
, SCRUB_MAX_INTERVAL
, DEEP_SCRUB_INTERVAL
,
5427 RECOVERY_PRIORITY
, RECOVERY_OP_PRIORITY
, SCRUB_PRIORITY
,
5428 COMPRESSION_MODE
, COMPRESSION_ALGORITHM
, COMPRESSION_REQUIRED_RATIO
,
5429 COMPRESSION_MAX_BLOB_SIZE
, COMPRESSION_MIN_BLOB_SIZE
,
5430 CSUM_TYPE
, CSUM_MAX_BLOCK
, CSUM_MIN_BLOCK
, FINGERPRINT_ALGORITHM
,
5431 PG_AUTOSCALE_MODE
, PG_NUM_MIN
, TARGET_SIZE_BYTES
, TARGET_SIZE_RATIO
,
5432 PG_AUTOSCALE_BIAS
, DEDUP_TIER
, DEDUP_CHUNK_ALGORITHM
,
5433 DEDUP_CDC_CHUNK_SIZE
, POOL_EIO
, BULK
, PG_NUM_MAX
};
5435 std::set
<osd_pool_get_choices
>
5436 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5437 const std::set
<osd_pool_get_choices
>& second
)
5439 std::set
<osd_pool_get_choices
> result
;
5440 std::set_difference(first
.begin(), first
.end(),
5441 second
.begin(), second
.end(),
5442 std::inserter(result
, result
.end()));
5448 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5450 op
->mark_osdmon_event(__func__
);
5451 auto m
= op
->get_req
<MMonCommand
>();
5454 stringstream ss
, ds
;
5457 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5458 string rs
= ss
.str();
5459 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
5463 MonSession
*session
= op
->get_session();
5465 derr
<< __func__
<< " no session" << dendl
;
5466 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
5471 cmd_getval(cmdmap
, "prefix", prefix
);
5473 string format
= cmd_getval_or
<string
>(cmdmap
, "format", "plain");
5474 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5476 if (prefix
== "osd stat") {
5478 f
->open_object_section("osdmap");
5479 osdmap
.print_summary(f
.get(), ds
, "", true);
5483 osdmap
.print_summary(nullptr, ds
, "", true);
5487 else if (prefix
== "osd dump" ||
5488 prefix
== "osd tree" ||
5489 prefix
== "osd tree-from" ||
5490 prefix
== "osd ls" ||
5491 prefix
== "osd getmap" ||
5492 prefix
== "osd getcrushmap" ||
5493 prefix
== "osd ls-tree" ||
5494 prefix
== "osd info") {
5496 epoch_t epoch
= cmd_getval_or
<int64_t>(cmdmap
, "epoch", osdmap
.get_epoch());
5497 bufferlist osdmap_bl
;
5498 int err
= get_version_full(epoch
, osdmap_bl
);
5499 if (err
== -ENOENT
) {
5501 ss
<< "there is no map for epoch " << epoch
;
5504 ceph_assert(err
== 0);
5505 ceph_assert(osdmap_bl
.length());
5508 if (epoch
== osdmap
.get_epoch()) {
5512 p
->decode(osdmap_bl
);
5515 auto sg
= make_scope_guard([&] {
5521 if (prefix
== "osd dump") {
5524 f
->open_object_section("osdmap");
5534 } else if (prefix
== "osd ls") {
5536 f
->open_array_section("osds");
5537 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5538 if (osdmap
.exists(i
)) {
5539 f
->dump_int("osd", i
);
5546 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5547 if (osdmap
.exists(i
)) {
5556 } else if (prefix
== "osd info") {
5558 bool do_single_osd
= true;
5559 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5560 do_single_osd
= false;
5563 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5564 ss
<< "osd." << osd_id
<< " does not exist";
5570 if (do_single_osd
) {
5571 osdmap
.dump_osd(osd_id
, f
.get());
5573 osdmap
.dump_osds(f
.get());
5577 if (do_single_osd
) {
5578 osdmap
.print_osd(osd_id
, ds
);
5580 osdmap
.print_osds(ds
);
5584 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5586 if (prefix
== "osd tree-from") {
5587 cmd_getval(cmdmap
, "bucket", bucket
);
5588 if (!osdmap
.crush
->name_exists(bucket
)) {
5589 ss
<< "bucket '" << bucket
<< "' does not exist";
5593 int id
= osdmap
.crush
->get_item_id(bucket
);
5595 ss
<< "\"" << bucket
<< "\" is not a bucket";
5601 vector
<string
> states
;
5602 cmd_getval(cmdmap
, "states", states
);
5603 unsigned filter
= 0;
5604 for (auto& s
: states
) {
5606 filter
|= OSDMap::DUMP_UP
;
5607 } else if (s
== "down") {
5608 filter
|= OSDMap::DUMP_DOWN
;
5609 } else if (s
== "in") {
5610 filter
|= OSDMap::DUMP_IN
;
5611 } else if (s
== "out") {
5612 filter
|= OSDMap::DUMP_OUT
;
5613 } else if (s
== "destroyed") {
5614 filter
|= OSDMap::DUMP_DESTROYED
;
5616 ss
<< "unrecognized state '" << s
<< "'";
5621 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5622 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5623 ss
<< "cannot specify both 'in' and 'out'";
5627 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5628 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5629 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5630 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5631 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5632 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5633 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5638 f
->open_object_section("tree");
5639 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5643 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5646 } else if (prefix
== "osd getmap") {
5647 rdata
.append(osdmap_bl
);
5648 ss
<< "got osdmap epoch " << p
->get_epoch();
5649 } else if (prefix
== "osd getcrushmap") {
5650 p
->crush
->encode(rdata
, mon
.get_quorum_con_features());
5651 ss
<< p
->get_crush_version();
5652 } else if (prefix
== "osd ls-tree") {
5654 cmd_getval(cmdmap
, "name", bucket_name
);
5656 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5658 ss
<< "\"" << bucket_name
<< "\" does not exist";
5661 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5666 f
->open_array_section("osds");
5667 for (auto &i
: osds
) {
5668 if (osdmap
.exists(i
)) {
5669 f
->dump_int("osd", i
);
5676 for (auto &i
: osds
) {
5677 if (osdmap
.exists(i
)) {
5688 } else if (prefix
== "osd getmaxosd") {
5690 f
->open_object_section("getmaxosd");
5691 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5692 f
->dump_int("max_osd", osdmap
.get_max_osd());
5696 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5699 } else if (prefix
== "osd utilization") {
5701 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5708 } else if (prefix
== "osd find") {
5710 if (!cmd_getval(cmdmap
, "id", osd
)) {
5711 ss
<< "unable to parse osd id value '"
5712 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5716 if (!osdmap
.exists(osd
)) {
5717 ss
<< "osd." << osd
<< " does not exist";
5722 cmd_getval(cmdmap
, "format", format
);
5723 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5724 f
->open_object_section("osd_location");
5725 f
->dump_int("osd", osd
);
5726 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5727 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5729 // try to identify host, pod/container name, etc.
5730 map
<string
,string
> m
;
5731 load_metadata(osd
, m
, nullptr);
5732 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5733 f
->dump_string("host", p
->second
);
5736 "pod_name", "pod_namespace", // set by rook
5737 "container_name" // set by cephadm, ceph-ansible
5739 if (auto p
= m
.find(k
); p
!= m
.end()) {
5740 f
->dump_string(k
, p
->second
);
5744 // crush is helpful too
5745 f
->open_object_section("crush_location");
5746 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5747 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5748 f
->dump_string(p
->first
.c_str(), p
->second
);
5752 } else if (prefix
== "osd metadata") {
5754 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5755 !cmd_getval(cmdmap
, "id", osd
)) {
5756 ss
<< "unable to parse osd id value '"
5757 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5761 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5762 ss
<< "osd." << osd
<< " does not exist";
5767 cmd_getval(cmdmap
, "format", format
);
5768 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5770 f
->open_object_section("osd_metadata");
5771 f
->dump_unsigned("id", osd
);
5772 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5778 f
->open_array_section("osd_metadata");
5779 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5780 if (osdmap
.exists(i
)) {
5781 f
->open_object_section("osd");
5782 f
->dump_unsigned("id", i
);
5783 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5784 if (r
== -EINVAL
|| r
== -ENOENT
) {
5785 // Drop error, continue to get other daemons' metadata
5786 dout(4) << "No metadata for osd." << i
<< dendl
;
5798 } else if (prefix
== "osd versions") {
5800 f
.reset(Formatter::create("json-pretty"));
5801 count_metadata("ceph_version", f
.get());
5804 } else if (prefix
== "osd count-metadata") {
5806 f
.reset(Formatter::create("json-pretty"));
5808 cmd_getval(cmdmap
, "property", field
);
5809 count_metadata(field
, f
.get());
5812 } else if (prefix
== "osd numa-status") {
5815 f
->open_array_section("osds");
5817 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5818 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5819 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5820 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5821 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5822 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5824 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5825 if (osdmap
.exists(i
)) {
5826 map
<string
,string
> m
;
5828 if (load_metadata(i
, m
, &err
) < 0) {
5832 auto p
= m
.find("hostname");
5837 f
->open_object_section("osd");
5838 f
->dump_int("osd", i
);
5839 f
->dump_string("host", host
);
5840 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5844 f
->dump_int(n
, atoi(p
->second
.c_str()));
5847 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5850 list
<string
> ls
= get_str_list(p
->second
, ",");
5851 f
->open_array_section(n
);
5852 for (auto node
: ls
) {
5853 f
->dump_int("node", atoi(node
.c_str()));
5858 for (auto n
: { "numa_node_cpus" }) {
5861 dump_cpu_list(f
.get(), n
, p
->second
);
5868 p
= m
.find("network_numa_nodes");
5874 p
= m
.find("objectstore_numa_nodes");
5880 p
= m
.find("numa_node");
5881 auto q
= m
.find("numa_node_cpus");
5882 if (p
!= m
.end() && q
!= m
.end()) {
5889 tbl
<< TextTable::endrow
;
5897 rdata
.append(stringify(tbl
));
5899 } else if (prefix
== "osd map") {
5900 string poolstr
, objstr
, namespacestr
;
5901 cmd_getval(cmdmap
, "pool", poolstr
);
5902 cmd_getval(cmdmap
, "object", objstr
);
5903 cmd_getval(cmdmap
, "nspace", namespacestr
);
5905 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5907 ss
<< "pool " << poolstr
<< " does not exist";
5911 object_locator_t
oloc(pool
, namespacestr
);
5912 object_t
oid(objstr
);
5913 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5914 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5915 vector
<int> up
, acting
;
5917 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5920 if (!namespacestr
.empty())
5921 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5923 fullobjname
= oid
.name
;
5925 f
->open_object_section("osd_map");
5926 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5927 f
->dump_string("pool", poolstr
);
5928 f
->dump_int("pool_id", pool
);
5929 f
->dump_stream("objname") << fullobjname
;
5930 f
->dump_stream("raw_pgid") << pgid
;
5931 f
->dump_stream("pgid") << mpgid
;
5932 f
->open_array_section("up");
5933 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5934 f
->dump_int("osd", *p
);
5936 f
->dump_int("up_primary", up_p
);
5937 f
->open_array_section("acting");
5938 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5939 f
->dump_int("osd", *p
);
5941 f
->dump_int("acting_primary", acting_p
);
5942 f
->close_section(); // osd_map
5945 ds
<< "osdmap e" << osdmap
.get_epoch()
5946 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5947 << " object '" << fullobjname
<< "' ->"
5948 << " pg " << pgid
<< " (" << mpgid
<< ")"
5949 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5950 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5954 } else if (prefix
== "pg map") {
5957 cmd_getval(cmdmap
, "pgid", pgidstr
);
5958 if (!pgid
.parse(pgidstr
.c_str())) {
5959 ss
<< "invalid pgid '" << pgidstr
<< "'";
5963 vector
<int> up
, acting
;
5964 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5965 ss
<< "pg '" << pgidstr
<< "' does not exist";
5969 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5970 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5972 f
->open_object_section("pg_map");
5973 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5974 f
->dump_stream("raw_pgid") << pgid
;
5975 f
->dump_stream("pgid") << mpgid
;
5976 f
->open_array_section("up");
5977 for (auto osd
: up
) {
5978 f
->dump_int("up_osd", osd
);
5981 f
->open_array_section("acting");
5982 for (auto osd
: acting
) {
5983 f
->dump_int("acting_osd", osd
);
5989 ds
<< "osdmap e" << osdmap
.get_epoch()
5990 << " pg " << pgid
<< " (" << mpgid
<< ")"
5991 << " -> up " << up
<< " acting " << acting
;
5996 } else if (prefix
== "osd lspools") {
5998 f
->open_array_section("pools");
5999 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
6000 p
!= osdmap
.pools
.end();
6003 f
->open_object_section("pool");
6004 f
->dump_int("poolnum", p
->first
);
6005 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
6008 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
6009 if (next(p
) != osdmap
.pools
.end()) {
6019 } else if (prefix
== "osd blocklist ls" ||
6020 prefix
== "osd blacklist ls") {
6022 f
->open_array_section("blocklist");
6024 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
6025 p
!= osdmap
.blocklist
.end();
6028 f
->open_object_section("entry");
6029 f
->dump_string("addr", p
->first
.get_legacy_str());
6030 f
->dump_stream("until") << p
->second
;
6035 ss
<< p
->first
<< " " << p
->second
;
6046 f
->open_array_section("range_blocklist");
6048 for (auto p
= osdmap
.range_blocklist
.begin();
6049 p
!= osdmap
.range_blocklist
.end();
6052 f
->open_object_section("entry");
6053 f
->dump_string("range", p
->first
.get_legacy_str());
6054 f
->dump_stream("until") << p
->second
;
6059 ss
<< p
->first
<< " " << p
->second
;
6069 ss
<< "listed " << osdmap
.blocklist
.size() + osdmap
.range_blocklist
.size() << " entries";
6071 } else if (prefix
== "osd pool ls") {
6073 cmd_getval(cmdmap
, "detail", detail
);
6074 if (!f
&& detail
== "detail") {
6076 osdmap
.print_pools(ss
);
6077 rdata
.append(ss
.str());
6080 f
->open_array_section("pools");
6081 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
6082 it
!= osdmap
.get_pools().end();
6085 if (detail
== "detail") {
6086 f
->open_object_section("pool");
6087 f
->dump_int("pool_id", it
->first
);
6088 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
6089 it
->second
.dump(f
.get());
6092 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
6095 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
6104 } else if (prefix
== "osd crush get-tunable") {
6106 cmd_getval(cmdmap
, "tunable", tunable
);
6109 f
->open_object_section("tunable");
6110 if (tunable
== "straw_calc_version") {
6112 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
6114 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
6123 rdata
.append(rss
.str());
6127 } else if (prefix
== "osd pool get") {
6129 cmd_getval(cmdmap
, "pool", poolstr
);
6130 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
6132 ss
<< "unrecognized pool '" << poolstr
<< "'";
6137 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
6139 cmd_getval(cmdmap
, "var", var
);
6141 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
6142 const choices_map_t ALL_CHOICES
= {
6144 {"min_size", MIN_SIZE
},
6145 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
6146 {"crush_rule", CRUSH_RULE
},
6147 {"hashpspool", HASHPSPOOL
},
6149 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
6150 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
6151 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
6152 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
6153 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
6154 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
6155 {"use_gmt_hitset", USE_GMT_HITSET
},
6156 {"target_max_objects", TARGET_MAX_OBJECTS
},
6157 {"target_max_bytes", TARGET_MAX_BYTES
},
6158 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
6159 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
6160 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
6161 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
6162 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
6163 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
6164 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
6165 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
6166 {"fast_read", FAST_READ
},
6167 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
6168 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
6169 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
6170 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
6171 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
6172 {"recovery_priority", RECOVERY_PRIORITY
},
6173 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
6174 {"scrub_priority", SCRUB_PRIORITY
},
6175 {"compression_mode", COMPRESSION_MODE
},
6176 {"compression_algorithm", COMPRESSION_ALGORITHM
},
6177 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
6178 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
6179 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
6180 {"csum_type", CSUM_TYPE
},
6181 {"csum_max_block", CSUM_MAX_BLOCK
},
6182 {"csum_min_block", CSUM_MIN_BLOCK
},
6183 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
6184 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
6185 {"pg_num_min", PG_NUM_MIN
},
6186 {"pg_num_max", PG_NUM_MAX
},
6187 {"target_size_bytes", TARGET_SIZE_BYTES
},
6188 {"target_size_ratio", TARGET_SIZE_RATIO
},
6189 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
6190 {"dedup_tier", DEDUP_TIER
},
6191 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM
},
6192 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE
},
6196 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
6198 const choices_set_t ONLY_TIER_CHOICES
= {
6199 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
6200 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
6201 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
6202 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
6203 MIN_READ_RECENCY_FOR_PROMOTE
,
6204 MIN_WRITE_RECENCY_FOR_PROMOTE
,
6205 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
6207 const choices_set_t ONLY_ERASURE_CHOICES
= {
6208 EC_OVERWRITES
, ERASURE_CODE_PROFILE
6211 choices_set_t selected_choices
;
6213 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
6214 it
!= ALL_CHOICES
.end(); ++it
) {
6215 selected_choices
.insert(it
->second
);
6219 selected_choices
= subtract_second_from_first(selected_choices
,
6223 if(!p
->is_erasure()) {
6224 selected_choices
= subtract_second_from_first(selected_choices
,
6225 ONLY_ERASURE_CHOICES
);
6227 } else /* var != "all" */ {
6228 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
6229 if (found
== ALL_CHOICES
.end()) {
6230 ss
<< "pool '" << poolstr
6231 << "': invalid variable: '" << var
<< "'";
6236 osd_pool_get_choices selected
= found
->second
;
6238 if (!p
->is_tier() &&
6239 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
6240 ss
<< "pool '" << poolstr
6241 << "' is not a tier pool: variable not applicable";
6246 if (!p
->is_erasure() &&
6247 ONLY_ERASURE_CHOICES
.find(selected
)
6248 != ONLY_ERASURE_CHOICES
.end()) {
6249 ss
<< "pool '" << poolstr
6250 << "' is not a erasure pool: variable not applicable";
6255 if (pool_opts_t::is_opt_name(var
) &&
6256 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6257 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6262 selected_choices
.insert(selected
);
6266 f
->open_object_section("pool");
6267 f
->dump_string("pool", poolstr
);
6268 f
->dump_int("pool_id", pool
);
6269 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6270 it
!= selected_choices
.end(); ++it
) {
6271 choices_map_t::const_iterator i
;
6272 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6273 if (i
->second
== *it
) {
6277 ceph_assert(i
!= ALL_CHOICES
.end());
6280 f
->dump_int("pg_num", p
->get_pg_num());
6283 f
->dump_int("pgp_num", p
->get_pgp_num());
6286 f
->dump_int("size", p
->get_size());
6289 f
->dump_int("min_size", p
->get_min_size());
6292 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6293 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6294 p
->get_crush_rule()));
6296 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6300 f
->dump_bool("allow_ec_overwrites",
6301 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6303 case PG_AUTOSCALE_MODE
:
6304 f
->dump_string("pg_autoscale_mode",
6305 pg_pool_t::get_pg_autoscale_mode_name(
6306 p
->pg_autoscale_mode
));
6314 case WRITE_FADVISE_DONTNEED
:
6317 f
->dump_bool(i
->first
.c_str(),
6318 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6320 case HIT_SET_PERIOD
:
6321 f
->dump_int("hit_set_period", p
->hit_set_period
);
6324 f
->dump_int("hit_set_count", p
->hit_set_count
);
6327 f
->dump_string("hit_set_type",
6328 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6332 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6333 BloomHitSet::Params
*bloomp
=
6334 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6335 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6336 } else if(var
!= "all") {
6338 ss
<< "hit set is not of type Bloom; " <<
6339 "invalid to get a false positive rate!";
6345 case USE_GMT_HITSET
:
6346 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6348 case TARGET_MAX_OBJECTS
:
6349 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6351 case TARGET_MAX_BYTES
:
6352 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6354 case CACHE_TARGET_DIRTY_RATIO
:
6355 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6356 p
->cache_target_dirty_ratio_micro
);
6357 f
->dump_float("cache_target_dirty_ratio",
6358 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6360 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6361 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6362 p
->cache_target_dirty_high_ratio_micro
);
6363 f
->dump_float("cache_target_dirty_high_ratio",
6364 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6366 case CACHE_TARGET_FULL_RATIO
:
6367 f
->dump_unsigned("cache_target_full_ratio_micro",
6368 p
->cache_target_full_ratio_micro
);
6369 f
->dump_float("cache_target_full_ratio",
6370 ((float)p
->cache_target_full_ratio_micro
/1000000));
6372 case CACHE_MIN_FLUSH_AGE
:
6373 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6375 case CACHE_MIN_EVICT_AGE
:
6376 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6378 case ERASURE_CODE_PROFILE
:
6379 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6381 case MIN_READ_RECENCY_FOR_PROMOTE
:
6382 f
->dump_int("min_read_recency_for_promote",
6383 p
->min_read_recency_for_promote
);
6385 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6386 f
->dump_int("min_write_recency_for_promote",
6387 p
->min_write_recency_for_promote
);
6390 f
->dump_int("fast_read", p
->fast_read
);
6392 case HIT_SET_GRADE_DECAY_RATE
:
6393 f
->dump_int("hit_set_grade_decay_rate",
6394 p
->hit_set_grade_decay_rate
);
6396 case HIT_SET_SEARCH_LAST_N
:
6397 f
->dump_int("hit_set_search_last_n",
6398 p
->hit_set_search_last_n
);
6400 case SCRUB_MIN_INTERVAL
:
6401 case SCRUB_MAX_INTERVAL
:
6402 case DEEP_SCRUB_INTERVAL
:
6403 case RECOVERY_PRIORITY
:
6404 case RECOVERY_OP_PRIORITY
:
6405 case SCRUB_PRIORITY
:
6406 case COMPRESSION_MODE
:
6407 case COMPRESSION_ALGORITHM
:
6408 case COMPRESSION_REQUIRED_RATIO
:
6409 case COMPRESSION_MAX_BLOB_SIZE
:
6410 case COMPRESSION_MIN_BLOB_SIZE
:
6412 case CSUM_MAX_BLOCK
:
6413 case CSUM_MIN_BLOCK
:
6414 case FINGERPRINT_ALGORITHM
:
6417 case TARGET_SIZE_BYTES
:
6418 case TARGET_SIZE_RATIO
:
6419 case PG_AUTOSCALE_BIAS
:
6421 case DEDUP_CHUNK_ALGORITHM
:
6422 case DEDUP_CDC_CHUNK_SIZE
:
6423 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6424 if (p
->opts
.is_set(key
)) {
6425 if(*it
== CSUM_TYPE
) {
6427 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6428 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6430 p
->opts
.dump(i
->first
, f
.get());
6439 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6440 it
!= selected_choices
.end(); ++it
) {
6441 choices_map_t::const_iterator i
;
6444 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6447 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6450 ss
<< "size: " << p
->get_size() << "\n";
6453 ss
<< "min_size: " << p
->get_min_size() << "\n";
6456 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6457 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6458 p
->get_crush_rule()) << "\n";
6460 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6463 case PG_AUTOSCALE_MODE
:
6464 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6465 p
->pg_autoscale_mode
) <<"\n";
6467 case HIT_SET_PERIOD
:
6468 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6471 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6474 ss
<< "hit_set_type: " <<
6475 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6479 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6480 BloomHitSet::Params
*bloomp
=
6481 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6482 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6483 } else if(var
!= "all") {
6484 ss
<< "hit set is not of type Bloom; " <<
6485 "invalid to get a false positive rate!";
6491 case USE_GMT_HITSET
:
6492 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6494 case TARGET_MAX_OBJECTS
:
6495 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6497 case TARGET_MAX_BYTES
:
6498 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6500 case CACHE_TARGET_DIRTY_RATIO
:
6501 ss
<< "cache_target_dirty_ratio: "
6502 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6504 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6505 ss
<< "cache_target_dirty_high_ratio: "
6506 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6508 case CACHE_TARGET_FULL_RATIO
:
6509 ss
<< "cache_target_full_ratio: "
6510 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6512 case CACHE_MIN_FLUSH_AGE
:
6513 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6515 case CACHE_MIN_EVICT_AGE
:
6516 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6518 case ERASURE_CODE_PROFILE
:
6519 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6521 case MIN_READ_RECENCY_FOR_PROMOTE
:
6522 ss
<< "min_read_recency_for_promote: " <<
6523 p
->min_read_recency_for_promote
<< "\n";
6525 case HIT_SET_GRADE_DECAY_RATE
:
6526 ss
<< "hit_set_grade_decay_rate: " <<
6527 p
->hit_set_grade_decay_rate
<< "\n";
6529 case HIT_SET_SEARCH_LAST_N
:
6530 ss
<< "hit_set_search_last_n: " <<
6531 p
->hit_set_search_last_n
<< "\n";
6534 ss
<< "allow_ec_overwrites: " <<
6535 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6544 case WRITE_FADVISE_DONTNEED
:
6547 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6548 if (i
->second
== *it
)
6551 ceph_assert(i
!= ALL_CHOICES
.end());
6552 ss
<< i
->first
<< ": " <<
6553 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6554 "true" : "false") << "\n";
6556 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6557 ss
<< "min_write_recency_for_promote: " <<
6558 p
->min_write_recency_for_promote
<< "\n";
6561 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6563 case SCRUB_MIN_INTERVAL
:
6564 case SCRUB_MAX_INTERVAL
:
6565 case DEEP_SCRUB_INTERVAL
:
6566 case RECOVERY_PRIORITY
:
6567 case RECOVERY_OP_PRIORITY
:
6568 case SCRUB_PRIORITY
:
6569 case COMPRESSION_MODE
:
6570 case COMPRESSION_ALGORITHM
:
6571 case COMPRESSION_REQUIRED_RATIO
:
6572 case COMPRESSION_MAX_BLOB_SIZE
:
6573 case COMPRESSION_MIN_BLOB_SIZE
:
6575 case CSUM_MAX_BLOCK
:
6576 case CSUM_MIN_BLOCK
:
6577 case FINGERPRINT_ALGORITHM
:
6580 case TARGET_SIZE_BYTES
:
6581 case TARGET_SIZE_RATIO
:
6582 case PG_AUTOSCALE_BIAS
:
6584 case DEDUP_CHUNK_ALGORITHM
:
6585 case DEDUP_CDC_CHUNK_SIZE
:
6586 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6587 if (i
->second
== *it
)
6590 ceph_assert(i
!= ALL_CHOICES
.end());
6592 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6593 if (p
->opts
.is_set(key
)) {
6594 if(key
== pool_opts_t::CSUM_TYPE
) {
6596 p
->opts
.get(key
, &val
);
6597 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6599 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6605 rdata
.append(ss
.str());
6610 } else if (prefix
== "osd pool get-quota") {
6612 cmd_getval(cmdmap
, "pool", pool_name
);
6614 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6616 ceph_assert(poolid
== -ENOENT
);
6617 ss
<< "unrecognized pool '" << pool_name
<< "'";
6621 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6622 const pool_stat_t
* pstat
= mon
.mgrstatmon()->get_pool_stat(poolid
);
6624 ss
<< "no stats for pool '" << pool_name
<< "'";
6628 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6630 f
->open_object_section("pool_quotas");
6631 f
->dump_string("pool_name", pool_name
);
6632 f
->dump_unsigned("pool_id", poolid
);
6633 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6634 f
->dump_int("current_num_objects", sum
.num_objects
);
6635 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6636 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6641 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6642 << " max objects: ";
6643 if (p
->quota_max_objects
== 0)
6646 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6647 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6651 if (p
->quota_max_bytes
== 0)
6654 rs
<< byte_u_t(p
->quota_max_bytes
);
6655 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6657 rdata
.append(rs
.str());
6661 } else if (prefix
== "osd crush rule list" ||
6662 prefix
== "osd crush rule ls") {
6664 f
->open_array_section("rules");
6665 osdmap
.crush
->list_rules(f
.get());
6670 osdmap
.crush
->list_rules(&ss
);
6671 rdata
.append(ss
.str());
6673 } else if (prefix
== "osd crush rule ls-by-class") {
6675 cmd_getval(cmdmap
, "class", class_name
);
6676 if (class_name
.empty()) {
6677 ss
<< "no class specified";
6682 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6684 ss
<< "failed to get rules by class '" << class_name
<< "'";
6688 f
->open_array_section("rules");
6689 for (auto &rule
: rules
) {
6690 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6696 for (auto &rule
: rules
) {
6697 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6699 rdata
.append(rs
.str());
6701 } else if (prefix
== "osd crush rule dump") {
6703 cmd_getval(cmdmap
, "name", name
);
6705 cmd_getval(cmdmap
, "format", format
);
6706 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6708 f
->open_array_section("rules");
6709 osdmap
.crush
->dump_rules(f
.get());
6712 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6714 ss
<< "unknown crush rule '" << name
<< "'";
6718 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6723 rdata
.append(rs
.str());
6724 } else if (prefix
== "osd crush dump") {
6726 cmd_getval(cmdmap
, "format", format
);
6727 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6728 f
->open_object_section("crush_map");
6729 osdmap
.crush
->dump(f
.get());
6734 rdata
.append(rs
.str());
6735 } else if (prefix
== "osd crush show-tunables") {
6737 cmd_getval(cmdmap
, "format", format
);
6738 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6739 f
->open_object_section("crush_map_tunables");
6740 osdmap
.crush
->dump_tunables(f
.get());
6745 rdata
.append(rs
.str());
6746 } else if (prefix
== "osd crush tree") {
6747 bool show_shadow
= false;
6748 if (!cmd_getval_compat_cephbool(cmdmap
, "show_shadow", show_shadow
)) {
6750 if (cmd_getval(cmdmap
, "shadow", shadow
) &&
6751 shadow
== "--show-shadow") {
6755 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6757 f
->open_object_section("crush_tree");
6758 osdmap
.crush
->dump_tree(nullptr,
6760 osdmap
.get_pool_names(),
6766 osdmap
.crush
->dump_tree(&ss
,
6768 osdmap
.get_pool_names(),
6770 rdata
.append(ss
.str());
6772 } else if (prefix
== "osd crush ls") {
6774 if (!cmd_getval(cmdmap
, "node", name
)) {
6775 ss
<< "no node specified";
6779 if (!osdmap
.crush
->name_exists(name
)) {
6780 ss
<< "node '" << name
<< "' does not exist";
6784 int id
= osdmap
.crush
->get_item_id(name
);
6787 result
.push_back(id
);
6789 int num
= osdmap
.crush
->get_bucket_size(id
);
6790 for (int i
= 0; i
< num
; ++i
) {
6791 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6795 f
->open_array_section("items");
6796 for (auto i
: result
) {
6797 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6803 for (auto i
: result
) {
6804 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6806 rdata
.append(ss
.str());
6809 } else if (prefix
== "osd crush class ls") {
6810 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6811 f
->open_array_section("crush_classes");
6812 for (auto i
: osdmap
.crush
->class_name
)
6813 f
->dump_string("class", i
.second
);
6816 } else if (prefix
== "osd crush class ls-osd") {
6818 cmd_getval(cmdmap
, "class", name
);
6820 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6822 f
->open_array_section("osds");
6823 for (auto &osd
: osds
)
6824 f
->dump_int("osd", osd
);
6829 for (auto &osd
: osds
) {
6837 } else if (prefix
== "osd crush get-device-class") {
6838 vector
<string
> idvec
;
6839 cmd_getval(cmdmap
, "ids", idvec
);
6840 map
<int, string
> class_by_osd
;
6841 for (auto& id
: idvec
) {
6843 long osd
= parse_osd_id(id
.c_str(), &ts
);
6845 ss
<< "unable to parse osd id:'" << id
<< "'";
6849 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6851 class_by_osd
[osd
] = device_class
;
6853 class_by_osd
[osd
] = ""; // no class
6856 f
->open_array_section("osd_device_classes");
6857 for (auto& i
: class_by_osd
) {
6858 f
->open_object_section("osd_device_class");
6859 f
->dump_int("osd", i
.first
);
6860 f
->dump_string("device_class", i
.second
);
6866 if (class_by_osd
.size() == 1) {
6867 // for single input, make a clean output
6868 ds
<< class_by_osd
.begin()->second
;
6870 // note that we do not group osds by class here
6871 for (auto it
= class_by_osd
.begin();
6872 it
!= class_by_osd
.end();
6874 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6875 if (next(it
) != class_by_osd
.end())
6881 } else if (prefix
== "osd erasure-code-profile ls") {
6882 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6884 f
->open_array_section("erasure-code-profiles");
6885 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6887 f
->dump_string("profile", i
->first
.c_str());
6889 rdata
.append(i
->first
+ "\n");
6896 rdata
.append(rs
.str());
6898 } else if (prefix
== "osd crush weight-set ls") {
6899 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6901 f
->open_array_section("weight_sets");
6902 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6903 f
->dump_string("pool", "(compat)");
6905 for (auto& i
: osdmap
.crush
->choose_args
) {
6907 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6914 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6917 for (auto& i
: osdmap
.crush
->choose_args
) {
6919 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6922 rdata
.append(rs
.str());
6924 } else if (prefix
== "osd crush weight-set dump") {
6925 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6927 osdmap
.crush
->dump_choose_args(f
.get());
6929 } else if (prefix
== "osd erasure-code-profile get") {
6931 cmd_getval(cmdmap
, "name", name
);
6932 if (!osdmap
.has_erasure_code_profile(name
)) {
6933 ss
<< "unknown erasure code profile '" << name
<< "'";
6937 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6939 f
->open_object_section("profile");
6940 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6944 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6946 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6953 rdata
.append(rs
.str());
6955 } else if (prefix
== "osd pool application get") {
6956 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6959 cmd_getval(cmdmap
, "pool", pool_name
);
6961 cmd_getval(cmdmap
, "app", app
);
6963 cmd_getval(cmdmap
, "key", key
);
6965 if (pool_name
.empty()) {
6967 f
->open_object_section("pools");
6968 for (const auto &pool
: osdmap
.pools
) {
6969 std::string
name("<unknown>");
6970 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6971 if (pni
!= osdmap
.pool_name
.end())
6973 f
->open_object_section(name
.c_str());
6974 for (auto &app_pair
: pool
.second
.application_metadata
) {
6975 f
->open_object_section(app_pair
.first
.c_str());
6976 for (auto &kv_pair
: app_pair
.second
) {
6977 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6981 f
->close_section(); // name
6983 f
->close_section(); // pools
6986 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6988 ss
<< "unrecognized pool '" << pool_name
<< "'";
6992 auto p
= osdmap
.get_pg_pool(pool
);
6995 f
->open_object_section(pool_name
.c_str());
6996 for (auto &app_pair
: p
->application_metadata
) {
6997 f
->open_object_section(app_pair
.first
.c_str());
6998 for (auto &kv_pair
: app_pair
.second
) {
6999 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
7001 f
->close_section(); // application
7003 f
->close_section(); // pool_name
7008 auto app_it
= p
->application_metadata
.find(app
);
7009 if (app_it
== p
->application_metadata
.end()) {
7010 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
7014 // filter by pool + app
7016 f
->open_object_section(app_it
->first
.c_str());
7017 for (auto &kv_pair
: app_it
->second
) {
7018 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
7020 f
->close_section(); // application
7024 // filter by pool + app + key
7025 auto key_it
= app_it
->second
.find(key
);
7026 if (key_it
== app_it
->second
.end()) {
7027 ss
<< "application '" << app
<< "' on pool '" << pool_name
7028 << "' does not have key '" << key
<< "'";
7032 ss
<< key_it
->second
<< "\n";
7033 rdata
.append(ss
.str());
7036 } else if (prefix
== "osd get-require-min-compat-client") {
7037 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
7038 rdata
.append(ss
.str());
7041 } else if (prefix
== "osd pool application enable" ||
7042 prefix
== "osd pool application disable" ||
7043 prefix
== "osd pool application set" ||
7044 prefix
== "osd pool application rm") {
7045 bool changed
= false;
7046 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
7050 } else if (changed
) {
7051 // Valid mutation, proceed to prepare phase
7054 // Idempotent case, reply
7058 // try prepare update
7065 mon
.reply_command(op
, r
, rs
, rdata
, get_last_committed());
7069 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
7071 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7072 osdmap
.get_pg_pool(pool_id
));
7074 pool
->set_flag(flags
);
7077 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
7079 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7080 osdmap
.get_pg_pool(pool_id
));
7082 pool
->unset_flag(flags
);
7085 string
OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch
)
7088 snprintf(k
, sizeof(k
), "purged_epoch_%08lx", (unsigned long)epoch
);
7092 string
OSDMonitor::make_purged_snap_key(int64_t pool
, snapid_t snap
)
7095 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
7096 (unsigned long long)pool
, (unsigned long long)snap
);
7100 string
OSDMonitor::make_purged_snap_key_value(
7101 int64_t pool
, snapid_t snap
, snapid_t num
,
7102 epoch_t epoch
, bufferlist
*v
)
7104 // encode the *last* epoch in the key so that we can use forward
7105 // iteration only to search for an epoch in an interval.
7107 encode(snap
+ num
, *v
);
7109 return make_purged_snap_key(pool
, snap
+ num
- 1);
7113 int OSDMonitor::lookup_purged_snap(
7114 int64_t pool
, snapid_t snap
,
7115 snapid_t
*begin
, snapid_t
*end
)
7117 string k
= make_purged_snap_key(pool
, snap
);
7118 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
7121 dout(20) << __func__
7122 << " pool " << pool
<< " snap " << snap
7123 << " - key '" << k
<< "' not found" << dendl
;
7126 if (it
->key().find("purged_snap_") != 0) {
7127 dout(20) << __func__
7128 << " pool " << pool
<< " snap " << snap
7129 << " - key '" << k
<< "' got '" << it
->key()
7130 << "', wrong prefix" << dendl
;
7133 string gotk
= it
->key();
7134 const char *format
= "purged_snap_%llu_";
7135 long long int keypool
;
7136 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
7138 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
7141 if (pool
!= keypool
) {
7142 dout(20) << __func__
7143 << " pool " << pool
<< " snap " << snap
7144 << " - key '" << k
<< "' got '" << gotk
7145 << "', wrong pool " << keypool
7149 bufferlist v
= it
->value();
7150 auto p
= v
.cbegin();
7153 if (snap
< *begin
|| snap
>= *end
) {
7154 dout(20) << __func__
7155 << " pool " << pool
<< " snap " << snap
7156 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
7163 void OSDMonitor::insert_purged_snap_update(
7165 snapid_t start
, snapid_t end
,
7167 MonitorDBStore::TransactionRef t
)
7169 snapid_t before_begin
, before_end
;
7170 snapid_t after_begin
, after_end
;
7171 int b
= lookup_purged_snap(pool
, start
- 1,
7172 &before_begin
, &before_end
);
7173 int a
= lookup_purged_snap(pool
, end
,
7174 &after_begin
, &after_end
);
7176 dout(10) << __func__
7177 << " [" << start
<< "," << end
<< ") - joins ["
7178 << before_begin
<< "," << before_end
<< ") and ["
7179 << after_begin
<< "," << after_end
<< ")" << dendl
;
7180 // erase only the begin record; we'll overwrite the end one.
7181 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7183 string k
= make_purged_snap_key_value(pool
,
7184 before_begin
, after_end
- before_begin
,
7185 pending_inc
.epoch
, &v
);
7186 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7188 dout(10) << __func__
7189 << " [" << start
<< "," << end
<< ") - join with earlier ["
7190 << before_begin
<< "," << before_end
<< ")" << dendl
;
7191 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7193 string k
= make_purged_snap_key_value(pool
,
7194 before_begin
, end
- before_begin
,
7195 pending_inc
.epoch
, &v
);
7196 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7198 dout(10) << __func__
7199 << " [" << start
<< "," << end
<< ") - join with later ["
7200 << after_begin
<< "," << after_end
<< ")" << dendl
;
7201 // overwrite after record
7203 string k
= make_purged_snap_key_value(pool
,
7204 start
, after_end
- start
,
7205 pending_inc
.epoch
, &v
);
7206 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7208 dout(10) << __func__
7209 << " [" << start
<< "," << end
<< ") - new"
7212 string k
= make_purged_snap_key_value(pool
,
7214 pending_inc
.epoch
, &v
);
7215 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7219 bool OSDMonitor::try_prune_purged_snaps()
7221 if (!mon
.mgrstatmon()->is_readable()) {
7224 if (!pending_inc
.new_purged_snaps
.empty()) {
7225 return false; // we already pruned for this epoch
7228 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
7229 "mon_max_snap_prune_per_epoch");
7233 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
7235 unsigned actually_pruned
= 0;
7236 auto& purged_snaps
= mon
.mgrstatmon()->get_digest().purged_snaps
;
7237 for (auto& p
: osdmap
.get_pools()) {
7238 auto q
= purged_snaps
.find(p
.first
);
7239 if (q
== purged_snaps
.end()) {
7242 auto& purged
= q
->second
;
7243 if (purged
.empty()) {
7244 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
7247 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
7248 snap_interval_set_t to_prune
;
7249 unsigned maybe_pruned
= actually_pruned
;
7250 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
7251 snapid_t begin
= i
.get_start();
7252 auto end
= i
.get_start() + i
.get_len();
7253 snapid_t pbegin
= 0, pend
= 0;
7254 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
7257 // be a bit aggressive about backing off here, because the mon may
7258 // do a lot of work going through this set, and if we know the
7259 // purged set from the OSDs is at least *partly* stale we may as
7260 // well wait for it to be fresh.
7261 dout(20) << __func__
<< " we've already purged " << pbegin
7262 << "~" << (pend
- pbegin
) << dendl
;
7265 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
7266 // the tail of [begin,end) is purged; shorten the range
7269 to_prune
.insert(begin
, end
- begin
);
7270 maybe_pruned
+= end
- begin
;
7271 if (maybe_pruned
>= max_prune
) {
7275 if (!to_prune
.empty()) {
7276 // PGs may still be reporting things as purged that we have already
7277 // pruned from removed_snaps_queue.
7278 snap_interval_set_t actual
;
7279 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7280 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7281 actual
.intersection_of(to_prune
, r
->second
);
7283 actually_pruned
+= actual
.size();
7284 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7285 << ", actual pruned " << actual
<< dendl
;
7286 if (!actual
.empty()) {
7287 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7290 if (actually_pruned
>= max_prune
) {
7294 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7295 return !!actually_pruned
;
7298 bool OSDMonitor::update_pools_status()
7300 if (!mon
.mgrstatmon()->is_readable())
7305 auto& pools
= osdmap
.get_pools();
7306 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7307 const pool_stat_t
*pstat
= mon
.mgrstatmon()->get_pool_stat(it
->first
);
7310 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7311 const pg_pool_t
&pool
= it
->second
;
7312 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
7315 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7316 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
7318 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7322 mon
.clog
->info() << "pool '" << pool_name
7323 << "' no longer out of quota; removing NO_QUOTA flag";
7324 // below we cancel FLAG_FULL too, we'll set it again in
7325 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7326 clear_pool_flags(it
->first
,
7327 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7333 if (pool
.quota_max_bytes
> 0 &&
7334 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7335 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7336 << " (reached quota's max_bytes: "
7337 << byte_u_t(pool
.quota_max_bytes
) << ")";
7339 if (pool
.quota_max_objects
> 0 &&
7340 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7341 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7342 << " (reached quota's max_objects: "
7343 << pool
.quota_max_objects
<< ")";
7345 // set both FLAG_FULL_QUOTA and FLAG_FULL
7346 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7347 // since FLAG_FULL should always take precedence
7348 set_pool_flags(it
->first
,
7349 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7350 clear_pool_flags(it
->first
,
7351 pg_pool_t::FLAG_NEARFULL
|
7352 pg_pool_t::FLAG_BACKFILLFULL
);
7359 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7361 op
->mark_osdmon_event(__func__
);
7362 auto m
= op
->get_req
<MPoolOp
>();
7363 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7364 MonSession
*session
= op
->get_session();
7367 string erasure_code_profile
;
7372 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7373 0, 0, 0, 0, 0, 0, 0.0,
7374 erasure_code_profile
,
7375 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {}, bulk
,
7379 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
7384 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7385 const string
& dstname
,
7390 // Avoid creating a pending crush if it does not already exists and
7391 // the rename would fail.
7393 if (!_have_pending_crush()) {
7394 ret
= _get_stable_crush().can_rename_bucket(srcname
,
7401 CrushWrapper newcrush
= _get_pending_crush();
7403 ret
= newcrush
.rename_bucket(srcname
,
7409 pending_inc
.crush
.clear();
7410 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7411 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7415 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7417 string replacement
= "";
7419 if (plugin
== "jerasure_generic" ||
7420 plugin
== "jerasure_sse3" ||
7421 plugin
== "jerasure_sse4" ||
7422 plugin
== "jerasure_neon") {
7423 replacement
= "jerasure";
7424 } else if (plugin
== "shec_generic" ||
7425 plugin
== "shec_sse3" ||
7426 plugin
== "shec_sse4" ||
7427 plugin
== "shec_neon") {
7428 replacement
= "shec";
7431 if (replacement
!= "") {
7432 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7433 << plugin
<< " that has been deprecated. Please use "
7434 << replacement
<< " instead." << dendl
;
7438 int OSDMonitor::normalize_profile(const string
& profilename
,
7439 ErasureCodeProfile
&profile
,
7443 ErasureCodeInterfaceRef erasure_code
;
7444 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7445 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7446 check_legacy_ec_plugin(plugin
->second
, profilename
);
7447 int err
= instance
.factory(plugin
->second
,
7448 g_conf().get_val
<std::string
>("erasure_code_dir"),
7449 profile
, &erasure_code
, ss
);
7454 err
= erasure_code
->init(profile
, ss
);
7459 auto it
= profile
.find("stripe_unit");
7460 if (it
!= profile
.end()) {
7462 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
, &err_str
);
7463 if (!err_str
.empty()) {
7464 *ss
<< "could not parse stripe_unit '" << it
->second
7465 << "': " << err_str
<< std::endl
;
7468 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7469 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7470 if (chunk_size
!= stripe_unit
) {
7471 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7472 << "alignment. Would be padded to " << chunk_size
7476 if ((stripe_unit
% 4096) != 0 && !force
) {
7477 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7478 << "use --force to override this check" << std::endl
;
7485 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7486 const string
&profile
,
7490 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7491 if (ruleid
!= -ENOENT
) {
7496 CrushWrapper newcrush
= _get_pending_crush();
7498 ruleid
= newcrush
.get_rule_id(name
);
7499 if (ruleid
!= -ENOENT
) {
7503 ErasureCodeInterfaceRef erasure_code
;
7504 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7506 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7510 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7511 erasure_code
.reset();
7515 pending_inc
.crush
.clear();
7516 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7521 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7522 ErasureCodeInterfaceRef
*erasure_code
,
7525 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7527 ErasureCodeProfile profile
=
7528 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7529 ErasureCodeProfile::const_iterator plugin
=
7530 profile
.find("plugin");
7531 if (plugin
== profile
.end()) {
7532 *ss
<< "cannot determine the erasure code plugin"
7533 << " because there is no 'plugin' entry in the erasure_code_profile "
7534 << profile
<< std::endl
;
7537 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7538 auto& instance
= ErasureCodePluginRegistry::instance();
7539 return instance
.factory(plugin
->second
,
7540 g_conf().get_val
<std::string
>("erasure_code_dir"),
7541 profile
, erasure_code
, ss
);
7544 int OSDMonitor::check_cluster_features(uint64_t features
,
7547 stringstream unsupported_ss
;
7548 int unsupported_count
= 0;
7549 if ((mon
.get_quorum_con_features() & features
) != features
) {
7550 unsupported_ss
<< "the monitor cluster";
7551 ++unsupported_count
;
7554 set
<int32_t> up_osds
;
7555 osdmap
.get_up_osds(up_osds
);
7556 for (set
<int32_t>::iterator it
= up_osds
.begin();
7557 it
!= up_osds
.end(); ++it
) {
7558 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7559 if ((xi
.features
& features
) != features
) {
7560 if (unsupported_count
> 0)
7561 unsupported_ss
<< ", ";
7562 unsupported_ss
<< "osd." << *it
;
7563 unsupported_count
++;
7567 if (unsupported_count
> 0) {
7568 ss
<< "features " << features
<< " unsupported by: "
7569 << unsupported_ss
.str();
7573 // check pending osd state, too!
7574 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7575 pending_inc
.new_xinfo
.begin();
7576 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7577 const osd_xinfo_t
&xi
= p
->second
;
7578 if ((xi
.features
& features
) != features
) {
7579 dout(10) << __func__
<< " pending osd." << p
->first
7580 << " features are insufficient; retry" << dendl
;
7588 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
7591 OSDMap::Incremental new_pending
= pending_inc
;
7592 encode(*newcrush
, new_pending
.crush
, mon
.get_quorum_con_features());
7594 newmap
.deepish_copy_from(osdmap
);
7595 newmap
.apply_incremental(new_pending
);
7598 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7599 auto mv
= newmap
.get_min_compat_client();
7600 if (mv
> newmap
.require_min_compat_client
) {
7601 ss
<< "new crush map requires client version " << mv
7602 << " but require_min_compat_client is "
7603 << newmap
.require_min_compat_client
;
7610 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7611 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7612 stringstream features_ss
;
7613 int r
= check_cluster_features(features
, features_ss
);
7615 ss
<< "Could not change CRUSH: " << features_ss
.str();
7622 bool OSDMonitor::erasure_code_profile_in_use(
7623 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7624 const string
&profile
,
7628 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
7631 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7632 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7637 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
7642 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7643 map
<string
,string
> *erasure_code_profile_map
,
7646 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7649 erasure_code_profile_map
,
7653 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7654 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7655 map
<string
,string
> user_map
;
7656 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7657 i
!= erasure_code_profile
.end();
7659 size_t equal
= i
->find('=');
7660 if (equal
== string::npos
) {
7661 user_map
[*i
] = string();
7662 (*erasure_code_profile_map
)[*i
] = string();
7664 const string key
= i
->substr(0, equal
);
7666 const string value
= i
->substr(equal
);
7667 if (key
.find("ruleset-") == 0) {
7668 *ss
<< "property '" << key
<< "' is no longer supported; try "
7669 << "'crush-" << key
.substr(8) << "' instead";
7672 user_map
[key
] = value
;
7673 (*erasure_code_profile_map
)[key
] = value
;
7677 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7678 (*erasure_code_profile_map
) = user_map
;
7683 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7684 const string
&erasure_code_profile
,
7686 unsigned *size
, unsigned *min_size
,
7690 bool set_min_size
= false;
7691 switch (pool_type
) {
7692 case pg_pool_t::TYPE_REPLICATED
:
7693 if (osdmap
.stretch_mode_enabled
) {
7695 repl_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
7696 if (repl_size
!= g_conf().get_val
<uint64_t>("mon_stretch_pool_size")) {
7697 *ss
<< "prepare_pool_size: we are in stretch mode but size "
7698 << repl_size
<< " does not match!";
7701 *min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
7702 set_min_size
= true;
7704 if (repl_size
== 0) {
7705 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7709 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7711 case pg_pool_t::TYPE_ERASURE
:
7713 if (osdmap
.stretch_mode_enabled
) {
7714 *ss
<< "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7717 ErasureCodeInterfaceRef erasure_code
;
7718 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7720 *size
= erasure_code
->get_chunk_count();
7722 erasure_code
->get_data_chunk_count() +
7723 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7724 assert(*min_size
<= *size
);
7725 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7730 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
7737 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7738 const string
&erasure_code_profile
,
7739 uint32_t *stripe_width
,
7743 switch (pool_type
) {
7744 case pg_pool_t::TYPE_REPLICATED
:
7747 case pg_pool_t::TYPE_ERASURE
:
7749 ErasureCodeProfile profile
=
7750 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7751 ErasureCodeInterfaceRef erasure_code
;
7752 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7755 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7756 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7757 auto it
= profile
.find("stripe_unit");
7758 if (it
!= profile
.end()) {
7760 stripe_unit
= strict_iecstrtoll(it
->second
, &err_str
);
7761 ceph_assert(err_str
.empty());
7763 *stripe_width
= data_chunks
*
7764 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7768 *ss
<< "prepare_pool_stripe_width: "
7769 << pool_type
<< " is not a known pool type";
7776 int OSDMonitor::get_replicated_stretch_crush_rule()
7778 /* we don't write down the stretch rule anywhere, so
7779 * we have to guess it. How? Look at all the pools
7780 * and count up how many times a given rule is used
7781 * on stretch pools and then return the one with
7784 map
<int,int> rule_counts
;
7785 for (const auto& pooli
: osdmap
.pools
) {
7786 const pg_pool_t
& p
= pooli
.second
;
7787 if (p
.is_replicated() && p
.is_stretch_pool()) {
7788 if (!rule_counts
.count(p
.crush_rule
)) {
7789 rule_counts
[p
.crush_rule
] = 1;
7791 ++rule_counts
[p
.crush_rule
];
7796 if (rule_counts
.empty()) {
7800 int most_used_count
= 0;
7801 int most_used_rule
= -1;
7802 for (auto i
: rule_counts
) {
7803 if (i
.second
> most_used_count
) {
7804 most_used_rule
= i
.first
;
7805 most_used_count
= i
.second
;
7808 ceph_assert(most_used_count
> 0);
7809 ceph_assert(most_used_rule
>= 0);
7810 return most_used_rule
;
7813 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7814 const string
&erasure_code_profile
,
7815 const string
&rule_name
,
7820 if (*crush_rule
< 0) {
7821 switch (pool_type
) {
7822 case pg_pool_t::TYPE_REPLICATED
:
7824 if (rule_name
== "") {
7825 if (osdmap
.stretch_mode_enabled
) {
7826 *crush_rule
= get_replicated_stretch_crush_rule();
7829 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_rule(cct
);
7831 if (*crush_rule
< 0) {
7832 // Errors may happen e.g. if no valid rule is available
7833 *ss
<< "No suitable CRUSH rule exists, check "
7834 << "'osd pool default crush *' config options";
7838 return get_crush_rule(rule_name
, crush_rule
, ss
);
7842 case pg_pool_t::TYPE_ERASURE
:
7844 int err
= crush_rule_create_erasure(rule_name
,
7845 erasure_code_profile
,
7849 dout(20) << "prepare_pool_crush_rule: rule "
7850 << rule_name
<< " try again" << dendl
;
7853 // need to wait for the crush rule to be proposed before proceeding
7864 *ss
<< "prepare_pool_crush_rule: " << pool_type
7865 << " is not a known pool type";
7869 if (!osdmap
.crush
->rule_exists(*crush_rule
)) {
7870 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
7878 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7883 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7884 if (ret
!= -ENOENT
) {
7888 CrushWrapper newcrush
= _get_pending_crush();
7890 ret
= newcrush
.get_rule_id(rule_name
);
7891 if (ret
!= -ENOENT
) {
7892 // found it, wait for it to be proposed
7893 dout(20) << __func__
<< ": rule " << rule_name
7894 << " try again" << dendl
;
7897 // Cannot find it , return error
7898 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
7905 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, int crush_rule
, ostream
*ss
)
7907 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7908 uint64_t projected
= 0;
7909 unsigned osd_num
= 0;
7910 // assume min cluster size 3
7911 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u);
7914 projected
+= pg_num
* size
;
7916 if (mapping
.get_epoch() >= osdmap
.get_epoch()) {
7918 CrushWrapper newcrush
= _get_pending_crush();
7919 newcrush
.find_takes_by_rule(crush_rule
, &roots
);
7920 int max_osd
= osdmap
.get_max_osd();
7921 for (auto root
: roots
) {
7922 const char *rootname
= newcrush
.get_item_name(root
);
7924 newcrush
.get_leaves(rootname
, &osd_ids
);
7925 unsigned out_osd
= 0;
7926 for (auto id
: osd_ids
) {
7931 projected
+= mapping
.get_osd_acting_pgs(id
).size();
7933 osd_num
+= osd_ids
.size() - out_osd
;
7936 // update an existing pool's pg num
7937 const auto& pg_info
= osdmap
.get_pools().at(pool
);
7938 // already counted the pgs of this `pool` by iterating crush map, so
7939 // remove them using adding the specified pg num
7940 projected
+= pg_num
* size
;
7941 projected
-= pg_info
.get_pg_num_target() * pg_info
.get_size();
7943 num_osds
= std::max(osd_num
, 3u); // assume min cluster size 3
7945 // use pg_num target for evaluating the projected pg num
7946 for (const auto& [pool_id
, pool_info
] : osdmap
.get_pools()) {
7947 if (pool_id
== pool
) {
7948 projected
+= pg_num
* size
;
7950 projected
+= pool_info
.get_pg_num_target() * pool_info
.get_size();
7954 auto max_pgs
= max_pgs_per_osd
* num_osds
;
7955 if (projected
> max_pgs
) {
7957 *ss
<< "pool id " << pool
;
7959 *ss
<< " pg_num " << pg_num
<< " size " << size
7960 << " would mean " << projected
7961 << " total pgs, which exceeds max " << max_pgs
7962 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7963 << " * num_in_osds " << num_osds
<< ")";
7970 * @param name The name of the new pool
7971 * @param crush_rule The crush rule to use. If <0, will use the system default
7972 * @param crush_rule_name The crush rule to use, if crush_rule <0
7973 * @param pg_num The pg_num to use. If set to 0, will use the system default
7974 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7975 * @param pg_num_min min pg_num
7976 * @param pg_num_max max pg_num
7977 * @param repl_size Replication factor, or 0 for default
7978 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7979 * @param pool_type TYPE_ERASURE, or TYPE_REP
7980 * @param expected_num_objects expected number of objects on the pool
7981 * @param fast_read fast read type.
7982 * @param ss human readable error message, if any.
7984 * @return 0 on success, negative errno on failure.
// Stage creation of a new pool in pending_inc: validate arguments, derive
// defaults from config, smoke-test the CRUSH rule, then populate a new
// pg_pool_t and record its name in pending_inc.new_pool_names.
// NOTE(review): this span is extraction-damaged — statements are split across
// lines, original file line numbers are fused into the code, and some interior
// lines (several parameters, returns, closing braces) were dropped (the
// embedded numbering jumps). Tokens below are preserved exactly as found;
// restore the full function from upstream before attempting to compile.
7986 int OSDMonitor::prepare_new_pool(string
& name
,
7988 const string
&crush_rule_name
,
7989 unsigned pg_num
, unsigned pgp_num
,
7990 unsigned pg_num_min
,
7991 unsigned pg_num_max
,
7992 const uint64_t repl_size
,
7993 const uint64_t target_size_bytes
,
7994 const float target_size_ratio
,
7995 const string
&erasure_code_profile
,
7996 const unsigned pool_type
,
7997 const uint64_t expected_num_objects
,
7998 FastReadType fast_read
,
7999 const string
& pg_autoscale_mode
,
// Reject an empty pool name (the elided line presumably returns an error —
// TODO confirm against upstream).
8003 if (name
.length() == 0)
// Default pg_num: a lambda maps autoscale mode "on" to 1 PG, otherwise the
// configured osd_pool_default_pg_num.
8006 auto pg_num_from_mode
=
8007 [pg_num
=g_conf().get_val
<uint64_t>("osd_pool_default_pg_num")]
8008 (const string
& mode
) {
8009 return mode
== "on" ? 1 : pg_num
;
8011 pg_num
= pg_num_from_mode(
8012 pg_autoscale_mode
.empty() ?
8013 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode") :
8017 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
// Bounds checks on pg_num / pgp_num against mon_max_pool_pg_num.
8020 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8021 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8022 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8023 << " (you may adjust 'mon max pool pg num' for higher values)";
8026 if (pgp_num
> pg_num
) {
8027 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
8028 << ", which in this case is " << pg_num
;
// fast_read is only meaningful for erasure-coded pools.
8031 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
8032 *ss
<< "'fast_read' can only apply to erasure coding pool";
// Resolve/prepare the CRUSH rule and the pool size/min_size.
8036 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
8037 crush_rule_name
, &crush_rule
, ss
);
8039 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
8042 unsigned size
, min_size
;
8043 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
8044 &size
, &min_size
, ss
);
8046 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
// Optional CRUSH smoke test: run the tester in a forked child (bounded by
// mon_lease) against the pending crush map before accepting the rule.
8049 if (g_conf()->mon_osd_crush_smoke_test
) {
8050 CrushWrapper newcrush
= _get_pending_crush();
8052 CrushTester
tester(newcrush
, err
);
8053 tester
.set_min_x(0);
8054 tester
.set_max_x(50);
8055 tester
.set_rule(crush_rule
);
8056 tester
.set_num_rep(size
);
8057 auto start
= ceph::coarse_mono_clock::now();
8058 r
= tester
.test_with_fork(g_conf()->mon_lease
);
8059 auto duration
= ceph::coarse_mono_clock::now() - start
;
8061 dout(10) << "tester.test_with_fork returns " << r
8062 << ": " << err
.str() << dendl
;
8063 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
8066 dout(10) << __func__
<< " crush smoke test duration: "
8067 << duration
<< dendl
;
// Make sure the projected PG count fits the cluster-wide per-OSD cap.
8069 r
= check_pg_num(-1, pg_num
, size
, crush_rule
, ss
);
8071 dout(10) << "check_pg_num returns " << r
<< dendl
;
8075 if (osdmap
.crush
->get_rule_type(crush_rule
) != (int)pool_type
) {
8076 *ss
<< "crush rule " << crush_rule
<< " type does not match pool";
8080 uint32_t stripe_width
= 0;
8081 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
8083 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
// Resolve the effective fast_read setting for EC pools.
8088 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8089 switch (fast_read
) {
8096 case FAST_READ_DEFAULT
:
8097 fread
= g_conf()->osd_pool_default_ec_fast_read
;
8100 *ss
<< "invalid fast_read setting: " << fast_read
;
// Refuse a name that is already staged in this proposal.
8105 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
8106 p
!= pending_inc
.new_pool_names
.end();
8108 if (p
->second
== name
)
// Allocate the next pool id and populate the new pg_pool_t.
8112 if (-1 == pending_inc
.new_pool_max
)
8113 pending_inc
.new_pool_max
= osdmap
.pool_max
;
8114 int64_t pool
= ++pending_inc
.new_pool_max
;
8116 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
8117 pi
->create_time
= ceph_clock_now();
8118 pi
->type
= pool_type
;
8119 pi
->fast_read
= fread
;
8120 pi
->flags
= g_conf()->osd_pool_default_flags
;
// Default pool flags from configuration (bulk, hashpspool, nodelete, ...).
8122 pi
->set_flag(pg_pool_t::FLAG_BULK
);
8123 } else if (g_conf()->osd_pool_default_flag_bulk
) {
8124 pi
->set_flag(pg_pool_t::FLAG_BULK
);
8126 if (g_conf()->osd_pool_default_flag_hashpspool
)
8127 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
8128 if (g_conf()->osd_pool_default_flag_nodelete
)
8129 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
8130 if (g_conf()->osd_pool_default_flag_nopgchange
)
8131 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
8132 if (g_conf()->osd_pool_default_flag_nosizechange
)
8133 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
8134 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
8135 if (g_conf()->osd_pool_use_gmt_hitset
)
8136 pi
->use_gmt_hitset
= true;
8138 pi
->use_gmt_hitset
= false;
8141 pi
->min_size
= min_size
;
8142 pi
->crush_rule
= crush_rule
;
8143 pi
->expected_num_objects
= expected_num_objects
;
8144 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
// Stretch-mode clusters constrain peering across CRUSH buckets; degraded
// stretch mode halves size/min_size (two-zone assumption, see comment below).
8145 if (osdmap
.stretch_mode_enabled
) {
8146 pi
->peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
8147 pi
->peering_crush_bucket_target
= osdmap
.stretch_bucket_count
;
8148 pi
->peering_crush_bucket_barrier
= osdmap
.stretch_mode_bucket
;
8149 pi
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
8150 if (osdmap
.degraded_stretch_mode
) {
8151 pi
->peering_crush_bucket_count
= osdmap
.degraded_stretch_mode
;
8152 pi
->peering_crush_bucket_target
= osdmap
.degraded_stretch_mode
;
8153 // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
8154 // TODO: drat, we don't record this ^ anywhere, though given that it
8155 // necessarily won't exist elsewhere it likely doesn't matter
8156 pi
->min_size
= pi
->min_size
/ 2;
8157 pi
->size
= pi
->size
/ 2; // only support 2 zones now
// Autoscale mode: config default if recognized, otherwise OFF.
8161 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8162 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
8163 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8164 pi
->pg_autoscale_mode
= m
;
8166 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
// Cap the initial PG count at mon_osd_max_initial_pgs; the target values
// keep the requested numbers so the autoscaler/mgr can grow them later.
8168 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
8170 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
8172 pi
->set_pg_num_pending(pi
->get_pg_num());
8173 pi
->set_pg_num_target(pg_num
);
8174 pi
->set_pgp_num(pi
->get_pg_num());
8175 pi
->set_pgp_num_target(pgp_num
);
// PG_NUM_MIN is nautilus+, PG_NUM_MAX is quincy+ (release-gated opts).
8176 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8178 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
8180 if (osdmap
.require_osd_release
>= ceph_release_t::quincy
&&
8182 pi
->opts
.set(pool_opts_t::PG_NUM_MAX
, static_cast<int64_t>(pg_num_max
));
// Explicit per-pool autoscale mode overrides the config default set above.
8184 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8185 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8186 pi
->pg_autoscale_mode
= m
;
8189 pi
->last_change
= pending_inc
.epoch
;
8192 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8193 pi
->erasure_code_profile
= erasure_code_profile
;
8195 pi
->erasure_code_profile
= "";
8197 pi
->stripe_width
= stripe_width
;
8199 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8200 target_size_bytes
) {
8201 // only store for nautilus+ because TARGET_SIZE_BYTES may be
8202 // larger than int32_t max.
8203 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
8205 if (target_size_ratio
> 0.0 &&
8206 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
8207 // only store for nautilus+, just to be consistent and tidy.
8208 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
// Cache-tier defaults (ratios stored in micro units, hence * 1000000).
8211 pi
->cache_target_dirty_ratio_micro
=
8212 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
8213 pi
->cache_target_dirty_high_ratio_micro
=
8214 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
8215 pi
->cache_target_full_ratio_micro
=
8216 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
8217 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
8218 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
// Finally stage the name -> id mapping for the commit.
8220 pending_inc
.new_pool_names
[pool
] = name
;
8224 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
8226 op
->mark_osdmon_event(__func__
);
8228 if (pending_inc
.new_flags
< 0)
8229 pending_inc
.new_flags
= osdmap
.get_flags();
8230 pending_inc
.new_flags
|= flag
;
8231 ss
<< OSDMap::get_flag_string(flag
) << " is set";
8232 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8233 get_last_committed() + 1));
8237 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
8239 op
->mark_osdmon_event(__func__
);
8241 if (pending_inc
.new_flags
< 0)
8242 pending_inc
.new_flags
= osdmap
.get_flags();
8243 pending_inc
.new_flags
&= ~flag
;
8244 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
8245 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8246 get_last_committed() + 1));
// Handle `ceph osd pool set <pool> <var> <val>`: parse the value as string /
// int / float as each variable requires, validate it, apply it to a working
// copy of the pg_pool_t, and stage the result in pending_inc.new_pools.
// NOTE(review): extraction-damaged span — statements are split across lines,
// original file line numbers are fused into the code, and many interior lines
// (returns, braces, several branch bodies) were dropped (embedded numbering
// jumps). Tokens are preserved as-is; restore from upstream before compiling.
8250 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
// Resolve the target pool; start from the committed pg_pool_t but prefer an
// already-staged pending copy so repeated sets in one interval compose.
8254 cmd_getval(cmdmap
, "pool", poolstr
);
8255 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
8257 ss
<< "unrecognized pool '" << poolstr
<< "'";
8261 cmd_getval(cmdmap
, "var", var
);
8263 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8264 if (pending_inc
.new_pools
.count(pool
))
8265 p
= pending_inc
.new_pools
[pool
];
8267 // accept val as a json string in the normal case (current
8268 // generation monitor). parse out int or float values from the
8269 // string as needed. however, if it is not a string, try to pull
8270 // out an int, in case an older monitor with an older json schema is
8271 // forwarding a request.
8273 string interr
, floaterr
;
8276 int64_t uf
= 0; // micro-f
8277 cmd_getval(cmdmap
, "val", val
);
// Variables with SI suffixes vs IEC (binary) suffixes get dedicated parsers;
// everything else is parsed as both int and float, errors kept per-type.
8280 "target_max_objects"
8282 auto iec_options
= {
8284 "target_size_bytes",
8285 "compression_max_blob_size",
8286 "compression_min_blob_size",
8290 if (count(begin(si_options
), end(si_options
), var
)) {
8291 n
= strict_si_cast
<int64_t>(val
, &interr
);
8292 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
8293 n
= strict_iec_cast
<int64_t>(val
, &interr
);
8295 // parse string as both int and float; different fields use different types.
8296 n
= strict_strtoll(val
.c_str(), 10, &interr
);
8297 f
= strict_strtod(val
.c_str(), &floaterr
);
8298 uf
= llrintl(f
* (double)1000000.0);
// Cache-tier-only variables are gated (condition partially elided here).
8302 (var
== "hit_set_type" || var
== "hit_set_period" ||
8303 var
== "hit_set_count" || var
== "hit_set_fpp" ||
8304 var
== "target_max_objects" || var
== "target_max_bytes" ||
8305 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
8306 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
8307 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
8308 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
8309 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
// --- "size": replicated-pool replica count -------------------------------
8313 if (var
== "size") {
8314 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8315 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
8318 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
8319 ss
<< "can not change the size of an erasure-coded pool";
8322 if (interr
.length()) {
8323 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8326 if (n
<= 0 || n
> 10) {
8327 ss
<< "pool size must be between 1 and 10";
8331 if (!g_conf().get_val
<bool>("mon_allow_pool_size_one")) {
8332 ss
<< "configuring pool size as 1 is disabled by default.";
8336 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
8337 if (!sure
) { ss
<< "WARNING: setting pool size 1 could lead to data loss "
8338 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8339 "pass the flag --yes-i-really-mean-it.";
8343 if (osdmap
.crush
->get_rule_type(p
.get_crush_rule()) != (int)p
.type
) {
8344 ss
<< "crush rule " << p
.get_crush_rule() << " type does not match pool";
8347 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, p
.get_crush_rule(), &ss
);
// min_size is re-derived from the new size via the configured default.
8352 p
.min_size
= g_conf().get_osd_pool_default_min_size(p
.size
);
// --- "min_size": floor for I/O; EC pools bound it by data-chunk count k ---
8353 } else if (var
== "min_size") {
8354 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8355 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8358 if (interr
.length()) {
8359 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8363 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
8364 if (n
< 1 || n
> p
.size
) {
8365 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
8369 ErasureCodeInterfaceRef erasure_code
;
8372 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
8374 k
= erasure_code
->get_data_chunk_count();
8376 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
8380 if (n
< k
|| n
> p
.size
) {
8381 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
// --- "pg_num_actual": direct pg_num adjustment (nautilus merge machinery) -
8386 } else if (var
== "pg_num_actual") {
8387 if (interr
.length()) {
8388 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8391 if (n
== (int)p
.get_pg_num()) {
8394 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8395 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8396 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8397 << " (you may adjust 'mon max pool pg num' for higher values)";
8400 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8401 ss
<< "cannot adjust pg_num while initial PGs are being created";
8404 if (n
> (int)p
.get_pg_num()) {
8405 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
8406 // force pre-nautilus clients to resend their ops, since they
8407 // don't understand pg_num_pending changes form a new interval
8408 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8412 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8413 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8416 if (n
< (int)p
.get_pgp_num()) {
8417 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8420 if (n
< (int)p
.get_pg_num() - 1) {
8421 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8422 << ") - 1; only single pg decrease is currently supported";
8425 p
.set_pg_num_pending(n
);
8426 // force pre-nautilus clients to resend their ops, since they
8427 // don't understand pg_num_pending changes form a new interval
8428 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8430 // force pre-luminous clients to resend their ops, since they
8431 // don't understand that split PGs now form a new interval.
8432 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
// --- "pg_num": sets the target; mgr converges actual pg_num/pgp_num -------
8433 } else if (var
== "pg_num") {
8434 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8435 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8438 if (interr
.length()) {
8439 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8442 if (n
== (int)p
.get_pg_num_target()) {
8445 if (n
<= 0 || static_cast<uint64_t>(n
) >
8446 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8447 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8448 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8449 << " (you may adjust 'mon max pool pg num' for higher values)";
8452 if (n
> (int)p
.get_pg_num_target()) {
8453 int r
= check_pg_num(pool
, n
, p
.get_size(), p
.get_crush_rule(), &ss
);
8458 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8459 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8460 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8464 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8465 ss
<< "nautilus OSDs are required to decrease pg_num";
8469 int64_t pg_min
= 0, pg_max
= 0;
8470 p
.opts
.get(pool_opts_t::PG_NUM_MIN
, &pg_min
);
8471 p
.opts
.get(pool_opts_t::PG_NUM_MAX
, &pg_max
);
8472 if (pg_min
&& n
< pg_min
) {
8473 ss
<< "specified pg_num " << n
8474 << " < pg_num_min " << pg_min
;
8477 if (pg_max
&& n
> pg_max
) {
8478 ss
<< "specified pg_num " << n
// NOTE(review): message says "<" for a value exceeding pg_num_max — looks
// like it should read ">"; confirm against upstream before changing.
8479 << " < pg_num_max " << pg_max
;
8482 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8483 // pre-nautilus osdmap format; increase pg_num directly
8484 assert(n
> (int)p
.get_pg_num());
8485 // force pre-nautilus clients to resend their ops, since they
8486 // don't understand pg_num_target changes form a new interval
8487 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8488 // force pre-luminous clients to resend their ops, since they
8489 // don't understand that split PGs now form a new interval.
8490 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8493 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8494 // make pgp_num track pg_num if it already matches. if it is set
8495 // differently, leave it different and let the user control it
8497 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8498 p
.set_pgp_num_target(n
);
8500 p
.set_pg_num_target(n
);
// --- "pgp_num_actual" / "pgp_num" ----------------------------------------
8502 } else if (var
== "pgp_num_actual") {
8503 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8504 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8507 if (interr
.length()) {
8508 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8512 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8515 if (n
> (int)p
.get_pg_num()) {
8516 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8519 if (n
> (int)p
.get_pg_num_pending()) {
8520 ss
<< "specified pgp_num " << n
8521 << " > pg_num_pending " << p
.get_pg_num_pending();
8525 } else if (var
== "pgp_num") {
8526 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8527 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8530 if (interr
.length()) {
8531 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8535 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8538 if (n
> (int)p
.get_pg_num_target()) {
8539 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8542 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8543 // pre-nautilus osdmap format; increase pgp_num directly
8546 p
.set_pgp_num_target(n
);
// --- "pg_autoscale_mode" / "crush_rule" ----------------------------------
8548 } else if (var
== "pg_autoscale_mode") {
8549 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8550 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8551 ss
<< "specified invalid mode " << val
;
8554 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8555 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8558 p
.pg_autoscale_mode
= m
;
8559 } else if (var
== "crush_rule") {
8560 int id
= osdmap
.crush
->get_rule_id(val
);
8561 if (id
== -ENOENT
) {
8562 ss
<< "crush rule " << val
<< " does not exist";
8566 ss
<< cpp_strerror(id
);
8569 if (osdmap
.crush
->get_rule_type(id
) != (int)p
.get_type()) {
8570 ss
<< "crush rule " << id
<< " type does not match pool";
// --- boolean pool flags (set/unset by name) ------------------------------
8574 } else if (var
== "nodelete" || var
== "nopgchange" ||
8575 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8576 var
== "noscrub" || var
== "nodeep-scrub" || var
== "bulk") {
8577 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8578 // make sure we only compare against 'n' if we didn't receive a string
8579 if (val
== "true" || (interr
.empty() && n
== 1)) {
8581 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8584 ss
<< "expecting value 'true', 'false', '0', or '1'";
8587 } else if (var
== "eio") {
8588 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8590 // make sure we only compare against 'n' if we didn't receive a string
8591 if (val
== "true" || (interr
.empty() && n
== 1)) {
8593 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8596 ss
<< "expecting value 'true', 'false', '0', or '1'";
8599 } else if (var
== "hashpspool") {
8600 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8602 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8605 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8606 " this triggers large data movement,"
8607 " pass --yes-i-really-mean-it if you really do.";
8610 // make sure we only compare against 'n' if we didn't receive a string
8611 if (val
== "true" || (interr
.empty() && n
== 1)) {
8613 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8616 ss
<< "expecting value 'true', 'false', '0', or '1'";
// --- cache-tier hit-set configuration ------------------------------------
8619 } else if (var
== "hit_set_type") {
8621 p
.hit_set_params
= HitSet::Params();
8623 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8626 if (val
== "bloom") {
8627 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8628 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8629 p
.hit_set_params
= HitSet::Params(bsp
);
8630 } else if (val
== "explicit_hash")
8631 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8632 else if (val
== "explicit_object")
8633 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8635 ss
<< "unrecognized hit_set type '" << val
<< "'";
8639 } else if (var
== "hit_set_period") {
8640 if (interr
.length()) {
8641 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8644 ss
<< "hit_set_period should be non-negative";
8647 p
.hit_set_period
= n
;
8648 } else if (var
== "hit_set_count") {
8649 if (interr
.length()) {
8650 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8653 ss
<< "hit_set_count should be non-negative";
8656 p
.hit_set_count
= n
;
8657 } else if (var
== "hit_set_fpp") {
8658 if (floaterr
.length()) {
8659 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8661 } else if (f
< 0 || f
> 1.0) {
8662 ss
<< "hit_set_fpp should be in the range 0..1";
8665 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8666 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8669 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
8671 } else if (var
== "use_gmt_hitset") {
8672 if (val
== "true" || (interr
.empty() && n
== 1)) {
8673 p
.use_gmt_hitset
= true;
8675 ss
<< "expecting value 'true' or '1'";
// --- EC overwrites (bluestore-only, one-way enable) ----------------------
8678 } else if (var
== "allow_ec_overwrites") {
8679 if (!p
.is_erasure()) {
8680 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8684 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8685 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8686 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8689 if (val
== "true" || (interr
.empty() && n
== 1)) {
8690 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8691 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8692 ss
<< "ec overwrites cannot be disabled once enabled";
8695 ss
<< "expecting value 'true', 'false', '0', or '1'";
// --- cache-tier targets and ages -----------------------------------------
8698 } else if (var
== "target_max_objects") {
8699 if (interr
.length()) {
8700 ss
<< "error parsing int '" << val
<< "': " << interr
;
8703 p
.target_max_objects
= n
;
8704 } else if (var
== "target_max_bytes") {
8705 if (interr
.length()) {
8706 ss
<< "error parsing int '" << val
<< "': " << interr
;
8709 p
.target_max_bytes
= n
;
8710 } else if (var
== "cache_target_dirty_ratio") {
8711 if (floaterr
.length()) {
8712 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8715 if (f
< 0 || f
> 1.0) {
8716 ss
<< "value must be in the range 0..1";
8719 p
.cache_target_dirty_ratio_micro
= uf
;
8720 } else if (var
== "cache_target_dirty_high_ratio") {
8721 if (floaterr
.length()) {
8722 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8725 if (f
< 0 || f
> 1.0) {
8726 ss
<< "value must be in the range 0..1";
8729 p
.cache_target_dirty_high_ratio_micro
= uf
;
8730 } else if (var
== "cache_target_full_ratio") {
8731 if (floaterr
.length()) {
8732 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8735 if (f
< 0 || f
> 1.0) {
8736 ss
<< "value must be in the range 0..1";
8739 p
.cache_target_full_ratio_micro
= uf
;
8740 } else if (var
== "cache_min_flush_age") {
8741 if (interr
.length()) {
8742 ss
<< "error parsing int '" << val
<< "': " << interr
;
8745 p
.cache_min_flush_age
= n
;
8746 } else if (var
== "cache_min_evict_age") {
8747 if (interr
.length()) {
8748 ss
<< "error parsing int '" << val
<< "': " << interr
;
8751 p
.cache_min_evict_age
= n
;
8752 } else if (var
== "min_read_recency_for_promote") {
8753 if (interr
.length()) {
8754 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8757 p
.min_read_recency_for_promote
= n
;
8758 } else if (var
== "hit_set_grade_decay_rate") {
8759 if (interr
.length()) {
8760 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8763 if (n
> 100 || n
< 0) {
8764 ss
<< "value out of range,valid range is 0 - 100";
8767 p
.hit_set_grade_decay_rate
= n
;
8768 } else if (var
== "hit_set_search_last_n") {
8769 if (interr
.length()) {
8770 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8773 if (n
> p
.hit_set_count
|| n
< 0) {
8774 ss
<< "value out of range,valid range is 0 - hit_set_count";
8777 p
.hit_set_search_last_n
= n
;
8778 } else if (var
== "min_write_recency_for_promote") {
8779 if (interr
.length()) {
8780 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8783 p
.min_write_recency_for_promote
= n
;
8784 } else if (var
== "fast_read") {
8785 if (p
.is_replicated()) {
8786 ss
<< "fast read is not supported in replication pool";
8789 if (val
== "true" || (interr
.empty() && n
== 1)) {
8791 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8792 p
.fast_read
= false;
8794 ss
<< "expecting value 'true', 'false', '0', or '1'";
// --- generic pool_opts_t options (typed STR / INT / DOUBLE) --------------
8797 } else if (pool_opts_t::is_opt_name(var
)) {
8798 bool unset
= val
== "unset";
8799 if (var
== "compression_mode") {
8801 auto cmode
= Compressor::get_comp_mode_type(val
);
8803 ss
<< "unrecognized compression mode '" << val
<< "'";
8807 } else if (var
== "compression_algorithm") {
8809 auto alg
= Compressor::get_comp_alg_type(val
);
8811 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8815 } else if (var
== "compression_required_ratio") {
8816 if (floaterr
.length()) {
8817 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8820 if (f
< 0 || f
> 1) {
8821 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8824 } else if (var
== "csum_type") {
8825 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8827 ss
<< "unrecognized csum_type '" << val
<< "'";
8830 //preserve csum_type numeric value
8833 } else if (var
== "compression_max_blob_size" ||
8834 var
== "compression_min_blob_size" ||
8835 var
== "csum_max_block" ||
8836 var
== "csum_min_block") {
8837 if (interr
.length()) {
8838 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8841 } else if (var
== "fingerprint_algorithm") {
8843 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8845 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8849 } else if (var
== "target_size_bytes") {
8850 if (interr
.length()) {
8851 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8854 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8855 ss
<< "must set require_osd_release to nautilus or "
8856 << "later before setting target_size_bytes";
8859 } else if (var
== "target_size_ratio") {
8861 ss
<< "target_size_ratio cannot be negative";
8864 } else if (var
== "pg_num_min") {
8865 if (interr
.length()) {
8866 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8869 if (n
> (int)p
.get_pg_num_target()) {
8870 ss
<< "specified pg_num_min " << n
8871 << " > pg_num " << p
.get_pg_num_target();
8874 } else if (var
== "pg_num_max") {
8875 if (interr
.length()) {
8876 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8879 if (n
&& n
< (int)p
.get_pg_num_target()) {
8880 ss
<< "specified pg_num_max " << n
8881 << " < pg_num " << p
.get_pg_num_target();
8884 } else if (var
== "recovery_priority") {
8885 if (interr
.length()) {
8886 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8889 if (!g_conf()->debug_allow_any_pool_priority
) {
8890 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8891 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8892 << " and " << OSD_POOL_PRIORITY_MAX
;
8896 } else if (var
== "pg_autoscale_bias") {
8897 if (f
< 0.0 || f
> 1000.0) {
8898 ss
<< "pg_autoscale_bias must be between 0 and 1000";
8901 } else if (var
== "dedup_tier") {
8902 if (interr
.empty()) {
8903 ss
<< "expecting value 'pool name'";
8906 // Current base tier in dedup does not support ec pool
8907 if (p
.is_erasure()) {
8908 ss
<< "pool '" << poolstr
8909 << "' is an ec pool, which cannot be a base tier";
8912 int64_t lowtierpool_id
= osdmap
.lookup_pg_pool_name(val
);
8913 if (lowtierpool_id
< 0) {
8914 ss
<< "unrecognized pool '" << val
<< "'";
8917 const pg_pool_t
*tp
= osdmap
.get_pg_pool(lowtierpool_id
);
8920 // The original input is string (pool name), but we convert it to int64_t.
8923 } else if (var
== "dedup_chunk_algorithm") {
8925 auto alg
= pg_pool_t::get_dedup_chunk_algorithm_from_str(val
);
// NOTE(review): message says "fingerprint_algorithm" in the
// dedup_chunk_algorithm branch — likely copy/paste; confirm upstream.
8927 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8931 } else if (var
== "dedup_cdc_chunk_size") {
8932 if (interr
.length()) {
8933 ss
<< "error parsing int value '" << val
<< "': " << interr
;
// Fallback: store/unset the option via its typed pool_opts_t descriptor.
8938 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8939 switch (desc
.type
) {
8940 case pool_opts_t::STR
:
8942 p
.opts
.unset(desc
.key
);
8944 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
8947 case pool_opts_t::INT
:
8948 if (interr
.length()) {
8949 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8953 p
.opts
.unset(desc
.key
);
8955 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
8958 case pool_opts_t::DOUBLE
:
8959 if (floaterr
.length()) {
8960 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8964 p
.opts
.unset(desc
.key
);
8966 p
.opts
.set(desc
.key
, static_cast<double>(f
));
8970 ceph_assert(!"unknown type");
8973 ss
<< "unrecognized variable '" << var
<< "'";
// Report and stage the modified pool for the next commit.
8976 if (val
!= "unset") {
8977 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
8979 ss
<< "unset pool " << pool
<< " " << var
;
8981 p
.last_change
= pending_inc
.epoch
;
8982 pending_inc
.new_pools
[pool
] = p
;
8986 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
8987 const cmdmap_t
& cmdmap
,
8990 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
8993 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
8994 const cmdmap_t
& cmdmap
,
8998 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
9003 * Common logic for preprocess and prepare phases of pool application
9004 * tag commands. In preprocess mode we're only detecting invalid
9005 * commands, and determining whether it was a modification or a no-op.
9006 * In prepare mode we're actually updating the pending state.
9008 int OSDMonitor::_command_pool_application(const string
&prefix
,
9009 const cmdmap_t
& cmdmap
,
9015 cmd_getval(cmdmap
, "pool", pool_name
);
9016 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
9018 ss
<< "unrecognized pool '" << pool_name
<< "'";
9022 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
9024 if (pending_inc
.new_pools
.count(pool
)) {
9025 p
= pending_inc
.new_pools
[pool
];
9030 cmd_getval(cmdmap
, "app", app
);
9031 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
9034 cmd_getval(cmdmap
, "key", key
);
9036 ss
<< "key cannot be 'all'";
9041 cmd_getval(cmdmap
, "value", value
);
9042 if (value
== "all") {
9043 ss
<< "value cannot be 'all'";
9047 if (boost::algorithm::ends_with(prefix
, "enable")) {
9049 ss
<< "application name must be provided";
9054 ss
<< "application must be enabled on base tier";
9059 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
9061 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
9062 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
9063 << "application; pass --yes-i-really-mean-it to proceed anyway";
9067 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
9068 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
9069 << "max " << MAX_POOL_APPLICATIONS
;
9073 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9074 ss
<< "application name '" << app
<< "' too long; max length "
9075 << MAX_POOL_APPLICATION_LENGTH
;
9080 p
.application_metadata
[app
] = {};
9082 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
9084 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
9086 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
9089 ss
<< "Are you SURE? Disabling an application within a pool might result "
9090 << "in loss of application functionality; pass "
9091 << "--yes-i-really-mean-it to proceed anyway";
9096 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9098 return 0; // idempotent
9101 p
.application_metadata
.erase(app
);
9102 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
9104 } else if (boost::algorithm::ends_with(prefix
, "set")) {
9106 ss
<< "application metadata must be set on base tier";
9111 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9117 cmd_getval(cmdmap
, "key", key
);
9120 ss
<< "key must be provided";
9124 auto &app_keys
= p
.application_metadata
[app
];
9125 if (app_keys
.count(key
) == 0 &&
9126 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
9127 ss
<< "too many keys set for application '" << app
<< "' on pool '"
9128 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
9132 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9133 ss
<< "key '" << app
<< "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH
;
9139 cmd_getval(cmdmap
, "value", value
);
9140 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9141 ss
<< "value '" << value
<< "' too long; max length "
9142 << MAX_POOL_APPLICATION_LENGTH
;
9146 p
.application_metadata
[app
][key
] = value
;
9147 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
9148 << value
<< "' on pool '" << pool_name
<< "'";
9149 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
9151 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9157 cmd_getval(cmdmap
, "key", key
);
9158 auto it
= p
.application_metadata
[app
].find(key
);
9159 if (it
== p
.application_metadata
[app
].end()) {
9160 ss
<< "application '" << app
<< "' on pool '" << pool_name
9161 << "' does not have key '" << key
<< "'";
9162 return 0; // idempotent
9165 p
.application_metadata
[app
].erase(it
);
9166 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
9167 << pool_name
<< "'";
9173 p
.last_change
= pending_inc
.epoch
;
9174 pending_inc
.new_pools
[pool
] = p
;
9177 // Because we fell through this far, we didn't hit no-op cases,
9178 // so pool was definitely modified
9179 if (modified
!= nullptr) {
9186 int OSDMonitor::_prepare_command_osd_crush_remove(
9187 CrushWrapper
&newcrush
,
9196 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
9199 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
9204 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
9206 pending_inc
.crush
.clear();
9207 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9210 int OSDMonitor::prepare_command_osd_crush_remove(
9211 CrushWrapper
&newcrush
,
9217 int err
= _prepare_command_osd_crush_remove(
9218 newcrush
, id
, ancestor
,
9219 has_ancestor
, unlink_only
);
9224 ceph_assert(err
== 0);
9225 do_osd_crush_remove(newcrush
);
9230 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
9232 if (osdmap
.is_up(id
)) {
9236 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
9237 pending_inc
.new_uuid
[id
] = uuid_d();
9238 pending_metadata_rm
.insert(id
);
9239 pending_metadata
.erase(id
);
9244 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
9246 ceph_assert(existing_id
);
9249 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
9250 if (!osdmap
.exists(i
) &&
9251 pending_inc
.new_up_client
.count(i
) == 0 &&
9252 (pending_inc
.new_state
.count(i
) == 0 ||
9253 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
9259 if (pending_inc
.new_max_osd
< 0) {
9260 return osdmap
.get_max_osd();
9262 return pending_inc
.new_max_osd
;
9265 void OSDMonitor::do_osd_create(
9268 const string
& device_class
,
9271 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
9272 ceph_assert(new_id
);
9274 // We presume validation has been performed prior to calling this
9275 // function. We assert with prejudice.
9277 int32_t allocated_id
= -1; // declare here so we can jump
9278 int32_t existing_id
= -1;
9279 if (!uuid
.is_zero()) {
9280 existing_id
= osdmap
.identify_osd(uuid
);
9281 if (existing_id
>= 0) {
9282 ceph_assert(id
< 0 || id
== existing_id
);
9283 *new_id
= existing_id
;
9285 } else if (id
>= 0) {
9286 // uuid does not exist, and id has been provided, so just create
9293 // allocate a new id
9294 allocated_id
= _allocate_osd_id(&existing_id
);
9295 dout(10) << __func__
<< " allocated id " << allocated_id
9296 << " existing id " << existing_id
<< dendl
;
9297 if (existing_id
>= 0) {
9298 ceph_assert(existing_id
< osdmap
.get_max_osd());
9299 ceph_assert(allocated_id
< 0);
9300 *new_id
= existing_id
;
9301 } else if (allocated_id
>= 0) {
9302 ceph_assert(existing_id
< 0);
9304 if (pending_inc
.new_max_osd
< 0) {
9305 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
9307 ++pending_inc
.new_max_osd
;
9309 *new_id
= pending_inc
.new_max_osd
- 1;
9310 ceph_assert(*new_id
== allocated_id
);
9312 ceph_abort_msg("unexpected condition");
9316 if (device_class
.size()) {
9317 CrushWrapper newcrush
= _get_pending_crush();
9318 if (newcrush
.get_max_devices() < *new_id
+ 1) {
9319 newcrush
.set_max_devices(*new_id
+ 1);
9321 string name
= string("osd.") + stringify(*new_id
);
9322 if (!newcrush
.item_exists(*new_id
)) {
9323 newcrush
.set_item_name(*new_id
, name
);
9326 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
9328 derr
<< __func__
<< " failed to set " << name
<< " device_class "
9329 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
9331 // non-fatal... this might be a replay and we want to be idempotent.
9333 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
9335 pending_inc
.crush
.clear();
9336 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9339 dout(20) << __func__
<< " no device_class" << dendl
;
9342 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
9343 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
9344 pending_inc
.new_max_osd
= *new_id
+ 1;
9347 pending_inc
.new_weight
[*new_id
] = CEPH_OSD_IN
;
9348 // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
9349 // set it for us. (ugh.)
9350 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_NEW
;
9351 if (!uuid
.is_zero())
9352 pending_inc
.new_uuid
[*new_id
] = uuid
;
9355 int OSDMonitor::validate_osd_create(
9358 const bool check_osd_exists
,
9359 int32_t* existing_id
,
9363 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
9364 << " check_osd_exists " << check_osd_exists
<< dendl
;
9366 ceph_assert(existing_id
);
9368 if (id
< 0 && uuid
.is_zero()) {
9369 // we have nothing to validate
9372 } else if (uuid
.is_zero()) {
9373 // we have an id but we will ignore it - because that's what
9374 // `osd create` does.
9379 * This function will be used to validate whether we are able to
9380 * create a new osd when the `uuid` is specified.
9382 * It will be used by both `osd create` and `osd new`, as the checks
9383 * are basically the same when it pertains to osd id and uuid validation.
9384 * However, `osd create` presumes an `uuid` is optional, for legacy
9385 * reasons, while `osd new` requires the `uuid` to be provided. This
9386 * means that `osd create` will not be idempotent if an `uuid` is not
9387 * provided, but we will always guarantee the idempotency of `osd new`.
9390 ceph_assert(!uuid
.is_zero());
9391 if (pending_inc
.identify_osd(uuid
) >= 0) {
9392 // osd is about to exist
9396 int32_t i
= osdmap
.identify_osd(uuid
);
9398 // osd already exists
9399 if (id
>= 0 && i
!= id
) {
9400 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
9403 // return a positive errno to distinguish between a blocking error
9404 // and an error we consider to not be a problem (i.e., this would be
9405 // an idempotent operation).
9411 if (pending_inc
.new_state
.count(id
)) {
9412 // osd is about to exist
9415 // we may not care if an osd exists if we are recreating a previously
9417 if (check_osd_exists
&& osdmap
.exists(id
)) {
9418 ss
<< "id " << id
<< " already in use and does not match uuid "
9426 int OSDMonitor::prepare_command_osd_create(
9429 int32_t* existing_id
,
9432 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9433 ceph_assert(existing_id
);
9434 if (osdmap
.is_destroyed(id
)) {
9435 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
9440 if (uuid
.is_zero()) {
9441 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
9444 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
9447 int OSDMonitor::prepare_command_osd_new(
9449 const cmdmap_t
& cmdmap
,
9450 const map
<string
,string
>& params
,
9458 ceph_assert(paxos
.is_plugged());
9460 dout(10) << __func__
<< " " << op
<< dendl
;
9462 /* validate command. abort now if something's wrong. */
9464 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9466 * If `id` is not specified, we will identify any existing osd based
9467 * on `uuid`. Operation will be idempotent iff secrets match.
9469 * If `id` is specified, we will identify any existing osd based on
9470 * `uuid` and match against `id`. If they match, operation will be
9471 * idempotent iff secrets match.
9473 * `-i secrets.json` will be optional. If supplied, will be used
9474 * to check for idempotency when `id` and `uuid` match.
9476 * If `id` is not specified, and `uuid` does not exist, an id will
9477 * be found or allocated for the osd.
9479 * If `id` is specified, and the osd has been previously marked
9480 * as destroyed, then the `id` will be reused.
9482 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
9483 ss
<< "requires the OSD's UUID to be specified.";
9485 } else if (!uuid
.parse(uuidstr
.c_str())) {
9486 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9490 if (cmd_getval(cmdmap
, "id", id
) &&
9492 ss
<< "invalid OSD id; must be greater or equal than zero.";
9496 // are we running an `osd create`-like command, or recreating
9497 // a previously destroyed osd?
9499 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9501 // we will care about `id` to assess whether osd is `destroyed`, or
9502 // to create a new osd.
9503 // we will need an `id` by the time we reach auth.
9505 int32_t existing_id
= -1;
9506 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9509 bool may_be_idempotent
= false;
9510 if (err
== EEXIST
) {
9511 // this is idempotent from the osdmon's point-of-view
9512 may_be_idempotent
= true;
9513 ceph_assert(existing_id
>= 0);
9515 } else if (err
< 0) {
9519 if (!may_be_idempotent
) {
9520 // idempotency is out of the window. We are either creating a new
9521 // osd or recreating a destroyed osd.
9523 // We now need to figure out if we have an `id` (and if it's valid),
9524 // of find an `id` if we don't have one.
9526 // NOTE: we need to consider the case where the `id` is specified for
9527 // `osd create`, and we must honor it. So this means checking if
9528 // the `id` is destroyed, and if so assume the destroy; otherwise,
9529 // check if it `exists` - in which case we complain about not being
9530 // `destroyed`. In the end, if nothing fails, we must allow the
9531 // creation, so that we are compatible with `create`.
9532 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9533 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9534 ss
<< "OSD " << id
<< " has not yet been destroyed";
9536 } else if (id
< 0) {
9538 id
= _allocate_osd_id(&existing_id
);
9540 ceph_assert(existing_id
>= 0);
9543 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9544 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9545 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9547 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9550 ceph_assert(id
>= 0);
9551 ceph_assert(osdmap
.exists(id
));
9554 // we are now able to either create a brand new osd or reuse an existing
9555 // osd that has been previously destroyed.
9557 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9559 if (may_be_idempotent
&& params
.empty()) {
9560 // nothing to do, really.
9561 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9562 ceph_assert(id
>= 0);
9564 f
->open_object_section("created_osd");
9565 f
->dump_int("osdid", id
);
9573 string device_class
;
9574 auto p
= params
.find("crush_device_class");
9575 if (p
!= params
.end()) {
9576 device_class
= p
->second
;
9577 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9579 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9580 bool has_lockbox
= false;
9581 bool has_secrets
= params
.count("cephx_secret")
9582 || params
.count("cephx_lockbox_secret")
9583 || params
.count("dmcrypt_key");
9585 KVMonitor
*svc
= nullptr;
9586 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9589 if (params
.count("cephx_secret") == 0) {
9590 ss
<< "requires a cephx secret.";
9593 cephx_secret
= params
.at("cephx_secret");
9595 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9596 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9598 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9599 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9601 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9603 lockbox_secret
= params
.at("cephx_lockbox_secret");
9604 dmcrypt_key
= params
.at("dmcrypt_key");
9605 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9606 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9610 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9612 err
= mon
.authmon()->validate_osd_new(id
, uuid
,
9620 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9621 // for this to be idempotent, `id` should already be >= 0; no need
9622 // to use validate_id.
9623 ceph_assert(id
>= 0);
9624 ss
<< "osd." << id
<< " exists but secrets do not match";
9630 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9633 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9634 ceph_assert(id
>= 0);
9635 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9640 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9641 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9643 if (may_be_idempotent
) {
9644 // we have nothing to do for either the osdmon or the authmon,
9645 // and we have no lockbox - so the config key service will not be
9646 // touched. This is therefore an idempotent operation, and we can
9647 // just return right away.
9648 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9649 ceph_assert(id
>= 0);
9651 f
->open_object_section("created_osd");
9652 f
->dump_int("osdid", id
);
9659 ceph_assert(!may_be_idempotent
);
9663 ceph_assert(!cephx_secret
.empty());
9664 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9665 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9667 err
= mon
.authmon()->do_osd_new(cephx_entity
,
9670 ceph_assert(0 == err
);
9673 ceph_assert(nullptr != svc
);
9674 svc
->do_osd_new(uuid
, dmcrypt_key
);
9678 if (is_recreate_destroyed
) {
9679 ceph_assert(id
>= 0);
9680 ceph_assert(osdmap
.is_destroyed(id
));
9681 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9682 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9683 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9685 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9686 // due to http://tracker.ceph.com/issues/20751 some clusters may
9687 // have UP set for non-existent OSDs; make sure it is cleared
9688 // for a newly created osd.
9689 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9691 pending_inc
.new_uuid
[id
] = uuid
;
9693 ceph_assert(id
>= 0);
9694 int32_t new_id
= -1;
9695 do_osd_create(id
, uuid
, device_class
, &new_id
);
9696 ceph_assert(new_id
>= 0);
9697 ceph_assert(id
== new_id
);
9701 f
->open_object_section("created_osd");
9702 f
->dump_int("osdid", id
);
9711 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9713 op
->mark_osdmon_event(__func__
);
9714 auto m
= op
->get_req
<MMonCommand
>();
9717 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9718 string rs
= ss
.str();
9719 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
9723 MonSession
*session
= op
->get_session();
9725 derr
<< __func__
<< " no session" << dendl
;
9726 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
9730 return prepare_command_impl(op
, cmdmap
);
9733 static int parse_reweights(CephContext
*cct
,
9734 const cmdmap_t
& cmdmap
,
9735 const OSDMap
& osdmap
,
9736 map
<int32_t, uint32_t>* weights
)
9739 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9742 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9743 json_spirit::mValue json_value
;
9744 if (!json_spirit::read(weights_str
, json_value
)) {
9747 if (json_value
.type() != json_spirit::obj_type
) {
9750 const auto obj
= json_value
.get_obj();
9752 for (auto& osd_weight
: obj
) {
9753 auto osd_id
= std::stoi(osd_weight
.first
);
9754 if (!osdmap
.exists(osd_id
)) {
9757 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9760 auto weight
= std::stoul(osd_weight
.second
.get_str());
9761 weights
->insert({osd_id
, weight
});
9763 } catch (const std::logic_error
& e
) {
9769 int OSDMonitor::prepare_command_osd_destroy(
9773 ceph_assert(paxos
.is_plugged());
9775 // we check if the osd exists for the benefit of `osd purge`, which may
9776 // have previously removed the osd. If the osd does not exist, return
9777 // -ENOENT to convey this, and let the caller deal with it.
9779 // we presume that all auth secrets and config keys were removed prior
9780 // to this command being called. if they exist by now, we also assume
9781 // they must have been created by some other command and do not pertain
9782 // to this non-existent osd.
9783 if (!osdmap
.exists(id
)) {
9784 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9788 uuid_d uuid
= osdmap
.get_uuid(id
);
9789 dout(10) << __func__
<< " destroying osd." << id
9790 << " uuid " << uuid
<< dendl
;
9792 // if it has been destroyed, we assume our work here is done.
9793 if (osdmap
.is_destroyed(id
)) {
9794 ss
<< "destroyed osd." << id
;
9798 EntityName cephx_entity
, lockbox_entity
;
9799 bool idempotent_auth
= false, idempotent_cks
= false;
9801 int err
= mon
.authmon()->validate_osd_destroy(id
, uuid
,
9806 if (err
== -ENOENT
) {
9807 idempotent_auth
= true;
9813 auto svc
= mon
.kvmon();
9814 err
= svc
->validate_osd_destroy(id
, uuid
);
9816 ceph_assert(err
== -ENOENT
);
9818 idempotent_cks
= true;
9821 if (!idempotent_auth
) {
9822 err
= mon
.authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9823 ceph_assert(0 == err
);
9826 if (!idempotent_cks
) {
9827 svc
->do_osd_destroy(id
, uuid
);
9830 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9831 pending_inc
.new_uuid
[id
] = uuid_d();
9833 // we can only propose_pending() once per service, otherwise we'll be
9834 // defying PaxosService and all laws of nature. Therefore, as we may
9835 // be used during 'osd purge', let's keep the caller responsible for
9837 ceph_assert(err
== 0);
9841 int OSDMonitor::prepare_command_osd_purge(
9845 ceph_assert(paxos
.is_plugged());
9846 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9848 ceph_assert(!osdmap
.is_up(id
));
9851 * This may look a bit weird, but this is what's going to happen:
9853 * 1. we make sure that removing from crush works
9854 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9855 * error, then we abort the whole operation, as no updates
9856 * have been made. However, we this function will have
9857 * side-effects, thus we need to make sure that all operations
9858 * performed henceforth will *always* succeed.
9859 * 3. we call `prepare_command_osd_remove()`. Although this
9860 * function can return an error, it currently only checks if the
9861 * osd is up - and we have made sure that it is not so, so there
9862 * is no conflict, and it is effectively an update.
9863 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9864 * the crush update we delayed from before.
9867 CrushWrapper newcrush
= _get_pending_crush();
9869 bool may_be_idempotent
= false;
9871 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9872 if (err
== -ENOENT
) {
9874 may_be_idempotent
= true;
9875 } else if (err
< 0) {
9876 ss
<< "error removing osd." << id
<< " from crush";
9880 // no point destroying the osd again if it has already been marked destroyed
9881 if (!osdmap
.is_destroyed(id
)) {
9882 err
= prepare_command_osd_destroy(id
, ss
);
9884 if (err
== -ENOENT
) {
9890 may_be_idempotent
= false;
9893 ceph_assert(0 == err
);
9895 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9896 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9897 << "we are idempotent." << dendl
;
9901 err
= prepare_command_osd_remove(id
);
9902 // we should not be busy, as we should have made sure this id is not up.
9903 ceph_assert(0 == err
);
9905 do_osd_crush_remove(newcrush
);
9909 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9910 const cmdmap_t
& cmdmap
)
9912 op
->mark_osdmon_event(__func__
);
9913 auto m
= op
->get_req
<MMonCommand
>();
9920 string format
= cmd_getval_or
<string
>(cmdmap
, "format", "plain");
9921 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
9924 cmd_getval(cmdmap
, "prefix", prefix
);
9928 bool osdid_present
= false;
9929 if (prefix
!= "osd pg-temp" &&
9930 prefix
!= "osd pg-upmap" &&
9931 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
9932 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
9934 if (osdid_present
) {
9936 oss
<< "osd." << osdid
;
9937 osd_name
= oss
.str();
9940 // Even if there's a pending state with changes that could affect
9941 // a command, considering that said state isn't yet committed, we
9942 // just don't care about those changes if the command currently being
9943 // handled acts as a no-op against the current committed state.
9944 // In a nutshell, we assume this command happens *before*.
9946 // Let me make this clearer:
9948 // - If we have only one client, and that client issues some
9949 // operation that would conflict with this operation but is
9950 // still on the pending state, then we would be sure that said
9951 // operation wouldn't have returned yet, so the client wouldn't
9952 // issue this operation (unless the client didn't wait for the
9953 // operation to finish, and that would be the client's own fault).
9955 // - If we have more than one client, each client will observe
9956 // whatever is the state at the moment of the commit. So, if we
9957 // have two clients, one issuing an unlink and another issuing a
9958 // link, and if the link happens while the unlink is still on the
9959 // pending state, from the link's point-of-view this is a no-op.
9960 // If different clients are issuing conflicting operations and
9961 // they care about that, then the clients should make sure they
9962 // enforce some kind of concurrency mechanism -- from our
9963 // perspective that's what Douglas Adams would call an SEP.
9965 // This should be used as a general guideline for most commands handled
9966 // in this function. Adapt as you see fit, but please bear in mind that
9967 // this is the expected behavior.
9970 if (prefix
== "osd setcrushmap" ||
9971 (prefix
== "osd crush set" && !osdid_present
)) {
9972 if (pending_inc
.crush
.length()) {
9973 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
9974 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9977 dout(10) << "prepare_command setting new crush map" << dendl
;
9978 bufferlist
data(m
->get_data());
9981 auto bl
= data
.cbegin();
9984 catch (const std::exception
&e
) {
9986 ss
<< "Failed to parse crushmap: " << e
.what();
9990 int64_t prior_version
= 0;
9991 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
9992 if (prior_version
== osdmap
.get_crush_version() - 1) {
9993 // see if we are a resend of the last update. this is imperfect
9994 // (multiple racing updaters may not both get reliable success)
9995 // but we expect crush updaters (via this interface) to be rare-ish.
9996 bufferlist current
, proposed
;
9997 osdmap
.crush
->encode(current
, mon
.get_quorum_con_features());
9998 crush
.encode(proposed
, mon
.get_quorum_con_features());
9999 if (current
.contents_equal(proposed
)) {
10000 dout(10) << __func__
10001 << " proposed matches current and version equals previous"
10004 ss
<< osdmap
.get_crush_version();
10008 if (prior_version
!= osdmap
.get_crush_version()) {
10010 ss
<< "prior_version " << prior_version
<< " != crush version "
10011 << osdmap
.get_crush_version();
10016 if (!validate_crush_against_features(&crush
, ss
)) {
10021 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
10026 if (g_conf()->mon_osd_crush_smoke_test
) {
10027 // sanity check: test some inputs to make sure this map isn't
10029 dout(10) << " testing map" << dendl
;
10031 CrushTester
tester(crush
, ess
);
10032 tester
.set_min_x(0);
10033 tester
.set_max_x(50);
10034 tester
.set_num_rep(3); // arbitrary
10035 auto start
= ceph::coarse_mono_clock::now();
10036 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
10037 auto duration
= ceph::coarse_mono_clock::now() - start
;
10039 dout(10) << " tester.test_with_fork returns " << r
10040 << ": " << ess
.str() << dendl
;
10041 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
10045 dout(10) << __func__
<< " crush somke test duration: "
10046 << duration
<< ", result: " << ess
.str() << dendl
;
10049 pending_inc
.crush
= data
;
10050 ss
<< osdmap
.get_crush_version() + 1;
10053 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
10054 CrushWrapper newcrush
= _get_pending_crush();
10055 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
10057 if (newcrush
.bucket_exists(bid
) &&
10058 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
10059 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
10060 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
10063 if (!validate_crush_against_features(&newcrush
, ss
)) {
10067 pending_inc
.crush
.clear();
10068 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10069 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10070 get_last_committed() + 1));
10072 } else if (prefix
== "osd crush set-device-class") {
10073 string device_class
;
10074 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10075 err
= -EINVAL
; // no value!
10080 vector
<string
> idvec
;
10081 cmd_getval(cmdmap
, "ids", idvec
);
10082 CrushWrapper newcrush
= _get_pending_crush();
10084 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10088 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10089 osdmap
.get_all_osds(osds
);
10092 // try traditional single osd way
10093 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10095 // ss has reason for failure
10096 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
10103 for (auto &osd
: osds
) {
10104 if (!osdmap
.exists(osd
)) {
10105 ss
<< "osd." << osd
<< " does not exist. ";
10110 oss
<< "osd." << osd
;
10111 string name
= oss
.str();
10113 if (newcrush
.get_max_devices() < osd
+ 1) {
10114 newcrush
.set_max_devices(osd
+ 1);
10117 if (newcrush
.item_exists(osd
)) {
10118 action
= "updating";
10120 action
= "creating";
10121 newcrush
.set_item_name(osd
, name
);
10124 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
10125 << "' device_class '" << device_class
<< "'"
10127 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
10131 if (err
== 0 && !_have_pending_crush()) {
10133 // for single osd only, wildcard makes too much noise
10134 ss
<< "set-device-class item id " << osd
<< " name '" << name
10135 << "' device_class '" << device_class
<< "': no change. ";
10138 updated
.insert(osd
);
10143 pending_inc
.crush
.clear();
10144 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10145 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
10147 wait_for_finished_proposal(
10149 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10151 } else if (prefix
== "osd crush rm-device-class") {
10153 vector
<string
> idvec
;
10154 cmd_getval(cmdmap
, "ids", idvec
);
10155 CrushWrapper newcrush
= _get_pending_crush();
10158 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10163 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10164 osdmap
.get_all_osds(osds
);
10167 // try traditional single osd way
10168 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10170 // ss has reason for failure
10171 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
10178 for (auto &osd
: osds
) {
10179 if (!osdmap
.exists(osd
)) {
10180 ss
<< "osd." << osd
<< " does not exist. ";
10184 auto class_name
= newcrush
.get_item_class(osd
);
10186 ss
<< "osd." << osd
<< " belongs to no class, ";
10189 // note that we do not verify if class_is_in_use here
10190 // in case the device is misclassified and user wants
10191 // to overridely reset...
10193 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
10195 // ss has reason for failure
10198 updated
.insert(osd
);
10202 pending_inc
.crush
.clear();
10203 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10204 ss
<< "done removing class of osd(s): " << updated
;
10206 wait_for_finished_proposal(
10208 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10210 } else if (prefix
== "osd crush class create") {
10211 string device_class
;
10212 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10213 err
= -EINVAL
; // no value!
10216 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10217 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10218 << "luminous' before using crush device classes";
10222 if (!_have_pending_crush() &&
10223 _get_stable_crush().class_exists(device_class
)) {
10224 ss
<< "class '" << device_class
<< "' already exists";
10227 CrushWrapper newcrush
= _get_pending_crush();
10228 if (newcrush
.class_exists(device_class
)) {
10229 ss
<< "class '" << device_class
<< "' already exists";
10232 int class_id
= newcrush
.get_or_create_class_id(device_class
);
10233 pending_inc
.crush
.clear();
10234 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10235 ss
<< "created class " << device_class
<< " with id " << class_id
10236 << " to crush map";
10238 } else if (prefix
== "osd crush class rm") {
10239 string device_class
;
10240 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10241 err
= -EINVAL
; // no value!
10244 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10245 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10246 << "luminous' before using crush device classes";
10251 if (!osdmap
.crush
->class_exists(device_class
)) {
10256 CrushWrapper newcrush
= _get_pending_crush();
10257 if (!newcrush
.class_exists(device_class
)) {
10258 err
= 0; // make command idempotent
10261 int class_id
= newcrush
.get_class_id(device_class
);
10263 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
10265 ss
<< "class '" << device_class
<< "' " << ts
.str();
10269 // check if class is used by any erasure-code-profiles
10270 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
10271 osdmap
.get_erasure_code_profiles();
10272 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
10273 #ifdef HAVE_STDLIB_MAP_SPLICING
10274 ec_profiles
.merge(old_ec_profiles
);
10276 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
10277 make_move_iterator(end(old_ec_profiles
)));
10279 list
<string
> referenced_by
;
10280 for (auto &i
: ec_profiles
) {
10281 for (auto &j
: i
.second
) {
10282 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
10283 referenced_by
.push_back(i
.first
);
10287 if (!referenced_by
.empty()) {
10289 ss
<< "class '" << device_class
10290 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
10295 newcrush
.get_devices_by_class(device_class
, &osds
);
10296 for (auto& p
: osds
) {
10297 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
10299 // ss has reason for failure
10304 if (osds
.empty()) {
10305 // empty class, remove directly
10306 err
= newcrush
.remove_class_name(device_class
);
10308 ss
<< "class '" << device_class
<< "' cannot be removed '"
10309 << cpp_strerror(err
) << "'";
10314 pending_inc
.crush
.clear();
10315 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10316 ss
<< "removed class " << device_class
<< " with id " << class_id
10317 << " from crush map";
10319 } else if (prefix
== "osd crush class rename") {
10320 string srcname
, dstname
;
10321 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
10325 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
10330 CrushWrapper newcrush
= _get_pending_crush();
10331 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
10332 // suppose this is a replay and return success
10333 // so command is idempotent
10334 ss
<< "already renamed to '" << dstname
<< "'";
10339 err
= newcrush
.rename_class(srcname
, dstname
);
10341 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
10342 << cpp_strerror(err
);
10346 pending_inc
.crush
.clear();
10347 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10348 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
10350 } else if (prefix
== "osd crush add-bucket") {
10351 // os crush add-bucket <name> <type>
10352 string name
, typestr
;
10353 vector
<string
> argvec
;
10354 cmd_getval(cmdmap
, "name", name
);
10355 cmd_getval(cmdmap
, "type", typestr
);
10356 cmd_getval(cmdmap
, "args", argvec
);
10357 map
<string
,string
> loc
;
10358 if (!argvec
.empty()) {
10359 CrushWrapper::parse_loc_map(argvec
, &loc
);
10360 dout(0) << "will create and move bucket '" << name
10361 << "' to location " << loc
<< dendl
;
10364 if (!_have_pending_crush() &&
10365 _get_stable_crush().name_exists(name
)) {
10366 ss
<< "bucket '" << name
<< "' already exists";
10370 CrushWrapper newcrush
= _get_pending_crush();
10372 if (newcrush
.name_exists(name
)) {
10373 ss
<< "bucket '" << name
<< "' already exists";
10376 int type
= newcrush
.get_type_id(typestr
);
10378 ss
<< "type '" << typestr
<< "' does not exist";
10383 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
10388 err
= newcrush
.add_bucket(0, 0,
10389 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
10392 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
10395 err
= newcrush
.set_item_name(bucketno
, name
);
10397 ss
<< "error setting bucket name to '" << name
<< "'";
10401 if (!loc
.empty()) {
10402 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
10404 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
10406 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
10410 ss
<< "no need to move item id " << bucketno
<< " name '" << name
10411 << "' to location " << loc
<< " in crush map";
10415 pending_inc
.crush
.clear();
10416 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10418 ss
<< "added bucket " << name
<< " type " << typestr
10419 << " to crush map";
10421 ss
<< "added bucket " << name
<< " type " << typestr
10422 << " to location " << loc
;
10425 } else if (prefix
== "osd crush rename-bucket") {
10426 string srcname
, dstname
;
10427 cmd_getval(cmdmap
, "srcname", srcname
);
10428 cmd_getval(cmdmap
, "dstname", dstname
);
10430 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
10431 if (err
== -EALREADY
) // equivalent to success for idempotency
10437 } else if (prefix
== "osd crush weight-set create" ||
10438 prefix
== "osd crush weight-set create-compat") {
10439 if (_have_pending_crush()) {
10440 dout(10) << " first waiting for pending crush changes to commit" << dendl
;
10443 CrushWrapper newcrush
= _get_pending_crush();
10446 if (newcrush
.has_non_straw2_buckets()) {
10447 ss
<< "crush map contains one or more bucket(s) that are not straw2";
10451 if (prefix
== "osd crush weight-set create") {
10452 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
10453 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
10454 ss
<< "require_min_compat_client "
10455 << osdmap
.require_min_compat_client
10456 << " < luminous, which is required for per-pool weight-sets. "
10457 << "Try 'ceph osd set-require-min-compat-client luminous' "
10458 << "before using the new interface";
10462 string poolname
, mode
;
10463 cmd_getval(cmdmap
, "pool", poolname
);
10464 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10466 ss
<< "pool '" << poolname
<< "' not found";
10470 cmd_getval(cmdmap
, "mode", mode
);
10471 if (mode
!= "flat" && mode
!= "positional") {
10472 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10476 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10478 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10481 if (!newcrush
.create_choose_args(pool
, positions
)) {
10482 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10483 ss
<< "compat weight-set already created";
10485 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10486 << "' already created";
10490 pending_inc
.crush
.clear();
10491 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10494 } else if (prefix
== "osd crush weight-set rm" ||
10495 prefix
== "osd crush weight-set rm-compat") {
10496 CrushWrapper newcrush
= _get_pending_crush();
10498 if (prefix
== "osd crush weight-set rm") {
10500 cmd_getval(cmdmap
, "pool", poolname
);
10501 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10503 ss
<< "pool '" << poolname
<< "' not found";
10508 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10510 newcrush
.rm_choose_args(pool
);
10511 pending_inc
.crush
.clear();
10512 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10515 } else if (prefix
== "osd crush weight-set reweight" ||
10516 prefix
== "osd crush weight-set reweight-compat") {
10517 string poolname
, item
;
10518 vector
<double> weight
;
10519 cmd_getval(cmdmap
, "pool", poolname
);
10520 cmd_getval(cmdmap
, "item", item
);
10521 cmd_getval(cmdmap
, "weight", weight
);
10522 CrushWrapper newcrush
= _get_pending_crush();
10524 if (prefix
== "osd crush weight-set reweight") {
10525 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10527 ss
<< "pool '" << poolname
<< "' not found";
10531 if (!newcrush
.have_choose_args(pool
)) {
10532 ss
<< "no weight-set for pool '" << poolname
<< "'";
10536 auto arg_map
= newcrush
.choose_args_get(pool
);
10537 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10538 if (weight
.size() != (size_t)positions
) {
10539 ss
<< "must specify exact " << positions
<< " weight values";
10544 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10545 if (!newcrush
.have_choose_args(pool
)) {
10546 ss
<< "no backward-compatible weight-set";
10551 if (!newcrush
.name_exists(item
)) {
10552 ss
<< "item '" << item
<< "' does not exist";
10556 err
= newcrush
.choose_args_adjust_item_weightf(
10558 newcrush
.choose_args_get(pool
),
10559 newcrush
.get_item_id(item
),
10566 pending_inc
.crush
.clear();
10567 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10569 } else if (osdid_present
&&
10570 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10571 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10572 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10573 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10575 if (!osdmap
.exists(osdid
)) {
10578 << " does not exist. Create it before updating the crush map";
10583 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10584 ss
<< "unable to parse weight value '"
10585 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10591 vector
<string
> argvec
;
10592 cmd_getval(cmdmap
, "args", argvec
);
10593 map
<string
,string
> loc
;
10594 CrushWrapper::parse_loc_map(argvec
, &loc
);
10596 if (prefix
== "osd crush set"
10597 && !_get_stable_crush().item_exists(osdid
)) {
10599 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10600 << "' weight " << weight
<< " at location " << loc
10601 << ": does not exist";
10605 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10606 << osd_name
<< "' weight " << weight
<< " at location "
10608 CrushWrapper newcrush
= _get_pending_crush();
10611 if (prefix
== "osd crush set" ||
10612 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10614 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10617 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10625 if (err
== 0 && !_have_pending_crush()) {
10626 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10627 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10631 pending_inc
.crush
.clear();
10632 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10633 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10634 << weight
<< " at location " << loc
<< " to crush map";
10636 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10637 get_last_committed() + 1));
10640 } else if (prefix
== "osd crush create-or-move") {
10642 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10643 if (!osdmap
.exists(osdid
)) {
10646 << " does not exist. create it before updating the crush map";
10651 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10652 ss
<< "unable to parse weight value '"
10653 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10659 vector
<string
> argvec
;
10660 cmd_getval(cmdmap
, "args", argvec
);
10661 map
<string
,string
> loc
;
10662 CrushWrapper::parse_loc_map(argvec
, &loc
);
10664 dout(0) << "create-or-move crush item name '" << osd_name
10665 << "' initial_weight " << weight
<< " at location " << loc
10668 CrushWrapper newcrush
= _get_pending_crush();
10670 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10671 g_conf()->osd_crush_update_weight_set
);
10673 ss
<< "create-or-move updated item name '" << osd_name
10674 << "' weight " << weight
10675 << " at location " << loc
<< " to crush map";
10679 pending_inc
.crush
.clear();
10680 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10681 ss
<< "create-or-move updating item name '" << osd_name
10682 << "' weight " << weight
10683 << " at location " << loc
<< " to crush map";
10685 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10686 get_last_committed() + 1));
10691 } else if (prefix
== "osd crush move") {
10693 // osd crush move <name> <loc1> [<loc2> ...]
10695 vector
<string
> argvec
;
10696 cmd_getval(cmdmap
, "name", name
);
10697 cmd_getval(cmdmap
, "args", argvec
);
10698 map
<string
,string
> loc
;
10699 CrushWrapper::parse_loc_map(argvec
, &loc
);
10701 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10702 CrushWrapper newcrush
= _get_pending_crush();
10704 if (!newcrush
.name_exists(name
)) {
10706 ss
<< "item " << name
<< " does not exist";
10709 int id
= newcrush
.get_item_id(name
);
10711 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10713 err
= newcrush
.create_or_move_item(
10714 cct
, id
, 0, name
, loc
,
10715 g_conf()->osd_crush_update_weight_set
);
10717 err
= newcrush
.move_bucket(cct
, id
, loc
);
10720 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10721 pending_inc
.crush
.clear();
10722 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10724 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10725 get_last_committed() + 1));
10729 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10733 } else if (prefix
== "osd crush swap-bucket") {
10734 string source
, dest
;
10735 cmd_getval(cmdmap
, "source", source
);
10736 cmd_getval(cmdmap
, "dest", dest
);
10738 bool force
= false;
10739 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10741 CrushWrapper newcrush
= _get_pending_crush();
10742 if (!newcrush
.name_exists(source
)) {
10743 ss
<< "source item " << source
<< " does not exist";
10747 if (!newcrush
.name_exists(dest
)) {
10748 ss
<< "dest item " << dest
<< " does not exist";
10752 int sid
= newcrush
.get_item_id(source
);
10753 int did
= newcrush
.get_item_id(dest
);
10755 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10756 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10760 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10762 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10763 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10764 << "; pass --yes-i-really-mean-it to proceed anyway";
10768 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10770 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10774 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10775 pending_inc
.crush
.clear();
10776 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10777 wait_for_finished_proposal(op
,
10778 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10779 get_last_committed() + 1));
10781 } else if (prefix
== "osd crush link") {
10782 // osd crush link <name> <loc1> [<loc2> ...]
10784 cmd_getval(cmdmap
, "name", name
);
10785 vector
<string
> argvec
;
10786 cmd_getval(cmdmap
, "args", argvec
);
10787 map
<string
,string
> loc
;
10788 CrushWrapper::parse_loc_map(argvec
, &loc
);
10790 // Need an explicit check for name_exists because get_item_id returns
10792 int id
= osdmap
.crush
->get_item_id(name
);
10793 if (!osdmap
.crush
->name_exists(name
)) {
10795 ss
<< "item " << name
<< " does not exist";
10798 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10800 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10801 ss
<< "no need to move item id " << id
<< " name '" << name
10802 << "' to location " << loc
<< " in crush map";
10807 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10808 CrushWrapper newcrush
= _get_pending_crush();
10810 if (!newcrush
.name_exists(name
)) {
10812 ss
<< "item " << name
<< " does not exist";
10815 int id
= newcrush
.get_item_id(name
);
10816 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10817 err
= newcrush
.link_bucket(cct
, id
, loc
);
10819 ss
<< "linked item id " << id
<< " name '" << name
10820 << "' to location " << loc
<< " in crush map";
10821 pending_inc
.crush
.clear();
10822 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10824 ss
<< "cannot link item id " << id
<< " name '" << name
10825 << "' to location " << loc
;
10829 ss
<< "no need to move item id " << id
<< " name '" << name
10830 << "' to location " << loc
<< " in crush map";
10834 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10835 get_last_committed() + 1));
10837 } else if (prefix
== "osd crush rm" ||
10838 prefix
== "osd crush remove" ||
10839 prefix
== "osd crush unlink") {
10841 // osd crush rm <id> [ancestor]
10842 CrushWrapper newcrush
= _get_pending_crush();
10845 cmd_getval(cmdmap
, "name", name
);
10847 if (!osdmap
.crush
->name_exists(name
)) {
10849 ss
<< "device '" << name
<< "' does not appear in the crush map";
10852 if (!newcrush
.name_exists(name
)) {
10854 ss
<< "device '" << name
<< "' does not appear in the crush map";
10856 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10857 get_last_committed() + 1));
10860 int id
= newcrush
.get_item_id(name
);
10863 bool unlink_only
= prefix
== "osd crush unlink";
10864 string ancestor_str
;
10865 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10866 if (!newcrush
.name_exists(ancestor_str
)) {
10868 ss
<< "ancestor item '" << ancestor_str
10869 << "' does not appear in the crush map";
10872 ancestor
= newcrush
.get_item_id(ancestor_str
);
10875 err
= prepare_command_osd_crush_remove(
10878 (ancestor
< 0), unlink_only
);
10880 if (err
== -ENOENT
) {
10881 ss
<< "item " << id
<< " does not appear in that position";
10887 pending_inc
.new_crush_node_flags
[id
] = 0;
10888 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10890 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10891 get_last_committed() + 1));
10896 } else if (prefix
== "osd crush reweight-all") {
10897 CrushWrapper newcrush
= _get_pending_crush();
10899 newcrush
.reweight(cct
);
10900 pending_inc
.crush
.clear();
10901 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10902 ss
<< "reweighted crush hierarchy";
10904 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10905 get_last_committed() + 1));
10907 } else if (prefix
== "osd crush reweight") {
10908 // osd crush reweight <name> <weight>
10909 CrushWrapper newcrush
= _get_pending_crush();
10912 cmd_getval(cmdmap
, "name", name
);
10913 if (!newcrush
.name_exists(name
)) {
10915 ss
<< "device '" << name
<< "' does not appear in the crush map";
10919 int id
= newcrush
.get_item_id(name
);
10921 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
10926 if (!cmd_getval(cmdmap
, "weight", w
)) {
10927 ss
<< "unable to parse weight value '"
10928 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10933 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
10934 g_conf()->osd_crush_update_weight_set
);
10937 pending_inc
.crush
.clear();
10938 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10939 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
10940 << " in crush map";
10942 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10943 get_last_committed() + 1));
10945 } else if (prefix
== "osd crush reweight-subtree") {
10946 // osd crush reweight <name> <weight>
10947 CrushWrapper newcrush
= _get_pending_crush();
10950 cmd_getval(cmdmap
, "name", name
);
10951 if (!newcrush
.name_exists(name
)) {
10953 ss
<< "device '" << name
<< "' does not appear in the crush map";
10957 int id
= newcrush
.get_item_id(name
);
10959 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
10964 if (!cmd_getval(cmdmap
, "weight", w
)) {
10965 ss
<< "unable to parse weight value '"
10966 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10971 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
10972 g_conf()->osd_crush_update_weight_set
);
10975 pending_inc
.crush
.clear();
10976 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10977 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
10978 << " in crush map";
10980 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10981 get_last_committed() + 1));
10983 } else if (prefix
== "osd crush tunables") {
10984 CrushWrapper newcrush
= _get_pending_crush();
10988 cmd_getval(cmdmap
, "profile", profile
);
10989 if (profile
== "legacy" || profile
== "argonaut") {
10990 newcrush
.set_tunables_legacy();
10991 } else if (profile
== "bobtail") {
10992 newcrush
.set_tunables_bobtail();
10993 } else if (profile
== "firefly") {
10994 newcrush
.set_tunables_firefly();
10995 } else if (profile
== "hammer") {
10996 newcrush
.set_tunables_hammer();
10997 } else if (profile
== "jewel") {
10998 newcrush
.set_tunables_jewel();
10999 } else if (profile
== "optimal") {
11000 newcrush
.set_tunables_optimal();
11001 } else if (profile
== "default") {
11002 newcrush
.set_tunables_default();
11004 ss
<< "unrecognized profile '" << profile
<< "'";
11009 if (!validate_crush_against_features(&newcrush
, ss
)) {
11014 pending_inc
.crush
.clear();
11015 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11016 ss
<< "adjusted tunables profile to " << profile
;
11018 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11019 get_last_committed() + 1));
11021 } else if (prefix
== "osd crush set-tunable") {
11022 CrushWrapper newcrush
= _get_pending_crush();
11026 cmd_getval(cmdmap
, "tunable", tunable
);
11028 int64_t value
= -1;
11029 if (!cmd_getval(cmdmap
, "value", value
)) {
11031 ss
<< "failed to parse integer value "
11032 << cmd_vartype_stringify(cmdmap
.at("value"));
11036 if (tunable
== "straw_calc_version") {
11037 if (value
!= 0 && value
!= 1) {
11038 ss
<< "value must be 0 or 1; got " << value
;
11042 newcrush
.set_straw_calc_version(value
);
11044 ss
<< "unrecognized tunable '" << tunable
<< "'";
11049 if (!validate_crush_against_features(&newcrush
, ss
)) {
11054 pending_inc
.crush
.clear();
11055 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11056 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
11058 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11059 get_last_committed() + 1));
11062 } else if (prefix
== "osd crush rule create-simple") {
11063 string name
, root
, type
, mode
;
11064 cmd_getval(cmdmap
, "name", name
);
11065 cmd_getval(cmdmap
, "root", root
);
11066 cmd_getval(cmdmap
, "type", type
);
11067 cmd_getval(cmdmap
, "mode", mode
);
11071 if (osdmap
.crush
->rule_exists(name
)) {
11072 // The name is uniquely associated to a ruleid and the rule it contains
11073 // From the user point of view, the rule is more meaningfull.
11074 ss
<< "rule " << name
<< " already exists";
11079 CrushWrapper newcrush
= _get_pending_crush();
11081 if (newcrush
.rule_exists(name
)) {
11082 // The name is uniquely associated to a ruleid and the rule it contains
11083 // From the user point of view, the rule is more meaningfull.
11084 ss
<< "rule " << name
<< " already exists";
11087 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
11088 pg_pool_t::TYPE_REPLICATED
, &ss
);
11094 pending_inc
.crush
.clear();
11095 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11098 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11099 get_last_committed() + 1));
11102 } else if (prefix
== "osd crush rule create-replicated") {
11103 string name
, root
, type
, device_class
;
11104 cmd_getval(cmdmap
, "name", name
);
11105 cmd_getval(cmdmap
, "root", root
);
11106 cmd_getval(cmdmap
, "type", type
);
11107 cmd_getval(cmdmap
, "class", device_class
);
11109 if (osdmap
.crush
->rule_exists(name
)) {
11110 // The name is uniquely associated to a ruleid and the rule it contains
11111 // From the user point of view, the rule is more meaningfull.
11112 ss
<< "rule " << name
<< " already exists";
11117 CrushWrapper newcrush
= _get_pending_crush();
11119 if (newcrush
.rule_exists(name
)) {
11120 // The name is uniquely associated to a ruleid and the rule it contains
11121 // From the user point of view, the rule is more meaningfull.
11122 ss
<< "rule " << name
<< " already exists";
11125 int ruleno
= newcrush
.add_simple_rule(
11126 name
, root
, type
, device_class
,
11127 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
11133 pending_inc
.crush
.clear();
11134 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11137 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11138 get_last_committed() + 1));
11141 } else if (prefix
== "osd erasure-code-profile rm") {
11143 cmd_getval(cmdmap
, "name", name
);
11145 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
11148 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
11153 if (osdmap
.has_erasure_code_profile(name
) ||
11154 pending_inc
.new_erasure_code_profiles
.count(name
)) {
11155 if (osdmap
.has_erasure_code_profile(name
)) {
11156 pending_inc
.old_erasure_code_profiles
.push_back(name
);
11158 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
11159 pending_inc
.new_erasure_code_profiles
.erase(name
);
11163 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11164 get_last_committed() + 1));
11167 ss
<< "erasure-code-profile " << name
<< " does not exist";
11172 } else if (prefix
== "osd erasure-code-profile set") {
11174 cmd_getval(cmdmap
, "name", name
);
11175 vector
<string
> profile
;
11176 cmd_getval(cmdmap
, "profile", profile
);
11178 bool force
= false;
11179 cmd_getval(cmdmap
, "force", force
);
11181 map
<string
,string
> profile_map
;
11182 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
11185 if (auto found
= profile_map
.find("crush-failure-domain");
11186 found
!= profile_map
.end()) {
11187 const auto& failure_domain
= found
->second
;
11188 int failure_domain_type
= osdmap
.crush
->get_type_id(failure_domain
);
11189 if (failure_domain_type
< 0) {
11190 ss
<< "erasure-code-profile " << profile_map
11191 << " contains an invalid failure-domain " << std::quoted(failure_domain
);
11197 if (profile_map
.find("plugin") == profile_map
.end()) {
11198 ss
<< "erasure-code-profile " << profile_map
11199 << " must contain a plugin entry" << std::endl
;
11203 string plugin
= profile_map
["plugin"];
11205 if (pending_inc
.has_erasure_code_profile(name
)) {
11206 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
11209 err
= normalize_profile(name
, profile_map
, force
, &ss
);
11213 if (osdmap
.has_erasure_code_profile(name
)) {
11214 ErasureCodeProfile existing_profile_map
=
11215 osdmap
.get_erasure_code_profile(name
);
11216 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
11220 if (existing_profile_map
== profile_map
) {
11226 ss
<< "will not override erasure code profile " << name
11227 << " because the existing profile "
11228 << existing_profile_map
11229 << " is different from the proposed profile "
11235 dout(20) << "erasure code profile set " << name
<< "="
11236 << profile_map
<< dendl
;
11237 pending_inc
.set_erasure_code_profile(name
, profile_map
);
11241 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11242 get_last_committed() + 1));
11245 } else if (prefix
== "osd crush rule create-erasure") {
11246 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
11247 if (err
== -EAGAIN
)
11251 string name
, poolstr
;
11252 cmd_getval(cmdmap
, "name", name
);
11254 cmd_getval(cmdmap
, "profile", profile
);
11256 profile
= "default";
11257 if (profile
== "default") {
11258 if (!osdmap
.has_erasure_code_profile(profile
)) {
11259 if (pending_inc
.has_erasure_code_profile(profile
)) {
11260 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
11264 map
<string
,string
> profile_map
;
11265 err
= osdmap
.get_erasure_code_profile_default(cct
,
11270 err
= normalize_profile(name
, profile_map
, true, &ss
);
11273 dout(20) << "erasure code profile set " << profile
<< "="
11274 << profile_map
<< dendl
;
11275 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
11281 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
11284 case -EEXIST
: // return immediately
11285 ss
<< "rule " << name
<< " already exists";
11289 case -EALREADY
: // wait for pending to be proposed
11290 ss
<< "rule " << name
<< " already exists";
11293 default: // non recoverable error
11298 ss
<< "created rule " << name
<< " at " << rule
;
11302 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11303 get_last_committed() + 1));
11306 } else if (prefix
== "osd crush rule rm") {
11308 cmd_getval(cmdmap
, "name", name
);
11310 if (!osdmap
.crush
->rule_exists(name
)) {
11311 ss
<< "rule " << name
<< " does not exist";
11316 CrushWrapper newcrush
= _get_pending_crush();
11318 if (!newcrush
.rule_exists(name
)) {
11319 ss
<< "rule " << name
<< " does not exist";
11322 int ruleno
= newcrush
.get_rule_id(name
);
11323 ceph_assert(ruleno
>= 0);
11325 // make sure it is not in use.
11326 // FIXME: this is ok in some situations, but let's not bother with that
11328 if (osdmap
.crush_rule_in_use(ruleno
)) {
11329 ss
<< "crush rule " << name
<< " (" << ruleno
<< ") is in use";
11334 err
= newcrush
.remove_rule(ruleno
);
11339 pending_inc
.crush
.clear();
11340 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11343 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11344 get_last_committed() + 1));
11347 } else if (prefix
== "osd crush rule rename") {
11350 cmd_getval(cmdmap
, "srcname", srcname
);
11351 cmd_getval(cmdmap
, "dstname", dstname
);
11352 if (srcname
.empty() || dstname
.empty()) {
11353 ss
<< "must specify both source rule name and destination rule name";
11357 if (srcname
== dstname
) {
11358 ss
<< "destination rule name is equal to source rule name";
11363 CrushWrapper newcrush
= _get_pending_crush();
11364 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
11365 // srcname does not exist and dstname already exists
11366 // suppose this is a replay and return success
11367 // (so this command is idempotent)
11368 ss
<< "already renamed to '" << dstname
<< "'";
11373 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
11375 // ss has reason for failure
11378 pending_inc
.crush
.clear();
11379 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11381 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11382 get_last_committed() + 1));
11385 } else if (prefix
== "osd setmaxosd") {
11387 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
11388 ss
<< "unable to parse 'newmax' value '"
11389 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
11394 if (newmax
> g_conf()->mon_max_osd
) {
11396 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
11397 << g_conf()->mon_max_osd
<< ")";
11401 // Don't allow shrinking OSD number as this will cause data loss
11402 // and may cause kernel crashes.
11403 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11404 if (newmax
< osdmap
.get_max_osd()) {
11405 // Check if the OSDs exist between current max and new value.
11406 // If there are any OSDs exist, then don't allow shrinking number
11408 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
11409 if (osdmap
.exists(i
)) {
11411 ss
<< "cannot shrink max_osd to " << newmax
11412 << " because osd." << i
<< " (and possibly others) still in use";
11418 pending_inc
.new_max_osd
= newmax
;
11419 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
11421 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11422 get_last_committed() + 1));
11425 } else if (prefix
== "osd set-full-ratio" ||
11426 prefix
== "osd set-backfillfull-ratio" ||
11427 prefix
== "osd set-nearfull-ratio") {
11429 if (!cmd_getval(cmdmap
, "ratio", n
)) {
11430 ss
<< "unable to parse 'ratio' value '"
11431 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
11435 if (prefix
== "osd set-full-ratio")
11436 pending_inc
.new_full_ratio
= n
;
11437 else if (prefix
== "osd set-backfillfull-ratio")
11438 pending_inc
.new_backfillfull_ratio
= n
;
11439 else if (prefix
== "osd set-nearfull-ratio")
11440 pending_inc
.new_nearfull_ratio
= n
;
11441 ss
<< prefix
<< " " << n
;
11443 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11444 get_last_committed() + 1));
11446 } else if (prefix
== "osd set-require-min-compat-client") {
11448 cmd_getval(cmdmap
, "version", v
);
11449 ceph_release_t vno
= ceph_release_from_name(v
);
11451 ss
<< "version " << v
<< " is not recognized";
11456 newmap
.deepish_copy_from(osdmap
);
11457 newmap
.apply_incremental(pending_inc
);
11458 newmap
.require_min_compat_client
= vno
;
11459 auto mvno
= newmap
.get_min_compat_client();
11461 ss
<< "osdmap current utilizes features that require " << mvno
11462 << "; cannot set require_min_compat_client below that to " << vno
;
11467 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11470 mon
.get_combined_feature_map(&m
);
11471 uint64_t features
= ceph_release_features(to_integer
<int>(vno
));
11475 CEPH_ENTITY_TYPE_CLIENT
,
11476 CEPH_ENTITY_TYPE_MDS
,
11477 CEPH_ENTITY_TYPE_MGR
}) {
11478 auto p
= m
.m
.find(type
);
11479 if (p
== m
.m
.end()) {
11482 for (auto& q
: p
->second
) {
11483 uint64_t missing
= ~q
.first
& features
;
11486 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11491 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11492 << "(s) look like " << ceph_release_name(
11493 ceph_release_from_features(q
.first
))
11494 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11500 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11505 ss
<< "set require_min_compat_client to " << vno
;
11506 pending_inc
.new_require_min_compat_client
= vno
;
11508 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11509 get_last_committed() + 1));
11511 } else if (prefix
== "osd pause") {
11512 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11514 } else if (prefix
== "osd unpause") {
11515 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11517 } else if (prefix
== "osd set") {
11519 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11522 cmd_getval(cmdmap
, "key", key
);
11523 if (key
== "pause")
11524 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11525 else if (key
== "noup")
11526 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11527 else if (key
== "nodown")
11528 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11529 else if (key
== "noout")
11530 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11531 else if (key
== "noin")
11532 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11533 else if (key
== "nobackfill")
11534 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11535 else if (key
== "norebalance")
11536 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11537 else if (key
== "norecover")
11538 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11539 else if (key
== "noscrub")
11540 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11541 else if (key
== "nodeep-scrub")
11542 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11543 else if (key
== "notieragent")
11544 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11545 else if (key
== "nosnaptrim")
11546 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11547 else if (key
== "pglog_hardlimit") {
11548 if (!osdmap
.get_num_up_osds() && !sure
) {
11549 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11550 << "--yes-i-really-mean-it if you really wish to continue.";
11554 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11555 // we are reusing a jewel feature bit that was retired in luminous.
11556 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11557 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11559 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11561 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11566 ss
<< "unrecognized flag '" << key
<< "'";
11570 } else if (prefix
== "osd unset") {
11572 cmd_getval(cmdmap
, "key", key
);
11573 if (key
== "pause")
11574 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11575 else if (key
== "noup")
11576 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11577 else if (key
== "nodown")
11578 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11579 else if (key
== "noout")
11580 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11581 else if (key
== "noin")
11582 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11583 else if (key
== "nobackfill")
11584 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11585 else if (key
== "norebalance")
11586 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11587 else if (key
== "norecover")
11588 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11589 else if (key
== "noscrub")
11590 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11591 else if (key
== "nodeep-scrub")
11592 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11593 else if (key
== "notieragent")
11594 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11595 else if (key
== "nosnaptrim")
11596 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11598 ss
<< "unrecognized flag '" << key
<< "'";
11602 } else if (prefix
== "osd require-osd-release") {
11604 cmd_getval(cmdmap
, "release", release
);
11606 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11607 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11609 ss
<< "unrecognized release " << release
;
11613 if (rel
== osdmap
.require_osd_release
) {
11618 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::octopus
);
11619 if (!osdmap
.get_num_up_osds() && !sure
) {
11620 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11621 << "--yes-i-really-mean-it if you really wish to continue.";
11625 if (rel
== ceph_release_t::octopus
) {
11626 if (!mon
.monmap
->get_required_features().contains_all(
11627 ceph::features::mon::FEATURE_OCTOPUS
)) {
11628 ss
<< "not all mons are octopus";
11632 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_OCTOPUS
))
11634 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11638 } else if (rel
== ceph_release_t::pacific
) {
11639 if (!mon
.monmap
->get_required_features().contains_all(
11640 ceph::features::mon::FEATURE_PACIFIC
)) {
11641 ss
<< "not all mons are pacific";
11645 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_PACIFIC
))
11647 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11651 } else if (rel
== ceph_release_t::quincy
) {
11652 if (!mon
.monmap
->get_required_features().contains_all(
11653 ceph::features::mon::FEATURE_QUINCY
)) {
11654 ss
<< "not all mons are quincy";
11658 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_QUINCY
))
11660 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11665 ss
<< "not supported for this release";
11669 if (rel
< osdmap
.require_osd_release
) {
11670 ss
<< "require_osd_release cannot be lowered once it has been set";
11674 pending_inc
.new_require_osd_release
= rel
;
11676 } else if (prefix
== "osd down" ||
11677 prefix
== "osd out" ||
11678 prefix
== "osd in" ||
11679 prefix
== "osd rm" ||
11680 prefix
== "osd stop") {
11684 bool verbose
= true;
11685 bool definitely_dead
= false;
11687 vector
<string
> idvec
;
11688 cmd_getval(cmdmap
, "ids", idvec
);
11689 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11690 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11691 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11696 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11697 if (prefix
== "osd in") {
11698 // touch out osds only
11699 osdmap
.get_out_existing_osds(osds
);
11701 osdmap
.get_all_osds(osds
);
11704 verbose
= false; // so the output is less noisy.
11706 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11708 ss
<< "invalid osd id" << osd
;
11711 } else if (!osdmap
.exists(osd
)) {
11712 ss
<< "osd." << osd
<< " does not exist. ";
11719 for (auto &osd
: osds
) {
11720 if (prefix
== "osd down") {
11721 if (osdmap
.is_down(osd
)) {
11723 ss
<< "osd." << osd
<< " is already down. ";
11725 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11726 ss
<< "marked down osd." << osd
<< ". ";
11729 if (definitely_dead
) {
11730 if (!pending_inc
.new_xinfo
.count(osd
)) {
11731 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11733 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11736 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11738 } else if (prefix
== "osd out") {
11739 if (osdmap
.is_out(osd
)) {
11741 ss
<< "osd." << osd
<< " is already out. ";
11743 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11744 if (osdmap
.osd_weight
[osd
]) {
11745 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11746 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11748 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11750 ss
<< "marked out osd." << osd
<< ". ";
11751 std::ostringstream msg
;
11752 msg
<< "Client " << op
->get_session()->entity_name
11753 << " marked osd." << osd
<< " out";
11754 if (osdmap
.is_up(osd
)) {
11755 msg
<< ", while it was still marked up";
11757 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11758 msg
<< ", after it was down for " << int(period
.sec())
11762 mon
.clog
->info() << msg
.str();
11765 } else if (prefix
== "osd in") {
11766 if (osdmap
.is_in(osd
)) {
11768 ss
<< "osd." << osd
<< " is already in. ";
11770 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11771 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11772 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11773 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11775 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11777 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11779 ss
<< "marked in osd." << osd
<< ". ";
11782 } else if (prefix
== "osd rm") {
11783 err
= prepare_command_osd_remove(osd
);
11785 if (err
== -EBUSY
) {
11788 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11790 ceph_assert(err
== 0);
11792 ss
<< ", osd." << osd
;
11794 ss
<< "removed osd." << osd
;
11798 } else if (prefix
== "osd stop") {
11799 if (osdmap
.is_stop(osd
)) {
11801 ss
<< "osd." << osd
<< " is already stopped. ";
11802 } else if (osdmap
.is_down(osd
)) {
11803 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11804 ss
<< "stop down osd." << osd
<< ". ";
11807 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11808 ss
<< "stop osd." << osd
<< ". ";
11816 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11817 get_last_committed() + 1));
11820 } else if (prefix
== "osd set-group" ||
11821 prefix
== "osd unset-group" ||
11822 prefix
== "osd add-noup" ||
11823 prefix
== "osd add-nodown" ||
11824 prefix
== "osd add-noin" ||
11825 prefix
== "osd add-noout" ||
11826 prefix
== "osd rm-noup" ||
11827 prefix
== "osd rm-nodown" ||
11828 prefix
== "osd rm-noin" ||
11829 prefix
== "osd rm-noout") {
11830 bool do_set
= prefix
== "osd set-group" ||
11831 prefix
.find("add") != string::npos
;
11833 unsigned flags
= 0;
11834 vector
<string
> who
;
11835 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11836 cmd_getval(cmdmap
, "flags", flag_str
);
11837 cmd_getval(cmdmap
, "who", who
);
11838 vector
<string
> raw_flags
;
11839 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11840 for (auto& f
: raw_flags
) {
11842 flags
|= CEPH_OSD_NOUP
;
11843 else if (f
== "nodown")
11844 flags
|= CEPH_OSD_NODOWN
;
11845 else if (f
== "noin")
11846 flags
|= CEPH_OSD_NOIN
;
11847 else if (f
== "noout")
11848 flags
|= CEPH_OSD_NOOUT
;
11850 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11851 << "{noup,nodown,noin,noout}";
11857 cmd_getval(cmdmap
, "ids", who
);
11858 if (prefix
.find("noup") != string::npos
)
11859 flags
= CEPH_OSD_NOUP
;
11860 else if (prefix
.find("nodown") != string::npos
)
11861 flags
= CEPH_OSD_NODOWN
;
11862 else if (prefix
.find("noin") != string::npos
)
11863 flags
= CEPH_OSD_NOIN
;
11864 else if (prefix
.find("noout") != string::npos
)
11865 flags
= CEPH_OSD_NOOUT
;
11867 ceph_assert(0 == "Unreachable!");
11870 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11875 ss
<< "must specify at least one or more targets to set/unset";
11880 set
<int> crush_nodes
;
11881 set
<int> device_classes
;
11882 for (auto& w
: who
) {
11883 if (w
== "any" || w
== "all" || w
== "*") {
11884 osdmap
.get_all_osds(osds
);
11887 std::stringstream ts
;
11888 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11890 } else if (osdmap
.crush
->name_exists(w
)) {
11891 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11892 } else if (osdmap
.crush
->class_exists(w
)) {
11893 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11895 ss
<< "unable to parse osd id or crush node or device class: "
11896 << "\"" << w
<< "\". ";
11899 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11900 // ss has reason for failure
11905 for (auto osd
: osds
) {
11906 if (!osdmap
.exists(osd
)) {
11907 ss
<< "osd." << osd
<< " does not exist. ";
11911 if (flags
& CEPH_OSD_NOUP
) {
11912 any
|= osdmap
.is_noup_by_osd(osd
) ?
11913 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
11914 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
11916 if (flags
& CEPH_OSD_NODOWN
) {
11917 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11918 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
11919 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
11921 if (flags
& CEPH_OSD_NOIN
) {
11922 any
|= osdmap
.is_noin_by_osd(osd
) ?
11923 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
11924 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
11926 if (flags
& CEPH_OSD_NOOUT
) {
11927 any
|= osdmap
.is_noout_by_osd(osd
) ?
11928 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
11929 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
11932 if (flags
& CEPH_OSD_NOUP
) {
11933 any
|= osdmap
.is_noup_by_osd(osd
) ?
11934 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
11935 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
11937 if (flags
& CEPH_OSD_NODOWN
) {
11938 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11939 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
11940 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
11942 if (flags
& CEPH_OSD_NOIN
) {
11943 any
|= osdmap
.is_noin_by_osd(osd
) ?
11944 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
11945 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
11947 if (flags
& CEPH_OSD_NOOUT
) {
11948 any
|= osdmap
.is_noout_by_osd(osd
) ?
11949 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
11950 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
11954 for (auto& id
: crush_nodes
) {
11955 auto old_flags
= osdmap
.get_crush_node_flags(id
);
11956 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
11957 pending_flags
|= old_flags
; // adopt existing flags first!
11959 pending_flags
|= flags
;
11961 pending_flags
&= ~flags
;
11965 for (auto& id
: device_classes
) {
11966 auto old_flags
= osdmap
.get_device_class_flags(id
);
11967 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
11968 pending_flags
|= old_flags
;
11970 pending_flags
|= flags
;
11972 pending_flags
&= ~flags
;
11978 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11979 get_last_committed() + 1));
11982 } else if (prefix
== "osd pg-temp") {
11984 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11985 ss
<< "unable to parse 'pgid' value '"
11986 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11991 if (!pgid
.parse(pgidstr
.c_str())) {
11992 ss
<< "invalid pgid '" << pgidstr
<< "'";
11996 if (!osdmap
.pg_exists(pgid
)) {
11997 ss
<< "pg " << pgid
<< " does not exist";
12001 if (pending_inc
.new_pg_temp
.count(pgid
)) {
12002 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
12003 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12007 vector
<int64_t> id_vec
;
12008 vector
<int32_t> new_pg_temp
;
12009 cmd_getval(cmdmap
, "id", id_vec
);
12010 if (id_vec
.empty()) {
12011 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
12012 ss
<< "done cleaning up pg_temp of " << pgid
;
12015 for (auto osd
: id_vec
) {
12016 if (!osdmap
.exists(osd
)) {
12017 ss
<< "osd." << osd
<< " does not exist";
12021 new_pg_temp
.push_back(osd
);
12024 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
12025 if ((int)new_pg_temp
.size() < pool_min_size
) {
12026 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
12027 << pool_min_size
<< ")";
12032 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12033 if ((int)new_pg_temp
.size() > pool_size
) {
12034 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
12035 << pool_size
<< ")";
12040 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
12041 new_pg_temp
.begin(), new_pg_temp
.end());
12042 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
12044 } else if (prefix
== "osd primary-temp") {
12046 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
12047 ss
<< "unable to parse 'pgid' value '"
12048 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
12053 if (!pgid
.parse(pgidstr
.c_str())) {
12054 ss
<< "invalid pgid '" << pgidstr
<< "'";
12058 if (!osdmap
.pg_exists(pgid
)) {
12059 ss
<< "pg " << pgid
<< " does not exist";
12065 if (!cmd_getval(cmdmap
, "id", osd
)) {
12066 ss
<< "unable to parse 'id' value '"
12067 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12071 if (osd
!= -1 && !osdmap
.exists(osd
)) {
12072 ss
<< "osd." << osd
<< " does not exist";
12077 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12078 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12079 ss
<< "require_min_compat_client "
12080 << osdmap
.require_min_compat_client
12081 << " < firefly, which is required for primary-temp";
12086 pending_inc
.new_primary_temp
[pgid
] = osd
;
12087 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
12089 } else if (prefix
== "pg repeer") {
12092 cmd_getval(cmdmap
, "pgid", pgidstr
);
12093 if (!pgid
.parse(pgidstr
.c_str())) {
12094 ss
<< "invalid pgid '" << pgidstr
<< "'";
12098 if (!osdmap
.pg_exists(pgid
)) {
12099 ss
<< "pg '" << pgidstr
<< "' does not exist";
12103 vector
<int> acting
;
12105 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
12108 ss
<< "pg currently has no primary";
12111 if (acting
.size() > 1) {
12112 // map to just primary; it will map back to what it wants
12113 pending_inc
.new_pg_temp
[pgid
] = { primary
};
12115 // hmm, pick another arbitrary osd to induce a change. Note
12116 // that this won't work if there is only one suitable OSD in the cluster.
12119 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
12120 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
12123 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
12129 ss
<< "not enough up OSDs in the cluster to force repeer";
12134 } else if (prefix
== "osd pg-upmap" ||
12135 prefix
== "osd rm-pg-upmap" ||
12136 prefix
== "osd pg-upmap-items" ||
12137 prefix
== "osd rm-pg-upmap-items") {
12138 if (osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
12139 ss
<< "min_compat_client "
12140 << osdmap
.require_min_compat_client
12141 << " < luminous, which is required for pg-upmap. "
12142 << "Try 'ceph osd set-require-min-compat-client luminous' "
12143 << "before using the new interface";
12147 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
12148 if (err
== -EAGAIN
)
12153 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
12154 ss
<< "unable to parse 'pgid' value '"
12155 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
12160 if (!pgid
.parse(pgidstr
.c_str())) {
12161 ss
<< "invalid pgid '" << pgidstr
<< "'";
12165 if (!osdmap
.pg_exists(pgid
)) {
12166 ss
<< "pg " << pgid
<< " does not exist";
12170 if (pending_inc
.old_pools
.count(pgid
.pool())) {
12171 ss
<< "pool of " << pgid
<< " is pending removal";
12174 wait_for_finished_proposal(op
,
12175 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
12183 OP_RM_PG_UPMAP_ITEMS
,
12186 if (prefix
== "osd pg-upmap") {
12187 option
= OP_PG_UPMAP
;
12188 } else if (prefix
== "osd rm-pg-upmap") {
12189 option
= OP_RM_PG_UPMAP
;
12190 } else if (prefix
== "osd pg-upmap-items") {
12191 option
= OP_PG_UPMAP_ITEMS
;
12193 option
= OP_RM_PG_UPMAP_ITEMS
;
12196 // check pending upmap changes
12198 case OP_PG_UPMAP
: // fall through
12199 case OP_RM_PG_UPMAP
:
12200 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
12201 pending_inc
.old_pg_upmap
.count(pgid
)) {
12202 dout(10) << __func__
<< " waiting for pending update on "
12204 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12209 case OP_PG_UPMAP_ITEMS
: // fall through
12210 case OP_RM_PG_UPMAP_ITEMS
:
12211 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
12212 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
12213 dout(10) << __func__
<< " waiting for pending update on "
12215 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12221 ceph_abort_msg("invalid option");
12227 vector
<int64_t> id_vec
;
12228 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12229 ss
<< "unable to parse 'id' value(s) '"
12230 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12235 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
12236 if ((int)id_vec
.size() < pool_min_size
) {
12237 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
12238 << pool_min_size
<< ")";
12243 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12244 if ((int)id_vec
.size() > pool_size
) {
12245 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
12246 << pool_size
<< ")";
12251 vector
<int32_t> new_pg_upmap
;
12252 for (auto osd
: id_vec
) {
12253 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
12254 ss
<< "osd." << osd
<< " does not exist";
12258 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
12259 if (it
!= new_pg_upmap
.end()) {
12260 ss
<< "osd." << osd
<< " already exists, ";
12263 new_pg_upmap
.push_back(osd
);
12266 if (new_pg_upmap
.empty()) {
12267 ss
<< "no valid upmap items(pairs) is specified";
12272 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
12273 new_pg_upmap
.begin(), new_pg_upmap
.end());
12274 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
12278 case OP_RM_PG_UPMAP
:
12280 pending_inc
.old_pg_upmap
.insert(pgid
);
12281 ss
<< "clear " << pgid
<< " pg_upmap mapping";
12285 case OP_PG_UPMAP_ITEMS
:
12287 vector
<int64_t> id_vec
;
12288 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12289 ss
<< "unable to parse 'id' value(s) '"
12290 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12295 if (id_vec
.size() % 2) {
12296 ss
<< "you must specify pairs of osd ids to be remapped";
12301 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12302 if ((int)(id_vec
.size() / 2) > pool_size
) {
12303 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
12304 << pool_size
<< ")";
12309 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
12310 ostringstream items
;
12312 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
12316 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
12319 if (!osdmap
.exists(from
)) {
12320 ss
<< "osd." << from
<< " does not exist";
12324 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
12325 ss
<< "osd." << to
<< " does not exist";
12329 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
12330 auto it
= std::find(new_pg_upmap_items
.begin(),
12331 new_pg_upmap_items
.end(), entry
);
12332 if (it
!= new_pg_upmap_items
.end()) {
12333 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
12336 new_pg_upmap_items
.push_back(entry
);
12337 items
<< from
<< "->" << to
<< ",";
12339 string
out(items
.str());
12340 out
.resize(out
.size() - 1); // drop last ','
12343 if (new_pg_upmap_items
.empty()) {
12344 ss
<< "no valid upmap items(pairs) is specified";
12349 pending_inc
.new_pg_upmap_items
[pgid
] =
12350 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
12351 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
12352 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
12356 case OP_RM_PG_UPMAP_ITEMS
:
12358 pending_inc
.old_pg_upmap_items
.insert(pgid
);
12359 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
12364 ceph_abort_msg("invalid option");
12368 } else if (prefix
== "osd primary-affinity") {
12370 if (!cmd_getval(cmdmap
, "id", id
)) {
12371 ss
<< "invalid osd id value '"
12372 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12377 if (!cmd_getval(cmdmap
, "weight", w
)) {
12378 ss
<< "unable to parse 'weight' value '"
12379 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12383 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
12385 ss
<< "weight must be >= 0";
12389 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12390 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12391 ss
<< "require_min_compat_client "
12392 << osdmap
.require_min_compat_client
12393 << " < firefly, which is required for primary-affinity";
12397 if (osdmap
.exists(id
)) {
12398 pending_inc
.new_primary_affinity
[id
] = ww
;
12399 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << std::ios::hex
<< ww
<< std::ios::dec
<< ")";
12401 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12402 get_last_committed() + 1));
12405 ss
<< "osd." << id
<< " does not exist";
12409 } else if (prefix
== "osd reweight") {
12411 if (!cmd_getval(cmdmap
, "id", id
)) {
12412 ss
<< "unable to parse osd id value '"
12413 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12418 if (!cmd_getval(cmdmap
, "weight", w
)) {
12419 ss
<< "unable to parse weight value '"
12420 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12424 long ww
= (int)((double)CEPH_OSD_IN
*w
);
12426 ss
<< "weight must be >= 0";
12430 if (osdmap
.exists(id
)) {
12431 pending_inc
.new_weight
[id
] = ww
;
12432 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
12434 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12435 get_last_committed() + 1));
12438 ss
<< "osd." << id
<< " does not exist";
12442 } else if (prefix
== "osd reweightn") {
12443 map
<int32_t, uint32_t> weights
;
12444 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
12446 ss
<< "unable to parse 'weights' value '"
12447 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
12450 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
12451 wait_for_finished_proposal(
12453 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
12455 } else if (prefix
== "osd lost") {
12457 if (!cmd_getval(cmdmap
, "id", id
)) {
12458 ss
<< "unable to parse osd id value '"
12459 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12464 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12466 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
12467 "--yes-i-really-mean-it if you really do.";
12470 } else if (!osdmap
.exists(id
)) {
12471 ss
<< "osd." << id
<< " does not exist";
12474 } else if (!osdmap
.is_down(id
)) {
12475 ss
<< "osd." << id
<< " is not down";
12479 epoch_t e
= osdmap
.get_info(id
).down_at
;
12480 pending_inc
.new_lost
[id
] = e
;
12481 ss
<< "marked osd lost in epoch " << e
;
12483 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12484 get_last_committed() + 1));
12488 } else if (prefix
== "osd destroy-actual" ||
12489 prefix
== "osd purge-actual" ||
12490 prefix
== "osd purge-new") {
12491 /* Destroying an OSD means that we don't expect to further make use of
12492 * the OSDs data (which may even become unreadable after this operation),
12493 * and that we are okay with scrubbing all its cephx keys and config-key
12494 * data (which may include lockbox keys, thus rendering the osd's data
12497 * The OSD will not be removed. Instead, we will mark it as destroyed,
12498 * such that a subsequent call to `create` will not reuse the osd id.
12499 * This will play into being able to recreate the OSD, at the same
12500 * crush location, with minimal data movement.
12503 // make sure authmon is writeable.
12504 if (!mon
.authmon()->is_writeable()) {
12505 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12506 << "osd destroy" << dendl
;
12507 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12512 if (!cmd_getval(cmdmap
, "id", id
)) {
12513 auto p
= cmdmap
.find("id");
12514 if (p
== cmdmap
.end()) {
12515 ss
<< "no osd id specified";
12517 ss
<< "unable to parse osd id value '"
12518 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12524 bool is_destroy
= (prefix
== "osd destroy-actual");
12526 ceph_assert("osd purge-actual" == prefix
||
12527 "osd purge-new" == prefix
);
12531 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12533 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12534 << "This will mean real, permanent data loss, as well "
12535 << "as deletion of cephx and lockbox keys. "
12536 << "Pass --yes-i-really-mean-it if you really do.";
12539 } else if (!osdmap
.exists(id
)) {
12540 ss
<< "osd." << id
<< " does not exist";
12541 err
= 0; // idempotent
12543 } else if (osdmap
.is_up(id
)) {
12544 ss
<< "osd." << id
<< " is not `down`.";
12547 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12548 ss
<< "destroyed osd." << id
;
12553 if (prefix
== "osd purge-new" &&
12554 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12555 ss
<< "osd." << id
<< " is not new";
12560 bool goto_reply
= false;
12564 err
= prepare_command_osd_destroy(id
, ss
);
12565 // we checked above that it should exist.
12566 ceph_assert(err
!= -ENOENT
);
12568 err
= prepare_command_osd_purge(id
, ss
);
12569 if (err
== -ENOENT
) {
12571 ss
<< "osd." << id
<< " does not exist.";
12577 if (err
< 0 || goto_reply
) {
12582 ss
<< "destroyed osd." << id
;
12584 ss
<< "purged osd." << id
;
12588 wait_for_finished_proposal(op
,
12589 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12590 force_immediate_propose();
12593 } else if (prefix
== "osd new") {
12595 // make sure authmon is writeable.
12596 if (!mon
.authmon()->is_writeable()) {
12597 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12598 << "osd new" << dendl
;
12599 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12603 // make sure kvmon is writeable.
12604 if (!mon
.kvmon()->is_writeable()) {
12605 dout(10) << __func__
<< " waiting for kv mon to be writeable for "
12606 << "osd new" << dendl
;
12607 mon
.kvmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12611 map
<string
,string
> param_map
;
12613 bufferlist bl
= m
->get_data();
12614 string param_json
= bl
.to_str();
12615 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12617 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12621 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12624 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12637 if (err
== EEXIST
) {
12638 // idempotent operation
12643 wait_for_finished_proposal(op
,
12644 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12645 get_last_committed() + 1));
12646 force_immediate_propose();
12649 } else if (prefix
== "osd create") {
12651 // optional id provided?
12652 int64_t id
= -1, cmd_id
= -1;
12653 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12655 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12659 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12664 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12665 if (!uuid
.parse(uuidstr
.c_str())) {
12666 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12670 // we only care about the id if we also have the uuid, to
12671 // ensure the operation's idempotency.
12675 int32_t new_id
= -1;
12676 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12678 if (err
== -EAGAIN
) {
12679 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12682 // a check has failed; reply to the user.
12685 } else if (err
== EEXIST
) {
12686 // this is an idempotent operation; we can go ahead and reply.
12688 f
->open_object_section("created_osd");
12689 f
->dump_int("osdid", new_id
);
12690 f
->close_section();
12700 string empty_device_class
;
12701 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12704 f
->open_object_section("created_osd");
12705 f
->dump_int("osdid", new_id
);
12706 f
->close_section();
12712 wait_for_finished_proposal(op
,
12713 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12714 get_last_committed() + 1));
12717 } else if (prefix
== "osd blocklist clear" ||
12718 prefix
== "osd blacklist clear") {
12719 pending_inc
.new_blocklist
.clear();
12720 std::list
<std::pair
<entity_addr_t
,utime_t
> > blocklist
;
12721 std::list
<std::pair
<entity_addr_t
,utime_t
> > range_b
;
12722 osdmap
.get_blocklist(&blocklist
, &range_b
);
12723 for (const auto &entry
: blocklist
) {
12724 pending_inc
.old_blocklist
.push_back(entry
.first
);
12726 for (const auto &entry
: range_b
) {
12727 pending_inc
.old_range_blocklist
.push_back(entry
.first
);
12729 ss
<< " removed all blocklist entries";
12731 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12732 get_last_committed() + 1));
12734 } else if (prefix
== "osd blocklist" ||
12735 prefix
== "osd blacklist") {
12736 string addrstr
, rangestr
;
12737 bool range
= false;
12738 cmd_getval(cmdmap
, "addr", addrstr
);
12739 if (cmd_getval(cmdmap
, "range", rangestr
)) {
12740 if (rangestr
== "range") {
12743 ss
<< "Did you mean to specify \"osd blocklist range\"?";
12748 entity_addr_t addr
;
12749 if (!addr
.parse(addrstr
)) {
12750 ss
<< "unable to parse address " << addrstr
;
12756 if (!addr
.maybe_cidr()) {
12757 ss
<< "You specified a range command, but " << addr
12758 << " does not parse as a CIDR range";
12762 addr
.type
= entity_addr_t::TYPE_CIDR
;
12763 err
= check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST
, ss
);
12767 if ((addr
.is_ipv4() && addr
.get_nonce() > 32) ||
12768 (addr
.is_ipv6() && addr
.get_nonce() > 128)) {
12769 ss
<< "Too many bits in range for that protocol!";
12774 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12775 // always blocklist type ANY
12776 addr
.set_type(entity_addr_t::TYPE_ANY
);
12778 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12782 string blocklistop
;
12783 if (!cmd_getval(cmdmap
, "blocklistop", blocklistop
)) {
12784 cmd_getval(cmdmap
, "blacklistop", blocklistop
);
12786 if (blocklistop
== "add") {
12787 utime_t expires
= ceph_clock_now();
12788 // default one hour
12789 double d
= cmd_getval_or
<double>(cmdmap
, "expire",
12790 g_conf()->mon_osd_blocklist_default_expire
);
12793 auto add_to_pending_blocklists
= [](auto& nb
, auto& ob
,
12795 const auto& expires
) {
12796 nb
[addr
] = expires
;
12797 // cancel any pending un-blocklisting request too
12798 auto it
= std::find(ob
.begin(),
12800 if (it
!= ob
.end()) {
12805 add_to_pending_blocklists(pending_inc
.new_range_blocklist
,
12806 pending_inc
.old_range_blocklist
,
12810 add_to_pending_blocklists(pending_inc
.new_blocklist
,
12811 pending_inc
.old_blocklist
,
12815 ss
<< "blocklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12817 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12818 get_last_committed() + 1));
12820 } else if (blocklistop
== "rm") {
12821 auto rm_from_pending_blocklists
= [](const auto& addr
,
12823 auto& ob
, auto& pb
) {
12824 if (blocklist
.count(addr
)) {
12825 ob
.push_back(addr
);
12827 } else if (pb
.count(addr
)) {
12833 if ((!range
&& rm_from_pending_blocklists(addr
, osdmap
.blocklist
,
12834 pending_inc
.old_blocklist
,
12835 pending_inc
.new_blocklist
)) ||
12836 (range
&& rm_from_pending_blocklists(addr
, osdmap
.range_blocklist
,
12837 pending_inc
.old_range_blocklist
,
12838 pending_inc
.new_range_blocklist
))) {
12839 ss
<< "un-blocklisting " << addr
;
12841 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12842 get_last_committed() + 1));
12845 ss
<< addr
<< " isn't blocklisted";
12850 } else if (prefix
== "osd pool mksnap") {
12852 cmd_getval(cmdmap
, "pool", poolstr
);
12853 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12855 ss
<< "unrecognized pool '" << poolstr
<< "'";
12860 cmd_getval(cmdmap
, "snap", snapname
);
12861 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12862 if (p
->is_unmanaged_snaps_mode()) {
12863 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12866 } else if (p
->snap_exists(snapname
.c_str())) {
12867 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12870 } else if (p
->is_tier()) {
12871 ss
<< "pool " << poolstr
<< " is a cache tier";
12876 if (pending_inc
.new_pools
.count(pool
))
12877 pp
= &pending_inc
.new_pools
[pool
];
12879 pp
= &pending_inc
.new_pools
[pool
];
12882 if (pp
->snap_exists(snapname
.c_str())) {
12883 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12885 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
12886 pp
->set_snap_epoch(pending_inc
.epoch
);
12887 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
12890 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12891 get_last_committed() + 1));
12893 } else if (prefix
== "osd pool rmsnap") {
12895 cmd_getval(cmdmap
, "pool", poolstr
);
12896 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12898 ss
<< "unrecognized pool '" << poolstr
<< "'";
12903 cmd_getval(cmdmap
, "snap", snapname
);
12904 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12905 if (p
->is_unmanaged_snaps_mode()) {
12906 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12909 } else if (!p
->snap_exists(snapname
.c_str())) {
12910 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
12915 if (pending_inc
.new_pools
.count(pool
))
12916 pp
= &pending_inc
.new_pools
[pool
];
12918 pp
= &pending_inc
.new_pools
[pool
];
12921 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
12923 pp
->remove_snap(sn
);
12924 pp
->set_snap_epoch(pending_inc
.epoch
);
12925 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
12927 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
12930 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12931 get_last_committed() + 1));
12933 } else if (prefix
== "osd pool create") {
12934 int64_t pg_num
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num", 0);
12935 int64_t pg_num_min
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num_min", 0);
12936 int64_t pg_num_max
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num_max", 0);
12937 int64_t pgp_num
= cmd_getval_or
<int64_t>(cmdmap
, "pgp_num", pg_num
);
12938 string pool_type_str
;
12939 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
12940 if (pool_type_str
.empty())
12941 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
12944 cmd_getval(cmdmap
, "pool", poolstr
);
12945 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12946 if (pool_id
>= 0) {
12947 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12948 if (pool_type_str
!= p
->get_type_name()) {
12949 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
12952 ss
<< "pool '" << poolstr
<< "' already exists";
12959 if (pool_type_str
== "replicated") {
12960 pool_type
= pg_pool_t::TYPE_REPLICATED
;
12961 } else if (pool_type_str
== "erasure") {
12962 pool_type
= pg_pool_t::TYPE_ERASURE
;
12964 ss
<< "unknown pool type '" << pool_type_str
<< "'";
12969 bool implicit_rule_creation
= false;
12970 int64_t expected_num_objects
= 0;
12972 cmd_getval(cmdmap
, "rule", rule_name
);
12973 string erasure_code_profile
;
12974 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
12976 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
12977 if (erasure_code_profile
== "")
12978 erasure_code_profile
= "default";
12979 //handle the erasure code profile
12980 if (erasure_code_profile
== "default") {
12981 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
12982 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
12983 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
12987 map
<string
,string
> profile_map
;
12988 err
= osdmap
.get_erasure_code_profile_default(cct
,
12993 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
12994 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
12998 if (rule_name
== "") {
12999 implicit_rule_creation
= true;
13000 if (erasure_code_profile
== "default") {
13001 rule_name
= "erasure-code";
13003 dout(1) << "implicitly use rule named after the pool: "
13004 << poolstr
<< dendl
;
13005 rule_name
= poolstr
;
13008 expected_num_objects
=
13009 cmd_getval_or
<int64_t>(cmdmap
, "expected_num_objects", 0);
13011 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
13012 // and put expected_num_objects to rule field
13013 if (erasure_code_profile
!= "") { // cmd is from CLI
13014 if (rule_name
!= "") {
13016 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
13017 if (interr
.length()) {
13018 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
13023 rule_name
= erasure_code_profile
;
13024 } else { // cmd is well-formed
13025 expected_num_objects
=
13026 cmd_getval_or
<int64_t>(cmdmap
, "expected_num_objects", 0);
13030 if (!implicit_rule_creation
&& rule_name
!= "") {
13032 err
= get_crush_rule(rule_name
, &rule
, &ss
);
13033 if (err
== -EAGAIN
) {
13034 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13041 if (expected_num_objects
< 0) {
13042 ss
<< "'expected_num_objects' must be non-negative";
13048 osdmap
.get_all_osds(osds
);
13049 bool has_filestore_osd
= std::any_of(osds
.begin(), osds
.end(), [this](int osd
) {
13051 if (!get_osd_objectstore_type(osd
, &type
)) {
13052 return type
== "filestore";
13058 if (has_filestore_osd
&&
13059 expected_num_objects
> 0 &&
13060 cct
->_conf
->filestore_merge_threshold
> 0) {
13061 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13066 if (has_filestore_osd
&&
13067 expected_num_objects
== 0 &&
13068 cct
->_conf
->filestore_merge_threshold
< 0) {
13069 int osds
= osdmap
.get_num_osds();
13071 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13072 if (!sure
&& osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
13073 ss
<< "For better initial performance on pools expected to store a "
13074 << "large number of objects, consider supplying the "
13075 << "expected_num_objects parameter when creating the pool."
13076 << " Pass --yes-i-really-mean-it to ignore it";
13082 int64_t fast_read_param
= cmd_getval_or
<int64_t>(cmdmap
, "fast_read", -1);
13083 FastReadType fast_read
= FAST_READ_DEFAULT
;
13084 if (fast_read_param
== 0)
13085 fast_read
= FAST_READ_OFF
;
13086 else if (fast_read_param
> 0)
13087 fast_read
= FAST_READ_ON
;
13089 int64_t repl_size
= 0;
13090 cmd_getval(cmdmap
, "size", repl_size
);
13091 int64_t target_size_bytes
= 0;
13092 double target_size_ratio
= 0.0;
13093 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
13094 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
13096 string pg_autoscale_mode
;
13097 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
13099 bool bulk
= cmd_getval_or
<bool>(cmdmap
, "bulk", 0);
13100 err
= prepare_new_pool(poolstr
,
13101 -1, // default crush rule
13103 pg_num
, pgp_num
, pg_num_min
, pg_num_max
,
13104 repl_size
, target_size_bytes
, target_size_ratio
,
13105 erasure_code_profile
, pool_type
,
13106 (uint64_t)expected_num_objects
,
13114 ss
<< "pool '" << poolstr
<< "' already exists";
13117 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13126 ss
<< "pool '" << poolstr
<< "' created";
13129 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13130 get_last_committed() + 1));
13133 } else if (prefix
== "osd pool delete" ||
13134 prefix
== "osd pool rm") {
13135 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13136 string poolstr
, poolstr2
, sure
;
13137 cmd_getval(cmdmap
, "pool", poolstr
);
13138 cmd_getval(cmdmap
, "pool2", poolstr2
);
13139 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
13141 ss
<< "pool '" << poolstr
<< "' does not exist";
13146 bool force_no_fake
= false;
13147 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
13148 bool force
= false;
13149 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
13150 if (poolstr2
!= poolstr
||
13151 (!force
&& !force_no_fake
)) {
13152 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13153 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13154 << "followed by --yes-i-really-really-mean-it.";
13158 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
13159 if (err
== -EAGAIN
) {
13160 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13166 } else if (prefix
== "osd pool rename") {
13167 string srcpoolstr
, destpoolstr
;
13168 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
13169 cmd_getval(cmdmap
, "destpool", destpoolstr
);
13170 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
13171 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
13173 if (pool_src
< 0) {
13174 if (pool_dst
>= 0) {
13175 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13176 // of operations, assume this rename succeeded, as it is not changing
13177 // the current state. Make sure we output something understandable
13178 // for whoever is issuing the command, if they are paying attention,
13179 // in case it was not intentional; or to avoid a "wtf?" and a bug
13180 // report in case it was intentional, while expecting a failure.
13181 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
13182 << destpoolstr
<< "' does -- assuming successful rename";
13185 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
13189 } else if (pool_dst
>= 0) {
13190 // source pool exists and so does the destination pool
13191 ss
<< "pool '" << destpoolstr
<< "' already exists";
13196 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
13198 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
13200 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
13201 << cpp_strerror(ret
);
13204 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
13205 get_last_committed() + 1));
13208 } else if (prefix
== "osd pool set") {
13209 err
= prepare_command_pool_set(cmdmap
, ss
);
13210 if (err
== -EAGAIN
)
13216 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13217 get_last_committed() + 1));
13219 } else if (prefix
== "osd tier add") {
13220 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13221 if (err
== -EAGAIN
)
13226 cmd_getval(cmdmap
, "pool", poolstr
);
13227 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13229 ss
<< "unrecognized pool '" << poolstr
<< "'";
13233 string tierpoolstr
;
13234 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13235 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13236 if (tierpool_id
< 0) {
13237 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13241 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13243 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13246 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13250 // make sure new tier is empty
13251 bool force_nonempty
= false;
13252 cmd_getval_compat_cephbool(cmdmap
, "force_nonempty", force_nonempty
);
13253 const pool_stat_t
*pstats
= mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13254 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
13256 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
13260 if (tp
->is_erasure()) {
13261 ss
<< "tier pool '" << tierpoolstr
13262 << "' is an ec pool, which cannot be a tier";
13266 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
13267 (!force_nonempty
||
13268 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
)) {
13269 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
13274 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13275 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13276 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13277 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13280 np
->tiers
.insert(tierpool_id
);
13281 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13282 ntp
->tier_of
= pool_id
;
13283 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
13284 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13285 get_last_committed() + 1));
13287 } else if (prefix
== "osd tier remove" ||
13288 prefix
== "osd tier rm") {
13290 cmd_getval(cmdmap
, "pool", poolstr
);
13291 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13293 ss
<< "unrecognized pool '" << poolstr
<< "'";
13297 string tierpoolstr
;
13298 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13299 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13300 if (tierpool_id
< 0) {
13301 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13305 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13307 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13310 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
13314 if (p
->tiers
.count(tierpool_id
) == 0) {
13315 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13319 if (tp
->tier_of
!= pool_id
) {
13320 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
13321 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
13322 // be scary about it; this is an inconsistency and bells must go off
13323 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13327 if (p
->read_tier
== tierpool_id
) {
13328 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
13333 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13334 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13335 if (np
->tiers
.count(tierpool_id
) == 0 ||
13336 ntp
->tier_of
!= pool_id
||
13337 np
->read_tier
== tierpool_id
) {
13338 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13341 np
->tiers
.erase(tierpool_id
);
13343 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13344 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13345 get_last_committed() + 1));
13347 } else if (prefix
== "osd tier set-overlay") {
13348 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13349 if (err
== -EAGAIN
)
13354 cmd_getval(cmdmap
, "pool", poolstr
);
13355 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13357 ss
<< "unrecognized pool '" << poolstr
<< "'";
13361 string overlaypoolstr
;
13362 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
13363 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
13364 if (overlaypool_id
< 0) {
13365 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
13369 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13371 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
13372 ceph_assert(overlay_p
);
13373 if (p
->tiers
.count(overlaypool_id
) == 0) {
13374 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
13378 if (p
->read_tier
== overlaypool_id
) {
13380 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13383 if (p
->has_read_tier()) {
13384 ss
<< "pool '" << poolstr
<< "' has overlay '"
13385 << osdmap
.get_pool_name(p
->read_tier
)
13386 << "'; please remove-overlay first";
13392 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13393 np
->read_tier
= overlaypool_id
;
13394 np
->write_tier
= overlaypool_id
;
13395 np
->set_last_force_op_resend(pending_inc
.epoch
);
13396 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
13397 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
13398 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13399 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
13400 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
13401 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13402 get_last_committed() + 1));
13404 } else if (prefix
== "osd tier remove-overlay" ||
13405 prefix
== "osd tier rm-overlay") {
13407 cmd_getval(cmdmap
, "pool", poolstr
);
13408 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13410 ss
<< "unrecognized pool '" << poolstr
<< "'";
13414 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13416 if (!p
->has_read_tier()) {
13418 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13422 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
13427 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13428 if (np
->has_read_tier()) {
13429 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
13430 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
13431 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13433 if (np
->has_write_tier()) {
13434 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
13435 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
13436 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13438 np
->clear_read_tier();
13439 np
->clear_write_tier();
13440 np
->set_last_force_op_resend(pending_inc
.epoch
);
13441 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13442 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13443 get_last_committed() + 1));
13445 } else if (prefix
== "osd tier cache-mode") {
13446 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13447 if (err
== -EAGAIN
)
13452 cmd_getval(cmdmap
, "pool", poolstr
);
13453 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13455 ss
<< "unrecognized pool '" << poolstr
<< "'";
13459 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13461 if (!p
->is_tier()) {
13462 ss
<< "pool '" << poolstr
<< "' is not a tier";
13467 cmd_getval(cmdmap
, "mode", modestr
);
13468 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13469 if (int(mode
) < 0) {
13470 ss
<< "'" << modestr
<< "' is not a valid cache mode";
13476 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13478 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
13479 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
13480 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
13484 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13485 mode
!= pg_pool_t::CACHEMODE_NONE
&&
13486 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13487 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
13489 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
13490 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13495 // pool already has this cache-mode set and there are no pending changes
13496 if (p
->cache_mode
== mode
&&
13497 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
13498 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
13499 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
13500 << " to " << pg_pool_t::get_cache_mode_name(mode
);
13505 /* Mode description:
13507 * none: No cache-mode defined
13508 * forward: Forward all reads and writes to base pool [removed]
13509 * writeback: Cache writes, promote reads from base pool
13510 * readonly: Forward writes to base pool
13511 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13512 * proxy: Proxy all reads and writes to base pool
13513 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13515 * Hence, these are the allowed transitions:
13518 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13519 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13520 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13521 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13522 * writeback -> readproxy || proxy
13526 // We check if the transition is valid against the current pool mode, as
13527 // it is the only committed state thus far. We will blantly squash
13528 // whatever mode is on the pending state.
13530 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
13531 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13532 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
13533 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
13534 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
13535 << "' pool; only '"
13536 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
13541 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
13542 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13543 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13544 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13546 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
13547 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13548 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13550 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13551 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13552 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13554 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13555 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13556 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13557 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13559 const pool_stat_t
* pstats
=
13560 mon
.mgrstatmon()->get_pool_stat(pool_id
);
13562 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13563 ss
<< "unable to set cache-mode '"
13564 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13565 << "': dirty objects found";
13571 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13572 np
->cache_mode
= mode
;
13573 // set this both when moving to and from cache_mode NONE. this is to
13574 // capture legacy pools that were set up before this flag existed.
13575 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13576 ss
<< "set cache-mode for pool '" << poolstr
13577 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13578 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13579 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13580 ceph_assert(base_pool
);
13581 if (base_pool
->read_tier
== pool_id
||
13582 base_pool
->write_tier
== pool_id
)
13583 ss
<<" (WARNING: pool is still configured as read or write tier)";
13585 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13586 get_last_committed() + 1));
13588 } else if (prefix
== "osd tier add-cache") {
13589 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13590 if (err
== -EAGAIN
)
13595 cmd_getval(cmdmap
, "pool", poolstr
);
13596 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13598 ss
<< "unrecognized pool '" << poolstr
<< "'";
13602 string tierpoolstr
;
13603 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13604 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13605 if (tierpool_id
< 0) {
13606 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13610 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13612 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13615 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13620 if (!cmd_getval(cmdmap
, "size", size
)) {
13621 ss
<< "unable to parse 'size' value '"
13622 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13626 // make sure new tier is empty
13627 const pool_stat_t
*pstats
=
13628 mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13629 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13630 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13634 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13635 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13636 if (int(mode
) < 0) {
13637 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13641 HitSet::Params hsp
;
13642 auto& cache_hit_set_type
=
13643 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13644 if (cache_hit_set_type
== "bloom") {
13645 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13646 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13647 hsp
= HitSet::Params(bsp
);
13648 } else if (cache_hit_set_type
== "explicit_hash") {
13649 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13650 } else if (cache_hit_set_type
== "explicit_object") {
13651 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13653 ss
<< "osd tier cache default hit set type '"
13654 << cache_hit_set_type
<< "' is not a known type";
13659 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13660 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13661 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13662 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13665 np
->tiers
.insert(tierpool_id
);
13666 np
->read_tier
= np
->write_tier
= tierpool_id
;
13667 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13668 np
->set_last_force_op_resend(pending_inc
.epoch
);
13669 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13670 ntp
->tier_of
= pool_id
;
13671 ntp
->cache_mode
= mode
;
13672 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13673 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13674 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13675 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13676 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13677 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13678 ntp
->hit_set_params
= hsp
;
13679 ntp
->target_max_bytes
= size
;
13680 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13681 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13682 get_last_committed() + 1));
13684 } else if (prefix
== "osd pool set-quota") {
13686 cmd_getval(cmdmap
, "pool", poolstr
);
13687 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13689 ss
<< "unrecognized pool '" << poolstr
<< "'";
13695 cmd_getval(cmdmap
, "field", field
);
13696 if (field
!= "max_objects" && field
!= "max_bytes") {
13697 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13702 // val could contain unit designations, so we treat as a string
13704 cmd_getval(cmdmap
, "val", val
);
13707 if (field
== "max_objects") {
13708 value
= strict_si_cast
<uint64_t>(val
, &tss
);
13709 } else if (field
== "max_bytes") {
13710 value
= strict_iecstrtoll(val
, &tss
);
13712 ceph_abort_msg("unrecognized option");
13714 if (!tss
.empty()) {
13715 ss
<< "error parsing value '" << val
<< "': " << tss
;
13720 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13721 if (field
== "max_objects") {
13722 pi
->quota_max_objects
= value
;
13723 } else if (field
== "max_bytes") {
13724 pi
->quota_max_bytes
= value
;
13726 ceph_abort_msg("unrecognized option");
13728 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13730 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13731 get_last_committed() + 1));
13733 } else if (prefix
== "osd pool application enable" ||
13734 prefix
== "osd pool application disable" ||
13735 prefix
== "osd pool application set" ||
13736 prefix
== "osd pool application rm") {
13737 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13738 if (err
== -EAGAIN
) {
13740 } else if (err
< 0) {
13745 } else if (prefix
== "osd force-create-pg") {
13748 cmd_getval(cmdmap
, "pgid", pgidstr
);
13749 if (!pgid
.parse(pgidstr
.c_str())) {
13750 ss
<< "invalid pgid '" << pgidstr
<< "'";
13754 if (!osdmap
.pg_exists(pgid
)) {
13755 ss
<< "pg " << pgid
<< " should not exist";
13760 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13762 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13763 << "that the cluster will give up ever trying to recover the lost data. Do this "
13764 << "only if you are certain that all copies of the PG are in fact lost and you are "
13765 << "willing to accept that the data is permanently destroyed. Pass "
13766 << "--yes-i-really-mean-it to proceed.";
13772 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13773 auto emplaced
= creating_pgs
.pgs
.emplace(
13775 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13776 ceph_clock_now()));
13777 creating_now
= emplaced
.second
;
13779 if (creating_now
) {
13780 ss
<< "pg " << pgidstr
<< " now creating, ok";
13781 // set the pool's CREATING flag so that (1) the osd won't ignore our
13782 // create message and (2) we won't propose any future pg_num changes
13783 // until after the PG has been instantiated.
13784 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13785 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13787 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13791 ss
<< "pg " << pgid
<< " already creating";
13795 } else if (prefix
== "osd force_healthy_stretch_mode") {
13797 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13799 ss
<< "This command will require peering across multiple CRUSH buckets "
13800 "(probably two data centers or availability zones?) and may result in PGs "
13801 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13805 try_end_recovery_stretch_mode(true);
13806 ss
<< "Triggering healthy stretch mode";
13809 } else if (prefix
== "osd force_recovery_stretch_mode") {
13811 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13813 ss
<< "This command will increase pool sizes to try and spread them "
13814 "across multiple CRUSH buckets (probably two data centers or "
13815 "availability zones?) and should have happened automatically"
13816 "Pass --yes-i-really-mean-it to proceed.";
13820 mon
.go_recovery_stretch_mode();
13821 ss
<< "Triggering recovery stretch mode";
13830 if (err
< 0 && rs
.length() == 0)
13831 rs
= cpp_strerror(err
);
13832 mon
.reply_command(op
, err
, rs
, rdata
, get_last_committed());
13837 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13838 get_last_committed() + 1));
13842 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// Capability gate for incoming MPoolOp messages.
// Returns true when the op was rejected (an -EPERM reply has already been
// sent via _pool_op_reply); callers stop processing in that case.
// NOTE(review): this source text is garbled by extraction — statements are
// wrapped across lines and some original lines (braces, returns, the
// switch header) are missing. Confirm against upstream before editing.
13846 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
13848 op
->mark_osdmon_event(__func__
);
13850 auto m
= op
->get_req
<MPoolOp
>();
13851 MonSession
*session
= op
->get_session();
// No session: reject with -EPERM at the current osdmap epoch.
13853 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Unmanaged-snap ops get a dedicated permission check (below) that can
// consult per-pool caps; other ops fall through to the generic check.
13858 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13859 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13861 const std::string
* pool_name
= nullptr;
13862 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
13863 if (pg_pool
!= nullptr) {
13864 pool_name
= &osdmap
.get_pool_name(m
->pool
);
// Delegate to the shared helper; identity, caps, and peer address all
// factor into the decision.
13867 if (!is_unmanaged_snap_op_permitted(cct
, mon
.key_server
,
13868 session
->entity_name
, session
->caps
,
13869 session
->get_peer_socket_addr(),
13871 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13872 << "privileges. message: " << *m
<< std::endl
13873 << "caps: " << session
->caps
<< dendl
;
13874 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Generic path: any other pool op requires monitor 'osd' write capability.
13880 if (!session
->is_capable("osd", MON_CAP_W
)) {
13881 dout(0) << "got pool op from entity with insufficient privileges. "
13882 << "message: " << *m
<< std::endl
13883 << "caps: " << session
->caps
<< dendl
;
13884 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Read-only fast path for MPoolOp: answers ops that need no map change
// (bad fsid, nonexistent pool, idempotent repeats) and returns true when a
// reply was sent. Returning false hands the op to prepare_pool_op().
// NOTE(review): garbled extraction — the switch header and several
// return/break lines are missing from this text; confirm against upstream.
13893 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
13895 op
->mark_osdmon_event(__func__
);
13896 auto m
= op
->get_req
<MPoolOp
>();
// Caps first; enforce_pool_op_caps replies -EPERM itself on failure.
13898 if (enforce_pool_op_caps(op
)) {
// Drop cross-cluster traffic: fsid must match our monmap's.
13902 if (m
->fsid
!= mon
.monmap
->fsid
) {
13903 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
13904 << " != " << mon
.monmap
->fsid
<< " for " << *m
<< dendl
;
13905 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13909 if (m
->op
== POOL_OP_CREATE
)
13910 return preprocess_pool_op_create(op
);
13912 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
13913 if (p
== nullptr) {
13914 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
// DELETE of a missing pool is success (idempotent); everything else ENOENT.
13915 if (m
->op
== POOL_OP_DELETE
) {
13916 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13918 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13923 // check if the snap and snapname exist
13924 bool snap_exists
= false;
13925 if (p
->snap_exists(m
->name
.c_str()))
13926 snap_exists
= true;
// Per-op idempotency / mode checks: pool snaps and unmanaged (selfmanaged)
// snaps are mutually exclusive modes on a pool.
13929 case POOL_OP_CREATE_SNAP
:
13930 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
13931 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Snap already exists: report success without proposing a map change.
13935 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13939 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13940 if (p
->is_pool_snaps_mode()) {
13941 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13945 case POOL_OP_DELETE_SNAP
:
13946 if (p
->is_unmanaged_snaps_mode()) {
13947 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13950 if (!snap_exists
) {
13951 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13955 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13956 if (p
->is_pool_snaps_mode()) {
13957 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Already removed/purged: success, nothing to propose.
13960 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
13961 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13965 case POOL_OP_DELETE
:
// Pool name no longer resolves to this id: treat delete as already done.
13966 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
13967 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13971 case POOL_OP_AUID_CHANGE
:
// True if (pool, snap) is already removed: the pool is gone, the snap is in
// the osdmap's removed_snaps_queue, or lookup_purged_snap finds it inside a
// purged [begin, end) interval.
// NOTE(review): garbled extraction — return statements between the visible
// fragments are missing from this text; confirm against upstream.
13981 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
13983 if (!osdmap
.have_pg_pool(pool
)) {
13984 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13985 << " - pool dne" << dendl
;
13988 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
13989 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13990 << " - in osdmap removed_snaps_queue" << dendl
;
// Finally consult the purged-snaps records kept by the monitor.
13993 snapid_t begin
, end
;
13994 int r
= lookup_purged_snap(pool
, snap
, &begin
, &end
);
13996 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13997 << " - purged, [" << begin
<< "," << end
<< ")" << dendl
;
// True if (pool, snap) removal is already queued in the pending incremental:
// either the whole pool is pending deletion or the snap is in
// pending_inc's new_removed_snaps.
14003 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
14005 if (pending_inc
.old_pools
.count(pool
)) {
14006 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14007 << " - pool pending deletion" << dendl
;
14010 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
14011 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14012 << " - in pending new_removed_snaps" << dendl
;
// Fast path for POOL_OP_CREATE: if a pool with the requested name already
// exists, reply success (idempotent create) without proposing anything.
// NOTE(review): garbled extraction — the conditional around the reply and
// the final return are missing from this text; confirm against upstream.
14018 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
14020 op
->mark_osdmon_event(__func__
);
14021 auto m
= op
->get_req
<MPoolOp
>();
14022 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
14024 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Write path for MPoolOp: mutates pending_inc for snap create/delete ops,
// dispatches CREATE/DELETE to their dedicated helpers, and schedules the
// client reply for after the proposal commits.
// NOTE(review): garbled extraction — both switch headers, many break/return
// lines, and the 'changed' bookkeeping branches are missing from this text;
// confirm against upstream before editing.
14031 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
14033 op
->mark_osdmon_event(__func__
);
14034 auto m
= op
->get_req
<MPoolOp
>();
14035 dout(10) << "prepare_pool_op " << *m
<< dendl
;
14036 if (m
->op
== POOL_OP_CREATE
) {
14037 return prepare_pool_op_create(op
);
14038 } else if (m
->op
== POOL_OP_DELETE
) {
14039 return prepare_pool_op_delete(op
);
// Remaining ops act on an existing pool; tracked so we only propose when
// something actually changed.
14043 bool changed
= false;
14045 if (!osdmap
.have_pg_pool(m
->pool
)) {
14046 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
14050 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
// First switch: validation against the *committed* pool state.
14053 case POOL_OP_CREATE_SNAP
:
14054 if (pool
->is_tier()) {
14056 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
14058 } // else, fall through
14059 case POOL_OP_DELETE_SNAP
:
14060 if (!pool
->is_unmanaged_snaps_mode()) {
14061 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
// Create-existing and delete-missing are both no-ops (idempotent).
14062 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
14063 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
14071 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
14074 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14075 // we won't allow removal of an unmanaged snapshot from a pool
14076 // not in unmanaged snaps mode.
14077 if (!pool
->is_unmanaged_snaps_mode()) {
14078 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
14082 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14083 // but we will allow creating an unmanaged snapshot on any pool
14084 // as long as it is not in 'pool' snaps mode.
14085 if (pool
->is_pool_snaps_mode()) {
14086 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
14091 // projected pool info
// Work on the pending copy if one exists, else start from the committed one.
14093 if (pending_inc
.new_pools
.count(m
->pool
))
14094 pp
= pending_inc
.new_pools
[m
->pool
];
14096 pp
= *osdmap
.get_pg_pool(m
->pool
);
14098 bufferlist reply_data
;
14100 // pool snaps vs unmanaged snaps are mutually exclusive
14102 case POOL_OP_CREATE_SNAP
:
14103 case POOL_OP_DELETE_SNAP
:
14104 if (pp
.is_unmanaged_snaps_mode()) {
14110 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14111 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14112 if (pp
.is_pool_snaps_mode()) {
// Second switch: apply the mutation to the projected pool 'pp'.
14119 case POOL_OP_CREATE_SNAP
:
14120 if (!pp
.snap_exists(m
->name
.c_str())) {
14121 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
14122 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
14123 << " seq " << pp
.get_snap_epoch() << dendl
;
14128 case POOL_OP_DELETE_SNAP
:
14130 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
14133 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
14139 case POOL_OP_CREATE_UNMANAGED_SNAP
:
// Pre-octopus OSDs need legacy snap-seq handling; flag passed through.
14141 uint64_t snapid
= pp
.add_unmanaged_snap(
14142 osdmap
.require_osd_release
< ceph_release_t::octopus
);
// New snapid is returned to the client in the reply payload.
14143 encode(snapid
, reply_data
);
14148 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14149 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
14150 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
// Can't delete a snapid that was never allocated.
14151 if (m
->snapid
> pp
.get_snap_seq()) {
14152 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
14155 pp
.remove_unmanaged_snap(
14157 osdmap
.require_osd_release
< ceph_release_t::octopus
);
14158 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
14159 // also record the new seq as purged: this avoids a discontinuity
14160 // after all of the snaps have been purged, since the seq assigned
14161 // during removal lives in the same namespace as the actual snaps.
14162 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
14167 case POOL_OP_AUID_CHANGE
:
// AUID support was removed; always rejected.
14168 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
// Commit the projected pool into the pending incremental.
14177 pp
.set_snap_epoch(pending_inc
.epoch
);
14178 pending_inc
.new_pools
[m
->pool
] = pp
;
// Reply (with any payload, e.g. the new snapid) once the map change commits.
14182 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
// Write path for POOL_OP_CREATE: delegate to prepare_new_pool() and reply
// with its result once the proposal commits.
14186 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
14188 op
->mark_osdmon_event(__func__
);
14189 int err
= prepare_new_pool(op
);
14190 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
// Policy checks guarding pool deletion. Writes a human-readable reason (or
// the success message) to *ss; callers use the return code to decide.
// Refuses when: the pool backs CephFS, it participates in cache tiering
// (either direction), mon_allow_pool_delete is false, or FLAG_NODELETE set.
// NOTE(review): garbled extraction — return statements between checks are
// missing from this text; confirm against upstream.
14194 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
14197 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
14199 // If the Pool is in use by CephFS, refuse to delete it
// Checked against the *pending* fsmap so an in-flight fs change counts.
14200 FSMap
const &pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14201 if (pending_fsmap
.pool_in_use(pool_id
)) {
14202 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
// Tiering: a pool that is a tier, or that has tiers, cannot be removed.
14206 if (pool
.tier_of
>= 0) {
14207 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
14208 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
14211 if (!pool
.tiers
.empty()) {
14212 *ss
<< "pool '" << poolstr
<< "' has tiers";
14213 for(auto tier
: pool
.tiers
) {
14214 *ss
<< " " << osdmap
.get_pool_name(tier
);
// Cluster-wide and per-pool safety switches.
14219 if (!g_conf()->mon_allow_pool_delete
) {
14220 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14224 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
14225 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
14229 *ss
<< "pool '" << poolstr
<< "' removed";
* Check if it is safe to add a tier to a base pool
14237 * True if the operation should proceed, false if we should abort here
14238 * (abort doesn't necessarily mean error, could be idempotency)
// NOTE(review): garbled extraction — the *err assignments and return
// statements between checks are missing from this text; confirm upstream.
14240 bool OSDMonitor::_check_become_tier(
14241 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
14242 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14246 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
14247 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
// A CephFS pool may not become a cache tier.
14249 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14250 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
14251 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
// Idempotency: the requested relationship already exists.
14256 if (base_pool
->tiers
.count(tier_pool_id
)) {
14257 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
14259 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
14260 << base_pool_name
<< "'";
// Only single-level tiering is supported: neither pool may already be in a
// tiering relationship in a conflicting role.
14264 if (base_pool
->is_tier()) {
14265 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
14266 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
14267 << "multiple tiers are not yet supported.";
14272 if (tier_pool
->has_tiers()) {
14273 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
14274 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
14275 it
!= tier_pool
->tiers
.end(); ++it
)
14276 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
14277 *ss
<< " multiple tiers are not yet supported.";
14282 if (tier_pool
->is_tier()) {
14283 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
14284 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
* Check if it is safe to remove a tier from this base pool
14298 * True if the operation should proceed, false if we should abort here
14299 * (abort doesn't necessarily mean error, could be idempotency)
// NOTE(review): garbled extraction — *err assignments / returns between
// checks are missing from this text; confirm against upstream.
14301 bool OSDMonitor::_check_remove_tier(
14302 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14303 const pg_pool_t
*tier_pool
,
14304 int *err
, ostream
*ss
) const
14306 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14308 // Apply CephFS-specific checks
14309 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14310 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
14311 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
14312 // If the underlying pool is erasure coded and does not allow EC
14313 // overwrites, we can't permit the removal of the replicated tier that
14314 // CephFS relies on to access it
14315 *ss
<< "pool '" << base_pool_name
<<
14316 "' does not allow EC overwrites and is in use by CephFS"
// A writeback tier in front of a CephFS pool must be flushed and its cache
// mode changed before removal, or dirty data could be stranded.
14322 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
14323 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
14324 "tier is still in use as a writeback cache. Change the cache "
14325 "mode and flush the cache before removing it";
// Queue removal of a pool in pending_inc and scrub every derived mapping
// that references it (pg_temp, primary_temp, pg_upmap, pg_upmap_items —
// both committed and pending — plus CRUSH choose_args).
// 'no_fake' bypasses the mon_fake_pool_delete rename-instead-of-delete path.
// NOTE(review): garbled extraction — early returns, loop increments, and
// some closing braces are missing from this text; confirm against upstream.
14335 int OSDMonitor::_prepare_remove_pool(
14336 int64_t pool
, ostream
*ss
, bool no_fake
)
14338 dout(10) << __func__
<< " " << pool
<< dendl
;
14339 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
// Validate against the committed pool state first...
14340 int r
= _check_remove_pool(pool
, *p
, ss
);
14344 auto new_pool
= pending_inc
.new_pools
.find(pool
);
14345 if (new_pool
!= pending_inc
.new_pools
.end()) {
14346 // if there is a problem with the pending info, wait and retry
// ...and against the pending copy if one exists.
14348 const auto& p
= new_pool
->second
;
14349 int r
= _check_remove_pool(pool
, p
, ss
);
// Idempotency: removal already queued in this proposal.
14354 if (pending_inc
.old_pools
.count(pool
)) {
14355 dout(10) << __func__
<< " " << pool
<< " already pending removal"
// Optional safety net: rename to "<name>.<id>.DELETED" instead of deleting.
14360 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
14361 string old_name
= osdmap
.get_pool_name(pool
);
14362 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
14363 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
14364 << old_name
<< " -> " << new_name
<< dendl
;
14365 pending_inc
.new_pool_names
[pool
] = new_name
;
// Real deletion path starts here.
14370 pending_inc
.old_pools
.insert(pool
);
14372 // remove any pg_temp mappings for this pool
14373 for (auto p
= osdmap
.pg_temp
->begin();
14374 p
!= osdmap
.pg_temp
->end();
14376 if (p
->first
.pool() == pool
) {
14377 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
14378 << p
->first
<< dendl
;
// An empty vector in new_pg_temp encodes "erase this mapping".
14379 pending_inc
.new_pg_temp
[p
->first
].clear();
14382 // remove any primary_temp mappings for this pool
14383 for (auto p
= osdmap
.primary_temp
->begin();
14384 p
!= osdmap
.primary_temp
->end();
14386 if (p
->first
.pool() == pool
) {
14387 dout(10) << __func__
<< " " << pool
14388 << " removing obsolete primary_temp" << p
->first
<< dendl
;
// -1 encodes "no primary_temp override".
14389 pending_inc
.new_primary_temp
[p
->first
] = -1;
14392 // remove any pg_upmap mappings for this pool
14393 for (auto& p
: osdmap
.pg_upmap
) {
14394 if (p
.first
.pool() == pool
) {
14395 dout(10) << __func__
<< " " << pool
14396 << " removing obsolete pg_upmap "
14397 << p
.first
<< dendl
;
14398 pending_inc
.old_pg_upmap
.insert(p
.first
);
14401 // remove any pending pg_upmap mappings for this pool
14403 auto it
= pending_inc
.new_pg_upmap
.begin();
14404 while (it
!= pending_inc
.new_pg_upmap
.end()) {
14405 if (it
->first
.pool() == pool
) {
14406 dout(10) << __func__
<< " " << pool
14407 << " removing pending pg_upmap "
14408 << it
->first
<< dendl
;
14409 it
= pending_inc
.new_pg_upmap
.erase(it
);
14415 // remove any pg_upmap_items mappings for this pool
14416 for (auto& p
: osdmap
.pg_upmap_items
) {
14417 if (p
.first
.pool() == pool
) {
14418 dout(10) << __func__
<< " " << pool
14419 << " removing obsolete pg_upmap_items " << p
.first
14421 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
14424 // remove any pending pg_upmap mappings for this pool
14426 auto it
= pending_inc
.new_pg_upmap_items
.begin();
14427 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
14428 if (it
->first
.pool() == pool
) {
14429 dout(10) << __func__
<< " " << pool
14430 << " removing pending pg_upmap_items "
14431 << it
->first
<< dendl
;
14432 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
14439 // remove any choose_args for this pool
14440 CrushWrapper newcrush
= _get_pending_crush();
14441 if (newcrush
.have_choose_args(pool
)) {
14442 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
14443 newcrush
.rm_choose_args(pool
);
// Re-encode the whole pending crush map with the quorum's feature set.
14444 pending_inc
.crush
.clear();
14445 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
// Queue a pool rename in pending_inc. Fails if the pool is pending removal
// or another pool is already being renamed to 'newname'.
// NOTE(review): garbled extraction — the error returns and loop increment
// are missing from this text; confirm against upstream.
14450 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
14452 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
14453 if (pending_inc
.old_pools
.count(pool
)) {
14454 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
// Reject a name collision with any *other* pending rename.
14457 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
14458 p
!= pending_inc
.new_pool_names
.end();
14460 if (p
->second
== newname
&& p
->first
!= pool
) {
14465 pending_inc
.new_pool_names
[pool
] = newname
;
// Write path for POOL_OP_DELETE: run _prepare_remove_pool and either retry
// the whole op after the current proposal (-EAGAIN) or schedule the reply
// for after commit.
14469 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
14471 op
->mark_osdmon_event(__func__
);
14472 auto m
= op
->get_req
<MPoolOp
>();
14474 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
14475 if (ret
== -EAGAIN
) {
// Pending state was inconsistent; requeue the message and try again later.
14476 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
14480 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
14481 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
14482 pending_inc
.epoch
));
// Send an MPoolOpReply for 'op' carrying result code, epoch, and optional
// payload 'blp' (e.g. an encoded snapid). Ownership of 'reply' passes to
// mon.send_reply().
14486 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
14487 int ret
, epoch_t epoch
, bufferlist
*blp
)
14489 op
->mark_osdmon_event(__func__
);
14490 auto m
= op
->get_req
<MPoolOp
>();
14491 dout(20) << "_pool_op_reply " << ret
<< dendl
;
14492 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
14493 ret
, epoch
, get_last_committed(), blp
);
14494 mon
.send_reply(op
, reply
);
// One-time migration: rescale any pool recovery_priority values that fall
// outside [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] proportionally
// into that range, writing adjusted pools into pending_inc.
// NOTE(review): garbled extraction — the max/min update statements, 'prio'
// declarations, and some braces/continues are missing from this text;
// confirm against upstream.
14497 void OSDMonitor::convert_pool_priorities(void)
14499 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
14500 int64_t max_prio
= 0;
14501 int64_t min_prio
= 0;
// Pass 1: find the extreme priorities currently set on any pool.
14502 for (const auto &i
: osdmap
.get_pools()) {
14503 const auto &pool
= i
.second
;
14505 if (pool
.opts
.is_set(key
)) {
14507 pool
.opts
.get(key
, &prio
);
14508 if (prio
> max_prio
)
14510 if (prio
< min_prio
)
// Everything already within the supported range: nothing to migrate.
14514 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
14515 dout(20) << __func__
<< " nothing to fix" << dendl
;
14518 // Current pool priorities exceeds new maximum
// Pass 2: rescale each pool's priority and stage it in pending_inc.
14519 for (const auto &i
: osdmap
.get_pools()) {
14520 const auto pool_id
= i
.first
;
14521 pg_pool_t pool
= i
.second
;
14524 pool
.opts
.get(key
, &prio
);
14527 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
14528 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14529 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
14530 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
14531 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14532 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
// A scaled value of 0 is expressed by unsetting the option entirely.
14537 pool
.opts
.unset(key
);
14539 pool
.opts
.set(key
, static_cast<int64_t>(n
));
14541 dout(10) << __func__
<< " pool " << pool_id
14542 << " recovery_priority adjusted "
14543 << prio
<< " to " << n
<< dendl
;
14544 pool
.last_change
= pending_inc
.epoch
;
14545 pending_inc
.new_pools
[pool_id
] = pool
;
// Validate that every pool can participate in stretch mode (replicated,
// default size/min_size unless already on the new rule) and stage pending
// copies of them; *okay/*errcode report the outcome to the caller.
// NOTE(review): garbled extraction — returns, the pools->insert call, and
// *okay assignments are missing from this text; confirm against upstream.
// NOTE(review): at orig 14558 the error message streams the numeric
// new_crush_rule_result rather than the rule *name*; the sibling
// try_enable_stretch_mode (orig 14622) streams new_crush_rule — this looks
// like a copy/paste bug worth fixing upstream.
14549 void OSDMonitor::try_enable_stretch_mode_pools(stringstream
& ss
, bool *okay
,
14551 set
<pg_pool_t
*>* pools
,
14552 const string
& new_crush_rule
)
14554 dout(20) << __func__
<< dendl
;
14556 int new_crush_rule_result
= osdmap
.crush
->get_rule_id(new_crush_rule
);
14557 if (new_crush_rule_result
< 0) {
14558 ss
<< "unrecognized crush rule " << new_crush_rule_result
;
14559 *errcode
= new_crush_rule_result
;
14562 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
// Check every pool in the committed map.
14563 for (const auto& pooli
: osdmap
.pools
) {
14564 int64_t poolid
= pooli
.first
;
14565 const pg_pool_t
*p
= &pooli
.second
;
14566 if (!p
->is_replicated()) {
14567 ss
<< "stretched pools must be replicated; '" << osdmap
.pool_name
[poolid
] << "' is erasure-coded";
14568 *errcode
= -EINVAL
;
// Pools must still have default size/min_size unless they already use the
// stretch rule (which implies they were configured for this already).
14571 uint8_t default_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
14572 if ((p
->get_size() != default_size
||
14573 (p
->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size
))) &&
14574 (p
->get_crush_rule() != new_rule
)) {
14575 ss
<< "we currently require stretch mode pools start out with the"
14576 " default size/min_size, which '" << osdmap
.pool_name
[poolid
] << "' does not";
14577 *errcode
= -EINVAL
;
14580 pg_pool_t
*pp
= pending_inc
.get_new_pool(poolid
, p
);
14581 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14582 // the attempt may fail and then we have these pool updates...but they won't do anything
14583 // if there is a failure, so if it's hard to change the interface, no need to bother
// Validate and (when 'commit' is true) actually enable stretch mode:
// checks the dividing bucket type exists, exactly 2 subtrees of that type
// with equal weights, a valid crush rule, and a 2-site request; then stamps
// the stretch parameters onto the given pools and onto pending_inc.
// The ceph_assert(!commit || ...) pattern means: in the dry-run pass a
// failure just reports, but a failure during commit is a logic error.
// NOTE(review): garbled extraction — early returns and *okay assignments
// are missing from this text; confirm against upstream.
14590 void OSDMonitor::try_enable_stretch_mode(stringstream
& ss
, bool *okay
,
14591 int *errcode
, bool commit
,
14592 const string
& dividing_bucket
,
14593 uint32_t bucket_count
,
14594 const set
<pg_pool_t
*>& pools
,
14595 const string
& new_crush_rule
)
14597 dout(20) << __func__
<< dendl
;
14599 CrushWrapper crush
= _get_pending_crush();
14600 int dividing_id
= -1;
// Resolve the bucket type name (e.g. "datacenter") to its type id.
14601 if (auto type_id
= crush
.get_validated_type_id(dividing_bucket
);
14602 !type_id
.has_value()) {
14603 ss
<< dividing_bucket
<< " is not a valid crush bucket type";
14604 *errcode
= -ENOENT
;
14605 ceph_assert(!commit
);
14608 dividing_id
= *type_id
;
// Stretch mode currently requires exactly two subtrees of the dividing type.
14610 vector
<int> subtrees
;
14611 crush
.get_subtree_of_type(dividing_id
, &subtrees
);
14612 if (subtrees
.size() != 2) {
14613 ss
<< "there are " << subtrees
.size() << dividing_bucket
14614 << "'s in the cluster but stretch mode currently only works with 2!";
14615 *errcode
= -EINVAL
;
14616 ceph_assert(!commit
|| subtrees
.size() == 2);
14620 int new_crush_rule_result
= crush
.get_rule_id(new_crush_rule
);
14621 if (new_crush_rule_result
< 0) {
14622 ss
<< "unrecognized crush rule " << new_crush_rule
;
14623 *errcode
= new_crush_rule_result
;
14624 ceph_assert(!commit
|| (new_crush_rule_result
> 0));
14627 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
// Both sites must currently weigh the same.
14629 int weight1
= crush
.get_item_weight(subtrees
[0]);
14630 int weight2
= crush
.get_item_weight(subtrees
[1]);
14631 if (weight1
!= weight2
) {
14632 // TODO: I'm really not sure this is a good idea?
14633 ss
<< "the 2 " << dividing_bucket
14634 << "instances in the cluster have differing weights "
14635 << weight1
<< " and " << weight2
14636 <<" but stretch mode currently requires they be the same!";
14637 *errcode
= -EINVAL
;
14638 ceph_assert(!commit
|| (weight1
== weight2
));
14641 if (bucket_count
!= 2) {
14642 ss
<< "currently we only support 2-site stretch clusters!";
14643 *errcode
= -EINVAL
;
14644 ceph_assert(!commit
|| bucket_count
== 2);
14647 // TODO: check CRUSH rules for pools so that we are appropriately divided
// Commit: stamp stretch peering parameters onto each staged pool...
14649 for (auto pool
: pools
) {
14650 pool
->crush_rule
= new_rule
;
14651 pool
->peering_crush_bucket_count
= bucket_count
;
14652 pool
->peering_crush_bucket_target
= bucket_count
;
14653 pool
->peering_crush_bucket_barrier
= dividing_id
;
14654 pool
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14655 pool
->size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
14656 pool
->min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
// ...and flip the map-level stretch flags in the pending incremental.
14658 pending_inc
.change_stretch_mode
= true;
14659 pending_inc
.stretch_mode_enabled
= true;
14660 pending_inc
.new_stretch_bucket_count
= bucket_count
;
14661 pending_inc
.new_degraded_stretch_mode
= 0;
14662 pending_inc
.new_stretch_mode_bucket
= dividing_id
;
// For each CRUSH bucket reported dead (bucket name -> set of mon names),
// confirm the entire OSD subtree under it is down. Confirmed bucket ids go
// to *really_down_buckets and their mons to *really_down_mons.
// Returns true iff at least one bucket was confirmed down.
14668 bool OSDMonitor::check_for_dead_crush_zones(const map
<string
,set
<string
>>& dead_buckets
,
14669 set
<int> *really_down_buckets
,
14670 set
<string
> *really_down_mons
)
14672 dout(20) << __func__
<< " with dead mon zones " << dead_buckets
<< dendl
;
14673 ceph_assert(is_readable());
14674 if (dead_buckets
.empty()) return false;
// down_cache memoizes subtree lookups across iterations.
14675 set
<int> down_cache
;
14676 bool really_down
= false;
14677 for (auto dbi
: dead_buckets
) {
14678 const string
& bucket_name
= dbi
.first
;
14679 ceph_assert(osdmap
.crush
->name_exists(bucket_name
));
14680 int bucket_id
= osdmap
.crush
->get_item_id(bucket_name
);
14681 dout(20) << "Checking " << bucket_name
<< " id " << bucket_id
14682 << " to see if OSDs are also down" << dendl
;
14683 bool subtree_down
= osdmap
.subtree_is_down(bucket_id
, &down_cache
);
14684 if (subtree_down
) {
14685 dout(20) << "subtree is down!" << dendl
;
14686 really_down
= true;
14687 really_down_buckets
->insert(bucket_id
);
14688 really_down_mons
->insert(dbi
.second
.begin(), dbi
.second
.end());
14691 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14692 << " and mons " << *really_down_mons
<< " are really down" << dendl
;
14693 return really_down
;
// Enter degraded stretch mode after one site dies: record the reduced site
// count in pending_inc, pin peering to the surviving site, and halve each
// stretch pool's min_size. Assumes a 2-site cluster with exactly one
// surviving zone (asserted below).
14696 void OSDMonitor::trigger_degraded_stretch_mode(const set
<int>& dead_buckets
,
14697 const set
<string
>& live_zones
)
14699 dout(20) << __func__
<< dendl
;
14700 stretch_recovery_triggered
.set_from_double(0); // reset this; we can't go clean now!
14701 // update the general OSDMap changes
14702 pending_inc
.change_stretch_mode
= true;
14703 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14704 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14705 int new_site_count
= osdmap
.stretch_bucket_count
- dead_buckets
.size();
14706 ceph_assert(new_site_count
== 1); // stretch count 2!
14707 pending_inc
.new_degraded_stretch_mode
= new_site_count
;
14708 pending_inc
.new_recovering_stretch_mode
= 0;
14709 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14711 // and then apply them to all the pg_pool_ts
14712 ceph_assert(live_zones
.size() == 1); // only support 2 zones now
14713 const string
& remaining_site_name
= *(live_zones
.begin());
14714 ceph_assert(osdmap
.crush
->name_exists(remaining_site_name
));
14715 int remaining_site
= osdmap
.crush
->get_item_id(remaining_site_name
);
// Only pools participating in stretch mode (nonzero bucket count) change.
14716 for (auto pgi
: osdmap
.pools
) {
14717 if (pgi
.second
.peering_crush_bucket_count
) {
14718 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14719 newp
.peering_crush_bucket_count
= new_site_count
;
14720 newp
.peering_crush_mandatory_member
= remaining_site
;
14721 newp
.min_size
= pgi
.second
.min_size
/ 2; // only support 2 zones now
// Force clients to resend in-flight ops against the new map.
14722 newp
.set_last_force_op_resend(pending_inc
.epoch
);
// Transition from degraded to *recovering* stretch mode once the dead site
// returns: keep the degraded parameters but set recovering=1, and force an
// op resend on every stretch pool.
14728 void OSDMonitor::trigger_recovery_stretch_mode()
14730 dout(20) << __func__
<< dendl
;
14731 stretch_recovery_triggered
.set_from_double(0); // reset this so we don't go full-active prematurely
14732 pending_inc
.change_stretch_mode
= true;
14733 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14734 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14735 pending_inc
.new_degraded_stretch_mode
= osdmap
.degraded_stretch_mode
;
14736 pending_inc
.new_recovering_stretch_mode
= 1;
14737 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14739 for (auto pgi
: osdmap
.pools
) {
14740 if (pgi
.second
.peering_crush_bucket_count
) {
14741 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14742 newp
.set_last_force_op_resend(pending_inc
.epoch
);
// Degraded stretch mode is now committed; clear the recovery trigger stamp.
14748 void OSDMonitor::set_degraded_stretch_mode()
14750 stretch_recovery_triggered
.set_from_double(0);
// Record when recovery stretch mode began (first call only) so
// try_end_recovery_stretch_mode can enforce a minimum wait.
14753 void OSDMonitor::set_recovery_stretch_mode()
14755 if (stretch_recovery_triggered
.is_zero()) {
14756 stretch_recovery_triggered
= ceph_clock_now();
// Healthy stretch mode is now committed; clear the recovery trigger stamp.
14760 void OSDMonitor::set_healthy_stretch_mode()
14762 stretch_recovery_triggered
.set_from_double(0);
// Called when a fresh PG digest arrives; if a recovery is in progress
// (trigger stamp set), re-evaluate whether we can leave recovery mode.
14765 void OSDMonitor::notify_new_pg_digest()
14767 dout(20) << __func__
<< dendl
;
14768 if (!stretch_recovery_triggered
.is_zero()) {
14769 try_end_recovery_stretch_mode(false);
// Completion context that re-invokes try_end_recovery_stretch_mode(force)
// once the monitor (or mgr stat monitor) becomes readable again.
struct CMonExitRecovery
: public Context
{
14776 CMonExitRecovery(OSDMonitor
*mon
, bool f
) : m(mon
), force(f
) {}
14777 void finish(int r
) {
14778 m
->try_end_recovery_stretch_mode(force
);
// Attempt to leave recovering stretch mode. Only the leader, and only while
// degraded+recovering; defers (via CMonExitRecovery) when local or mgrstat
// state isn't readable yet. Exits when forced, or when the minimum wait has
// elapsed and the cluster reports no degraded/inactive/unknown PGs.
// NOTE(review): at orig 14804 this text reads "°raded" — almost certainly a
// mojibake of "&degraded" introduced by extraction; restore before compiling.
14782 void OSDMonitor::try_end_recovery_stretch_mode(bool force
)
14784 dout(20) << __func__
<< dendl
;
14785 if (!mon
.is_leader()) return;
14786 if (!mon
.is_degraded_stretch_mode()) return;
14787 if (!mon
.is_recovering_stretch_mode()) return;
14788 if (!is_readable()) {
14789 wait_for_readable_ctx(new CMonExitRecovery(this, force
));
// Require the minimum recovery wait to have elapsed (or 'force').
14793 if (osdmap
.recovering_stretch_mode
&&
14794 ((!stretch_recovery_triggered
.is_zero() &&
14795 ceph_clock_now() - g_conf().get_val
<double>("mon_stretch_recovery_min_wait") >
14796 stretch_recovery_triggered
) ||
14798 if (!mon
.mgrstatmon()->is_readable()) {
14799 mon
.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force
));
14802 const PGMapDigest
& pgd
= mon
.mgrstatmon()->get_digest();
14803 double misplaced
, degraded
, inactive
, unknown
;
14804 pgd
.get_recovery_stats(&misplaced
, °raded
, &inactive
, &unknown
);
14805 if (force
|| (degraded
== 0.0 && inactive
== 0.0 && unknown
== 0.0)) {
14806 // we can exit degraded stretch mode!
14807 mon
.trigger_healthy_stretch_mode();
14812 void OSDMonitor::trigger_healthy_stretch_mode()
14814 ceph_assert(is_writeable());
14815 stretch_recovery_triggered
.set_from_double(0);
14816 pending_inc
.change_stretch_mode
= true;
14817 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14818 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14819 pending_inc
.new_degraded_stretch_mode
= 0; // turn off degraded mode...
14820 pending_inc
.new_recovering_stretch_mode
= 0; //...and recovering mode!
14821 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14822 for (auto pgi
: osdmap
.pools
) {
14823 if (pgi
.second
.peering_crush_bucket_count
) {
14824 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14825 newp
.peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
14826 newp
.peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14827 newp
.min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
14828 newp
.set_last_force_op_resend(pending_inc
.epoch
);