1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
70 #include "common/config.h"
71 #include "common/errno.h"
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
90 #include "json_spirit/json_spirit_reader.h"
92 #include <boost/algorithm/string/predicate.hpp>
99 using std::ostringstream
;
103 using std::stringstream
;
104 using std::to_string
;
107 using ceph::bufferlist
;
110 using ceph::ErasureCodeInterfaceRef
;
111 using ceph::ErasureCodePluginRegistry
;
112 using ceph::ErasureCodeProfile
;
113 using ceph::Formatter
;
114 using ceph::JSONFormatter
;
115 using ceph::make_message
;
117 #define dout_subsys ceph_subsys_mon
118 static const string
OSD_PG_CREATING_PREFIX("osd_pg_creating");
119 static const string
OSD_METADATA_PREFIX("osd_metadata");
120 static const string
OSD_SNAP_PREFIX("osd_snap");
/*

  OSD snapshot metadata
  ---------------------

  -- starting with mimic, removed in octopus --

  "removed_epoch_%llu_%08lx" % (pool, epoch)
  -> interval_set<snapid_t>

  "removed_snap_%llu_%016llx" % (pool, last_snap)
  -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)

  -- starting with mimic --

  "purged_snap_%llu_%016llx" % (pool, last_snap)
  -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)

  - note that the {removed,purged}_snap put the last snap in the key so
    that we can use forward iteration only to search for an epoch in an
    interval. e.g., to test if epoch N is removed/purged, we'll find a key
    >= N that either does or doesn't contain the given snap.

  -- starting with octopus --

  "purged_epoch_%08lx" % epoch
  -> map<int64_t,interval_set<snapid_t>>

*/
153 using namespace TOPNSPC::common
;
156 struct OSDMemCache
: public PriorityCache::PriCache
{
158 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
159 int64_t committed_bytes
= 0;
160 double cache_ratio
= 0;
162 OSDMemCache(OSDMonitor
*m
) : osdmon(m
) {};
164 virtual uint64_t _get_used_bytes() const = 0;
166 virtual int64_t request_cache_bytes(
167 PriorityCache::Priority pri
, uint64_t total_cache
) const {
168 int64_t assigned
= get_cache_bytes(pri
);
171 // All cache items are currently set to have PRI1 priority
172 case PriorityCache::Priority::PRI1
:
174 int64_t request
= _get_used_bytes();
175 return (request
> assigned
) ? request
- assigned
: 0;
183 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
184 return cache_bytes
[pri
];
187 virtual int64_t get_cache_bytes() const {
190 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
191 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
192 total
+= get_cache_bytes(pri
);
197 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
198 cache_bytes
[pri
] = bytes
;
200 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
201 cache_bytes
[pri
] += bytes
;
203 virtual int64_t commit_cache_size(uint64_t total_cache
) {
204 committed_bytes
= PriorityCache::get_chunk(
205 get_cache_bytes(), total_cache
);
206 return committed_bytes
;
208 virtual int64_t get_committed_size() const {
209 return committed_bytes
;
211 virtual double get_cache_ratio() const {
214 virtual void set_cache_ratio(double ratio
) {
217 virtual void shift_bins() {
219 virtual void import_bins(const std::vector
<uint64_t> &bins
) {
221 virtual void set_bins(PriorityCache::Priority pri
, uint64_t end_bin
) {
223 virtual uint64_t get_bins(PriorityCache::Priority pri
) const {
227 virtual string
get_cache_name() const = 0;
230 struct IncCache
: public OSDMemCache
{
231 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
233 virtual uint64_t _get_used_bytes() const {
234 return osdmon
->inc_osd_cache
.get_bytes();
237 virtual string
get_cache_name() const {
238 return "OSDMap Inc Cache";
241 uint64_t _get_num_osdmaps() const {
242 return osdmon
->inc_osd_cache
.get_size();
246 struct FullCache
: public OSDMemCache
{
247 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
249 virtual uint64_t _get_used_bytes() const {
250 return osdmon
->full_osd_cache
.get_bytes();
253 virtual string
get_cache_name() const {
254 return "OSDMap Full Cache";
257 uint64_t _get_num_osdmaps() const {
258 return osdmon
->full_osd_cache
.get_size();
262 std::shared_ptr
<IncCache
> inc_cache
;
263 std::shared_ptr
<FullCache
> full_cache
;
265 const uint32_t MAX_POOL_APPLICATIONS
= 4;
266 const uint32_t MAX_POOL_APPLICATION_KEYS
= 64;
267 const uint32_t MAX_POOL_APPLICATION_LENGTH
= 128;
269 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
270 // Note: this doesn't include support for the application tag match
271 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
272 auto& match
= grant
.match
;
273 if (match
.is_match_all()) {
275 } else if (pool_name
!= nullptr &&
276 !match
.pool_namespace
.pool_name
.empty() &&
277 match
.pool_namespace
.pool_name
== *pool_name
) {
284 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
285 const KeyServer
& key_server
,
286 const EntityName
& entity_name
,
287 const MonCap
& mon_caps
,
288 const entity_addr_t
& peer_socket_addr
,
289 const std::string
* pool_name
)
291 typedef std::map
<std::string
, std::string
> CommandArgs
;
293 if (mon_caps
.is_capable(
294 cct
, entity_name
, "osd",
295 "osd pool op unmanaged-snap",
296 (pool_name
== nullptr ?
297 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
298 CommandArgs
{{"poolname", *pool_name
}}),
304 AuthCapsInfo caps_info
;
305 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
307 dout(10) << "unable to locate OSD cap data for " << entity_name
308 << " in auth db" << dendl
;
313 if (caps_info
.caps
.length() > 0) {
314 auto p
= caps_info
.caps
.cbegin();
317 } catch (const ceph::buffer::error
&err
) {
318 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
325 if (!osd_cap
.parse(caps_str
, nullptr)) {
326 dout(10) << "unable to parse OSD cap data for " << entity_name
327 << " in auth db" << dendl
;
331 // if the entity has write permissions in one or all pools, permit
332 // usage of unmanaged-snapshots
333 if (osd_cap
.allow_all()) {
337 for (auto& grant
: osd_cap
.grants
) {
338 if (grant
.profile
.is_valid()) {
339 for (auto& profile_grant
: grant
.profile_grants
) {
340 if (is_osd_writable(profile_grant
, pool_name
)) {
344 } else if (is_osd_writable(grant
, pool_name
)) {
352 } // anonymous namespace
354 void LastEpochClean::Lec::report(unsigned pg_num
, ps_t ps
,
355 epoch_t last_epoch_clean
)
361 epoch_by_pg
.resize(pg_num
, 0);
362 const auto old_lec
= epoch_by_pg
[ps
];
363 if (old_lec
>= last_epoch_clean
) {
367 epoch_by_pg
[ps
] = last_epoch_clean
;
368 if (last_epoch_clean
< floor
) {
369 floor
= last_epoch_clean
;
370 } else if (last_epoch_clean
> floor
) {
371 if (old_lec
== floor
) {
372 // probably should increase floor?
373 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
374 std::end(epoch_by_pg
));
378 if (ps
!= next_missing
) {
381 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
382 if (epoch_by_pg
[next_missing
] == 0) {
388 void LastEpochClean::remove_pool(uint64_t pool
)
390 report_by_pool
.erase(pool
);
393 void LastEpochClean::report(unsigned pg_num
, const pg_t
& pg
,
394 epoch_t last_epoch_clean
)
396 auto& lec
= report_by_pool
[pg
.pool()];
397 return lec
.report(pg_num
, pg
.ps(), last_epoch_clean
);
400 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
402 auto floor
= latest
.get_epoch();
403 for (auto& pool
: latest
.get_pools()) {
404 auto reported
= report_by_pool
.find(pool
.first
);
405 if (reported
== report_by_pool
.end()) {
408 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
411 if (reported
->second
.floor
< floor
) {
412 floor
= reported
->second
.floor
;
418 void LastEpochClean::dump(Formatter
*f
) const
420 f
->open_array_section("per_pool");
422 for (auto& [pool
, lec
] : report_by_pool
) {
423 f
->open_object_section("pool");
424 f
->dump_unsigned("poolid", pool
);
425 f
->dump_unsigned("floor", lec
.floor
);
432 class C_UpdateCreatingPGs
: public Context
{
437 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
438 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
439 void finish(int r
) override
{
441 utime_t end
= ceph_clock_now();
442 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
443 << (end
- start
) << " seconds" << dendl
;
444 osdmon
->update_creating_pgs();
445 osdmon
->check_pg_creates_subs();
451 #define dout_prefix _prefix(_dout, mon, osdmap)
452 static ostream
& _prefix(std::ostream
*_dout
, Monitor
&mon
, const OSDMap
& osdmap
) {
453 return *_dout
<< "mon." << mon
.name
<< "@" << mon
.rank
454 << "(" << mon
.get_state_name()
455 << ").osd e" << osdmap
.get_epoch() << " ";
458 OSDMonitor::OSDMonitor(
462 const string
& service_name
)
463 : PaxosService(mn
, p
, service_name
),
465 inc_osd_cache(g_conf()->mon_osd_cache_size
),
466 full_osd_cache(g_conf()->mon_osd_cache_size
),
467 has_osdmap_manifest(false),
468 mapper(mn
.cct
, &mn
.cpu_tp
)
470 inc_cache
= std::make_shared
<IncCache
>(this);
471 full_cache
= std::make_shared
<FullCache
>(this);
472 cct
->_conf
.add_observer(this);
473 int r
= _set_cache_sizes();
475 derr
<< __func__
<< " using default osd cache size - mon_osd_cache_size ("
476 << g_conf()->mon_osd_cache_size
477 << ") without priority cache management"
482 const char **OSDMonitor::get_tracked_conf_keys() const
484 static const char* KEYS
[] = {
486 "mon_memory_autotune",
487 "rocksdb_cache_size",
493 void OSDMonitor::handle_conf_change(const ConfigProxy
& conf
,
494 const std::set
<std::string
> &changed
)
496 dout(10) << __func__
<< " " << changed
<< dendl
;
498 if (changed
.count("mon_memory_autotune")) {
499 _set_cache_autotuning();
501 if (changed
.count("mon_memory_target") ||
502 changed
.count("rocksdb_cache_size")) {
503 int r
= _update_mon_cache_settings();
505 derr
<< __func__
<< " mon_memory_target:"
506 << g_conf()->mon_memory_target
507 << " rocksdb_cache_size:"
508 << g_conf()->rocksdb_cache_size
509 << ". Unable to update cache size."
515 void OSDMonitor::_set_cache_autotuning()
517 if (!g_conf()->mon_memory_autotune
&& pcm
!= nullptr) {
518 // Disable cache autotuning
519 std::lock_guard
l(balancer_lock
);
523 if (g_conf()->mon_memory_autotune
&& pcm
== nullptr) {
524 int r
= register_cache_with_pcm();
527 << " Error while registering osdmon caches with pcm."
528 << " Cache auto tuning not enabled."
530 mon_memory_autotune
= false;
532 mon_memory_autotune
= true;
537 int OSDMonitor::_update_mon_cache_settings()
539 if (g_conf()->mon_memory_target
<= 0 ||
540 g_conf()->mon_memory_target
< mon_memory_min
||
541 g_conf()->rocksdb_cache_size
<= 0) {
545 if (pcm
== nullptr && rocksdb_binned_kv_cache
== nullptr) {
546 derr
<< __func__
<< " not using pcm and rocksdb" << dendl
;
550 uint64_t old_mon_memory_target
= mon_memory_target
;
551 uint64_t old_rocksdb_cache_size
= rocksdb_cache_size
;
553 // Set the new pcm memory cache sizes
554 mon_memory_target
= g_conf()->mon_memory_target
;
555 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
557 uint64_t base
= mon_memory_base
;
558 double fragmentation
= mon_memory_fragmentation
;
559 uint64_t target
= mon_memory_target
;
560 uint64_t min
= mon_memory_min
;
563 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
564 if (ltarget
> base
+ min
) {
565 max
= ltarget
- base
;
568 int r
= _set_cache_ratios();
570 derr
<< __func__
<< " Cache ratios for pcm could not be set."
571 << " Review the kv (rocksdb) and mon_memory_target sizes."
573 mon_memory_target
= old_mon_memory_target
;
574 rocksdb_cache_size
= old_rocksdb_cache_size
;
578 if (mon_memory_autotune
&& pcm
!= nullptr) {
579 std::lock_guard
l(balancer_lock
);
580 // set pcm cache levels
581 pcm
->set_target_memory(target
);
582 pcm
->set_min_memory(min
);
583 pcm
->set_max_memory(max
);
584 // tune memory based on new values
587 _set_new_cache_sizes();
588 dout(1) << __func__
<< " Updated mon cache setting."
589 << " target: " << target
597 int OSDMonitor::_set_cache_sizes()
599 if (g_conf()->mon_memory_autotune
) {
600 // set the new osdmon cache targets to be managed by pcm
601 mon_osd_cache_size
= g_conf()->mon_osd_cache_size
;
602 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
603 mon_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
604 mon_memory_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
605 mon_memory_target
= g_conf()->mon_memory_target
;
606 mon_memory_min
= g_conf()->mon_osd_cache_size_min
;
607 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
608 derr
<< __func__
<< " mon_memory_target:" << mon_memory_target
609 << " mon_memory_min:" << mon_memory_min
610 << ". Invalid size option(s) provided."
614 // Set the initial inc and full LRU cache sizes
615 inc_osd_cache
.set_bytes(mon_memory_min
);
616 full_osd_cache
.set_bytes(mon_memory_min
);
617 mon_memory_autotune
= g_conf()->mon_memory_autotune
;
622 bool OSDMonitor::_have_pending_crush()
624 return pending_inc
.crush
.length() > 0;
627 CrushWrapper
&OSDMonitor::_get_stable_crush()
629 return *osdmap
.crush
;
632 CrushWrapper
OSDMonitor::_get_pending_crush()
635 if (pending_inc
.crush
.length())
636 bl
= pending_inc
.crush
;
638 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
640 auto p
= bl
.cbegin();
646 void OSDMonitor::create_initial()
648 dout(10) << "create_initial for " << mon
.monmap
->fsid
<< dendl
;
653 mon
.store
->get("mkfs", "osdmap", bl
);
657 newmap
.set_fsid(mon
.monmap
->fsid
);
659 newmap
.build_simple(cct
, 0, mon
.monmap
->fsid
, 0);
662 newmap
.created
= newmap
.modified
= ceph_clock_now();
664 // new clusters should sort bitwise by default.
665 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
668 CEPH_OSDMAP_RECOVERY_DELETES
|
669 CEPH_OSDMAP_PURGED_SNAPDIRS
|
670 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
671 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
672 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
673 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
674 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
675 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
676 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
678 // new cluster should require latest by default
679 if (g_conf().get_val
<bool>("mon_debug_no_require_quincy")) {
680 if (g_conf().get_val
<bool>("mon_debug_no_require_pacific")) {
681 derr
<< __func__
<< " mon_debug_no_require_quincy and pacific=true" << dendl
;
682 newmap
.require_osd_release
= ceph_release_t::nautilus
;
684 derr
<< __func__
<< " mon_debug_no_require_quincy=true" << dendl
;
685 newmap
.require_osd_release
= ceph_release_t::pacific
;
688 newmap
.require_osd_release
= ceph_release_t::quincy
;
691 ceph_release_t r
= ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client
);
693 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
695 newmap
.require_min_compat_client
= r
;
697 // encode into pending incremental
698 uint64_t features
= newmap
.get_encoding_features();
699 newmap
.encode(pending_inc
.fullmap
,
700 features
| CEPH_FEATURE_RESERVED
);
701 pending_inc
.full_crc
= newmap
.get_crc();
702 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
705 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
707 s
.insert(service_name
);
708 s
.insert(OSD_PG_CREATING_PREFIX
);
709 s
.insert(OSD_METADATA_PREFIX
);
710 s
.insert(OSD_SNAP_PREFIX
);
713 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
715 // we really don't care if the version has been updated, because we may
716 // have trimmed without having increased the last committed; yet, we may
717 // need to update the in-memory manifest.
718 load_osdmap_manifest();
720 version_t version
= get_last_committed();
721 if (version
== osdmap
.epoch
)
723 ceph_assert(version
> osdmap
.epoch
);
725 dout(15) << "update_from_paxos paxos e " << version
726 << ", my e " << osdmap
.epoch
<< dendl
;
728 int prev_num_up_osd
= osdmap
.num_up_osd
;
731 if (!mapping_job
->is_done()) {
732 dout(1) << __func__
<< " mapping job "
733 << mapping_job
.get() << " did not complete, "
734 << mapping_job
->shards
<< " left, canceling" << dendl
;
735 mapping_job
->abort();
743 * We will possibly have a stashed latest that *we* wrote, and we will
744 * always be sure to have the oldest full map in the first..last range
745 * due to encode_trim_extra(), which includes the oldest full map in the trim
748 * encode_trim_extra() does not however write the full map's
749 * version to 'full_latest'. This is only done when we are building the
750 * full maps from the incremental versions. But don't panic! We make sure
751 * that the following conditions find whichever full map version is newer.
753 version_t latest_full
= get_version_latest_full();
754 if (latest_full
== 0 && get_first_committed() > 1)
755 latest_full
= get_first_committed();
757 if (get_first_committed() > 1 &&
758 latest_full
< get_first_committed()) {
759 // the monitor could be just sync'ed with its peer, and the latest_full key
760 // is not encoded in the paxos commits in encode_pending(), so we need to
761 // make sure we get it pointing to a proper version.
762 version_t lc
= get_last_committed();
763 version_t fc
= get_first_committed();
765 dout(10) << __func__
<< " looking for valid full map in interval"
766 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
769 for (version_t v
= lc
; v
>= fc
; v
--) {
770 string full_key
= "full_" + stringify(v
);
771 if (mon
.store
->exists(get_service_name(), full_key
)) {
772 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
778 ceph_assert(latest_full
> 0);
779 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
780 put_version_latest_full(t
, latest_full
);
781 mon
.store
->apply_transaction(t
);
782 dout(10) << __func__
<< " updated the on-disk full map version to "
783 << latest_full
<< dendl
;
786 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
787 bufferlist latest_bl
;
788 get_version_full(latest_full
, latest_bl
);
789 ceph_assert(latest_bl
.length() != 0);
790 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
792 osdmap
.decode(latest_bl
);
796 if (!mon
.store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
797 auto p
= bl
.cbegin();
798 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
799 creating_pgs
.decode(p
);
800 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
801 << creating_pgs
.last_scan_epoch
802 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
804 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
808 // walk through incrementals
809 MonitorDBStore::TransactionRef t
;
811 while (version
> osdmap
.epoch
) {
813 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
814 ceph_assert(err
== 0);
815 ceph_assert(inc_bl
.length());
816 // set priority cache manager levels if the osdmap is
817 // being populated for the first time.
818 if (mon_memory_autotune
&& pcm
== nullptr) {
819 int r
= register_cache_with_pcm();
822 << " Error while registering osdmon caches with pcm."
823 << " Proceeding without cache auto tuning."
828 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
830 OSDMap::Incremental
inc(inc_bl
);
831 err
= osdmap
.apply_incremental(inc
);
832 ceph_assert(err
== 0);
835 t
.reset(new MonitorDBStore::Transaction
);
837 // Write out the full map for all past epochs. Encode the full
838 // map with the same features as the incremental. If we don't
839 // know, use the quorum features. If we don't know those either,
840 // encode with all features.
841 uint64_t f
= inc
.encode_features
;
843 f
= mon
.get_quorum_con_features();
847 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
848 tx_size
+= full_bl
.length();
850 bufferlist orig_full_bl
;
851 get_version_full(osdmap
.epoch
, orig_full_bl
);
852 if (orig_full_bl
.length()) {
853 // the primary provided the full map
854 ceph_assert(inc
.have_crc
);
855 if (inc
.full_crc
!= osdmap
.crc
) {
856 // This will happen if the mons were running mixed versions in
857 // the past or some other circumstance made the full encoded
858 // maps divergent. Reloading here will bring us back into
859 // sync with the primary for this and all future maps. OSDs
860 // will also be brought back into sync when they discover the
861 // crc mismatch and request a full map from a mon.
862 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
865 dout(20) << __func__
<< " my (bad) full osdmap:\n";
866 JSONFormatter
jf(true);
867 jf
.dump_object("osdmap", osdmap
);
869 *_dout
<< "\nhexdump:\n";
870 full_bl
.hexdump(*_dout
);
874 osdmap
.decode(orig_full_bl
);
876 dout(20) << __func__
<< " canonical full osdmap:\n";
877 JSONFormatter
jf(true);
878 jf
.dump_object("osdmap", osdmap
);
880 *_dout
<< "\nhexdump:\n";
881 orig_full_bl
.hexdump(*_dout
);
885 ceph_assert(!inc
.have_crc
);
886 put_version_full(t
, osdmap
.epoch
, full_bl
);
888 put_version_latest_full(t
, osdmap
.epoch
);
891 dout(1) << osdmap
<< dendl
;
893 if (osdmap
.epoch
== 1) {
894 t
->erase("mkfs", "osdmap");
897 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
898 mon
.store
->apply_transaction(t
);
899 t
= MonitorDBStore::TransactionRef();
902 for (const auto [osd
, state
] : inc
.new_state
) {
903 if (state
& CEPH_OSD_UP
) {
904 // could be marked up *or* down, but we're too lazy to check which
905 last_osd_report
.erase(osd
);
908 for (const auto [osd
, weight
] : inc
.new_weight
) {
909 if (weight
== CEPH_OSD_OUT
) {
910 // manually marked out, so drop it
911 osd_epochs
.erase(osd
);
917 mon
.store
->apply_transaction(t
);
920 bool marked_osd_down
= false;
921 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
922 if (osdmap
.is_out(o
))
924 auto found
= down_pending_out
.find(o
);
925 if (osdmap
.is_down(o
)) {
926 // populate down -> out map
927 if (found
== down_pending_out
.end()) {
928 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
929 down_pending_out
[o
] = ceph_clock_now();
930 marked_osd_down
= true;
933 if (found
!= down_pending_out
.end()) {
934 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
935 down_pending_out
.erase(found
);
939 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
942 check_pg_creates_subs();
944 share_map_with_random_osd();
948 // make sure our feature bits reflect the latest map
949 update_msgr_features();
951 if (!mon
.is_leader()) {
952 // will be called by on_active() on the leader, avoid doing so twice
955 if (osdmap
.stretch_mode_enabled
) {
956 dout(20) << "Stretch mode enabled in this map" << dendl
;
957 mon
.try_engage_stretch_mode();
958 if (osdmap
.degraded_stretch_mode
) {
959 dout(20) << "Degraded stretch mode set in this map" << dendl
;
960 if (!osdmap
.recovering_stretch_mode
) {
961 mon
.set_degraded_stretch_mode();
962 if (prev_num_up_osd
< osdmap
.num_up_osd
&&
963 (osdmap
.num_up_osd
/ (double)osdmap
.num_osd
) >
964 cct
->_conf
.get_val
<double>("mon_stretch_cluster_recovery_ratio")) {
965 // TODO: This works for 2-site clusters when the OSD maps are appropriately
966 // trimmed and everything is "normal" but not if you have a lot of out OSDs
967 // you're ignoring or in some really degenerate failure cases
968 dout(10) << "Enabling recovery stretch mode in this map" << dendl
;
969 mon
.go_recovery_stretch_mode();
972 mon
.set_recovery_stretch_mode();
975 mon
.set_healthy_stretch_mode();
977 if (marked_osd_down
&&
978 (!osdmap
.degraded_stretch_mode
|| osdmap
.recovering_stretch_mode
)) {
979 dout(20) << "Checking degraded stretch mode due to osd changes" << dendl
;
980 mon
.maybe_go_degraded_stretch_mode();
985 int OSDMonitor::register_cache_with_pcm()
987 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
988 derr
<< __func__
<< " Invalid memory size specified for mon caches."
989 << " Caches will not be auto-tuned."
993 uint64_t base
= mon_memory_base
;
994 double fragmentation
= mon_memory_fragmentation
;
995 // For calculating total target memory, consider rocksdb cache size.
996 uint64_t target
= mon_memory_target
;
997 uint64_t min
= mon_memory_min
;
1000 // Apply the same logic as in bluestore to set the max amount
1001 // of memory to use for cache. Assume base memory for OSDMaps
1002 // and then add in some overhead for fragmentation.
1003 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
1004 if (ltarget
> base
+ min
) {
1005 max
= ltarget
- base
;
1008 rocksdb_binned_kv_cache
= mon
.store
->get_priority_cache();
1009 if (!rocksdb_binned_kv_cache
) {
1010 derr
<< __func__
<< " not using rocksdb" << dendl
;
1014 int r
= _set_cache_ratios();
1016 derr
<< __func__
<< " Cache ratios for pcm could not be set."
1017 << " Review the kv (rocksdb) and mon_memory_target sizes."
1022 pcm
= std::make_shared
<PriorityCache::Manager
>(
1023 cct
, min
, max
, target
, true);
1024 pcm
->insert("kv", rocksdb_binned_kv_cache
, true);
1025 pcm
->insert("inc", inc_cache
, true);
1026 pcm
->insert("full", full_cache
, true);
1027 dout(1) << __func__
<< " pcm target: " << target
1028 << " pcm max: " << max
1029 << " pcm min: " << min
1030 << " inc_osd_cache size: " << inc_osd_cache
.get_size()
1035 int OSDMonitor::_set_cache_ratios()
1037 double old_cache_kv_ratio
= cache_kv_ratio
;
1039 // Set the cache ratios for kv(rocksdb), inc and full caches
1040 cache_kv_ratio
= (double)rocksdb_cache_size
/ (double)mon_memory_target
;
1041 if (cache_kv_ratio
>= 1.0) {
1042 derr
<< __func__
<< " Cache kv ratio (" << cache_kv_ratio
1043 << ") must be in range [0,<1.0]."
1045 cache_kv_ratio
= old_cache_kv_ratio
;
1048 rocksdb_binned_kv_cache
->set_cache_ratio(cache_kv_ratio
);
1049 cache_inc_ratio
= cache_full_ratio
= (1.0 - cache_kv_ratio
) / 2;
1050 inc_cache
->set_cache_ratio(cache_inc_ratio
);
1051 full_cache
->set_cache_ratio(cache_full_ratio
);
1053 dout(1) << __func__
<< " kv ratio " << cache_kv_ratio
1054 << " inc ratio " << cache_inc_ratio
1055 << " full ratio " << cache_full_ratio
1060 void OSDMonitor::start_mapping()
1062 // initiate mapping job
1064 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1066 mapping_job
->abort();
1068 if (!osdmap
.get_pools().empty()) {
1069 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
1070 mapping_job
= mapping
.start_update(osdmap
, mapper
,
1071 g_conf()->mon_osd_mapping_pgs_per_chunk
);
1072 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
1073 << " at " << fin
->start
<< dendl
;
1074 mapping_job
->set_finish_event(fin
);
1076 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
1077 mapping_job
= nullptr;
1081 void OSDMonitor::update_msgr_features()
1083 const int types
[] = {
1084 entity_name_t::TYPE_OSD
,
1085 entity_name_t::TYPE_CLIENT
,
1086 entity_name_t::TYPE_MDS
,
1087 entity_name_t::TYPE_MON
1089 for (int type
: types
) {
1091 uint64_t features
= osdmap
.get_features(type
, &mask
);
1092 if ((mon
.messenger
->get_policy(type
).features_required
& mask
) != features
) {
1093 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1094 ceph::net::Policy p
= mon
.messenger
->get_policy(type
);
1095 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1096 mon
.messenger
->set_policy(type
, p
);
1101 void OSDMonitor::on_active()
1105 if (mon
.is_leader()) {
1106 mon
.clog
->debug() << "osdmap " << osdmap
;
1107 if (!priority_convert
) {
1108 // Only do this once at start-up
1109 convert_pool_priorities();
1110 priority_convert
= true;
1113 list
<MonOpRequestRef
> ls
;
1114 take_all_failures(ls
);
1115 while (!ls
.empty()) {
1116 MonOpRequestRef op
= ls
.front();
1117 op
->mark_osdmon_event(__func__
);
1125 void OSDMonitor::on_restart()
1127 last_osd_report
.clear();
1130 void OSDMonitor::on_shutdown()
1132 dout(10) << __func__
<< dendl
;
1134 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1136 mapping_job
->abort();
1139 // discard failure info, waiters
1140 list
<MonOpRequestRef
> ls
;
1141 take_all_failures(ls
);
1145 void OSDMonitor::update_logger()
1147 dout(10) << "update_logger" << dendl
;
1149 mon
.cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1150 mon
.cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1151 mon
.cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1152 mon
.cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
1155 void OSDMonitor::create_pending()
1157 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
1158 pending_inc
.fsid
= mon
.monmap
->fsid
;
1159 pending_metadata
.clear();
1160 pending_metadata_rm
.clear();
1161 pending_pseudo_purged_snaps
.clear();
1163 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
1165 // safety checks (this shouldn't really happen)
1167 if (osdmap
.backfillfull_ratio
<= 0) {
1168 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
1169 if (pending_inc
.new_backfillfull_ratio
> 1.0)
1170 pending_inc
.new_backfillfull_ratio
/= 100;
1171 dout(1) << __func__
<< " setting backfillfull_ratio = "
1172 << pending_inc
.new_backfillfull_ratio
<< dendl
;
1174 if (osdmap
.full_ratio
<= 0) {
1175 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
1176 if (pending_inc
.new_full_ratio
> 1.0)
1177 pending_inc
.new_full_ratio
/= 100;
1178 dout(1) << __func__
<< " setting full_ratio = "
1179 << pending_inc
.new_full_ratio
<< dendl
;
1181 if (osdmap
.nearfull_ratio
<= 0) {
1182 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
1183 if (pending_inc
.new_nearfull_ratio
> 1.0)
1184 pending_inc
.new_nearfull_ratio
/= 100;
1185 dout(1) << __func__
<< " setting nearfull_ratio = "
1186 << pending_inc
.new_nearfull_ratio
<< dendl
;
1192 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1193 const OSDMap
& nextmap
)
1195 dout(10) << __func__
<< dendl
;
1196 creating_pgs_t pending_creatings
;
1198 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1199 pending_creatings
= creating_pgs
;
1201 // check for new or old pools
1202 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1203 unsigned queued
= 0;
1204 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1207 &pending_creatings
);
1208 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1211 &pending_creatings
);
1212 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1213 for (auto deleted_pool
: inc
.old_pools
) {
1214 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1215 dout(10) << __func__
<< " " << removed
1216 << " pg removed because containing pool deleted: "
1217 << deleted_pool
<< dendl
;
1218 last_epoch_clean
.remove_pool(deleted_pool
);
1220 // pgmon updates its creating_pgs in check_osd_map() which is called by
1221 // on_active() and check_osd_map() could be delayed if lease expires, so its
1222 // creating_pgs could be stale in comparison with the one of osdmon. let's
1223 // trim them here. otherwise, they will be added back after being erased.
1224 unsigned removed
= 0;
1225 for (auto& pg
: pending_created_pgs
) {
1226 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1227 pending_creatings
.created_pools
.insert(pg
.pool());
1228 removed
+= pending_creatings
.pgs
.erase(pg
);
1230 pending_created_pgs
.clear();
1231 dout(10) << __func__
<< " " << removed
1232 << " pgs removed because they're created" << dendl
;
1233 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1236 // filter out any pgs that shouldn't exist.
1238 auto i
= pending_creatings
.pgs
.begin();
1239 while (i
!= pending_creatings
.pgs
.end()) {
1240 if (!nextmap
.pg_exists(i
->first
)) {
1241 dout(10) << __func__
<< " removing pg " << i
->first
1242 << " which should not exist" << dendl
;
1243 i
= pending_creatings
.pgs
.erase(i
);
1251 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1252 const auto total
= pending_creatings
.pgs
.size();
1253 while (pending_creatings
.pgs
.size() < max
&&
1254 !pending_creatings
.queue
.empty()) {
1255 auto p
= pending_creatings
.queue
.begin();
1256 int64_t poolid
= p
->first
;
1257 dout(10) << __func__
<< " pool " << poolid
1258 << " created " << p
->second
.created
1259 << " modified " << p
->second
.modified
1260 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1262 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1263 p
->second
.end
- p
->second
.start
);
1264 ps_t first
= p
->second
.start
;
1265 ps_t end
= first
+ n
;
1266 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1267 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1268 // NOTE: use the *current* epoch as the PG creation epoch so that the
1269 // OSD does not have to generate a long set of PastIntervals.
1270 pending_creatings
.pgs
.emplace(
1272 creating_pgs_t::pg_create_info(inc
.epoch
,
1273 p
->second
.modified
));
1274 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1276 p
->second
.start
= end
;
1277 if (p
->second
.done()) {
1278 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1279 pending_creatings
.queue
.erase(p
);
1281 dout(10) << __func__
<< " pool " << poolid
1282 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1286 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1287 << " pools" << dendl
;
1289 if (mon
.monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1290 // walk creating pgs' history and past_intervals forward
1291 for (auto& i
: pending_creatings
.pgs
) {
1292 // this mirrors PG::start_peering_interval()
1293 pg_t pgid
= i
.first
;
1295 // this is a bit imprecise, but sufficient?
1296 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1297 const pg_pool_t
*pi
;
1298 bool operator()(const set
<pg_shard_t
> &have
) const {
1299 return have
.size() >= pi
->min_size
;
1301 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1302 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1304 vector
<int> up
, acting
;
1305 int up_primary
, acting_primary
;
1306 nextmap
.pg_to_up_acting_osds(
1307 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1308 if (i
.second
.history
.epoch_created
== 0) {
1309 // new pg entry, set it up
1311 i
.second
.acting
= acting
;
1312 i
.second
.up_primary
= up_primary
;
1313 i
.second
.acting_primary
= acting_primary
;
1314 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1315 i
.second
.create_stamp
);
1316 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1317 << " up " << i
.second
.up
1318 << " p " << i
.second
.up_primary
1319 << " acting " << i
.second
.acting
1320 << " p " << i
.second
.acting_primary
1321 << " history " << i
.second
.history
1322 << " past_intervals " << i
.second
.past_intervals
1325 std::stringstream debug
;
1326 if (PastIntervals::check_new_interval(
1327 i
.second
.acting_primary
, acting_primary
,
1328 i
.second
.acting
, acting
,
1329 i
.second
.up_primary
, up_primary
,
1331 i
.second
.history
.same_interval_since
,
1332 i
.second
.history
.last_epoch_clean
,
1337 &i
.second
.past_intervals
,
1339 epoch_t e
= inc
.epoch
;
1340 i
.second
.history
.same_interval_since
= e
;
1341 if (i
.second
.up
!= up
) {
1342 i
.second
.history
.same_up_since
= e
;
1344 if (i
.second
.acting_primary
!= acting_primary
) {
1345 i
.second
.history
.same_primary_since
= e
;
1348 osdmap
.get_pg_num(pgid
.pool()),
1349 nextmap
.get_pg_num(pgid
.pool()),
1351 i
.second
.history
.last_epoch_split
= e
;
1353 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1354 << " up " << i
.second
.up
<< " -> " << up
1355 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1356 << " acting " << i
.second
.acting
<< " -> " << acting
1357 << " p " << i
.second
.acting_primary
<< " -> "
1359 << " history " << i
.second
.history
1360 << " past_intervals " << i
.second
.past_intervals
1362 dout(20) << " debug: " << debug
.str() << dendl
;
1364 i
.second
.acting
= acting
;
1365 i
.second
.up_primary
= up_primary
;
1366 i
.second
.acting_primary
= acting_primary
;
1371 dout(10) << __func__
1372 << " " << (pending_creatings
.pgs
.size() - total
)
1373 << "/" << pending_creatings
.pgs
.size()
1374 << " pgs added from queued pools" << dendl
;
1375 return pending_creatings
;
1378 void OSDMonitor::maybe_prime_pg_temp()
1381 if (pending_inc
.crush
.length()) {
1382 dout(10) << __func__
<< " new crush map, all" << dendl
;
1386 if (!pending_inc
.new_up_client
.empty()) {
1387 dout(10) << __func__
<< " new up osds, all" << dendl
;
1391 // check for interesting OSDs
1393 for (auto p
= pending_inc
.new_state
.begin();
1394 !all
&& p
!= pending_inc
.new_state
.end();
1396 if ((p
->second
& CEPH_OSD_UP
) &&
1397 osdmap
.is_up(p
->first
)) {
1398 osds
.insert(p
->first
);
1401 for (auto p
= pending_inc
.new_weight
.begin();
1402 !all
&& p
!= pending_inc
.new_weight
.end();
1404 if (osdmap
.exists(p
->first
) && p
->second
< osdmap
.get_weight(p
->first
)) {
1406 osds
.insert(p
->first
);
1408 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1414 if (!all
&& osds
.empty())
1419 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1420 if (estimate
> mapping
.get_num_pgs() *
1421 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1422 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1423 << osds
.size() << " osds >= "
1424 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1425 << mapping
.get_num_pgs() << " pgs, all"
1429 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1430 << osds
.size() << " osds" << dendl
;
1435 next
.deepish_copy_from(osdmap
);
1436 next
.apply_incremental(pending_inc
);
1438 if (next
.get_pools().empty()) {
1439 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1441 PrimeTempJob
job(next
, this);
1442 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1443 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1444 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1446 dout(10) << __func__
<< " did not finish in "
1447 << g_conf()->mon_osd_prime_pg_temp_max_time
1448 << ", stopping" << dendl
;
1452 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1453 utime_t stop
= ceph_clock_now();
1454 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1455 const int chunk
= 1000;
1457 std::unordered_set
<pg_t
> did_pgs
;
1458 for (auto osd
: osds
) {
1459 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1460 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1461 for (auto pgid
: pgs
) {
1462 if (!did_pgs
.insert(pgid
).second
) {
1465 prime_pg_temp(next
, pgid
);
1468 if (ceph_clock_now() > stop
) {
1469 dout(10) << __func__
<< " consumed more than "
1470 << g_conf()->mon_osd_prime_pg_temp_max_time
1471 << " seconds, stopping"
1481 void OSDMonitor::prime_pg_temp(
1485 // TODO: remove this creating_pgs direct access?
1486 if (creating_pgs
.pgs
.count(pgid
)) {
1489 if (!osdmap
.pg_exists(pgid
)) {
1493 vector
<int> up
, acting
;
1494 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1496 vector
<int> next_up
, next_acting
;
1497 int next_up_primary
, next_acting_primary
;
1498 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1499 &next_acting
, &next_acting_primary
);
1500 if (acting
== next_acting
&&
1501 !(up
!= acting
&& next_up
== next_acting
))
1502 return; // no change since last epoch
1505 return; // if previously empty now we can be no worse off
1506 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1507 if (pool
&& acting
.size() < pool
->min_size
)
1508 return; // can be no worse off than before
1510 if (next_up
== next_acting
) {
1512 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1516 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1517 << " -> " << next_up
<< "/" << next_acting
1518 << ", priming " << acting
1521 std::lock_guard
l(prime_pg_temp_lock
);
1522 // do not touch a mapping if a change is pending
1523 pending_inc
.new_pg_temp
.emplace(
1525 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1530 * @note receiving a transaction in this function gives a fair amount of
1531 * freedom to the service implementation if it does need it. It shouldn't.
1533 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1535 dout(10) << "encode_pending e " << pending_inc
.epoch
1539 dout(1) << __func__
<< " osdmap full prune encoded e"
1540 << pending_inc
.epoch
<< dendl
;
1543 // finalize up pending_inc
1544 pending_inc
.modified
= ceph_clock_now();
1546 int r
= pending_inc
.propagate_base_properties_to_tiers(cct
, osdmap
);
1547 ceph_assert(r
== 0);
1550 if (!mapping_job
->is_done()) {
1551 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1552 << mapping_job
.get() << " did not complete, "
1553 << mapping_job
->shards
<< " left" << dendl
;
1554 mapping_job
->abort();
1555 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1556 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1557 << mapping_job
.get() << " is prior epoch "
1558 << mapping
.get_epoch() << dendl
;
1560 if (g_conf()->mon_osd_prime_pg_temp
) {
1561 maybe_prime_pg_temp();
1564 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1565 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1568 mapping_job
.reset();
1570 // ensure we don't have blank new_state updates. these are interrpeted as
1571 // CEPH_OSD_UP (and almost certainly not what we want!).
1572 auto p
= pending_inc
.new_state
.begin();
1573 while (p
!= pending_inc
.new_state
.end()) {
1574 if (p
->second
== 0) {
1575 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1576 p
= pending_inc
.new_state
.erase(p
);
1578 if (p
->second
& CEPH_OSD_UP
) {
1579 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1584 if (!pending_inc
.new_up_client
.empty()) {
1585 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1587 for (auto& i
: pending_inc
.new_weight
) {
1588 if (i
.first
>= osdmap
.max_osd
) {
1590 // new osd is already marked in
1591 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1594 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1595 // existing osd marked in or out
1596 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1603 tmp
.deepish_copy_from(osdmap
);
1604 tmp
.apply_incremental(pending_inc
);
1606 // clean pg_temp mappings
1607 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1609 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1611 // check every upmapped pg for now
1612 // until we could reliably identify certain cases to ignore,
1613 // which is obviously the hard part TBD..
1614 vector
<pg_t
> pgs_to_check
;
1615 tmp
.get_upmap_pgs(&pgs_to_check
);
1616 if (pgs_to_check
.size() <
1617 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1618 // not enough pgs, do it inline
1619 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1621 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1622 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1627 // update creating pgs first so that we can remove the created pgid and
1628 // process the pool flag removal below in the same osdmap epoch.
1629 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1630 bufferlist creatings_bl
;
1631 uint64_t features
= CEPH_FEATURES_ALL
;
1632 if (mon
.monmap
->min_mon_release
< ceph_release_t::octopus
) {
1633 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1635 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1637 encode(pending_creatings
, creatings_bl
, features
);
1638 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1640 // remove any old (or incompat) POOL_CREATING flags
1641 for (auto& i
: tmp
.get_pools()) {
1642 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1643 // pre-nautilus OSDMaps shouldn't get this flag.
1644 if (pending_inc
.new_pools
.count(i
.first
)) {
1645 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1648 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1649 !pending_creatings
.still_creating_pool(i
.first
)) {
1650 dout(10) << __func__
<< " done creating pool " << i
.first
1651 << ", clearing CREATING flag" << dendl
;
1652 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1653 pending_inc
.new_pools
[i
.first
] = i
.second
;
1655 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1659 // collect which pools are currently affected by
1660 // the near/backfill/full osd(s),
1661 // and set per-pool near/backfill/full flag instead
1662 set
<int64_t> full_pool_ids
;
1663 set
<int64_t> backfillfull_pool_ids
;
1664 set
<int64_t> nearfull_pool_ids
;
1665 tmp
.get_full_pools(cct
,
1667 &backfillfull_pool_ids
,
1668 &nearfull_pool_ids
);
1669 if (full_pool_ids
.empty() ||
1670 backfillfull_pool_ids
.empty() ||
1671 nearfull_pool_ids
.empty()) {
1672 // normal case - no nearfull, backfillfull or full osds
1673 // try cancel any improper nearfull/backfillfull/full pool
1675 for (auto &pool
: tmp
.get_pools()) {
1676 auto p
= pool
.first
;
1677 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1678 nearfull_pool_ids
.empty()) {
1679 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1680 << "'s nearfull flag" << dendl
;
1681 if (pending_inc
.new_pools
.count(p
) == 0) {
1682 // load original pool info first!
1683 pending_inc
.new_pools
[p
] = pool
.second
;
1685 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1687 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1688 backfillfull_pool_ids
.empty()) {
1689 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1690 << "'s backfillfull flag" << dendl
;
1691 if (pending_inc
.new_pools
.count(p
) == 0) {
1692 pending_inc
.new_pools
[p
] = pool
.second
;
1694 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1696 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1697 full_pool_ids
.empty()) {
1698 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1699 // set by EQUOTA, skipping
1702 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1703 << "'s full flag" << dendl
;
1704 if (pending_inc
.new_pools
.count(p
) == 0) {
1705 pending_inc
.new_pools
[p
] = pool
.second
;
1707 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1711 if (!full_pool_ids
.empty()) {
1712 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1713 << " as full" << dendl
;
1714 for (auto &p
: full_pool_ids
) {
1715 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1718 if (pending_inc
.new_pools
.count(p
) == 0) {
1719 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1721 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1722 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1723 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1725 // cancel FLAG_FULL for pools which are no longer full too
1726 for (auto &pool
: tmp
.get_pools()) {
1727 auto p
= pool
.first
;
1728 if (full_pool_ids
.count(p
)) {
1729 // skip pools we have just marked as full above
1732 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1733 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1734 // don't touch if currently is not full
1735 // or is running out of quota (and hence considered as full)
1738 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1739 << "'s full flag" << dendl
;
1740 if (pending_inc
.new_pools
.count(p
) == 0) {
1741 pending_inc
.new_pools
[p
] = pool
.second
;
1743 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1746 if (!backfillfull_pool_ids
.empty()) {
1747 for (auto &p
: backfillfull_pool_ids
) {
1748 if (full_pool_ids
.count(p
)) {
1749 // skip pools we have already considered as full above
1752 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1753 // make sure FLAG_FULL is truly set, so we are safe not
1754 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1755 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1758 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1759 // don't bother if pool is already marked as backfillfull
1762 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1763 << "'s as backfillfull" << dendl
;
1764 if (pending_inc
.new_pools
.count(p
) == 0) {
1765 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1767 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1768 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1770 // cancel FLAG_BACKFILLFULL for pools
1771 // which are no longer backfillfull too
1772 for (auto &pool
: tmp
.get_pools()) {
1773 auto p
= pool
.first
;
1774 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1775 // skip pools we have just marked as backfillfull/full above
1778 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1779 // and don't touch if currently is not backfillfull
1782 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1783 << "'s backfillfull flag" << dendl
;
1784 if (pending_inc
.new_pools
.count(p
) == 0) {
1785 pending_inc
.new_pools
[p
] = pool
.second
;
1787 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1790 if (!nearfull_pool_ids
.empty()) {
1791 for (auto &p
: nearfull_pool_ids
) {
1792 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1795 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1796 // make sure FLAG_FULL is truly set, so we are safe not
1797 // to set a extra (redundant) FLAG_NEARFULL flag
1798 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1801 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1802 // don't bother if pool is already marked as nearfull
1805 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1806 << "'s as nearfull" << dendl
;
1807 if (pending_inc
.new_pools
.count(p
) == 0) {
1808 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1810 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1812 // cancel FLAG_NEARFULL for pools
1813 // which are no longer nearfull too
1814 for (auto &pool
: tmp
.get_pools()) {
1815 auto p
= pool
.first
;
1816 if (full_pool_ids
.count(p
) ||
1817 backfillfull_pool_ids
.count(p
) ||
1818 nearfull_pool_ids
.count(p
)) {
1819 // skip pools we have just marked as
1820 // nearfull/backfillfull/full above
1823 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1824 // and don't touch if currently is not nearfull
1827 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1828 << "'s nearfull flag" << dendl
;
1829 if (pending_inc
.new_pools
.count(p
) == 0) {
1830 pending_inc
.new_pools
[p
] = pool
.second
;
1832 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1836 // min_compat_client?
1837 if (!tmp
.require_min_compat_client
) {
1838 auto mv
= tmp
.get_min_compat_client();
1839 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1840 << "required " << mv
<< dendl
;
1841 mon
.clog
->info() << "setting require_min_compat_client to currently "
1842 << "required " << mv
;
1843 pending_inc
.new_require_min_compat_client
= mv
;
1846 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1847 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1848 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1849 // add creating flags?
1850 for (auto& i
: tmp
.get_pools()) {
1851 if (pending_creatings
.still_creating_pool(i
.first
)) {
1852 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1854 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1855 pending_inc
.new_pools
[i
.first
] = i
.second
;
1857 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1860 // adjust blocklist items to all be TYPE_ANY
1861 for (auto& i
: tmp
.blocklist
) {
1863 a
.set_type(entity_addr_t::TYPE_ANY
);
1864 pending_inc
.new_blocklist
[a
] = i
.second
;
1865 pending_inc
.old_blocklist
.push_back(i
.first
);
1869 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1870 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1871 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1873 // adjust obsoleted cache modes
1874 for (auto& [poolid
, pi
] : tmp
.pools
) {
1875 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1876 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1877 pending_inc
.new_pools
[poolid
] = pi
;
1879 dout(10) << __func__
<< " switching pool " << poolid
1880 << " cachemode from forward -> proxy" << dendl
;
1881 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1883 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1884 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1885 pending_inc
.new_pools
[poolid
] = pi
;
1887 dout(10) << __func__
<< " switching pool " << poolid
1888 << " cachemode from readforward -> readproxy" << dendl
;
1889 pending_inc
.new_pools
[poolid
].cache_mode
=
1890 pg_pool_t::CACHEMODE_READPROXY
;
1894 // clear removed_snaps for every pool
1895 for (auto& [poolid
, pi
] : tmp
.pools
) {
1896 if (pi
.removed_snaps
.empty()) {
1899 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1900 pending_inc
.new_pools
[poolid
] = pi
;
1902 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1904 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1907 // create a combined purged snap epoch key for all purged snaps
1908 // prior to this epoch, and store it in the current epoch (i.e.,
1909 // the last pre-octopus epoch, just prior to the one we're
1911 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
1912 it
->lower_bound("purged_snap_");
1913 map
<int64_t,snap_interval_set_t
> combined
;
1914 while (it
->valid()) {
1915 if (it
->key().find("purged_snap_") != 0) {
1918 string k
= it
->key();
1919 long long unsigned pool
;
1920 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1922 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1924 bufferlist v
= it
->value();
1925 auto p
= v
.cbegin();
1926 snapid_t begin
, end
;
1927 ceph::decode(begin
, p
);
1928 ceph::decode(end
, p
);
1929 combined
[pool
].insert(begin
, end
- begin
);
1933 if (!combined
.empty()) {
1934 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1936 ceph::encode(combined
, v
);
1937 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1938 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1939 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1942 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1946 // clean out the old removed_snap_ and removed_epoch keys
1947 // ('`' is ASCII '_' + 1)
1948 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1949 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1954 for (auto i
= pending_inc
.new_state
.begin();
1955 i
!= pending_inc
.new_state
.end();
1957 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1958 if (s
& CEPH_OSD_UP
) {
1959 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1960 // Reset laggy parameters if failure interval exceeds a threshold.
1961 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(i
->first
);
1962 if ((xi
.laggy_probability
|| xi
.laggy_interval
) && xi
.down_stamp
.sec()) {
1963 int last_failure_interval
= pending_inc
.modified
.sec() - xi
.down_stamp
.sec();
1964 if (grace_interval_threshold_exceeded(last_failure_interval
)) {
1965 set_default_laggy_params(i
->first
);
1969 if (s
& CEPH_OSD_EXISTS
)
1970 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1972 for (auto i
= pending_inc
.new_up_client
.begin();
1973 i
!= pending_inc
.new_up_client
.end();
1975 //FIXME: insert cluster addresses too
1976 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1978 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1979 i
!= pending_inc
.new_weight
.end();
1981 if (i
->second
== CEPH_OSD_OUT
) {
1982 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1983 } else if (i
->second
== CEPH_OSD_IN
) {
1984 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1986 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1990 // features for osdmap and its incremental
1993 // encode full map and determine its crc
1996 tmp
.deepish_copy_from(osdmap
);
1997 tmp
.apply_incremental(pending_inc
);
1999 // determine appropriate features
2000 features
= tmp
.get_encoding_features();
2001 dout(10) << __func__
<< " encoding full map with "
2002 << tmp
.require_osd_release
2003 << " features " << features
<< dendl
;
2005 // the features should be a subset of the mon quorum's features!
2006 ceph_assert((features
& ~mon
.get_quorum_con_features()) == 0);
2009 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
2010 pending_inc
.full_crc
= tmp
.get_crc();
2012 // include full map in the txn. note that old monitors will
2013 // overwrite this. new ones will now skip the local full map
2014 // encode and reload from this.
2015 put_version_full(t
, pending_inc
.epoch
, fullbl
);
2019 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
2021 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
2023 dout(20) << " full_crc " << tmp
.get_crc()
2024 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
2026 /* put everything in the transaction */
2027 put_version(t
, pending_inc
.epoch
, bl
);
2028 put_last_committed(t
, pending_inc
.epoch
);
2031 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
2032 p
!= pending_metadata
.end();
2035 auto mp
= p
->second
.cbegin();
2037 auto it
= m
.find("osd_objectstore");
2038 if (it
!= m
.end()) {
2039 if (it
->second
== "filestore") {
2040 filestore_osds
.insert(p
->first
);
2042 filestore_osds
.erase(p
->first
);
2045 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
2047 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
2048 p
!= pending_metadata_rm
.end();
2050 filestore_osds
.erase(*p
);
2051 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
2053 pending_metadata
.clear();
2054 pending_metadata_rm
.clear();
2057 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
2058 !pending_inc
.new_purged_snaps
.empty()) {
2059 // all snaps purged this epoch (across all pools)
2060 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
2062 encode(pending_inc
.new_purged_snaps
, v
);
2063 t
->put(OSD_SNAP_PREFIX
, k
, v
);
2065 for (auto& i
: pending_inc
.new_purged_snaps
) {
2066 for (auto q
= i
.second
.begin();
2067 q
!= i
.second
.end();
2069 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
2074 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
2075 for (auto snap
: snaps
) {
2076 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2083 health_check_map_t next
;
2084 tmp
.check_health(cct
, &next
);
2086 check_for_filestore_osds(&next
);
2087 encode_health(next
, t
);
2090 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2093 int r
= mon
.store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2097 auto p
= bl
.cbegin();
2100 catch (ceph::buffer::error
& e
) {
2102 *err
<< "osd." << osd
<< " metadata is corrupt";
2108 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2110 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2111 if (osdmap
.is_up(osd
)) {
2112 map
<string
,string
> meta
;
2113 load_metadata(osd
, meta
, nullptr);
2114 auto p
= meta
.find(field
);
2115 if (p
== meta
.end()) {
2116 (*out
)["unknown"]++;
2118 (*out
)[p
->second
]++;
2124 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2126 map
<string
,int> by_val
;
2127 count_metadata(field
, &by_val
);
2128 f
->open_object_section(field
.c_str());
2129 for (auto& p
: by_val
) {
2130 f
->dump_int(p
.first
.c_str(), p
.second
);
2135 void OSDMonitor::get_versions(std::map
<string
, list
<string
>> &versions
)
2137 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2138 if (osdmap
.is_up(osd
)) {
2139 map
<string
,string
> meta
;
2140 load_metadata(osd
, meta
, nullptr);
2141 auto p
= meta
.find("ceph_version_short");
2142 if (p
== meta
.end()) continue;
2143 versions
[p
->second
].push_back(string("osd.") + stringify(osd
));
2148 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2150 map
<string
, string
> metadata
;
2151 int r
= load_metadata(osd
, metadata
, nullptr);
2155 auto it
= metadata
.find("osd_objectstore");
2156 if (it
== metadata
.end())
2162 void OSDMonitor::get_filestore_osd_list()
2164 for (unsigned osd
= 0; osd
< osdmap
.get_num_osds(); ++osd
) {
2165 string objectstore_type
;
2166 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2167 if (r
== 0 && objectstore_type
== "filestore") {
2168 filestore_osds
.insert(osd
);
2173 void OSDMonitor::check_for_filestore_osds(health_check_map_t
*checks
)
2175 if (g_conf()->mon_warn_on_filestore_osds
&&
2176 filestore_osds
.size() > 0) {
2177 ostringstream ss
, deprecated_tip
;
2178 list
<string
> detail
;
2179 ss
<< filestore_osds
.size()
2181 << (filestore_osds
.size() == 1 ? "is" : "are")
2182 << " running Filestore";
2183 deprecated_tip
<< ss
.str();
2184 ss
<< " [Deprecated]";
2185 auto& d
= checks
->add("OSD_FILESTORE", HEALTH_WARN
, ss
.str(),
2186 filestore_osds
.size());
2187 deprecated_tip
<< ", which has been deprecated and"
2188 << " not been optimized for QoS"
2189 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2190 detail
.push_back(deprecated_tip
.str());
2191 d
.detail
.swap(detail
);
2195 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2196 const pg_pool_t
&pool
,
2199 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2200 // since filestore osds could always join the pool later
2201 set
<int> checked_osds
;
2202 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2203 vector
<int> up
, acting
;
2204 pg_t
pgid(ps
, pool_id
);
2205 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2206 for (int osd
: up
) {
2207 if (checked_osds
.find(osd
) != checked_osds
.end())
2209 string objectstore_type
;
2210 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2211 // allow with missing metadata, e.g. due to an osd never booting yet
2212 if (r
< 0 || objectstore_type
== "bluestore") {
2213 checked_osds
.insert(osd
);
2216 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2223 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2225 map
<string
,string
> m
;
2226 if (int r
= load_metadata(osd
, m
, err
))
2228 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2229 f
->dump_string(p
->first
.c_str(), p
->second
);
2233 void OSDMonitor::print_nodes(Formatter
*f
)
2235 // group OSDs by their hosts
2236 map
<string
, list
<int> > osds
; // hostname => osd
2237 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2238 map
<string
, string
> m
;
2239 if (load_metadata(osd
, m
, NULL
)) {
2242 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2243 if (hostname
== m
.end()) {
2244 // not likely though
2247 osds
[hostname
->second
].push_back(osd
);
2250 dump_services(f
, osds
, "osd");
2253 void OSDMonitor::share_map_with_random_osd()
2255 if (osdmap
.get_num_up_osds() == 0) {
2256 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2260 MonSession
*s
= mon
.session_map
.get_random_osd_session(&osdmap
);
2262 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2266 dout(10) << "committed, telling random " << s
->name
2267 << " all about it" << dendl
;
2269 // get feature of the peer
2270 // use quorum_con_features, if it's an anonymous connection.
2271 uint64_t features
= s
->con_features
? s
->con_features
:
2272 mon
.get_quorum_con_features();
2273 // whatev, they'll request more if they need it
2274 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2275 s
->con
->send_message(m
);
2276 // NOTE: do *not* record osd has up to this epoch (as we do
2277 // elsewhere) as they may still need to request older values.
2280 version_t
OSDMonitor::get_trim_to() const
2282 if (mon
.get_quorum().empty()) {
2283 dout(10) << __func__
<< " quorum not formed, trim_to = 0" << dendl
;
2288 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2289 if (!creating_pgs
.pgs
.empty()) {
2290 dout(10) << __func__
<< " pgs creating, trim_to = 0" << dendl
;
2295 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2297 << " blocking osdmap trim"
2298 << " ('mon_debug_block_osdmap_trim' set to 'true')"
2299 << " trim_to = 0" << dendl
;
2304 epoch_t floor
= get_min_last_epoch_clean();
2305 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2306 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2307 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2308 floor
= g_conf()->mon_osd_force_trim_to
;
2309 dout(10) << __func__
2310 << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2312 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2313 if (floor
+ min
> get_last_committed()) {
2314 if (min
< get_last_committed())
2315 floor
= get_last_committed() - min
;
2319 if (floor
> get_first_committed()) {
2320 dout(10) << __func__
<< " trim_to = " << floor
<< dendl
;
2324 dout(10) << __func__
<< " trim_to = 0" << dendl
;
2328 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2330 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2331 // also scan osd epochs
2332 // don't trim past the oldest reported osd epoch
2333 for (auto [osd
, epoch
] : osd_epochs
) {
2334 if (epoch
< floor
) {
2341 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2344 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2346 get_version_full(first
, bl
);
2347 put_version_full(tx
, first
, bl
);
2349 if (has_osdmap_manifest
&&
2350 first
> osdmap_manifest
.get_first_pinned()) {
2351 _prune_update_trimmed(tx
, first
);
2356 /* full osdmap prune
2358 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2361 void OSDMonitor::load_osdmap_manifest()
2363 bool store_has_manifest
=
2364 mon
.store
->exists(get_service_name(), "osdmap_manifest");
2366 if (!store_has_manifest
) {
2367 if (!has_osdmap_manifest
) {
2371 dout(20) << __func__
2372 << " dropping osdmap manifest from memory." << dendl
;
2373 osdmap_manifest
= osdmap_manifest_t();
2374 has_osdmap_manifest
= false;
2378 dout(20) << __func__
2379 << " osdmap manifest detected in store; reload." << dendl
;
2381 bufferlist manifest_bl
;
2382 int r
= get_value("osdmap_manifest", manifest_bl
);
2384 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2385 ceph_abort_msg("error reading manifest");
2387 osdmap_manifest
.decode(manifest_bl
);
2388 has_osdmap_manifest
= true;
2390 dout(10) << __func__
<< " store osdmap manifest pinned ("
2391 << osdmap_manifest
.get_first_pinned()
2393 << osdmap_manifest
.get_last_pinned()
2398 bool OSDMonitor::should_prune() const
2400 version_t first
= get_first_committed();
2401 version_t last
= get_last_committed();
2402 version_t min_osdmap_epochs
=
2403 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2404 version_t prune_min
=
2405 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2406 version_t prune_interval
=
2407 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2408 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2409 version_t last_to_pin
= last
- min_osdmap_epochs
;
2411 // Make it or break it constraints.
2413 // If any of these conditions fails, we will not prune, regardless of
2414 // whether we have an on-disk manifest with an on-going pruning state.
2416 if ((last
- first
) <= min_osdmap_epochs
) {
2417 // between the first and last committed epochs, we don't have
2418 // enough epochs to trim, much less to prune.
2419 dout(10) << __func__
2420 << " currently holding only " << (last
- first
)
2421 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2422 << "); do not prune."
2426 } else if ((last_to_pin
- first
) < prune_min
) {
2427 // between the first committed epoch and the last epoch we would prune,
2428 // we simply don't have enough versions over the minimum to prune maps.
2429 dout(10) << __func__
2430 << " could only prune " << (last_to_pin
- first
)
2431 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2432 " is less than the required minimum (" << prune_min
<< ")"
2436 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2437 dout(10) << __func__
2438 << " we have pruned as far as we can; do not prune."
2442 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2443 dout(10) << __func__
2444 << " not enough epochs to form an interval (last pinned: "
2445 << last_pinned
<< ", last to pin: "
2446 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2451 dout(15) << __func__
2452 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2453 << " lc (" << first
<< ".." << last
<< ")"
2458 void OSDMonitor::_prune_update_trimmed(
2459 MonitorDBStore::TransactionRef tx
,
2462 dout(10) << __func__
2463 << " first " << first
2464 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2467 osdmap_manifest_t manifest
= osdmap_manifest
;
2469 if (!manifest
.is_pinned(first
)) {
2470 manifest
.pin(first
);
2473 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2474 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2475 manifest
.pinned
.erase(p
, p_end
);
2476 ceph_assert(manifest
.get_first_pinned() == first
);
2478 if (manifest
.get_last_pinned() == first
+1 ||
2479 manifest
.pinned
.size() == 1) {
2480 // we reached the end of the line, as pinned maps go; clean up our
2481 // manifest, and let `should_prune()` decide whether we should prune
2483 tx
->erase(get_service_name(), "osdmap_manifest");
2488 manifest
.encode(bl
);
2489 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2492 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2494 dout(1) << __func__
<< dendl
;
2496 version_t pin_first
;
2498 // verify constrainsts on stable in-memory state
2499 if (!has_osdmap_manifest
) {
2500 // we must have never pruned, OR if we pruned the state must no longer
2501 // be relevant (i.e., the state must have been removed alongside with
2502 // the trim that *must* have removed past the last pinned map in a
2504 ceph_assert(osdmap_manifest
.pinned
.empty());
2505 ceph_assert(!mon
.store
->exists(get_service_name(), "osdmap_manifest"));
2506 pin_first
= get_first_committed();
2509 // we must have pruned in the past AND its state is still relevant
2510 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2511 // and thus we still hold a manifest in the store).
2512 ceph_assert(!osdmap_manifest
.pinned
.empty());
2513 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2514 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2516 dout(10) << __func__
2517 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2518 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2521 pin_first
= osdmap_manifest
.get_last_pinned();
2524 manifest
.pin(pin_first
);
2527 bool OSDMonitor::_prune_sanitize_options() const
2529 uint64_t prune_interval
=
2530 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2531 uint64_t prune_min
=
2532 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2534 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2538 if (prune_interval
== 0) {
2540 << " prune is enabled BUT prune interval is zero; abort."
2543 } else if (prune_interval
== 1) {
2545 << " prune interval is equal to one, which essentially means"
2546 " no pruning; abort."
2550 if (prune_min
== 0) {
2552 << " prune is enabled BUT prune min is zero; abort."
2556 if (prune_interval
> prune_min
) {
2558 << " impossible to ascertain proper prune interval because"
2559 << " it is greater than the minimum prune epochs"
2560 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2565 if (txsize
< prune_interval
- 1) {
2567 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2568 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2569 << "); abort." << dendl
;
2575 bool OSDMonitor::is_prune_enabled() const {
2576 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2579 bool OSDMonitor::is_prune_supported() const {
2580 return mon
.get_required_mon_features().contains_any(
2581 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2586 * @returns true if has side-effects; false otherwise.
2588 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2590 bool enabled
= is_prune_enabled();
2592 dout(1) << __func__
<< " osdmap full prune "
2593 << ( enabled
? "enabled" : "disabled")
2596 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2600 // we are beyond the minimum prune versions, we need to remove maps because
2601 // otherwise the store will grow unbounded and we may end up having issues
2602 // with available disk space or store hangs.
2604 // we will not pin all versions. We will leave a buffer number of versions.
2605 // this allows us the monitor to trim maps without caring too much about
2606 // pinned maps, and then allow us to use another ceph-mon without these
2607 // capabilities, without having to repair the store.
2609 osdmap_manifest_t manifest
= osdmap_manifest
;
2611 version_t first
= get_first_committed();
2612 version_t last
= get_last_committed();
2614 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2615 version_t last_pinned
= manifest
.get_last_pinned();
2616 uint64_t prune_interval
=
2617 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2619 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2621 prune_init(manifest
);
2623 // we need to get rid of some osdmaps
2626 << " lc (" << first
<< " .. " << last
<< ")"
2627 << " last_pinned " << last_pinned
2628 << " interval " << prune_interval
2629 << " last_to_pin " << last_to_pin
2632 // We will be erasing maps as we go.
2634 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2636 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2637 // we stop pruning. We could prune the maps between `next_to_pin` and
2638 // `last_to_pin`, but by not doing it we end up with neater pruned
2639 // intervals, aligned with `prune_interval`. Besides, this should not be a
2640 // problem as long as `prune_interval` is set to a sane value, instead of
2641 // hundreds or thousands of maps.
2643 auto map_exists
= [this](version_t v
) {
2644 string k
= mon
.store
->combine_strings("full", v
);
2645 return mon
.store
->exists(get_service_name(), k
);
2648 // 'interval' represents the number of maps from the last pinned
2649 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2650 // version 11 next; all intermediate versions will be removed.
2652 // 'txsize' represents the maximum number of versions we'll be removing in
2653 // this iteration. If 'txsize' is large enough to perform multiple passes
2654 // pinning and removing maps, we will do so; if not, we'll do at least one
2655 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2656 // ensure that we never go *over* the maximum.
2658 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2659 uint64_t removal_interval
= prune_interval
- 1;
2661 if (txsize
< removal_interval
) {
2663 << " setting txsize to removal interval size ("
2664 << removal_interval
<< " versions"
2666 txsize
= removal_interval
;
2668 ceph_assert(removal_interval
> 0);
2670 uint64_t num_pruned
= 0;
2671 while (num_pruned
+ removal_interval
<= txsize
) {
2672 last_pinned
= manifest
.get_last_pinned();
2674 if (last_pinned
+ prune_interval
> last_to_pin
) {
2677 ceph_assert(last_pinned
< last_to_pin
);
2679 version_t next_pinned
= last_pinned
+ prune_interval
;
2680 ceph_assert(next_pinned
<= last_to_pin
);
2681 manifest
.pin(next_pinned
);
2683 dout(20) << __func__
2684 << " last_pinned " << last_pinned
2685 << " next_pinned " << next_pinned
2686 << " num_pruned " << num_pruned
2687 << " removal interval (" << (last_pinned
+1)
2688 << ".." << (next_pinned
-1) << ")"
2689 << " txsize " << txsize
<< dendl
;
2691 ceph_assert(map_exists(last_pinned
));
2692 ceph_assert(map_exists(next_pinned
));
2694 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2695 ceph_assert(!manifest
.is_pinned(v
));
2697 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2698 string full_key
= mon
.store
->combine_strings("full", v
);
2699 tx
->erase(get_service_name(), full_key
);
2704 ceph_assert(num_pruned
> 0);
2707 manifest
.encode(bl
);
2708 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2716 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2718 op
->mark_osdmon_event(__func__
);
2719 Message
*m
= op
->get_req();
2720 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2722 switch (m
->get_type()) {
2724 case MSG_MON_COMMAND
:
2726 return preprocess_command(op
);
2727 } catch (const bad_cmd_get
& e
) {
2729 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2732 case CEPH_MSG_MON_GET_OSDMAP
:
2733 return preprocess_get_osdmap(op
);
2736 case MSG_OSD_MARK_ME_DOWN
:
2737 return preprocess_mark_me_down(op
);
2738 case MSG_OSD_MARK_ME_DEAD
:
2739 return preprocess_mark_me_dead(op
);
2741 return preprocess_full(op
);
2742 case MSG_OSD_FAILURE
:
2743 return preprocess_failure(op
);
2745 return preprocess_boot(op
);
2747 return preprocess_alive(op
);
2748 case MSG_OSD_PG_CREATED
:
2749 return preprocess_pg_created(op
);
2750 case MSG_OSD_PG_READY_TO_MERGE
:
2751 return preprocess_pg_ready_to_merge(op
);
2752 case MSG_OSD_PGTEMP
:
2753 return preprocess_pgtemp(op
);
2754 case MSG_OSD_BEACON
:
2755 return preprocess_beacon(op
);
2757 case CEPH_MSG_POOLOP
:
2758 return preprocess_pool_op(op
);
2760 case MSG_REMOVE_SNAPS
:
2761 return preprocess_remove_snaps(op
);
2763 case MSG_MON_GET_PURGED_SNAPS
:
2764 return preprocess_get_purged_snaps(op
);
2772 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2774 op
->mark_osdmon_event(__func__
);
2775 Message
*m
= op
->get_req();
2776 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2778 switch (m
->get_type()) {
2780 case MSG_OSD_MARK_ME_DOWN
:
2781 return prepare_mark_me_down(op
);
2782 case MSG_OSD_MARK_ME_DEAD
:
2783 return prepare_mark_me_dead(op
);
2785 return prepare_full(op
);
2786 case MSG_OSD_FAILURE
:
2787 return prepare_failure(op
);
2789 return prepare_boot(op
);
2791 return prepare_alive(op
);
2792 case MSG_OSD_PG_CREATED
:
2793 return prepare_pg_created(op
);
2794 case MSG_OSD_PGTEMP
:
2795 return prepare_pgtemp(op
);
2796 case MSG_OSD_PG_READY_TO_MERGE
:
2797 return prepare_pg_ready_to_merge(op
);
2798 case MSG_OSD_BEACON
:
2799 return prepare_beacon(op
);
2801 case MSG_MON_COMMAND
:
2803 return prepare_command(op
);
2804 } catch (const bad_cmd_get
& e
) {
2806 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2810 case CEPH_MSG_POOLOP
:
2811 return prepare_pool_op(op
);
2813 case MSG_REMOVE_SNAPS
:
2814 return prepare_remove_snaps(op
);
2824 bool OSDMonitor::should_propose(double& delay
)
2826 dout(10) << "should_propose" << dendl
;
2828 // if full map, propose immediately! any subsequent changes will be clobbered.
2829 if (pending_inc
.fullmap
.length())
2832 // adjust osd weights?
2833 if (!osd_weight
.empty() &&
2834 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2835 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2836 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2842 return PaxosService::should_propose(delay
);
2847 // ---------------------------
2850 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2852 op
->mark_osdmon_event(__func__
);
2853 auto m
= op
->get_req
<MMonGetOSDMap
>();
2855 uint64_t features
= mon
.get_quorum_con_features();
2856 if (op
->get_session() && op
->get_session()->con_features
)
2857 features
= op
->get_session()->con_features
;
2859 dout(10) << __func__
<< " " << *m
<< dendl
;
2860 MOSDMap
*reply
= new MOSDMap(mon
.monmap
->fsid
, features
);
2861 epoch_t first
= get_first_committed();
2862 epoch_t last
= osdmap
.get_epoch();
2863 int max
= g_conf()->osd_map_message_max
;
2864 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2865 for (epoch_t e
= std::max(first
, m
->get_full_first());
2866 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2868 bufferlist
& bl
= reply
->maps
[e
];
2869 int r
= get_version_full(e
, features
, bl
);
2870 ceph_assert(r
>= 0);
2871 max_bytes
-= bl
.length();
2873 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2874 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2876 bufferlist
& bl
= reply
->incremental_maps
[e
];
2877 int r
= get_version(e
, features
, bl
);
2878 ceph_assert(r
>= 0);
2879 max_bytes
-= bl
.length();
2881 reply
->oldest_map
= first
;
2882 reply
->newest_map
= last
;
2883 mon
.send_reply(op
, reply
);
2888 // ---------------------------
2893 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2894 // check permissions
2895 MonSession
*session
= op
->get_session();
2898 if (!session
->is_capable("osd", MON_CAP_X
)) {
2899 dout(0) << "got MOSDFailure from entity with insufficient caps "
2900 << session
->caps
<< dendl
;
2903 if (fsid
!= mon
.monmap
->fsid
) {
2904 dout(0) << "check_source: on fsid " << fsid
2905 << " != " << mon
.monmap
->fsid
<< dendl
;
2912 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2914 op
->mark_osdmon_event(__func__
);
2915 auto m
= op
->get_req
<MOSDFailure
>();
2916 // who is target_osd
2917 int badboy
= m
->get_target_osd();
2919 // check permissions
2920 if (check_source(op
, m
->fsid
))
2923 // first, verify the reporting host is valid
2924 if (m
->get_orig_source().is_osd()) {
2925 int from
= m
->get_orig_source().num();
2926 if (!osdmap
.exists(from
) ||
2927 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2928 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2929 dout(5) << "preprocess_failure from dead osd." << from
2930 << ", ignoring" << dendl
;
2931 send_incremental(op
, m
->get_epoch()+1);
2938 if (osdmap
.is_down(badboy
)) {
2939 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2940 << " " << m
->get_target_addrs()
2941 << ", from " << m
->get_orig_source() << dendl
;
2942 if (m
->get_epoch() < osdmap
.get_epoch())
2943 send_incremental(op
, m
->get_epoch()+1);
2946 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2947 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2948 << " " << m
->get_target_addrs()
2949 << " != map's " << osdmap
.get_addrs(badboy
)
2950 << ", from " << m
->get_orig_source() << dendl
;
2951 if (m
->get_epoch() < osdmap
.get_epoch())
2952 send_incremental(op
, m
->get_epoch()+1);
2956 // already reported?
2957 if (osdmap
.is_down(badboy
) ||
2958 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2959 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2960 << " " << m
->get_target_addrs()
2961 << ", from " << m
->get_orig_source() << dendl
;
2962 if (m
->get_epoch() < osdmap
.get_epoch())
2963 send_incremental(op
, m
->get_epoch()+1);
2967 if (!can_mark_down(badboy
)) {
2968 dout(5) << "preprocess_failure ignoring report of osd."
2969 << m
->get_target_osd() << " " << m
->get_target_addrs()
2970 << " from " << m
->get_orig_source() << dendl
;
2974 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2975 << " " << m
->get_target_addrs()
2976 << ", from " << m
->get_orig_source() << dendl
;
2984 class C_AckMarkedDown
: public C_MonOp
{
2990 : C_MonOp(op
), osdmon(osdmon
) {}
2992 void _finish(int r
) override
{
2994 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2995 osdmon
->mon
.send_reply(
3002 false)); // ACK itself does not request an ack
3003 } else if (r
== -EAGAIN
) {
3004 osdmon
->dispatch(op
);
3006 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
3009 ~C_AckMarkedDown() override
{
3013 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
3015 op
->mark_osdmon_event(__func__
);
3016 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3017 int from
= m
->target_osd
;
3019 // check permissions
3020 if (check_source(op
, m
->fsid
))
3023 // first, verify the reporting host is valid
3024 if (!m
->get_orig_source().is_osd())
3027 if (!osdmap
.exists(from
) ||
3028 osdmap
.is_down(from
) ||
3029 osdmap
.get_addrs(from
) != m
->target_addrs
) {
3030 dout(5) << "preprocess_mark_me_down from dead osd."
3031 << from
<< ", ignoring" << dendl
;
3032 send_incremental(op
, m
->get_epoch()+1);
3036 // no down might be set
3037 if (!can_mark_down(from
))
3040 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
3041 << " " << m
->target_addrs
<< dendl
;
3045 if (m
->request_ack
) {
3046 Context
*c(new C_AckMarkedDown(this, op
));
3052 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
3054 op
->mark_osdmon_event(__func__
);
3055 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3056 int target_osd
= m
->target_osd
;
3058 ceph_assert(osdmap
.is_up(target_osd
));
3059 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
3061 mon
.clog
->info() << "osd." << target_osd
<< " marked itself down";
3062 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3064 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
3068 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
3070 op
->mark_osdmon_event(__func__
);
3071 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3072 int from
= m
->target_osd
;
3074 // check permissions
3075 if (check_source(op
, m
->fsid
)) {
3080 // first, verify the reporting host is valid
3081 if (!m
->get_orig_source().is_osd()) {
3086 if (!osdmap
.exists(from
) ||
3087 !osdmap
.is_down(from
)) {
3088 dout(5) << __func__
<< " from nonexistent or up osd." << from
3089 << ", ignoring" << dendl
;
3090 send_incremental(op
, m
->get_epoch()+1);
3098 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
3100 op
->mark_osdmon_event(__func__
);
3101 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3102 int target_osd
= m
->target_osd
;
3104 ceph_assert(osdmap
.is_down(target_osd
));
3106 mon
.clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
3108 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3109 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3111 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3112 wait_for_finished_proposal(
3115 [op
, this] (int r
) {
3117 mon
.no_reply(op
); // ignore on success
3124 bool OSDMonitor::can_mark_down(int i
)
3126 if (osdmap
.is_nodown(i
)) {
3127 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3128 << "will not mark it down" << dendl
;
3132 int num_osds
= osdmap
.get_num_osds();
3133 if (num_osds
== 0) {
3134 dout(5) << __func__
<< " no osds" << dendl
;
3137 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3138 float up_ratio
= (float)up
/ (float)num_osds
;
3139 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3140 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3141 << g_conf()->mon_osd_min_up_ratio
3142 << ", will not mark osd." << i
<< " down" << dendl
;
3148 bool OSDMonitor::can_mark_up(int i
)
3150 if (osdmap
.is_noup(i
)) {
3151 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3152 << "will not mark it up" << dendl
;
3160 * @note the parameter @p i apparently only exists here so we can output the
3161 * osd's id on messages.
3163 bool OSDMonitor::can_mark_out(int i
)
3165 if (osdmap
.is_noout(i
)) {
3166 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3167 << "will not mark it out" << dendl
;
3171 int num_osds
= osdmap
.get_num_osds();
3172 if (num_osds
== 0) {
3173 dout(5) << __func__
<< " no osds" << dendl
;
3176 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3177 float in_ratio
= (float)in
/ (float)num_osds
;
3178 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3180 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3181 << g_conf()->mon_osd_min_in_ratio
3182 << ", will not mark osd." << i
<< " out" << dendl
;
3184 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3185 << g_conf()->mon_osd_min_in_ratio
3186 << ", will not mark osds out" << dendl
;
3193 bool OSDMonitor::can_mark_in(int i
)
3195 if (osdmap
.is_noin(i
)) {
3196 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3197 << "will not mark it in" << dendl
;
3204 bool OSDMonitor::check_failures(utime_t now
)
3206 bool found_failure
= false;
3207 auto p
= failure_info
.begin();
3208 while (p
!= failure_info
.end()) {
3209 auto& [target_osd
, fi
] = *p
;
3210 if (can_mark_down(target_osd
) &&
3211 check_failure(now
, target_osd
, fi
)) {
3212 found_failure
= true;
3214 } else if (is_failure_stale(now
, fi
)) {
3215 dout(10) << " dropping stale failure_info for osd." << target_osd
3216 << " from " << fi
.reporters
.size() << " reporters"
3218 p
= failure_info
.erase(p
);
3223 return found_failure
;
3226 utime_t
OSDMonitor::get_grace_time(utime_t now
,
3228 failure_info_t
& fi
) const
3230 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3231 if (!g_conf()->mon_osd_adjust_heartbeat_grace
) {
3234 utime_t grace
= orig_grace
;
3235 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3236 double decay_k
= ::log(.5) / halflife
;
3238 // scale grace period based on historical probability of 'lagginess'
3239 // (false positive failures due to slowness).
3240 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3241 const utime_t failed_for
= now
- fi
.get_failed_since();
3242 double decay
= exp((double)failed_for
* decay_k
);
3243 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3244 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3245 double my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3248 // consider the peers reporting a failure a proxy for a potential
3249 // 'subcluster' over the overall cluster that is similarly
3250 // laggy. this is clearly not true in all cases, but will sometimes
3251 // help us localize the grace correction to a subset of the system
3252 // (say, a rack with a bad switch) that is unhappy.
3253 double peer_grace
= 0;
3254 for (auto& [reporter
, report
] : fi
.reporters
) {
3255 if (osdmap
.exists(reporter
)) {
3256 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(reporter
);
3257 utime_t elapsed
= now
- xi
.down_stamp
;
3258 double decay
= exp((double)elapsed
* decay_k
);
3259 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3262 peer_grace
/= (double)fi
.reporters
.size();
3263 grace
+= peer_grace
;
3264 dout(10) << " osd." << target_osd
<< " has "
3265 << fi
.reporters
.size() << " reporters, "
3266 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3267 << " + " << peer_grace
<< "), max_failed_since " << fi
.get_failed_since()
3273 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3275 // already pending failure?
3276 if (pending_inc
.new_state
.count(target_osd
) &&
3277 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3278 dout(10) << " already pending failure" << dendl
;
3282 set
<string
> reporters_by_subtree
;
3283 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3284 ceph_assert(fi
.reporters
.size());
3285 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3286 // get the parent bucket whose type matches with "reporter_subtree_level".
3287 // fall back to OSD if the level doesn't exist.
3288 if (osdmap
.exists(p
->first
)) {
3289 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3290 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3291 iter
== reporter_loc
.end()) {
3292 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3294 reporters_by_subtree
.insert(iter
->second
);
3298 fi
.cancel_report(p
->first
);;
3299 p
= fi
.reporters
.erase(p
);
3302 if (reporters_by_subtree
.size() < g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3305 const utime_t failed_for
= now
- fi
.get_failed_since();
3306 const utime_t grace
= get_grace_time(now
, target_osd
, fi
);
3307 if (failed_for
>= grace
) {
3308 dout(1) << " we have enough reporters to mark osd." << target_osd
3309 << " down" << dendl
;
3310 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3312 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3313 << osdmap
.crush
->get_full_location_ordered_string(
3316 << (int)reporters_by_subtree
.size()
3317 << " reporters from different "
3318 << reporter_subtree_level
<< " after "
3319 << failed_for
<< " >= grace " << grace
<< ")";
// Return true when a failure record has been outstanding longer than
// osd_heartbeat_grace + osd_heartbeat_stale — at that point the reports are
// considered stale (reporters presumably failed to cancel them) and the
// caller may forget them.
// NOTE(review): extraction artifact — some original lines are missing from
// this view (embedded numbering skips 3326/3334+).
3325 bool OSDMonitor::is_failure_stale(utime_t now
, failure_info_t
& fi
) const
3327 // if it takes too long to either cancel the report to mark the osd down,
3328 // some reporters must have failed to cancel their reports. let's just
3329 // forget these reports.
3330 const utime_t failed_for
= now
- fi
.get_failed_since();
3331 auto heartbeat_grace
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_grace");
3332 auto heartbeat_stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3333 return failed_for
>= (heartbeat_grace
+ heartbeat_stale
);
// Immediately mark osd.<target_osd> down in pending_inc (bypassing the
// reporter-count/grace checks), recording the reporting osd <by> in the
// cluster log.  Also stamps dead_epoch in the pending xinfo.
// NOTE(review): extraction artifact — embedded numbering skips (3337,
// 3342-3344, 3349, 3351, 3355+); closing braces/returns are missing from
// this view.
3336 void OSDMonitor::force_failure(int target_osd
, int by
)
3338 // already pending failure?
3339 if (pending_inc
.new_state
.count(target_osd
) &&
3340 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3341 dout(10) << " already pending failure" << dendl
;
3345 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
// XOR semantics: setting the UP bit in new_state flips the OSD to down.
3346 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3347 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3348 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3350 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3352 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3353 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3354 << ") (connection refused reported by osd." << by
<< ")";
// Handle an MOSDFailure report.  A positive report either forces an
// immediate failure (is_immediate) or is accumulated into failure_info and
// evaluated via check_failure(); a negative report cancels this reporter's
// earlier report, dropping the failure_info entry when it was the last one.
// NOTE(review): extraction artifact — embedded numbering skips (3359, 3366,
// 3371-3373, 3379-3380, 3386-3387, 3390, 3394, 3404-3406, 3409-3410, 3412+);
// some statements/braces/returns are missing from this view.
3358 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3360 op
->mark_osdmon_event(__func__
);
3361 auto m
= op
->get_req
<MOSDFailure
>();
3362 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3363 << " " << m
->get_target_addrs()
3364 << " from " << m
->get_orig_source()
3365 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3367 int target_osd
= m
->get_target_osd();
3368 int reporter
= m
->get_orig_source().num();
3369 ceph_assert(osdmap
.is_up(target_osd
));
3370 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3374 if (m
->if_osd_failed()) {
3375 // calculate failure time
3376 utime_t now
= ceph_clock_now();
3377 utime_t failed_since
=
3378 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3381 if (m
->is_immediate()) {
3382 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3383 << " reported immediately failed by "
3384 << m
->get_orig_source();
3385 force_failure(target_osd
, reporter
);
3388 mon
.clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3389 << m
->get_orig_source();
// Accumulate this reporter's report; check_failure decides whether to mark
// the target down now.
3391 failure_info_t
& fi
= failure_info
[target_osd
];
3392 fi
.add_report(reporter
, failed_since
, op
);
3393 return check_failure(now
, target_osd
, fi
);
3395 // remove the report
3396 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3397 << " failure report canceled by "
3398 << m
->get_orig_source();
3399 if (failure_info
.count(target_osd
)) {
3400 failure_info_t
& fi
= failure_info
[target_osd
];
3401 fi
.cancel_report(reporter
);
3402 if (fi
.reporters
.empty()) {
3403 dout(10) << " removing last failure_info for osd." << target_osd
3405 failure_info
.erase(target_osd
);
3407 dout(10) << " failure_info for osd." << target_osd
<< " now "
3408 << fi
.reporters
.size() << " reporters" << dendl
;
3411 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
// Drain failure_info: for each tracked OSD, take its queued report messages,
// erase the entry, and reply to each reporter with the latest maps via
// send_latest().  The is_up() branch body is not visible here.
// NOTE(review): extraction artifact — embedded numbering skips (3419,
// 3423-3424, 3429, 3432, 3436+); some statements/braces are missing from
// this view.
3418 void OSDMonitor::process_failures()
3420 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3421 while (p
!= failure_info
.end()) {
3422 if (osdmap
.is_up(p
->first
)) {
3425 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3426 list
<MonOpRequestRef
> ls
;
3427 p
->second
.take_report_messages(ls
);
// Post-increment keeps the iterator valid across the erase.
3428 failure_info
.erase(p
++);
3430 while (!ls
.empty()) {
3431 MonOpRequestRef o
= ls
.front();
3433 o
->mark_event(__func__
);
3434 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3435 send_latest(o
, m
->get_epoch());
// Move every queued failure-report message out of failure_info into ls and
// clear the tracking map (e.g. when handing state over).
// NOTE(review): extraction artifact — embedded numbering skips (3445, 3447,
// 3450, 3452, 3454+); closing braces are missing from this view.
3444 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3446 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3448 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3449 p
!= failure_info
.end();
3451 p
->second
.take_report_messages(ls
);
3453 failure_info
.clear();
// Threshold (seconds) above which a laggy interval is considered too old to
// matter: mon_osd_laggy_halflife scaled by a fixed factor of 48.
// NOTE(review): extraction artifact — the closing lines of this function are
// missing from this view (embedded numbering skips 3457, 3463+).
3456 int OSDMonitor::get_grace_interval_threshold()
3458 int halflife
= g_conf()->mon_osd_laggy_halflife
;
3459 // Scale the halflife period (default: 1_hr) by
3460 // a factor (48) to calculate the threshold.
3461 int grace_threshold_factor
= 48;
3462 return halflife
* grace_threshold_factor
;
// True when last_failed_interval exceeds get_grace_interval_threshold().
// NOTE(review): extraction artifact — the return statements and closing
// brace are missing from this view (embedded numbering skips 3466, 3471+).
3465 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval
)
3467 int grace_interval_threshold_secs
= get_grace_interval_threshold();
3468 if (last_failed_interval
> grace_interval_threshold_secs
) {
3469 dout(1) << " last_failed_interval " << last_failed_interval
3470 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
// Reset the laggy bookkeeping for osd.<target_osd> in the pending xinfo:
// down_stamp <- pending_inc.modified, laggy_probability and laggy_interval
// to zero.
// NOTE(review): extraction artifact — embedded numbering skips (3478, 3481,
// 3487+); the closing brace is missing from this view.
3477 void OSDMonitor::set_default_laggy_params(int target_osd
)
3479 if (pending_inc
.new_xinfo
.count(target_osd
) == 0) {
3480 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3482 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[target_osd
];
3483 xi
.down_stamp
= pending_inc
.modified
;
3484 xi
.laggy_probability
= 0.0;
3485 xi
.laggy_interval
= 0;
3486 dout(20) << __func__
<< " reset laggy, now xi " << xi
<< dendl
;
// Validate an MOSDBoot before it reaches prepare_boot: capability check,
// fsid match, non-blank address, minimum feature bits (octopus lower bound,
// max 3-release span), stretch-mode support, duplicate-boot detection,
// uuid clash, and stale-message (pre-up_from) detection.  Replies/ignores
// happen here; the branch bodies' returns are not all visible.
// NOTE(review): extraction artifact — embedded numbering skips (3493, 3497,
// 3500-3501, 3505-3507, 3511-3513, 3516-3518, 3520, 3526-3528, 3535-3536,
// 3542-3544, 3550-3553, 3557, 3561-3564, 3572-3574, 3581-3584, 3588-3590,
// 3592-3597); some statements/braces/returns are missing from this view.
3492 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3494 op
->mark_osdmon_event(__func__
);
3495 auto m
= op
->get_req
<MOSDBoot
>();
3496 int from
= m
->get_orig_source_inst().name
.num();
3498 // check permissions, ignore if failed (no response expected)
3499 MonSession
*session
= op
->get_session();
3502 if (!session
->is_capable("osd", MON_CAP_X
)) {
3503 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3504 << session
->caps
<< dendl
;
3508 if (m
->sb
.cluster_fsid
!= mon
.monmap
->fsid
) {
3509 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3510 << " != " << mon
.monmap
->fsid
<< dendl
;
3514 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3515 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3519 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3521 // lower bound of N-2
3522 if (!HAVE_FEATURE(m
->osd_features
, SERVER_OCTOPUS
)) {
3523 mon
.clog
->info() << "disallowing boot of OSD "
3524 << m
->get_orig_source_inst()
3525 << " because the osd lacks CEPH_FEATURE_SERVER_OCTOPUS";
3529 // make sure osd versions do not span more than 3 releases
3530 if (HAVE_FEATURE(m
->osd_features
, SERVER_PACIFIC
) &&
3531 osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
3532 mon
.clog
->info() << "disallowing boot of pacific+ OSD "
3533 << m
->get_orig_source_inst()
3534 << " because require_osd_release < nautilus";
3537 if (HAVE_FEATURE(m
->osd_features
, SERVER_QUINCY
) &&
3538 osdmap
.require_osd_release
< ceph_release_t::octopus
) {
3539 mon
.clog
->info() << "disallowing boot of quincy+ OSD "
3540 << m
->get_orig_source_inst()
3541 << " because require_osd_release < octopus";
3545 if (osdmap
.stretch_mode_enabled
&&
3546 !(m
->osd_features
& CEPH_FEATUREMASK_STRETCH_MODE
)) {
3547 mon
.clog
->info() << "disallowing boot of OSD "
3548 << m
->get_orig_source_inst()
3549 << " because stretch mode is on and OSD lacks support";
// Duplicate boot: the OSD is already up with the same public+cluster addrs.
3554 if (osdmap
.is_up(from
) &&
3555 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3556 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3558 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3559 << " " << m
->get_orig_source_addrs()
3560 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3565 if (osdmap
.exists(from
) &&
3566 !osdmap
.get_uuid(from
).is_zero() &&
3567 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3568 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3569 << " clashes with existing osd: different fsid"
3570 << " (ours: " << osdmap
.get_uuid(from
)
3571 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3575 if (osdmap
.exists(from
) &&
3576 osdmap
.get_info(from
).up_from
> m
->version
&&
3577 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3578 m
->get_orig_source_addrs())) {
3579 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3580 send_latest(op
, m
->sb
.current_epoch
+1);
3585 if (!can_mark_up(from
)) {
3586 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3587 send_latest(op
, m
->sb
.current_epoch
+1);
3591 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
// Apply an OSD boot to pending_inc: mark a previously-up instance down
// first (retrying after the proposal), then record new up/cluster/heartbeat
// addresses, weight, uuid, metadata, last-clean interval, laggy xinfo
// decay/update, feature bits, and optional auto mark-in; finally schedule
// C_Booted on proposal completion.
// NOTE(review): extraction artifact — embedded numbering skips (3599, 3603,
// 3608-3609, 3612, 3617-3619, 3623, 3633, 3638, 3645-3646, 3651, 3653-3654,
// 3656-3657, 3659, 3664-3666, 3672-3675, 3680, 3689, 3693, 3696-3697, 3705,
// 3712-3713, 3716, 3721-3722, 3726, 3728-3729, 3737-3738, 3740-3741,
// 3744-3747, 3749+); some statements/braces/returns are missing from this
// view (e.g. the assignment target before line 3714 is absent).
3598 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3600 op
->mark_osdmon_event(__func__
);
3601 auto m
= op
->get_req
<MOSDBoot
>();
3602 dout(7) << __func__
<< " from " << m
->get_source()
3604 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3605 << " cluster_addrs " << m
->cluster_addrs
3606 << " hb_back_addrs " << m
->hb_back_addrs
3607 << " hb_front_addrs " << m
->hb_front_addrs
3610 ceph_assert(m
->get_orig_source().is_osd());
3611 int from
= m
->get_orig_source().num();
3613 // does this osd exist?
3614 if (from
>= osdmap
.get_max_osd()) {
3615 dout(1) << "boot from osd." << from
<< " >= max_osd "
3616 << osdmap
.get_max_osd() << dendl
;
// Effective current state = committed state XOR any pending state flips.
3620 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3621 if (pending_inc
.new_state
.count(from
))
3622 oldstate
^= pending_inc
.new_state
[from
];
3624 // already up? mark down first?
3625 if (osdmap
.is_up(from
)) {
3626 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3627 << osdmap
.get_addrs(from
) << dendl
;
3628 // preprocess should have caught these; if not, assert.
3629 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3630 m
->get_orig_source_addrs()) ||
3631 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3632 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3634 if (pending_inc
.new_state
.count(from
) == 0 ||
3635 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3636 // mark previous guy down
3637 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3639 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3640 } else if (pending_inc
.new_up_client
.count(from
)) {
3641 // already prepared, just wait
3642 dout(7) << __func__
<< " already prepared, waiting on "
3643 << m
->get_orig_source_addr() << dendl
;
3644 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
))
;
// Record the booting instance's addresses in the pending incremental.
3647 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3648 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3649 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3650 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3652 down_pending_out
.erase(from
); // if any
3655 osd_weight
[from
] = m
->sb
.weight
;
3658 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3660 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3661 // preprocess should have caught this; if not, assert.
3662 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3663 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3667 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3668 const osd_info_t
& i
= osdmap
.get_info(from
);
3669 if (i
.up_from
> i
.lost_at
) {
3670 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3671 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3676 bufferlist osd_metadata
;
3677 encode(m
->metadata
, osd_metadata
);
3678 pending_metadata
[from
] = osd_metadata
;
3679 pending_metadata_rm
.erase(from
);
3681 // adjust last clean unmount epoch?
3682 const osd_info_t
& info
= osdmap
.get_info(from
);
3683 dout(10) << " old osd_info: " << info
<< dendl
;
3684 if (m
->sb
.mounted
> info
.last_clean_begin
||
3685 (m
->sb
.mounted
== info
.last_clean_begin
&&
3686 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3687 epoch_t begin
= m
->sb
.mounted
;
3688 epoch_t end
= m
->sb
.clean_thru
;
3690 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3691 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3692 << ") -> [" << begin
<< "-" << end
<< ")"
3694 pending_inc
.new_last_clean_interval
[from
] =
3695 pair
<epoch_t
,epoch_t
>(begin
, end
);
3698 if (pending_inc
.new_xinfo
.count(from
) == 0)
3699 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3700 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
// boot_epoch == 0 means a clean (non-laggy) boot: decay the laggy stats.
3701 if (m
->boot_epoch
== 0) {
3702 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3703 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3704 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3706 if (xi
.down_stamp
.sec()) {
3707 int interval
= ceph_clock_now().sec() -
3708 xi
.down_stamp
.sec();
3709 if (g_conf()->mon_osd_laggy_max_interval
&&
3710 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3711 interval
= g_conf()->mon_osd_laggy_max_interval
;
3714 interval
* g_conf()->mon_osd_laggy_weight
+
3715 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3717 xi
.laggy_probability
=
3718 g_conf()->mon_osd_laggy_weight
+
3719 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3720 dout(10) << " laggy, now xi " << xi
<< dendl
;
3723 // set features shared by the osd
3724 if (m
->osd_features
)
3725 xi
.features
= m
->osd_features
;
3727 xi
.features
= m
->get_connection()->get_features();
3730 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3731 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3732 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3733 (g_conf()->mon_osd_auto_mark_in
)) {
3734 if (can_mark_in(from
)) {
3735 if (xi
.old_weight
> 0) {
3736 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3739 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3742 dout(7) << __func__
<< " NOIN set, will not mark in "
3743 << m
->get_orig_source_addr() << dendl
;
3748 wait_for_finished_proposal(op
, new C_Booted(this, op
));
// Post-proposal boot completion: log the boot (clog when logit — TODO
// confirm, the conditional around line 3761 is not visible) and send the OSD
// the maps from its current_epoch+1 onward.
// NOTE(review): extraction artifact — embedded numbering skips (3754,
// 3759-3760, 3762-3764, 3766+); some statements/braces are missing from this
// view.
3753 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3755 op
->mark_osdmon_event(__func__
);
3756 auto m
= op
->get_req
<MOSDBoot
>();
3757 dout(7) << "_booted " << m
->get_orig_source_inst()
3758 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3761 mon
.clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3765 send_latest(op
, m
->sb
.current_epoch
+1);
// Validate an MOSDFull (nearfull/backfillfull/full state report): capability
// check, existence, instance-address match; when the masked state already
// equals the requested state, reply with the map and stop.  Otherwise fall
// through (to prepare_full).
// NOTE(review): extraction artifact — embedded numbering skips (3773, 3777,
// 3779, 3782-3783, 3787-3789, 3794-3795, 3803-3805, 3807, 3812-3814, 3817+);
// the declaration of `state` and several returns/braces are missing from
// this view.
3772 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3774 op
->mark_osdmon_event(__func__
);
3775 auto m
= op
->get_req
<MOSDFull
>();
3776 int from
= m
->get_orig_source().num();
3778 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3780 // check permissions, ignore if failed
3781 MonSession
*session
= op
->get_session();
3784 if (!session
->is_capable("osd", MON_CAP_X
)) {
3785 dout(0) << "MOSDFull from entity with insufficient privileges:"
3786 << session
->caps
<< dendl
;
3790 // ignore a full message from the osd instance that already went down
3791 if (!osdmap
.exists(from
)) {
3792 dout(7) << __func__
<< " ignoring full message from nonexistent "
3793 << m
->get_orig_source_inst() << dendl
;
3796 if ((!osdmap
.is_up(from
) &&
3797 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3798 m
->get_orig_source_addrs())) ||
3799 (osdmap
.is_up(from
) &&
3800 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3801 dout(7) << __func__
<< " ignoring full message from down "
3802 << m
->get_orig_source_inst() << dendl
;
3806 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3808 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3809 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3810 << " " << m
->get_orig_source_inst() << dendl
;
3811 _reply_map(op
, m
->version
);
3815 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3816 << " " << m
->get_orig_source_inst() << dendl
;
// Apply a full/backfillfull/nearfull state change to pending_inc.  The
// effective current state folds in any pending XOR flips; the pending entry
// is set to (committed-state & mask) XOR want_state so that applying the
// increment yields want_state.  Replies with the map once the proposal
// finishes.
// NOTE(review): extraction artifact — embedded numbering skips (3824, 3828,
// 3831, 3836-3838, 3842, 3845-3846, 3848, 3852, 3855-3856, 3858+); some
// statements/braces/returns are missing from this view.
3823 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3825 op
->mark_osdmon_event(__func__
);
3826 auto m
= op
->get_req
<MOSDFull
>();
3827 const int from
= m
->get_orig_source().num();
3829 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3830 const unsigned want_state
= m
->state
& mask
; // safety first
3832 unsigned cur_state
= osdmap
.get_state(from
);
3833 auto p
= pending_inc
.new_state
.find(from
);
3834 if (p
!= pending_inc
.new_state
.end()) {
3835 cur_state
^= p
->second
;
3839 set
<string
> want_state_set
, cur_state_set
;
3840 OSDMap::calc_state_set(want_state
, want_state_set
);
3841 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3843 if (cur_state
!= want_state
) {
3844 if (p
!= pending_inc
.new_state
.end()) {
3847 pending_inc
.new_state
[from
] = 0;
3849 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3850 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3851 << " -> " << want_state_set
<< dendl
;
3853 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3854 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3857 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
// Validate an MOSDAlive (up_thru request): capability check, up + address
// match; if the committed up_thru already satisfies the request, reply with
// the map here and skip prepare_alive.
// NOTE(review): extraction artifact — embedded numbering skips (3865, 3869,
// 3872-3873, 3877-3879, 3884-3887, 3889, 3892-3894, 3897+); some
// statements/braces/returns are missing from this view.
3864 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3866 op
->mark_osdmon_event(__func__
);
3867 auto m
= op
->get_req
<MOSDAlive
>();
3868 int from
= m
->get_orig_source().num();
3870 // check permissions, ignore if failed
3871 MonSession
*session
= op
->get_session();
3874 if (!session
->is_capable("osd", MON_CAP_X
)) {
3875 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3876 << session
->caps
<< dendl
;
3880 if (!osdmap
.is_up(from
) ||
3881 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3882 dout(7) << "preprocess_alive ignoring alive message from down "
3883 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3888 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3890 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3891 _reply_map(op
, m
->version
);
3895 dout(10) << "preprocess_alive want up_thru " << m
->want
3896 << " from " << m
->get_orig_source_inst() << dendl
;
// Record the OSD's up_thru in pending_inc (via update_up_thru) and reply
// with the map once the proposal finishes.
// NOTE(review): extraction artifact — embedded numbering skips (3904, 3908,
// 3911-3912, 3915, 3918+); some statements/braces/returns are missing from
// this view.
3903 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3905 op
->mark_osdmon_event(__func__
);
3906 auto m
= op
->get_req
<MOSDAlive
>();
3907 int from
= m
->get_orig_source().num();
3909 if (0) { // we probably don't care much about these
3910 mon
.clog
->debug() << m
->get_orig_source_inst() << " alive";
3913 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3914 << " from " << m
->get_orig_source_inst() << dendl
;
3916 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3917 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
// Reply to op with the osdmap starting at epoch e.  The actual send call is
// not visible here (embedded numbering skips 3922, 3926+).
3921 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3923 op
->mark_osdmon_event(__func__
);
3924 dout(7) << "_reply_map " << e
3925 << " from " << op
->get_req()->get_orig_source_inst()
// Validate an MOSDPGCreated: session and capability check only; the
// "created!" is always forwarded to the leader (see trailing comment).
// NOTE(review): extraction artifact — embedded numbering skips (3932,
// 3937-3938, 3940-3941, 3945-3946, 3948+); the returns/braces are missing
// from this view.
3931 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3933 op
->mark_osdmon_event(__func__
);
3934 auto m
= op
->get_req
<MOSDPGCreated
>();
3935 dout(10) << __func__
<< " " << *m
<< dendl
;
3936 auto session
= op
->get_session();
3939 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3942 if (!session
->is_capable("osd", MON_CAP_X
)) {
3943 derr
<< __func__
<< " received from entity "
3944 << "with insufficient privileges " << session
->caps
<< dendl
;
3947 // always forward the "created!" to the leader
// Record a pg-created notification: verify the sender is an up OSD with
// matching addresses, then queue the pgid into pending_created_pgs.
// NOTE(review): extraction artifact — embedded numbering skips (3952,
// 3963-3964, 3966+); returns/braces are missing from this view.
3951 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3953 op
->mark_osdmon_event(__func__
);
3954 auto m
= op
->get_req
<MOSDPGCreated
>();
3955 dout(10) << __func__
<< " " << *m
<< dendl
;
3956 auto src
= m
->get_orig_source();
3957 auto from
= src
.num();
3958 if (!src
.is_osd() ||
3959 !mon
.osdmon()->osdmap
.is_up(from
) ||
3960 !mon
.osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3961 m
->get_orig_source_addrs())) {
3962 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
3965 pending_created_pgs
.push_back(m
->pgid
);
// Validate an MOSDPGReadyToMerge: session/capability check, pool existence,
// and sanity of the pgid against the pool's pg_num / pg_num_pending (the
// message must target the current merge source pg).
// NOTE(review): extraction artifact — embedded numbering skips (3970, 3976,
// 3978-3979, 3983-3984, 3986, 3988-3989, 3992-3993, 3996-3997, 4000+);
// returns/braces are missing from this view.
3969 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3971 op
->mark_osdmon_event(__func__
);
3972 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3973 dout(10) << __func__
<< " " << *m
<< dendl
;
3974 const pg_pool_t
*pi
;
3975 auto session
= op
->get_session();
3977 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3980 if (!session
->is_capable("osd", MON_CAP_X
)) {
3981 derr
<< __func__
<< " received from entity "
3982 << "with insufficient privileges " << session
->caps
<< dendl
;
3985 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3987 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3990 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3991 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
3994 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
3995 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
3998 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
3999 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
// Apply a ready-to-merge: re-check pg_num/pg_num_pending against the pending
// pool (retrying on a race), then either dec_pg_num (commit the merge) or
// back it off by resetting pg_num_pending, bump last_change and the
// pre-nautilus resend epoch, and store the pool in pending_inc.  Includes an
// optional test-only pg_num bounce injection.
// NOTE(review): extraction artifact — embedded numbering skips (4010, 4014,
// 4017, 4023, 4025-4028, 4030-4032, 4036, 4039-4040, 4044, 4046, 4048-4049,
// 4061, 4063+); the declaration of `p` and the enclosing conditionals'
// braces/returns are missing from this view.
4009 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
4011 op
->mark_osdmon_event(__func__
);
4012 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
4013 dout(10) << __func__
<< " " << *m
<< dendl
;
4015 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
4016 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
4018 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
4019 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
4020 p
.get_pg_num_pending() > m
->pgid
.ps()) {
4021 dout(10) << __func__
4022 << " race with concurrent pg_num[_pending] update, will retry"
4024 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
4029 p
.dec_pg_num(m
->pgid
,
4033 m
->last_epoch_started
,
4034 m
->last_epoch_clean
);
4035 p
.last_change
= pending_inc
.epoch
;
4037 // back off the merge attempt!
4038 p
.set_pg_num_pending(p
.get_pg_num());
4041 // force pre-nautilus clients to resend their ops, since they
4042 // don't understand pg_num_pending changes form a new interval
4043 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
4045 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
4047 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
4050 prob
> (double)(rand() % 1000)/1000.0) {
4051 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
4052 auto n
= new MMonCommand(mon
.monmap
->get_fsid());
4053 n
->set_connection(m
->get_connection());
4054 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4055 osdmap
.get_pool_name(m
->pgid
.pool()) +
4056 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4057 stringify(m
->pgid
.ps() + 1) + "\"}" };
4058 MonOpRequestRef nop
= mon
.op_tracker
.create_request
<MonOpRequest
>(n
);
4059 nop
->set_type_service();
4060 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
4062 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
// Validate an MOSDPGTemp: capability check, sender must be up with matching
// addresses and (per pg) still the acting primary; count per-pg mappings
// that are no-ops or stale so the whole message can be answered here when
// every entry is ignorable.
// NOTE(review): extraction artifact — embedded numbering skips (4072,
// 4078-4079, 4081-4082, 4086-4088, 4093-4100, 4105, 4108, 4116, 4119-4122,
// 4131, 4134-4138, 4141-4142, 4149-4151, 4154-4155, 4158+); the ignore_cnt
// increments, returns and braces are missing from this view.
4071 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
4073 auto m
= op
->get_req
<MOSDPGTemp
>();
4074 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
4075 mempool::osdmap::vector
<int> empty
;
4076 int from
= m
->get_orig_source().num();
4077 size_t ignore_cnt
= 0;
4080 MonSession
*session
= op
->get_session();
4083 if (!session
->is_capable("osd", MON_CAP_X
)) {
4084 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4085 << session
->caps
<< dendl
;
4089 if (!osdmap
.is_up(from
) ||
4090 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
4091 dout(7) << "ignoring pgtemp message from down "
4092 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
4101 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4102 dout(20) << " " << p
->first
4103 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
4104 << " -> " << p
->second
<< dendl
;
4106 // does the pool exist?
4107 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
4109 * 1. If the osdmap does not have the pool, it means the pool has been
4110 * removed in-between the osd sending this message and us handling it.
4111 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4112 * not exist in the pending either, as the osds would not send a
4113 * message about a pool they know nothing about (yet).
4114 * 3. However, if the pool does exist in the pending, then it must be a
4115 * new pool, and not relevant to this message (see 1).
4117 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4118 << ": pool has been removed" << dendl
;
4123 int acting_primary
= -1;
4124 osdmap
.pg_to_up_acting_osds(
4125 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
4126 if (acting_primary
!= from
) {
4127 /* If the source isn't the primary based on the current osdmap, we know
4128 * that the interval changed and that we can discard this message.
4129 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4130 * which of two pg temp mappings on the same pg is more recent.
4132 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4133 << ": primary has changed" << dendl
;
4139 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
4140 osdmap
.primary_temp
->count(p
->first
)))
4143 // NOTE: we assume that this will clear pg_primary, so consider
4144 // an existing pg_primary field to imply a change
4145 if (p
->second
.size() &&
4146 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
4147 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
4148 osdmap
.primary_temp
->count(p
->first
)))
4152 // should we ignore all the pgs?
4153 if (ignore_cnt
== m
->pg_temp
.size())
4156 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
4157 _reply_map(op
, m
->map_epoch
);
// Raise osd.<from>'s up_thru in pending_inc to up_thru, but only if it is
// newer than both the committed value and any already-pending value.
// NOTE(review): extraction artifact — embedded numbering skips (4166, 4171,
// 4175+); braces are missing from this view.
4165 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4167 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4168 auto ut
= pending_inc
.new_up_thru
.find(from
);
4169 if (ut
!= pending_inc
.new_up_thru
.end()) {
4170 old_up_thru
= ut
->second
;
4172 if (up_thru
> old_up_thru
) {
4173 // set up_thru too, so the osd doesn't have to ask again
4174 pending_inc
.new_up_thru
[from
] = up_thru
;
// Apply pg_temp mappings to pending_inc, skipping pools that are pending
// removal or already gone; unconditionally clears primary_temp for each pg
// touched, bumps up_thru for the sender, and replies after the proposal.
// NOTE(review): extraction artifact — embedded numbering skips (4179,
// 4189-4190, 4194-4195, 4198, 4205-4206, 4209, 4211+); some
// statements/braces/returns are missing from this view.
4178 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4180 op
->mark_osdmon_event(__func__
);
4181 auto m
= op
->get_req
<MOSDPGTemp
>();
4182 int from
= m
->get_orig_source().num();
4183 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4184 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4185 uint64_t pool
= p
->first
.pool();
4186 if (pending_inc
.old_pools
.count(pool
)) {
4187 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4188 << ": pool pending removal" << dendl
;
4191 if (!osdmap
.have_pg_pool(pool
)) {
4192 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4193 << ": pool has been removed" << dendl
;
4196 pending_inc
.new_pg_temp
[p
->first
] =
4197 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4199 // unconditionally clear pg_primary (until this message can encode
4200 // a change for that, too.. at which point we need to also fix
4201 // preprocess_pg_temp)
4202 if (osdmap
.primary_temp
->count(p
->first
) ||
4203 pending_inc
.new_primary_temp
.count(p
->first
))
4204 pending_inc
.new_primary_temp
[p
->first
] = -1;
4207 // set up_thru too, so the osd doesn't have to ask again
4208 update_up_thru(from
, m
->map_epoch
);
4210 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
// Validate an MRemoveSnaps: capability check ("osd pool rmsnap"), then scan
// the requested snaps; non-existent pools are skipped, and snaps that are
// new (beyond snap_seq or not yet recorded as removed) cause fall-through to
// prepare_remove_snaps.  For octopus+ peers an acknowledging MRemoveSnaps is
// sent back when nothing needs doing.
// NOTE(review): extraction artifact — embedded numbering skips (4218, 4222,
// 4225-4227, 4229, 4235-4237, 4240, 4244-4245, 4249, 4252-4256, 4261+);
// some statements/braces/returns are missing from this view.
4217 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4219 op
->mark_osdmon_event(__func__
);
4220 auto m
= op
->get_req
<MRemoveSnaps
>();
4221 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4223 // check privilege, ignore if failed
4224 MonSession
*session
= op
->get_session();
4228 if (!session
->caps
.is_capable(
4230 session
->entity_name
,
4231 "osd", "osd pool rmsnap", {}, true, true, false,
4232 session
->get_peer_socket_addr())) {
4233 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4234 << session
->caps
<< dendl
;
4238 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4239 q
!= m
->snaps
.end();
4241 if (!osdmap
.have_pg_pool(q
->first
)) {
4242 dout(10) << " ignoring removed_snaps " << q
->second
4243 << " on non-existent pool " << q
->first
<< dendl
;
4246 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4247 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4248 p
!= q
->second
.end();
4250 if (*p
> pi
->get_snap_seq() ||
4251 !_is_removed_snap(q
->first
, *p
)) {
4257 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4258 auto reply
= make_message
<MRemoveSnaps
>();
4259 reply
->snaps
= m
->snaps
;
4260 mon
.send_reply(op
, reply
.detach());
// Apply snapshot removals to pending_inc: for each not-yet-removed snap,
// either add it to the pool's legacy removed_snaps set (pre-octopus) or set
// FLAG_SELFMANAGED_SNAPS, advance snap_seq/snap_epoch and queue it in
// new_removed_snaps (octopus+).  Octopus+ peers get an ack after the
// proposal commits.
// NOTE(review): extraction artifact — embedded numbering skips (4268, 4272,
// 4277-4279, 4292, 4298, 4303-4306, 4311+); some statements/braces/returns
// (including the else between the two release paths) are missing from this
// view.
4267 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4269 op
->mark_osdmon_event(__func__
);
4270 auto m
= op
->get_req
<MRemoveSnaps
>();
4271 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4273 for (auto& [pool
, snaps
] : m
->snaps
) {
4274 if (!osdmap
.have_pg_pool(pool
)) {
4275 dout(10) << " ignoring removed_snaps " << snaps
4276 << " on non-existent pool " << pool
<< dendl
;
4280 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4281 for (auto s
: snaps
) {
4282 if (!_is_removed_snap(pool
, s
) &&
4283 (!pending_inc
.new_pools
.count(pool
) ||
4284 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4285 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4286 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4287 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4288 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4289 newpi
->removed_snaps
.insert(s
);
4290 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4291 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4293 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4294 if (s
> newpi
->get_snap_seq()) {
4295 dout(10) << " pool " << pool
<< " snap_seq "
4296 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4297 newpi
->set_snap_seq(s
);
4299 newpi
->set_snap_epoch(pending_inc
.epoch
);
4300 dout(10) << " added pool " << pool
<< " snap " << s
4301 << " to removed_snaps queue" << dendl
;
4302 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4307 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4308 auto reply
= make_message
<MRemoveSnaps
>();
4309 reply
->snaps
= m
->snaps
;
4310 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
// Answer an MMonGetPurgedSnaps: iterate the mon store's OSD_SNAP_PREFIX keys
// of the form "purged_epoch_%lx" starting at m->start, decode each epoch's
// purged-snap map into r (bounded by a size estimate in n), and reply with
// an MMonGetPurgedSnapsReply covering [start, epoch].
// NOTE(review): extraction artifact — embedded numbering skips (4317, 4321,
// 4323, 4326, 4330-4331, 4334, 4337-4338, 4341-4343, 4346-4349, 4351-4352,
// 4354-4358, 4362+); the decode calls, loop controls, and braces are missing
// from this view; the size-limit check itself is absent.
4316 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4318 op
->mark_osdmon_event(__func__
);
4319 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4320 dout(7) << __func__
<< " " << *m
<< dendl
;
4322 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4324 string k
= make_purged_snap_epoch_key(m
->start
);
4325 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
4327 unsigned long epoch
= m
->last
;
4328 while (it
->valid()) {
4329 if (it
->key().find("purged_epoch_") != 0) {
4332 string k
= it
->key();
4333 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4335 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
4336 } else if (epoch
> m
->last
) {
4339 bufferlist bl
= it
->value();
4340 auto p
= bl
.cbegin();
4344 } catch (ceph::buffer::error
& e
) {
4345 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
4350 n
+= 4 + v
.size() * 16;
4353 // impose a semi-arbitrary limit to message size
4359 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4360 reply
->purged_snaps
.swap(r
);
4361 mon
.send_reply(op
, reply
.detach());
// Validate an OSD beacon: session/capability check only; beacons are always
// forwarded to the leader (see trailing comment), which handles down-marking
// of silent OSDs.
// NOTE(review): extraction artifact — embedded numbering skips (4368, 4370,
// 4372-4373, 4375-4376, 4380-4381, 4385+); returns/braces are missing from
// this view.
4367 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4369 op
->mark_osdmon_event(__func__
);
4371 auto session
= op
->get_session();
4374 dout(10) << __func__
<< " no monitor session!" << dendl
;
4377 if (!session
->is_capable("osd", MON_CAP_X
)) {
4378 derr
<< __func__
<< " received from entity "
4379 << "with insufficient privileges " << session
->caps
<< dendl
;
4382 // Always forward the beacon to the leader, even if they are the same as
4383 // the old one. The leader will mark as down osds that haven't sent
4384 // beacon for a few minutes.
4388 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4390 op
->mark_osdmon_event(__func__
);
4391 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4392 const auto src
= beacon
->get_orig_source();
4393 dout(10) << __func__
<< " " << *beacon
4394 << " from " << src
<< dendl
;
4395 int from
= src
.num();
4397 if (!src
.is_osd() ||
4398 !osdmap
.is_up(from
) ||
4399 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4400 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4401 // share some new maps with this guy in case it may not be
4402 // aware of its own deadness...
4403 send_latest(op
, beacon
->version
+1);
4405 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4409 last_osd_report
[from
].first
= ceph_clock_now();
4410 last_osd_report
[from
].second
= beacon
->osd_beacon_report_interval
;
4411 osd_epochs
[from
] = beacon
->version
;
4413 for (const auto& pg
: beacon
->pgs
) {
4414 if (auto* pool
= osdmap
.get_pg_pool(pg
.pool()); pool
!= nullptr) {
4415 unsigned pg_num
= pool
->get_pg_num();
4416 last_epoch_clean
.report(pg_num
, pg
, beacon
->min_last_epoch_clean
);
4420 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4421 beacon
->last_purged_snaps_scrub
) {
4422 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4423 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4425 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4426 beacon
->last_purged_snaps_scrub
;
4436 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4438 op
->mark_osdmon_event(__func__
);
4439 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4440 << " start " << start
<< dendl
;
4444 send_incremental(op
, start
);
4448 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
4450 MOSDMap
*r
= new MOSDMap(mon
.monmap
->fsid
, features
);
4451 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4452 r
->oldest_map
= get_first_committed();
4453 r
->newest_map
= osdmap
.get_epoch();
4457 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4459 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4460 << std::hex
<< features
<< std::dec
<< dendl
;
4461 MOSDMap
*m
= new MOSDMap(mon
.monmap
->fsid
, features
);
4462 m
->oldest_map
= get_first_committed();
4463 m
->newest_map
= osdmap
.get_epoch();
4465 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4467 int err
= get_version(e
, features
, bl
);
4469 ceph_assert(bl
.length());
4470 // if (get_version(e, bl) > 0) {
4471 dout(20) << "build_incremental inc " << e
<< " "
4472 << bl
.length() << " bytes" << dendl
;
4473 m
->incremental_maps
[e
] = bl
;
4475 ceph_assert(err
== -ENOENT
);
4476 ceph_assert(!bl
.length());
4477 get_version_full(e
, features
, bl
);
4478 if (bl
.length() > 0) {
4479 //else if (get_version("full", e, bl) > 0) {
4480 dout(20) << "build_incremental full " << e
<< " "
4481 << bl
.length() << " bytes" << dendl
;
4484 ceph_abort(); // we should have all maps.
4491 void OSDMonitor::send_full(MonOpRequestRef op
)
4493 op
->mark_osdmon_event(__func__
);
4494 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
4495 mon
.send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4498 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4500 op
->mark_osdmon_event(__func__
);
4502 MonSession
*s
= op
->get_session();
4506 // oh, we can tell the other mon to do it
4507 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4509 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4510 r
->send_osdmap_first
= first
;
4511 s
->proxy_con
->send_message(r
);
4512 op
->mark_event("reply: send routed send_osdmap_first reply");
4515 send_incremental(first
, s
, false, op
);
4519 void OSDMonitor::send_incremental(epoch_t first
,
4520 MonSession
*session
,
4522 MonOpRequestRef req
)
4524 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4525 << " to " << session
->name
<< dendl
;
4527 // get feature of the peer
4528 // use quorum_con_features, if it's an anonymous connection.
4529 uint64_t features
= session
->con_features
? session
->con_features
:
4530 mon
.get_quorum_con_features();
4532 if (first
<= session
->osd_epoch
) {
4533 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4534 << session
->osd_epoch
<< dendl
;
4535 first
= session
->osd_epoch
+ 1;
4538 if (first
< get_first_committed()) {
4539 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4540 m
->oldest_map
= get_first_committed();
4541 m
->newest_map
= osdmap
.get_epoch();
4543 first
= get_first_committed();
4545 int err
= get_version_full(first
, features
, bl
);
4546 ceph_assert(err
== 0);
4547 ceph_assert(bl
.length());
4548 dout(20) << "send_incremental starting with base full "
4549 << first
<< " " << bl
.length() << " bytes" << dendl
;
4550 m
->maps
[first
] = bl
;
4553 mon
.send_reply(req
, m
);
4554 session
->osd_epoch
= first
;
4557 session
->con
->send_message(m
);
4558 session
->osd_epoch
= first
;
4563 while (first
<= osdmap
.get_epoch()) {
4564 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4565 osdmap
.get_epoch());
4566 MOSDMap
*m
= build_incremental(first
, last
, features
);
4569 // send some maps. it may not be all of them, but it will get them
4571 mon
.send_reply(req
, m
);
4573 session
->con
->send_message(m
);
4576 session
->osd_epoch
= last
;
4582 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4584 return get_version(ver
, mon
.get_quorum_con_features(), bl
);
4587 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4589 OSDMap::Incremental inc
;
4590 auto q
= bl
.cbegin();
4592 // always encode with subset of osdmap's canonical features
4593 uint64_t f
= features
& inc
.encode_features
;
4594 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4597 if (inc
.fullmap
.length()) {
4598 // embedded full map?
4600 m
.decode(inc
.fullmap
);
4601 inc
.fullmap
.clear();
4602 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4604 if (inc
.crush
.length()) {
4605 // embedded crush map
4607 auto p
= inc
.crush
.cbegin();
4610 c
.encode(inc
.crush
, f
);
4612 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4615 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4618 auto q
= bl
.cbegin();
4620 // always encode with subset of osdmap's canonical features
4621 uint64_t f
= features
& m
.get_encoding_features();
4622 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4625 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4628 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
4630 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4631 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4634 int ret
= PaxosService::get_version(ver
, bl
);
4638 // NOTE: this check is imprecise; the OSDMap encoding features may
4639 // be a subset of the latest mon quorum features, but worst case we
4640 // reencode once and then cache the (identical) result under both
4642 if (significant_features
!=
4643 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4644 reencode_incremental_map(bl
, features
);
4646 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4650 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4653 int err
= get_version(ver
, inc_bl
);
4654 ceph_assert(err
== 0);
4655 ceph_assert(inc_bl
.length());
4657 auto p
= inc_bl
.cbegin();
4659 dout(10) << __func__
<< " "
4660 << " epoch " << inc
.epoch
4661 << " inc_crc " << inc
.inc_crc
4662 << " full_crc " << inc
.full_crc
4663 << " encode_features " << inc
.encode_features
<< dendl
;
4667 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4669 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4671 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4672 if (closest_pinned
== 0) {
4675 if (closest_pinned
> ver
) {
4676 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4678 ceph_assert(closest_pinned
<= ver
);
4680 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4682 // get osdmap incremental maps and apply on top of this one.
4684 bool has_cached_osdmap
= false;
4685 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4686 if (full_osd_cache
.lookup({v
, mon
.get_quorum_con_features()},
4688 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4690 has_cached_osdmap
= true;
4695 if (!has_cached_osdmap
) {
4696 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4698 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4699 << " not available! error: " << cpp_strerror(err
) << dendl
;
4701 ceph_assert(err
== 0);
4704 ceph_assert(osdm_bl
.length());
4707 osdm
.decode(osdm_bl
);
4709 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4710 << " e" << osdm
.epoch
4711 << " crc " << osdm
.get_crc()
4712 << " -- applying incremental maps." << dendl
;
4714 uint64_t encode_features
= 0;
4715 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4716 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4718 OSDMap::Incremental inc
;
4719 int err
= get_inc(v
, inc
);
4720 ceph_assert(err
== 0);
4722 encode_features
= inc
.encode_features
;
4724 err
= osdm
.apply_incremental(inc
);
4725 ceph_assert(err
== 0);
4727 // this block performs paranoid checks on map retrieval
4728 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4729 inc
.full_crc
!= 0) {
4731 uint64_t f
= encode_features
;
4733 f
= (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4736 // encode osdmap to force calculating crcs
4738 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4739 // decode osdmap to compare crcs with what's expected by incremental
4743 if (tosdm
.get_crc() != inc
.full_crc
) {
4745 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4746 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4747 ceph_abort_msg("osdmap crc mismatch");
4751 // note: we cannot add the recently computed map to the cache, as is,
4752 // because we have not encoded the map into a bl.
4755 if (!encode_features
) {
4756 dout(10) << __func__
4757 << " last incremental map didn't have features;"
4758 << " defaulting to quorum's or all" << dendl
;
4760 (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4762 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
4767 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4769 return get_version_full(ver
, mon
.get_quorum_con_features(), bl
);
4772 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4775 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4776 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4779 int ret
= PaxosService::get_version_full(ver
, bl
);
4780 if (ret
== -ENOENT
) {
4782 ret
= get_full_from_pinned_map(ver
, bl
);
4787 // NOTE: this check is imprecise; the OSDMap encoding features may
4788 // be a subset of the latest mon quorum features, but worst case we
4789 // reencode once and then cache the (identical) result under both
4791 if (significant_features
!=
4792 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4793 reencode_full_map(bl
, features
);
4795 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4799 epoch_t
OSDMonitor::blocklist(const entity_addrvec_t
& av
, utime_t until
)
4801 dout(10) << "blocklist " << av
<< " until " << until
<< dendl
;
4802 for (auto a
: av
.v
) {
4803 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4804 a
.set_type(entity_addr_t::TYPE_ANY
);
4806 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4808 pending_inc
.new_blocklist
[a
] = until
;
4810 return pending_inc
.epoch
;
4813 epoch_t
OSDMonitor::blocklist(entity_addr_t a
, utime_t until
)
4815 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4816 a
.set_type(entity_addr_t::TYPE_ANY
);
4818 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4820 dout(10) << "blocklist " << a
<< " until " << until
<< dendl
;
4821 pending_inc
.new_blocklist
[a
] = until
;
4822 return pending_inc
.epoch
;
4826 void OSDMonitor::check_osdmap_subs()
4828 dout(10) << __func__
<< dendl
;
4829 if (!osdmap
.get_epoch()) {
4832 auto osdmap_subs
= mon
.session_map
.subs
.find("osdmap");
4833 if (osdmap_subs
== mon
.session_map
.subs
.end()) {
4836 auto p
= osdmap_subs
->second
->begin();
4840 check_osdmap_sub(sub
);
4844 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4846 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4847 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4848 if (sub
->next
<= osdmap
.get_epoch()) {
4850 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4852 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4854 mon
.session_map
.remove_sub(sub
);
4856 sub
->next
= osdmap
.get_epoch() + 1;
4860 void OSDMonitor::check_pg_creates_subs()
4862 if (!osdmap
.get_num_up_osds()) {
4865 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4866 mon
.with_session_map([this](const MonSessionMap
& session_map
) {
4867 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4868 if (pg_creates_subs
== session_map
.subs
.end()) {
4871 for (auto sub
: *pg_creates_subs
->second
) {
4872 check_pg_creates_sub(sub
);
4877 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4879 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4880 ceph_assert(sub
->type
== "osd_pg_creates");
4881 // only send these if the OSD is up. we will check_subs() when they do
4882 // come up so they will get the creates then.
4883 if (sub
->session
->name
.is_osd() &&
4884 mon
.osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4885 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4886 sub
->session
->con
.get(),
4891 void OSDMonitor::do_application_enable(int64_t pool_id
,
4892 const std::string
&app_name
,
4893 const std::string
&app_key
,
4894 const std::string
&app_value
,
4897 ceph_assert(paxos
.is_plugged() && is_writeable());
4899 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4902 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4904 auto pp
= osdmap
.get_pg_pool(pool_id
);
4905 ceph_assert(pp
!= nullptr);
4908 if (pending_inc
.new_pools
.count(pool_id
)) {
4909 p
= pending_inc
.new_pools
[pool_id
];
4912 if (app_key
.empty()) {
4913 p
.application_metadata
.insert({app_name
, {}});
4916 p
.application_metadata
[app_name
][app_key
] = app_value
;
4918 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4921 p
.last_change
= pending_inc
.epoch
;
4922 pending_inc
.new_pools
[pool_id
] = p
;
4925 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4926 pool_opts_t::key_t opt
,
4927 pool_opts_t::value_t val
)
4929 auto p
= pending_inc
.new_pools
.try_emplace(
4930 pool_id
, *osdmap
.get_pg_pool(pool_id
));
4931 p
.first
->second
.opts
.set(opt
, val
);
4934 unsigned OSDMonitor::scan_for_creating_pgs(
4935 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4936 const mempool::osdmap::set
<int64_t>& removed_pools
,
4938 creating_pgs_t
* creating_pgs
) const
4940 unsigned queued
= 0;
4941 for (auto& p
: pools
) {
4942 int64_t poolid
= p
.first
;
4943 if (creating_pgs
->created_pools
.count(poolid
)) {
4944 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4947 const pg_pool_t
& pool
= p
.second
;
4948 int ruleno
= pool
.get_crush_rule();
4949 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4952 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4953 const auto created
= pool
.get_last_change();
4954 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4955 dout(10) << __func__
<< " no change in pool " << poolid
4956 << " " << pool
<< dendl
;
4959 if (removed_pools
.count(poolid
)) {
4960 dout(10) << __func__
<< " pool is being removed: " << poolid
4961 << " " << pool
<< dendl
;
4964 dout(10) << __func__
<< " queueing pool create for " << poolid
4965 << " " << pool
<< dendl
;
4966 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
// Recompute creating_pgs_by_osd_epoch: for every pg still being created,
// look up its current acting primary from the mapping job and bucket the
// pg under (primary osd, epoch).  The epoch is bumped to the mapping's
// epoch when the create target changed, so OSDs re-learn the create.
// NOTE(review): extraction dropped some lines here (a continue, the spgid
// declaration, and else branches of the retarget check); verify against
// upstream before changing control flow.
4973 void OSDMonitor::update_creating_pgs()
4975 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4976 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4977 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
// creating_pgs and the by-osd index are shared with other threads
4978 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4979 for (const auto& pg
: creating_pgs
.pgs
) {
4980 int acting_primary
= -1;
4981 auto pgid
= pg
.first
;
// skip pgs that no longer exist in the current map
4982 if (!osdmap
.pg_exists(pgid
)) {
4983 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4987 auto mapped
= pg
.second
.create_epoch
;
4988 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4990 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4991 // check the previous creating_pgs, look for the target to whom the pg was
4992 // previously mapped
4993 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
4994 const auto last_acting_primary
= pgs_by_epoch
.first
;
4995 for (auto& pgs
: pgs_by_epoch
.second
) {
4996 if (pgs
.second
.count(spgid
)) {
4997 if (last_acting_primary
== acting_primary
) {
5000 dout(20) << __func__
<< " " << pgid
<< " "
5001 << " acting_primary:" << last_acting_primary
5002 << " -> " << acting_primary
<< dendl
;
5003 // note epoch if the target of the create message changed.
5004 mapped
= mapping
.get_epoch();
5009 mapped
= mapping
.get_epoch();
5013 dout(10) << __func__
<< " will instruct osd." << acting_primary
5014 << " to create " << pgid
<< "@" << mapped
<< dendl
;
5015 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
// atomically publish the rebuilt index and remember the mapping epoch
5017 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
5018 creating_pgs_epoch
= mapping
.get_epoch();
// Send pending pg-create messages for pgs whose acting primary is `osd`,
// starting at epoch `next`, over `con`.  Uses MOSDPGCreate2 for
// nautilus+ OSDs and the legacy MOSDPGCreate otherwise.  Returns the
// next epoch the subscription is current through.
// NOTE(review): extraction dropped some lines here (early returns, the
// old/new message selection branches and the final return); verify
// against upstream before changing control flow.
5021 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
5023 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
5024 << " " << creating_pgs_by_osd_epoch
<< dendl
;
// creating_pgs_by_osd_epoch is rebuilt by update_creating_pgs() under
// the same lock
5025 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
5026 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
5027 dout(20) << __func__
5028 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
5029 // the subscribers will be updated when the mapping is completed anyway
5032 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
5033 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
5035 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
5037 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
5038 MOSDPGCreate2
*m
= nullptr;
// pick message format based on the minimum required OSD release
5040 bool old
= osdmap
.require_osd_release
< ceph_release_t::nautilus
;
5043 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
5044 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
5045 auto epoch
= epoch_pgs
->first
;
5046 auto& pgs
= epoch_pgs
->second
;
5047 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5048 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
5050 for (auto& pg
: pgs
) {
5051 // Need the create time from the monitor using its clock to set
5052 // last_scrub_stamp upon pg creation.
5053 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
5054 ceph_assert(create
!= creating_pgs
.pgs
.end());
// legacy path: per-pg entry in MOSDPGCreate
5057 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
5059 oldm
->mkpg
.emplace(pg
.pgid
,
5060 pg_create_t
{create
->second
.create_epoch
, pg
.pgid
, 0});
5061 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.create_stamp
);
// modern path: MOSDPGCreate2 with (epoch, ctime) per spg_t
5064 m
= new MOSDPGCreate2(creating_pgs_epoch
);
5066 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
5067 create
->second
.create_stamp
));
5068 if (create
->second
.history
.epoch_created
) {
5069 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
5070 << " " << create
->second
.past_intervals
<< dendl
;
5071 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
5072 create
->second
.past_intervals
));
5075 dout(20) << __func__
<< " will create " << pg
5076 << " at " << create
->second
.create_epoch
<< dendl
;
5080 con
->send_message(m
);
5082 con
->send_message(oldm
);
5084 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5085 << " has nothing to send" << dendl
;
5089 // sub is current through last + 1
// Periodic housekeeping for the OSD monitor.  All monitors refresh the
// osdmap manifest and retune the priority cache; only the leader then
// checks OSD report timeouts and failures, auto-marks long-down OSDs
// out (subject to grace scaling and subtree limits), expires blocklist
// entries, prunes purged snaps, and decides whether to propose a new map.
// NOTE(review): extraction dropped many lines in this function (loop
// variable setup for `o`/`down`, several closing branches, and the
// do_propose plumbing); verify against upstream before editing.
5096 void OSDMonitor::tick()
5098 if (!is_active()) return;
5100 dout(10) << osdmap
<< dendl
;
5102 // always update osdmap manifest, regardless of being the leader.
5103 load_osdmap_manifest();
5105 // always tune priority cache manager memory on leader and peons
5106 if (ceph_using_tcmalloc() && mon_memory_autotune
) {
5107 std::lock_guard
l(balancer_lock
);
5108 if (pcm
!= nullptr) {
5111 _set_new_cache_sizes();
5112 dout(10) << "tick balancer "
5113 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
5114 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
5115 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
5116 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
5118 dout(10) << "tick balancer "
5119 << " full cache_bytes: " << full_cache
->get_cache_bytes()
5120 << " full comtd_bytes: " << full_cache
->get_committed_size()
5121 << " full used_bytes: " << full_cache
->_get_used_bytes()
5122 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
// everything below is leader-only
5127 if (!mon
.is_leader()) return;
5129 bool do_propose
= false;
5130 utime_t now
= ceph_clock_now();
5132 if (handle_osd_timeouts(now
, last_osd_report
)) {
5137 if (check_failures(now
)) {
5141 // Force a proposal if we need to prune; pruning is performed on
5142 // ``encode_pending()``, hence why we need to regularly trigger a proposal
5143 // even if there's nothing going on.
5144 if (is_prune_enabled() && should_prune()) {
5148 // mark down osds out?
5150 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
5151 * influence at all. The decision is made based on the ratio of "in" osds,
5152 * and the function returns false if this ratio is lower that the minimum
5153 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
5155 if (can_mark_out(-1)) {
5156 string down_out_subtree_limit
= g_conf().get_val
<string
>(
5157 "mon_osd_down_out_subtree_limit");
5158 set
<int> down_cache
; // quick cache of down subtrees
5160 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
5161 while (i
!= down_pending_out
.end()) {
5167 if (osdmap
.is_down(o
) &&
// compute the auto-out grace, optionally scaled by laggy history
5170 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
5171 utime_t grace
= orig_grace
;
5172 double my_grace
= 0.0;
5174 if (g_conf()->mon_osd_adjust_down_out_interval
) {
5175 // scale grace period the same way we do the heartbeat grace.
5176 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
5177 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
5178 double decay_k
= ::log(.5) / halflife
;
5179 double decay
= exp((double)down
* decay_k
);
5180 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
5181 << " down for " << down
<< " decay " << decay
<< dendl
;
5182 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
5186 // is this an entire large subtree down?
5187 if (down_out_subtree_limit
.length()) {
5188 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
5190 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
5191 dout(10) << "tick entire containing " << down_out_subtree_limit
5192 << " subtree for osd." << o
5193 << " is down; resetting timer" << dendl
;
5194 // reset timer, too.
5195 down_pending_out
[o
] = now
;
// decide between regular auto-out and destroyed-osd auto-out
5201 bool down_out
= !osdmap
.is_destroyed(o
) &&
5202 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5203 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5204 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5205 // this is not precise enough as we did not make a note when this osd
5206 // was marked as destroyed, but let's not bother with that
5207 // complexity for now.
5208 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5209 if (down_out
|| destroyed_out
) {
5210 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5211 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5212 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5214 // set the AUTOOUT bit.
5215 if (pending_inc
.new_state
.count(o
) == 0)
5216 pending_inc
.new_state
[o
] = 0;
5217 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5219 // remember previous weight
5220 if (pending_inc
.new_xinfo
.count(o
) == 0)
5221 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5222 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5226 mon
.clog
->info() << "Marking osd." << o
<< " out (has been down for "
5227 << int(down
.sec()) << " seconds)";
5232 down_pending_out
.erase(o
);
5235 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5238 // expire blocklisted items?
5239 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
5240 p
!= osdmap
.blocklist
.end();
5242 if (p
->second
< now
) {
5243 dout(10) << "expiring blocklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5244 pending_inc
.old_blocklist
.push_back(p
->first
);
5249 if (try_prune_purged_snaps()) {
5253 if (update_pools_status())
5257 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
5261 void OSDMonitor::_set_new_cache_sizes()
5263 uint64_t cache_size
= 0;
5264 int64_t inc_alloc
= 0;
5265 int64_t full_alloc
= 0;
5266 int64_t kv_alloc
= 0;
5268 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5269 cache_size
= pcm
->get_tuned_mem();
5270 inc_alloc
= inc_cache
->get_committed_size();
5271 full_alloc
= full_cache
->get_committed_size();
5272 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
5275 inc_osd_cache
.set_bytes(inc_alloc
);
5276 full_osd_cache
.set_bytes(full_alloc
);
5278 dout(1) << __func__
<< " cache_size:" << cache_size
5279 << " inc_alloc: " << inc_alloc
5280 << " full_alloc: " << full_alloc
5281 << " kv_alloc: " << kv_alloc
5285 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5286 std::map
<int, std::pair
<utime_t
, int>> &last_osd_report
)
5288 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
5289 if (now
- mon
.get_leader_since() < timeo
) {
5290 // We haven't been the leader for long enough to consider OSD timeouts
5294 int max_osd
= osdmap
.get_max_osd();
5295 bool new_down
= false;
5297 for (int i
=0; i
< max_osd
; ++i
) {
5298 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5299 if (!osdmap
.exists(i
)) {
5300 last_osd_report
.erase(i
); // if any
5303 if (!osdmap
.is_up(i
))
5305 const std::map
<int, std::pair
<utime_t
, int>>::const_iterator t
= last_osd_report
.find(i
);
5306 if (t
== last_osd_report
.end()) {
5307 // it wasn't in the map; start the timer.
5308 last_osd_report
[i
].first
= now
;
5309 last_osd_report
[i
].second
= 0;
5310 } else if (can_mark_down(i
)) {
5311 utime_t diff
= now
- t
->second
.first
;
5312 // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
5313 // to allow for the osd to miss a beacon.
5314 int mon_osd_report_timeout
= g_conf()->mon_osd_report_timeout
;
5315 utime_t
max_timeout(std::max(mon_osd_report_timeout
, 2 * t
->second
.second
), 0);
5316 if (diff
> max_timeout
) {
5317 mon
.clog
->info() << "osd." << i
<< " marked down after no beacon for "
5318 << diff
<< " seconds";
5319 derr
<< "no beacon from osd." << i
<< " since " << t
->second
.first
5320 << ", " << diff
<< " seconds ago. marking down" << dendl
;
5321 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
5329 static void dump_cpu_list(Formatter
*f
, const char *name
,
5330 const string
& strlist
)
5333 size_t cpu_set_size
;
5334 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
5337 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5338 f
->open_array_section(name
);
5339 for (auto cpu
: cpus
) {
5340 f
->dump_int("cpu", cpu
);
5345 void OSDMonitor::dump_info(Formatter
*f
)
5347 f
->open_object_section("osdmap");
5351 f
->open_array_section("osd_metadata");
5352 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5353 if (osdmap
.exists(i
)) {
5354 f
->open_object_section("osd");
5355 f
->dump_unsigned("id", i
);
5356 dump_osd_metadata(i
, f
, NULL
);
5362 f
->open_object_section("osdmap_clean_epochs");
5363 f
->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5365 f
->open_object_section("last_epoch_clean");
5366 last_epoch_clean
.dump(f
);
5369 f
->open_array_section("osd_epochs");
5370 for (auto& osd_epoch
: osd_epochs
) {
5371 f
->open_object_section("osd");
5372 f
->dump_unsigned("id", osd_epoch
.first
);
5373 f
->dump_unsigned("epoch", osd_epoch
.second
);
5376 f
->close_section(); // osd_epochs
5378 f
->close_section(); // osd_clean_epochs
5380 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5381 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5383 f
->open_object_section("crushmap");
5384 osdmap
.crush
->dump(f
);
5387 if (has_osdmap_manifest
) {
5388 f
->open_object_section("osdmap_manifest");
5389 osdmap_manifest
.dump(f
);
// Keys accepted by "ceph osd pool get"; preprocess_command() maps the
// requested option string onto one of these enumerators.
// NOTE(review): extraction dropped the first enumerator line of this
// enum; verify the leading entries against upstream before relying on
// enumerator ordering or values.
5395 enum osd_pool_get_choices
{
5397 PG_NUM
, PGP_NUM
, CRUSH_RULE
, HASHPSPOOL
, EC_OVERWRITES
,
5398 NODELETE
, NOPGCHANGE
, NOSIZECHANGE
,
5399 WRITE_FADVISE_DONTNEED
, NOSCRUB
, NODEEP_SCRUB
,
5400 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5401 USE_GMT_HITSET
, TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
,
5402 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5403 CACHE_TARGET_FULL_RATIO
,
5404 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5405 ERASURE_CODE_PROFILE
, MIN_READ_RECENCY_FOR_PROMOTE
,
5406 MIN_WRITE_RECENCY_FOR_PROMOTE
, FAST_READ
,
5407 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
,
5408 SCRUB_MIN_INTERVAL
, SCRUB_MAX_INTERVAL
, DEEP_SCRUB_INTERVAL
,
5409 RECOVERY_PRIORITY
, RECOVERY_OP_PRIORITY
, SCRUB_PRIORITY
,
5410 COMPRESSION_MODE
, COMPRESSION_ALGORITHM
, COMPRESSION_REQUIRED_RATIO
,
5411 COMPRESSION_MAX_BLOB_SIZE
, COMPRESSION_MIN_BLOB_SIZE
,
5412 CSUM_TYPE
, CSUM_MAX_BLOCK
, CSUM_MIN_BLOCK
, FINGERPRINT_ALGORITHM
,
5413 PG_AUTOSCALE_MODE
, PG_NUM_MIN
, TARGET_SIZE_BYTES
, TARGET_SIZE_RATIO
,
5414 PG_AUTOSCALE_BIAS
, DEDUP_TIER
, DEDUP_CHUNK_ALGORITHM
,
5415 DEDUP_CDC_CHUNK_SIZE
, POOL_EIO
, BULK
, PG_NUM_MAX
};
5417 std::set
<osd_pool_get_choices
>
5418 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5419 const std::set
<osd_pool_get_choices
>& second
)
5421 std::set
<osd_pool_get_choices
> result
;
5422 std::set_difference(first
.begin(), first
.end(),
5423 second
.begin(), second
.end(),
5424 std::inserter(result
, result
.end()));
5430 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5432 op
->mark_osdmon_event(__func__
);
5433 auto m
= op
->get_req
<MMonCommand
>();
5436 stringstream ss
, ds
;
5439 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5440 string rs
= ss
.str();
5441 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
5445 MonSession
*session
= op
->get_session();
5447 derr
<< __func__
<< " no session" << dendl
;
5448 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
5453 cmd_getval(cmdmap
, "prefix", prefix
);
5455 string format
= cmd_getval_or
<string
>(cmdmap
, "format", "plain");
5456 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5458 if (prefix
== "osd stat") {
5460 f
->open_object_section("osdmap");
5461 osdmap
.print_summary(f
.get(), ds
, "", true);
5465 osdmap
.print_summary(nullptr, ds
, "", true);
5469 else if (prefix
== "osd dump" ||
5470 prefix
== "osd tree" ||
5471 prefix
== "osd tree-from" ||
5472 prefix
== "osd ls" ||
5473 prefix
== "osd getmap" ||
5474 prefix
== "osd getcrushmap" ||
5475 prefix
== "osd ls-tree" ||
5476 prefix
== "osd info") {
5478 epoch_t epoch
= cmd_getval_or
<int64_t>(cmdmap
, "epoch", osdmap
.get_epoch());
5479 bufferlist osdmap_bl
;
5480 int err
= get_version_full(epoch
, osdmap_bl
);
5481 if (err
== -ENOENT
) {
5483 ss
<< "there is no map for epoch " << epoch
;
5486 ceph_assert(err
== 0);
5487 ceph_assert(osdmap_bl
.length());
5490 if (epoch
== osdmap
.get_epoch()) {
5494 p
->decode(osdmap_bl
);
5497 auto sg
= make_scope_guard([&] {
5503 if (prefix
== "osd dump") {
5506 f
->open_object_section("osdmap");
5516 } else if (prefix
== "osd ls") {
5518 f
->open_array_section("osds");
5519 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5520 if (osdmap
.exists(i
)) {
5521 f
->dump_int("osd", i
);
5528 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5529 if (osdmap
.exists(i
)) {
5538 } else if (prefix
== "osd info") {
5540 bool do_single_osd
= true;
5541 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5542 do_single_osd
= false;
5545 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5546 ss
<< "osd." << osd_id
<< " does not exist";
5552 if (do_single_osd
) {
5553 osdmap
.dump_osd(osd_id
, f
.get());
5555 osdmap
.dump_osds(f
.get());
5559 if (do_single_osd
) {
5560 osdmap
.print_osd(osd_id
, ds
);
5562 osdmap
.print_osds(ds
);
5566 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5568 if (prefix
== "osd tree-from") {
5569 cmd_getval(cmdmap
, "bucket", bucket
);
5570 if (!osdmap
.crush
->name_exists(bucket
)) {
5571 ss
<< "bucket '" << bucket
<< "' does not exist";
5575 int id
= osdmap
.crush
->get_item_id(bucket
);
5577 ss
<< "\"" << bucket
<< "\" is not a bucket";
5583 vector
<string
> states
;
5584 cmd_getval(cmdmap
, "states", states
);
5585 unsigned filter
= 0;
5586 for (auto& s
: states
) {
5588 filter
|= OSDMap::DUMP_UP
;
5589 } else if (s
== "down") {
5590 filter
|= OSDMap::DUMP_DOWN
;
5591 } else if (s
== "in") {
5592 filter
|= OSDMap::DUMP_IN
;
5593 } else if (s
== "out") {
5594 filter
|= OSDMap::DUMP_OUT
;
5595 } else if (s
== "destroyed") {
5596 filter
|= OSDMap::DUMP_DESTROYED
;
5598 ss
<< "unrecognized state '" << s
<< "'";
5603 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5604 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5605 ss
<< "cannot specify both 'in' and 'out'";
5609 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5610 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5611 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5612 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5613 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5614 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5615 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5620 f
->open_object_section("tree");
5621 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5625 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5628 } else if (prefix
== "osd getmap") {
5629 rdata
.append(osdmap_bl
);
5630 ss
<< "got osdmap epoch " << p
->get_epoch();
5631 } else if (prefix
== "osd getcrushmap") {
5632 p
->crush
->encode(rdata
, mon
.get_quorum_con_features());
5633 ss
<< p
->get_crush_version();
5634 } else if (prefix
== "osd ls-tree") {
5636 cmd_getval(cmdmap
, "name", bucket_name
);
5638 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5640 ss
<< "\"" << bucket_name
<< "\" does not exist";
5643 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5648 f
->open_array_section("osds");
5649 for (auto &i
: osds
) {
5650 if (osdmap
.exists(i
)) {
5651 f
->dump_int("osd", i
);
5658 for (auto &i
: osds
) {
5659 if (osdmap
.exists(i
)) {
5670 } else if (prefix
== "osd getmaxosd") {
5672 f
->open_object_section("getmaxosd");
5673 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5674 f
->dump_int("max_osd", osdmap
.get_max_osd());
5678 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5681 } else if (prefix
== "osd utilization") {
5683 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5690 } else if (prefix
== "osd find") {
5692 if (!cmd_getval(cmdmap
, "id", osd
)) {
5693 ss
<< "unable to parse osd id value '"
5694 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5698 if (!osdmap
.exists(osd
)) {
5699 ss
<< "osd." << osd
<< " does not exist";
5704 cmd_getval(cmdmap
, "format", format
);
5705 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5706 f
->open_object_section("osd_location");
5707 f
->dump_int("osd", osd
);
5708 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5709 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5711 // try to identify host, pod/container name, etc.
5712 map
<string
,string
> m
;
5713 load_metadata(osd
, m
, nullptr);
5714 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5715 f
->dump_string("host", p
->second
);
5718 "pod_name", "pod_namespace", // set by rook
5719 "container_name" // set by cephadm, ceph-ansible
5721 if (auto p
= m
.find(k
); p
!= m
.end()) {
5722 f
->dump_string(k
, p
->second
);
5726 // crush is helpful too
5727 f
->open_object_section("crush_location");
5728 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5729 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5730 f
->dump_string(p
->first
.c_str(), p
->second
);
5734 } else if (prefix
== "osd metadata") {
5736 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5737 !cmd_getval(cmdmap
, "id", osd
)) {
5738 ss
<< "unable to parse osd id value '"
5739 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5743 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5744 ss
<< "osd." << osd
<< " does not exist";
5749 cmd_getval(cmdmap
, "format", format
);
5750 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5752 f
->open_object_section("osd_metadata");
5753 f
->dump_unsigned("id", osd
);
5754 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5760 f
->open_array_section("osd_metadata");
5761 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5762 if (osdmap
.exists(i
)) {
5763 f
->open_object_section("osd");
5764 f
->dump_unsigned("id", i
);
5765 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5766 if (r
== -EINVAL
|| r
== -ENOENT
) {
5767 // Drop error, continue to get other daemons' metadata
5768 dout(4) << "No metadata for osd." << i
<< dendl
;
5780 } else if (prefix
== "osd versions") {
5782 f
.reset(Formatter::create("json-pretty"));
5783 count_metadata("ceph_version", f
.get());
5786 } else if (prefix
== "osd count-metadata") {
5788 f
.reset(Formatter::create("json-pretty"));
5790 cmd_getval(cmdmap
, "property", field
);
5791 count_metadata(field
, f
.get());
5794 } else if (prefix
== "osd numa-status") {
5797 f
->open_array_section("osds");
5799 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5800 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5801 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5802 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5803 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5804 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5806 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5807 if (osdmap
.exists(i
)) {
5808 map
<string
,string
> m
;
5810 if (load_metadata(i
, m
, &err
) < 0) {
5814 auto p
= m
.find("hostname");
5819 f
->open_object_section("osd");
5820 f
->dump_int("osd", i
);
5821 f
->dump_string("host", host
);
5822 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5826 f
->dump_int(n
, atoi(p
->second
.c_str()));
5829 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5832 list
<string
> ls
= get_str_list(p
->second
, ",");
5833 f
->open_array_section(n
);
5834 for (auto node
: ls
) {
5835 f
->dump_int("node", atoi(node
.c_str()));
5840 for (auto n
: { "numa_node_cpus" }) {
5843 dump_cpu_list(f
.get(), n
, p
->second
);
5850 p
= m
.find("network_numa_nodes");
5856 p
= m
.find("objectstore_numa_nodes");
5862 p
= m
.find("numa_node");
5863 auto q
= m
.find("numa_node_cpus");
5864 if (p
!= m
.end() && q
!= m
.end()) {
5871 tbl
<< TextTable::endrow
;
5879 rdata
.append(stringify(tbl
));
5881 } else if (prefix
== "osd map") {
5882 string poolstr
, objstr
, namespacestr
;
5883 cmd_getval(cmdmap
, "pool", poolstr
);
5884 cmd_getval(cmdmap
, "object", objstr
);
5885 cmd_getval(cmdmap
, "nspace", namespacestr
);
5887 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5889 ss
<< "pool " << poolstr
<< " does not exist";
5893 object_locator_t
oloc(pool
, namespacestr
);
5894 object_t
oid(objstr
);
5895 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5896 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5897 vector
<int> up
, acting
;
5899 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5902 if (!namespacestr
.empty())
5903 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5905 fullobjname
= oid
.name
;
5907 f
->open_object_section("osd_map");
5908 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5909 f
->dump_string("pool", poolstr
);
5910 f
->dump_int("pool_id", pool
);
5911 f
->dump_stream("objname") << fullobjname
;
5912 f
->dump_stream("raw_pgid") << pgid
;
5913 f
->dump_stream("pgid") << mpgid
;
5914 f
->open_array_section("up");
5915 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5916 f
->dump_int("osd", *p
);
5918 f
->dump_int("up_primary", up_p
);
5919 f
->open_array_section("acting");
5920 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5921 f
->dump_int("osd", *p
);
5923 f
->dump_int("acting_primary", acting_p
);
5924 f
->close_section(); // osd_map
5927 ds
<< "osdmap e" << osdmap
.get_epoch()
5928 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5929 << " object '" << fullobjname
<< "' ->"
5930 << " pg " << pgid
<< " (" << mpgid
<< ")"
5931 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5932 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5936 } else if (prefix
== "pg map") {
5939 cmd_getval(cmdmap
, "pgid", pgidstr
);
5940 if (!pgid
.parse(pgidstr
.c_str())) {
5941 ss
<< "invalid pgid '" << pgidstr
<< "'";
5945 vector
<int> up
, acting
;
5946 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5947 ss
<< "pg '" << pgidstr
<< "' does not exist";
5951 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5952 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5954 f
->open_object_section("pg_map");
5955 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5956 f
->dump_stream("raw_pgid") << pgid
;
5957 f
->dump_stream("pgid") << mpgid
;
5958 f
->open_array_section("up");
5959 for (auto osd
: up
) {
5960 f
->dump_int("up_osd", osd
);
5963 f
->open_array_section("acting");
5964 for (auto osd
: acting
) {
5965 f
->dump_int("acting_osd", osd
);
5971 ds
<< "osdmap e" << osdmap
.get_epoch()
5972 << " pg " << pgid
<< " (" << mpgid
<< ")"
5973 << " -> up " << up
<< " acting " << acting
;
5978 } else if (prefix
== "osd lspools") {
5980 f
->open_array_section("pools");
5981 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
5982 p
!= osdmap
.pools
.end();
5985 f
->open_object_section("pool");
5986 f
->dump_int("poolnum", p
->first
);
5987 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
5990 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
5991 if (next(p
) != osdmap
.pools
.end()) {
6001 } else if (prefix
== "osd blocklist ls" ||
6002 prefix
== "osd blacklist ls") {
6004 f
->open_array_section("blocklist");
6006 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
6007 p
!= osdmap
.blocklist
.end();
6010 f
->open_object_section("entry");
6011 f
->dump_string("addr", p
->first
.get_legacy_str());
6012 f
->dump_stream("until") << p
->second
;
6017 ss
<< p
->first
<< " " << p
->second
;
6027 ss
<< "listed " << osdmap
.blocklist
.size() << " entries";
6029 } else if (prefix
== "osd pool ls") {
6031 cmd_getval(cmdmap
, "detail", detail
);
6032 if (!f
&& detail
== "detail") {
6034 osdmap
.print_pools(ss
);
6035 rdata
.append(ss
.str());
6038 f
->open_array_section("pools");
6039 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
6040 it
!= osdmap
.get_pools().end();
6043 if (detail
== "detail") {
6044 f
->open_object_section("pool");
6045 f
->dump_int("pool_id", it
->first
);
6046 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
6047 it
->second
.dump(f
.get());
6050 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
6053 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
6062 } else if (prefix
== "osd crush get-tunable") {
6064 cmd_getval(cmdmap
, "tunable", tunable
);
6067 f
->open_object_section("tunable");
6068 if (tunable
== "straw_calc_version") {
6070 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
6072 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
6081 rdata
.append(rss
.str());
6085 } else if (prefix
== "osd pool get") {
6087 cmd_getval(cmdmap
, "pool", poolstr
);
6088 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
6090 ss
<< "unrecognized pool '" << poolstr
<< "'";
6095 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
6097 cmd_getval(cmdmap
, "var", var
);
6099 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
6100 const choices_map_t ALL_CHOICES
= {
6102 {"min_size", MIN_SIZE
},
6103 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
6104 {"crush_rule", CRUSH_RULE
},
6105 {"hashpspool", HASHPSPOOL
},
6107 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
6108 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
6109 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
6110 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
6111 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
6112 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
6113 {"use_gmt_hitset", USE_GMT_HITSET
},
6114 {"target_max_objects", TARGET_MAX_OBJECTS
},
6115 {"target_max_bytes", TARGET_MAX_BYTES
},
6116 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
6117 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
6118 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
6119 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
6120 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
6121 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
6122 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
6123 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
6124 {"fast_read", FAST_READ
},
6125 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
6126 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
6127 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
6128 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
6129 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
6130 {"recovery_priority", RECOVERY_PRIORITY
},
6131 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
6132 {"scrub_priority", SCRUB_PRIORITY
},
6133 {"compression_mode", COMPRESSION_MODE
},
6134 {"compression_algorithm", COMPRESSION_ALGORITHM
},
6135 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
6136 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
6137 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
6138 {"csum_type", CSUM_TYPE
},
6139 {"csum_max_block", CSUM_MAX_BLOCK
},
6140 {"csum_min_block", CSUM_MIN_BLOCK
},
6141 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
6142 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
6143 {"pg_num_min", PG_NUM_MIN
},
6144 {"pg_num_max", PG_NUM_MAX
},
6145 {"target_size_bytes", TARGET_SIZE_BYTES
},
6146 {"target_size_ratio", TARGET_SIZE_RATIO
},
6147 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
6148 {"dedup_tier", DEDUP_TIER
},
6149 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM
},
6150 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE
},
6154 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
6156 const choices_set_t ONLY_TIER_CHOICES
= {
6157 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
6158 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
6159 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
6160 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
6161 MIN_READ_RECENCY_FOR_PROMOTE
,
6162 MIN_WRITE_RECENCY_FOR_PROMOTE
,
6163 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
6165 const choices_set_t ONLY_ERASURE_CHOICES
= {
6166 EC_OVERWRITES
, ERASURE_CODE_PROFILE
6169 choices_set_t selected_choices
;
6171 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
6172 it
!= ALL_CHOICES
.end(); ++it
) {
6173 selected_choices
.insert(it
->second
);
6177 selected_choices
= subtract_second_from_first(selected_choices
,
6181 if(!p
->is_erasure()) {
6182 selected_choices
= subtract_second_from_first(selected_choices
,
6183 ONLY_ERASURE_CHOICES
);
6185 } else /* var != "all" */ {
6186 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
6187 if (found
== ALL_CHOICES
.end()) {
6188 ss
<< "pool '" << poolstr
6189 << "': invalid variable: '" << var
<< "'";
6194 osd_pool_get_choices selected
= found
->second
;
6196 if (!p
->is_tier() &&
6197 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
6198 ss
<< "pool '" << poolstr
6199 << "' is not a tier pool: variable not applicable";
6204 if (!p
->is_erasure() &&
6205 ONLY_ERASURE_CHOICES
.find(selected
)
6206 != ONLY_ERASURE_CHOICES
.end()) {
6207 ss
<< "pool '" << poolstr
6208 << "' is not a erasure pool: variable not applicable";
6213 if (pool_opts_t::is_opt_name(var
) &&
6214 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6215 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6220 selected_choices
.insert(selected
);
6224 f
->open_object_section("pool");
6225 f
->dump_string("pool", poolstr
);
6226 f
->dump_int("pool_id", pool
);
6227 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6228 it
!= selected_choices
.end(); ++it
) {
6229 choices_map_t::const_iterator i
;
6230 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6231 if (i
->second
== *it
) {
6235 ceph_assert(i
!= ALL_CHOICES
.end());
6238 f
->dump_int("pg_num", p
->get_pg_num());
6241 f
->dump_int("pgp_num", p
->get_pgp_num());
6244 f
->dump_int("size", p
->get_size());
6247 f
->dump_int("min_size", p
->get_min_size());
6250 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6251 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6252 p
->get_crush_rule()));
6254 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6258 f
->dump_bool("allow_ec_overwrites",
6259 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6261 case PG_AUTOSCALE_MODE
:
6262 f
->dump_string("pg_autoscale_mode",
6263 pg_pool_t::get_pg_autoscale_mode_name(
6264 p
->pg_autoscale_mode
));
6272 case WRITE_FADVISE_DONTNEED
:
6275 f
->dump_bool(i
->first
.c_str(),
6276 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6278 case HIT_SET_PERIOD
:
6279 f
->dump_int("hit_set_period", p
->hit_set_period
);
6282 f
->dump_int("hit_set_count", p
->hit_set_count
);
6285 f
->dump_string("hit_set_type",
6286 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6290 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6291 BloomHitSet::Params
*bloomp
=
6292 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6293 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6294 } else if(var
!= "all") {
6296 ss
<< "hit set is not of type Bloom; " <<
6297 "invalid to get a false positive rate!";
6303 case USE_GMT_HITSET
:
6304 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6306 case TARGET_MAX_OBJECTS
:
6307 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6309 case TARGET_MAX_BYTES
:
6310 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6312 case CACHE_TARGET_DIRTY_RATIO
:
6313 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6314 p
->cache_target_dirty_ratio_micro
);
6315 f
->dump_float("cache_target_dirty_ratio",
6316 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6318 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6319 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6320 p
->cache_target_dirty_high_ratio_micro
);
6321 f
->dump_float("cache_target_dirty_high_ratio",
6322 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6324 case CACHE_TARGET_FULL_RATIO
:
6325 f
->dump_unsigned("cache_target_full_ratio_micro",
6326 p
->cache_target_full_ratio_micro
);
6327 f
->dump_float("cache_target_full_ratio",
6328 ((float)p
->cache_target_full_ratio_micro
/1000000));
6330 case CACHE_MIN_FLUSH_AGE
:
6331 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6333 case CACHE_MIN_EVICT_AGE
:
6334 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6336 case ERASURE_CODE_PROFILE
:
6337 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6339 case MIN_READ_RECENCY_FOR_PROMOTE
:
6340 f
->dump_int("min_read_recency_for_promote",
6341 p
->min_read_recency_for_promote
);
6343 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6344 f
->dump_int("min_write_recency_for_promote",
6345 p
->min_write_recency_for_promote
);
6348 f
->dump_int("fast_read", p
->fast_read
);
6350 case HIT_SET_GRADE_DECAY_RATE
:
6351 f
->dump_int("hit_set_grade_decay_rate",
6352 p
->hit_set_grade_decay_rate
);
6354 case HIT_SET_SEARCH_LAST_N
:
6355 f
->dump_int("hit_set_search_last_n",
6356 p
->hit_set_search_last_n
);
6358 case SCRUB_MIN_INTERVAL
:
6359 case SCRUB_MAX_INTERVAL
:
6360 case DEEP_SCRUB_INTERVAL
:
6361 case RECOVERY_PRIORITY
:
6362 case RECOVERY_OP_PRIORITY
:
6363 case SCRUB_PRIORITY
:
6364 case COMPRESSION_MODE
:
6365 case COMPRESSION_ALGORITHM
:
6366 case COMPRESSION_REQUIRED_RATIO
:
6367 case COMPRESSION_MAX_BLOB_SIZE
:
6368 case COMPRESSION_MIN_BLOB_SIZE
:
6370 case CSUM_MAX_BLOCK
:
6371 case CSUM_MIN_BLOCK
:
6372 case FINGERPRINT_ALGORITHM
:
6375 case TARGET_SIZE_BYTES
:
6376 case TARGET_SIZE_RATIO
:
6377 case PG_AUTOSCALE_BIAS
:
6379 case DEDUP_CHUNK_ALGORITHM
:
6380 case DEDUP_CDC_CHUNK_SIZE
:
6381 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6382 if (p
->opts
.is_set(key
)) {
6383 if(*it
== CSUM_TYPE
) {
6385 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6386 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6388 p
->opts
.dump(i
->first
, f
.get());
6397 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6398 it
!= selected_choices
.end(); ++it
) {
6399 choices_map_t::const_iterator i
;
6402 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6405 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6408 ss
<< "size: " << p
->get_size() << "\n";
6411 ss
<< "min_size: " << p
->get_min_size() << "\n";
6414 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6415 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6416 p
->get_crush_rule()) << "\n";
6418 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6421 case PG_AUTOSCALE_MODE
:
6422 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6423 p
->pg_autoscale_mode
) <<"\n";
6425 case HIT_SET_PERIOD
:
6426 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6429 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6432 ss
<< "hit_set_type: " <<
6433 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6437 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6438 BloomHitSet::Params
*bloomp
=
6439 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6440 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6441 } else if(var
!= "all") {
6442 ss
<< "hit set is not of type Bloom; " <<
6443 "invalid to get a false positive rate!";
6449 case USE_GMT_HITSET
:
6450 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6452 case TARGET_MAX_OBJECTS
:
6453 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6455 case TARGET_MAX_BYTES
:
6456 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6458 case CACHE_TARGET_DIRTY_RATIO
:
6459 ss
<< "cache_target_dirty_ratio: "
6460 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6462 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6463 ss
<< "cache_target_dirty_high_ratio: "
6464 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6466 case CACHE_TARGET_FULL_RATIO
:
6467 ss
<< "cache_target_full_ratio: "
6468 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6470 case CACHE_MIN_FLUSH_AGE
:
6471 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6473 case CACHE_MIN_EVICT_AGE
:
6474 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6476 case ERASURE_CODE_PROFILE
:
6477 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6479 case MIN_READ_RECENCY_FOR_PROMOTE
:
6480 ss
<< "min_read_recency_for_promote: " <<
6481 p
->min_read_recency_for_promote
<< "\n";
6483 case HIT_SET_GRADE_DECAY_RATE
:
6484 ss
<< "hit_set_grade_decay_rate: " <<
6485 p
->hit_set_grade_decay_rate
<< "\n";
6487 case HIT_SET_SEARCH_LAST_N
:
6488 ss
<< "hit_set_search_last_n: " <<
6489 p
->hit_set_search_last_n
<< "\n";
6492 ss
<< "allow_ec_overwrites: " <<
6493 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6502 case WRITE_FADVISE_DONTNEED
:
6505 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6506 if (i
->second
== *it
)
6509 ceph_assert(i
!= ALL_CHOICES
.end());
6510 ss
<< i
->first
<< ": " <<
6511 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6512 "true" : "false") << "\n";
6514 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6515 ss
<< "min_write_recency_for_promote: " <<
6516 p
->min_write_recency_for_promote
<< "\n";
6519 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6521 case SCRUB_MIN_INTERVAL
:
6522 case SCRUB_MAX_INTERVAL
:
6523 case DEEP_SCRUB_INTERVAL
:
6524 case RECOVERY_PRIORITY
:
6525 case RECOVERY_OP_PRIORITY
:
6526 case SCRUB_PRIORITY
:
6527 case COMPRESSION_MODE
:
6528 case COMPRESSION_ALGORITHM
:
6529 case COMPRESSION_REQUIRED_RATIO
:
6530 case COMPRESSION_MAX_BLOB_SIZE
:
6531 case COMPRESSION_MIN_BLOB_SIZE
:
6533 case CSUM_MAX_BLOCK
:
6534 case CSUM_MIN_BLOCK
:
6535 case FINGERPRINT_ALGORITHM
:
6538 case TARGET_SIZE_BYTES
:
6539 case TARGET_SIZE_RATIO
:
6540 case PG_AUTOSCALE_BIAS
:
6542 case DEDUP_CHUNK_ALGORITHM
:
6543 case DEDUP_CDC_CHUNK_SIZE
:
6544 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6545 if (i
->second
== *it
)
6548 ceph_assert(i
!= ALL_CHOICES
.end());
6550 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6551 if (p
->opts
.is_set(key
)) {
6552 if(key
== pool_opts_t::CSUM_TYPE
) {
6554 p
->opts
.get(key
, &val
);
6555 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6557 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6563 rdata
.append(ss
.str());
6568 } else if (prefix
== "osd pool get-quota") {
6570 cmd_getval(cmdmap
, "pool", pool_name
);
6572 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6574 ceph_assert(poolid
== -ENOENT
);
6575 ss
<< "unrecognized pool '" << pool_name
<< "'";
6579 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6580 const pool_stat_t
* pstat
= mon
.mgrstatmon()->get_pool_stat(poolid
);
6582 ss
<< "no stats for pool '" << pool_name
<< "'";
6586 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6588 f
->open_object_section("pool_quotas");
6589 f
->dump_string("pool_name", pool_name
);
6590 f
->dump_unsigned("pool_id", poolid
);
6591 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6592 f
->dump_int("current_num_objects", sum
.num_objects
);
6593 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6594 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6599 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6600 << " max objects: ";
6601 if (p
->quota_max_objects
== 0)
6604 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6605 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6609 if (p
->quota_max_bytes
== 0)
6612 rs
<< byte_u_t(p
->quota_max_bytes
);
6613 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6615 rdata
.append(rs
.str());
6619 } else if (prefix
== "osd crush rule list" ||
6620 prefix
== "osd crush rule ls") {
6622 f
->open_array_section("rules");
6623 osdmap
.crush
->list_rules(f
.get());
6628 osdmap
.crush
->list_rules(&ss
);
6629 rdata
.append(ss
.str());
6631 } else if (prefix
== "osd crush rule ls-by-class") {
6633 cmd_getval(cmdmap
, "class", class_name
);
6634 if (class_name
.empty()) {
6635 ss
<< "no class specified";
6640 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6642 ss
<< "failed to get rules by class '" << class_name
<< "'";
6646 f
->open_array_section("rules");
6647 for (auto &rule
: rules
) {
6648 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6654 for (auto &rule
: rules
) {
6655 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6657 rdata
.append(rs
.str());
6659 } else if (prefix
== "osd crush rule dump") {
6661 cmd_getval(cmdmap
, "name", name
);
6663 cmd_getval(cmdmap
, "format", format
);
6664 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6666 f
->open_array_section("rules");
6667 osdmap
.crush
->dump_rules(f
.get());
6670 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6672 ss
<< "unknown crush rule '" << name
<< "'";
6676 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6681 rdata
.append(rs
.str());
6682 } else if (prefix
== "osd crush dump") {
6684 cmd_getval(cmdmap
, "format", format
);
6685 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6686 f
->open_object_section("crush_map");
6687 osdmap
.crush
->dump(f
.get());
6692 rdata
.append(rs
.str());
6693 } else if (prefix
== "osd crush show-tunables") {
6695 cmd_getval(cmdmap
, "format", format
);
6696 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6697 f
->open_object_section("crush_map_tunables");
6698 osdmap
.crush
->dump_tunables(f
.get());
6703 rdata
.append(rs
.str());
6704 } else if (prefix
== "osd crush tree") {
6705 bool show_shadow
= false;
6706 if (!cmd_getval_compat_cephbool(cmdmap
, "show_shadow", show_shadow
)) {
6708 if (cmd_getval(cmdmap
, "shadow", shadow
) &&
6709 shadow
== "--show-shadow") {
6713 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6715 f
->open_object_section("crush_tree");
6716 osdmap
.crush
->dump_tree(nullptr,
6718 osdmap
.get_pool_names(),
6724 osdmap
.crush
->dump_tree(&ss
,
6726 osdmap
.get_pool_names(),
6728 rdata
.append(ss
.str());
6730 } else if (prefix
== "osd crush ls") {
6732 if (!cmd_getval(cmdmap
, "node", name
)) {
6733 ss
<< "no node specified";
6737 if (!osdmap
.crush
->name_exists(name
)) {
6738 ss
<< "node '" << name
<< "' does not exist";
6742 int id
= osdmap
.crush
->get_item_id(name
);
6745 result
.push_back(id
);
6747 int num
= osdmap
.crush
->get_bucket_size(id
);
6748 for (int i
= 0; i
< num
; ++i
) {
6749 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6753 f
->open_array_section("items");
6754 for (auto i
: result
) {
6755 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6761 for (auto i
: result
) {
6762 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6764 rdata
.append(ss
.str());
6767 } else if (prefix
== "osd crush class ls") {
6768 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6769 f
->open_array_section("crush_classes");
6770 for (auto i
: osdmap
.crush
->class_name
)
6771 f
->dump_string("class", i
.second
);
6774 } else if (prefix
== "osd crush class ls-osd") {
6776 cmd_getval(cmdmap
, "class", name
);
6778 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6780 f
->open_array_section("osds");
6781 for (auto &osd
: osds
)
6782 f
->dump_int("osd", osd
);
6787 for (auto &osd
: osds
) {
6795 } else if (prefix
== "osd crush get-device-class") {
6796 vector
<string
> idvec
;
6797 cmd_getval(cmdmap
, "ids", idvec
);
6798 map
<int, string
> class_by_osd
;
6799 for (auto& id
: idvec
) {
6801 long osd
= parse_osd_id(id
.c_str(), &ts
);
6803 ss
<< "unable to parse osd id:'" << id
<< "'";
6807 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6809 class_by_osd
[osd
] = device_class
;
6811 class_by_osd
[osd
] = ""; // no class
6814 f
->open_array_section("osd_device_classes");
6815 for (auto& i
: class_by_osd
) {
6816 f
->open_object_section("osd_device_class");
6817 f
->dump_int("osd", i
.first
);
6818 f
->dump_string("device_class", i
.second
);
6824 if (class_by_osd
.size() == 1) {
6825 // for single input, make a clean output
6826 ds
<< class_by_osd
.begin()->second
;
6828 // note that we do not group osds by class here
6829 for (auto it
= class_by_osd
.begin();
6830 it
!= class_by_osd
.end();
6832 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6833 if (next(it
) != class_by_osd
.end())
6839 } else if (prefix
== "osd erasure-code-profile ls") {
6840 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6842 f
->open_array_section("erasure-code-profiles");
6843 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6845 f
->dump_string("profile", i
->first
.c_str());
6847 rdata
.append(i
->first
+ "\n");
6854 rdata
.append(rs
.str());
6856 } else if (prefix
== "osd crush weight-set ls") {
6857 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6859 f
->open_array_section("weight_sets");
6860 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6861 f
->dump_string("pool", "(compat)");
6863 for (auto& i
: osdmap
.crush
->choose_args
) {
6865 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6872 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6875 for (auto& i
: osdmap
.crush
->choose_args
) {
6877 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6880 rdata
.append(rs
.str());
6882 } else if (prefix
== "osd crush weight-set dump") {
6883 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6885 osdmap
.crush
->dump_choose_args(f
.get());
6887 } else if (prefix
== "osd erasure-code-profile get") {
6889 cmd_getval(cmdmap
, "name", name
);
6890 if (!osdmap
.has_erasure_code_profile(name
)) {
6891 ss
<< "unknown erasure code profile '" << name
<< "'";
6895 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6897 f
->open_object_section("profile");
6898 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6902 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6904 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6911 rdata
.append(rs
.str());
6913 } else if (prefix
== "osd pool application get") {
6914 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6917 cmd_getval(cmdmap
, "pool", pool_name
);
6919 cmd_getval(cmdmap
, "app", app
);
6921 cmd_getval(cmdmap
, "key", key
);
6923 if (pool_name
.empty()) {
6925 f
->open_object_section("pools");
6926 for (const auto &pool
: osdmap
.pools
) {
6927 std::string
name("<unknown>");
6928 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6929 if (pni
!= osdmap
.pool_name
.end())
6931 f
->open_object_section(name
.c_str());
6932 for (auto &app_pair
: pool
.second
.application_metadata
) {
6933 f
->open_object_section(app_pair
.first
.c_str());
6934 for (auto &kv_pair
: app_pair
.second
) {
6935 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6939 f
->close_section(); // name
6941 f
->close_section(); // pools
6944 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6946 ss
<< "unrecognized pool '" << pool_name
<< "'";
6950 auto p
= osdmap
.get_pg_pool(pool
);
6953 f
->open_object_section(pool_name
.c_str());
6954 for (auto &app_pair
: p
->application_metadata
) {
6955 f
->open_object_section(app_pair
.first
.c_str());
6956 for (auto &kv_pair
: app_pair
.second
) {
6957 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6959 f
->close_section(); // application
6961 f
->close_section(); // pool_name
6966 auto app_it
= p
->application_metadata
.find(app
);
6967 if (app_it
== p
->application_metadata
.end()) {
6968 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
6972 // filter by pool + app
6974 f
->open_object_section(app_it
->first
.c_str());
6975 for (auto &kv_pair
: app_it
->second
) {
6976 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6978 f
->close_section(); // application
6982 // filter by pool + app + key
6983 auto key_it
= app_it
->second
.find(key
);
6984 if (key_it
== app_it
->second
.end()) {
6985 ss
<< "application '" << app
<< "' on pool '" << pool_name
6986 << "' does not have key '" << key
<< "'";
6990 ss
<< key_it
->second
<< "\n";
6991 rdata
.append(ss
.str());
6994 } else if (prefix
== "osd get-require-min-compat-client") {
6995 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
6996 rdata
.append(ss
.str());
6999 } else if (prefix
== "osd pool application enable" ||
7000 prefix
== "osd pool application disable" ||
7001 prefix
== "osd pool application set" ||
7002 prefix
== "osd pool application rm") {
7003 bool changed
= false;
7004 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
7008 } else if (changed
) {
7009 // Valid mutation, proceed to prepare phase
7012 // Idempotent case, reply
7016 // try prepare update
7023 mon
.reply_command(op
, r
, rs
, rdata
, get_last_committed());
7027 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
7029 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7030 osdmap
.get_pg_pool(pool_id
));
7032 pool
->set_flag(flags
);
7035 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
7037 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7038 osdmap
.get_pg_pool(pool_id
));
7040 pool
->unset_flag(flags
);
7043 string
OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch
)
7046 snprintf(k
, sizeof(k
), "purged_epoch_%08lx", (unsigned long)epoch
);
7050 string
OSDMonitor::make_purged_snap_key(int64_t pool
, snapid_t snap
)
7053 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
7054 (unsigned long long)pool
, (unsigned long long)snap
);
7058 string
OSDMonitor::make_purged_snap_key_value(
7059 int64_t pool
, snapid_t snap
, snapid_t num
,
7060 epoch_t epoch
, bufferlist
*v
)
7062 // encode the *last* epoch in the key so that we can use forward
7063 // iteration only to search for an epoch in an interval.
7065 encode(snap
+ num
, *v
);
7067 return make_purged_snap_key(pool
, snap
+ num
- 1);
7071 int OSDMonitor::lookup_purged_snap(
7072 int64_t pool
, snapid_t snap
,
7073 snapid_t
*begin
, snapid_t
*end
)
7075 string k
= make_purged_snap_key(pool
, snap
);
7076 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
7079 dout(20) << __func__
7080 << " pool " << pool
<< " snap " << snap
7081 << " - key '" << k
<< "' not found" << dendl
;
7084 if (it
->key().find("purged_snap_") != 0) {
7085 dout(20) << __func__
7086 << " pool " << pool
<< " snap " << snap
7087 << " - key '" << k
<< "' got '" << it
->key()
7088 << "', wrong prefix" << dendl
;
7091 string gotk
= it
->key();
7092 const char *format
= "purged_snap_%llu_";
7093 long long int keypool
;
7094 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
7096 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
7099 if (pool
!= keypool
) {
7100 dout(20) << __func__
7101 << " pool " << pool
<< " snap " << snap
7102 << " - key '" << k
<< "' got '" << gotk
7103 << "', wrong pool " << keypool
7107 bufferlist v
= it
->value();
7108 auto p
= v
.cbegin();
7111 if (snap
< *begin
|| snap
>= *end
) {
7112 dout(20) << __func__
7113 << " pool " << pool
<< " snap " << snap
7114 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
7121 void OSDMonitor::insert_purged_snap_update(
7123 snapid_t start
, snapid_t end
,
7125 MonitorDBStore::TransactionRef t
)
7127 snapid_t before_begin
, before_end
;
7128 snapid_t after_begin
, after_end
;
7129 int b
= lookup_purged_snap(pool
, start
- 1,
7130 &before_begin
, &before_end
);
7131 int a
= lookup_purged_snap(pool
, end
,
7132 &after_begin
, &after_end
);
7134 dout(10) << __func__
7135 << " [" << start
<< "," << end
<< ") - joins ["
7136 << before_begin
<< "," << before_end
<< ") and ["
7137 << after_begin
<< "," << after_end
<< ")" << dendl
;
7138 // erase only the begin record; we'll overwrite the end one.
7139 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7141 string k
= make_purged_snap_key_value(pool
,
7142 before_begin
, after_end
- before_begin
,
7143 pending_inc
.epoch
, &v
);
7144 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7146 dout(10) << __func__
7147 << " [" << start
<< "," << end
<< ") - join with earlier ["
7148 << before_begin
<< "," << before_end
<< ")" << dendl
;
7149 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7151 string k
= make_purged_snap_key_value(pool
,
7152 before_begin
, end
- before_begin
,
7153 pending_inc
.epoch
, &v
);
7154 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7156 dout(10) << __func__
7157 << " [" << start
<< "," << end
<< ") - join with later ["
7158 << after_begin
<< "," << after_end
<< ")" << dendl
;
7159 // overwrite after record
7161 string k
= make_purged_snap_key_value(pool
,
7162 start
, after_end
- start
,
7163 pending_inc
.epoch
, &v
);
7164 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7166 dout(10) << __func__
7167 << " [" << start
<< "," << end
<< ") - new"
7170 string k
= make_purged_snap_key_value(pool
,
7172 pending_inc
.epoch
, &v
);
7173 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7177 bool OSDMonitor::try_prune_purged_snaps()
7179 if (!mon
.mgrstatmon()->is_readable()) {
7182 if (!pending_inc
.new_purged_snaps
.empty()) {
7183 return false; // we already pruned for this epoch
7186 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
7187 "mon_max_snap_prune_per_epoch");
7191 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
7193 unsigned actually_pruned
= 0;
7194 auto& purged_snaps
= mon
.mgrstatmon()->get_digest().purged_snaps
;
7195 for (auto& p
: osdmap
.get_pools()) {
7196 auto q
= purged_snaps
.find(p
.first
);
7197 if (q
== purged_snaps
.end()) {
7200 auto& purged
= q
->second
;
7201 if (purged
.empty()) {
7202 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
7205 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
7206 snap_interval_set_t to_prune
;
7207 unsigned maybe_pruned
= actually_pruned
;
7208 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
7209 snapid_t begin
= i
.get_start();
7210 auto end
= i
.get_start() + i
.get_len();
7211 snapid_t pbegin
= 0, pend
= 0;
7212 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
7215 // be a bit aggressive about backing off here, because the mon may
7216 // do a lot of work going through this set, and if we know the
7217 // purged set from the OSDs is at least *partly* stale we may as
7218 // well wait for it to be fresh.
7219 dout(20) << __func__
<< " we've already purged " << pbegin
7220 << "~" << (pend
- pbegin
) << dendl
;
7223 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
7224 // the tail of [begin,end) is purged; shorten the range
7227 to_prune
.insert(begin
, end
- begin
);
7228 maybe_pruned
+= end
- begin
;
7229 if (maybe_pruned
>= max_prune
) {
7233 if (!to_prune
.empty()) {
7234 // PGs may still be reporting things as purged that we have already
7235 // pruned from removed_snaps_queue.
7236 snap_interval_set_t actual
;
7237 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7238 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7239 actual
.intersection_of(to_prune
, r
->second
);
7241 actually_pruned
+= actual
.size();
7242 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7243 << ", actual pruned " << actual
<< dendl
;
7244 if (!actual
.empty()) {
7245 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7248 if (actually_pruned
>= max_prune
) {
7252 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7253 return !!actually_pruned
;
7256 bool OSDMonitor::update_pools_status()
7258 if (!mon
.mgrstatmon()->is_readable())
7263 auto& pools
= osdmap
.get_pools();
7264 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7265 const pool_stat_t
*pstat
= mon
.mgrstatmon()->get_pool_stat(it
->first
);
7268 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7269 const pg_pool_t
&pool
= it
->second
;
7270 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
7273 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7274 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
7276 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7280 mon
.clog
->info() << "pool '" << pool_name
7281 << "' no longer out of quota; removing NO_QUOTA flag";
7282 // below we cancel FLAG_FULL too, we'll set it again in
7283 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7284 clear_pool_flags(it
->first
,
7285 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7291 if (pool
.quota_max_bytes
> 0 &&
7292 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7293 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7294 << " (reached quota's max_bytes: "
7295 << byte_u_t(pool
.quota_max_bytes
) << ")";
7297 if (pool
.quota_max_objects
> 0 &&
7298 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7299 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7300 << " (reached quota's max_objects: "
7301 << pool
.quota_max_objects
<< ")";
7303 // set both FLAG_FULL_QUOTA and FLAG_FULL
7304 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7305 // since FLAG_FULL should always take precedence
7306 set_pool_flags(it
->first
,
7307 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7308 clear_pool_flags(it
->first
,
7309 pg_pool_t::FLAG_NEARFULL
|
7310 pg_pool_t::FLAG_BACKFILLFULL
);
7317 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7319 op
->mark_osdmon_event(__func__
);
7320 auto m
= op
->get_req
<MPoolOp
>();
7321 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7322 MonSession
*session
= op
->get_session();
7325 string erasure_code_profile
;
7330 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7331 0, 0, 0, 0, 0, 0, 0.0,
7332 erasure_code_profile
,
7333 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {}, bulk
,
7337 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
7342 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7343 const string
& dstname
,
7348 // Avoid creating a pending crush if it does not already exists and
7349 // the rename would fail.
7351 if (!_have_pending_crush()) {
7352 ret
= _get_stable_crush().can_rename_bucket(srcname
,
7359 CrushWrapper newcrush
= _get_pending_crush();
7361 ret
= newcrush
.rename_bucket(srcname
,
7367 pending_inc
.crush
.clear();
7368 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7369 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7373 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7375 string replacement
= "";
7377 if (plugin
== "jerasure_generic" ||
7378 plugin
== "jerasure_sse3" ||
7379 plugin
== "jerasure_sse4" ||
7380 plugin
== "jerasure_neon") {
7381 replacement
= "jerasure";
7382 } else if (plugin
== "shec_generic" ||
7383 plugin
== "shec_sse3" ||
7384 plugin
== "shec_sse4" ||
7385 plugin
== "shec_neon") {
7386 replacement
= "shec";
7389 if (replacement
!= "") {
7390 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7391 << plugin
<< " that has been deprecated. Please use "
7392 << replacement
<< " instead." << dendl
;
7396 int OSDMonitor::normalize_profile(const string
& profilename
,
7397 ErasureCodeProfile
&profile
,
7401 ErasureCodeInterfaceRef erasure_code
;
7402 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7403 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7404 check_legacy_ec_plugin(plugin
->second
, profilename
);
7405 int err
= instance
.factory(plugin
->second
,
7406 g_conf().get_val
<std::string
>("erasure_code_dir"),
7407 profile
, &erasure_code
, ss
);
7412 err
= erasure_code
->init(profile
, ss
);
7417 auto it
= profile
.find("stripe_unit");
7418 if (it
!= profile
.end()) {
7420 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
, &err_str
);
7421 if (!err_str
.empty()) {
7422 *ss
<< "could not parse stripe_unit '" << it
->second
7423 << "': " << err_str
<< std::endl
;
7426 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7427 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7428 if (chunk_size
!= stripe_unit
) {
7429 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7430 << "alignment. Would be padded to " << chunk_size
7434 if ((stripe_unit
% 4096) != 0 && !force
) {
7435 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7436 << "use --force to override this check" << std::endl
;
7443 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7444 const string
&profile
,
7448 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7449 if (ruleid
!= -ENOENT
) {
7454 CrushWrapper newcrush
= _get_pending_crush();
7456 ruleid
= newcrush
.get_rule_id(name
);
7457 if (ruleid
!= -ENOENT
) {
7461 ErasureCodeInterfaceRef erasure_code
;
7462 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7464 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7468 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7469 erasure_code
.reset();
7473 pending_inc
.crush
.clear();
7474 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7479 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7480 ErasureCodeInterfaceRef
*erasure_code
,
7483 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7485 ErasureCodeProfile profile
=
7486 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7487 ErasureCodeProfile::const_iterator plugin
=
7488 profile
.find("plugin");
7489 if (plugin
== profile
.end()) {
7490 *ss
<< "cannot determine the erasure code plugin"
7491 << " because there is no 'plugin' entry in the erasure_code_profile "
7492 << profile
<< std::endl
;
7495 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7496 auto& instance
= ErasureCodePluginRegistry::instance();
7497 return instance
.factory(plugin
->second
,
7498 g_conf().get_val
<std::string
>("erasure_code_dir"),
7499 profile
, erasure_code
, ss
);
7502 int OSDMonitor::check_cluster_features(uint64_t features
,
7505 stringstream unsupported_ss
;
7506 int unsupported_count
= 0;
7507 if ((mon
.get_quorum_con_features() & features
) != features
) {
7508 unsupported_ss
<< "the monitor cluster";
7509 ++unsupported_count
;
7512 set
<int32_t> up_osds
;
7513 osdmap
.get_up_osds(up_osds
);
7514 for (set
<int32_t>::iterator it
= up_osds
.begin();
7515 it
!= up_osds
.end(); ++it
) {
7516 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7517 if ((xi
.features
& features
) != features
) {
7518 if (unsupported_count
> 0)
7519 unsupported_ss
<< ", ";
7520 unsupported_ss
<< "osd." << *it
;
7521 unsupported_count
++;
7525 if (unsupported_count
> 0) {
7526 ss
<< "features " << features
<< " unsupported by: "
7527 << unsupported_ss
.str();
7531 // check pending osd state, too!
7532 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7533 pending_inc
.new_xinfo
.begin();
7534 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7535 const osd_xinfo_t
&xi
= p
->second
;
7536 if ((xi
.features
& features
) != features
) {
7537 dout(10) << __func__
<< " pending osd." << p
->first
7538 << " features are insufficient; retry" << dendl
;
7546 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
7549 OSDMap::Incremental new_pending
= pending_inc
;
7550 encode(*newcrush
, new_pending
.crush
, mon
.get_quorum_con_features());
7552 newmap
.deepish_copy_from(osdmap
);
7553 newmap
.apply_incremental(new_pending
);
7556 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7557 auto mv
= newmap
.get_min_compat_client();
7558 if (mv
> newmap
.require_min_compat_client
) {
7559 ss
<< "new crush map requires client version " << mv
7560 << " but require_min_compat_client is "
7561 << newmap
.require_min_compat_client
;
7568 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7569 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7570 stringstream features_ss
;
7571 int r
= check_cluster_features(features
, features_ss
);
7573 ss
<< "Could not change CRUSH: " << features_ss
.str();
7580 bool OSDMonitor::erasure_code_profile_in_use(
7581 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7582 const string
&profile
,
7586 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
7589 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7590 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7595 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
7600 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7601 map
<string
,string
> *erasure_code_profile_map
,
7604 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7607 erasure_code_profile_map
,
7611 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7612 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7613 map
<string
,string
> user_map
;
7614 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7615 i
!= erasure_code_profile
.end();
7617 size_t equal
= i
->find('=');
7618 if (equal
== string::npos
) {
7619 user_map
[*i
] = string();
7620 (*erasure_code_profile_map
)[*i
] = string();
7622 const string key
= i
->substr(0, equal
);
7624 const string value
= i
->substr(equal
);
7625 if (key
.find("ruleset-") == 0) {
7626 *ss
<< "property '" << key
<< "' is no longer supported; try "
7627 << "'crush-" << key
.substr(8) << "' instead";
7630 user_map
[key
] = value
;
7631 (*erasure_code_profile_map
)[key
] = value
;
7635 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7636 (*erasure_code_profile_map
) = user_map
;
7641 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7642 const string
&erasure_code_profile
,
7644 unsigned *size
, unsigned *min_size
,
7648 bool set_min_size
= false;
7649 switch (pool_type
) {
7650 case pg_pool_t::TYPE_REPLICATED
:
7651 if (osdmap
.stretch_mode_enabled
) {
7653 repl_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
7654 if (repl_size
!= g_conf().get_val
<uint64_t>("mon_stretch_pool_size")) {
7655 *ss
<< "prepare_pool_size: we are in stretch mode but size "
7656 << repl_size
<< " does not match!";
7659 *min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
7660 set_min_size
= true;
7662 if (repl_size
== 0) {
7663 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7667 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7669 case pg_pool_t::TYPE_ERASURE
:
7671 if (osdmap
.stretch_mode_enabled
) {
7672 *ss
<< "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7675 ErasureCodeInterfaceRef erasure_code
;
7676 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7678 *size
= erasure_code
->get_chunk_count();
7680 erasure_code
->get_data_chunk_count() +
7681 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7682 assert(*min_size
<= *size
);
7683 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7688 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
7695 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7696 const string
&erasure_code_profile
,
7697 uint32_t *stripe_width
,
7701 switch (pool_type
) {
7702 case pg_pool_t::TYPE_REPLICATED
:
7705 case pg_pool_t::TYPE_ERASURE
:
7707 ErasureCodeProfile profile
=
7708 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7709 ErasureCodeInterfaceRef erasure_code
;
7710 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7713 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7714 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7715 auto it
= profile
.find("stripe_unit");
7716 if (it
!= profile
.end()) {
7718 stripe_unit
= strict_iecstrtoll(it
->second
, &err_str
);
7719 ceph_assert(err_str
.empty());
7721 *stripe_width
= data_chunks
*
7722 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7726 *ss
<< "prepare_pool_stripe_width: "
7727 << pool_type
<< " is not a known pool type";
7734 int OSDMonitor::get_replicated_stretch_crush_rule()
7736 /* we don't write down the stretch rule anywhere, so
7737 * we have to guess it. How? Look at all the pools
7738 * and count up how many times a given rule is used
7739 * on stretch pools and then return the one with
7742 map
<int,int> rule_counts
;
7743 for (const auto& pooli
: osdmap
.pools
) {
7744 const pg_pool_t
& p
= pooli
.second
;
7745 if (p
.is_replicated() && p
.is_stretch_pool()) {
7746 if (!rule_counts
.count(p
.crush_rule
)) {
7747 rule_counts
[p
.crush_rule
] = 1;
7749 ++rule_counts
[p
.crush_rule
];
7754 if (rule_counts
.empty()) {
7758 int most_used_count
= 0;
7759 int most_used_rule
= -1;
7760 for (auto i
: rule_counts
) {
7761 if (i
.second
> most_used_count
) {
7762 most_used_rule
= i
.first
;
7763 most_used_count
= i
.second
;
7766 ceph_assert(most_used_count
> 0);
7767 ceph_assert(most_used_rule
>= 0);
7768 return most_used_rule
;
7771 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7772 const string
&erasure_code_profile
,
7773 const string
&rule_name
,
7778 if (*crush_rule
< 0) {
7779 switch (pool_type
) {
7780 case pg_pool_t::TYPE_REPLICATED
:
7782 if (rule_name
== "") {
7783 if (osdmap
.stretch_mode_enabled
) {
7784 *crush_rule
= get_replicated_stretch_crush_rule();
7787 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_rule(cct
);
7789 if (*crush_rule
< 0) {
7790 // Errors may happen e.g. if no valid rule is available
7791 *ss
<< "No suitable CRUSH rule exists, check "
7792 << "'osd pool default crush *' config options";
7796 return get_crush_rule(rule_name
, crush_rule
, ss
);
7800 case pg_pool_t::TYPE_ERASURE
:
7802 int err
= crush_rule_create_erasure(rule_name
,
7803 erasure_code_profile
,
7807 dout(20) << "prepare_pool_crush_rule: rule "
7808 << rule_name
<< " try again" << dendl
;
7811 // need to wait for the crush rule to be proposed before proceeding
7822 *ss
<< "prepare_pool_crush_rule: " << pool_type
7823 << " is not a known pool type";
7827 if (!osdmap
.crush
->rule_exists(*crush_rule
)) {
7828 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
7836 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7841 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7842 if (ret
!= -ENOENT
) {
7846 CrushWrapper newcrush
= _get_pending_crush();
7848 ret
= newcrush
.get_rule_id(rule_name
);
7849 if (ret
!= -ENOENT
) {
7850 // found it, wait for it to be proposed
7851 dout(20) << __func__
<< ": rule " << rule_name
7852 << " try again" << dendl
;
7855 // Cannot find it , return error
7856 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
7863 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, int crush_rule
, ostream
*ss
)
7865 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7866 uint64_t projected
= 0;
7867 unsigned osd_num
= 0;
7868 // assume min cluster size 3
7869 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u);
7872 projected
+= pg_num
* size
;
7874 if (mapping
.get_epoch() >= osdmap
.get_epoch()) {
7876 CrushWrapper newcrush
= _get_pending_crush();
7877 newcrush
.find_takes_by_rule(crush_rule
, &roots
);
7878 int max_osd
= osdmap
.get_max_osd();
7879 for (auto root
: roots
) {
7880 const char *rootname
= newcrush
.get_item_name(root
);
7882 newcrush
.get_leaves(rootname
, &osd_ids
);
7883 unsigned out_osd
= 0;
7884 for (auto id
: osd_ids
) {
7889 projected
+= mapping
.get_osd_acting_pgs(id
).size();
7891 osd_num
+= osd_ids
.size() - out_osd
;
7894 // update an existing pool's pg num
7895 const auto& pg_info
= osdmap
.get_pools().at(pool
);
7896 // already counted the pgs of this `pool` by iterating crush map, so
7897 // remove them using adding the specified pg num
7898 projected
+= pg_num
* size
;
7899 projected
-= pg_info
.get_pg_num_target() * pg_info
.get_size();
7901 num_osds
= std::max(osd_num
, 3u); // assume min cluster size 3
7903 // use pg_num target for evaluating the projected pg num
7904 for (const auto& [pool_id
, pool_info
] : osdmap
.get_pools()) {
7905 if (pool_id
== pool
) {
7906 projected
+= pg_num
* size
;
7908 projected
+= pool_info
.get_pg_num_target() * pool_info
.get_size();
7912 auto max_pgs
= max_pgs_per_osd
* num_osds
;
7913 if (projected
> max_pgs
) {
7915 *ss
<< "pool id " << pool
;
7917 *ss
<< " pg_num " << pg_num
<< " size " << size
7918 << " would mean " << projected
7919 << " total pgs, which exceeds max " << max_pgs
7920 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7921 << " * num_in_osds " << num_osds
<< ")";
7928 * @param name The name of the new pool
7929 * @param crush_rule The crush rule to use. If <0, will use the system default
7930 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7931 * @param pg_num The pg_num to use. If set to 0, will use the system default
7932 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7933 * @param pg_num_min min pg_num
7934 * @param pg_num_max max pg_num
7935 * @param repl_size Replication factor, or 0 for default
7936 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7937 * @param pool_type TYPE_ERASURE, or TYPE_REP
7938 * @param expected_num_objects expected number of objects on the pool
7939 * @param fast_read fast read type.
7940 * @param ss human readable error message, if any.
7942 * @return 0 on success, negative errno on failure.
7944 int OSDMonitor::prepare_new_pool(string
& name
,
7946 const string
&crush_rule_name
,
7947 unsigned pg_num
, unsigned pgp_num
,
7948 unsigned pg_num_min
,
7949 unsigned pg_num_max
,
7950 const uint64_t repl_size
,
7951 const uint64_t target_size_bytes
,
7952 const float target_size_ratio
,
7953 const string
&erasure_code_profile
,
7954 const unsigned pool_type
,
7955 const uint64_t expected_num_objects
,
7956 FastReadType fast_read
,
7957 const string
& pg_autoscale_mode
,
7961 if (name
.length() == 0)
7964 auto pg_num_from_mode
=
7965 [pg_num
=g_conf().get_val
<uint64_t>("osd_pool_default_pg_num")]
7966 (const string
& mode
) {
7967 return mode
== "on" ? 1 : pg_num
;
7969 pg_num
= pg_num_from_mode(
7970 pg_autoscale_mode
.empty() ?
7971 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode") :
7975 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
7978 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7979 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7980 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7981 << " (you may adjust 'mon max pool pg num' for higher values)";
7984 if (pgp_num
> pg_num
) {
7985 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7986 << ", which in this case is " << pg_num
;
7989 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
7990 *ss
<< "'fast_read' can only apply to erasure coding pool";
7994 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
7995 crush_rule_name
, &crush_rule
, ss
);
7997 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
8000 unsigned size
, min_size
;
8001 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
8002 &size
, &min_size
, ss
);
8004 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
8007 if (g_conf()->mon_osd_crush_smoke_test
) {
8008 CrushWrapper newcrush
= _get_pending_crush();
8010 CrushTester
tester(newcrush
, err
);
8011 tester
.set_min_x(0);
8012 tester
.set_max_x(50);
8013 tester
.set_rule(crush_rule
);
8014 tester
.set_num_rep(size
);
8015 auto start
= ceph::coarse_mono_clock::now();
8016 r
= tester
.test_with_fork(g_conf()->mon_lease
);
8017 auto duration
= ceph::coarse_mono_clock::now() - start
;
8019 dout(10) << "tester.test_with_fork returns " << r
8020 << ": " << err
.str() << dendl
;
8021 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
8024 dout(10) << __func__
<< " crush smoke test duration: "
8025 << duration
<< dendl
;
8027 r
= check_pg_num(-1, pg_num
, size
, crush_rule
, ss
);
8029 dout(10) << "check_pg_num returns " << r
<< dendl
;
8033 if (osdmap
.crush
->get_rule_type(crush_rule
) != (int)pool_type
) {
8034 *ss
<< "crush rule " << crush_rule
<< " type does not match pool";
8038 uint32_t stripe_width
= 0;
8039 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
8041 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
8046 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8047 switch (fast_read
) {
8054 case FAST_READ_DEFAULT
:
8055 fread
= g_conf()->osd_pool_default_ec_fast_read
;
8058 *ss
<< "invalid fast_read setting: " << fast_read
;
8063 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
8064 p
!= pending_inc
.new_pool_names
.end();
8066 if (p
->second
== name
)
8070 if (-1 == pending_inc
.new_pool_max
)
8071 pending_inc
.new_pool_max
= osdmap
.pool_max
;
8072 int64_t pool
= ++pending_inc
.new_pool_max
;
8074 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
8075 pi
->create_time
= ceph_clock_now();
8076 pi
->type
= pool_type
;
8077 pi
->fast_read
= fread
;
8078 pi
->flags
= g_conf()->osd_pool_default_flags
;
8080 pi
->set_flag(pg_pool_t::FLAG_BULK
);
8081 } else if (g_conf()->osd_pool_default_flag_bulk
) {
8082 pi
->set_flag(pg_pool_t::FLAG_BULK
);
8084 if (g_conf()->osd_pool_default_flag_hashpspool
)
8085 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
8086 if (g_conf()->osd_pool_default_flag_nodelete
)
8087 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
8088 if (g_conf()->osd_pool_default_flag_nopgchange
)
8089 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
8090 if (g_conf()->osd_pool_default_flag_nosizechange
)
8091 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
8092 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
8093 if (g_conf()->osd_pool_use_gmt_hitset
)
8094 pi
->use_gmt_hitset
= true;
8096 pi
->use_gmt_hitset
= false;
8099 pi
->min_size
= min_size
;
8100 pi
->crush_rule
= crush_rule
;
8101 pi
->expected_num_objects
= expected_num_objects
;
8102 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
8103 if (osdmap
.stretch_mode_enabled
) {
8104 pi
->peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
8105 pi
->peering_crush_bucket_target
= osdmap
.stretch_bucket_count
;
8106 pi
->peering_crush_bucket_barrier
= osdmap
.stretch_mode_bucket
;
8107 pi
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
8108 if (osdmap
.degraded_stretch_mode
) {
8109 pi
->peering_crush_bucket_count
= osdmap
.degraded_stretch_mode
;
8110 pi
->peering_crush_bucket_target
= osdmap
.degraded_stretch_mode
;
8111 // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
8112 // TODO: drat, we don't record this ^ anywhere, though given that it
8113 // necessarily won't exist elsewhere it likely doesn't matter
8114 pi
->min_size
= pi
->min_size
/ 2;
8115 pi
->size
= pi
->size
/ 2; // only support 2 zones now
8119 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8120 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
8121 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8122 pi
->pg_autoscale_mode
= m
;
8124 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
8126 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
8128 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
8130 pi
->set_pg_num_pending(pi
->get_pg_num());
8131 pi
->set_pg_num_target(pg_num
);
8132 pi
->set_pgp_num(pi
->get_pg_num());
8133 pi
->set_pgp_num_target(pgp_num
);
8134 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8136 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
8138 if (osdmap
.require_osd_release
>= ceph_release_t::quincy
&&
8140 pi
->opts
.set(pool_opts_t::PG_NUM_MAX
, static_cast<int64_t>(pg_num_max
));
8142 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8143 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8144 pi
->pg_autoscale_mode
= m
;
8147 pi
->last_change
= pending_inc
.epoch
;
8150 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8151 pi
->erasure_code_profile
= erasure_code_profile
;
8153 pi
->erasure_code_profile
= "";
8155 pi
->stripe_width
= stripe_width
;
8157 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8158 target_size_bytes
) {
8159 // only store for nautilus+ because TARGET_SIZE_BYTES may be
8160 // larger than int32_t max.
8161 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
8163 if (target_size_ratio
> 0.0 &&
8164 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
8165 // only store for nautilus+, just to be consistent and tidy.
8166 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
8169 pi
->cache_target_dirty_ratio_micro
=
8170 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
8171 pi
->cache_target_dirty_high_ratio_micro
=
8172 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
8173 pi
->cache_target_full_ratio_micro
=
8174 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
8175 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
8176 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
8178 pending_inc
.new_pool_names
[pool
] = name
;
8182 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
8184 op
->mark_osdmon_event(__func__
);
8186 if (pending_inc
.new_flags
< 0)
8187 pending_inc
.new_flags
= osdmap
.get_flags();
8188 pending_inc
.new_flags
|= flag
;
8189 ss
<< OSDMap::get_flag_string(flag
) << " is set";
8190 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8191 get_last_committed() + 1));
8195 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
8197 op
->mark_osdmon_event(__func__
);
8199 if (pending_inc
.new_flags
< 0)
8200 pending_inc
.new_flags
= osdmap
.get_flags();
8201 pending_inc
.new_flags
&= ~flag
;
8202 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
8203 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8204 get_last_committed() + 1));
8208 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
8212 cmd_getval(cmdmap
, "pool", poolstr
);
8213 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
8215 ss
<< "unrecognized pool '" << poolstr
<< "'";
8219 cmd_getval(cmdmap
, "var", var
);
8221 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8222 if (pending_inc
.new_pools
.count(pool
))
8223 p
= pending_inc
.new_pools
[pool
];
8225 // accept val as a json string in the normal case (current
8226 // generation monitor). parse out int or float values from the
8227 // string as needed. however, if it is not a string, try to pull
8228 // out an int, in case an older monitor with an older json schema is
8229 // forwarding a request.
8231 string interr
, floaterr
;
8234 int64_t uf
= 0; // micro-f
8235 cmd_getval(cmdmap
, "val", val
);
8238 "target_max_objects"
8240 auto iec_options
= {
8242 "target_size_bytes",
8243 "compression_max_blob_size",
8244 "compression_min_blob_size",
8248 if (count(begin(si_options
), end(si_options
), var
)) {
8249 n
= strict_si_cast
<int64_t>(val
, &interr
);
8250 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
8251 n
= strict_iec_cast
<int64_t>(val
, &interr
);
8253 // parse string as both int and float; different fields use different types.
8254 n
= strict_strtoll(val
.c_str(), 10, &interr
);
8255 f
= strict_strtod(val
.c_str(), &floaterr
);
8256 uf
= llrintl(f
* (double)1000000.0);
8260 (var
== "hit_set_type" || var
== "hit_set_period" ||
8261 var
== "hit_set_count" || var
== "hit_set_fpp" ||
8262 var
== "target_max_objects" || var
== "target_max_bytes" ||
8263 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
8264 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
8265 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
8266 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
8267 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
8271 if (var
== "size") {
8272 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8273 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
8276 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
8277 ss
<< "can not change the size of an erasure-coded pool";
8280 if (interr
.length()) {
8281 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8284 if (n
<= 0 || n
> 10) {
8285 ss
<< "pool size must be between 1 and 10";
8289 if (!g_conf().get_val
<bool>("mon_allow_pool_size_one")) {
8290 ss
<< "configuring pool size as 1 is disabled by default.";
8294 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
8295 if (!sure
) { ss
<< "WARNING: setting pool size 1 could lead to data loss "
8296 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8297 "pass the flag --yes-i-really-mean-it.";
8301 if (osdmap
.crush
->get_rule_type(p
.get_crush_rule()) != (int)p
.type
) {
8302 ss
<< "crush rule " << p
.get_crush_rule() << " type does not match pool";
8305 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, p
.get_crush_rule(), &ss
);
8310 p
.min_size
= g_conf().get_osd_pool_default_min_size(p
.size
);
8311 } else if (var
== "min_size") {
8312 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8313 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8316 if (interr
.length()) {
8317 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8321 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
8322 if (n
< 1 || n
> p
.size
) {
8323 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
8327 ErasureCodeInterfaceRef erasure_code
;
8330 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
8332 k
= erasure_code
->get_data_chunk_count();
8334 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
8338 if (n
< k
|| n
> p
.size
) {
8339 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
8344 } else if (var
== "pg_num_actual") {
8345 if (interr
.length()) {
8346 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8349 if (n
== (int)p
.get_pg_num()) {
8352 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8353 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8354 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8355 << " (you may adjust 'mon max pool pg num' for higher values)";
8358 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8359 ss
<< "cannot adjust pg_num while initial PGs are being created";
8362 if (n
> (int)p
.get_pg_num()) {
8363 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
8364 // force pre-nautilus clients to resend their ops, since they
8365 // don't understand pg_num_pending changes form a new interval
8366 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8370 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8371 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8374 if (n
< (int)p
.get_pgp_num()) {
8375 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8378 if (n
< (int)p
.get_pg_num() - 1) {
8379 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8380 << ") - 1; only single pg decrease is currently supported";
8383 p
.set_pg_num_pending(n
);
8384 // force pre-nautilus clients to resend their ops, since they
8385 // don't understand pg_num_pending changes form a new interval
8386 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8388 // force pre-luminous clients to resend their ops, since they
8389 // don't understand that split PGs now form a new interval.
8390 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8391 } else if (var
== "pg_num") {
8392 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8393 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8396 if (interr
.length()) {
8397 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8400 if (n
== (int)p
.get_pg_num_target()) {
8403 if (n
<= 0 || static_cast<uint64_t>(n
) >
8404 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8405 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8406 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8407 << " (you may adjust 'mon max pool pg num' for higher values)";
8410 if (n
> (int)p
.get_pg_num_target()) {
8411 int r
= check_pg_num(pool
, n
, p
.get_size(), p
.get_crush_rule(), &ss
);
8416 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8417 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8418 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8422 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8423 ss
<< "nautilus OSDs are required to decrease pg_num";
8427 int64_t pg_min
= 0, pg_max
= 0;
8428 p
.opts
.get(pool_opts_t::PG_NUM_MIN
, &pg_min
);
8429 p
.opts
.get(pool_opts_t::PG_NUM_MAX
, &pg_max
);
8430 if (pg_min
&& n
< pg_min
) {
8431 ss
<< "specified pg_num " << n
8432 << " < pg_num_min " << pg_min
;
8435 if (pg_max
&& n
> pg_max
) {
8436 ss
<< "specified pg_num " << n
8437 << " < pg_num_max " << pg_max
;
8440 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8441 // pre-nautilus osdmap format; increase pg_num directly
8442 assert(n
> (int)p
.get_pg_num());
8443 // force pre-nautilus clients to resend their ops, since they
8444 // don't understand pg_num_target changes form a new interval
8445 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8446 // force pre-luminous clients to resend their ops, since they
8447 // don't understand that split PGs now form a new interval.
8448 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8451 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8452 // make pgp_num track pg_num if it already matches. if it is set
8453 // differently, leave it different and let the user control it
8455 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8456 p
.set_pgp_num_target(n
);
8458 p
.set_pg_num_target(n
);
8460 } else if (var
== "pgp_num_actual") {
8461 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8462 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8465 if (interr
.length()) {
8466 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8470 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8473 if (n
> (int)p
.get_pg_num()) {
8474 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8477 if (n
> (int)p
.get_pg_num_pending()) {
8478 ss
<< "specified pgp_num " << n
8479 << " > pg_num_pending " << p
.get_pg_num_pending();
8483 } else if (var
== "pgp_num") {
8484 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8485 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8488 if (interr
.length()) {
8489 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8493 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8496 if (n
> (int)p
.get_pg_num_target()) {
8497 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8500 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8501 // pre-nautilus osdmap format; increase pgp_num directly
8504 p
.set_pgp_num_target(n
);
8506 } else if (var
== "pg_autoscale_mode") {
8507 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8508 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8509 ss
<< "specified invalid mode " << val
;
8512 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8513 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8516 p
.pg_autoscale_mode
= m
;
8517 } else if (var
== "crush_rule") {
8518 int id
= osdmap
.crush
->get_rule_id(val
);
8519 if (id
== -ENOENT
) {
8520 ss
<< "crush rule " << val
<< " does not exist";
8524 ss
<< cpp_strerror(id
);
8527 if (osdmap
.crush
->get_rule_type(id
) != (int)p
.get_type()) {
8528 ss
<< "crush rule " << id
<< " type does not match pool";
8532 } else if (var
== "nodelete" || var
== "nopgchange" ||
8533 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8534 var
== "noscrub" || var
== "nodeep-scrub" || var
== "bulk") {
8535 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8536 // make sure we only compare against 'n' if we didn't receive a string
8537 if (val
== "true" || (interr
.empty() && n
== 1)) {
8539 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8542 ss
<< "expecting value 'true', 'false', '0', or '1'";
8545 } else if (var
== "eio") {
8546 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8548 // make sure we only compare against 'n' if we didn't receive a string
8549 if (val
== "true" || (interr
.empty() && n
== 1)) {
8551 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8554 ss
<< "expecting value 'true', 'false', '0', or '1'";
8557 } else if (var
== "hashpspool") {
8558 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8560 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8563 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8564 " this triggers large data movement,"
8565 " pass --yes-i-really-mean-it if you really do.";
8568 // make sure we only compare against 'n' if we didn't receive a string
8569 if (val
== "true" || (interr
.empty() && n
== 1)) {
8571 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8574 ss
<< "expecting value 'true', 'false', '0', or '1'";
8577 } else if (var
== "hit_set_type") {
8579 p
.hit_set_params
= HitSet::Params();
8581 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8584 if (val
== "bloom") {
8585 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8586 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8587 p
.hit_set_params
= HitSet::Params(bsp
);
8588 } else if (val
== "explicit_hash")
8589 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8590 else if (val
== "explicit_object")
8591 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8593 ss
<< "unrecognized hit_set type '" << val
<< "'";
8597 } else if (var
== "hit_set_period") {
8598 if (interr
.length()) {
8599 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8602 ss
<< "hit_set_period should be non-negative";
8605 p
.hit_set_period
= n
;
8606 } else if (var
== "hit_set_count") {
8607 if (interr
.length()) {
8608 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8611 ss
<< "hit_set_count should be non-negative";
8614 p
.hit_set_count
= n
;
8615 } else if (var
== "hit_set_fpp") {
8616 if (floaterr
.length()) {
8617 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8619 } else if (f
< 0 || f
> 1.0) {
8620 ss
<< "hit_set_fpp should be in the range 0..1";
8623 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8624 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8627 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
8629 } else if (var
== "use_gmt_hitset") {
8630 if (val
== "true" || (interr
.empty() && n
== 1)) {
8631 p
.use_gmt_hitset
= true;
8633 ss
<< "expecting value 'true' or '1'";
8636 } else if (var
== "allow_ec_overwrites") {
8637 if (!p
.is_erasure()) {
8638 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8642 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8643 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8644 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8647 if (val
== "true" || (interr
.empty() && n
== 1)) {
8648 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8649 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8650 ss
<< "ec overwrites cannot be disabled once enabled";
8653 ss
<< "expecting value 'true', 'false', '0', or '1'";
8656 } else if (var
== "target_max_objects") {
8657 if (interr
.length()) {
8658 ss
<< "error parsing int '" << val
<< "': " << interr
;
8661 p
.target_max_objects
= n
;
8662 } else if (var
== "target_max_bytes") {
8663 if (interr
.length()) {
8664 ss
<< "error parsing int '" << val
<< "': " << interr
;
8667 p
.target_max_bytes
= n
;
8668 } else if (var
== "cache_target_dirty_ratio") {
8669 if (floaterr
.length()) {
8670 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8673 if (f
< 0 || f
> 1.0) {
8674 ss
<< "value must be in the range 0..1";
8677 p
.cache_target_dirty_ratio_micro
= uf
;
8678 } else if (var
== "cache_target_dirty_high_ratio") {
8679 if (floaterr
.length()) {
8680 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8683 if (f
< 0 || f
> 1.0) {
8684 ss
<< "value must be in the range 0..1";
8687 p
.cache_target_dirty_high_ratio_micro
= uf
;
8688 } else if (var
== "cache_target_full_ratio") {
8689 if (floaterr
.length()) {
8690 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8693 if (f
< 0 || f
> 1.0) {
8694 ss
<< "value must be in the range 0..1";
8697 p
.cache_target_full_ratio_micro
= uf
;
8698 } else if (var
== "cache_min_flush_age") {
8699 if (interr
.length()) {
8700 ss
<< "error parsing int '" << val
<< "': " << interr
;
8703 p
.cache_min_flush_age
= n
;
8704 } else if (var
== "cache_min_evict_age") {
8705 if (interr
.length()) {
8706 ss
<< "error parsing int '" << val
<< "': " << interr
;
8709 p
.cache_min_evict_age
= n
;
8710 } else if (var
== "min_read_recency_for_promote") {
8711 if (interr
.length()) {
8712 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8715 p
.min_read_recency_for_promote
= n
;
8716 } else if (var
== "hit_set_grade_decay_rate") {
8717 if (interr
.length()) {
8718 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8721 if (n
> 100 || n
< 0) {
8722 ss
<< "value out of range,valid range is 0 - 100";
8725 p
.hit_set_grade_decay_rate
= n
;
8726 } else if (var
== "hit_set_search_last_n") {
8727 if (interr
.length()) {
8728 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8731 if (n
> p
.hit_set_count
|| n
< 0) {
8732 ss
<< "value out of range,valid range is 0 - hit_set_count";
8735 p
.hit_set_search_last_n
= n
;
8736 } else if (var
== "min_write_recency_for_promote") {
8737 if (interr
.length()) {
8738 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8741 p
.min_write_recency_for_promote
= n
;
8742 } else if (var
== "fast_read") {
8743 if (p
.is_replicated()) {
8744 ss
<< "fast read is not supported in replication pool";
8747 if (val
== "true" || (interr
.empty() && n
== 1)) {
8749 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8750 p
.fast_read
= false;
8752 ss
<< "expecting value 'true', 'false', '0', or '1'";
8755 } else if (pool_opts_t::is_opt_name(var
)) {
8756 bool unset
= val
== "unset";
8757 if (var
== "compression_mode") {
8759 auto cmode
= Compressor::get_comp_mode_type(val
);
8761 ss
<< "unrecognized compression mode '" << val
<< "'";
8765 } else if (var
== "compression_algorithm") {
8767 auto alg
= Compressor::get_comp_alg_type(val
);
8769 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8773 } else if (var
== "compression_required_ratio") {
8774 if (floaterr
.length()) {
8775 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8778 if (f
< 0 || f
> 1) {
8779 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8782 } else if (var
== "csum_type") {
8783 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8785 ss
<< "unrecognized csum_type '" << val
<< "'";
8788 //preserve csum_type numeric value
8791 } else if (var
== "compression_max_blob_size" ||
8792 var
== "compression_min_blob_size" ||
8793 var
== "csum_max_block" ||
8794 var
== "csum_min_block") {
8795 if (interr
.length()) {
8796 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8799 } else if (var
== "fingerprint_algorithm") {
8801 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8803 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8807 } else if (var
== "target_size_bytes") {
8808 if (interr
.length()) {
8809 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8812 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8813 ss
<< "must set require_osd_release to nautilus or "
8814 << "later before setting target_size_bytes";
8817 } else if (var
== "pg_num_min") {
8818 if (interr
.length()) {
8819 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8822 if (n
> (int)p
.get_pg_num_target()) {
8823 ss
<< "specified pg_num_min " << n
8824 << " > pg_num " << p
.get_pg_num_target();
8827 } else if (var
== "pg_num_max") {
8828 if (interr
.length()) {
8829 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8832 if (n
&& n
< (int)p
.get_pg_num_target()) {
8833 ss
<< "specified pg_num_max " << n
8834 << " < pg_num " << p
.get_pg_num_target();
8837 } else if (var
== "recovery_priority") {
8838 if (interr
.length()) {
8839 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8842 if (!g_conf()->debug_allow_any_pool_priority
) {
8843 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8844 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8845 << " and " << OSD_POOL_PRIORITY_MAX
;
8849 } else if (var
== "pg_autoscale_bias") {
8850 if (f
< 0.0 || f
> 1000.0) {
8851 ss
<< "pg_autoscale_bias must be between 0 and 1000";
8854 } else if (var
== "dedup_tier") {
8855 if (interr
.empty()) {
8856 ss
<< "expecting value 'pool name'";
8859 // Current base tier in dedup does not support ec pool
8860 if (p
.is_erasure()) {
8861 ss
<< "pool '" << poolstr
8862 << "' is an ec pool, which cannot be a base tier";
8865 int64_t lowtierpool_id
= osdmap
.lookup_pg_pool_name(val
);
8866 if (lowtierpool_id
< 0) {
8867 ss
<< "unrecognized pool '" << val
<< "'";
8870 const pg_pool_t
*tp
= osdmap
.get_pg_pool(lowtierpool_id
);
8873 // The original input is string (pool name), but we convert it to int64_t.
8876 } else if (var
== "dedup_chunk_algorithm") {
8878 auto alg
= pg_pool_t::get_dedup_chunk_algorithm_from_str(val
);
8880 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8884 } else if (var
== "dedup_cdc_chunk_size") {
8885 if (interr
.length()) {
8886 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8891 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8892 switch (desc
.type
) {
8893 case pool_opts_t::STR
:
8895 p
.opts
.unset(desc
.key
);
8897 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
8900 case pool_opts_t::INT
:
8901 if (interr
.length()) {
8902 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8906 p
.opts
.unset(desc
.key
);
8908 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
8911 case pool_opts_t::DOUBLE
:
8912 if (floaterr
.length()) {
8913 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8917 p
.opts
.unset(desc
.key
);
8919 p
.opts
.set(desc
.key
, static_cast<double>(f
));
8923 ceph_assert(!"unknown type");
8926 ss
<< "unrecognized variable '" << var
<< "'";
8929 if (val
!= "unset") {
8930 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
8932 ss
<< "unset pool " << pool
<< " " << var
;
8934 p
.last_change
= pending_inc
.epoch
;
8935 pending_inc
.new_pools
[pool
] = p
;
8939 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
8940 const cmdmap_t
& cmdmap
,
8943 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
8946 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
8947 const cmdmap_t
& cmdmap
,
8951 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
8956 * Common logic for preprocess and prepare phases of pool application
8957 * tag commands. In preprocess mode we're only detecting invalid
8958 * commands, and determining whether it was a modification or a no-op.
8959 * In prepare mode we're actually updating the pending state.
8961 int OSDMonitor::_command_pool_application(const string
&prefix
,
8962 const cmdmap_t
& cmdmap
,
8968 cmd_getval(cmdmap
, "pool", pool_name
);
8969 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
8971 ss
<< "unrecognized pool '" << pool_name
<< "'";
8975 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8977 if (pending_inc
.new_pools
.count(pool
)) {
8978 p
= pending_inc
.new_pools
[pool
];
8983 cmd_getval(cmdmap
, "app", app
);
8984 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
8987 cmd_getval(cmdmap
, "key", key
);
8989 ss
<< "key cannot be 'all'";
8994 cmd_getval(cmdmap
, "value", value
);
8995 if (value
== "all") {
8996 ss
<< "value cannot be 'all'";
9000 if (boost::algorithm::ends_with(prefix
, "enable")) {
9002 ss
<< "application name must be provided";
9007 ss
<< "application must be enabled on base tier";
9012 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
9014 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
9015 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
9016 << "application; pass --yes-i-really-mean-it to proceed anyway";
9020 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
9021 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
9022 << "max " << MAX_POOL_APPLICATIONS
;
9026 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9027 ss
<< "application name '" << app
<< "' too long; max length "
9028 << MAX_POOL_APPLICATION_LENGTH
;
9033 p
.application_metadata
[app
] = {};
9035 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
9037 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
9039 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
9042 ss
<< "Are you SURE? Disabling an application within a pool might result "
9043 << "in loss of application functionality; pass "
9044 << "--yes-i-really-mean-it to proceed anyway";
9049 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9051 return 0; // idempotent
9054 p
.application_metadata
.erase(app
);
9055 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
9057 } else if (boost::algorithm::ends_with(prefix
, "set")) {
9059 ss
<< "application metadata must be set on base tier";
9064 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9070 cmd_getval(cmdmap
, "key", key
);
9073 ss
<< "key must be provided";
9077 auto &app_keys
= p
.application_metadata
[app
];
9078 if (app_keys
.count(key
) == 0 &&
9079 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
9080 ss
<< "too many keys set for application '" << app
<< "' on pool '"
9081 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
9085 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9086 ss
<< "key '" << app
<< "' too long; max length "
9087 << MAX_POOL_APPLICATION_LENGTH
;
9092 cmd_getval(cmdmap
, "value", value
);
9093 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9094 ss
<< "value '" << value
<< "' too long; max length "
9095 << MAX_POOL_APPLICATION_LENGTH
;
9099 p
.application_metadata
[app
][key
] = value
;
9100 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
9101 << value
<< "' on pool '" << pool_name
<< "'";
9102 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
9104 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9110 cmd_getval(cmdmap
, "key", key
);
9111 auto it
= p
.application_metadata
[app
].find(key
);
9112 if (it
== p
.application_metadata
[app
].end()) {
9113 ss
<< "application '" << app
<< "' on pool '" << pool_name
9114 << "' does not have key '" << key
<< "'";
9115 return 0; // idempotent
9118 p
.application_metadata
[app
].erase(it
);
9119 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
9120 << pool_name
<< "'";
9126 p
.last_change
= pending_inc
.epoch
;
9127 pending_inc
.new_pools
[pool
] = p
;
9130 // Because we fell through this far, we didn't hit no-op cases,
9131 // so pool was definitely modified
9132 if (modified
!= nullptr) {
// Remove or unlink an item from the supplied CRUSH map copy.
// The visible calls show two paths: remove_item_under(cct, id, ancestor, ...)
// is used when restricting removal to beneath a given ancestor, and
// remove_item(cct, id, unlink_only) otherwise.
// NOTE(review): this chunk is a lossy extraction -- original lines 9141-9148
// and 9150-9156 (full parameter list, branch condition, return) are missing,
// so the exact signature and control flow cannot be confirmed from here.
9139 int OSDMonitor::_prepare_command_osd_crush_remove(
9140 CrushWrapper
&newcrush
,
9149 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
9152 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
// Stage a modified CRUSH map into the pending incremental: discard any
// previously staged encoding (pending_inc.crush.clear()) and re-encode the
// supplied map with the current quorum connection features so every monitor
// in the quorum can decode it.
9157 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
9159 pending_inc
.crush
.clear();
9160 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
// Convenience wrapper: run _prepare_command_osd_crush_remove() on the given
// CRUSH copy and, once success is asserted, stage the updated map via
// do_osd_crush_remove().
// NOTE(review): lossy extraction -- original lines 9165-9169 and 9173-9176
// (remaining parameters and error/early-return handling) are not visible.
9163 int OSDMonitor::prepare_command_osd_crush_remove(
9164 CrushWrapper
&newcrush
,
9170 int err
= _prepare_command_osd_crush_remove(
9171 newcrush
, id
, ancestor
,
9172 has_ancestor
, unlink_only
);
9177 ceph_assert(err
== 0);
9178 do_osd_crush_remove(newcrush
);
// Stage removal of an OSD from the osdmap. The visible is_up(id) check
// guards the operation (its early-return body is among the lines missing
// from this extraction); on the removal path the pending incremental gets
// the osd's current state bits (toggling EXISTS on apply), a zeroed uuid,
// and the osd's mon-store metadata is queued for deletion.
9183 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
9185 if (osdmap
.is_up(id
)) {
9189 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
9190 pending_inc
.new_uuid
[id
] = uuid_d();
9191 pending_metadata_rm
.insert(id
);
9192 pending_metadata
.erase(id
);
// Find an id for a new OSD. First scan [0, max_osd) for a reusable slot
// that neither exists in the committed map nor is being created in the
// pending incremental (no new_up_client entry, and no pending state or a
// pending state without CEPH_OSD_EXISTS); a hit is reported via
// *existing_id (the return on that path is among the lines missing from
// this extraction). Otherwise allocate past the end of the map: the
// committed max_osd when no new_max_osd is pending, else pending
// new_max_osd.
9197 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
9199 ceph_assert(existing_id
);
9202 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
9203 if (!osdmap
.exists(i
) &&
9204 pending_inc
.new_up_client
.count(i
) == 0 &&
9205 (pending_inc
.new_state
.count(i
) == 0 ||
9206 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
9212 if (pending_inc
.new_max_osd
< 0) {
9213 return osdmap
.get_max_osd();
9215 return pending_inc
.new_max_osd
;
// Create (or re-identify) an OSD. Resolves the target id in order: an
// existing osd matching the uuid; the explicitly provided id; or a slot
// from _allocate_osd_id() (reusing a free slot, else growing new_max_osd).
// If a device_class is given, it is applied to a pending CRUSH copy and
// the copy is staged into pending_inc.crush. Finally max_osd is grown if
// needed and weight (CEPH_OSD_IN), the NEW state bit, and the uuid are
// staged in the pending incremental. The final id is returned via *new_id.
// NOTE(review): lossy extraction -- original lines such as 9219-9220,
// 9240-9245 and 9266-9268 are missing, so the full parameter list and some
// branch/brace structure are not visible here.
9218 void OSDMonitor::do_osd_create(
9221 const string
& device_class
,
9224 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
9225 ceph_assert(new_id
);
9227 // We presume validation has been performed prior to calling this
9228 // function. We assert with prejudice.
9230 int32_t allocated_id
= -1; // declare here so we can jump
9231 int32_t existing_id
= -1;
9232 if (!uuid
.is_zero()) {
9233 existing_id
= osdmap
.identify_osd(uuid
);
9234 if (existing_id
>= 0) {
9235 ceph_assert(id
< 0 || id
== existing_id
);
9236 *new_id
= existing_id
;
9238 } else if (id
>= 0) {
9239 // uuid does not exist, and id has been provided, so just create
9246 // allocate a new id
9247 allocated_id
= _allocate_osd_id(&existing_id
);
9248 dout(10) << __func__
<< " allocated id " << allocated_id
9249 << " existing id " << existing_id
<< dendl
;
9250 if (existing_id
>= 0) {
9251 ceph_assert(existing_id
< osdmap
.get_max_osd());
9252 ceph_assert(allocated_id
< 0);
9253 *new_id
= existing_id
;
9254 } else if (allocated_id
>= 0) {
9255 ceph_assert(existing_id
< 0);
9257 if (pending_inc
.new_max_osd
< 0) {
9258 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
9260 ++pending_inc
.new_max_osd
;
9262 *new_id
= pending_inc
.new_max_osd
- 1;
9263 ceph_assert(*new_id
== allocated_id
);
9265 ceph_abort_msg("unexpected condition");
9269 if (device_class
.size()) {
9270 CrushWrapper newcrush
= _get_pending_crush();
9271 if (newcrush
.get_max_devices() < *new_id
+ 1) {
9272 newcrush
.set_max_devices(*new_id
+ 1);
9274 string name
= string("osd.") + stringify(*new_id
);
9275 if (!newcrush
.item_exists(*new_id
)) {
9276 newcrush
.set_item_name(*new_id
, name
);
9279 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
9281 derr
<< __func__
<< " failed to set " << name
<< " device_class "
9282 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
9284 // non-fatal... this might be a replay and we want to be idempotent.
9286 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
9288 pending_inc
.crush
.clear();
9289 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9292 dout(20) << __func__
<< " no device_class" << dendl
;
9295 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
9296 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
9297 pending_inc
.new_max_osd
= *new_id
+ 1;
9300 pending_inc
.new_weight
[*new_id
] = CEPH_OSD_IN
;
9301 // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
9302 // set it for us. (ugh.)
9303 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_NEW
;
9304 if (!uuid
.is_zero())
9305 pending_inc
.new_uuid
[*new_id
] = uuid
;
// Validate an `osd create` / `osd new` request against both the committed
// osdmap and the pending incremental. A zero uuid short-circuits: either
// nothing to validate (no id either) or legacy id-only `osd create`.
// With a uuid: detect a creation already pending for that uuid; an osd
// already committed with that uuid (an explicit, mismatching id is an
// error; otherwise a positive errno signals "exists -- idempotent"); a
// pending state change for the id; and finally, when check_osd_exists is
// set, an id already in use under a different uuid.
// NOTE(review): lossy extraction -- the early returns and several braces
// are among the missing original lines.
9308 int OSDMonitor::validate_osd_create(
9311 const bool check_osd_exists
,
9312 int32_t* existing_id
,
9316 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
9317 << " check_osd_exists " << check_osd_exists
<< dendl
;
9319 ceph_assert(existing_id
);
9321 if (id
< 0 && uuid
.is_zero()) {
9322 // we have nothing to validate
9325 } else if (uuid
.is_zero()) {
9326 // we have an id but we will ignore it - because that's what
9327 // `osd create` does.
9332 * This function will be used to validate whether we are able to
9333 * create a new osd when the `uuid` is specified.
9335 * It will be used by both `osd create` and `osd new`, as the checks
9336 * are basically the same when it pertains to osd id and uuid validation.
9337 * However, `osd create` presumes an `uuid` is optional, for legacy
9338 * reasons, while `osd new` requires the `uuid` to be provided. This
9339 * means that `osd create` will not be idempotent if an `uuid` is not
9340 * provided, but we will always guarantee the idempotency of `osd new`.
9343 ceph_assert(!uuid
.is_zero());
9344 if (pending_inc
.identify_osd(uuid
) >= 0) {
9345 // osd is about to exist
9349 int32_t i
= osdmap
.identify_osd(uuid
);
9351 // osd already exists
9352 if (id
>= 0 && i
!= id
) {
9353 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
9356 // return a positive errno to distinguish between a blocking error
9357 // and an error we consider to not be a problem (i.e., this would be
9358 // an idempotent operation).
9364 if (pending_inc
.new_state
.count(id
)) {
9365 // osd is about to exist
9368 // we may not care if an osd exists if we are recreating a previously
9370 if (check_osd_exists
&& osdmap
.exists(id
)) {
9371 ss
<< "id " << id
<< " already in use and does not match uuid "
// Entry point for the legacy `ceph osd create` command: reject ids that
// were explicitly destroyed (steering users to `ceph osd new` via the
// deprecation message), accept a zero uuid as the legacy behavior, and
// defer the remaining checks to validate_osd_create() with
// check_osd_exists = true.
9379 int OSDMonitor::prepare_command_osd_create(
9382 int32_t* existing_id
,
9385 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9386 ceph_assert(existing_id
);
9387 if (osdmap
.is_destroyed(id
)) {
9388 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
9393 if (uuid
.is_zero()) {
9394 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
9397 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
// Handle `ceph osd new`: validate the (required) uuid and optional id,
// decide between creating a brand-new osd, recreating a previously
// destroyed one, or recognizing an idempotent replay; validate the
// cephx / lockbox / dm-crypt secrets against the auth monitor and the KV
// monitor; then stage the osdmap changes -- the recreate path flips the
// DESTROYED/NEW/UP state bits and the uuid in place, while the fresh path
// goes through do_osd_create() -- and finally dump the resulting osdid
// through the formatter. Requires paxos to be plugged (asserted) so the
// caller can propose across services atomically.
// NOTE(review): lossy extraction -- many original lines (error returns,
// braces, some call arguments) are missing; the comments here describe
// only what the visible code shows.
9400 int OSDMonitor::prepare_command_osd_new(
9402 const cmdmap_t
& cmdmap
,
9403 const map
<string
,string
>& params
,
9411 ceph_assert(paxos
.is_plugged());
9413 dout(10) << __func__
<< " " << op
<< dendl
;
9415 /* validate command. abort now if something's wrong. */
9417 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9419 * If `id` is not specified, we will identify any existing osd based
9420 * on `uuid`. Operation will be idempotent iff secrets match.
9422 * If `id` is specified, we will identify any existing osd based on
9423 * `uuid` and match against `id`. If they match, operation will be
9424 * idempotent iff secrets match.
9426 * `-i secrets.json` will be optional. If supplied, will be used
9427 * to check for idempotency when `id` and `uuid` match.
9429 * If `id` is not specified, and `uuid` does not exist, an id will
9430 * be found or allocated for the osd.
9432 * If `id` is specified, and the osd has been previously marked
9433 * as destroyed, then the `id` will be reused.
9435 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
9436 ss
<< "requires the OSD's UUID to be specified.";
9438 } else if (!uuid
.parse(uuidstr
.c_str())) {
9439 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9443 if (cmd_getval(cmdmap
, "id", id
) &&
9445 ss
<< "invalid OSD id; must be greater or equal than zero.";
9449 // are we running an `osd create`-like command, or recreating
9450 // a previously destroyed osd?
9452 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9454 // we will care about `id` to assess whether osd is `destroyed`, or
9455 // to create a new osd.
9456 // we will need an `id` by the time we reach auth.
9458 int32_t existing_id
= -1;
9459 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9462 bool may_be_idempotent
= false;
9463 if (err
== EEXIST
) {
9464 // this is idempotent from the osdmon's point-of-view
9465 may_be_idempotent
= true;
9466 ceph_assert(existing_id
>= 0);
9468 } else if (err
< 0) {
9472 if (!may_be_idempotent
) {
9473 // idempotency is out of the window. We are either creating a new
9474 // osd or recreating a destroyed osd.
9476 // We now need to figure out if we have an `id` (and if it's valid),
9477 // of find an `id` if we don't have one.
9479 // NOTE: we need to consider the case where the `id` is specified for
9480 // `osd create`, and we must honor it. So this means checking if
9481 // the `id` is destroyed, and if so assume the destroy; otherwise,
9482 // check if it `exists` - in which case we complain about not being
9483 // `destroyed`. In the end, if nothing fails, we must allow the
9484 // creation, so that we are compatible with `create`.
9485 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9486 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9487 ss
<< "OSD " << id
<< " has not yet been destroyed";
9489 } else if (id
< 0) {
9491 id
= _allocate_osd_id(&existing_id
);
9493 ceph_assert(existing_id
>= 0);
9496 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9497 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9498 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9500 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9503 ceph_assert(id
>= 0);
9504 ceph_assert(osdmap
.exists(id
));
9507 // we are now able to either create a brand new osd or reuse an existing
9508 // osd that has been previously destroyed.
9510 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9512 if (may_be_idempotent
&& params
.empty()) {
9513 // nothing to do, really.
9514 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9515 ceph_assert(id
>= 0);
9517 f
->open_object_section("created_osd");
9518 f
->dump_int("osdid", id
);
9526 string device_class
;
9527 auto p
= params
.find("crush_device_class");
9528 if (p
!= params
.end()) {
9529 device_class
= p
->second
;
9530 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9532 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9533 bool has_lockbox
= false;
9534 bool has_secrets
= params
.count("cephx_secret")
9535 || params
.count("cephx_lockbox_secret")
9536 || params
.count("dmcrypt_key");
9538 KVMonitor
*svc
= nullptr;
9539 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9542 if (params
.count("cephx_secret") == 0) {
9543 ss
<< "requires a cephx secret.";
9546 cephx_secret
= params
.at("cephx_secret");
9548 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9549 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9551 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9552 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9554 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9556 lockbox_secret
= params
.at("cephx_lockbox_secret");
9557 dmcrypt_key
= params
.at("dmcrypt_key");
9558 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9559 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9563 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9565 err
= mon
.authmon()->validate_osd_new(id
, uuid
,
9573 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9574 // for this to be idempotent, `id` should already be >= 0; no need
9575 // to use validate_id.
9576 ceph_assert(id
>= 0);
9577 ss
<< "osd." << id
<< " exists but secrets do not match";
9583 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9586 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9587 ceph_assert(id
>= 0);
9588 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9593 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9594 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9596 if (may_be_idempotent
) {
9597 // we have nothing to do for either the osdmon or the authmon,
9598 // and we have no lockbox - so the config key service will not be
9599 // touched. This is therefore an idempotent operation, and we can
9600 // just return right away.
9601 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9602 ceph_assert(id
>= 0);
9604 f
->open_object_section("created_osd");
9605 f
->dump_int("osdid", id
);
9612 ceph_assert(!may_be_idempotent
);
9616 ceph_assert(!cephx_secret
.empty());
9617 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9618 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9620 err
= mon
.authmon()->do_osd_new(cephx_entity
,
9623 ceph_assert(0 == err
);
9626 ceph_assert(nullptr != svc
);
9627 svc
->do_osd_new(uuid
, dmcrypt_key
);
9631 if (is_recreate_destroyed
) {
9632 ceph_assert(id
>= 0);
9633 ceph_assert(osdmap
.is_destroyed(id
));
9634 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9635 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9636 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9638 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9639 // due to http://tracker.ceph.com/issues/20751 some clusters may
9640 // have UP set for non-existent OSDs; make sure it is cleared
9641 // for a newly created osd.
9642 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9644 pending_inc
.new_uuid
[id
] = uuid
;
9646 ceph_assert(id
>= 0);
9647 int32_t new_id
= -1;
9648 do_osd_create(id
, uuid
, device_class
, &new_id
);
9649 ceph_assert(new_id
>= 0);
9650 ceph_assert(id
== new_id
);
9654 f
->open_object_section("created_osd");
9655 f
->dump_int("osdid", id
);
// Top-level prepare handler for monitor commands routed to the OSDMonitor:
// parse the command JSON into a cmdmap (replying -EINVAL to the client on
// parse failure), verify the op carries a session (replying -EACCES
// otherwise), then delegate the actual work to prepare_command_impl().
9664 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9666 op
->mark_osdmon_event(__func__
);
9667 auto m
= op
->get_req
<MMonCommand
>();
9670 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9671 string rs
= ss
.str();
9672 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
9676 MonSession
*session
= op
->get_session();
9678 derr
<< __func__
<< " no session" << dendl
;
9679 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
9683 return prepare_command_impl(op
, cmdmap
);
// Parse the "weights" command argument -- a JSON object mapping osd-id
// strings to weight strings (single quotes are normalized to double quotes
// first so shell-friendly input parses) -- into *weights, validating that
// each referenced osd exists in the given osdmap. std::stoi / std::stoul
// failures surface as std::logic_error, caught at the bottom (the handler
// body and error returns are among the lines missing from this extraction).
9686 static int parse_reweights(CephContext
*cct
,
9687 const cmdmap_t
& cmdmap
,
9688 const OSDMap
& osdmap
,
9689 map
<int32_t, uint32_t>* weights
)
9692 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9695 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9696 json_spirit::mValue json_value
;
9697 if (!json_spirit::read(weights_str
, json_value
)) {
9700 if (json_value
.type() != json_spirit::obj_type
) {
9703 const auto obj
= json_value
.get_obj();
9705 for (auto& osd_weight
: obj
) {
9706 auto osd_id
= std::stoi(osd_weight
.first
);
9707 if (!osdmap
.exists(osd_id
)) {
9710 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9713 auto weight
= std::stoul(osd_weight
.second
.get_str());
9714 weights
->insert({osd_id
, weight
});
9716 } catch (const std::logic_error
& e
) {
// Mark an OSD destroyed: keep its id, but remove its cephx/lockbox auth
// entities and config-key data, then stage CEPH_OSD_DESTROYED and a zeroed
// uuid in the pending incremental. Returns -ENOENT when the osd does not
// exist (for the benefit of `osd purge`), and is idempotent both when the
// osd is already marked destroyed and when the auth/config-key entries are
// already gone. Requires a plugged paxos (asserted); proposing the pending
// change is left to the caller, since only one propose_pending() per
// service is allowed.
9722 int OSDMonitor::prepare_command_osd_destroy(
9726 ceph_assert(paxos
.is_plugged());
9728 // we check if the osd exists for the benefit of `osd purge`, which may
9729 // have previously removed the osd. If the osd does not exist, return
9730 // -ENOENT to convey this, and let the caller deal with it.
9732 // we presume that all auth secrets and config keys were removed prior
9733 // to this command being called. if they exist by now, we also assume
9734 // they must have been created by some other command and do not pertain
9735 // to this non-existent osd.
9736 if (!osdmap
.exists(id
)) {
9737 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9741 uuid_d uuid
= osdmap
.get_uuid(id
);
9742 dout(10) << __func__
<< " destroying osd." << id
9743 << " uuid " << uuid
<< dendl
;
9745 // if it has been destroyed, we assume our work here is done.
9746 if (osdmap
.is_destroyed(id
)) {
9747 ss
<< "destroyed osd." << id
;
9751 EntityName cephx_entity
, lockbox_entity
;
9752 bool idempotent_auth
= false, idempotent_cks
= false;
9754 int err
= mon
.authmon()->validate_osd_destroy(id
, uuid
,
9759 if (err
== -ENOENT
) {
9760 idempotent_auth
= true;
9766 auto svc
= mon
.kvmon();
9767 err
= svc
->validate_osd_destroy(id
, uuid
);
9769 ceph_assert(err
== -ENOENT
);
9771 idempotent_cks
= true;
9774 if (!idempotent_auth
) {
9775 err
= mon
.authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9776 ceph_assert(0 == err
);
9779 if (!idempotent_cks
) {
9780 svc
->do_osd_destroy(id
, uuid
);
9783 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9784 pending_inc
.new_uuid
[id
] = uuid_d();
9786 // we can only propose_pending() once per service, otherwise we'll be
9787 // defying PaxosService and all laws of nature. Therefore, as we may
9788 // be used during 'osd purge', let's keep the caller responsible for
9790 ceph_assert(err
== 0);
// Purge an OSD completely: remove it from CRUSH, destroy it (auth keys,
// config-keys, DESTROYED state) and remove it from the osdmap, in an order
// chosen so that once side-effecting steps begin they cannot fail -- see
// the original step-by-step comment below. The CRUSH removal is computed
// on a pending copy first but only applied last, via do_osd_crush_remove().
// The whole operation is idempotent when the osd is already gone from both
// CRUSH and the map. Requires the osd to be down (asserted) and a plugged
// paxos; the caller proposes the pending changes.
9794 int OSDMonitor::prepare_command_osd_purge(
9798 ceph_assert(paxos
.is_plugged());
9799 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9801 ceph_assert(!osdmap
.is_up(id
));
9804 * This may look a bit weird, but this is what's going to happen:
9806 * 1. we make sure that removing from crush works
9807 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9808 * error, then we abort the whole operation, as no updates
9809 * have been made. However, we this function will have
9810 * side-effects, thus we need to make sure that all operations
9811 * performed henceforth will *always* succeed.
9812 * 3. we call `prepare_command_osd_remove()`. Although this
9813 * function can return an error, it currently only checks if the
9814 * osd is up - and we have made sure that it is not so, so there
9815 * is no conflict, and it is effectively an update.
9816 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9817 * the crush update we delayed from before.
9820 CrushWrapper newcrush
= _get_pending_crush();
9822 bool may_be_idempotent
= false;
9824 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9825 if (err
== -ENOENT
) {
9827 may_be_idempotent
= true;
9828 } else if (err
< 0) {
9829 ss
<< "error removing osd." << id
<< " from crush";
9833 // no point destroying the osd again if it has already been marked destroyed
9834 if (!osdmap
.is_destroyed(id
)) {
9835 err
= prepare_command_osd_destroy(id
, ss
);
9837 if (err
== -ENOENT
) {
9843 may_be_idempotent
= false;
9846 ceph_assert(0 == err
);
9848 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9849 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9850 << "we are idempotent." << dendl
;
9854 err
= prepare_command_osd_remove(id
);
9855 // we should not be busy, as we should have made sure this id is not up.
9856 ceph_assert(0 == err
);
9858 do_osd_crush_remove(newcrush
);
9862 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9863 const cmdmap_t
& cmdmap
)
9865 op
->mark_osdmon_event(__func__
);
9866 auto m
= op
->get_req
<MMonCommand
>();
9873 string format
= cmd_getval_or
<string
>(cmdmap
, "format", "plain");
9874 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
9877 cmd_getval(cmdmap
, "prefix", prefix
);
9881 bool osdid_present
= false;
9882 if (prefix
!= "osd pg-temp" &&
9883 prefix
!= "osd pg-upmap" &&
9884 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
9885 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
9887 if (osdid_present
) {
9889 oss
<< "osd." << osdid
;
9890 osd_name
= oss
.str();
9893 // Even if there's a pending state with changes that could affect
9894 // a command, considering that said state isn't yet committed, we
9895 // just don't care about those changes if the command currently being
9896 // handled acts as a no-op against the current committed state.
9897 // In a nutshell, we assume this command happens *before*.
9899 // Let me make this clearer:
9901 // - If we have only one client, and that client issues some
9902 // operation that would conflict with this operation but is
9903 // still on the pending state, then we would be sure that said
9904 // operation wouldn't have returned yet, so the client wouldn't
9905 // issue this operation (unless the client didn't wait for the
9906 // operation to finish, and that would be the client's own fault).
9908 // - If we have more than one client, each client will observe
9909 // whatever is the state at the moment of the commit. So, if we
9910 // have two clients, one issuing an unlink and another issuing a
9911 // link, and if the link happens while the unlink is still on the
9912 // pending state, from the link's point-of-view this is a no-op.
9913 // If different clients are issuing conflicting operations and
9914 // they care about that, then the clients should make sure they
9915 // enforce some kind of concurrency mechanism -- from our
9916 // perspective that's what Douglas Adams would call an SEP.
9918 // This should be used as a general guideline for most commands handled
9919 // in this function. Adapt as you see fit, but please bear in mind that
9920 // this is the expected behavior.
9923 if (prefix
== "osd setcrushmap" ||
9924 (prefix
== "osd crush set" && !osdid_present
)) {
9925 if (pending_inc
.crush
.length()) {
9926 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
9927 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9930 dout(10) << "prepare_command setting new crush map" << dendl
;
9931 bufferlist
data(m
->get_data());
9934 auto bl
= data
.cbegin();
9937 catch (const std::exception
&e
) {
9939 ss
<< "Failed to parse crushmap: " << e
.what();
9943 int64_t prior_version
= 0;
9944 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
9945 if (prior_version
== osdmap
.get_crush_version() - 1) {
9946 // see if we are a resend of the last update. this is imperfect
9947 // (multiple racing updaters may not both get reliable success)
9948 // but we expect crush updaters (via this interface) to be rare-ish.
9949 bufferlist current
, proposed
;
9950 osdmap
.crush
->encode(current
, mon
.get_quorum_con_features());
9951 crush
.encode(proposed
, mon
.get_quorum_con_features());
9952 if (current
.contents_equal(proposed
)) {
9953 dout(10) << __func__
9954 << " proposed matches current and version equals previous"
9957 ss
<< osdmap
.get_crush_version();
9961 if (prior_version
!= osdmap
.get_crush_version()) {
9963 ss
<< "prior_version " << prior_version
<< " != crush version "
9964 << osdmap
.get_crush_version();
9969 if (!validate_crush_against_features(&crush
, ss
)) {
9974 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
9979 if (g_conf()->mon_osd_crush_smoke_test
) {
9980 // sanity check: test some inputs to make sure this map isn't
9982 dout(10) << " testing map" << dendl
;
9984 CrushTester
tester(crush
, ess
);
9985 tester
.set_min_x(0);
9986 tester
.set_max_x(50);
9987 tester
.set_num_rep(3); // arbitrary
9988 auto start
= ceph::coarse_mono_clock::now();
9989 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
9990 auto duration
= ceph::coarse_mono_clock::now() - start
;
9992 dout(10) << " tester.test_with_fork returns " << r
9993 << ": " << ess
.str() << dendl
;
9994 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
9998 dout(10) << __func__
<< " crush somke test duration: "
9999 << duration
<< ", result: " << ess
.str() << dendl
;
10002 pending_inc
.crush
= data
;
10003 ss
<< osdmap
.get_crush_version() + 1;
10006 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
10007 CrushWrapper newcrush
= _get_pending_crush();
10008 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
10010 if (newcrush
.bucket_exists(bid
) &&
10011 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
10012 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
10013 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
10016 if (!validate_crush_against_features(&newcrush
, ss
)) {
10020 pending_inc
.crush
.clear();
10021 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10022 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10023 get_last_committed() + 1));
10025 } else if (prefix
== "osd crush set-device-class") {
10026 string device_class
;
10027 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10028 err
= -EINVAL
; // no value!
10033 vector
<string
> idvec
;
10034 cmd_getval(cmdmap
, "ids", idvec
);
10035 CrushWrapper newcrush
= _get_pending_crush();
10037 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10041 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10042 osdmap
.get_all_osds(osds
);
10045 // try traditional single osd way
10046 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10048 // ss has reason for failure
10049 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
10056 for (auto &osd
: osds
) {
10057 if (!osdmap
.exists(osd
)) {
10058 ss
<< "osd." << osd
<< " does not exist. ";
10063 oss
<< "osd." << osd
;
10064 string name
= oss
.str();
10066 if (newcrush
.get_max_devices() < osd
+ 1) {
10067 newcrush
.set_max_devices(osd
+ 1);
10070 if (newcrush
.item_exists(osd
)) {
10071 action
= "updating";
10073 action
= "creating";
10074 newcrush
.set_item_name(osd
, name
);
10077 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
10078 << "' device_class '" << device_class
<< "'"
10080 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
10084 if (err
== 0 && !_have_pending_crush()) {
10086 // for single osd only, wildcard makes too much noise
10087 ss
<< "set-device-class item id " << osd
<< " name '" << name
10088 << "' device_class '" << device_class
<< "': no change. ";
10091 updated
.insert(osd
);
10096 pending_inc
.crush
.clear();
10097 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10098 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
10100 wait_for_finished_proposal(
10102 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10104 } else if (prefix
== "osd crush rm-device-class") {
10106 vector
<string
> idvec
;
10107 cmd_getval(cmdmap
, "ids", idvec
);
10108 CrushWrapper newcrush
= _get_pending_crush();
10111 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10116 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10117 osdmap
.get_all_osds(osds
);
10120 // try traditional single osd way
10121 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10123 // ss has reason for failure
10124 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
10131 for (auto &osd
: osds
) {
10132 if (!osdmap
.exists(osd
)) {
10133 ss
<< "osd." << osd
<< " does not exist. ";
10137 auto class_name
= newcrush
.get_item_class(osd
);
10139 ss
<< "osd." << osd
<< " belongs to no class, ";
10142 // note that we do not verify if class_is_in_use here
10143 // in case the device is misclassified and user wants
10144 // to overridely reset...
10146 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
10148 // ss has reason for failure
10151 updated
.insert(osd
);
10155 pending_inc
.crush
.clear();
10156 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10157 ss
<< "done removing class of osd(s): " << updated
;
10159 wait_for_finished_proposal(
10161 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10163 } else if (prefix
== "osd crush class create") {
10164 string device_class
;
10165 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10166 err
= -EINVAL
; // no value!
10169 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10170 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10171 << "luminous' before using crush device classes";
10175 if (!_have_pending_crush() &&
10176 _get_stable_crush().class_exists(device_class
)) {
10177 ss
<< "class '" << device_class
<< "' already exists";
10180 CrushWrapper newcrush
= _get_pending_crush();
10181 if (newcrush
.class_exists(device_class
)) {
10182 ss
<< "class '" << device_class
<< "' already exists";
10185 int class_id
= newcrush
.get_or_create_class_id(device_class
);
10186 pending_inc
.crush
.clear();
10187 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10188 ss
<< "created class " << device_class
<< " with id " << class_id
10189 << " to crush map";
10191 } else if (prefix
== "osd crush class rm") {
10192 string device_class
;
10193 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10194 err
= -EINVAL
; // no value!
10197 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10198 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10199 << "luminous' before using crush device classes";
10204 if (!osdmap
.crush
->class_exists(device_class
)) {
10209 CrushWrapper newcrush
= _get_pending_crush();
10210 if (!newcrush
.class_exists(device_class
)) {
10211 err
= 0; // make command idempotent
10214 int class_id
= newcrush
.get_class_id(device_class
);
10216 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
10218 ss
<< "class '" << device_class
<< "' " << ts
.str();
10222 // check if class is used by any erasure-code-profiles
10223 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
10224 osdmap
.get_erasure_code_profiles();
10225 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
10226 #ifdef HAVE_STDLIB_MAP_SPLICING
10227 ec_profiles
.merge(old_ec_profiles
);
10229 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
10230 make_move_iterator(end(old_ec_profiles
)));
10232 list
<string
> referenced_by
;
10233 for (auto &i
: ec_profiles
) {
10234 for (auto &j
: i
.second
) {
10235 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
10236 referenced_by
.push_back(i
.first
);
10240 if (!referenced_by
.empty()) {
10242 ss
<< "class '" << device_class
10243 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
10248 newcrush
.get_devices_by_class(device_class
, &osds
);
10249 for (auto& p
: osds
) {
10250 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
10252 // ss has reason for failure
10257 if (osds
.empty()) {
10258 // empty class, remove directly
10259 err
= newcrush
.remove_class_name(device_class
);
10261 ss
<< "class '" << device_class
<< "' cannot be removed '"
10262 << cpp_strerror(err
) << "'";
10267 pending_inc
.crush
.clear();
10268 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10269 ss
<< "removed class " << device_class
<< " with id " << class_id
10270 << " from crush map";
10272 } else if (prefix
== "osd crush class rename") {
10273 string srcname
, dstname
;
10274 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
10278 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
10283 CrushWrapper newcrush
= _get_pending_crush();
10284 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
10285 // suppose this is a replay and return success
10286 // so command is idempotent
10287 ss
<< "already renamed to '" << dstname
<< "'";
10292 err
= newcrush
.rename_class(srcname
, dstname
);
10294 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
10295 << cpp_strerror(err
);
10299 pending_inc
.crush
.clear();
10300 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10301 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
10303 } else if (prefix
== "osd crush add-bucket") {
10304 // os crush add-bucket <name> <type>
10305 string name
, typestr
;
10306 vector
<string
> argvec
;
10307 cmd_getval(cmdmap
, "name", name
);
10308 cmd_getval(cmdmap
, "type", typestr
);
10309 cmd_getval(cmdmap
, "args", argvec
);
10310 map
<string
,string
> loc
;
10311 if (!argvec
.empty()) {
10312 CrushWrapper::parse_loc_map(argvec
, &loc
);
10313 dout(0) << "will create and move bucket '" << name
10314 << "' to location " << loc
<< dendl
;
10317 if (!_have_pending_crush() &&
10318 _get_stable_crush().name_exists(name
)) {
10319 ss
<< "bucket '" << name
<< "' already exists";
10323 CrushWrapper newcrush
= _get_pending_crush();
10325 if (newcrush
.name_exists(name
)) {
10326 ss
<< "bucket '" << name
<< "' already exists";
10329 int type
= newcrush
.get_type_id(typestr
);
10331 ss
<< "type '" << typestr
<< "' does not exist";
10336 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
10341 err
= newcrush
.add_bucket(0, 0,
10342 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
10345 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
10348 err
= newcrush
.set_item_name(bucketno
, name
);
10350 ss
<< "error setting bucket name to '" << name
<< "'";
10354 if (!loc
.empty()) {
10355 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
10357 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
10359 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
10363 ss
<< "no need to move item id " << bucketno
<< " name '" << name
10364 << "' to location " << loc
<< " in crush map";
10368 pending_inc
.crush
.clear();
10369 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10371 ss
<< "added bucket " << name
<< " type " << typestr
10372 << " to crush map";
10374 ss
<< "added bucket " << name
<< " type " << typestr
10375 << " to location " << loc
;
10378 } else if (prefix
== "osd crush rename-bucket") {
10379 string srcname
, dstname
;
10380 cmd_getval(cmdmap
, "srcname", srcname
);
10381 cmd_getval(cmdmap
, "dstname", dstname
);
10383 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
10384 if (err
== -EALREADY
) // equivalent to success for idempotency
10390 } else if (prefix
== "osd crush weight-set create" ||
10391 prefix
== "osd crush weight-set create-compat") {
10392 if (_have_pending_crush()) {
10393 dout(10) << " first waiting for pending crush changes to commit" << dendl
;
10396 CrushWrapper newcrush
= _get_pending_crush();
10399 if (newcrush
.has_non_straw2_buckets()) {
10400 ss
<< "crush map contains one or more bucket(s) that are not straw2";
10404 if (prefix
== "osd crush weight-set create") {
10405 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
10406 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
10407 ss
<< "require_min_compat_client "
10408 << osdmap
.require_min_compat_client
10409 << " < luminous, which is required for per-pool weight-sets. "
10410 << "Try 'ceph osd set-require-min-compat-client luminous' "
10411 << "before using the new interface";
10415 string poolname
, mode
;
10416 cmd_getval(cmdmap
, "pool", poolname
);
10417 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10419 ss
<< "pool '" << poolname
<< "' not found";
10423 cmd_getval(cmdmap
, "mode", mode
);
10424 if (mode
!= "flat" && mode
!= "positional") {
10425 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10429 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10431 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10434 if (!newcrush
.create_choose_args(pool
, positions
)) {
10435 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10436 ss
<< "compat weight-set already created";
10438 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10439 << "' already created";
10443 pending_inc
.crush
.clear();
10444 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10447 } else if (prefix
== "osd crush weight-set rm" ||
10448 prefix
== "osd crush weight-set rm-compat") {
10449 CrushWrapper newcrush
= _get_pending_crush();
10451 if (prefix
== "osd crush weight-set rm") {
10453 cmd_getval(cmdmap
, "pool", poolname
);
10454 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10456 ss
<< "pool '" << poolname
<< "' not found";
10461 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10463 newcrush
.rm_choose_args(pool
);
10464 pending_inc
.crush
.clear();
10465 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10468 } else if (prefix
== "osd crush weight-set reweight" ||
10469 prefix
== "osd crush weight-set reweight-compat") {
10470 string poolname
, item
;
10471 vector
<double> weight
;
10472 cmd_getval(cmdmap
, "pool", poolname
);
10473 cmd_getval(cmdmap
, "item", item
);
10474 cmd_getval(cmdmap
, "weight", weight
);
10475 CrushWrapper newcrush
= _get_pending_crush();
10477 if (prefix
== "osd crush weight-set reweight") {
10478 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10480 ss
<< "pool '" << poolname
<< "' not found";
10484 if (!newcrush
.have_choose_args(pool
)) {
10485 ss
<< "no weight-set for pool '" << poolname
<< "'";
10489 auto arg_map
= newcrush
.choose_args_get(pool
);
10490 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10491 if (weight
.size() != (size_t)positions
) {
10492 ss
<< "must specify exact " << positions
<< " weight values";
10497 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10498 if (!newcrush
.have_choose_args(pool
)) {
10499 ss
<< "no backward-compatible weight-set";
10504 if (!newcrush
.name_exists(item
)) {
10505 ss
<< "item '" << item
<< "' does not exist";
10509 err
= newcrush
.choose_args_adjust_item_weightf(
10511 newcrush
.choose_args_get(pool
),
10512 newcrush
.get_item_id(item
),
10519 pending_inc
.crush
.clear();
10520 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10522 } else if (osdid_present
&&
10523 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10524 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10525 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10526 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10528 if (!osdmap
.exists(osdid
)) {
10531 << " does not exist. Create it before updating the crush map";
10536 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10537 ss
<< "unable to parse weight value '"
10538 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10544 vector
<string
> argvec
;
10545 cmd_getval(cmdmap
, "args", argvec
);
10546 map
<string
,string
> loc
;
10547 CrushWrapper::parse_loc_map(argvec
, &loc
);
10549 if (prefix
== "osd crush set"
10550 && !_get_stable_crush().item_exists(osdid
)) {
10552 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10553 << "' weight " << weight
<< " at location " << loc
10554 << ": does not exist";
10558 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10559 << osd_name
<< "' weight " << weight
<< " at location "
10561 CrushWrapper newcrush
= _get_pending_crush();
10564 if (prefix
== "osd crush set" ||
10565 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10567 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10570 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10578 if (err
== 0 && !_have_pending_crush()) {
10579 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10580 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10584 pending_inc
.crush
.clear();
10585 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10586 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10587 << weight
<< " at location " << loc
<< " to crush map";
10589 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10590 get_last_committed() + 1));
10593 } else if (prefix
== "osd crush create-or-move") {
10595 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10596 if (!osdmap
.exists(osdid
)) {
10599 << " does not exist. create it before updating the crush map";
10604 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10605 ss
<< "unable to parse weight value '"
10606 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10612 vector
<string
> argvec
;
10613 cmd_getval(cmdmap
, "args", argvec
);
10614 map
<string
,string
> loc
;
10615 CrushWrapper::parse_loc_map(argvec
, &loc
);
10617 dout(0) << "create-or-move crush item name '" << osd_name
10618 << "' initial_weight " << weight
<< " at location " << loc
10621 CrushWrapper newcrush
= _get_pending_crush();
10623 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10624 g_conf()->osd_crush_update_weight_set
);
10626 ss
<< "create-or-move updated item name '" << osd_name
10627 << "' weight " << weight
10628 << " at location " << loc
<< " to crush map";
10632 pending_inc
.crush
.clear();
10633 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10634 ss
<< "create-or-move updating item name '" << osd_name
10635 << "' weight " << weight
10636 << " at location " << loc
<< " to crush map";
10638 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10639 get_last_committed() + 1));
10644 } else if (prefix
== "osd crush move") {
10646 // osd crush move <name> <loc1> [<loc2> ...]
10648 vector
<string
> argvec
;
10649 cmd_getval(cmdmap
, "name", name
);
10650 cmd_getval(cmdmap
, "args", argvec
);
10651 map
<string
,string
> loc
;
10652 CrushWrapper::parse_loc_map(argvec
, &loc
);
10654 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10655 CrushWrapper newcrush
= _get_pending_crush();
10657 if (!newcrush
.name_exists(name
)) {
10659 ss
<< "item " << name
<< " does not exist";
10662 int id
= newcrush
.get_item_id(name
);
10664 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10666 err
= newcrush
.create_or_move_item(
10667 cct
, id
, 0, name
, loc
,
10668 g_conf()->osd_crush_update_weight_set
);
10670 err
= newcrush
.move_bucket(cct
, id
, loc
);
10673 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10674 pending_inc
.crush
.clear();
10675 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10677 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10678 get_last_committed() + 1));
10682 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10686 } else if (prefix
== "osd crush swap-bucket") {
10687 string source
, dest
;
10688 cmd_getval(cmdmap
, "source", source
);
10689 cmd_getval(cmdmap
, "dest", dest
);
10691 bool force
= false;
10692 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10694 CrushWrapper newcrush
= _get_pending_crush();
10695 if (!newcrush
.name_exists(source
)) {
10696 ss
<< "source item " << source
<< " does not exist";
10700 if (!newcrush
.name_exists(dest
)) {
10701 ss
<< "dest item " << dest
<< " does not exist";
10705 int sid
= newcrush
.get_item_id(source
);
10706 int did
= newcrush
.get_item_id(dest
);
10708 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10709 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10713 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10715 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10716 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10717 << "; pass --yes-i-really-mean-it to proceed anyway";
10721 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10723 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10727 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10728 pending_inc
.crush
.clear();
10729 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10730 wait_for_finished_proposal(op
,
10731 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10732 get_last_committed() + 1));
10734 } else if (prefix
== "osd crush link") {
10735 // osd crush link <name> <loc1> [<loc2> ...]
10737 cmd_getval(cmdmap
, "name", name
);
10738 vector
<string
> argvec
;
10739 cmd_getval(cmdmap
, "args", argvec
);
10740 map
<string
,string
> loc
;
10741 CrushWrapper::parse_loc_map(argvec
, &loc
);
10743 // Need an explicit check for name_exists because get_item_id returns
10745 int id
= osdmap
.crush
->get_item_id(name
);
10746 if (!osdmap
.crush
->name_exists(name
)) {
10748 ss
<< "item " << name
<< " does not exist";
10751 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10753 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10754 ss
<< "no need to move item id " << id
<< " name '" << name
10755 << "' to location " << loc
<< " in crush map";
10760 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10761 CrushWrapper newcrush
= _get_pending_crush();
10763 if (!newcrush
.name_exists(name
)) {
10765 ss
<< "item " << name
<< " does not exist";
10768 int id
= newcrush
.get_item_id(name
);
10769 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10770 err
= newcrush
.link_bucket(cct
, id
, loc
);
10772 ss
<< "linked item id " << id
<< " name '" << name
10773 << "' to location " << loc
<< " in crush map";
10774 pending_inc
.crush
.clear();
10775 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10777 ss
<< "cannot link item id " << id
<< " name '" << name
10778 << "' to location " << loc
;
10782 ss
<< "no need to move item id " << id
<< " name '" << name
10783 << "' to location " << loc
<< " in crush map";
10787 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10788 get_last_committed() + 1));
10790 } else if (prefix
== "osd crush rm" ||
10791 prefix
== "osd crush remove" ||
10792 prefix
== "osd crush unlink") {
10794 // osd crush rm <id> [ancestor]
10795 CrushWrapper newcrush
= _get_pending_crush();
10798 cmd_getval(cmdmap
, "name", name
);
10800 if (!osdmap
.crush
->name_exists(name
)) {
10802 ss
<< "device '" << name
<< "' does not appear in the crush map";
10805 if (!newcrush
.name_exists(name
)) {
10807 ss
<< "device '" << name
<< "' does not appear in the crush map";
10809 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10810 get_last_committed() + 1));
10813 int id
= newcrush
.get_item_id(name
);
10816 bool unlink_only
= prefix
== "osd crush unlink";
10817 string ancestor_str
;
10818 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10819 if (!newcrush
.name_exists(ancestor_str
)) {
10821 ss
<< "ancestor item '" << ancestor_str
10822 << "' does not appear in the crush map";
10825 ancestor
= newcrush
.get_item_id(ancestor_str
);
10828 err
= prepare_command_osd_crush_remove(
10831 (ancestor
< 0), unlink_only
);
10833 if (err
== -ENOENT
) {
10834 ss
<< "item " << id
<< " does not appear in that position";
10840 pending_inc
.new_crush_node_flags
[id
] = 0;
10841 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10843 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10844 get_last_committed() + 1));
10849 } else if (prefix
== "osd crush reweight-all") {
10850 CrushWrapper newcrush
= _get_pending_crush();
10852 newcrush
.reweight(cct
);
10853 pending_inc
.crush
.clear();
10854 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10855 ss
<< "reweighted crush hierarchy";
10857 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10858 get_last_committed() + 1));
10860 } else if (prefix
== "osd crush reweight") {
10861 // osd crush reweight <name> <weight>
10862 CrushWrapper newcrush
= _get_pending_crush();
10865 cmd_getval(cmdmap
, "name", name
);
10866 if (!newcrush
.name_exists(name
)) {
10868 ss
<< "device '" << name
<< "' does not appear in the crush map";
10872 int id
= newcrush
.get_item_id(name
);
10874 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
10879 if (!cmd_getval(cmdmap
, "weight", w
)) {
10880 ss
<< "unable to parse weight value '"
10881 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10886 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
10887 g_conf()->osd_crush_update_weight_set
);
10890 pending_inc
.crush
.clear();
10891 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10892 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
10893 << " in crush map";
10895 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10896 get_last_committed() + 1));
10898 } else if (prefix
== "osd crush reweight-subtree") {
10899 // osd crush reweight <name> <weight>
10900 CrushWrapper newcrush
= _get_pending_crush();
10903 cmd_getval(cmdmap
, "name", name
);
10904 if (!newcrush
.name_exists(name
)) {
10906 ss
<< "device '" << name
<< "' does not appear in the crush map";
10910 int id
= newcrush
.get_item_id(name
);
10912 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
10917 if (!cmd_getval(cmdmap
, "weight", w
)) {
10918 ss
<< "unable to parse weight value '"
10919 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10924 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
10925 g_conf()->osd_crush_update_weight_set
);
10928 pending_inc
.crush
.clear();
10929 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10930 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
10931 << " in crush map";
10933 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10934 get_last_committed() + 1));
10936 } else if (prefix
== "osd crush tunables") {
10937 CrushWrapper newcrush
= _get_pending_crush();
10941 cmd_getval(cmdmap
, "profile", profile
);
10942 if (profile
== "legacy" || profile
== "argonaut") {
10943 newcrush
.set_tunables_legacy();
10944 } else if (profile
== "bobtail") {
10945 newcrush
.set_tunables_bobtail();
10946 } else if (profile
== "firefly") {
10947 newcrush
.set_tunables_firefly();
10948 } else if (profile
== "hammer") {
10949 newcrush
.set_tunables_hammer();
10950 } else if (profile
== "jewel") {
10951 newcrush
.set_tunables_jewel();
10952 } else if (profile
== "optimal") {
10953 newcrush
.set_tunables_optimal();
10954 } else if (profile
== "default") {
10955 newcrush
.set_tunables_default();
10957 ss
<< "unrecognized profile '" << profile
<< "'";
10962 if (!validate_crush_against_features(&newcrush
, ss
)) {
10967 pending_inc
.crush
.clear();
10968 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10969 ss
<< "adjusted tunables profile to " << profile
;
10971 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10972 get_last_committed() + 1));
10974 } else if (prefix
== "osd crush set-tunable") {
10975 CrushWrapper newcrush
= _get_pending_crush();
10979 cmd_getval(cmdmap
, "tunable", tunable
);
10981 int64_t value
= -1;
10982 if (!cmd_getval(cmdmap
, "value", value
)) {
10984 ss
<< "failed to parse integer value "
10985 << cmd_vartype_stringify(cmdmap
.at("value"));
10989 if (tunable
== "straw_calc_version") {
10990 if (value
!= 0 && value
!= 1) {
10991 ss
<< "value must be 0 or 1; got " << value
;
10995 newcrush
.set_straw_calc_version(value
);
10997 ss
<< "unrecognized tunable '" << tunable
<< "'";
11002 if (!validate_crush_against_features(&newcrush
, ss
)) {
11007 pending_inc
.crush
.clear();
11008 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11009 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
11011 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11012 get_last_committed() + 1));
11015 } else if (prefix
== "osd crush rule create-simple") {
11016 string name
, root
, type
, mode
;
11017 cmd_getval(cmdmap
, "name", name
);
11018 cmd_getval(cmdmap
, "root", root
);
11019 cmd_getval(cmdmap
, "type", type
);
11020 cmd_getval(cmdmap
, "mode", mode
);
11024 if (osdmap
.crush
->rule_exists(name
)) {
11025 // The name is uniquely associated to a ruleid and the rule it contains
11026 // From the user point of view, the rule is more meaningfull.
11027 ss
<< "rule " << name
<< " already exists";
11032 CrushWrapper newcrush
= _get_pending_crush();
11034 if (newcrush
.rule_exists(name
)) {
11035 // The name is uniquely associated to a ruleid and the rule it contains
11036 // From the user point of view, the rule is more meaningfull.
11037 ss
<< "rule " << name
<< " already exists";
11040 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
11041 pg_pool_t::TYPE_REPLICATED
, &ss
);
11047 pending_inc
.crush
.clear();
11048 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11051 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11052 get_last_committed() + 1));
11055 } else if (prefix
== "osd crush rule create-replicated") {
11056 string name
, root
, type
, device_class
;
11057 cmd_getval(cmdmap
, "name", name
);
11058 cmd_getval(cmdmap
, "root", root
);
11059 cmd_getval(cmdmap
, "type", type
);
11060 cmd_getval(cmdmap
, "class", device_class
);
11062 if (osdmap
.crush
->rule_exists(name
)) {
11063 // The name is uniquely associated to a ruleid and the rule it contains
11064 // From the user point of view, the rule is more meaningfull.
11065 ss
<< "rule " << name
<< " already exists";
11070 CrushWrapper newcrush
= _get_pending_crush();
11072 if (newcrush
.rule_exists(name
)) {
11073 // The name is uniquely associated to a ruleid and the rule it contains
11074 // From the user point of view, the rule is more meaningfull.
11075 ss
<< "rule " << name
<< " already exists";
11078 int ruleno
= newcrush
.add_simple_rule(
11079 name
, root
, type
, device_class
,
11080 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
11086 pending_inc
.crush
.clear();
11087 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11090 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11091 get_last_committed() + 1));
11094 } else if (prefix
== "osd erasure-code-profile rm") {
11096 cmd_getval(cmdmap
, "name", name
);
11098 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
11101 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
11106 if (osdmap
.has_erasure_code_profile(name
) ||
11107 pending_inc
.new_erasure_code_profiles
.count(name
)) {
11108 if (osdmap
.has_erasure_code_profile(name
)) {
11109 pending_inc
.old_erasure_code_profiles
.push_back(name
);
11111 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
11112 pending_inc
.new_erasure_code_profiles
.erase(name
);
11116 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11117 get_last_committed() + 1));
11120 ss
<< "erasure-code-profile " << name
<< " does not exist";
11125 } else if (prefix
== "osd erasure-code-profile set") {
11127 cmd_getval(cmdmap
, "name", name
);
11128 vector
<string
> profile
;
11129 cmd_getval(cmdmap
, "profile", profile
);
11131 bool force
= false;
11132 cmd_getval(cmdmap
, "force", force
);
11134 map
<string
,string
> profile_map
;
11135 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
11138 if (auto found
= profile_map
.find("crush-failure-domain");
11139 found
!= profile_map
.end()) {
11140 const auto& failure_domain
= found
->second
;
11141 int failure_domain_type
= osdmap
.crush
->get_type_id(failure_domain
);
11142 if (failure_domain_type
< 0) {
11143 ss
<< "erasure-code-profile " << profile_map
11144 << " contains an invalid failure-domain " << std::quoted(failure_domain
);
11150 if (profile_map
.find("plugin") == profile_map
.end()) {
11151 ss
<< "erasure-code-profile " << profile_map
11152 << " must contain a plugin entry" << std::endl
;
11156 string plugin
= profile_map
["plugin"];
11158 if (pending_inc
.has_erasure_code_profile(name
)) {
11159 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
11162 err
= normalize_profile(name
, profile_map
, force
, &ss
);
11166 if (osdmap
.has_erasure_code_profile(name
)) {
11167 ErasureCodeProfile existing_profile_map
=
11168 osdmap
.get_erasure_code_profile(name
);
11169 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
11173 if (existing_profile_map
== profile_map
) {
11179 ss
<< "will not override erasure code profile " << name
11180 << " because the existing profile "
11181 << existing_profile_map
11182 << " is different from the proposed profile "
11188 dout(20) << "erasure code profile set " << name
<< "="
11189 << profile_map
<< dendl
;
11190 pending_inc
.set_erasure_code_profile(name
, profile_map
);
11194 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11195 get_last_committed() + 1));
11198 } else if (prefix
== "osd crush rule create-erasure") {
11199 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
11200 if (err
== -EAGAIN
)
11204 string name
, poolstr
;
11205 cmd_getval(cmdmap
, "name", name
);
11207 cmd_getval(cmdmap
, "profile", profile
);
11209 profile
= "default";
11210 if (profile
== "default") {
11211 if (!osdmap
.has_erasure_code_profile(profile
)) {
11212 if (pending_inc
.has_erasure_code_profile(profile
)) {
11213 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
11217 map
<string
,string
> profile_map
;
11218 err
= osdmap
.get_erasure_code_profile_default(cct
,
11223 err
= normalize_profile(name
, profile_map
, true, &ss
);
11226 dout(20) << "erasure code profile set " << profile
<< "="
11227 << profile_map
<< dendl
;
11228 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
11234 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
11237 case -EEXIST
: // return immediately
11238 ss
<< "rule " << name
<< " already exists";
11242 case -EALREADY
: // wait for pending to be proposed
11243 ss
<< "rule " << name
<< " already exists";
11246 default: // non recoverable error
11251 ss
<< "created rule " << name
<< " at " << rule
;
11255 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11256 get_last_committed() + 1));
11259 } else if (prefix
== "osd crush rule rm") {
11261 cmd_getval(cmdmap
, "name", name
);
11263 if (!osdmap
.crush
->rule_exists(name
)) {
11264 ss
<< "rule " << name
<< " does not exist";
11269 CrushWrapper newcrush
= _get_pending_crush();
11271 if (!newcrush
.rule_exists(name
)) {
11272 ss
<< "rule " << name
<< " does not exist";
11275 int ruleno
= newcrush
.get_rule_id(name
);
11276 ceph_assert(ruleno
>= 0);
11278 // make sure it is not in use.
11279 // FIXME: this is ok in some situations, but let's not bother with that
11281 if (osdmap
.crush_rule_in_use(ruleno
)) {
11282 ss
<< "crush rule " << name
<< " (" << ruleno
<< ") is in use";
11287 err
= newcrush
.remove_rule(ruleno
);
11292 pending_inc
.crush
.clear();
11293 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11296 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11297 get_last_committed() + 1));
11300 } else if (prefix
== "osd crush rule rename") {
11303 cmd_getval(cmdmap
, "srcname", srcname
);
11304 cmd_getval(cmdmap
, "dstname", dstname
);
11305 if (srcname
.empty() || dstname
.empty()) {
11306 ss
<< "must specify both source rule name and destination rule name";
11310 if (srcname
== dstname
) {
11311 ss
<< "destination rule name is equal to source rule name";
11316 CrushWrapper newcrush
= _get_pending_crush();
11317 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
11318 // srcname does not exist and dstname already exists
11319 // suppose this is a replay and return success
11320 // (so this command is idempotent)
11321 ss
<< "already renamed to '" << dstname
<< "'";
11326 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
11328 // ss has reason for failure
11331 pending_inc
.crush
.clear();
11332 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11334 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11335 get_last_committed() + 1));
11338 } else if (prefix
== "osd setmaxosd") {
11340 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
11341 ss
<< "unable to parse 'newmax' value '"
11342 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
11347 if (newmax
> g_conf()->mon_max_osd
) {
11349 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
11350 << g_conf()->mon_max_osd
<< ")";
11354 // Don't allow shrinking OSD number as this will cause data loss
11355 // and may cause kernel crashes.
11356 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11357 if (newmax
< osdmap
.get_max_osd()) {
11358 // Check if the OSDs exist between current max and new value.
11359 // If there are any OSDs exist, then don't allow shrinking number
11361 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
11362 if (osdmap
.exists(i
)) {
11364 ss
<< "cannot shrink max_osd to " << newmax
11365 << " because osd." << i
<< " (and possibly others) still in use";
11371 pending_inc
.new_max_osd
= newmax
;
11372 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
11374 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11375 get_last_committed() + 1));
11378 } else if (prefix
== "osd set-full-ratio" ||
11379 prefix
== "osd set-backfillfull-ratio" ||
11380 prefix
== "osd set-nearfull-ratio") {
11382 if (!cmd_getval(cmdmap
, "ratio", n
)) {
11383 ss
<< "unable to parse 'ratio' value '"
11384 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
11388 if (prefix
== "osd set-full-ratio")
11389 pending_inc
.new_full_ratio
= n
;
11390 else if (prefix
== "osd set-backfillfull-ratio")
11391 pending_inc
.new_backfillfull_ratio
= n
;
11392 else if (prefix
== "osd set-nearfull-ratio")
11393 pending_inc
.new_nearfull_ratio
= n
;
11394 ss
<< prefix
<< " " << n
;
11396 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11397 get_last_committed() + 1));
11399 } else if (prefix
== "osd set-require-min-compat-client") {
11401 cmd_getval(cmdmap
, "version", v
);
11402 ceph_release_t vno
= ceph_release_from_name(v
);
11404 ss
<< "version " << v
<< " is not recognized";
11409 newmap
.deepish_copy_from(osdmap
);
11410 newmap
.apply_incremental(pending_inc
);
11411 newmap
.require_min_compat_client
= vno
;
11412 auto mvno
= newmap
.get_min_compat_client();
11414 ss
<< "osdmap current utilizes features that require " << mvno
11415 << "; cannot set require_min_compat_client below that to " << vno
;
11420 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11423 mon
.get_combined_feature_map(&m
);
11424 uint64_t features
= ceph_release_features(to_integer
<int>(vno
));
11428 CEPH_ENTITY_TYPE_CLIENT
,
11429 CEPH_ENTITY_TYPE_MDS
,
11430 CEPH_ENTITY_TYPE_MGR
}) {
11431 auto p
= m
.m
.find(type
);
11432 if (p
== m
.m
.end()) {
11435 for (auto& q
: p
->second
) {
11436 uint64_t missing
= ~q
.first
& features
;
11439 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11444 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11445 << "(s) look like " << ceph_release_name(
11446 ceph_release_from_features(q
.first
))
11447 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11453 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11458 ss
<< "set require_min_compat_client to " << vno
;
11459 pending_inc
.new_require_min_compat_client
= vno
;
11461 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11462 get_last_committed() + 1));
11464 } else if (prefix
== "osd pause") {
11465 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11467 } else if (prefix
== "osd unpause") {
11468 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11470 } else if (prefix
== "osd set") {
11472 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11475 cmd_getval(cmdmap
, "key", key
);
11476 if (key
== "pause")
11477 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11478 else if (key
== "noup")
11479 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11480 else if (key
== "nodown")
11481 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11482 else if (key
== "noout")
11483 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11484 else if (key
== "noin")
11485 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11486 else if (key
== "nobackfill")
11487 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11488 else if (key
== "norebalance")
11489 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11490 else if (key
== "norecover")
11491 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11492 else if (key
== "noscrub")
11493 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11494 else if (key
== "nodeep-scrub")
11495 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11496 else if (key
== "notieragent")
11497 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11498 else if (key
== "nosnaptrim")
11499 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11500 else if (key
== "pglog_hardlimit") {
11501 if (!osdmap
.get_num_up_osds() && !sure
) {
11502 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11503 << "--yes-i-really-mean-it if you really wish to continue.";
11507 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11508 // we are reusing a jewel feature bit that was retired in luminous.
11509 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11510 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11512 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11514 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11519 ss
<< "unrecognized flag '" << key
<< "'";
11523 } else if (prefix
== "osd unset") {
11525 cmd_getval(cmdmap
, "key", key
);
11526 if (key
== "pause")
11527 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11528 else if (key
== "noup")
11529 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11530 else if (key
== "nodown")
11531 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11532 else if (key
== "noout")
11533 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11534 else if (key
== "noin")
11535 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11536 else if (key
== "nobackfill")
11537 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11538 else if (key
== "norebalance")
11539 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11540 else if (key
== "norecover")
11541 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11542 else if (key
== "noscrub")
11543 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11544 else if (key
== "nodeep-scrub")
11545 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11546 else if (key
== "notieragent")
11547 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11548 else if (key
== "nosnaptrim")
11549 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11551 ss
<< "unrecognized flag '" << key
<< "'";
11555 } else if (prefix
== "osd require-osd-release") {
11557 cmd_getval(cmdmap
, "release", release
);
11559 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11560 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11562 ss
<< "unrecognized release " << release
;
11566 if (rel
== osdmap
.require_osd_release
) {
11571 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::octopus
);
11572 if (!osdmap
.get_num_up_osds() && !sure
) {
11573 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11574 << "--yes-i-really-mean-it if you really wish to continue.";
11578 if (rel
== ceph_release_t::octopus
) {
11579 if (!mon
.monmap
->get_required_features().contains_all(
11580 ceph::features::mon::FEATURE_OCTOPUS
)) {
11581 ss
<< "not all mons are octopus";
11585 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_OCTOPUS
))
11587 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11591 } else if (rel
== ceph_release_t::pacific
) {
11592 if (!mon
.monmap
->get_required_features().contains_all(
11593 ceph::features::mon::FEATURE_PACIFIC
)) {
11594 ss
<< "not all mons are pacific";
11598 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_PACIFIC
))
11600 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11604 } else if (rel
== ceph_release_t::quincy
) {
11605 if (!mon
.monmap
->get_required_features().contains_all(
11606 ceph::features::mon::FEATURE_QUINCY
)) {
11607 ss
<< "not all mons are quincy";
11611 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_QUINCY
))
11613 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11618 ss
<< "not supported for this release";
11622 if (rel
< osdmap
.require_osd_release
) {
11623 ss
<< "require_osd_release cannot be lowered once it has been set";
11627 pending_inc
.new_require_osd_release
= rel
;
11629 } else if (prefix
== "osd down" ||
11630 prefix
== "osd out" ||
11631 prefix
== "osd in" ||
11632 prefix
== "osd rm" ||
11633 prefix
== "osd stop") {
11637 bool verbose
= true;
11638 bool definitely_dead
= false;
11640 vector
<string
> idvec
;
11641 cmd_getval(cmdmap
, "ids", idvec
);
11642 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11643 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11644 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11649 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11650 if (prefix
== "osd in") {
11651 // touch out osds only
11652 osdmap
.get_out_existing_osds(osds
);
11654 osdmap
.get_all_osds(osds
);
11657 verbose
= false; // so the output is less noisy.
11659 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11661 ss
<< "invalid osd id" << osd
;
11664 } else if (!osdmap
.exists(osd
)) {
11665 ss
<< "osd." << osd
<< " does not exist. ";
11672 for (auto &osd
: osds
) {
11673 if (prefix
== "osd down") {
11674 if (osdmap
.is_down(osd
)) {
11676 ss
<< "osd." << osd
<< " is already down. ";
11678 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11679 ss
<< "marked down osd." << osd
<< ". ";
11682 if (definitely_dead
) {
11683 if (!pending_inc
.new_xinfo
.count(osd
)) {
11684 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11686 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11689 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11691 } else if (prefix
== "osd out") {
11692 if (osdmap
.is_out(osd
)) {
11694 ss
<< "osd." << osd
<< " is already out. ";
11696 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11697 if (osdmap
.osd_weight
[osd
]) {
11698 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11699 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11701 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11703 ss
<< "marked out osd." << osd
<< ". ";
11704 std::ostringstream msg
;
11705 msg
<< "Client " << op
->get_session()->entity_name
11706 << " marked osd." << osd
<< " out";
11707 if (osdmap
.is_up(osd
)) {
11708 msg
<< ", while it was still marked up";
11710 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11711 msg
<< ", after it was down for " << int(period
.sec())
11715 mon
.clog
->info() << msg
.str();
11718 } else if (prefix
== "osd in") {
11719 if (osdmap
.is_in(osd
)) {
11721 ss
<< "osd." << osd
<< " is already in. ";
11723 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11724 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11725 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11726 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11728 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11730 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11732 ss
<< "marked in osd." << osd
<< ". ";
11735 } else if (prefix
== "osd rm") {
11736 err
= prepare_command_osd_remove(osd
);
11738 if (err
== -EBUSY
) {
11741 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11743 ceph_assert(err
== 0);
11745 ss
<< ", osd." << osd
;
11747 ss
<< "removed osd." << osd
;
11751 } else if (prefix
== "osd stop") {
11752 if (osdmap
.is_stop(osd
)) {
11754 ss
<< "osd." << osd
<< " is already stopped. ";
11755 } else if (osdmap
.is_down(osd
)) {
11756 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11757 ss
<< "stop down osd." << osd
<< ". ";
11760 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11761 ss
<< "stop osd." << osd
<< ". ";
11769 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11770 get_last_committed() + 1));
11773 } else if (prefix
== "osd set-group" ||
11774 prefix
== "osd unset-group" ||
11775 prefix
== "osd add-noup" ||
11776 prefix
== "osd add-nodown" ||
11777 prefix
== "osd add-noin" ||
11778 prefix
== "osd add-noout" ||
11779 prefix
== "osd rm-noup" ||
11780 prefix
== "osd rm-nodown" ||
11781 prefix
== "osd rm-noin" ||
11782 prefix
== "osd rm-noout") {
11783 bool do_set
= prefix
== "osd set-group" ||
11784 prefix
.find("add") != string::npos
;
11786 unsigned flags
= 0;
11787 vector
<string
> who
;
11788 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11789 cmd_getval(cmdmap
, "flags", flag_str
);
11790 cmd_getval(cmdmap
, "who", who
);
11791 vector
<string
> raw_flags
;
11792 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11793 for (auto& f
: raw_flags
) {
11795 flags
|= CEPH_OSD_NOUP
;
11796 else if (f
== "nodown")
11797 flags
|= CEPH_OSD_NODOWN
;
11798 else if (f
== "noin")
11799 flags
|= CEPH_OSD_NOIN
;
11800 else if (f
== "noout")
11801 flags
|= CEPH_OSD_NOOUT
;
11803 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11804 << "{noup,nodown,noin,noout}";
11810 cmd_getval(cmdmap
, "ids", who
);
11811 if (prefix
.find("noup") != string::npos
)
11812 flags
= CEPH_OSD_NOUP
;
11813 else if (prefix
.find("nodown") != string::npos
)
11814 flags
= CEPH_OSD_NODOWN
;
11815 else if (prefix
.find("noin") != string::npos
)
11816 flags
= CEPH_OSD_NOIN
;
11817 else if (prefix
.find("noout") != string::npos
)
11818 flags
= CEPH_OSD_NOOUT
;
11820 ceph_assert(0 == "Unreachable!");
11823 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11828 ss
<< "must specify at least one or more targets to set/unset";
11833 set
<int> crush_nodes
;
11834 set
<int> device_classes
;
11835 for (auto& w
: who
) {
11836 if (w
== "any" || w
== "all" || w
== "*") {
11837 osdmap
.get_all_osds(osds
);
11840 std::stringstream ts
;
11841 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11843 } else if (osdmap
.crush
->name_exists(w
)) {
11844 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11845 } else if (osdmap
.crush
->class_exists(w
)) {
11846 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11848 ss
<< "unable to parse osd id or crush node or device class: "
11849 << "\"" << w
<< "\". ";
11852 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11853 // ss has reason for failure
11858 for (auto osd
: osds
) {
11859 if (!osdmap
.exists(osd
)) {
11860 ss
<< "osd." << osd
<< " does not exist. ";
11864 if (flags
& CEPH_OSD_NOUP
) {
11865 any
|= osdmap
.is_noup_by_osd(osd
) ?
11866 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
11867 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
11869 if (flags
& CEPH_OSD_NODOWN
) {
11870 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11871 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
11872 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
11874 if (flags
& CEPH_OSD_NOIN
) {
11875 any
|= osdmap
.is_noin_by_osd(osd
) ?
11876 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
11877 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
11879 if (flags
& CEPH_OSD_NOOUT
) {
11880 any
|= osdmap
.is_noout_by_osd(osd
) ?
11881 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
11882 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
11885 if (flags
& CEPH_OSD_NOUP
) {
11886 any
|= osdmap
.is_noup_by_osd(osd
) ?
11887 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
11888 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
11890 if (flags
& CEPH_OSD_NODOWN
) {
11891 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11892 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
11893 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
11895 if (flags
& CEPH_OSD_NOIN
) {
11896 any
|= osdmap
.is_noin_by_osd(osd
) ?
11897 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
11898 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
11900 if (flags
& CEPH_OSD_NOOUT
) {
11901 any
|= osdmap
.is_noout_by_osd(osd
) ?
11902 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
11903 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
11907 for (auto& id
: crush_nodes
) {
11908 auto old_flags
= osdmap
.get_crush_node_flags(id
);
11909 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
11910 pending_flags
|= old_flags
; // adopt existing flags first!
11912 pending_flags
|= flags
;
11914 pending_flags
&= ~flags
;
11918 for (auto& id
: device_classes
) {
11919 auto old_flags
= osdmap
.get_device_class_flags(id
);
11920 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
11921 pending_flags
|= old_flags
;
11923 pending_flags
|= flags
;
11925 pending_flags
&= ~flags
;
11931 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11932 get_last_committed() + 1));
11935 } else if (prefix
== "osd pg-temp") {
11937 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11938 ss
<< "unable to parse 'pgid' value '"
11939 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11944 if (!pgid
.parse(pgidstr
.c_str())) {
11945 ss
<< "invalid pgid '" << pgidstr
<< "'";
11949 if (!osdmap
.pg_exists(pgid
)) {
11950 ss
<< "pg " << pgid
<< " does not exist";
11954 if (pending_inc
.new_pg_temp
.count(pgid
)) {
11955 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
11956 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11960 vector
<int64_t> id_vec
;
11961 vector
<int32_t> new_pg_temp
;
11962 cmd_getval(cmdmap
, "id", id_vec
);
11963 if (id_vec
.empty()) {
11964 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
11965 ss
<< "done cleaning up pg_temp of " << pgid
;
11968 for (auto osd
: id_vec
) {
11969 if (!osdmap
.exists(osd
)) {
11970 ss
<< "osd." << osd
<< " does not exist";
11974 new_pg_temp
.push_back(osd
);
11977 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11978 if ((int)new_pg_temp
.size() < pool_min_size
) {
11979 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
11980 << pool_min_size
<< ")";
11985 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11986 if ((int)new_pg_temp
.size() > pool_size
) {
11987 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
11988 << pool_size
<< ")";
11993 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
11994 new_pg_temp
.begin(), new_pg_temp
.end());
11995 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
11997 } else if (prefix
== "osd primary-temp") {
11999 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
12000 ss
<< "unable to parse 'pgid' value '"
12001 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
12006 if (!pgid
.parse(pgidstr
.c_str())) {
12007 ss
<< "invalid pgid '" << pgidstr
<< "'";
12011 if (!osdmap
.pg_exists(pgid
)) {
12012 ss
<< "pg " << pgid
<< " does not exist";
12018 if (!cmd_getval(cmdmap
, "id", osd
)) {
12019 ss
<< "unable to parse 'id' value '"
12020 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12024 if (osd
!= -1 && !osdmap
.exists(osd
)) {
12025 ss
<< "osd." << osd
<< " does not exist";
12030 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12031 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12032 ss
<< "require_min_compat_client "
12033 << osdmap
.require_min_compat_client
12034 << " < firefly, which is required for primary-temp";
12039 pending_inc
.new_primary_temp
[pgid
] = osd
;
12040 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
12042 } else if (prefix
== "pg repeer") {
12045 cmd_getval(cmdmap
, "pgid", pgidstr
);
12046 if (!pgid
.parse(pgidstr
.c_str())) {
12047 ss
<< "invalid pgid '" << pgidstr
<< "'";
12051 if (!osdmap
.pg_exists(pgid
)) {
12052 ss
<< "pg '" << pgidstr
<< "' does not exist";
12056 vector
<int> acting
;
12058 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
12061 ss
<< "pg currently has no primary";
12064 if (acting
.size() > 1) {
12065 // map to just primary; it will map back to what it wants
12066 pending_inc
.new_pg_temp
[pgid
] = { primary
};
12068 // hmm, pick another arbitrary osd to induce a change. Note
12069 // that this won't work if there is only one suitable OSD in the cluster.
12072 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
12073 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
12076 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
12082 ss
<< "not enough up OSDs in the cluster to force repeer";
12087 } else if (prefix
== "osd pg-upmap" ||
12088 prefix
== "osd rm-pg-upmap" ||
12089 prefix
== "osd pg-upmap-items" ||
12090 prefix
== "osd rm-pg-upmap-items") {
12091 if (osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
12092 ss
<< "min_compat_client "
12093 << osdmap
.require_min_compat_client
12094 << " < luminous, which is required for pg-upmap. "
12095 << "Try 'ceph osd set-require-min-compat-client luminous' "
12096 << "before using the new interface";
12100 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
12101 if (err
== -EAGAIN
)
12106 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
12107 ss
<< "unable to parse 'pgid' value '"
12108 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
12113 if (!pgid
.parse(pgidstr
.c_str())) {
12114 ss
<< "invalid pgid '" << pgidstr
<< "'";
12118 if (!osdmap
.pg_exists(pgid
)) {
12119 ss
<< "pg " << pgid
<< " does not exist";
12123 if (pending_inc
.old_pools
.count(pgid
.pool())) {
12124 ss
<< "pool of " << pgid
<< " is pending removal";
12127 wait_for_finished_proposal(op
,
12128 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
12136 OP_RM_PG_UPMAP_ITEMS
,
12139 if (prefix
== "osd pg-upmap") {
12140 option
= OP_PG_UPMAP
;
12141 } else if (prefix
== "osd rm-pg-upmap") {
12142 option
= OP_RM_PG_UPMAP
;
12143 } else if (prefix
== "osd pg-upmap-items") {
12144 option
= OP_PG_UPMAP_ITEMS
;
12146 option
= OP_RM_PG_UPMAP_ITEMS
;
12149 // check pending upmap changes
12151 case OP_PG_UPMAP
: // fall through
12152 case OP_RM_PG_UPMAP
:
12153 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
12154 pending_inc
.old_pg_upmap
.count(pgid
)) {
12155 dout(10) << __func__
<< " waiting for pending update on "
12157 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12162 case OP_PG_UPMAP_ITEMS
: // fall through
12163 case OP_RM_PG_UPMAP_ITEMS
:
12164 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
12165 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
12166 dout(10) << __func__
<< " waiting for pending update on "
12168 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12174 ceph_abort_msg("invalid option");
12180 vector
<int64_t> id_vec
;
12181 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12182 ss
<< "unable to parse 'id' value(s) '"
12183 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12188 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
12189 if ((int)id_vec
.size() < pool_min_size
) {
12190 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
12191 << pool_min_size
<< ")";
12196 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12197 if ((int)id_vec
.size() > pool_size
) {
12198 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
12199 << pool_size
<< ")";
12204 vector
<int32_t> new_pg_upmap
;
12205 for (auto osd
: id_vec
) {
12206 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
12207 ss
<< "osd." << osd
<< " does not exist";
12211 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
12212 if (it
!= new_pg_upmap
.end()) {
12213 ss
<< "osd." << osd
<< " already exists, ";
12216 new_pg_upmap
.push_back(osd
);
12219 if (new_pg_upmap
.empty()) {
12220 ss
<< "no valid upmap items(pairs) is specified";
12225 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
12226 new_pg_upmap
.begin(), new_pg_upmap
.end());
12227 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
12231 case OP_RM_PG_UPMAP
:
12233 pending_inc
.old_pg_upmap
.insert(pgid
);
12234 ss
<< "clear " << pgid
<< " pg_upmap mapping";
12238 case OP_PG_UPMAP_ITEMS
:
12240 vector
<int64_t> id_vec
;
12241 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12242 ss
<< "unable to parse 'id' value(s) '"
12243 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12248 if (id_vec
.size() % 2) {
12249 ss
<< "you must specify pairs of osd ids to be remapped";
12254 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12255 if ((int)(id_vec
.size() / 2) > pool_size
) {
12256 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
12257 << pool_size
<< ")";
12262 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
12263 ostringstream items
;
12265 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
12269 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
12272 if (!osdmap
.exists(from
)) {
12273 ss
<< "osd." << from
<< " does not exist";
12277 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
12278 ss
<< "osd." << to
<< " does not exist";
12282 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
12283 auto it
= std::find(new_pg_upmap_items
.begin(),
12284 new_pg_upmap_items
.end(), entry
);
12285 if (it
!= new_pg_upmap_items
.end()) {
12286 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
12289 new_pg_upmap_items
.push_back(entry
);
12290 items
<< from
<< "->" << to
<< ",";
12292 string
out(items
.str());
12293 out
.resize(out
.size() - 1); // drop last ','
12296 if (new_pg_upmap_items
.empty()) {
12297 ss
<< "no valid upmap items(pairs) is specified";
12302 pending_inc
.new_pg_upmap_items
[pgid
] =
12303 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
12304 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
12305 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
12309 case OP_RM_PG_UPMAP_ITEMS
:
12311 pending_inc
.old_pg_upmap_items
.insert(pgid
);
12312 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
12317 ceph_abort_msg("invalid option");
12321 } else if (prefix
== "osd primary-affinity") {
12323 if (!cmd_getval(cmdmap
, "id", id
)) {
12324 ss
<< "invalid osd id value '"
12325 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12330 if (!cmd_getval(cmdmap
, "weight", w
)) {
12331 ss
<< "unable to parse 'weight' value '"
12332 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12336 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
12338 ss
<< "weight must be >= 0";
12342 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12343 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12344 ss
<< "require_min_compat_client "
12345 << osdmap
.require_min_compat_client
12346 << " < firefly, which is required for primary-affinity";
12350 if (osdmap
.exists(id
)) {
12351 pending_inc
.new_primary_affinity
[id
] = ww
;
12352 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << std::ios::hex
<< ww
<< std::ios::dec
<< ")";
12354 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12355 get_last_committed() + 1));
12358 ss
<< "osd." << id
<< " does not exist";
12362 } else if (prefix
== "osd reweight") {
12364 if (!cmd_getval(cmdmap
, "id", id
)) {
12365 ss
<< "unable to parse osd id value '"
12366 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12371 if (!cmd_getval(cmdmap
, "weight", w
)) {
12372 ss
<< "unable to parse weight value '"
12373 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12377 long ww
= (int)((double)CEPH_OSD_IN
*w
);
12379 ss
<< "weight must be >= 0";
12383 if (osdmap
.exists(id
)) {
12384 pending_inc
.new_weight
[id
] = ww
;
12385 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
12387 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12388 get_last_committed() + 1));
12391 ss
<< "osd." << id
<< " does not exist";
12395 } else if (prefix
== "osd reweightn") {
12396 map
<int32_t, uint32_t> weights
;
12397 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
12399 ss
<< "unable to parse 'weights' value '"
12400 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
12403 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
12404 wait_for_finished_proposal(
12406 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
12408 } else if (prefix
== "osd lost") {
12410 if (!cmd_getval(cmdmap
, "id", id
)) {
12411 ss
<< "unable to parse osd id value '"
12412 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12417 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12419 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
12420 "--yes-i-really-mean-it if you really do.";
12423 } else if (!osdmap
.exists(id
)) {
12424 ss
<< "osd." << id
<< " does not exist";
12427 } else if (!osdmap
.is_down(id
)) {
12428 ss
<< "osd." << id
<< " is not down";
12432 epoch_t e
= osdmap
.get_info(id
).down_at
;
12433 pending_inc
.new_lost
[id
] = e
;
12434 ss
<< "marked osd lost in epoch " << e
;
12436 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12437 get_last_committed() + 1));
12441 } else if (prefix
== "osd destroy-actual" ||
12442 prefix
== "osd purge-actual" ||
12443 prefix
== "osd purge-new") {
12444 /* Destroying an OSD means that we don't expect to further make use of
12445 * the OSDs data (which may even become unreadable after this operation),
12446 * and that we are okay with scrubbing all its cephx keys and config-key
12447 * data (which may include lockbox keys, thus rendering the osd's data
12450 * The OSD will not be removed. Instead, we will mark it as destroyed,
12451 * such that a subsequent call to `create` will not reuse the osd id.
12452 * This will play into being able to recreate the OSD, at the same
12453 * crush location, with minimal data movement.
12456 // make sure authmon is writeable.
12457 if (!mon
.authmon()->is_writeable()) {
12458 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12459 << "osd destroy" << dendl
;
12460 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12465 if (!cmd_getval(cmdmap
, "id", id
)) {
12466 auto p
= cmdmap
.find("id");
12467 if (p
== cmdmap
.end()) {
12468 ss
<< "no osd id specified";
12470 ss
<< "unable to parse osd id value '"
12471 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12477 bool is_destroy
= (prefix
== "osd destroy-actual");
12479 ceph_assert("osd purge-actual" == prefix
||
12480 "osd purge-new" == prefix
);
12484 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12486 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12487 << "This will mean real, permanent data loss, as well "
12488 << "as deletion of cephx and lockbox keys. "
12489 << "Pass --yes-i-really-mean-it if you really do.";
12492 } else if (!osdmap
.exists(id
)) {
12493 ss
<< "osd." << id
<< " does not exist";
12494 err
= 0; // idempotent
12496 } else if (osdmap
.is_up(id
)) {
12497 ss
<< "osd." << id
<< " is not `down`.";
12500 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12501 ss
<< "destroyed osd." << id
;
12506 if (prefix
== "osd purge-new" &&
12507 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12508 ss
<< "osd." << id
<< " is not new";
12513 bool goto_reply
= false;
12517 err
= prepare_command_osd_destroy(id
, ss
);
12518 // we checked above that it should exist.
12519 ceph_assert(err
!= -ENOENT
);
12521 err
= prepare_command_osd_purge(id
, ss
);
12522 if (err
== -ENOENT
) {
12524 ss
<< "osd." << id
<< " does not exist.";
12530 if (err
< 0 || goto_reply
) {
12535 ss
<< "destroyed osd." << id
;
12537 ss
<< "purged osd." << id
;
12541 wait_for_finished_proposal(op
,
12542 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12543 force_immediate_propose();
12546 } else if (prefix
== "osd new") {
12548 // make sure authmon is writeable.
12549 if (!mon
.authmon()->is_writeable()) {
12550 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12551 << "osd new" << dendl
;
12552 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12556 map
<string
,string
> param_map
;
12558 bufferlist bl
= m
->get_data();
12559 string param_json
= bl
.to_str();
12560 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12562 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12566 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12569 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12582 if (err
== EEXIST
) {
12583 // idempotent operation
12588 wait_for_finished_proposal(op
,
12589 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12590 get_last_committed() + 1));
12591 force_immediate_propose();
12594 } else if (prefix
== "osd create") {
12596 // optional id provided?
12597 int64_t id
= -1, cmd_id
= -1;
12598 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12600 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12604 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12609 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12610 if (!uuid
.parse(uuidstr
.c_str())) {
12611 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12615 // we only care about the id if we also have the uuid, to
12616 // ensure the operation's idempotency.
12620 int32_t new_id
= -1;
12621 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12623 if (err
== -EAGAIN
) {
12624 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12627 // a check has failed; reply to the user.
12630 } else if (err
== EEXIST
) {
12631 // this is an idempotent operation; we can go ahead and reply.
12633 f
->open_object_section("created_osd");
12634 f
->dump_int("osdid", new_id
);
12635 f
->close_section();
12645 string empty_device_class
;
12646 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12649 f
->open_object_section("created_osd");
12650 f
->dump_int("osdid", new_id
);
12651 f
->close_section();
12657 wait_for_finished_proposal(op
,
12658 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12659 get_last_committed() + 1));
12662 } else if (prefix
== "osd blocklist clear" ||
12663 prefix
== "osd blacklist clear") {
12664 pending_inc
.new_blocklist
.clear();
12665 std::list
<std::pair
<entity_addr_t
,utime_t
> > blocklist
;
12666 osdmap
.get_blocklist(&blocklist
);
12667 for (const auto &entry
: blocklist
) {
12668 pending_inc
.old_blocklist
.push_back(entry
.first
);
12670 ss
<< " removed all blocklist entries";
12672 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12673 get_last_committed() + 1));
12675 } else if (prefix
== "osd blocklist" ||
12676 prefix
== "osd blacklist") {
12678 cmd_getval(cmdmap
, "addr", addrstr
);
12679 entity_addr_t addr
;
12680 if (!addr
.parse(addrstr
)) {
12681 ss
<< "unable to parse address " << addrstr
;
12686 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12687 // always blocklist type ANY
12688 addr
.set_type(entity_addr_t::TYPE_ANY
);
12690 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12693 string blocklistop
;
12694 if (!cmd_getval(cmdmap
, "blocklistop", blocklistop
)) {
12695 cmd_getval(cmdmap
, "blacklistop", blocklistop
);
12697 if (blocklistop
== "add") {
12698 utime_t expires
= ceph_clock_now();
12699 // default one hour
12700 double d
= cmd_getval_or
<double>(cmdmap
, "expire",
12701 g_conf()->mon_osd_blocklist_default_expire
);
12704 pending_inc
.new_blocklist
[addr
] = expires
;
12707 // cancel any pending un-blocklisting request too
12708 auto it
= std::find(pending_inc
.old_blocklist
.begin(),
12709 pending_inc
.old_blocklist
.end(), addr
);
12710 if (it
!= pending_inc
.old_blocklist
.end()) {
12711 pending_inc
.old_blocklist
.erase(it
);
12715 ss
<< "blocklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12717 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12718 get_last_committed() + 1));
12720 } else if (blocklistop
== "rm") {
12721 if (osdmap
.is_blocklisted(addr
) ||
12722 pending_inc
.new_blocklist
.count(addr
)) {
12723 if (osdmap
.is_blocklisted(addr
))
12724 pending_inc
.old_blocklist
.push_back(addr
);
12726 pending_inc
.new_blocklist
.erase(addr
);
12727 ss
<< "un-blocklisting " << addr
;
12729 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12730 get_last_committed() + 1));
12733 ss
<< addr
<< " isn't blocklisted";
12738 } else if (prefix
== "osd pool mksnap") {
12740 cmd_getval(cmdmap
, "pool", poolstr
);
12741 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12743 ss
<< "unrecognized pool '" << poolstr
<< "'";
12748 cmd_getval(cmdmap
, "snap", snapname
);
12749 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12750 if (p
->is_unmanaged_snaps_mode()) {
12751 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12754 } else if (p
->snap_exists(snapname
.c_str())) {
12755 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12758 } else if (p
->is_tier()) {
12759 ss
<< "pool " << poolstr
<< " is a cache tier";
12764 if (pending_inc
.new_pools
.count(pool
))
12765 pp
= &pending_inc
.new_pools
[pool
];
12767 pp
= &pending_inc
.new_pools
[pool
];
12770 if (pp
->snap_exists(snapname
.c_str())) {
12771 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12773 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
12774 pp
->set_snap_epoch(pending_inc
.epoch
);
12775 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
12778 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12779 get_last_committed() + 1));
12781 } else if (prefix
== "osd pool rmsnap") {
12783 cmd_getval(cmdmap
, "pool", poolstr
);
12784 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12786 ss
<< "unrecognized pool '" << poolstr
<< "'";
12791 cmd_getval(cmdmap
, "snap", snapname
);
12792 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12793 if (p
->is_unmanaged_snaps_mode()) {
12794 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12797 } else if (!p
->snap_exists(snapname
.c_str())) {
12798 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
12803 if (pending_inc
.new_pools
.count(pool
))
12804 pp
= &pending_inc
.new_pools
[pool
];
12806 pp
= &pending_inc
.new_pools
[pool
];
12809 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
12811 pp
->remove_snap(sn
);
12812 pp
->set_snap_epoch(pending_inc
.epoch
);
12813 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
12815 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
12818 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12819 get_last_committed() + 1));
12821 } else if (prefix
== "osd pool create") {
12822 int64_t pg_num
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num", 0);
12823 int64_t pg_num_min
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num_min", 0);
12824 int64_t pg_num_max
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num_max", 0);
12825 int64_t pgp_num
= cmd_getval_or
<int64_t>(cmdmap
, "pgp_num", pg_num
);
12826 string pool_type_str
;
12827 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
12828 if (pool_type_str
.empty())
12829 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
12832 cmd_getval(cmdmap
, "pool", poolstr
);
12833 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12834 if (pool_id
>= 0) {
12835 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12836 if (pool_type_str
!= p
->get_type_name()) {
12837 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
12840 ss
<< "pool '" << poolstr
<< "' already exists";
12847 if (pool_type_str
== "replicated") {
12848 pool_type
= pg_pool_t::TYPE_REPLICATED
;
12849 } else if (pool_type_str
== "erasure") {
12850 pool_type
= pg_pool_t::TYPE_ERASURE
;
12852 ss
<< "unknown pool type '" << pool_type_str
<< "'";
12857 bool implicit_rule_creation
= false;
12858 int64_t expected_num_objects
= 0;
12860 cmd_getval(cmdmap
, "rule", rule_name
);
12861 string erasure_code_profile
;
12862 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
12864 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
12865 if (erasure_code_profile
== "")
12866 erasure_code_profile
= "default";
12867 //handle the erasure code profile
12868 if (erasure_code_profile
== "default") {
12869 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
12870 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
12871 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
12875 map
<string
,string
> profile_map
;
12876 err
= osdmap
.get_erasure_code_profile_default(cct
,
12881 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
12882 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
12886 if (rule_name
== "") {
12887 implicit_rule_creation
= true;
12888 if (erasure_code_profile
== "default") {
12889 rule_name
= "erasure-code";
12891 dout(1) << "implicitly use rule named after the pool: "
12892 << poolstr
<< dendl
;
12893 rule_name
= poolstr
;
12896 expected_num_objects
=
12897 cmd_getval_or
<int64_t>(cmdmap
, "expected_num_objects", 0);
12899 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12900 // and put expected_num_objects to rule field
12901 if (erasure_code_profile
!= "") { // cmd is from CLI
12902 if (rule_name
!= "") {
12904 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
12905 if (interr
.length()) {
12906 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
12911 rule_name
= erasure_code_profile
;
12912 } else { // cmd is well-formed
12913 expected_num_objects
=
12914 cmd_getval_or
<int64_t>(cmdmap
, "expected_num_objects", 0);
12918 if (!implicit_rule_creation
&& rule_name
!= "") {
12920 err
= get_crush_rule(rule_name
, &rule
, &ss
);
12921 if (err
== -EAGAIN
) {
12922 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12929 if (expected_num_objects
< 0) {
12930 ss
<< "'expected_num_objects' must be non-negative";
12936 osdmap
.get_all_osds(osds
);
12937 bool has_filestore_osd
= std::any_of(osds
.begin(), osds
.end(), [this](int osd
) {
12939 if (!get_osd_objectstore_type(osd
, &type
)) {
12940 return type
== "filestore";
12946 if (has_filestore_osd
&&
12947 expected_num_objects
> 0 &&
12948 cct
->_conf
->filestore_merge_threshold
> 0) {
12949 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12954 if (has_filestore_osd
&&
12955 expected_num_objects
== 0 &&
12956 cct
->_conf
->filestore_merge_threshold
< 0) {
12957 int osds
= osdmap
.get_num_osds();
12959 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12960 if (!sure
&& osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
12961 ss
<< "For better initial performance on pools expected to store a "
12962 << "large number of objects, consider supplying the "
12963 << "expected_num_objects parameter when creating the pool."
12964 << " Pass --yes-i-really-mean-it to ignore it";
12970 int64_t fast_read_param
= cmd_getval_or
<int64_t>(cmdmap
, "fast_read", -1);
12971 FastReadType fast_read
= FAST_READ_DEFAULT
;
12972 if (fast_read_param
== 0)
12973 fast_read
= FAST_READ_OFF
;
12974 else if (fast_read_param
> 0)
12975 fast_read
= FAST_READ_ON
;
12977 int64_t repl_size
= 0;
12978 cmd_getval(cmdmap
, "size", repl_size
);
12979 int64_t target_size_bytes
= 0;
12980 double target_size_ratio
= 0.0;
12981 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
12982 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
12984 string pg_autoscale_mode
;
12985 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
12987 bool bulk
= cmd_getval_or
<bool>(cmdmap
, "bulk", 0);
12988 err
= prepare_new_pool(poolstr
,
12989 -1, // default crush rule
12991 pg_num
, pgp_num
, pg_num_min
, pg_num_max
,
12992 repl_size
, target_size_bytes
, target_size_ratio
,
12993 erasure_code_profile
, pool_type
,
12994 (uint64_t)expected_num_objects
,
13002 ss
<< "pool '" << poolstr
<< "' already exists";
13005 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13014 ss
<< "pool '" << poolstr
<< "' created";
13017 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13018 get_last_committed() + 1));
13021 } else if (prefix
== "osd pool delete" ||
13022 prefix
== "osd pool rm") {
13023 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13024 string poolstr
, poolstr2
, sure
;
13025 cmd_getval(cmdmap
, "pool", poolstr
);
13026 cmd_getval(cmdmap
, "pool2", poolstr2
);
13027 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
13029 ss
<< "pool '" << poolstr
<< "' does not exist";
13034 bool force_no_fake
= false;
13035 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
13036 bool force
= false;
13037 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
13038 if (poolstr2
!= poolstr
||
13039 (!force
&& !force_no_fake
)) {
13040 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13041 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13042 << "followed by --yes-i-really-really-mean-it.";
13046 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
13047 if (err
== -EAGAIN
) {
13048 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13054 } else if (prefix
== "osd pool rename") {
13055 string srcpoolstr
, destpoolstr
;
13056 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
13057 cmd_getval(cmdmap
, "destpool", destpoolstr
);
13058 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
13059 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
13061 if (pool_src
< 0) {
13062 if (pool_dst
>= 0) {
13063 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13064 // of operations, assume this rename succeeded, as it is not changing
13065 // the current state. Make sure we output something understandable
13066 // for whoever is issuing the command, if they are paying attention,
13067 // in case it was not intentional; or to avoid a "wtf?" and a bug
13068 // report in case it was intentional, while expecting a failure.
13069 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
13070 << destpoolstr
<< "' does -- assuming successful rename";
13073 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
13077 } else if (pool_dst
>= 0) {
13078 // source pool exists and so does the destination pool
13079 ss
<< "pool '" << destpoolstr
<< "' already exists";
13084 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
13086 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
13088 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
13089 << cpp_strerror(ret
);
13092 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
13093 get_last_committed() + 1));
13096 } else if (prefix
== "osd pool set") {
13097 err
= prepare_command_pool_set(cmdmap
, ss
);
13098 if (err
== -EAGAIN
)
13104 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13105 get_last_committed() + 1));
13107 } else if (prefix
== "osd tier add") {
13108 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13109 if (err
== -EAGAIN
)
13114 cmd_getval(cmdmap
, "pool", poolstr
);
13115 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13117 ss
<< "unrecognized pool '" << poolstr
<< "'";
13121 string tierpoolstr
;
13122 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13123 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13124 if (tierpool_id
< 0) {
13125 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13129 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13131 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13134 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13138 // make sure new tier is empty
13139 bool force_nonempty
= false;
13140 cmd_getval_compat_cephbool(cmdmap
, "force_nonempty", force_nonempty
);
13141 const pool_stat_t
*pstats
= mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13142 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
13144 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
13148 if (tp
->is_erasure()) {
13149 ss
<< "tier pool '" << tierpoolstr
13150 << "' is an ec pool, which cannot be a tier";
13154 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
13155 (!force_nonempty
||
13156 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
)) {
13157 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
13162 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13163 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13164 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13165 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13168 np
->tiers
.insert(tierpool_id
);
13169 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13170 ntp
->tier_of
= pool_id
;
13171 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
13172 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13173 get_last_committed() + 1));
13175 } else if (prefix
== "osd tier remove" ||
13176 prefix
== "osd tier rm") {
13178 cmd_getval(cmdmap
, "pool", poolstr
);
13179 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13181 ss
<< "unrecognized pool '" << poolstr
<< "'";
13185 string tierpoolstr
;
13186 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13187 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13188 if (tierpool_id
< 0) {
13189 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13193 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13195 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13198 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
13202 if (p
->tiers
.count(tierpool_id
) == 0) {
13203 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13207 if (tp
->tier_of
!= pool_id
) {
13208 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
13209 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
13210 // be scary about it; this is an inconsistency and bells must go off
13211 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13215 if (p
->read_tier
== tierpool_id
) {
13216 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
13221 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13222 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13223 if (np
->tiers
.count(tierpool_id
) == 0 ||
13224 ntp
->tier_of
!= pool_id
||
13225 np
->read_tier
== tierpool_id
) {
13226 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13229 np
->tiers
.erase(tierpool_id
);
13231 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13232 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13233 get_last_committed() + 1));
13235 } else if (prefix
== "osd tier set-overlay") {
13236 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13237 if (err
== -EAGAIN
)
13242 cmd_getval(cmdmap
, "pool", poolstr
);
13243 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13245 ss
<< "unrecognized pool '" << poolstr
<< "'";
13249 string overlaypoolstr
;
13250 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
13251 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
13252 if (overlaypool_id
< 0) {
13253 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
13257 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13259 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
13260 ceph_assert(overlay_p
);
13261 if (p
->tiers
.count(overlaypool_id
) == 0) {
13262 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
13266 if (p
->read_tier
== overlaypool_id
) {
13268 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13271 if (p
->has_read_tier()) {
13272 ss
<< "pool '" << poolstr
<< "' has overlay '"
13273 << osdmap
.get_pool_name(p
->read_tier
)
13274 << "'; please remove-overlay first";
13280 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13281 np
->read_tier
= overlaypool_id
;
13282 np
->write_tier
= overlaypool_id
;
13283 np
->set_last_force_op_resend(pending_inc
.epoch
);
13284 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
13285 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
13286 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13287 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
13288 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
13289 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13290 get_last_committed() + 1));
13292 } else if (prefix
== "osd tier remove-overlay" ||
13293 prefix
== "osd tier rm-overlay") {
13295 cmd_getval(cmdmap
, "pool", poolstr
);
13296 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13298 ss
<< "unrecognized pool '" << poolstr
<< "'";
13302 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13304 if (!p
->has_read_tier()) {
13306 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13310 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
13315 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13316 if (np
->has_read_tier()) {
13317 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
13318 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
13319 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13321 if (np
->has_write_tier()) {
13322 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
13323 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
13324 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13326 np
->clear_read_tier();
13327 np
->clear_write_tier();
13328 np
->set_last_force_op_resend(pending_inc
.epoch
);
13329 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13330 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13331 get_last_committed() + 1));
13333 } else if (prefix
== "osd tier cache-mode") {
13334 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13335 if (err
== -EAGAIN
)
13340 cmd_getval(cmdmap
, "pool", poolstr
);
13341 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13343 ss
<< "unrecognized pool '" << poolstr
<< "'";
13347 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13349 if (!p
->is_tier()) {
13350 ss
<< "pool '" << poolstr
<< "' is not a tier";
13355 cmd_getval(cmdmap
, "mode", modestr
);
13356 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13357 if (int(mode
) < 0) {
13358 ss
<< "'" << modestr
<< "' is not a valid cache mode";
13364 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13366 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
13367 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
13368 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
13372 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13373 mode
!= pg_pool_t::CACHEMODE_NONE
&&
13374 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13375 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
13377 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
13378 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13383 // pool already has this cache-mode set and there are no pending changes
13384 if (p
->cache_mode
== mode
&&
13385 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
13386 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
13387 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
13388 << " to " << pg_pool_t::get_cache_mode_name(mode
);
13393 /* Mode description:
13395 * none: No cache-mode defined
13396 * forward: Forward all reads and writes to base pool [removed]
13397 * writeback: Cache writes, promote reads from base pool
13398 * readonly: Forward writes to base pool
13399 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13400 * proxy: Proxy all reads and writes to base pool
13401 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13403 * Hence, these are the allowed transitions:
13406 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13407 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13408 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13409 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13410 * writeback -> readproxy || proxy
13414 // We check if the transition is valid against the current pool mode, as
13415 // it is the only committed state thus far. We will blantly squash
13416 // whatever mode is on the pending state.
13418 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
13419 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13420 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
13421 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
13422 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
13423 << "' pool; only '"
13424 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
13429 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
13430 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13431 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13432 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13434 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
13435 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13436 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13438 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13439 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13440 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13442 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13443 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13444 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13445 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13447 const pool_stat_t
* pstats
=
13448 mon
.mgrstatmon()->get_pool_stat(pool_id
);
13450 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13451 ss
<< "unable to set cache-mode '"
13452 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13453 << "': dirty objects found";
13459 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13460 np
->cache_mode
= mode
;
13461 // set this both when moving to and from cache_mode NONE. this is to
13462 // capture legacy pools that were set up before this flag existed.
13463 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13464 ss
<< "set cache-mode for pool '" << poolstr
13465 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13466 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13467 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13468 ceph_assert(base_pool
);
13469 if (base_pool
->read_tier
== pool_id
||
13470 base_pool
->write_tier
== pool_id
)
13471 ss
<<" (WARNING: pool is still configured as read or write tier)";
13473 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13474 get_last_committed() + 1));
13476 } else if (prefix
== "osd tier add-cache") {
13477 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13478 if (err
== -EAGAIN
)
13483 cmd_getval(cmdmap
, "pool", poolstr
);
13484 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13486 ss
<< "unrecognized pool '" << poolstr
<< "'";
13490 string tierpoolstr
;
13491 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13492 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13493 if (tierpool_id
< 0) {
13494 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13498 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13500 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13503 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13508 if (!cmd_getval(cmdmap
, "size", size
)) {
13509 ss
<< "unable to parse 'size' value '"
13510 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13514 // make sure new tier is empty
13515 const pool_stat_t
*pstats
=
13516 mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13517 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13518 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13522 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13523 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13524 if (int(mode
) < 0) {
13525 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13529 HitSet::Params hsp
;
13530 auto& cache_hit_set_type
=
13531 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13532 if (cache_hit_set_type
== "bloom") {
13533 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13534 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13535 hsp
= HitSet::Params(bsp
);
13536 } else if (cache_hit_set_type
== "explicit_hash") {
13537 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13538 } else if (cache_hit_set_type
== "explicit_object") {
13539 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13541 ss
<< "osd tier cache default hit set type '"
13542 << cache_hit_set_type
<< "' is not a known type";
13547 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13548 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13549 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13550 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13553 np
->tiers
.insert(tierpool_id
);
13554 np
->read_tier
= np
->write_tier
= tierpool_id
;
13555 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13556 np
->set_last_force_op_resend(pending_inc
.epoch
);
13557 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13558 ntp
->tier_of
= pool_id
;
13559 ntp
->cache_mode
= mode
;
13560 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13561 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13562 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13563 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13564 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13565 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13566 ntp
->hit_set_params
= hsp
;
13567 ntp
->target_max_bytes
= size
;
13568 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13569 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13570 get_last_committed() + 1));
13572 } else if (prefix
== "osd pool set-quota") {
13574 cmd_getval(cmdmap
, "pool", poolstr
);
13575 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13577 ss
<< "unrecognized pool '" << poolstr
<< "'";
13583 cmd_getval(cmdmap
, "field", field
);
13584 if (field
!= "max_objects" && field
!= "max_bytes") {
13585 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13590 // val could contain unit designations, so we treat as a string
13592 cmd_getval(cmdmap
, "val", val
);
13595 if (field
== "max_objects") {
13596 value
= strict_si_cast
<uint64_t>(val
, &tss
);
13597 } else if (field
== "max_bytes") {
13598 value
= strict_iecstrtoll(val
, &tss
);
13600 ceph_abort_msg("unrecognized option");
13602 if (!tss
.empty()) {
13603 ss
<< "error parsing value '" << val
<< "': " << tss
;
13608 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13609 if (field
== "max_objects") {
13610 pi
->quota_max_objects
= value
;
13611 } else if (field
== "max_bytes") {
13612 pi
->quota_max_bytes
= value
;
13614 ceph_abort_msg("unrecognized option");
13616 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13618 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13619 get_last_committed() + 1));
13621 } else if (prefix
== "osd pool application enable" ||
13622 prefix
== "osd pool application disable" ||
13623 prefix
== "osd pool application set" ||
13624 prefix
== "osd pool application rm") {
13625 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13626 if (err
== -EAGAIN
) {
13628 } else if (err
< 0) {
13633 } else if (prefix
== "osd force-create-pg") {
13636 cmd_getval(cmdmap
, "pgid", pgidstr
);
13637 if (!pgid
.parse(pgidstr
.c_str())) {
13638 ss
<< "invalid pgid '" << pgidstr
<< "'";
13642 if (!osdmap
.pg_exists(pgid
)) {
13643 ss
<< "pg " << pgid
<< " should not exist";
13648 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13650 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13651 << "that the cluster will give up ever trying to recover the lost data. Do this "
13652 << "only if you are certain that all copies of the PG are in fact lost and you are "
13653 << "willing to accept that the data is permanently destroyed. Pass "
13654 << "--yes-i-really-mean-it to proceed.";
13660 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13661 auto emplaced
= creating_pgs
.pgs
.emplace(
13663 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13664 ceph_clock_now()));
13665 creating_now
= emplaced
.second
;
13667 if (creating_now
) {
13668 ss
<< "pg " << pgidstr
<< " now creating, ok";
13669 // set the pool's CREATING flag so that (1) the osd won't ignore our
13670 // create message and (2) we won't propose any future pg_num changes
13671 // until after the PG has been instantiated.
13672 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13673 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13675 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13679 ss
<< "pg " << pgid
<< " already creating";
13683 } else if (prefix
== "osd force_healthy_stretch_mode") {
13685 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13687 ss
<< "This command will require peering across multiple CRUSH buckets "
13688 "(probably two data centers or availability zones?) and may result in PGs "
13689 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13693 try_end_recovery_stretch_mode(true);
13694 ss
<< "Triggering healthy stretch mode";
13697 } else if (prefix
== "osd force_recovery_stretch_mode") {
13699 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13701 ss
<< "This command will increase pool sizes to try and spread them "
13702 "across multiple CRUSH buckets (probably two data centers or "
13703 "availability zones?) and should have happened automatically"
13704 "Pass --yes-i-really-mean-it to proceed.";
13708 mon
.go_recovery_stretch_mode();
13709 ss
<< "Triggering recovery stretch mode";
13718 if (err
< 0 && rs
.length() == 0)
13719 rs
= cpp_strerror(err
);
13720 mon
.reply_command(op
, err
, rs
, rdata
, get_last_committed());
13725 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13726 get_last_committed() + 1));
13730 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13734 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
13736 op
->mark_osdmon_event(__func__
);
13738 auto m
= op
->get_req
<MPoolOp
>();
13739 MonSession
*session
= op
->get_session();
13741 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
13746 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13747 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13749 const std::string
* pool_name
= nullptr;
13750 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
13751 if (pg_pool
!= nullptr) {
13752 pool_name
= &osdmap
.get_pool_name(m
->pool
);
13755 if (!is_unmanaged_snap_op_permitted(cct
, mon
.key_server
,
13756 session
->entity_name
, session
->caps
,
13757 session
->get_peer_socket_addr(),
13759 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13760 << "privileges. message: " << *m
<< std::endl
13761 << "caps: " << session
->caps
<< dendl
;
13762 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
13768 if (!session
->is_capable("osd", MON_CAP_W
)) {
13769 dout(0) << "got pool op from entity with insufficient privileges. "
13770 << "message: " << *m
<< std::endl
13771 << "caps: " << session
->caps
<< dendl
;
13772 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
13781 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
13783 op
->mark_osdmon_event(__func__
);
13784 auto m
= op
->get_req
<MPoolOp
>();
13786 if (enforce_pool_op_caps(op
)) {
13790 if (m
->fsid
!= mon
.monmap
->fsid
) {
13791 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
13792 << " != " << mon
.monmap
->fsid
<< " for " << *m
<< dendl
;
13793 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13797 if (m
->op
== POOL_OP_CREATE
)
13798 return preprocess_pool_op_create(op
);
13800 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
13801 if (p
== nullptr) {
13802 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
13803 if (m
->op
== POOL_OP_DELETE
) {
13804 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13806 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13811 // check if the snap and snapname exist
13812 bool snap_exists
= false;
13813 if (p
->snap_exists(m
->name
.c_str()))
13814 snap_exists
= true;
13817 case POOL_OP_CREATE_SNAP
:
13818 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
13819 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13823 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13827 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13828 if (p
->is_pool_snaps_mode()) {
13829 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13833 case POOL_OP_DELETE_SNAP
:
13834 if (p
->is_unmanaged_snaps_mode()) {
13835 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13838 if (!snap_exists
) {
13839 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13843 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13844 if (p
->is_pool_snaps_mode()) {
13845 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13848 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
13849 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13853 case POOL_OP_DELETE
:
13854 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
13855 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13859 case POOL_OP_AUID_CHANGE
:
13869 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
13871 if (!osdmap
.have_pg_pool(pool
)) {
13872 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13873 << " - pool dne" << dendl
;
13876 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
13877 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13878 << " - in osdmap removed_snaps_queue" << dendl
;
13881 snapid_t begin
, end
;
13882 int r
= lookup_purged_snap(pool
, snap
, &begin
, &end
);
13884 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13885 << " - purged, [" << begin
<< "," << end
<< ")" << dendl
;
13891 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
13893 if (pending_inc
.old_pools
.count(pool
)) {
13894 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13895 << " - pool pending deletion" << dendl
;
13898 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
13899 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13900 << " - in pending new_removed_snaps" << dendl
;
13906 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
13908 op
->mark_osdmon_event(__func__
);
13909 auto m
= op
->get_req
<MPoolOp
>();
13910 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
13912 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13919 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
13921 op
->mark_osdmon_event(__func__
);
13922 auto m
= op
->get_req
<MPoolOp
>();
13923 dout(10) << "prepare_pool_op " << *m
<< dendl
;
13924 if (m
->op
== POOL_OP_CREATE
) {
13925 return prepare_pool_op_create(op
);
13926 } else if (m
->op
== POOL_OP_DELETE
) {
13927 return prepare_pool_op_delete(op
);
13931 bool changed
= false;
13933 if (!osdmap
.have_pg_pool(m
->pool
)) {
13934 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13938 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
13941 case POOL_OP_CREATE_SNAP
:
13942 if (pool
->is_tier()) {
13944 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13946 } // else, fall through
13947 case POOL_OP_DELETE_SNAP
:
13948 if (!pool
->is_unmanaged_snaps_mode()) {
13949 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
13950 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
13951 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
13959 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13962 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13963 // we won't allow removal of an unmanaged snapshot from a pool
13964 // not in unmanaged snaps mode.
13965 if (!pool
->is_unmanaged_snaps_mode()) {
13966 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
13970 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13971 // but we will allow creating an unmanaged snapshot on any pool
13972 // as long as it is not in 'pool' snaps mode.
13973 if (pool
->is_pool_snaps_mode()) {
13974 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13979 // projected pool info
13981 if (pending_inc
.new_pools
.count(m
->pool
))
13982 pp
= pending_inc
.new_pools
[m
->pool
];
13984 pp
= *osdmap
.get_pg_pool(m
->pool
);
13986 bufferlist reply_data
;
13988 // pool snaps vs unmanaged snaps are mutually exclusive
13990 case POOL_OP_CREATE_SNAP
:
13991 case POOL_OP_DELETE_SNAP
:
13992 if (pp
.is_unmanaged_snaps_mode()) {
13998 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13999 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14000 if (pp
.is_pool_snaps_mode()) {
14007 case POOL_OP_CREATE_SNAP
:
14008 if (!pp
.snap_exists(m
->name
.c_str())) {
14009 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
14010 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
14011 << " seq " << pp
.get_snap_epoch() << dendl
;
14016 case POOL_OP_DELETE_SNAP
:
14018 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
14021 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
14027 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14029 uint64_t snapid
= pp
.add_unmanaged_snap(
14030 osdmap
.require_osd_release
< ceph_release_t::octopus
);
14031 encode(snapid
, reply_data
);
14036 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14037 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
14038 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
14039 if (m
->snapid
> pp
.get_snap_seq()) {
14040 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
14043 pp
.remove_unmanaged_snap(
14045 osdmap
.require_osd_release
< ceph_release_t::octopus
);
14046 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
14047 // also record the new seq as purged: this avoids a discontinuity
14048 // after all of the snaps have been purged, since the seq assigned
14049 // during removal lives in the same namespace as the actual snaps.
14050 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
14055 case POOL_OP_AUID_CHANGE
:
14056 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
14065 pp
.set_snap_epoch(pending_inc
.epoch
);
14066 pending_inc
.new_pools
[m
->pool
] = pp
;
14070 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
14074 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
14076 op
->mark_osdmon_event(__func__
);
14077 int err
= prepare_new_pool(op
);
14078 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
14082 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
14085 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
14087 // If the Pool is in use by CephFS, refuse to delete it
14088 FSMap
const &pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14089 if (pending_fsmap
.pool_in_use(pool_id
)) {
14090 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
14094 if (pool
.tier_of
>= 0) {
14095 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
14096 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
14099 if (!pool
.tiers
.empty()) {
14100 *ss
<< "pool '" << poolstr
<< "' has tiers";
14101 for(auto tier
: pool
.tiers
) {
14102 *ss
<< " " << osdmap
.get_pool_name(tier
);
14107 if (!g_conf()->mon_allow_pool_delete
) {
14108 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14112 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
14113 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
14117 *ss
<< "pool '" << poolstr
<< "' removed";
14122 * Check if it is safe to add a tier to a base pool
14125 * True if the operation should proceed, false if we should abort here
14126 * (abort doesn't necessarily mean error, could be idempotency)
14128 bool OSDMonitor::_check_become_tier(
14129 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
14130 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14134 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
14135 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14137 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14138 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
14139 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
14144 if (base_pool
->tiers
.count(tier_pool_id
)) {
14145 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
14147 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
14148 << base_pool_name
<< "'";
14152 if (base_pool
->is_tier()) {
14153 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
14154 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
14155 << "multiple tiers are not yet supported.";
14160 if (tier_pool
->has_tiers()) {
14161 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
14162 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
14163 it
!= tier_pool
->tiers
.end(); ++it
)
14164 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
14165 *ss
<< " multiple tiers are not yet supported.";
14170 if (tier_pool
->is_tier()) {
14171 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
14172 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
14183 * Check if it is safe to remove a tier from this base pool
14186 * True if the operation should proceed, false if we should abort here
14187 * (abort doesn't necessarily mean error, could be idempotency)
14189 bool OSDMonitor::_check_remove_tier(
14190 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14191 const pg_pool_t
*tier_pool
,
14192 int *err
, ostream
*ss
) const
14194 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14196 // Apply CephFS-specific checks
14197 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14198 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
14199 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
14200 // If the underlying pool is erasure coded and does not allow EC
14201 // overwrites, we can't permit the removal of the replicated tier that
14202 // CephFS relies on to access it
14203 *ss
<< "pool '" << base_pool_name
<<
14204 "' does not allow EC overwrites and is in use by CephFS"
14210 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
14211 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
14212 "tier is still in use as a writeback cache. Change the cache "
14213 "mode and flush the cache before removing it";
14223 int OSDMonitor::_prepare_remove_pool(
14224 int64_t pool
, ostream
*ss
, bool no_fake
)
14226 dout(10) << __func__
<< " " << pool
<< dendl
;
14227 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
14228 int r
= _check_remove_pool(pool
, *p
, ss
);
14232 auto new_pool
= pending_inc
.new_pools
.find(pool
);
14233 if (new_pool
!= pending_inc
.new_pools
.end()) {
14234 // if there is a problem with the pending info, wait and retry
14236 const auto& p
= new_pool
->second
;
14237 int r
= _check_remove_pool(pool
, p
, ss
);
14242 if (pending_inc
.old_pools
.count(pool
)) {
14243 dout(10) << __func__
<< " " << pool
<< " already pending removal"
14248 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
14249 string old_name
= osdmap
.get_pool_name(pool
);
14250 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
14251 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
14252 << old_name
<< " -> " << new_name
<< dendl
;
14253 pending_inc
.new_pool_names
[pool
] = new_name
;
14258 pending_inc
.old_pools
.insert(pool
);
14260 // remove any pg_temp mappings for this pool
14261 for (auto p
= osdmap
.pg_temp
->begin();
14262 p
!= osdmap
.pg_temp
->end();
14264 if (p
->first
.pool() == pool
) {
14265 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
14266 << p
->first
<< dendl
;
14267 pending_inc
.new_pg_temp
[p
->first
].clear();
14270 // remove any primary_temp mappings for this pool
14271 for (auto p
= osdmap
.primary_temp
->begin();
14272 p
!= osdmap
.primary_temp
->end();
14274 if (p
->first
.pool() == pool
) {
14275 dout(10) << __func__
<< " " << pool
14276 << " removing obsolete primary_temp" << p
->first
<< dendl
;
14277 pending_inc
.new_primary_temp
[p
->first
] = -1;
14280 // remove any pg_upmap mappings for this pool
14281 for (auto& p
: osdmap
.pg_upmap
) {
14282 if (p
.first
.pool() == pool
) {
14283 dout(10) << __func__
<< " " << pool
14284 << " removing obsolete pg_upmap "
14285 << p
.first
<< dendl
;
14286 pending_inc
.old_pg_upmap
.insert(p
.first
);
14289 // remove any pending pg_upmap mappings for this pool
14291 auto it
= pending_inc
.new_pg_upmap
.begin();
14292 while (it
!= pending_inc
.new_pg_upmap
.end()) {
14293 if (it
->first
.pool() == pool
) {
14294 dout(10) << __func__
<< " " << pool
14295 << " removing pending pg_upmap "
14296 << it
->first
<< dendl
;
14297 it
= pending_inc
.new_pg_upmap
.erase(it
);
14303 // remove any pg_upmap_items mappings for this pool
14304 for (auto& p
: osdmap
.pg_upmap_items
) {
14305 if (p
.first
.pool() == pool
) {
14306 dout(10) << __func__
<< " " << pool
14307 << " removing obsolete pg_upmap_items " << p
.first
14309 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
14312 // remove any pending pg_upmap mappings for this pool
14314 auto it
= pending_inc
.new_pg_upmap_items
.begin();
14315 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
14316 if (it
->first
.pool() == pool
) {
14317 dout(10) << __func__
<< " " << pool
14318 << " removing pending pg_upmap_items "
14319 << it
->first
<< dendl
;
14320 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
14327 // remove any choose_args for this pool
14328 CrushWrapper newcrush
= _get_pending_crush();
14329 if (newcrush
.have_choose_args(pool
)) {
14330 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
14331 newcrush
.rm_choose_args(pool
);
14332 pending_inc
.crush
.clear();
14333 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
14338 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
14340 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
14341 if (pending_inc
.old_pools
.count(pool
)) {
14342 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
14345 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
14346 p
!= pending_inc
.new_pool_names
.end();
14348 if (p
->second
== newname
&& p
->first
!= pool
) {
14353 pending_inc
.new_pool_names
[pool
] = newname
;
14357 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
14359 op
->mark_osdmon_event(__func__
);
14360 auto m
= op
->get_req
<MPoolOp
>();
14362 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
14363 if (ret
== -EAGAIN
) {
14364 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
14368 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
14369 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
14370 pending_inc
.epoch
));
14374 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
14375 int ret
, epoch_t epoch
, bufferlist
*blp
)
14377 op
->mark_osdmon_event(__func__
);
14378 auto m
= op
->get_req
<MPoolOp
>();
14379 dout(20) << "_pool_op_reply " << ret
<< dendl
;
14380 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
14381 ret
, epoch
, get_last_committed(), blp
);
14382 mon
.send_reply(op
, reply
);
14385 void OSDMonitor::convert_pool_priorities(void)
14387 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
14388 int64_t max_prio
= 0;
14389 int64_t min_prio
= 0;
14390 for (const auto &i
: osdmap
.get_pools()) {
14391 const auto &pool
= i
.second
;
14393 if (pool
.opts
.is_set(key
)) {
14395 pool
.opts
.get(key
, &prio
);
14396 if (prio
> max_prio
)
14398 if (prio
< min_prio
)
14402 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
14403 dout(20) << __func__
<< " nothing to fix" << dendl
;
14406 // Current pool priorities exceeds new maximum
14407 for (const auto &i
: osdmap
.get_pools()) {
14408 const auto pool_id
= i
.first
;
14409 pg_pool_t pool
= i
.second
;
14412 pool
.opts
.get(key
, &prio
);
14415 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
14416 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14417 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
14418 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
14419 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14420 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
14425 pool
.opts
.unset(key
);
14427 pool
.opts
.set(key
, static_cast<int64_t>(n
));
14429 dout(10) << __func__
<< " pool " << pool_id
14430 << " recovery_priority adjusted "
14431 << prio
<< " to " << n
<< dendl
;
14432 pool
.last_change
= pending_inc
.epoch
;
14433 pending_inc
.new_pools
[pool_id
] = pool
;
14437 void OSDMonitor::try_enable_stretch_mode_pools(stringstream
& ss
, bool *okay
,
14439 set
<pg_pool_t
*>* pools
,
14440 const string
& new_crush_rule
)
14442 dout(20) << __func__
<< dendl
;
14444 int new_crush_rule_result
= osdmap
.crush
->get_rule_id(new_crush_rule
);
14445 if (new_crush_rule_result
< 0) {
14446 ss
<< "unrecognized crush rule " << new_crush_rule_result
;
14447 *errcode
= new_crush_rule_result
;
14450 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
14451 for (const auto& pooli
: osdmap
.pools
) {
14452 int64_t poolid
= pooli
.first
;
14453 const pg_pool_t
*p
= &pooli
.second
;
14454 if (!p
->is_replicated()) {
14455 ss
<< "stretched pools must be replicated; '" << osdmap
.pool_name
[poolid
] << "' is erasure-coded";
14456 *errcode
= -EINVAL
;
14459 uint8_t default_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
14460 if ((p
->get_size() != default_size
||
14461 (p
->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size
))) &&
14462 (p
->get_crush_rule() != new_rule
)) {
14463 ss
<< "we currently require stretch mode pools start out with the"
14464 " default size/min_size, which '" << osdmap
.pool_name
[poolid
] << "' does not";
14465 *errcode
= -EINVAL
;
14468 pg_pool_t
*pp
= pending_inc
.get_new_pool(poolid
, p
);
14469 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14470 // the attempt may fail and then we have these pool updates...but they won't do anything
14471 // if there is a failure, so if it's hard to change the interface, no need to bother
14478 void OSDMonitor::try_enable_stretch_mode(stringstream
& ss
, bool *okay
,
14479 int *errcode
, bool commit
,
14480 const string
& dividing_bucket
,
14481 uint32_t bucket_count
,
14482 const set
<pg_pool_t
*>& pools
,
14483 const string
& new_crush_rule
)
14485 dout(20) << __func__
<< dendl
;
14487 CrushWrapper crush
= _get_pending_crush();
14488 int dividing_id
= -1;
14489 if (auto type_id
= crush
.get_validated_type_id(dividing_bucket
);
14490 !type_id
.has_value()) {
14491 ss
<< dividing_bucket
<< " is not a valid crush bucket type";
14492 *errcode
= -ENOENT
;
14493 ceph_assert(!commit
);
14496 dividing_id
= *type_id
;
14498 vector
<int> subtrees
;
14499 crush
.get_subtree_of_type(dividing_id
, &subtrees
);
14500 if (subtrees
.size() != 2) {
14501 ss
<< "there are " << subtrees
.size() << dividing_bucket
14502 << "'s in the cluster but stretch mode currently only works with 2!";
14503 *errcode
= -EINVAL
;
14504 ceph_assert(!commit
|| subtrees
.size() == 2);
14508 int new_crush_rule_result
= crush
.get_rule_id(new_crush_rule
);
14509 if (new_crush_rule_result
< 0) {
14510 ss
<< "unrecognized crush rule " << new_crush_rule
;
14511 *errcode
= new_crush_rule_result
;
14512 ceph_assert(!commit
|| (new_crush_rule_result
> 0));
14515 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
14517 int weight1
= crush
.get_item_weight(subtrees
[0]);
14518 int weight2
= crush
.get_item_weight(subtrees
[1]);
14519 if (weight1
!= weight2
) {
14520 // TODO: I'm really not sure this is a good idea?
14521 ss
<< "the 2 " << dividing_bucket
14522 << "instances in the cluster have differing weights "
14523 << weight1
<< " and " << weight2
14524 <<" but stretch mode currently requires they be the same!";
14525 *errcode
= -EINVAL
;
14526 ceph_assert(!commit
|| (weight1
== weight2
));
14529 if (bucket_count
!= 2) {
14530 ss
<< "currently we only support 2-site stretch clusters!";
14531 *errcode
= -EINVAL
;
14532 ceph_assert(!commit
|| bucket_count
== 2);
14535 // TODO: check CRUSH rules for pools so that we are appropriately divided
14537 for (auto pool
: pools
) {
14538 pool
->crush_rule
= new_rule
;
14539 pool
->peering_crush_bucket_count
= bucket_count
;
14540 pool
->peering_crush_bucket_target
= bucket_count
;
14541 pool
->peering_crush_bucket_barrier
= dividing_id
;
14542 pool
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14543 pool
->size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
14544 pool
->min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
14546 pending_inc
.change_stretch_mode
= true;
14547 pending_inc
.stretch_mode_enabled
= true;
14548 pending_inc
.new_stretch_bucket_count
= bucket_count
;
14549 pending_inc
.new_degraded_stretch_mode
= 0;
14550 pending_inc
.new_stretch_mode_bucket
= dividing_id
;
14556 bool OSDMonitor::check_for_dead_crush_zones(const map
<string
,set
<string
>>& dead_buckets
,
14557 set
<int> *really_down_buckets
,
14558 set
<string
> *really_down_mons
)
14560 dout(20) << __func__
<< " with dead mon zones " << dead_buckets
<< dendl
;
14561 ceph_assert(is_readable());
14562 if (dead_buckets
.empty()) return false;
14563 set
<int> down_cache
;
14564 bool really_down
= false;
14565 for (auto dbi
: dead_buckets
) {
14566 const string
& bucket_name
= dbi
.first
;
14567 ceph_assert(osdmap
.crush
->name_exists(bucket_name
));
14568 int bucket_id
= osdmap
.crush
->get_item_id(bucket_name
);
14569 dout(20) << "Checking " << bucket_name
<< " id " << bucket_id
14570 << " to see if OSDs are also down" << dendl
;
14571 bool subtree_down
= osdmap
.subtree_is_down(bucket_id
, &down_cache
);
14572 if (subtree_down
) {
14573 dout(20) << "subtree is down!" << dendl
;
14574 really_down
= true;
14575 really_down_buckets
->insert(bucket_id
);
14576 really_down_mons
->insert(dbi
.second
.begin(), dbi
.second
.end());
14579 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14580 << " and mons " << *really_down_mons
<< " are really down" << dendl
;
14581 return really_down
;
14584 void OSDMonitor::trigger_degraded_stretch_mode(const set
<int>& dead_buckets
,
14585 const set
<string
>& live_zones
)
14587 dout(20) << __func__
<< dendl
;
14588 stretch_recovery_triggered
.set_from_double(0); // reset this; we can't go clean now!
14589 // update the general OSDMap changes
14590 pending_inc
.change_stretch_mode
= true;
14591 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14592 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14593 int new_site_count
= osdmap
.stretch_bucket_count
- dead_buckets
.size();
14594 ceph_assert(new_site_count
== 1); // stretch count 2!
14595 pending_inc
.new_degraded_stretch_mode
= new_site_count
;
14596 pending_inc
.new_recovering_stretch_mode
= 0;
14597 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14599 // and then apply them to all the pg_pool_ts
14600 ceph_assert(live_zones
.size() == 1); // only support 2 zones now
14601 const string
& remaining_site_name
= *(live_zones
.begin());
14602 ceph_assert(osdmap
.crush
->name_exists(remaining_site_name
));
14603 int remaining_site
= osdmap
.crush
->get_item_id(remaining_site_name
);
14604 for (auto pgi
: osdmap
.pools
) {
14605 if (pgi
.second
.peering_crush_bucket_count
) {
14606 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14607 newp
.peering_crush_bucket_count
= new_site_count
;
14608 newp
.peering_crush_mandatory_member
= remaining_site
;
14609 newp
.min_size
= pgi
.second
.min_size
/ 2; // only support 2 zones now
14610 newp
.last_force_op_resend
= pending_inc
.epoch
;
14616 void OSDMonitor::trigger_recovery_stretch_mode()
14618 dout(20) << __func__
<< dendl
;
14619 stretch_recovery_triggered
.set_from_double(0); // reset this so we don't go full-active prematurely
14620 pending_inc
.change_stretch_mode
= true;
14621 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14622 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14623 pending_inc
.new_degraded_stretch_mode
= osdmap
.degraded_stretch_mode
;
14624 pending_inc
.new_recovering_stretch_mode
= 1;
14625 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14627 for (auto pgi
: osdmap
.pools
) {
14628 if (pgi
.second
.peering_crush_bucket_count
) {
14629 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14630 newp
.last_force_op_resend
= pending_inc
.epoch
;
14636 void OSDMonitor::set_degraded_stretch_mode()
14638 stretch_recovery_triggered
.set_from_double(0);
14641 void OSDMonitor::set_recovery_stretch_mode()
14643 if (stretch_recovery_triggered
.is_zero()) {
14644 stretch_recovery_triggered
= ceph_clock_now();
14648 void OSDMonitor::set_healthy_stretch_mode()
14650 stretch_recovery_triggered
.set_from_double(0);
14653 void OSDMonitor::notify_new_pg_digest()
14655 dout(20) << __func__
<< dendl
;
14656 if (!stretch_recovery_triggered
.is_zero()) {
14657 try_end_recovery_stretch_mode(false);
14661 struct CMonExitRecovery
: public Context
{
14664 CMonExitRecovery(OSDMonitor
*mon
, bool f
) : m(mon
), force(f
) {}
14665 void finish(int r
) {
14666 m
->try_end_recovery_stretch_mode(force
);
14670 void OSDMonitor::try_end_recovery_stretch_mode(bool force
)
14672 dout(20) << __func__
<< dendl
;
14673 if (!mon
.is_leader()) return;
14674 if (!mon
.is_degraded_stretch_mode()) return;
14675 if (!mon
.is_recovering_stretch_mode()) return;
14676 if (!is_readable()) {
14677 wait_for_readable_ctx(new CMonExitRecovery(this, force
));
14681 if (osdmap
.recovering_stretch_mode
&&
14682 ((!stretch_recovery_triggered
.is_zero() &&
14683 ceph_clock_now() - g_conf().get_val
<double>("mon_stretch_recovery_min_wait") >
14684 stretch_recovery_triggered
) ||
14686 if (!mon
.mgrstatmon()->is_readable()) {
14687 mon
.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force
));
14690 const PGMapDigest
& pgd
= mon
.mgrstatmon()->get_digest();
14691 double misplaced
, degraded
, inactive
, unknown
;
14692 pgd
.get_recovery_stats(&misplaced
, °raded
, &inactive
, &unknown
);
14693 if (force
|| (degraded
== 0.0 && inactive
== 0.0 && unknown
== 0.0)) {
14694 // we can exit degraded stretch mode!
14695 mon
.trigger_healthy_stretch_mode();
14700 void OSDMonitor::trigger_healthy_stretch_mode()
14702 ceph_assert(is_writeable());
14703 stretch_recovery_triggered
.set_from_double(0);
14704 pending_inc
.change_stretch_mode
= true;
14705 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14706 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14707 pending_inc
.new_degraded_stretch_mode
= 0; // turn off degraded mode...
14708 pending_inc
.new_recovering_stretch_mode
= 0; //...and recovering mode!
14709 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14710 for (auto pgi
: osdmap
.pools
) {
14711 if (pgi
.second
.peering_crush_bucket_count
) {
14712 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14713 newp
.peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
14714 newp
.peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14715 newp
.min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
14716 newp
.last_force_op_resend
= pending_inc
.epoch
;