1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Loic Dachary <loic@dachary.org>
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MRoute.h"
57 #include "messages/MMonGetPurgedSnaps.h"
58 #include "messages/MMonGetPurgedSnapsReply.h"
60 #include "common/TextTable.h"
61 #include "common/Timer.h"
62 #include "common/ceph_argparse.h"
63 #include "common/perf_counters.h"
64 #include "common/PriorityCache.h"
65 #include "common/strtol.h"
66 #include "common/numa.h"
68 #include "common/config.h"
69 #include "common/errno.h"
71 #include "erasure-code/ErasureCodePlugin.h"
72 #include "compressor/Compressor.h"
73 #include "common/Checksummer.h"
75 #include "include/compat.h"
76 #include "include/ceph_assert.h"
77 #include "include/stringify.h"
78 #include "include/util.h"
79 #include "common/cmdparse.h"
80 #include "include/str_list.h"
81 #include "include/str_map.h"
82 #include "include/scope_guard.h"
83 #include "perfglue/heap_profiler.h"
85 #include "auth/cephx/CephxKeyServer.h"
86 #include "osd/OSDCap.h"
88 #include "json_spirit/json_spirit_reader.h"
90 #include <boost/algorithm/string/predicate.hpp>
97 using std::ostringstream
;
101 using std::stringstream
;
102 using std::to_string
;
105 using ceph::bufferlist
;
108 using ceph::ErasureCodeInterfaceRef
;
109 using ceph::ErasureCodePluginRegistry
;
110 using ceph::ErasureCodeProfile
;
111 using ceph::Formatter
;
112 using ceph::JSONFormatter
;
113 using ceph::make_message
;
115 #define dout_subsys ceph_subsys_mon
116 static const string
OSD_PG_CREATING_PREFIX("osd_pg_creating");
117 static const string
OSD_METADATA_PREFIX("osd_metadata");
118 static const string
OSD_SNAP_PREFIX("osd_snap");
122 OSD snapshot metadata
123 ---------------------
125 -- starting with mimic, removed in octopus --
127 "removed_epoch_%llu_%08lx" % (pool, epoch)
128 -> interval_set<snapid_t>
130 "removed_snap_%llu_%016llx" % (pool, last_snap)
131 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
134 -- starting with mimic --
136 "purged_snap_%llu_%016llx" % (pool, last_snap)
137 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
139 - note that the {removed,purged}_snap put the last snap in the key so
140 that we can use forward iteration only to search for an epoch in an
141 interval. e.g., to test if epoch N is removed/purged, we'll find a key
142 >= N that either does or doesn't contain the given snap.
145 -- starting with octopus --
147 "purged_epoch_%08lx" % epoch
148 -> map<int64_t,interval_set<snapid_t>>
151 using namespace TOPNSPC::common
;
154 struct OSDMemCache
: public PriorityCache::PriCache
{
156 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
157 int64_t committed_bytes
= 0;
158 double cache_ratio
= 0;
160 OSDMemCache(OSDMonitor
*m
) : osdmon(m
) {};
162 virtual uint64_t _get_used_bytes() const = 0;
164 virtual int64_t request_cache_bytes(
165 PriorityCache::Priority pri
, uint64_t total_cache
) const {
166 int64_t assigned
= get_cache_bytes(pri
);
169 // All cache items are currently set to have PRI1 priority
170 case PriorityCache::Priority::PRI1
:
172 int64_t request
= _get_used_bytes();
173 return (request
> assigned
) ? request
- assigned
: 0;
181 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
182 return cache_bytes
[pri
];
185 virtual int64_t get_cache_bytes() const {
188 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
189 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
190 total
+= get_cache_bytes(pri
);
195 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
196 cache_bytes
[pri
] = bytes
;
198 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
199 cache_bytes
[pri
] += bytes
;
201 virtual int64_t commit_cache_size(uint64_t total_cache
) {
202 committed_bytes
= PriorityCache::get_chunk(
203 get_cache_bytes(), total_cache
);
204 return committed_bytes
;
206 virtual int64_t get_committed_size() const {
207 return committed_bytes
;
209 virtual double get_cache_ratio() const {
212 virtual void set_cache_ratio(double ratio
) {
215 virtual void shift_bins() {
217 virtual void import_bins(const std::vector
<uint64_t> &bins
) {
219 virtual void set_bins(PriorityCache::Priority pri
, uint64_t end_bin
) {
221 virtual uint64_t get_bins(PriorityCache::Priority pri
) const {
225 virtual string
get_cache_name() const = 0;
228 struct IncCache
: public OSDMemCache
{
229 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
231 virtual uint64_t _get_used_bytes() const {
232 return osdmon
->inc_osd_cache
.get_bytes();
235 virtual string
get_cache_name() const {
236 return "OSDMap Inc Cache";
239 uint64_t _get_num_osdmaps() const {
240 return osdmon
->inc_osd_cache
.get_size();
244 struct FullCache
: public OSDMemCache
{
245 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
247 virtual uint64_t _get_used_bytes() const {
248 return osdmon
->full_osd_cache
.get_bytes();
251 virtual string
get_cache_name() const {
252 return "OSDMap Full Cache";
255 uint64_t _get_num_osdmaps() const {
256 return osdmon
->full_osd_cache
.get_size();
260 std::shared_ptr
<IncCache
> inc_cache
;
261 std::shared_ptr
<FullCache
> full_cache
;
263 const uint32_t MAX_POOL_APPLICATIONS
= 4;
264 const uint32_t MAX_POOL_APPLICATION_KEYS
= 64;
265 const uint32_t MAX_POOL_APPLICATION_LENGTH
= 128;
267 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
268 // Note: this doesn't include support for the application tag match
269 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
270 auto& match
= grant
.match
;
271 if (match
.is_match_all()) {
273 } else if (pool_name
!= nullptr &&
274 !match
.pool_namespace
.pool_name
.empty() &&
275 match
.pool_namespace
.pool_name
== *pool_name
) {
282 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
283 const KeyServer
& key_server
,
284 const EntityName
& entity_name
,
285 const MonCap
& mon_caps
,
286 const entity_addr_t
& peer_socket_addr
,
287 const std::string
* pool_name
)
289 typedef std::map
<std::string
, std::string
> CommandArgs
;
291 if (mon_caps
.is_capable(
292 cct
, entity_name
, "osd",
293 "osd pool op unmanaged-snap",
294 (pool_name
== nullptr ?
295 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
296 CommandArgs
{{"poolname", *pool_name
}}),
302 AuthCapsInfo caps_info
;
303 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
305 dout(10) << "unable to locate OSD cap data for " << entity_name
306 << " in auth db" << dendl
;
311 if (caps_info
.caps
.length() > 0) {
312 auto p
= caps_info
.caps
.cbegin();
315 } catch (const ceph::buffer::error
&err
) {
316 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
323 if (!osd_cap
.parse(caps_str
, nullptr)) {
324 dout(10) << "unable to parse OSD cap data for " << entity_name
325 << " in auth db" << dendl
;
329 // if the entity has write permissions in one or all pools, permit
330 // usage of unmanaged-snapshots
331 if (osd_cap
.allow_all()) {
335 for (auto& grant
: osd_cap
.grants
) {
336 if (grant
.profile
.is_valid()) {
337 for (auto& profile_grant
: grant
.profile_grants
) {
338 if (is_osd_writable(profile_grant
, pool_name
)) {
342 } else if (is_osd_writable(grant
, pool_name
)) {
350 } // anonymous namespace
352 void LastEpochClean::Lec::report(unsigned pg_num
, ps_t ps
,
353 epoch_t last_epoch_clean
)
359 epoch_by_pg
.resize(pg_num
, 0);
360 const auto old_lec
= epoch_by_pg
[ps
];
361 if (old_lec
>= last_epoch_clean
) {
365 epoch_by_pg
[ps
] = last_epoch_clean
;
366 if (last_epoch_clean
< floor
) {
367 floor
= last_epoch_clean
;
368 } else if (last_epoch_clean
> floor
) {
369 if (old_lec
== floor
) {
370 // probably should increase floor?
371 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
372 std::end(epoch_by_pg
));
376 if (ps
!= next_missing
) {
379 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
380 if (epoch_by_pg
[next_missing
] == 0) {
386 void LastEpochClean::remove_pool(uint64_t pool
)
388 report_by_pool
.erase(pool
);
391 void LastEpochClean::report(unsigned pg_num
, const pg_t
& pg
,
392 epoch_t last_epoch_clean
)
394 auto& lec
= report_by_pool
[pg
.pool()];
395 return lec
.report(pg_num
, pg
.ps(), last_epoch_clean
);
398 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
400 auto floor
= latest
.get_epoch();
401 for (auto& pool
: latest
.get_pools()) {
402 auto reported
= report_by_pool
.find(pool
.first
);
403 if (reported
== report_by_pool
.end()) {
406 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
409 if (reported
->second
.floor
< floor
) {
410 floor
= reported
->second
.floor
;
416 void LastEpochClean::dump(Formatter
*f
) const
418 f
->open_array_section("per_pool");
420 for (auto& [pool
, lec
] : report_by_pool
) {
421 f
->open_object_section("pool");
422 f
->dump_unsigned("poolid", pool
);
423 f
->dump_unsigned("floor", lec
.floor
);
430 class C_UpdateCreatingPGs
: public Context
{
435 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
436 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
437 void finish(int r
) override
{
439 utime_t end
= ceph_clock_now();
440 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
441 << (end
- start
) << " seconds" << dendl
;
442 osdmon
->update_creating_pgs();
443 osdmon
->check_pg_creates_subs();
449 #define dout_prefix _prefix(_dout, mon, osdmap)
450 static ostream
& _prefix(std::ostream
*_dout
, Monitor
&mon
, const OSDMap
& osdmap
) {
451 return *_dout
<< "mon." << mon
.name
<< "@" << mon
.rank
452 << "(" << mon
.get_state_name()
453 << ").osd e" << osdmap
.get_epoch() << " ";
456 OSDMonitor::OSDMonitor(
460 const string
& service_name
)
461 : PaxosService(mn
, p
, service_name
),
463 inc_osd_cache(g_conf()->mon_osd_cache_size
),
464 full_osd_cache(g_conf()->mon_osd_cache_size
),
465 has_osdmap_manifest(false),
466 mapper(mn
.cct
, &mn
.cpu_tp
)
468 inc_cache
= std::make_shared
<IncCache
>(this);
469 full_cache
= std::make_shared
<FullCache
>(this);
470 cct
->_conf
.add_observer(this);
471 int r
= _set_cache_sizes();
473 derr
<< __func__
<< " using default osd cache size - mon_osd_cache_size ("
474 << g_conf()->mon_osd_cache_size
475 << ") without priority cache management"
480 const char **OSDMonitor::get_tracked_conf_keys() const
482 static const char* KEYS
[] = {
484 "mon_memory_autotune",
485 "rocksdb_cache_size",
491 void OSDMonitor::handle_conf_change(const ConfigProxy
& conf
,
492 const std::set
<std::string
> &changed
)
494 dout(10) << __func__
<< " " << changed
<< dendl
;
496 if (changed
.count("mon_memory_autotune")) {
497 _set_cache_autotuning();
499 if (changed
.count("mon_memory_target") ||
500 changed
.count("rocksdb_cache_size")) {
501 int r
= _update_mon_cache_settings();
503 derr
<< __func__
<< " mon_memory_target:"
504 << g_conf()->mon_memory_target
505 << " rocksdb_cache_size:"
506 << g_conf()->rocksdb_cache_size
507 << ". Unable to update cache size."
513 void OSDMonitor::_set_cache_autotuning()
515 if (!g_conf()->mon_memory_autotune
&& pcm
!= nullptr) {
516 // Disable cache autotuning
517 std::lock_guard
l(balancer_lock
);
521 if (g_conf()->mon_memory_autotune
&& pcm
== nullptr) {
522 int r
= register_cache_with_pcm();
525 << " Error while registering osdmon caches with pcm."
526 << " Cache auto tuning not enabled."
528 mon_memory_autotune
= false;
530 mon_memory_autotune
= true;
535 int OSDMonitor::_update_mon_cache_settings()
537 if (g_conf()->mon_memory_target
<= 0 ||
538 g_conf()->mon_memory_target
< mon_memory_min
||
539 g_conf()->rocksdb_cache_size
<= 0) {
543 if (pcm
== nullptr && rocksdb_binned_kv_cache
== nullptr) {
544 derr
<< __func__
<< " not using pcm and rocksdb" << dendl
;
548 uint64_t old_mon_memory_target
= mon_memory_target
;
549 uint64_t old_rocksdb_cache_size
= rocksdb_cache_size
;
551 // Set the new pcm memory cache sizes
552 mon_memory_target
= g_conf()->mon_memory_target
;
553 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
555 uint64_t base
= mon_memory_base
;
556 double fragmentation
= mon_memory_fragmentation
;
557 uint64_t target
= mon_memory_target
;
558 uint64_t min
= mon_memory_min
;
561 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
562 if (ltarget
> base
+ min
) {
563 max
= ltarget
- base
;
566 int r
= _set_cache_ratios();
568 derr
<< __func__
<< " Cache ratios for pcm could not be set."
569 << " Review the kv (rocksdb) and mon_memory_target sizes."
571 mon_memory_target
= old_mon_memory_target
;
572 rocksdb_cache_size
= old_rocksdb_cache_size
;
576 if (mon_memory_autotune
&& pcm
!= nullptr) {
577 std::lock_guard
l(balancer_lock
);
578 // set pcm cache levels
579 pcm
->set_target_memory(target
);
580 pcm
->set_min_memory(min
);
581 pcm
->set_max_memory(max
);
582 // tune memory based on new values
585 _set_new_cache_sizes();
586 dout(1) << __func__
<< " Updated mon cache setting."
587 << " target: " << target
595 int OSDMonitor::_set_cache_sizes()
597 if (g_conf()->mon_memory_autotune
) {
598 // set the new osdmon cache targets to be managed by pcm
599 mon_osd_cache_size
= g_conf()->mon_osd_cache_size
;
600 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
601 mon_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
602 mon_memory_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
603 mon_memory_target
= g_conf()->mon_memory_target
;
604 mon_memory_min
= g_conf()->mon_osd_cache_size_min
;
605 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
606 derr
<< __func__
<< " mon_memory_target:" << mon_memory_target
607 << " mon_memory_min:" << mon_memory_min
608 << ". Invalid size option(s) provided."
612 // Set the initial inc and full LRU cache sizes
613 inc_osd_cache
.set_bytes(mon_memory_min
);
614 full_osd_cache
.set_bytes(mon_memory_min
);
615 mon_memory_autotune
= g_conf()->mon_memory_autotune
;
620 bool OSDMonitor::_have_pending_crush()
622 return pending_inc
.crush
.length() > 0;
625 CrushWrapper
&OSDMonitor::_get_stable_crush()
627 return *osdmap
.crush
;
630 CrushWrapper
OSDMonitor::_get_pending_crush()
633 if (pending_inc
.crush
.length())
634 bl
= pending_inc
.crush
;
636 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
638 auto p
= bl
.cbegin();
644 void OSDMonitor::create_initial()
646 dout(10) << "create_initial for " << mon
.monmap
->fsid
<< dendl
;
651 mon
.store
->get("mkfs", "osdmap", bl
);
655 newmap
.set_fsid(mon
.monmap
->fsid
);
657 newmap
.build_simple(cct
, 0, mon
.monmap
->fsid
, 0);
660 newmap
.created
= newmap
.modified
= ceph_clock_now();
662 // new clusters should sort bitwise by default.
663 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
666 CEPH_OSDMAP_RECOVERY_DELETES
|
667 CEPH_OSDMAP_PURGED_SNAPDIRS
|
668 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
669 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
670 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
671 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
672 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
673 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
674 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
676 // new cluster should require latest by default
677 if (g_conf().get_val
<bool>("mon_debug_no_require_reef")) {
678 if (g_conf().get_val
<bool>("mon_debug_no_require_quincy")) {
679 derr
<< __func__
<< " mon_debug_no_require_reef and quincy=true" << dendl
;
680 newmap
.require_osd_release
= ceph_release_t::pacific
;
682 derr
<< __func__
<< " mon_debug_no_require_reef=true" << dendl
;
683 newmap
.require_osd_release
= ceph_release_t::quincy
;
686 newmap
.require_osd_release
= ceph_release_t::reef
;
689 ceph_release_t r
= ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client
);
691 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
693 newmap
.require_min_compat_client
= r
;
695 // encode into pending incremental
696 uint64_t features
= newmap
.get_encoding_features();
697 newmap
.encode(pending_inc
.fullmap
,
698 features
| CEPH_FEATURE_RESERVED
);
699 pending_inc
.full_crc
= newmap
.get_crc();
700 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
703 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
705 s
.insert(service_name
);
706 s
.insert(OSD_PG_CREATING_PREFIX
);
707 s
.insert(OSD_METADATA_PREFIX
);
708 s
.insert(OSD_SNAP_PREFIX
);
711 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
713 // we really don't care if the version has been updated, because we may
714 // have trimmed without having increased the last committed; yet, we may
715 // need to update the in-memory manifest.
716 load_osdmap_manifest();
718 version_t version
= get_last_committed();
719 if (version
== osdmap
.epoch
)
721 ceph_assert(version
> osdmap
.epoch
);
723 dout(15) << "update_from_paxos paxos e " << version
724 << ", my e " << osdmap
.epoch
<< dendl
;
726 int prev_num_up_osd
= osdmap
.num_up_osd
;
729 if (!mapping_job
->is_done()) {
730 dout(1) << __func__
<< " mapping job "
731 << mapping_job
.get() << " did not complete, "
732 << mapping_job
->shards
<< " left, canceling" << dendl
;
733 mapping_job
->abort();
741 * We will possibly have a stashed latest that *we* wrote, and we will
742 * always be sure to have the oldest full map in the first..last range
743 * due to encode_trim_extra(), which includes the oldest full map in the trim
746 * encode_trim_extra() does not however write the full map's
747 * version to 'full_latest'. This is only done when we are building the
748 * full maps from the incremental versions. But don't panic! We make sure
749 * that the following conditions find whichever full map version is newer.
751 version_t latest_full
= get_version_latest_full();
752 if (latest_full
== 0 && get_first_committed() > 1)
753 latest_full
= get_first_committed();
755 if (get_first_committed() > 1 &&
756 latest_full
< get_first_committed()) {
757 // the monitor could be just sync'ed with its peer, and the latest_full key
758 // is not encoded in the paxos commits in encode_pending(), so we need to
759 // make sure we get it pointing to a proper version.
760 version_t lc
= get_last_committed();
761 version_t fc
= get_first_committed();
763 dout(10) << __func__
<< " looking for valid full map in interval"
764 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
767 for (version_t v
= lc
; v
>= fc
; v
--) {
768 string full_key
= "full_" + stringify(v
);
769 if (mon
.store
->exists(get_service_name(), full_key
)) {
770 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
776 ceph_assert(latest_full
> 0);
777 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
778 put_version_latest_full(t
, latest_full
);
779 mon
.store
->apply_transaction(t
);
780 dout(10) << __func__
<< " updated the on-disk full map version to "
781 << latest_full
<< dendl
;
784 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
785 bufferlist latest_bl
;
786 get_version_full(latest_full
, latest_bl
);
787 ceph_assert(latest_bl
.length() != 0);
788 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
790 osdmap
.decode(latest_bl
);
794 if (!mon
.store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
795 auto p
= bl
.cbegin();
796 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
797 creating_pgs
.decode(p
);
798 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
799 << creating_pgs
.last_scan_epoch
800 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
802 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
806 // walk through incrementals
807 MonitorDBStore::TransactionRef t
;
809 while (version
> osdmap
.epoch
) {
811 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
812 ceph_assert(err
== 0);
813 ceph_assert(inc_bl
.length());
814 // set priority cache manager levels if the osdmap is
815 // being populated for the first time.
816 if (mon_memory_autotune
&& pcm
== nullptr) {
817 int r
= register_cache_with_pcm();
820 << " Error while registering osdmon caches with pcm."
821 << " Proceeding without cache auto tuning."
826 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
828 OSDMap::Incremental
inc(inc_bl
);
829 err
= osdmap
.apply_incremental(inc
);
830 ceph_assert(err
== 0);
833 t
.reset(new MonitorDBStore::Transaction
);
835 // Write out the full map for all past epochs. Encode the full
836 // map with the same features as the incremental. If we don't
837 // know, use the quorum features. If we don't know those either,
838 // encode with all features.
839 uint64_t f
= inc
.encode_features
;
841 f
= mon
.get_quorum_con_features();
845 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
846 tx_size
+= full_bl
.length();
848 bufferlist orig_full_bl
;
849 get_version_full(osdmap
.epoch
, orig_full_bl
);
850 if (orig_full_bl
.length()) {
851 // the primary provided the full map
852 ceph_assert(inc
.have_crc
);
853 if (inc
.full_crc
!= osdmap
.crc
) {
854 // This will happen if the mons were running mixed versions in
855 // the past or some other circumstance made the full encoded
856 // maps divergent. Reloading here will bring us back into
857 // sync with the primary for this and all future maps. OSDs
858 // will also be brought back into sync when they discover the
859 // crc mismatch and request a full map from a mon.
860 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
863 dout(20) << __func__
<< " my (bad) full osdmap:\n";
864 JSONFormatter
jf(true);
865 jf
.dump_object("osdmap", osdmap
);
867 *_dout
<< "\nhexdump:\n";
868 full_bl
.hexdump(*_dout
);
872 osdmap
.decode(orig_full_bl
);
874 dout(20) << __func__
<< " canonical full osdmap:\n";
875 JSONFormatter
jf(true);
876 jf
.dump_object("osdmap", osdmap
);
878 *_dout
<< "\nhexdump:\n";
879 orig_full_bl
.hexdump(*_dout
);
883 ceph_assert(!inc
.have_crc
);
884 put_version_full(t
, osdmap
.epoch
, full_bl
);
886 put_version_latest_full(t
, osdmap
.epoch
);
889 dout(1) << osdmap
<< dendl
;
891 if (osdmap
.epoch
== 1) {
892 t
->erase("mkfs", "osdmap");
895 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
896 mon
.store
->apply_transaction(t
);
897 t
= MonitorDBStore::TransactionRef();
900 for (auto [osd
, state
] : inc
.new_state
) {
901 if (state
& CEPH_OSD_UP
) {
902 // could be marked up *or* down, but we're too lazy to check which
903 last_osd_report
.erase(osd
);
906 for (auto [osd
, weight
] : inc
.new_weight
) {
907 if (weight
== CEPH_OSD_OUT
) {
908 // manually marked out, so drop it
909 osd_epochs
.erase(osd
);
915 mon
.store
->apply_transaction(t
);
918 bool marked_osd_down
= false;
919 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
920 if (osdmap
.is_out(o
))
922 auto found
= down_pending_out
.find(o
);
923 if (osdmap
.is_down(o
)) {
924 // populate down -> out map
925 if (found
== down_pending_out
.end()) {
926 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
927 down_pending_out
[o
] = ceph_clock_now();
928 marked_osd_down
= true;
931 if (found
!= down_pending_out
.end()) {
932 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
933 down_pending_out
.erase(found
);
937 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
940 check_pg_creates_subs();
942 share_map_with_random_osd();
946 // make sure our feature bits reflect the latest map
947 update_msgr_features();
949 if (!mon
.is_leader()) {
950 // will be called by on_active() on the leader, avoid doing so twice
953 if (osdmap
.stretch_mode_enabled
) {
954 dout(20) << "Stretch mode enabled in this map" << dendl
;
955 mon
.try_engage_stretch_mode();
956 if (osdmap
.degraded_stretch_mode
) {
957 dout(20) << "Degraded stretch mode set in this map" << dendl
;
958 if (!osdmap
.recovering_stretch_mode
) {
959 mon
.set_degraded_stretch_mode();
960 dout(20) << "prev_num_up_osd: " << prev_num_up_osd
<< dendl
;
961 dout(20) << "osdmap.num_up_osd: " << osdmap
.num_up_osd
<< dendl
;
962 dout(20) << "osdmap.num_osd: " << osdmap
.num_osd
<< dendl
;
963 dout(20) << "mon_stretch_cluster_recovery_ratio: " << cct
->_conf
.get_val
<double>("mon_stretch_cluster_recovery_ratio") << dendl
;
964 if (prev_num_up_osd
< osdmap
.num_up_osd
&&
965 (osdmap
.num_up_osd
/ (double)osdmap
.num_osd
) >
966 cct
->_conf
.get_val
<double>("mon_stretch_cluster_recovery_ratio") &&
967 mon
.dead_mon_buckets
.size() == 0) {
968 // TODO: This works for 2-site clusters when the OSD maps are appropriately
969 // trimmed and everything is "normal" but not if you have a lot of out OSDs
970 // you're ignoring or in some really degenerate failure cases
972 dout(10) << "Enabling recovery stretch mode in this map" << dendl
;
973 mon
.go_recovery_stretch_mode();
976 mon
.set_recovery_stretch_mode();
979 mon
.set_healthy_stretch_mode();
981 if (marked_osd_down
&&
982 (!osdmap
.degraded_stretch_mode
|| osdmap
.recovering_stretch_mode
)) {
983 dout(20) << "Checking degraded stretch mode due to osd changes" << dendl
;
984 mon
.maybe_go_degraded_stretch_mode();
989 int OSDMonitor::register_cache_with_pcm()
991 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
992 derr
<< __func__
<< " Invalid memory size specified for mon caches."
993 << " Caches will not be auto-tuned."
997 uint64_t base
= mon_memory_base
;
998 double fragmentation
= mon_memory_fragmentation
;
999 // For calculating total target memory, consider rocksdb cache size.
1000 uint64_t target
= mon_memory_target
;
1001 uint64_t min
= mon_memory_min
;
1004 // Apply the same logic as in bluestore to set the max amount
1005 // of memory to use for cache. Assume base memory for OSDMaps
1006 // and then add in some overhead for fragmentation.
1007 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
1008 if (ltarget
> base
+ min
) {
1009 max
= ltarget
- base
;
1012 rocksdb_binned_kv_cache
= mon
.store
->get_priority_cache();
1013 if (!rocksdb_binned_kv_cache
) {
1014 derr
<< __func__
<< " not using rocksdb" << dendl
;
1018 int r
= _set_cache_ratios();
1020 derr
<< __func__
<< " Cache ratios for pcm could not be set."
1021 << " Review the kv (rocksdb) and mon_memory_target sizes."
1026 pcm
= std::make_shared
<PriorityCache::Manager
>(
1027 cct
, min
, max
, target
, true);
1028 pcm
->insert("kv", rocksdb_binned_kv_cache
, true);
1029 pcm
->insert("inc", inc_cache
, true);
1030 pcm
->insert("full", full_cache
, true);
1031 dout(1) << __func__
<< " pcm target: " << target
1032 << " pcm max: " << max
1033 << " pcm min: " << min
1034 << " inc_osd_cache size: " << inc_osd_cache
.get_size()
1039 int OSDMonitor::_set_cache_ratios()
1041 double old_cache_kv_ratio
= cache_kv_ratio
;
1043 // Set the cache ratios for kv(rocksdb), inc and full caches
1044 cache_kv_ratio
= (double)rocksdb_cache_size
/ (double)mon_memory_target
;
1045 if (cache_kv_ratio
>= 1.0) {
1046 derr
<< __func__
<< " Cache kv ratio (" << cache_kv_ratio
1047 << ") must be in range [0,<1.0]."
1049 cache_kv_ratio
= old_cache_kv_ratio
;
1052 rocksdb_binned_kv_cache
->set_cache_ratio(cache_kv_ratio
);
1053 cache_inc_ratio
= cache_full_ratio
= (1.0 - cache_kv_ratio
) / 2;
1054 inc_cache
->set_cache_ratio(cache_inc_ratio
);
1055 full_cache
->set_cache_ratio(cache_full_ratio
);
1057 dout(1) << __func__
<< " kv ratio " << cache_kv_ratio
1058 << " inc ratio " << cache_inc_ratio
1059 << " full ratio " << cache_full_ratio
1064 void OSDMonitor::start_mapping()
1066 // initiate mapping job
1068 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1070 mapping_job
->abort();
1072 if (!osdmap
.get_pools().empty()) {
1073 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
1074 mapping_job
= mapping
.start_update(osdmap
, mapper
,
1075 g_conf()->mon_osd_mapping_pgs_per_chunk
);
1076 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
1077 << " at " << fin
->start
<< dendl
;
1078 mapping_job
->set_finish_event(fin
);
1080 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
1081 mapping_job
= nullptr;
1085 void OSDMonitor::update_msgr_features()
1087 const int types
[] = {
1088 entity_name_t::TYPE_OSD
,
1089 entity_name_t::TYPE_CLIENT
,
1090 entity_name_t::TYPE_MDS
,
1091 entity_name_t::TYPE_MON
1093 for (int type
: types
) {
1095 uint64_t features
= osdmap
.get_features(type
, &mask
);
1096 if ((mon
.messenger
->get_policy(type
).features_required
& mask
) != features
) {
1097 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1098 ceph::net::Policy p
= mon
.messenger
->get_policy(type
);
1099 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1100 mon
.messenger
->set_policy(type
, p
);
1105 void OSDMonitor::on_active()
1109 if (mon
.is_leader()) {
1110 mon
.clog
->debug() << "osdmap " << osdmap
;
1111 if (!priority_convert
) {
1112 // Only do this once at start-up
1113 convert_pool_priorities();
1114 priority_convert
= true;
1117 list
<MonOpRequestRef
> ls
;
1118 take_all_failures(ls
);
1119 while (!ls
.empty()) {
1120 MonOpRequestRef op
= ls
.front();
1121 op
->mark_osdmon_event(__func__
);
1129 void OSDMonitor::on_restart()
1131 last_osd_report
.clear();
1134 void OSDMonitor::on_shutdown()
1136 dout(10) << __func__
<< dendl
;
1138 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1140 mapping_job
->abort();
1143 // discard failure info, waiters
1144 list
<MonOpRequestRef
> ls
;
1145 take_all_failures(ls
);
1149 void OSDMonitor::update_logger()
1151 dout(10) << "update_logger" << dendl
;
1153 mon
.cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1154 mon
.cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1155 mon
.cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1156 mon
.cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
1159 void OSDMonitor::create_pending()
1161 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
1162 pending_inc
.fsid
= mon
.monmap
->fsid
;
1163 pending_metadata
.clear();
1164 pending_metadata_rm
.clear();
1165 pending_pseudo_purged_snaps
.clear();
1167 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
1169 // safety checks (this shouldn't really happen)
1171 if (osdmap
.backfillfull_ratio
<= 0) {
1172 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
1173 if (pending_inc
.new_backfillfull_ratio
> 1.0)
1174 pending_inc
.new_backfillfull_ratio
/= 100;
1175 dout(1) << __func__
<< " setting backfillfull_ratio = "
1176 << pending_inc
.new_backfillfull_ratio
<< dendl
;
1178 if (osdmap
.full_ratio
<= 0) {
1179 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
1180 if (pending_inc
.new_full_ratio
> 1.0)
1181 pending_inc
.new_full_ratio
/= 100;
1182 dout(1) << __func__
<< " setting full_ratio = "
1183 << pending_inc
.new_full_ratio
<< dendl
;
1185 if (osdmap
.nearfull_ratio
<= 0) {
1186 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
1187 if (pending_inc
.new_nearfull_ratio
> 1.0)
1188 pending_inc
.new_nearfull_ratio
/= 100;
1189 dout(1) << __func__
<< " setting nearfull_ratio = "
1190 << pending_inc
.new_nearfull_ratio
<< dendl
;
1196 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1197 const OSDMap
& nextmap
)
1199 dout(10) << __func__
<< dendl
;
1200 creating_pgs_t pending_creatings
;
1202 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1203 pending_creatings
= creating_pgs
;
1205 // check for new or old pools
1206 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1207 unsigned queued
= 0;
1208 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1211 &pending_creatings
);
1212 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1215 &pending_creatings
);
1216 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1217 for (auto deleted_pool
: inc
.old_pools
) {
1218 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1219 dout(10) << __func__
<< " " << removed
1220 << " pg removed because containing pool deleted: "
1221 << deleted_pool
<< dendl
;
1222 last_epoch_clean
.remove_pool(deleted_pool
);
1224 // pgmon updates its creating_pgs in check_osd_map() which is called by
1225 // on_active() and check_osd_map() could be delayed if lease expires, so its
1226 // creating_pgs could be stale in comparison with the one of osdmon. let's
1227 // trim them here. otherwise, they will be added back after being erased.
1228 unsigned removed
= 0;
1229 for (auto& pg
: pending_created_pgs
) {
1230 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1231 pending_creatings
.created_pools
.insert(pg
.pool());
1232 removed
+= pending_creatings
.pgs
.erase(pg
);
1234 pending_created_pgs
.clear();
1235 dout(10) << __func__
<< " " << removed
1236 << " pgs removed because they're created" << dendl
;
1237 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1240 // filter out any pgs that shouldn't exist.
1242 auto i
= pending_creatings
.pgs
.begin();
1243 while (i
!= pending_creatings
.pgs
.end()) {
1244 if (!nextmap
.pg_exists(i
->first
)) {
1245 dout(10) << __func__
<< " removing pg " << i
->first
1246 << " which should not exist" << dendl
;
1247 i
= pending_creatings
.pgs
.erase(i
);
1255 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1256 const auto total
= pending_creatings
.pgs
.size();
1257 while (pending_creatings
.pgs
.size() < max
&&
1258 !pending_creatings
.queue
.empty()) {
1259 auto p
= pending_creatings
.queue
.begin();
1260 int64_t poolid
= p
->first
;
1261 dout(10) << __func__
<< " pool " << poolid
1262 << " created " << p
->second
.created
1263 << " modified " << p
->second
.modified
1264 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1266 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1267 p
->second
.end
- p
->second
.start
);
1268 ps_t first
= p
->second
.start
;
1269 ps_t end
= first
+ n
;
1270 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1271 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1272 // NOTE: use the *current* epoch as the PG creation epoch so that the
1273 // OSD does not have to generate a long set of PastIntervals.
1274 pending_creatings
.pgs
.emplace(
1276 creating_pgs_t::pg_create_info(inc
.epoch
,
1277 p
->second
.modified
));
1278 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1280 p
->second
.start
= end
;
1281 if (p
->second
.done()) {
1282 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1283 pending_creatings
.queue
.erase(p
);
1285 dout(10) << __func__
<< " pool " << poolid
1286 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1290 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1291 << " pools" << dendl
;
1293 if (mon
.monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1294 // walk creating pgs' history and past_intervals forward
1295 for (auto& i
: pending_creatings
.pgs
) {
1296 // this mirrors PG::start_peering_interval()
1297 pg_t pgid
= i
.first
;
1299 // this is a bit imprecise, but sufficient?
1300 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1301 const pg_pool_t
*pi
;
1302 bool operator()(const set
<pg_shard_t
> &have
) const {
1303 return have
.size() >= pi
->min_size
;
1305 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1306 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1308 vector
<int> up
, acting
;
1309 int up_primary
, acting_primary
;
1310 nextmap
.pg_to_up_acting_osds(
1311 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1312 if (i
.second
.history
.epoch_created
== 0) {
1313 // new pg entry, set it up
1315 i
.second
.acting
= acting
;
1316 i
.second
.up_primary
= up_primary
;
1317 i
.second
.acting_primary
= acting_primary
;
1318 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1319 i
.second
.create_stamp
);
1320 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1321 << " up " << i
.second
.up
1322 << " p " << i
.second
.up_primary
1323 << " acting " << i
.second
.acting
1324 << " p " << i
.second
.acting_primary
1325 << " history " << i
.second
.history
1326 << " past_intervals " << i
.second
.past_intervals
1329 std::stringstream debug
;
1330 if (PastIntervals::check_new_interval(
1331 i
.second
.acting_primary
, acting_primary
,
1332 i
.second
.acting
, acting
,
1333 i
.second
.up_primary
, up_primary
,
1335 i
.second
.history
.same_interval_since
,
1336 i
.second
.history
.last_epoch_clean
,
1341 &i
.second
.past_intervals
,
1343 epoch_t e
= inc
.epoch
;
1344 i
.second
.history
.same_interval_since
= e
;
1345 if (i
.second
.up
!= up
) {
1346 i
.second
.history
.same_up_since
= e
;
1348 if (i
.second
.acting_primary
!= acting_primary
) {
1349 i
.second
.history
.same_primary_since
= e
;
1352 osdmap
.get_pg_num(pgid
.pool()),
1353 nextmap
.get_pg_num(pgid
.pool()),
1355 i
.second
.history
.last_epoch_split
= e
;
1357 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1358 << " up " << i
.second
.up
<< " -> " << up
1359 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1360 << " acting " << i
.second
.acting
<< " -> " << acting
1361 << " p " << i
.second
.acting_primary
<< " -> "
1363 << " history " << i
.second
.history
1364 << " past_intervals " << i
.second
.past_intervals
1366 dout(20) << " debug: " << debug
.str() << dendl
;
1368 i
.second
.acting
= acting
;
1369 i
.second
.up_primary
= up_primary
;
1370 i
.second
.acting_primary
= acting_primary
;
1375 dout(10) << __func__
1376 << " " << (pending_creatings
.pgs
.size() - total
)
1377 << "/" << pending_creatings
.pgs
.size()
1378 << " pgs added from queued pools" << dendl
;
1379 return pending_creatings
;
1382 void OSDMonitor::maybe_prime_pg_temp()
1385 if (pending_inc
.crush
.length()) {
1386 dout(10) << __func__
<< " new crush map, all" << dendl
;
1390 if (!pending_inc
.new_up_client
.empty()) {
1391 dout(10) << __func__
<< " new up osds, all" << dendl
;
1395 // check for interesting OSDs
1397 for (auto p
= pending_inc
.new_state
.begin();
1398 !all
&& p
!= pending_inc
.new_state
.end();
1400 if ((p
->second
& CEPH_OSD_UP
) &&
1401 osdmap
.is_up(p
->first
)) {
1402 osds
.insert(p
->first
);
1405 for (auto p
= pending_inc
.new_weight
.begin();
1406 !all
&& p
!= pending_inc
.new_weight
.end();
1408 if (osdmap
.exists(p
->first
) && p
->second
< osdmap
.get_weight(p
->first
)) {
1410 osds
.insert(p
->first
);
1412 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1418 if (!all
&& osds
.empty())
1423 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1424 if (estimate
> mapping
.get_num_pgs() *
1425 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1426 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1427 << osds
.size() << " osds >= "
1428 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1429 << mapping
.get_num_pgs() << " pgs, all"
1433 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1434 << osds
.size() << " osds" << dendl
;
1439 next
.deepish_copy_from(osdmap
);
1440 next
.apply_incremental(pending_inc
);
1442 if (next
.get_pools().empty()) {
1443 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1445 PrimeTempJob
job(next
, this);
1446 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1447 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1448 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1450 dout(10) << __func__
<< " did not finish in "
1451 << g_conf()->mon_osd_prime_pg_temp_max_time
1452 << ", stopping" << dendl
;
1456 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1457 utime_t stop
= ceph_clock_now();
1458 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1459 const int chunk
= 1000;
1461 std::unordered_set
<pg_t
> did_pgs
;
1462 for (auto osd
: osds
) {
1463 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1464 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1465 for (auto pgid
: pgs
) {
1466 if (!did_pgs
.insert(pgid
).second
) {
1469 prime_pg_temp(next
, pgid
);
1472 if (ceph_clock_now() > stop
) {
1473 dout(10) << __func__
<< " consumed more than "
1474 << g_conf()->mon_osd_prime_pg_temp_max_time
1475 << " seconds, stopping"
1485 void OSDMonitor::prime_pg_temp(
1489 // TODO: remove this creating_pgs direct access?
1490 if (creating_pgs
.pgs
.count(pgid
)) {
1493 if (!osdmap
.pg_exists(pgid
)) {
1497 vector
<int> up
, acting
;
1498 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1500 vector
<int> next_up
, next_acting
;
1501 int next_up_primary
, next_acting_primary
;
1502 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1503 &next_acting
, &next_acting_primary
);
1504 if (acting
== next_acting
&&
1505 !(up
!= acting
&& next_up
== next_acting
))
1506 return; // no change since last epoch
1509 return; // if previously empty now we can be no worse off
1510 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1511 if (pool
&& acting
.size() < pool
->min_size
)
1512 return; // can be no worse off than before
1514 if (next_up
== next_acting
) {
1516 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1520 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1521 << " -> " << next_up
<< "/" << next_acting
1522 << ", priming " << acting
1525 std::lock_guard
l(prime_pg_temp_lock
);
1526 // do not touch a mapping if a change is pending
1527 pending_inc
.new_pg_temp
.emplace(
1529 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1534 * @note receiving a transaction in this function gives a fair amount of
1535 * freedom to the service implementation if it does need it. It shouldn't.
1537 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1539 dout(10) << "encode_pending e " << pending_inc
.epoch
1543 dout(1) << __func__
<< " osdmap full prune encoded e"
1544 << pending_inc
.epoch
<< dendl
;
1547 // finalize up pending_inc
1548 pending_inc
.modified
= ceph_clock_now();
1550 int r
= pending_inc
.propagate_base_properties_to_tiers(cct
, osdmap
);
1551 ceph_assert(r
== 0);
1554 if (!mapping_job
->is_done()) {
1555 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1556 << mapping_job
.get() << " did not complete, "
1557 << mapping_job
->shards
<< " left" << dendl
;
1558 mapping_job
->abort();
1559 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1560 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1561 << mapping_job
.get() << " is prior epoch "
1562 << mapping
.get_epoch() << dendl
;
1564 if (g_conf()->mon_osd_prime_pg_temp
) {
1565 maybe_prime_pg_temp();
1568 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1569 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1572 mapping_job
.reset();
1574 // ensure we don't have blank new_state updates. these are interrpeted as
1575 // CEPH_OSD_UP (and almost certainly not what we want!).
1576 auto p
= pending_inc
.new_state
.begin();
1577 while (p
!= pending_inc
.new_state
.end()) {
1578 if (p
->second
== 0) {
1579 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1580 p
= pending_inc
.new_state
.erase(p
);
1582 if (p
->second
& CEPH_OSD_UP
) {
1583 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1588 if (!pending_inc
.new_up_client
.empty()) {
1589 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1591 for (auto& i
: pending_inc
.new_weight
) {
1592 if (i
.first
>= osdmap
.max_osd
) {
1594 // new osd is already marked in
1595 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1598 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1599 // existing osd marked in or out
1600 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1607 tmp
.deepish_copy_from(osdmap
);
1608 tmp
.apply_incremental(pending_inc
);
1610 // clean pg_temp mappings
1611 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1613 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1615 // check every upmapped pg for now
1616 // until we could reliably identify certain cases to ignore,
1617 // which is obviously the hard part TBD..
1618 vector
<pg_t
> pgs_to_check
;
1619 tmp
.get_upmap_pgs(&pgs_to_check
);
1620 if (pgs_to_check
.size() <
1621 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1622 // not enough pgs, do it inline
1623 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1625 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1626 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1631 // update creating pgs first so that we can remove the created pgid and
1632 // process the pool flag removal below in the same osdmap epoch.
1633 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1634 bufferlist creatings_bl
;
1635 uint64_t features
= CEPH_FEATURES_ALL
;
1636 if (mon
.monmap
->min_mon_release
< ceph_release_t::octopus
) {
1637 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1639 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1641 encode(pending_creatings
, creatings_bl
, features
);
1642 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1644 // remove any old (or incompat) POOL_CREATING flags
1645 for (auto& i
: tmp
.get_pools()) {
1646 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1647 // pre-nautilus OSDMaps shouldn't get this flag.
1648 if (pending_inc
.new_pools
.count(i
.first
)) {
1649 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1652 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1653 !pending_creatings
.still_creating_pool(i
.first
)) {
1654 dout(10) << __func__
<< " done creating pool " << i
.first
1655 << ", clearing CREATING flag" << dendl
;
1656 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1657 pending_inc
.new_pools
[i
.first
] = i
.second
;
1659 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1663 // collect which pools are currently affected by
1664 // the near/backfill/full osd(s),
1665 // and set per-pool near/backfill/full flag instead
1666 set
<int64_t> full_pool_ids
;
1667 set
<int64_t> backfillfull_pool_ids
;
1668 set
<int64_t> nearfull_pool_ids
;
1669 tmp
.get_full_pools(cct
,
1671 &backfillfull_pool_ids
,
1672 &nearfull_pool_ids
);
1673 if (full_pool_ids
.empty() ||
1674 backfillfull_pool_ids
.empty() ||
1675 nearfull_pool_ids
.empty()) {
1676 // normal case - no nearfull, backfillfull or full osds
1677 // try cancel any improper nearfull/backfillfull/full pool
1679 for (auto &pool
: tmp
.get_pools()) {
1680 auto p
= pool
.first
;
1681 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1682 nearfull_pool_ids
.empty()) {
1683 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1684 << "'s nearfull flag" << dendl
;
1685 if (pending_inc
.new_pools
.count(p
) == 0) {
1686 // load original pool info first!
1687 pending_inc
.new_pools
[p
] = pool
.second
;
1689 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1691 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1692 backfillfull_pool_ids
.empty()) {
1693 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1694 << "'s backfillfull flag" << dendl
;
1695 if (pending_inc
.new_pools
.count(p
) == 0) {
1696 pending_inc
.new_pools
[p
] = pool
.second
;
1698 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1700 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1701 full_pool_ids
.empty()) {
1702 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1703 // set by EQUOTA, skipping
1706 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1707 << "'s full flag" << dendl
;
1708 if (pending_inc
.new_pools
.count(p
) == 0) {
1709 pending_inc
.new_pools
[p
] = pool
.second
;
1711 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1715 if (!full_pool_ids
.empty()) {
1716 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1717 << " as full" << dendl
;
1718 for (auto &p
: full_pool_ids
) {
1719 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1722 if (pending_inc
.new_pools
.count(p
) == 0) {
1723 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1725 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1726 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1727 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1729 // cancel FLAG_FULL for pools which are no longer full too
1730 for (auto &pool
: tmp
.get_pools()) {
1731 auto p
= pool
.first
;
1732 if (full_pool_ids
.count(p
)) {
1733 // skip pools we have just marked as full above
1736 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1737 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1738 // don't touch if currently is not full
1739 // or is running out of quota (and hence considered as full)
1742 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1743 << "'s full flag" << dendl
;
1744 if (pending_inc
.new_pools
.count(p
) == 0) {
1745 pending_inc
.new_pools
[p
] = pool
.second
;
1747 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1750 if (!backfillfull_pool_ids
.empty()) {
1751 for (auto &p
: backfillfull_pool_ids
) {
1752 if (full_pool_ids
.count(p
)) {
1753 // skip pools we have already considered as full above
1756 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1757 // make sure FLAG_FULL is truly set, so we are safe not
1758 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1759 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1762 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1763 // don't bother if pool is already marked as backfillfull
1766 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1767 << "'s as backfillfull" << dendl
;
1768 if (pending_inc
.new_pools
.count(p
) == 0) {
1769 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1771 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1772 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1774 // cancel FLAG_BACKFILLFULL for pools
1775 // which are no longer backfillfull too
1776 for (auto &pool
: tmp
.get_pools()) {
1777 auto p
= pool
.first
;
1778 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1779 // skip pools we have just marked as backfillfull/full above
1782 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1783 // and don't touch if currently is not backfillfull
1786 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1787 << "'s backfillfull flag" << dendl
;
1788 if (pending_inc
.new_pools
.count(p
) == 0) {
1789 pending_inc
.new_pools
[p
] = pool
.second
;
1791 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1794 if (!nearfull_pool_ids
.empty()) {
1795 for (auto &p
: nearfull_pool_ids
) {
1796 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1799 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1800 // make sure FLAG_FULL is truly set, so we are safe not
1801 // to set a extra (redundant) FLAG_NEARFULL flag
1802 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1805 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1806 // don't bother if pool is already marked as nearfull
1809 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1810 << "'s as nearfull" << dendl
;
1811 if (pending_inc
.new_pools
.count(p
) == 0) {
1812 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1814 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1816 // cancel FLAG_NEARFULL for pools
1817 // which are no longer nearfull too
1818 for (auto &pool
: tmp
.get_pools()) {
1819 auto p
= pool
.first
;
1820 if (full_pool_ids
.count(p
) ||
1821 backfillfull_pool_ids
.count(p
) ||
1822 nearfull_pool_ids
.count(p
)) {
1823 // skip pools we have just marked as
1824 // nearfull/backfillfull/full above
1827 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1828 // and don't touch if currently is not nearfull
1831 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1832 << "'s nearfull flag" << dendl
;
1833 if (pending_inc
.new_pools
.count(p
) == 0) {
1834 pending_inc
.new_pools
[p
] = pool
.second
;
1836 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1840 // min_compat_client?
1841 if (!tmp
.require_min_compat_client
) {
1842 auto mv
= tmp
.get_min_compat_client();
1843 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1844 << "required " << mv
<< dendl
;
1845 mon
.clog
->info() << "setting require_min_compat_client to currently "
1846 << "required " << mv
;
1847 pending_inc
.new_require_min_compat_client
= mv
;
1850 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1851 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1852 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1853 // add creating flags?
1854 for (auto& i
: tmp
.get_pools()) {
1855 if (pending_creatings
.still_creating_pool(i
.first
)) {
1856 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1858 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1859 pending_inc
.new_pools
[i
.first
] = i
.second
;
1861 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1864 // adjust blocklist items to all be TYPE_ANY
1865 for (auto& i
: tmp
.blocklist
) {
1867 a
.set_type(entity_addr_t::TYPE_ANY
);
1868 pending_inc
.new_blocklist
[a
] = i
.second
;
1869 pending_inc
.old_blocklist
.push_back(i
.first
);
1873 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1874 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1875 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1877 // adjust obsoleted cache modes
1878 for (auto& [poolid
, pi
] : tmp
.pools
) {
1879 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1880 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1881 pending_inc
.new_pools
[poolid
] = pi
;
1883 dout(10) << __func__
<< " switching pool " << poolid
1884 << " cachemode from forward -> proxy" << dendl
;
1885 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1887 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1888 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1889 pending_inc
.new_pools
[poolid
] = pi
;
1891 dout(10) << __func__
<< " switching pool " << poolid
1892 << " cachemode from readforward -> readproxy" << dendl
;
1893 pending_inc
.new_pools
[poolid
].cache_mode
=
1894 pg_pool_t::CACHEMODE_READPROXY
;
1898 // clear removed_snaps for every pool
1899 for (auto& [poolid
, pi
] : tmp
.pools
) {
1900 if (pi
.removed_snaps
.empty()) {
1903 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1904 pending_inc
.new_pools
[poolid
] = pi
;
1906 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1908 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1911 // create a combined purged snap epoch key for all purged snaps
1912 // prior to this epoch, and store it in the current epoch (i.e.,
1913 // the last pre-octopus epoch, just prior to the one we're
1915 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
1916 it
->lower_bound("purged_snap_");
1917 map
<int64_t,snap_interval_set_t
> combined
;
1918 while (it
->valid()) {
1919 if (it
->key().find("purged_snap_") != 0) {
1922 string k
= it
->key();
1923 long long unsigned pool
;
1924 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1926 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1928 bufferlist v
= it
->value();
1929 auto p
= v
.cbegin();
1930 snapid_t begin
, end
;
1931 ceph::decode(begin
, p
);
1932 ceph::decode(end
, p
);
1933 combined
[pool
].insert(begin
, end
- begin
);
1937 if (!combined
.empty()) {
1938 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1940 ceph::encode(combined
, v
);
1941 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1942 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1943 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1946 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1950 // clean out the old removed_snap_ and removed_epoch keys
1951 // ('`' is ASCII '_' + 1)
1952 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1953 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1958 for (auto i
= pending_inc
.new_state
.begin();
1959 i
!= pending_inc
.new_state
.end();
1961 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1962 if (s
& CEPH_OSD_UP
) {
1963 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1964 // Reset laggy parameters if failure interval exceeds a threshold.
1965 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(i
->first
);
1966 if ((xi
.laggy_probability
|| xi
.laggy_interval
) && xi
.down_stamp
.sec()) {
1967 int last_failure_interval
= pending_inc
.modified
.sec() - xi
.down_stamp
.sec();
1968 if (grace_interval_threshold_exceeded(last_failure_interval
)) {
1969 set_default_laggy_params(i
->first
);
1973 if (s
& CEPH_OSD_EXISTS
)
1974 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1976 for (auto i
= pending_inc
.new_up_client
.begin();
1977 i
!= pending_inc
.new_up_client
.end();
1979 //FIXME: insert cluster addresses too
1980 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1982 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1983 i
!= pending_inc
.new_weight
.end();
1985 if (i
->second
== CEPH_OSD_OUT
) {
1986 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1987 } else if (i
->second
== CEPH_OSD_IN
) {
1988 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1990 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1994 // features for osdmap and its incremental
1997 // encode full map and determine its crc
2000 tmp
.deepish_copy_from(osdmap
);
2001 tmp
.apply_incremental(pending_inc
);
2003 // determine appropriate features
2004 features
= tmp
.get_encoding_features();
2005 dout(10) << __func__
<< " encoding full map with "
2006 << tmp
.require_osd_release
2007 << " features " << features
<< dendl
;
2009 // the features should be a subset of the mon quorum's features!
2010 ceph_assert((features
& ~mon
.get_quorum_con_features()) == 0);
2013 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
2014 pending_inc
.full_crc
= tmp
.get_crc();
2016 // include full map in the txn. note that old monitors will
2017 // overwrite this. new ones will now skip the local full map
2018 // encode and reload from this.
2019 put_version_full(t
, pending_inc
.epoch
, fullbl
);
2023 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
2025 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
2027 dout(20) << " full_crc " << tmp
.get_crc()
2028 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
2030 /* put everything in the transaction */
2031 put_version(t
, pending_inc
.epoch
, bl
);
2032 put_last_committed(t
, pending_inc
.epoch
);
2035 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
2036 p
!= pending_metadata
.end();
2039 auto mp
= p
->second
.cbegin();
2041 auto it
= m
.find("osd_objectstore");
2042 if (it
!= m
.end()) {
2043 if (it
->second
== "filestore") {
2044 filestore_osds
.insert(p
->first
);
2046 filestore_osds
.erase(p
->first
);
2049 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
2051 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
2052 p
!= pending_metadata_rm
.end();
2054 filestore_osds
.erase(*p
);
2055 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
2057 pending_metadata
.clear();
2058 pending_metadata_rm
.clear();
2061 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
2062 !pending_inc
.new_purged_snaps
.empty()) {
2063 // all snaps purged this epoch (across all pools)
2064 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
2066 encode(pending_inc
.new_purged_snaps
, v
);
2067 t
->put(OSD_SNAP_PREFIX
, k
, v
);
2069 for (auto& i
: pending_inc
.new_purged_snaps
) {
2070 for (auto q
= i
.second
.begin();
2071 q
!= i
.second
.end();
2073 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
2078 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
2079 for (auto snap
: snaps
) {
2080 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2087 health_check_map_t next
;
2088 tmp
.check_health(cct
, &next
);
2090 check_for_filestore_osds(&next
);
2091 encode_health(next
, t
);
2094 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2097 int r
= mon
.store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2101 auto p
= bl
.cbegin();
2104 catch (ceph::buffer::error
& e
) {
2106 *err
<< "osd." << osd
<< " metadata is corrupt";
2112 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2114 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2115 if (osdmap
.is_up(osd
)) {
2116 map
<string
,string
> meta
;
2117 load_metadata(osd
, meta
, nullptr);
2118 auto p
= meta
.find(field
);
2119 if (p
== meta
.end()) {
2120 (*out
)["unknown"]++;
2122 (*out
)[p
->second
]++;
2128 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2130 map
<string
,int> by_val
;
2131 count_metadata(field
, &by_val
);
2132 f
->open_object_section(field
.c_str());
2133 for (auto& p
: by_val
) {
2134 f
->dump_int(p
.first
.c_str(), p
.second
);
2139 void OSDMonitor::get_versions(std::map
<string
, list
<string
>> &versions
)
2141 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2142 if (osdmap
.is_up(osd
)) {
2143 map
<string
,string
> meta
;
2144 load_metadata(osd
, meta
, nullptr);
2145 auto p
= meta
.find("ceph_version_short");
2146 if (p
== meta
.end()) continue;
2147 versions
[p
->second
].push_back(string("osd.") + stringify(osd
));
2152 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2154 map
<string
, string
> metadata
;
2155 int r
= load_metadata(osd
, metadata
, nullptr);
2159 auto it
= metadata
.find("osd_objectstore");
2160 if (it
== metadata
.end())
2166 void OSDMonitor::get_filestore_osd_list()
2168 for (unsigned osd
= 0; osd
< osdmap
.get_num_osds(); ++osd
) {
2169 string objectstore_type
;
2170 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2171 if (r
== 0 && objectstore_type
== "filestore") {
2172 filestore_osds
.insert(osd
);
2177 void OSDMonitor::check_for_filestore_osds(health_check_map_t
*checks
)
2179 if (g_conf()->mon_warn_on_filestore_osds
&&
2180 filestore_osds
.size() > 0) {
2181 ostringstream ss
, deprecated_tip
;
2182 list
<string
> detail
;
2183 ss
<< filestore_osds
.size()
2185 << (filestore_osds
.size() == 1 ? "is" : "are")
2186 << " running Filestore";
2187 deprecated_tip
<< ss
.str();
2188 ss
<< " [Deprecated]";
2189 auto& d
= checks
->add("OSD_FILESTORE", HEALTH_WARN
, ss
.str(),
2190 filestore_osds
.size());
2191 deprecated_tip
<< ", which has been deprecated and"
2192 << " not been optimized for QoS"
2193 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2194 detail
.push_back(deprecated_tip
.str());
2195 d
.detail
.swap(detail
);
2199 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2200 const pg_pool_t
&pool
,
2203 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2204 // since filestore osds could always join the pool later
2205 set
<int> checked_osds
;
2206 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2207 vector
<int> up
, acting
;
2208 pg_t
pgid(ps
, pool_id
);
2209 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2210 for (int osd
: up
) {
2211 if (checked_osds
.find(osd
) != checked_osds
.end())
2213 string objectstore_type
;
2214 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2215 // allow with missing metadata, e.g. due to an osd never booting yet
2216 if (r
< 0 || objectstore_type
== "bluestore") {
2217 checked_osds
.insert(osd
);
2220 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2227 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2229 map
<string
,string
> m
;
2230 if (int r
= load_metadata(osd
, m
, err
))
2232 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2233 f
->dump_string(p
->first
.c_str(), p
->second
);
2237 void OSDMonitor::print_nodes(Formatter
*f
)
2239 // group OSDs by their hosts
2240 map
<string
, list
<int> > osds
; // hostname => osd
2241 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2242 map
<string
, string
> m
;
2243 if (load_metadata(osd
, m
, NULL
)) {
2246 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2247 if (hostname
== m
.end()) {
2248 // not likely though
2251 osds
[hostname
->second
].push_back(osd
);
2254 dump_services(f
, osds
, "osd");
2257 void OSDMonitor::share_map_with_random_osd()
2259 if (osdmap
.get_num_up_osds() == 0) {
2260 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2264 MonSession
*s
= mon
.session_map
.get_random_osd_session(&osdmap
);
2266 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2270 dout(10) << "committed, telling random " << s
->name
2271 << " all about it" << dendl
;
2273 // get feature of the peer
2274 // use quorum_con_features, if it's an anonymous connection.
2275 uint64_t features
= s
->con_features
? s
->con_features
:
2276 mon
.get_quorum_con_features();
2277 // whatev, they'll request more if they need it
2278 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2279 s
->con
->send_message(m
);
2280 // NOTE: do *not* record osd has up to this epoch (as we do
2281 // elsewhere) as they may still need to request older values.
2284 version_t
OSDMonitor::get_trim_to() const
2286 if (mon
.get_quorum().empty()) {
2287 dout(10) << __func__
<< " quorum not formed, trim_to = 0" << dendl
;
2292 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2293 if (!creating_pgs
.pgs
.empty()) {
2294 dout(10) << __func__
<< " pgs creating, trim_to = 0" << dendl
;
2299 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2301 << " blocking osdmap trim"
2302 << " ('mon_debug_block_osdmap_trim' set to 'true')"
2303 << " trim_to = 0" << dendl
;
2308 epoch_t floor
= get_min_last_epoch_clean();
2309 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2310 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2311 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2312 floor
= g_conf()->mon_osd_force_trim_to
;
2313 dout(10) << __func__
2314 << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2316 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2317 if (floor
+ min
> get_last_committed()) {
2318 if (min
< get_last_committed())
2319 floor
= get_last_committed() - min
;
2323 if (floor
> get_first_committed()) {
2324 dout(10) << __func__
<< " trim_to = " << floor
<< dendl
;
2328 dout(10) << __func__
<< " trim_to = 0" << dendl
;
2332 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2334 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2335 // also scan osd epochs
2336 // don't trim past the oldest reported osd epoch
2337 for (auto [osd
, epoch
] : osd_epochs
) {
2338 if (epoch
< floor
) {
2345 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2348 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2350 get_version_full(first
, bl
);
2351 put_version_full(tx
, first
, bl
);
2353 if (has_osdmap_manifest
&&
2354 first
> osdmap_manifest
.get_first_pinned()) {
2355 _prune_update_trimmed(tx
, first
);
2360 /* full osdmap prune
2362 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2365 void OSDMonitor::load_osdmap_manifest()
2367 bool store_has_manifest
=
2368 mon
.store
->exists(get_service_name(), "osdmap_manifest");
2370 if (!store_has_manifest
) {
2371 if (!has_osdmap_manifest
) {
2375 dout(20) << __func__
2376 << " dropping osdmap manifest from memory." << dendl
;
2377 osdmap_manifest
= osdmap_manifest_t();
2378 has_osdmap_manifest
= false;
2382 dout(20) << __func__
2383 << " osdmap manifest detected in store; reload." << dendl
;
2385 bufferlist manifest_bl
;
2386 int r
= get_value("osdmap_manifest", manifest_bl
);
2388 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2389 ceph_abort_msg("error reading manifest");
2391 osdmap_manifest
.decode(manifest_bl
);
2392 has_osdmap_manifest
= true;
2394 dout(10) << __func__
<< " store osdmap manifest pinned ("
2395 << osdmap_manifest
.get_first_pinned()
2397 << osdmap_manifest
.get_last_pinned()
2402 bool OSDMonitor::should_prune() const
2404 version_t first
= get_first_committed();
2405 version_t last
= get_last_committed();
2406 version_t min_osdmap_epochs
=
2407 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2408 version_t prune_min
=
2409 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2410 version_t prune_interval
=
2411 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2412 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2413 version_t last_to_pin
= last
- min_osdmap_epochs
;
2415 // Make it or break it constraints.
2417 // If any of these conditions fails, we will not prune, regardless of
2418 // whether we have an on-disk manifest with an on-going pruning state.
2420 if ((last
- first
) <= min_osdmap_epochs
) {
2421 // between the first and last committed epochs, we don't have
2422 // enough epochs to trim, much less to prune.
2423 dout(10) << __func__
2424 << " currently holding only " << (last
- first
)
2425 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2426 << "); do not prune."
2430 } else if ((last_to_pin
- first
) < prune_min
) {
2431 // between the first committed epoch and the last epoch we would prune,
2432 // we simply don't have enough versions over the minimum to prune maps.
2433 dout(10) << __func__
2434 << " could only prune " << (last_to_pin
- first
)
2435 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2436 " is less than the required minimum (" << prune_min
<< ")"
2440 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2441 dout(10) << __func__
2442 << " we have pruned as far as we can; do not prune."
2446 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2447 dout(10) << __func__
2448 << " not enough epochs to form an interval (last pinned: "
2449 << last_pinned
<< ", last to pin: "
2450 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2455 dout(15) << __func__
2456 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2457 << " lc (" << first
<< ".." << last
<< ")"
2462 void OSDMonitor::_prune_update_trimmed(
2463 MonitorDBStore::TransactionRef tx
,
2466 dout(10) << __func__
2467 << " first " << first
2468 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2471 osdmap_manifest_t manifest
= osdmap_manifest
;
2473 if (!manifest
.is_pinned(first
)) {
2474 manifest
.pin(first
);
2477 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2478 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2479 manifest
.pinned
.erase(p
, p_end
);
2480 ceph_assert(manifest
.get_first_pinned() == first
);
2482 if (manifest
.get_last_pinned() == first
+1 ||
2483 manifest
.pinned
.size() == 1) {
2484 // we reached the end of the line, as pinned maps go; clean up our
2485 // manifest, and let `should_prune()` decide whether we should prune
2487 tx
->erase(get_service_name(), "osdmap_manifest");
2492 manifest
.encode(bl
);
2493 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2496 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2498 dout(1) << __func__
<< dendl
;
2500 version_t pin_first
;
2502 // verify constrainsts on stable in-memory state
2503 if (!has_osdmap_manifest
) {
2504 // we must have never pruned, OR if we pruned the state must no longer
2505 // be relevant (i.e., the state must have been removed alongside with
2506 // the trim that *must* have removed past the last pinned map in a
2508 ceph_assert(osdmap_manifest
.pinned
.empty());
2509 ceph_assert(!mon
.store
->exists(get_service_name(), "osdmap_manifest"));
2510 pin_first
= get_first_committed();
2513 // we must have pruned in the past AND its state is still relevant
2514 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2515 // and thus we still hold a manifest in the store).
2516 ceph_assert(!osdmap_manifest
.pinned
.empty());
2517 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2518 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2520 dout(10) << __func__
2521 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2522 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2525 pin_first
= osdmap_manifest
.get_last_pinned();
2528 manifest
.pin(pin_first
);
2531 bool OSDMonitor::_prune_sanitize_options() const
2533 uint64_t prune_interval
=
2534 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2535 uint64_t prune_min
=
2536 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2538 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2542 if (prune_interval
== 0) {
2544 << " prune is enabled BUT prune interval is zero; abort."
2547 } else if (prune_interval
== 1) {
2549 << " prune interval is equal to one, which essentially means"
2550 " no pruning; abort."
2554 if (prune_min
== 0) {
2556 << " prune is enabled BUT prune min is zero; abort."
2560 if (prune_interval
> prune_min
) {
2562 << " impossible to ascertain proper prune interval because"
2563 << " it is greater than the minimum prune epochs"
2564 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2569 if (txsize
< prune_interval
- 1) {
2571 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2572 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2573 << "); abort." << dendl
;
2579 bool OSDMonitor::is_prune_enabled() const {
2580 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2583 bool OSDMonitor::is_prune_supported() const {
2584 return mon
.get_required_mon_features().contains_any(
2585 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2590 * @returns true if has side-effects; false otherwise.
2592 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2594 bool enabled
= is_prune_enabled();
2596 dout(1) << __func__
<< " osdmap full prune "
2597 << ( enabled
? "enabled" : "disabled")
2600 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2604 // we are beyond the minimum prune versions, we need to remove maps because
2605 // otherwise the store will grow unbounded and we may end up having issues
2606 // with available disk space or store hangs.
2608 // we will not pin all versions. We will leave a buffer number of versions.
2609 // this allows us the monitor to trim maps without caring too much about
2610 // pinned maps, and then allow us to use another ceph-mon without these
2611 // capabilities, without having to repair the store.
2613 osdmap_manifest_t manifest
= osdmap_manifest
;
2615 version_t first
= get_first_committed();
2616 version_t last
= get_last_committed();
2618 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2619 version_t last_pinned
= manifest
.get_last_pinned();
2620 uint64_t prune_interval
=
2621 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2623 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2625 prune_init(manifest
);
2627 // we need to get rid of some osdmaps
2630 << " lc (" << first
<< " .. " << last
<< ")"
2631 << " last_pinned " << last_pinned
2632 << " interval " << prune_interval
2633 << " last_to_pin " << last_to_pin
2636 // We will be erasing maps as we go.
2638 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2640 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2641 // we stop pruning. We could prune the maps between `next_to_pin` and
2642 // `last_to_pin`, but by not doing it we end up with neater pruned
2643 // intervals, aligned with `prune_interval`. Besides, this should not be a
2644 // problem as long as `prune_interval` is set to a sane value, instead of
2645 // hundreds or thousands of maps.
2647 auto map_exists
= [this](version_t v
) {
2648 string k
= mon
.store
->combine_strings("full", v
);
2649 return mon
.store
->exists(get_service_name(), k
);
2652 // 'interval' represents the number of maps from the last pinned
2653 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2654 // version 11 next; all intermediate versions will be removed.
2656 // 'txsize' represents the maximum number of versions we'll be removing in
2657 // this iteration. If 'txsize' is large enough to perform multiple passes
2658 // pinning and removing maps, we will do so; if not, we'll do at least one
2659 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2660 // ensure that we never go *over* the maximum.
2662 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2663 uint64_t removal_interval
= prune_interval
- 1;
2665 if (txsize
< removal_interval
) {
2667 << " setting txsize to removal interval size ("
2668 << removal_interval
<< " versions"
2670 txsize
= removal_interval
;
2672 ceph_assert(removal_interval
> 0);
2674 uint64_t num_pruned
= 0;
2675 while (num_pruned
+ removal_interval
<= txsize
) {
2676 last_pinned
= manifest
.get_last_pinned();
2678 if (last_pinned
+ prune_interval
> last_to_pin
) {
2681 ceph_assert(last_pinned
< last_to_pin
);
2683 version_t next_pinned
= last_pinned
+ prune_interval
;
2684 ceph_assert(next_pinned
<= last_to_pin
);
2685 manifest
.pin(next_pinned
);
2687 dout(20) << __func__
2688 << " last_pinned " << last_pinned
2689 << " next_pinned " << next_pinned
2690 << " num_pruned " << num_pruned
2691 << " removal interval (" << (last_pinned
+1)
2692 << ".." << (next_pinned
-1) << ")"
2693 << " txsize " << txsize
<< dendl
;
2695 ceph_assert(map_exists(last_pinned
));
2696 ceph_assert(map_exists(next_pinned
));
2698 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2699 ceph_assert(!manifest
.is_pinned(v
));
2701 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2702 string full_key
= mon
.store
->combine_strings("full", v
);
2703 tx
->erase(get_service_name(), full_key
);
2708 ceph_assert(num_pruned
> 0);
2711 manifest
.encode(bl
);
2712 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2720 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2722 op
->mark_osdmon_event(__func__
);
2723 Message
*m
= op
->get_req();
2724 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2726 switch (m
->get_type()) {
2728 case MSG_MON_COMMAND
:
2730 return preprocess_command(op
);
2731 } catch (const bad_cmd_get
& e
) {
2733 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2736 case CEPH_MSG_MON_GET_OSDMAP
:
2737 return preprocess_get_osdmap(op
);
2740 case MSG_OSD_MARK_ME_DOWN
:
2741 return preprocess_mark_me_down(op
);
2742 case MSG_OSD_MARK_ME_DEAD
:
2743 return preprocess_mark_me_dead(op
);
2745 return preprocess_full(op
);
2746 case MSG_OSD_FAILURE
:
2747 return preprocess_failure(op
);
2749 return preprocess_boot(op
);
2751 return preprocess_alive(op
);
2752 case MSG_OSD_PG_CREATED
:
2753 return preprocess_pg_created(op
);
2754 case MSG_OSD_PG_READY_TO_MERGE
:
2755 return preprocess_pg_ready_to_merge(op
);
2756 case MSG_OSD_PGTEMP
:
2757 return preprocess_pgtemp(op
);
2758 case MSG_OSD_BEACON
:
2759 return preprocess_beacon(op
);
2761 case CEPH_MSG_POOLOP
:
2762 return preprocess_pool_op(op
);
2764 case MSG_REMOVE_SNAPS
:
2765 return preprocess_remove_snaps(op
);
2767 case MSG_MON_GET_PURGED_SNAPS
:
2768 return preprocess_get_purged_snaps(op
);
2776 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2778 op
->mark_osdmon_event(__func__
);
2779 Message
*m
= op
->get_req();
2780 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2782 switch (m
->get_type()) {
2784 case MSG_OSD_MARK_ME_DOWN
:
2785 return prepare_mark_me_down(op
);
2786 case MSG_OSD_MARK_ME_DEAD
:
2787 return prepare_mark_me_dead(op
);
2789 return prepare_full(op
);
2790 case MSG_OSD_FAILURE
:
2791 return prepare_failure(op
);
2793 return prepare_boot(op
);
2795 return prepare_alive(op
);
2796 case MSG_OSD_PG_CREATED
:
2797 return prepare_pg_created(op
);
2798 case MSG_OSD_PGTEMP
:
2799 return prepare_pgtemp(op
);
2800 case MSG_OSD_PG_READY_TO_MERGE
:
2801 return prepare_pg_ready_to_merge(op
);
2802 case MSG_OSD_BEACON
:
2803 return prepare_beacon(op
);
2805 case MSG_MON_COMMAND
:
2807 return prepare_command(op
);
2808 } catch (const bad_cmd_get
& e
) {
2810 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2814 case CEPH_MSG_POOLOP
:
2815 return prepare_pool_op(op
);
2817 case MSG_REMOVE_SNAPS
:
2818 return prepare_remove_snaps(op
);
2828 bool OSDMonitor::should_propose(double& delay
)
2830 dout(10) << "should_propose" << dendl
;
2832 // if full map, propose immediately! any subsequent changes will be clobbered.
2833 if (pending_inc
.fullmap
.length())
2836 // adjust osd weights?
2837 if (!osd_weight
.empty() &&
2838 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2839 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2840 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2846 return PaxosService::should_propose(delay
);
2851 // ---------------------------
2854 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2856 op
->mark_osdmon_event(__func__
);
2857 auto m
= op
->get_req
<MMonGetOSDMap
>();
2859 uint64_t features
= mon
.get_quorum_con_features();
2860 if (op
->get_session() && op
->get_session()->con_features
)
2861 features
= op
->get_session()->con_features
;
2863 dout(10) << __func__
<< " " << *m
<< dendl
;
2864 MOSDMap
*reply
= new MOSDMap(mon
.monmap
->fsid
, features
);
2865 epoch_t first
= get_first_committed();
2866 epoch_t last
= osdmap
.get_epoch();
2867 int max
= g_conf()->osd_map_message_max
;
2868 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2869 for (epoch_t e
= std::max(first
, m
->get_full_first());
2870 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2872 bufferlist
& bl
= reply
->maps
[e
];
2873 int r
= get_version_full(e
, features
, bl
);
2874 ceph_assert(r
>= 0);
2875 max_bytes
-= bl
.length();
2877 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2878 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2880 bufferlist
& bl
= reply
->incremental_maps
[e
];
2881 int r
= get_version(e
, features
, bl
);
2882 ceph_assert(r
>= 0);
2883 max_bytes
-= bl
.length();
2885 reply
->cluster_osdmap_trim_lower_bound
= first
;
2886 reply
->newest_map
= last
;
2887 mon
.send_reply(op
, reply
);
2892 // ---------------------------
2897 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2898 // check permissions
2899 MonSession
*session
= op
->get_session();
2902 if (!session
->is_capable("osd", MON_CAP_X
)) {
2903 dout(0) << "got MOSDFailure from entity with insufficient caps "
2904 << session
->caps
<< dendl
;
2907 if (fsid
!= mon
.monmap
->fsid
) {
2908 dout(0) << "check_source: on fsid " << fsid
2909 << " != " << mon
.monmap
->fsid
<< dendl
;
2916 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2918 op
->mark_osdmon_event(__func__
);
2919 auto m
= op
->get_req
<MOSDFailure
>();
2920 // who is target_osd
2921 int badboy
= m
->get_target_osd();
2923 // check permissions
2924 if (check_source(op
, m
->fsid
))
2927 // first, verify the reporting host is valid
2928 if (m
->get_orig_source().is_osd()) {
2929 int from
= m
->get_orig_source().num();
2930 if (!osdmap
.exists(from
) ||
2931 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2932 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2933 dout(5) << "preprocess_failure from dead osd." << from
2934 << ", ignoring" << dendl
;
2935 send_incremental(op
, m
->get_epoch()+1);
2942 if (osdmap
.is_down(badboy
)) {
2943 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2944 << " " << m
->get_target_addrs()
2945 << ", from " << m
->get_orig_source() << dendl
;
2946 if (m
->get_epoch() < osdmap
.get_epoch())
2947 send_incremental(op
, m
->get_epoch()+1);
2950 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2951 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2952 << " " << m
->get_target_addrs()
2953 << " != map's " << osdmap
.get_addrs(badboy
)
2954 << ", from " << m
->get_orig_source() << dendl
;
2955 if (m
->get_epoch() < osdmap
.get_epoch())
2956 send_incremental(op
, m
->get_epoch()+1);
2960 // already reported?
2961 if (osdmap
.is_down(badboy
) ||
2962 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2963 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2964 << " " << m
->get_target_addrs()
2965 << ", from " << m
->get_orig_source() << dendl
;
2966 if (m
->get_epoch() < osdmap
.get_epoch())
2967 send_incremental(op
, m
->get_epoch()+1);
2971 if (!can_mark_down(badboy
)) {
2972 dout(5) << "preprocess_failure ignoring report of osd."
2973 << m
->get_target_osd() << " " << m
->get_target_addrs()
2974 << " from " << m
->get_orig_source() << dendl
;
2978 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2979 << " " << m
->get_target_addrs()
2980 << ", from " << m
->get_orig_source() << dendl
;
2988 class C_AckMarkedDown
: public C_MonOp
{
2994 : C_MonOp(op
), osdmon(osdmon
) {}
2996 void _finish(int r
) override
{
2998 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2999 osdmon
->mon
.send_reply(
3006 false)); // ACK itself does not request an ack
3007 } else if (r
== -EAGAIN
) {
3008 osdmon
->dispatch(op
);
3010 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
3013 ~C_AckMarkedDown() override
{
3017 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
3019 op
->mark_osdmon_event(__func__
);
3020 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3021 int from
= m
->target_osd
;
3023 // check permissions
3024 if (check_source(op
, m
->fsid
))
3027 // first, verify the reporting host is valid
3028 if (!m
->get_orig_source().is_osd())
3031 if (!osdmap
.exists(from
) ||
3032 osdmap
.is_down(from
) ||
3033 osdmap
.get_addrs(from
) != m
->target_addrs
) {
3034 dout(5) << "preprocess_mark_me_down from dead osd."
3035 << from
<< ", ignoring" << dendl
;
3036 send_incremental(op
, m
->get_epoch()+1);
3040 // no down might be set
3041 if (!can_mark_down(from
))
3044 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
3045 << " " << m
->target_addrs
<< dendl
;
3049 if (m
->request_ack
) {
3050 Context
*c(new C_AckMarkedDown(this, op
));
3056 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
3058 op
->mark_osdmon_event(__func__
);
3059 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3060 int target_osd
= m
->target_osd
;
3062 ceph_assert(osdmap
.is_up(target_osd
));
3063 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
3065 mon
.clog
->info() << "osd." << target_osd
<< " marked itself " << ((m
->down_and_dead
) ? "down and dead" : "down");
3066 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3067 if (m
->down_and_dead
) {
3068 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3069 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3071 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3074 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
3078 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
3080 op
->mark_osdmon_event(__func__
);
3081 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3082 int from
= m
->target_osd
;
3084 // check permissions
3085 if (check_source(op
, m
->fsid
)) {
3090 // first, verify the reporting host is valid
3091 if (!m
->get_orig_source().is_osd()) {
3096 if (!osdmap
.exists(from
) ||
3097 !osdmap
.is_down(from
)) {
3098 dout(5) << __func__
<< " from nonexistent or up osd." << from
3099 << ", ignoring" << dendl
;
3100 send_incremental(op
, m
->get_epoch()+1);
3108 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
3110 op
->mark_osdmon_event(__func__
);
3111 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3112 int target_osd
= m
->target_osd
;
3114 ceph_assert(osdmap
.is_down(target_osd
));
3116 mon
.clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
3118 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3119 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3121 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3122 wait_for_finished_proposal(
3125 [op
, this] (int r
) {
3127 mon
.no_reply(op
); // ignore on success
3134 bool OSDMonitor::can_mark_down(int i
)
3136 if (osdmap
.is_nodown(i
)) {
3137 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3138 << "will not mark it down" << dendl
;
3142 int num_osds
= osdmap
.get_num_osds();
3143 if (num_osds
== 0) {
3144 dout(5) << __func__
<< " no osds" << dendl
;
3147 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3148 float up_ratio
= (float)up
/ (float)num_osds
;
3149 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3150 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3151 << g_conf()->mon_osd_min_up_ratio
3152 << ", will not mark osd." << i
<< " down" << dendl
;
3158 bool OSDMonitor::can_mark_up(int i
)
3160 if (osdmap
.is_noup(i
)) {
3161 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3162 << "will not mark it up" << dendl
;
3170 * @note the parameter @p i apparently only exists here so we can output the
3171 * osd's id on messages.
3173 bool OSDMonitor::can_mark_out(int i
)
3175 if (osdmap
.is_noout(i
)) {
3176 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3177 << "will not mark it out" << dendl
;
3181 int num_osds
= osdmap
.get_num_osds();
3182 if (num_osds
== 0) {
3183 dout(5) << __func__
<< " no osds" << dendl
;
3186 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3187 float in_ratio
= (float)in
/ (float)num_osds
;
3188 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3190 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3191 << g_conf()->mon_osd_min_in_ratio
3192 << ", will not mark osd." << i
<< " out" << dendl
;
3194 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3195 << g_conf()->mon_osd_min_in_ratio
3196 << ", will not mark osds out" << dendl
;
3203 bool OSDMonitor::can_mark_in(int i
)
3205 if (osdmap
.is_noin(i
)) {
3206 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3207 << "will not mark it in" << dendl
;
3214 bool OSDMonitor::check_failures(utime_t now
)
3216 bool found_failure
= false;
3217 auto p
= failure_info
.begin();
3218 while (p
!= failure_info
.end()) {
3219 auto& [target_osd
, fi
] = *p
;
3220 if (can_mark_down(target_osd
) &&
3221 check_failure(now
, target_osd
, fi
)) {
3222 found_failure
= true;
3224 } else if (is_failure_stale(now
, fi
)) {
3225 dout(10) << " dropping stale failure_info for osd." << target_osd
3226 << " from " << fi
.reporters
.size() << " reporters"
3228 p
= failure_info
.erase(p
);
3233 return found_failure
;
3236 utime_t
OSDMonitor::get_grace_time(utime_t now
,
3238 failure_info_t
& fi
) const
3240 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3241 if (!g_conf()->mon_osd_adjust_heartbeat_grace
) {
3244 utime_t grace
= orig_grace
;
3245 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3246 double decay_k
= ::log(.5) / halflife
;
3248 // scale grace period based on historical probability of 'lagginess'
3249 // (false positive failures due to slowness).
3250 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3251 const utime_t failed_for
= now
- fi
.get_failed_since();
3252 double decay
= exp((double)failed_for
* decay_k
);
3253 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3254 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3255 double my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3258 // consider the peers reporting a failure a proxy for a potential
3259 // 'subcluster' over the overall cluster that is similarly
3260 // laggy. this is clearly not true in all cases, but will sometimes
3261 // help us localize the grace correction to a subset of the system
3262 // (say, a rack with a bad switch) that is unhappy.
3263 double peer_grace
= 0;
3264 for (auto& [reporter
, report
] : fi
.reporters
) {
3265 if (osdmap
.exists(reporter
)) {
3266 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(reporter
);
3267 utime_t elapsed
= now
- xi
.down_stamp
;
3268 double decay
= exp((double)elapsed
* decay_k
);
3269 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3272 peer_grace
/= (double)fi
.reporters
.size();
3273 grace
+= peer_grace
;
3274 dout(10) << " osd." << target_osd
<< " has "
3275 << fi
.reporters
.size() << " reporters, "
3276 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3277 << " + " << peer_grace
<< "), max_failed_since " << fi
.get_failed_since()
3283 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3285 // already pending failure?
3286 if (pending_inc
.new_state
.count(target_osd
) &&
3287 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3288 dout(10) << " already pending failure" << dendl
;
3292 set
<string
> reporters_by_subtree
;
3293 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3294 ceph_assert(fi
.reporters
.size());
3295 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3296 // get the parent bucket whose type matches with "reporter_subtree_level".
3297 // fall back to OSD if the level doesn't exist.
3298 if (osdmap
.exists(p
->first
)) {
3299 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3300 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3301 iter
== reporter_loc
.end()) {
3302 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3304 reporters_by_subtree
.insert(iter
->second
);
3308 fi
.cancel_report(p
->first
);;
3309 p
= fi
.reporters
.erase(p
);
3312 if (reporters_by_subtree
.size() < g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3315 const utime_t failed_for
= now
- fi
.get_failed_since();
3316 const utime_t grace
= get_grace_time(now
, target_osd
, fi
);
3317 if (failed_for
>= grace
) {
3318 dout(1) << " we have enough reporters to mark osd." << target_osd
3319 << " down" << dendl
;
3320 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3322 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3323 << osdmap
.crush
->get_full_location_ordered_string(
3326 << (int)reporters_by_subtree
.size()
3327 << " reporters from different "
3328 << reporter_subtree_level
<< " after "
3329 << failed_for
<< " >= grace " << grace
<< ")";
3335 bool OSDMonitor::is_failure_stale(utime_t now
, failure_info_t
& fi
) const
3337 // if it takes too long to either cancel the report to mark the osd down,
3338 // some reporters must have failed to cancel their reports. let's just
3339 // forget these reports.
3340 const utime_t failed_for
= now
- fi
.get_failed_since();
3341 auto heartbeat_grace
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_grace");
3342 auto heartbeat_stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3343 return failed_for
>= (heartbeat_grace
+ heartbeat_stale
);
3346 void OSDMonitor::force_failure(int target_osd
, int by
)
3348 // already pending failure?
3349 if (pending_inc
.new_state
.count(target_osd
) &&
3350 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3351 dout(10) << " already pending failure" << dendl
;
3355 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
3356 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3357 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3358 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3360 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3362 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3363 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3364 << ") (connection refused reported by osd." << by
<< ")";
3368 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3370 op
->mark_osdmon_event(__func__
);
3371 auto m
= op
->get_req
<MOSDFailure
>();
3372 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3373 << " " << m
->get_target_addrs()
3374 << " from " << m
->get_orig_source()
3375 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3377 int target_osd
= m
->get_target_osd();
3378 int reporter
= m
->get_orig_source().num();
3379 ceph_assert(osdmap
.is_up(target_osd
));
3380 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3384 if (m
->if_osd_failed()) {
3385 // calculate failure time
3386 utime_t now
= ceph_clock_now();
3387 utime_t failed_since
=
3388 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3391 if (m
->is_immediate()) {
3392 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3393 << " reported immediately failed by "
3394 << m
->get_orig_source();
3395 force_failure(target_osd
, reporter
);
3398 mon
.clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3399 << m
->get_orig_source();
3401 failure_info_t
& fi
= failure_info
[target_osd
];
3402 fi
.add_report(reporter
, failed_since
, op
);
3403 return check_failure(now
, target_osd
, fi
);
3405 // remove the report
3406 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3407 << " failure report canceled by "
3408 << m
->get_orig_source();
3409 if (failure_info
.count(target_osd
)) {
3410 failure_info_t
& fi
= failure_info
[target_osd
];
3411 fi
.cancel_report(reporter
);
3412 if (fi
.reporters
.empty()) {
3413 dout(10) << " removing last failure_info for osd." << target_osd
3415 failure_info
.erase(target_osd
);
3417 dout(10) << " failure_info for osd." << target_osd
<< " now "
3418 << fi
.reporters
.size() << " reporters" << dendl
;
3421 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
3428 void OSDMonitor::process_failures()
3430 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3431 while (p
!= failure_info
.end()) {
3432 if (osdmap
.is_up(p
->first
)) {
3435 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3436 list
<MonOpRequestRef
> ls
;
3437 p
->second
.take_report_messages(ls
);
3438 failure_info
.erase(p
++);
3440 while (!ls
.empty()) {
3441 MonOpRequestRef o
= ls
.front();
3443 o
->mark_event(__func__
);
3444 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3445 send_latest(o
, m
->get_epoch());
3454 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3456 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3458 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3459 p
!= failure_info
.end();
3461 p
->second
.take_report_messages(ls
);
3463 failure_info
.clear();
3466 int OSDMonitor::get_grace_interval_threshold()
3468 int halflife
= g_conf()->mon_osd_laggy_halflife
;
3469 // Scale the halflife period (default: 1_hr) by
3470 // a factor (48) to calculate the threshold.
3471 int grace_threshold_factor
= 48;
3472 return halflife
* grace_threshold_factor
;
3475 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval
)
3477 int grace_interval_threshold_secs
= get_grace_interval_threshold();
3478 if (last_failed_interval
> grace_interval_threshold_secs
) {
3479 dout(1) << " last_failed_interval " << last_failed_interval
3480 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3487 void OSDMonitor::set_default_laggy_params(int target_osd
)
3489 if (pending_inc
.new_xinfo
.count(target_osd
) == 0) {
3490 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3492 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[target_osd
];
3493 xi
.down_stamp
= pending_inc
.modified
;
3494 xi
.laggy_probability
= 0.0;
3495 xi
.laggy_interval
= 0;
3496 dout(20) << __func__
<< " reset laggy, now xi " << xi
<< dendl
;
3502 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3504 op
->mark_osdmon_event(__func__
);
3505 auto m
= op
->get_req
<MOSDBoot
>();
3506 int from
= m
->get_orig_source_inst().name
.num();
3508 // check permissions, ignore if failed (no response expected)
3509 MonSession
*session
= op
->get_session();
3512 if (!session
->is_capable("osd", MON_CAP_X
)) {
3513 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3514 << session
->caps
<< dendl
;
3518 if (m
->sb
.cluster_fsid
!= mon
.monmap
->fsid
) {
3519 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3520 << " != " << mon
.monmap
->fsid
<< dendl
;
3524 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3525 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3529 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3531 // lower bound of N-2
3532 if (!HAVE_FEATURE(m
->osd_features
, SERVER_PACIFIC
)) {
3533 mon
.clog
->info() << "disallowing boot of OSD "
3534 << m
->get_orig_source_inst()
3535 << " because the osd lacks CEPH_FEATURE_SERVER_PACIFIC";
3539 // make sure osd versions do not span more than 3 releases
3540 if (HAVE_FEATURE(m
->osd_features
, SERVER_QUINCY
) &&
3541 osdmap
.require_osd_release
< ceph_release_t::octopus
) {
3542 mon
.clog
->info() << "disallowing boot of quincy+ OSD "
3543 << m
->get_orig_source_inst()
3544 << " because require_osd_release < octopus";
3547 if (HAVE_FEATURE(m
->osd_features
, SERVER_REEF
) &&
3548 osdmap
.require_osd_release
< ceph_release_t::pacific
) {
3549 mon
.clog
->info() << "disallowing boot of reef+ OSD "
3550 << m
->get_orig_source_inst()
3551 << " because require_osd_release < pacific";
3555 // See crimson/osd/osd.cc: OSD::_send_boot
3556 if (auto type_iter
= m
->metadata
.find("osd_type");
3557 type_iter
!= m
->metadata
.end()) {
3558 const auto &otype
= type_iter
->second
;
3559 // m->metadata["osd_type"] must be "crimson", classic doesn't send osd_type
3560 if (otype
== "crimson") {
3561 if (!osdmap
.get_allow_crimson()) {
3563 << "Disallowing boot of crimson-osd without allow_crimson "
3564 << "OSDMap flag. Run ceph osd set_allow_crimson to set "
3565 << "allow_crimson flag. Note that crimson-osd is "
3566 << "considered unstable and may result in crashes or "
3567 << "data loss. Its usage should be restricted to "
3568 << "testing and development.";
3572 derr
<< __func__
<< ": osd " << m
->get_orig_source_inst()
3573 << " sent non-crimson osd_type field in MOSDBoot: "
3575 << " -- booting anyway"
3580 if (osdmap
.stretch_mode_enabled
&&
3581 !(m
->osd_features
& CEPH_FEATUREMASK_STRETCH_MODE
)) {
3582 mon
.clog
->info() << "disallowing boot of OSD "
3583 << m
->get_orig_source_inst()
3584 << " because stretch mode is on and OSD lacks support";
3589 if (osdmap
.is_up(from
) &&
3590 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3591 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3593 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3594 << " " << m
->get_orig_source_addrs()
3595 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3600 if (osdmap
.exists(from
) &&
3601 !osdmap
.get_uuid(from
).is_zero() &&
3602 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3603 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3604 << " clashes with existing osd: different fsid"
3605 << " (ours: " << osdmap
.get_uuid(from
)
3606 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3610 if (osdmap
.exists(from
) &&
3611 osdmap
.get_info(from
).up_from
> m
->version
&&
3612 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3613 m
->get_orig_source_addrs())) {
3614 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3615 send_latest(op
, m
->sb
.current_epoch
+1);
3620 if (!can_mark_up(from
)) {
3621 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3622 send_latest(op
, m
->sb
.current_epoch
+1);
3626 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
3633 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3635 op
->mark_osdmon_event(__func__
);
3636 auto m
= op
->get_req
<MOSDBoot
>();
3637 dout(7) << __func__
<< " from " << m
->get_source()
3639 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3640 << " cluster_addrs " << m
->cluster_addrs
3641 << " hb_back_addrs " << m
->hb_back_addrs
3642 << " hb_front_addrs " << m
->hb_front_addrs
3645 ceph_assert(m
->get_orig_source().is_osd());
3646 int from
= m
->get_orig_source().num();
3648 // does this osd exist?
3649 if (from
>= osdmap
.get_max_osd()) {
3650 dout(1) << "boot from osd." << from
<< " >= max_osd "
3651 << osdmap
.get_max_osd() << dendl
;
3655 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3656 if (pending_inc
.new_state
.count(from
))
3657 oldstate
^= pending_inc
.new_state
[from
];
3659 // already up? mark down first?
3660 if (osdmap
.is_up(from
)) {
3661 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3662 << osdmap
.get_addrs(from
) << dendl
;
3663 // preprocess should have caught these; if not, assert.
3664 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3665 m
->get_orig_source_addrs()) ||
3666 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3667 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3669 if (pending_inc
.new_state
.count(from
) == 0 ||
3670 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3671 // mark previous guy down
3672 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3674 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3675 } else if (pending_inc
.new_up_client
.count(from
)) {
3676 // already prepared, just wait
3677 dout(7) << __func__
<< " already prepared, waiting on "
3678 << m
->get_orig_source_addr() << dendl
;
3679 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3682 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3683 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3684 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3685 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3687 down_pending_out
.erase(from
); // if any
3690 osd_weight
[from
] = m
->sb
.weight
;
3693 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3695 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3696 // preprocess should have caught this; if not, assert.
3697 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3698 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3702 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3703 const osd_info_t
& i
= osdmap
.get_info(from
);
3704 if (i
.up_from
> i
.lost_at
) {
3705 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3706 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3711 bufferlist osd_metadata
;
3712 encode(m
->metadata
, osd_metadata
);
3713 pending_metadata
[from
] = osd_metadata
;
3714 pending_metadata_rm
.erase(from
);
3716 // adjust last clean unmount epoch?
3717 const osd_info_t
& info
= osdmap
.get_info(from
);
3718 dout(10) << " old osd_info: " << info
<< dendl
;
3719 if (m
->sb
.mounted
> info
.last_clean_begin
||
3720 (m
->sb
.mounted
== info
.last_clean_begin
&&
3721 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3722 epoch_t begin
= m
->sb
.mounted
;
3723 epoch_t end
= m
->sb
.clean_thru
;
3725 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3726 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3727 << ") -> [" << begin
<< "-" << end
<< ")"
3729 pending_inc
.new_last_clean_interval
[from
] =
3730 pair
<epoch_t
,epoch_t
>(begin
, end
);
3733 if (pending_inc
.new_xinfo
.count(from
) == 0)
3734 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3735 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
3736 if (m
->boot_epoch
== 0) {
3737 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3738 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3739 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3741 if (xi
.down_stamp
.sec()) {
3742 int interval
= ceph_clock_now().sec() -
3743 xi
.down_stamp
.sec();
3744 if (g_conf()->mon_osd_laggy_max_interval
&&
3745 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3746 interval
= g_conf()->mon_osd_laggy_max_interval
;
3749 interval
* g_conf()->mon_osd_laggy_weight
+
3750 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3752 xi
.laggy_probability
=
3753 g_conf()->mon_osd_laggy_weight
+
3754 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3755 dout(10) << " laggy, now xi " << xi
<< dendl
;
3758 // set features shared by the osd
3759 if (m
->osd_features
)
3760 xi
.features
= m
->osd_features
;
3762 xi
.features
= m
->get_connection()->get_features();
3765 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3766 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3767 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3768 (g_conf()->mon_osd_auto_mark_in
)) {
3769 if (can_mark_in(from
)) {
3770 if (xi
.old_weight
> 0) {
3771 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3774 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3777 dout(7) << __func__
<< " NOIN set, will not mark in "
3778 << m
->get_orig_source_addr() << dendl
;
3783 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3788 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3790 op
->mark_osdmon_event(__func__
);
3791 auto m
= op
->get_req
<MOSDBoot
>();
3792 dout(7) << "_booted " << m
->get_orig_source_inst()
3793 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3796 mon
.clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3800 send_latest(op
, m
->sb
.current_epoch
+1);
3807 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3809 op
->mark_osdmon_event(__func__
);
3810 auto m
= op
->get_req
<MOSDFull
>();
3811 int from
= m
->get_orig_source().num();
3813 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3815 // check permissions, ignore if failed
3816 MonSession
*session
= op
->get_session();
3819 if (!session
->is_capable("osd", MON_CAP_X
)) {
3820 dout(0) << "MOSDFull from entity with insufficient privileges:"
3821 << session
->caps
<< dendl
;
3825 // ignore a full message from the osd instance that already went down
3826 if (!osdmap
.exists(from
)) {
3827 dout(7) << __func__
<< " ignoring full message from nonexistent "
3828 << m
->get_orig_source_inst() << dendl
;
3831 if ((!osdmap
.is_up(from
) &&
3832 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3833 m
->get_orig_source_addrs())) ||
3834 (osdmap
.is_up(from
) &&
3835 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3836 dout(7) << __func__
<< " ignoring full message from down "
3837 << m
->get_orig_source_inst() << dendl
;
3841 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3843 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3844 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3845 << " " << m
->get_orig_source_inst() << dendl
;
3846 _reply_map(op
, m
->version
);
3850 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3851 << " " << m
->get_orig_source_inst() << dendl
;
3858 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3860 op
->mark_osdmon_event(__func__
);
3861 auto m
= op
->get_req
<MOSDFull
>();
3862 const int from
= m
->get_orig_source().num();
3864 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3865 const unsigned want_state
= m
->state
& mask
; // safety first
3867 unsigned cur_state
= osdmap
.get_state(from
);
3868 auto p
= pending_inc
.new_state
.find(from
);
3869 if (p
!= pending_inc
.new_state
.end()) {
3870 cur_state
^= p
->second
;
3874 set
<string
> want_state_set
, cur_state_set
;
3875 OSDMap::calc_state_set(want_state
, want_state_set
);
3876 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3878 if (cur_state
!= want_state
) {
3879 if (p
!= pending_inc
.new_state
.end()) {
3882 pending_inc
.new_state
[from
] = 0;
3884 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3885 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3886 << " -> " << want_state_set
<< dendl
;
3888 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3889 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3892 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3899 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3901 op
->mark_osdmon_event(__func__
);
3902 auto m
= op
->get_req
<MOSDAlive
>();
3903 int from
= m
->get_orig_source().num();
3905 // check permissions, ignore if failed
3906 MonSession
*session
= op
->get_session();
3909 if (!session
->is_capable("osd", MON_CAP_X
)) {
3910 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3911 << session
->caps
<< dendl
;
3915 if (!osdmap
.is_up(from
) ||
3916 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3917 dout(7) << "preprocess_alive ignoring alive message from down "
3918 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3923 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3925 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3926 _reply_map(op
, m
->version
);
3930 dout(10) << "preprocess_alive want up_thru " << m
->want
3931 << " from " << m
->get_orig_source_inst() << dendl
;
3938 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3940 op
->mark_osdmon_event(__func__
);
3941 auto m
= op
->get_req
<MOSDAlive
>();
3942 int from
= m
->get_orig_source().num();
3944 if (0) { // we probably don't care much about these
3945 mon
.clog
->debug() << m
->get_orig_source_inst() << " alive";
3948 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3949 << " from " << m
->get_orig_source_inst() << dendl
;
3951 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3952 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3956 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3958 op
->mark_osdmon_event(__func__
);
3959 dout(7) << "_reply_map " << e
3960 << " from " << op
->get_req()->get_orig_source_inst()
3966 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3968 op
->mark_osdmon_event(__func__
);
3969 auto m
= op
->get_req
<MOSDPGCreated
>();
3970 dout(10) << __func__
<< " " << *m
<< dendl
;
3971 auto session
= op
->get_session();
3974 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3977 if (!session
->is_capable("osd", MON_CAP_X
)) {
3978 derr
<< __func__
<< " received from entity "
3979 << "with insufficient privileges " << session
->caps
<< dendl
;
3982 // always forward the "created!" to the leader
3986 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3988 op
->mark_osdmon_event(__func__
);
3989 auto m
= op
->get_req
<MOSDPGCreated
>();
3990 dout(10) << __func__
<< " " << *m
<< dendl
;
3991 auto src
= m
->get_orig_source();
3992 auto from
= src
.num();
3993 if (!src
.is_osd() ||
3994 !mon
.osdmon()->osdmap
.is_up(from
) ||
3995 !mon
.osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3996 m
->get_orig_source_addrs())) {
3997 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
4000 pending_created_pgs
.push_back(m
->pgid
);
4004 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
4006 op
->mark_osdmon_event(__func__
);
4007 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
4008 dout(10) << __func__
<< " " << *m
<< dendl
;
4009 const pg_pool_t
*pi
;
4010 auto session
= op
->get_session();
4012 dout(10) << __func__
<< ": no monitor session!" << dendl
;
4015 if (!session
->is_capable("osd", MON_CAP_X
)) {
4016 derr
<< __func__
<< " received from entity "
4017 << "with insufficient privileges " << session
->caps
<< dendl
;
4020 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
4022 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
4025 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
4026 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
4029 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
4030 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
4033 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
4034 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
4044 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
4046 op
->mark_osdmon_event(__func__
);
4047 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
4048 dout(10) << __func__
<< " " << *m
<< dendl
;
4050 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
4051 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
4053 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
4054 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
4055 p
.get_pg_num_pending() > m
->pgid
.ps()) {
4056 dout(10) << __func__
4057 << " race with concurrent pg_num[_pending] update, will retry"
4059 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
4064 p
.dec_pg_num(m
->pgid
,
4068 m
->last_epoch_started
,
4069 m
->last_epoch_clean
);
4070 p
.last_change
= pending_inc
.epoch
;
4072 // back off the merge attempt!
4073 p
.set_pg_num_pending(p
.get_pg_num());
4076 // force pre-nautilus clients to resend their ops, since they
4077 // don't understand pg_num_pending changes form a new interval
4078 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
4080 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
4082 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
4085 prob
> (double)(rand() % 1000)/1000.0) {
4086 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
4087 auto n
= new MMonCommand(mon
.monmap
->get_fsid());
4088 n
->set_connection(m
->get_connection());
4089 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4090 osdmap
.get_pool_name(m
->pgid
.pool()) +
4091 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4092 stringify(m
->pgid
.ps() + 1) + "\"}" };
4093 MonOpRequestRef nop
= mon
.op_tracker
.create_request
<MonOpRequest
>(n
);
4094 nop
->set_type_service();
4095 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
4097 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
4106 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
4108 auto m
= op
->get_req
<MOSDPGTemp
>();
4109 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
4110 mempool::osdmap::vector
<int> empty
;
4111 int from
= m
->get_orig_source().num();
4112 size_t ignore_cnt
= 0;
4115 MonSession
*session
= op
->get_session();
4118 if (!session
->is_capable("osd", MON_CAP_X
)) {
4119 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4120 << session
->caps
<< dendl
;
4124 if (!osdmap
.is_up(from
) ||
4125 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
4126 dout(7) << "ignoring pgtemp message from down "
4127 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
4136 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4137 dout(20) << " " << p
->first
4138 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
4139 << " -> " << p
->second
<< dendl
;
4141 // does the pool exist?
4142 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
4144 * 1. If the osdmap does not have the pool, it means the pool has been
4145 * removed in-between the osd sending this message and us handling it.
4146 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4147 * not exist in the pending either, as the osds would not send a
4148 * message about a pool they know nothing about (yet).
4149 * 3. However, if the pool does exist in the pending, then it must be a
4150 * new pool, and not relevant to this message (see 1).
4152 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4153 << ": pool has been removed" << dendl
;
4158 int acting_primary
= -1;
4159 osdmap
.pg_to_up_acting_osds(
4160 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
4161 if (acting_primary
!= from
) {
4162 /* If the source isn't the primary based on the current osdmap, we know
4163 * that the interval changed and that we can discard this message.
4164 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4165 * which of two pg temp mappings on the same pg is more recent.
4167 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4168 << ": primary has changed" << dendl
;
4174 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
4175 osdmap
.primary_temp
->count(p
->first
)))
4178 // NOTE: we assume that this will clear pg_primary, so consider
4179 // an existing pg_primary field to imply a change
4180 if (p
->second
.size() &&
4181 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
4182 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
4183 osdmap
.primary_temp
->count(p
->first
)))
4187 // should we ignore all the pgs?
4188 if (ignore_cnt
== m
->pg_temp
.size())
4191 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
4192 _reply_map(op
, m
->map_epoch
);
4200 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4202 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4203 auto ut
= pending_inc
.new_up_thru
.find(from
);
4204 if (ut
!= pending_inc
.new_up_thru
.end()) {
4205 old_up_thru
= ut
->second
;
4207 if (up_thru
> old_up_thru
) {
4208 // set up_thru too, so the osd doesn't have to ask again
4209 pending_inc
.new_up_thru
[from
] = up_thru
;
4213 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4215 op
->mark_osdmon_event(__func__
);
4216 auto m
= op
->get_req
<MOSDPGTemp
>();
4217 int from
= m
->get_orig_source().num();
4218 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4219 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4220 uint64_t pool
= p
->first
.pool();
4221 if (pending_inc
.old_pools
.count(pool
)) {
4222 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4223 << ": pool pending removal" << dendl
;
4226 if (!osdmap
.have_pg_pool(pool
)) {
4227 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4228 << ": pool has been removed" << dendl
;
4231 pending_inc
.new_pg_temp
[p
->first
] =
4232 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4234 // unconditionally clear pg_primary (until this message can encode
4235 // a change for that, too.. at which point we need to also fix
4236 // preprocess_pg_temp)
4237 if (osdmap
.primary_temp
->count(p
->first
) ||
4238 pending_inc
.new_primary_temp
.count(p
->first
))
4239 pending_inc
.new_primary_temp
[p
->first
] = -1;
4242 // set up_thru too, so the osd doesn't have to ask again
4243 update_up_thru(from
, m
->map_epoch
);
4245 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
4252 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4254 op
->mark_osdmon_event(__func__
);
4255 auto m
= op
->get_req
<MRemoveSnaps
>();
4256 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4258 // check privilege, ignore if failed
4259 MonSession
*session
= op
->get_session();
4263 if (!session
->caps
.is_capable(
4265 session
->entity_name
,
4266 "osd", "osd pool rmsnap", {}, true, true, false,
4267 session
->get_peer_socket_addr())) {
4268 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4269 << session
->caps
<< dendl
;
4273 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4274 q
!= m
->snaps
.end();
4276 if (!osdmap
.have_pg_pool(q
->first
)) {
4277 dout(10) << " ignoring removed_snaps " << q
->second
4278 << " on non-existent pool " << q
->first
<< dendl
;
4281 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4282 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4283 p
!= q
->second
.end();
4285 if (*p
> pi
->get_snap_seq() ||
4286 !_is_removed_snap(q
->first
, *p
)) {
4292 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4293 auto reply
= make_message
<MRemoveSnaps
>();
4294 reply
->snaps
= m
->snaps
;
4295 mon
.send_reply(op
, reply
.detach());
4302 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4304 op
->mark_osdmon_event(__func__
);
4305 auto m
= op
->get_req
<MRemoveSnaps
>();
4306 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4308 for (auto& [pool
, snaps
] : m
->snaps
) {
4309 if (!osdmap
.have_pg_pool(pool
)) {
4310 dout(10) << " ignoring removed_snaps " << snaps
4311 << " on non-existent pool " << pool
<< dendl
;
4315 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4316 for (auto s
: snaps
) {
4317 if (!_is_removed_snap(pool
, s
) &&
4318 (!pending_inc
.new_pools
.count(pool
) ||
4319 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4320 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4321 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4322 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4323 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4324 newpi
->removed_snaps
.insert(s
);
4325 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4326 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4328 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4329 if (s
> newpi
->get_snap_seq()) {
4330 dout(10) << " pool " << pool
<< " snap_seq "
4331 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4332 newpi
->set_snap_seq(s
);
4334 newpi
->set_snap_epoch(pending_inc
.epoch
);
4335 dout(10) << " added pool " << pool
<< " snap " << s
4336 << " to removed_snaps queue" << dendl
;
4337 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4342 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4343 auto reply
= make_message
<MRemoveSnaps
>();
4344 reply
->snaps
= m
->snaps
;
4345 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
4351 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4353 op
->mark_osdmon_event(__func__
);
4354 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4355 dout(7) << __func__
<< " " << *m
<< dendl
;
4357 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4359 string k
= make_purged_snap_epoch_key(m
->start
);
4360 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
4362 unsigned long epoch
= m
->last
;
4363 while (it
->valid()) {
4364 if (it
->key().find("purged_epoch_") != 0) {
4367 string k
= it
->key();
4368 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4370 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
4371 } else if (epoch
> m
->last
) {
4374 bufferlist bl
= it
->value();
4375 auto p
= bl
.cbegin();
4379 } catch (ceph::buffer::error
& e
) {
4380 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
4385 n
+= 4 + v
.size() * 16;
4388 // impose a semi-arbitrary limit to message size
4394 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4395 reply
->purged_snaps
.swap(r
);
4396 mon
.send_reply(op
, reply
.detach());
4402 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4404 op
->mark_osdmon_event(__func__
);
4406 auto session
= op
->get_session();
4409 dout(10) << __func__
<< " no monitor session!" << dendl
;
4412 if (!session
->is_capable("osd", MON_CAP_X
)) {
4413 derr
<< __func__
<< " received from entity "
4414 << "with insufficient privileges " << session
->caps
<< dendl
;
4417 // Always forward the beacon to the leader, even if they are the same as
4418 // the old one. The leader will mark as down osds that haven't sent
4419 // beacon for a few minutes.
4423 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4425 op
->mark_osdmon_event(__func__
);
4426 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4427 const auto src
= beacon
->get_orig_source();
4428 dout(10) << __func__
<< " " << *beacon
4429 << " from " << src
<< dendl
;
4430 int from
= src
.num();
4432 if (!src
.is_osd() ||
4433 !osdmap
.is_up(from
) ||
4434 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4435 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4436 // share some new maps with this guy in case it may not be
4437 // aware of its own deadness...
4438 send_latest(op
, beacon
->version
+1);
4440 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4444 last_osd_report
[from
].first
= ceph_clock_now();
4445 last_osd_report
[from
].second
= beacon
->osd_beacon_report_interval
;
4446 osd_epochs
[from
] = beacon
->version
;
4448 for (const auto& pg
: beacon
->pgs
) {
4449 if (auto* pool
= osdmap
.get_pg_pool(pg
.pool()); pool
!= nullptr) {
4450 unsigned pg_num
= pool
->get_pg_num();
4451 last_epoch_clean
.report(pg_num
, pg
, beacon
->min_last_epoch_clean
);
4455 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4456 beacon
->last_purged_snaps_scrub
) {
4457 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4458 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4460 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4461 beacon
->last_purged_snaps_scrub
;
4471 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4473 op
->mark_osdmon_event(__func__
);
4474 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4475 << " start " << start
<< dendl
;
4479 send_incremental(op
, start
);
4483 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
4485 MOSDMap
*r
= new MOSDMap(mon
.monmap
->fsid
, features
);
4486 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4487 r
->cluster_osdmap_trim_lower_bound
= get_first_committed();
4488 r
->newest_map
= osdmap
.get_epoch();
4492 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4494 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4495 << std::hex
<< features
<< std::dec
<< dendl
;
4496 MOSDMap
*m
= new MOSDMap(mon
.monmap
->fsid
, features
);
4497 m
->cluster_osdmap_trim_lower_bound
= get_first_committed();
4498 m
->newest_map
= osdmap
.get_epoch();
4500 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4502 int err
= get_version(e
, features
, bl
);
4504 ceph_assert(bl
.length());
4505 // if (get_version(e, bl) > 0) {
4506 dout(20) << "build_incremental inc " << e
<< " "
4507 << bl
.length() << " bytes" << dendl
;
4508 m
->incremental_maps
[e
] = bl
;
4510 ceph_assert(err
== -ENOENT
);
4511 ceph_assert(!bl
.length());
4512 get_version_full(e
, features
, bl
);
4513 if (bl
.length() > 0) {
4514 //else if (get_version("full", e, bl) > 0) {
4515 dout(20) << "build_incremental full " << e
<< " "
4516 << bl
.length() << " bytes" << dendl
;
4519 ceph_abort(); // we should have all maps.
4526 void OSDMonitor::send_full(MonOpRequestRef op
)
4528 op
->mark_osdmon_event(__func__
);
4529 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
4530 mon
.send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4533 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4535 op
->mark_osdmon_event(__func__
);
4537 MonSession
*s
= op
->get_session();
4541 // oh, we can tell the other mon to do it
4542 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4544 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4545 r
->send_osdmap_first
= first
;
4546 s
->proxy_con
->send_message(r
);
4547 op
->mark_event("reply: send routed send_osdmap_first reply");
4550 send_incremental(first
, s
, false, op
);
4554 void OSDMonitor::send_incremental(epoch_t first
,
4555 MonSession
*session
,
4557 MonOpRequestRef req
)
4559 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4560 << " to " << session
->name
<< dendl
;
4562 // get feature of the peer
4563 // use quorum_con_features, if it's an anonymous connection.
4564 uint64_t features
= session
->con_features
? session
->con_features
:
4565 mon
.get_quorum_con_features();
4567 if (first
<= session
->osd_epoch
) {
4568 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4569 << session
->osd_epoch
<< dendl
;
4570 first
= session
->osd_epoch
+ 1;
4573 if (first
< get_first_committed()) {
4574 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4575 m
->cluster_osdmap_trim_lower_bound
= get_first_committed();
4576 m
->newest_map
= osdmap
.get_epoch();
4578 first
= get_first_committed();
4580 int err
= get_version_full(first
, features
, bl
);
4581 ceph_assert(err
== 0);
4582 ceph_assert(bl
.length());
4583 dout(20) << "send_incremental starting with base full "
4584 << first
<< " " << bl
.length() << " bytes" << dendl
;
4585 m
->maps
[first
] = bl
;
4588 mon
.send_reply(req
, m
);
4589 session
->osd_epoch
= first
;
4592 session
->con
->send_message(m
);
4593 session
->osd_epoch
= first
;
4598 while (first
<= osdmap
.get_epoch()) {
4599 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4600 osdmap
.get_epoch());
4601 MOSDMap
*m
= build_incremental(first
, last
, features
);
4604 // send some maps. it may not be all of them, but it will get them
4606 mon
.send_reply(req
, m
);
4608 session
->con
->send_message(m
);
4611 session
->osd_epoch
= last
;
4617 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4619 return get_version(ver
, mon
.get_quorum_con_features(), bl
);
4622 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4624 OSDMap::Incremental inc
;
4625 auto q
= bl
.cbegin();
4627 // always encode with subset of osdmap's canonical features
4628 uint64_t f
= features
& inc
.encode_features
;
4629 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4632 if (inc
.fullmap
.length()) {
4633 // embedded full map?
4635 m
.decode(inc
.fullmap
);
4636 inc
.fullmap
.clear();
4637 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4639 if (inc
.crush
.length()) {
4640 // embedded crush map
4642 auto p
= inc
.crush
.cbegin();
4645 c
.encode(inc
.crush
, f
);
4647 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4650 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4653 auto q
= bl
.cbegin();
4655 // always encode with subset of osdmap's canonical features
4656 uint64_t f
= features
& m
.get_encoding_features();
4657 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4660 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4663 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
4665 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4666 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4669 int ret
= PaxosService::get_version(ver
, bl
);
4673 // NOTE: this check is imprecise; the OSDMap encoding features may
4674 // be a subset of the latest mon quorum features, but worst case we
4675 // reencode once and then cache the (identical) result under both
4677 if (significant_features
!=
4678 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4679 reencode_incremental_map(bl
, features
);
4681 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4685 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4688 int err
= get_version(ver
, inc_bl
);
4689 ceph_assert(err
== 0);
4690 ceph_assert(inc_bl
.length());
4692 auto p
= inc_bl
.cbegin();
4694 dout(10) << __func__
<< " "
4695 << " epoch " << inc
.epoch
4696 << " inc_crc " << inc
.inc_crc
4697 << " full_crc " << inc
.full_crc
4698 << " encode_features " << inc
.encode_features
<< dendl
;
4702 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4704 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4706 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4707 if (closest_pinned
== 0) {
4710 if (closest_pinned
> ver
) {
4711 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4713 ceph_assert(closest_pinned
<= ver
);
4715 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4717 // get osdmap incremental maps and apply on top of this one.
4719 bool has_cached_osdmap
= false;
4720 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4721 if (full_osd_cache
.lookup({v
, mon
.get_quorum_con_features()},
4723 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4725 has_cached_osdmap
= true;
4730 if (!has_cached_osdmap
) {
4731 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4733 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4734 << " not available! error: " << cpp_strerror(err
) << dendl
;
4736 ceph_assert(err
== 0);
4739 ceph_assert(osdm_bl
.length());
4742 osdm
.decode(osdm_bl
);
4744 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4745 << " e" << osdm
.epoch
4746 << " crc " << osdm
.get_crc()
4747 << " -- applying incremental maps." << dendl
;
4749 uint64_t encode_features
= 0;
4750 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4751 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4753 OSDMap::Incremental inc
;
4754 int err
= get_inc(v
, inc
);
4755 ceph_assert(err
== 0);
4757 encode_features
= inc
.encode_features
;
4759 err
= osdm
.apply_incremental(inc
);
4760 ceph_assert(err
== 0);
4762 // this block performs paranoid checks on map retrieval
4763 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4764 inc
.full_crc
!= 0) {
4766 uint64_t f
= encode_features
;
4768 f
= (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4771 // encode osdmap to force calculating crcs
4773 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4774 // decode osdmap to compare crcs with what's expected by incremental
4778 if (tosdm
.get_crc() != inc
.full_crc
) {
4780 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4781 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4782 ceph_abort_msg("osdmap crc mismatch");
4786 // note: we cannot add the recently computed map to the cache, as is,
4787 // because we have not encoded the map into a bl.
4790 if (!encode_features
) {
4791 dout(10) << __func__
4792 << " last incremental map didn't have features;"
4793 << " defaulting to quorum's or all" << dendl
;
4795 (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4797 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
4802 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4804 return get_version_full(ver
, mon
.get_quorum_con_features(), bl
);
4807 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4810 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4811 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4814 int ret
= PaxosService::get_version_full(ver
, bl
);
4815 if (ret
== -ENOENT
) {
4817 ret
= get_full_from_pinned_map(ver
, bl
);
4822 // NOTE: this check is imprecise; the OSDMap encoding features may
4823 // be a subset of the latest mon quorum features, but worst case we
4824 // reencode once and then cache the (identical) result under both
4826 if (significant_features
!=
4827 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4828 reencode_full_map(bl
, features
);
4830 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4834 epoch_t
OSDMonitor::blocklist(const entity_addrvec_t
& av
, utime_t until
)
4836 dout(10) << "blocklist " << av
<< " until " << until
<< dendl
;
4837 for (auto a
: av
.v
) {
4838 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4839 a
.set_type(entity_addr_t::TYPE_ANY
);
4841 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4843 pending_inc
.new_blocklist
[a
] = until
;
4845 return pending_inc
.epoch
;
4848 epoch_t
OSDMonitor::blocklist(entity_addr_t a
, utime_t until
)
4850 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4851 a
.set_type(entity_addr_t::TYPE_ANY
);
4853 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4855 dout(10) << "blocklist " << a
<< " until " << until
<< dendl
;
4856 pending_inc
.new_blocklist
[a
] = until
;
4857 return pending_inc
.epoch
;
4861 void OSDMonitor::check_osdmap_subs()
4863 dout(10) << __func__
<< dendl
;
4864 if (!osdmap
.get_epoch()) {
4867 auto osdmap_subs
= mon
.session_map
.subs
.find("osdmap");
4868 if (osdmap_subs
== mon
.session_map
.subs
.end()) {
4871 auto p
= osdmap_subs
->second
->begin();
4875 check_osdmap_sub(sub
);
4879 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4881 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4882 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4883 if (sub
->next
<= osdmap
.get_epoch()) {
4885 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4887 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4889 mon
.session_map
.remove_sub(sub
);
4891 sub
->next
= osdmap
.get_epoch() + 1;
4895 void OSDMonitor::check_pg_creates_subs()
4897 if (!osdmap
.get_num_up_osds()) {
4900 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4901 mon
.with_session_map([this](const MonSessionMap
& session_map
) {
4902 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4903 if (pg_creates_subs
== session_map
.subs
.end()) {
4906 for (auto sub
: *pg_creates_subs
->second
) {
4907 check_pg_creates_sub(sub
);
4912 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4914 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4915 ceph_assert(sub
->type
== "osd_pg_creates");
4916 // only send these if the OSD is up. we will check_subs() when they do
4917 // come up so they will get the creates then.
4918 if (sub
->session
->name
.is_osd() &&
4919 mon
.osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4920 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4921 sub
->session
->con
.get(),
4926 void OSDMonitor::do_application_enable(int64_t pool_id
,
4927 const std::string
&app_name
,
4928 const std::string
&app_key
,
4929 const std::string
&app_value
,
4932 ceph_assert(paxos
.is_plugged() && is_writeable());
4934 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4937 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4939 auto pp
= osdmap
.get_pg_pool(pool_id
);
4940 ceph_assert(pp
!= nullptr);
4943 if (pending_inc
.new_pools
.count(pool_id
)) {
4944 p
= pending_inc
.new_pools
[pool_id
];
4947 if (app_key
.empty()) {
4948 p
.application_metadata
.insert({app_name
, {}});
4951 p
.application_metadata
[app_name
][app_key
] = app_value
;
4953 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4956 p
.last_change
= pending_inc
.epoch
;
4957 pending_inc
.new_pools
[pool_id
] = p
;
4960 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4961 pool_opts_t::key_t opt
,
4962 pool_opts_t::value_t val
)
4964 dout(10) << __func__
<< " pool: " << pool_id
<< " option: " << opt
4965 << " val: " << val
<< dendl
;
4966 auto p
= pending_inc
.new_pools
.try_emplace(
4967 pool_id
, *osdmap
.get_pg_pool(pool_id
));
4968 p
.first
->second
.opts
.set(opt
, val
);
4971 unsigned OSDMonitor::scan_for_creating_pgs(
4972 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4973 const mempool::osdmap::set
<int64_t>& removed_pools
,
4975 creating_pgs_t
* creating_pgs
) const
4977 unsigned queued
= 0;
4978 for (auto& p
: pools
) {
4979 int64_t poolid
= p
.first
;
4980 if (creating_pgs
->created_pools
.count(poolid
)) {
4981 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4984 const pg_pool_t
& pool
= p
.second
;
4985 int ruleno
= pool
.get_crush_rule();
4986 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4989 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4990 const auto created
= pool
.get_last_change();
4991 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4992 dout(10) << __func__
<< " no change in pool " << poolid
4993 << " " << pool
<< dendl
;
4996 if (removed_pools
.count(poolid
)) {
4997 dout(10) << __func__
<< " pool is being removed: " << poolid
4998 << " " << pool
<< dendl
;
5001 dout(10) << __func__
<< " queueing pool create for " << poolid
5002 << " " << pool
<< dendl
;
5003 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
5010 void OSDMonitor::update_creating_pgs()
5012 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
5013 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
5014 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
5015 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
5016 for (const auto& pg
: creating_pgs
.pgs
) {
5017 int acting_primary
= -1;
5018 auto pgid
= pg
.first
;
5019 if (!osdmap
.pg_exists(pgid
)) {
5020 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
5024 auto mapped
= pg
.second
.create_epoch
;
5025 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
5027 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
5028 // check the previous creating_pgs, look for the target to whom the pg was
5029 // previously mapped
5030 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
5031 const auto last_acting_primary
= pgs_by_epoch
.first
;
5032 for (auto& pgs
: pgs_by_epoch
.second
) {
5033 if (pgs
.second
.count(spgid
)) {
5034 if (last_acting_primary
== acting_primary
) {
5037 dout(20) << __func__
<< " " << pgid
<< " "
5038 << " acting_primary:" << last_acting_primary
5039 << " -> " << acting_primary
<< dendl
;
5040 // note epoch if the target of the create message changed.
5041 mapped
= mapping
.get_epoch();
5046 mapped
= mapping
.get_epoch();
5050 dout(10) << __func__
<< " will instruct osd." << acting_primary
5051 << " to create " << pgid
<< "@" << mapped
<< dendl
;
5052 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
5054 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
5055 creating_pgs_epoch
= mapping
.get_epoch();
5058 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
5060 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
5061 << " " << creating_pgs_by_osd_epoch
<< dendl
;
5062 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
5063 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
5064 dout(20) << __func__
5065 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
5066 // the subscribers will be updated when the mapping is completed anyway
5069 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
5070 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
5072 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
5074 auto m
= make_message
<MOSDPGCreate2
>(creating_pgs_epoch
);
5077 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
5078 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
5079 auto epoch
= epoch_pgs
->first
;
5080 auto& pgs
= epoch_pgs
->second
;
5081 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5082 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
5084 for (auto& pg
: pgs
) {
5085 // Need the create time from the monitor using its clock to set
5086 // last_scrub_stamp upon pg creation.
5087 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
5088 ceph_assert(create
!= creating_pgs
.pgs
.end());
5089 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
5090 create
->second
.create_stamp
));
5091 if (create
->second
.history
.epoch_created
) {
5092 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
5093 << " " << create
->second
.past_intervals
<< dendl
;
5094 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
5095 create
->second
.past_intervals
));
5097 dout(20) << __func__
<< " will create " << pg
5098 << " at " << create
->second
.create_epoch
<< dendl
;
5101 if (!m
->pgs
.empty()) {
5102 con
->send_message2(std::move(m
));
5104 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5105 << " has nothing to send" << dendl
;
5109 // sub is current through last + 1
5116 void OSDMonitor::tick()
5118 if (!is_active()) return;
5120 dout(10) << osdmap
<< dendl
;
5122 // always update osdmap manifest, regardless of being the leader.
5123 load_osdmap_manifest();
5125 // always tune priority cache manager memory on leader and peons
5126 if (ceph_using_tcmalloc() && mon_memory_autotune
) {
5127 std::lock_guard
l(balancer_lock
);
5128 if (pcm
!= nullptr) {
5131 _set_new_cache_sizes();
5132 dout(10) << "tick balancer "
5133 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
5134 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
5135 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
5136 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
5138 dout(10) << "tick balancer "
5139 << " full cache_bytes: " << full_cache
->get_cache_bytes()
5140 << " full comtd_bytes: " << full_cache
->get_committed_size()
5141 << " full used_bytes: " << full_cache
->_get_used_bytes()
5142 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
5147 if (!mon
.is_leader()) return;
5149 bool do_propose
= false;
5150 utime_t now
= ceph_clock_now();
5152 if (handle_osd_timeouts(now
, last_osd_report
)) {
5157 if (check_failures(now
)) {
5161 // Force a proposal if we need to prune; pruning is performed on
5162 // ``encode_pending()``, hence why we need to regularly trigger a proposal
5163 // even if there's nothing going on.
5164 if (is_prune_enabled() && should_prune()) {
5168 // mark down osds out?
5170 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
5171 * influence at all. The decision is made based on the ratio of "in" osds,
5172 * and the function returns false if this ratio is lower that the minimum
5173 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
5175 if (can_mark_out(-1)) {
5176 string down_out_subtree_limit
= g_conf().get_val
<string
>(
5177 "mon_osd_down_out_subtree_limit");
5178 set
<int> down_cache
; // quick cache of down subtrees
5180 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
5181 while (i
!= down_pending_out
.end()) {
5187 if (osdmap
.is_down(o
) &&
5190 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
5191 utime_t grace
= orig_grace
;
5192 double my_grace
= 0.0;
5194 if (g_conf()->mon_osd_adjust_down_out_interval
) {
5195 // scale grace period the same way we do the heartbeat grace.
5196 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
5197 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
5198 double decay_k
= ::log(.5) / halflife
;
5199 double decay
= exp((double)down
* decay_k
);
5200 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
5201 << " down for " << down
<< " decay " << decay
<< dendl
;
5202 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
5206 // is this an entire large subtree down?
5207 if (down_out_subtree_limit
.length()) {
5208 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
5210 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
5211 dout(10) << "tick entire containing " << down_out_subtree_limit
5212 << " subtree for osd." << o
5213 << " is down; resetting timer" << dendl
;
5214 // reset timer, too.
5215 down_pending_out
[o
] = now
;
5221 bool down_out
= !osdmap
.is_destroyed(o
) &&
5222 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5223 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5224 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5225 // this is not precise enough as we did not make a note when this osd
5226 // was marked as destroyed, but let's not bother with that
5227 // complexity for now.
5228 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5229 if (down_out
|| destroyed_out
) {
5230 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5231 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5232 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5234 // set the AUTOOUT bit.
5235 if (pending_inc
.new_state
.count(o
) == 0)
5236 pending_inc
.new_state
[o
] = 0;
5237 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5239 // remember previous weight
5240 if (pending_inc
.new_xinfo
.count(o
) == 0)
5241 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5242 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5246 mon
.clog
->info() << "Marking osd." << o
<< " out (has been down for "
5247 << int(down
.sec()) << " seconds)";
5252 down_pending_out
.erase(o
);
5255 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5258 // expire blocklisted items?
5259 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
5260 p
!= osdmap
.blocklist
.end();
5262 if (p
->second
< now
) {
5263 dout(10) << "expiring blocklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5264 pending_inc
.old_blocklist
.push_back(p
->first
);
5268 for (auto p
= osdmap
.range_blocklist
.begin();
5269 p
!= osdmap
.range_blocklist
.end();
5271 if (p
->second
< now
) {
5272 dout(10) << "expiring range_blocklist item " << p
->first
5273 << " expired " << p
->second
<< " < now " << now
<< dendl
;
5274 pending_inc
.old_range_blocklist
.push_back(p
->first
);
5279 if (try_prune_purged_snaps()) {
5283 if (update_pools_status())
5287 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
5291 void OSDMonitor::_set_new_cache_sizes()
5293 uint64_t cache_size
= 0;
5294 int64_t inc_alloc
= 0;
5295 int64_t full_alloc
= 0;
5296 int64_t kv_alloc
= 0;
5298 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5299 cache_size
= pcm
->get_tuned_mem();
5300 inc_alloc
= inc_cache
->get_committed_size();
5301 full_alloc
= full_cache
->get_committed_size();
5302 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
5305 inc_osd_cache
.set_bytes(inc_alloc
);
5306 full_osd_cache
.set_bytes(full_alloc
);
5308 dout(1) << __func__
<< " cache_size:" << cache_size
5309 << " inc_alloc: " << inc_alloc
5310 << " full_alloc: " << full_alloc
5311 << " kv_alloc: " << kv_alloc
5315 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5316 std::map
<int, std::pair
<utime_t
, int>> &last_osd_report
)
5318 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
5319 if (now
- mon
.get_leader_since() < timeo
) {
5320 // We haven't been the leader for long enough to consider OSD timeouts
5324 int max_osd
= osdmap
.get_max_osd();
5325 bool new_down
= false;
5327 for (int i
=0; i
< max_osd
; ++i
) {
5328 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5329 if (!osdmap
.exists(i
)) {
5330 last_osd_report
.erase(i
); // if any
5333 if (!osdmap
.is_up(i
))
5335 const std::map
<int, std::pair
<utime_t
, int>>::const_iterator t
= last_osd_report
.find(i
);
5336 if (t
== last_osd_report
.end()) {
5337 // it wasn't in the map; start the timer.
5338 last_osd_report
[i
].first
= now
;
5339 last_osd_report
[i
].second
= 0;
5340 } else if (can_mark_down(i
)) {
5341 utime_t diff
= now
- t
->second
.first
;
5342 // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
5343 // to allow for the osd to miss a beacon.
5344 int mon_osd_report_timeout
= g_conf()->mon_osd_report_timeout
;
5345 utime_t
max_timeout(std::max(mon_osd_report_timeout
, 2 * t
->second
.second
), 0);
5346 if (diff
> max_timeout
) {
5347 mon
.clog
->info() << "osd." << i
<< " marked down after no beacon for "
5348 << diff
<< " seconds";
5349 derr
<< "no beacon from osd." << i
<< " since " << t
->second
.first
5350 << ", " << diff
<< " seconds ago. marking down" << dendl
;
5351 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
5359 static void dump_cpu_list(Formatter
*f
, const char *name
,
5360 const string
& strlist
)
5363 size_t cpu_set_size
;
5364 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
5367 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5368 f
->open_array_section(name
);
5369 for (auto cpu
: cpus
) {
5370 f
->dump_int("cpu", cpu
);
5375 void OSDMonitor::dump_info(Formatter
*f
)
5377 f
->open_object_section("osdmap");
5378 osdmap
.dump(f
, cct
);
5381 f
->open_array_section("osd_metadata");
5382 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5383 if (osdmap
.exists(i
)) {
5384 f
->open_object_section("osd");
5385 f
->dump_unsigned("id", i
);
5386 dump_osd_metadata(i
, f
, NULL
);
5392 f
->open_object_section("osdmap_clean_epochs");
5393 f
->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5395 f
->open_object_section("last_epoch_clean");
5396 last_epoch_clean
.dump(f
);
5399 f
->open_array_section("osd_epochs");
5400 for (auto& osd_epoch
: osd_epochs
) {
5401 f
->open_object_section("osd");
5402 f
->dump_unsigned("id", osd_epoch
.first
);
5403 f
->dump_unsigned("epoch", osd_epoch
.second
);
5406 f
->close_section(); // osd_epochs
5408 f
->close_section(); // osd_clean_epochs
5410 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5411 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5413 f
->open_object_section("crushmap");
5414 osdmap
.crush
->dump(f
);
5417 if (has_osdmap_manifest
) {
5418 f
->open_object_section("osdmap_manifest");
5419 osdmap_manifest
.dump(f
);
// All keys accepted by "osd pool get"/"osd pool set"; used to map the
// command's string variable onto a switchable choice.
// NOTE(review): the extraction dropped the first enumerator line; SIZE and
// MIN_SIZE are restored here from the upstream file -- confirm.
enum osd_pool_get_choices {
  SIZE, MIN_SIZE,
  PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
  NODELETE, NOPGCHANGE, NOSIZECHANGE,
  WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
  HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
  USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
  CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
  CACHE_TARGET_FULL_RATIO,
  CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
  ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
  MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
  HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
  SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
  RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
  COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
  COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
  CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
  PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
  PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
  DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX
};
5447 std::set
<osd_pool_get_choices
>
5448 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5449 const std::set
<osd_pool_get_choices
>& second
)
5451 std::set
<osd_pool_get_choices
> result
;
5452 std::set_difference(first
.begin(), first
.end(),
5453 second
.begin(), second
.end(),
5454 std::inserter(result
, result
.end()));
5460 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5462 op
->mark_osdmon_event(__func__
);
5463 auto m
= op
->get_req
<MMonCommand
>();
5466 stringstream ss
, ds
;
5469 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5470 string rs
= ss
.str();
5471 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
5475 MonSession
*session
= op
->get_session();
5477 derr
<< __func__
<< " no session" << dendl
;
5478 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
5483 cmd_getval(cmdmap
, "prefix", prefix
);
5485 string format
= cmd_getval_or
<string
>(cmdmap
, "format", "plain");
5486 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5488 if (prefix
== "osd stat") {
5490 f
->open_object_section("osdmap");
5491 osdmap
.print_summary(f
.get(), ds
, "", true);
5495 osdmap
.print_summary(nullptr, ds
, "", true);
5499 else if (prefix
== "osd dump" ||
5500 prefix
== "osd tree" ||
5501 prefix
== "osd tree-from" ||
5502 prefix
== "osd ls" ||
5503 prefix
== "osd getmap" ||
5504 prefix
== "osd getcrushmap" ||
5505 prefix
== "osd ls-tree" ||
5506 prefix
== "osd info") {
5508 epoch_t epoch
= cmd_getval_or
<int64_t>(cmdmap
, "epoch", osdmap
.get_epoch());
5509 bufferlist osdmap_bl
;
5510 int err
= get_version_full(epoch
, osdmap_bl
);
5511 if (err
== -ENOENT
) {
5513 ss
<< "there is no map for epoch " << epoch
;
5516 ceph_assert(err
== 0);
5517 ceph_assert(osdmap_bl
.length());
5520 if (epoch
== osdmap
.get_epoch()) {
5524 p
->decode(osdmap_bl
);
5527 auto sg
= make_scope_guard([&] {
5533 if (prefix
== "osd dump") {
5536 f
->open_object_section("osdmap");
5537 p
->dump(f
.get(), cct
);
5546 } else if (prefix
== "osd ls") {
5548 f
->open_array_section("osds");
5549 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5550 if (osdmap
.exists(i
)) {
5551 f
->dump_int("osd", i
);
5558 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5559 if (osdmap
.exists(i
)) {
5568 } else if (prefix
== "osd info") {
5570 bool do_single_osd
= true;
5571 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5572 do_single_osd
= false;
5575 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5576 ss
<< "osd." << osd_id
<< " does not exist";
5582 if (do_single_osd
) {
5583 osdmap
.dump_osd(osd_id
, f
.get());
5585 osdmap
.dump_osds(f
.get());
5589 if (do_single_osd
) {
5590 osdmap
.print_osd(osd_id
, ds
);
5592 osdmap
.print_osds(ds
);
5596 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5598 if (prefix
== "osd tree-from") {
5599 cmd_getval(cmdmap
, "bucket", bucket
);
5600 if (!osdmap
.crush
->name_exists(bucket
)) {
5601 ss
<< "bucket '" << bucket
<< "' does not exist";
5605 int id
= osdmap
.crush
->get_item_id(bucket
);
5607 ss
<< "\"" << bucket
<< "\" is not a bucket";
5613 vector
<string
> states
;
5614 cmd_getval(cmdmap
, "states", states
);
5615 unsigned filter
= 0;
5616 for (auto& s
: states
) {
5618 filter
|= OSDMap::DUMP_UP
;
5619 } else if (s
== "down") {
5620 filter
|= OSDMap::DUMP_DOWN
;
5621 } else if (s
== "in") {
5622 filter
|= OSDMap::DUMP_IN
;
5623 } else if (s
== "out") {
5624 filter
|= OSDMap::DUMP_OUT
;
5625 } else if (s
== "destroyed") {
5626 filter
|= OSDMap::DUMP_DESTROYED
;
5628 ss
<< "unrecognized state '" << s
<< "'";
5633 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5634 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5635 ss
<< "cannot specify both 'in' and 'out'";
5639 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5640 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5641 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5642 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5643 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5644 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5645 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5650 f
->open_object_section("tree");
5651 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5655 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5658 } else if (prefix
== "osd getmap") {
5659 rdata
.append(osdmap_bl
);
5660 ss
<< "got osdmap epoch " << p
->get_epoch();
5661 } else if (prefix
== "osd getcrushmap") {
5662 p
->crush
->encode(rdata
, mon
.get_quorum_con_features());
5663 ss
<< p
->get_crush_version();
5664 } else if (prefix
== "osd ls-tree") {
5666 cmd_getval(cmdmap
, "name", bucket_name
);
5668 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5670 ss
<< "\"" << bucket_name
<< "\" does not exist";
5673 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5678 f
->open_array_section("osds");
5679 for (auto &i
: osds
) {
5680 if (osdmap
.exists(i
)) {
5681 f
->dump_int("osd", i
);
5688 for (auto &i
: osds
) {
5689 if (osdmap
.exists(i
)) {
5700 } else if (prefix
== "osd getmaxosd") {
5702 f
->open_object_section("getmaxosd");
5703 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5704 f
->dump_int("max_osd", osdmap
.get_max_osd());
5708 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5711 } else if (prefix
== "osd utilization") {
5713 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5720 } else if (prefix
== "osd find") {
5722 if (!cmd_getval(cmdmap
, "id", osd
)) {
5723 ss
<< "unable to parse osd id value '"
5724 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5728 if (!osdmap
.exists(osd
)) {
5729 ss
<< "osd." << osd
<< " does not exist";
5734 cmd_getval(cmdmap
, "format", format
);
5735 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5736 f
->open_object_section("osd_location");
5737 f
->dump_int("osd", osd
);
5738 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5739 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5741 // try to identify host, pod/container name, etc.
5742 map
<string
,string
> m
;
5743 load_metadata(osd
, m
, nullptr);
5744 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5745 f
->dump_string("host", p
->second
);
5748 "pod_name", "pod_namespace", // set by rook
5749 "container_name" // set by cephadm, ceph-ansible
5751 if (auto p
= m
.find(k
); p
!= m
.end()) {
5752 f
->dump_string(k
, p
->second
);
5756 // crush is helpful too
5757 f
->open_object_section("crush_location");
5758 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5759 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5760 f
->dump_string(p
->first
.c_str(), p
->second
);
5764 } else if (prefix
== "osd metadata") {
5766 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5767 !cmd_getval(cmdmap
, "id", osd
)) {
5768 ss
<< "unable to parse osd id value '"
5769 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5773 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5774 ss
<< "osd." << osd
<< " does not exist";
5779 cmd_getval(cmdmap
, "format", format
);
5780 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5782 f
->open_object_section("osd_metadata");
5783 f
->dump_unsigned("id", osd
);
5784 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5790 f
->open_array_section("osd_metadata");
5791 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5792 if (osdmap
.exists(i
)) {
5793 f
->open_object_section("osd");
5794 f
->dump_unsigned("id", i
);
5795 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5796 if (r
== -EINVAL
|| r
== -ENOENT
) {
5797 // Drop error, continue to get other daemons' metadata
5798 dout(4) << "No metadata for osd." << i
<< dendl
;
5810 } else if (prefix
== "osd versions") {
5812 f
.reset(Formatter::create("json-pretty"));
5813 count_metadata("ceph_version", f
.get());
5816 } else if (prefix
== "osd count-metadata") {
5818 f
.reset(Formatter::create("json-pretty"));
5820 cmd_getval(cmdmap
, "property", field
);
5821 count_metadata(field
, f
.get());
5824 } else if (prefix
== "osd numa-status") {
5827 f
->open_array_section("osds");
5829 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5830 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5831 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5832 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5833 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5834 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5836 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5837 if (osdmap
.exists(i
)) {
5838 map
<string
,string
> m
;
5840 if (load_metadata(i
, m
, &err
) < 0) {
5844 auto p
= m
.find("hostname");
5849 f
->open_object_section("osd");
5850 f
->dump_int("osd", i
);
5851 f
->dump_string("host", host
);
5852 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5856 f
->dump_int(n
, atoi(p
->second
.c_str()));
5859 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5862 list
<string
> ls
= get_str_list(p
->second
, ",");
5863 f
->open_array_section(n
);
5864 for (auto node
: ls
) {
5865 f
->dump_int("node", atoi(node
.c_str()));
5870 for (auto n
: { "numa_node_cpus" }) {
5873 dump_cpu_list(f
.get(), n
, p
->second
);
5880 p
= m
.find("network_numa_nodes");
5886 p
= m
.find("objectstore_numa_nodes");
5892 p
= m
.find("numa_node");
5893 auto q
= m
.find("numa_node_cpus");
5894 if (p
!= m
.end() && q
!= m
.end()) {
5901 tbl
<< TextTable::endrow
;
5909 rdata
.append(stringify(tbl
));
5911 } else if (prefix
== "osd map") {
5912 string poolstr
, objstr
, namespacestr
;
5913 cmd_getval(cmdmap
, "pool", poolstr
);
5914 cmd_getval(cmdmap
, "object", objstr
);
5915 cmd_getval(cmdmap
, "nspace", namespacestr
);
5917 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5919 ss
<< "pool " << poolstr
<< " does not exist";
5923 object_locator_t
oloc(pool
, namespacestr
);
5924 object_t
oid(objstr
);
5925 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5926 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5927 vector
<int> up
, acting
;
5929 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5932 if (!namespacestr
.empty())
5933 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5935 fullobjname
= oid
.name
;
5937 f
->open_object_section("osd_map");
5938 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5939 f
->dump_string("pool", poolstr
);
5940 f
->dump_int("pool_id", pool
);
5941 f
->dump_stream("objname") << fullobjname
;
5942 f
->dump_stream("raw_pgid") << pgid
;
5943 f
->dump_stream("pgid") << mpgid
;
5944 f
->open_array_section("up");
5945 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5946 f
->dump_int("osd", *p
);
5948 f
->dump_int("up_primary", up_p
);
5949 f
->open_array_section("acting");
5950 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5951 f
->dump_int("osd", *p
);
5953 f
->dump_int("acting_primary", acting_p
);
5954 f
->close_section(); // osd_map
5957 ds
<< "osdmap e" << osdmap
.get_epoch()
5958 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5959 << " object '" << fullobjname
<< "' ->"
5960 << " pg " << pgid
<< " (" << mpgid
<< ")"
5961 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5962 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5966 } else if (prefix
== "pg map") {
5968 vector
<int> up
, acting
;
5969 r
= parse_pgid(cmdmap
, ss
, pgid
);
5972 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5973 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5975 f
->open_object_section("pg_map");
5976 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5977 f
->dump_stream("raw_pgid") << pgid
;
5978 f
->dump_stream("pgid") << mpgid
;
5979 f
->open_array_section("up");
5980 for (auto osd
: up
) {
5981 f
->dump_int("up_osd", osd
);
5984 f
->open_array_section("acting");
5985 for (auto osd
: acting
) {
5986 f
->dump_int("acting_osd", osd
);
5992 ds
<< "osdmap e" << osdmap
.get_epoch()
5993 << " pg " << pgid
<< " (" << mpgid
<< ")"
5994 << " -> up " << up
<< " acting " << acting
;
5999 } else if (prefix
== "osd lspools") {
6001 f
->open_array_section("pools");
6002 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
6003 p
!= osdmap
.pools
.end();
6006 f
->open_object_section("pool");
6007 f
->dump_int("poolnum", p
->first
);
6008 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
6011 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
6012 if (next(p
) != osdmap
.pools
.end()) {
6022 } else if (prefix
== "osd blocklist ls" ||
6023 prefix
== "osd blacklist ls") {
6025 f
->open_array_section("blocklist");
6027 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
6028 p
!= osdmap
.blocklist
.end();
6031 f
->open_object_section("entry");
6032 f
->dump_string("addr", p
->first
.get_legacy_str());
6033 f
->dump_stream("until") << p
->second
;
6038 ss
<< p
->first
<< " " << p
->second
;
6049 f
->open_array_section("range_blocklist");
6051 for (auto p
= osdmap
.range_blocklist
.begin();
6052 p
!= osdmap
.range_blocklist
.end();
6055 f
->open_object_section("entry");
6056 f
->dump_string("range", p
->first
.get_legacy_str());
6057 f
->dump_stream("until") << p
->second
;
6062 ss
<< p
->first
<< " " << p
->second
;
6072 ss
<< "listed " << osdmap
.blocklist
.size() + osdmap
.range_blocklist
.size() << " entries";
6074 } else if (prefix
== "osd pool ls") {
6076 cmd_getval(cmdmap
, "detail", detail
);
6077 if (!f
&& detail
== "detail") {
6079 osdmap
.print_pools(cct
, ss
);
6080 rdata
.append(ss
.str());
6083 f
->open_array_section("pools");
6084 for (auto &[pid
, pdata
] : osdmap
.get_pools()) {
6086 if (detail
== "detail") {
6087 f
->open_object_section("pool");
6088 f
->dump_int("pool_id", pid
);
6089 f
->dump_string("pool_name", osdmap
.get_pool_name(pid
));
6090 pdata
.dump(f
.get());
6091 osdmap
.dump_read_balance_score(cct
, pid
, pdata
, f
.get());
6094 f
->dump_string("pool_name", osdmap
.get_pool_name(pid
));
6097 rdata
.append(osdmap
.get_pool_name(pid
) + "\n");
6106 } else if (prefix
== "osd crush get-tunable") {
6108 cmd_getval(cmdmap
, "tunable", tunable
);
6111 f
->open_object_section("tunable");
6112 if (tunable
== "straw_calc_version") {
6114 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
6116 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
6125 rdata
.append(rss
.str());
6129 } else if (prefix
== "osd pool get") {
6131 cmd_getval(cmdmap
, "pool", poolstr
);
6132 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
6134 ss
<< "unrecognized pool '" << poolstr
<< "'";
6139 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
6141 cmd_getval(cmdmap
, "var", var
);
6143 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
6144 const choices_map_t ALL_CHOICES
= {
6146 {"min_size", MIN_SIZE
},
6147 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
6148 {"crush_rule", CRUSH_RULE
},
6149 {"hashpspool", HASHPSPOOL
},
6151 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
6152 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
6153 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
6154 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
6155 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
6156 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
6157 {"use_gmt_hitset", USE_GMT_HITSET
},
6158 {"target_max_objects", TARGET_MAX_OBJECTS
},
6159 {"target_max_bytes", TARGET_MAX_BYTES
},
6160 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
6161 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
6162 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
6163 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
6164 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
6165 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
6166 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
6167 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
6168 {"fast_read", FAST_READ
},
6169 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
6170 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
6171 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
6172 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
6173 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
6174 {"recovery_priority", RECOVERY_PRIORITY
},
6175 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
6176 {"scrub_priority", SCRUB_PRIORITY
},
6177 {"compression_mode", COMPRESSION_MODE
},
6178 {"compression_algorithm", COMPRESSION_ALGORITHM
},
6179 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
6180 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
6181 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
6182 {"csum_type", CSUM_TYPE
},
6183 {"csum_max_block", CSUM_MAX_BLOCK
},
6184 {"csum_min_block", CSUM_MIN_BLOCK
},
6185 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
6186 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
6187 {"pg_num_min", PG_NUM_MIN
},
6188 {"pg_num_max", PG_NUM_MAX
},
6189 {"target_size_bytes", TARGET_SIZE_BYTES
},
6190 {"target_size_ratio", TARGET_SIZE_RATIO
},
6191 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
6192 {"dedup_tier", DEDUP_TIER
},
6193 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM
},
6194 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE
},
6198 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
6200 const choices_set_t ONLY_TIER_CHOICES
= {
6201 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
6202 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
6203 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
6204 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
6205 MIN_READ_RECENCY_FOR_PROMOTE
,
6206 MIN_WRITE_RECENCY_FOR_PROMOTE
,
6207 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
6209 const choices_set_t ONLY_ERASURE_CHOICES
= {
6210 EC_OVERWRITES
, ERASURE_CODE_PROFILE
6213 choices_set_t selected_choices
;
6215 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
6216 it
!= ALL_CHOICES
.end(); ++it
) {
6217 selected_choices
.insert(it
->second
);
6221 selected_choices
= subtract_second_from_first(selected_choices
,
6225 if(!p
->is_erasure()) {
6226 selected_choices
= subtract_second_from_first(selected_choices
,
6227 ONLY_ERASURE_CHOICES
);
6229 } else /* var != "all" */ {
6230 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
6231 if (found
== ALL_CHOICES
.end()) {
6232 ss
<< "pool '" << poolstr
6233 << "': invalid variable: '" << var
<< "'";
6238 osd_pool_get_choices selected
= found
->second
;
6240 if (!p
->is_tier() &&
6241 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
6242 ss
<< "pool '" << poolstr
6243 << "' is not a tier pool: variable not applicable";
6248 if (!p
->is_erasure() &&
6249 ONLY_ERASURE_CHOICES
.find(selected
)
6250 != ONLY_ERASURE_CHOICES
.end()) {
6251 ss
<< "pool '" << poolstr
6252 << "' is not a erasure pool: variable not applicable";
6257 if (pool_opts_t::is_opt_name(var
) &&
6258 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6259 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6264 selected_choices
.insert(selected
);
6268 f
->open_object_section("pool");
6269 f
->dump_string("pool", poolstr
);
6270 f
->dump_int("pool_id", pool
);
6271 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6272 it
!= selected_choices
.end(); ++it
) {
6273 choices_map_t::const_iterator i
;
6274 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6275 if (i
->second
== *it
) {
6279 ceph_assert(i
!= ALL_CHOICES
.end());
6282 f
->dump_int("pg_num", p
->get_pg_num());
6285 f
->dump_int("pgp_num", p
->get_pgp_num());
6288 f
->dump_int("size", p
->get_size());
6291 f
->dump_int("min_size", p
->get_min_size());
6294 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6295 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6296 p
->get_crush_rule()));
6298 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6302 f
->dump_bool("allow_ec_overwrites",
6303 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6305 case PG_AUTOSCALE_MODE
:
6306 f
->dump_string("pg_autoscale_mode",
6307 pg_pool_t::get_pg_autoscale_mode_name(
6308 p
->pg_autoscale_mode
));
6316 case WRITE_FADVISE_DONTNEED
:
6319 f
->dump_bool(i
->first
.c_str(),
6320 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6322 case HIT_SET_PERIOD
:
6323 f
->dump_int("hit_set_period", p
->hit_set_period
);
6326 f
->dump_int("hit_set_count", p
->hit_set_count
);
6329 f
->dump_string("hit_set_type",
6330 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6334 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6335 BloomHitSet::Params
*bloomp
=
6336 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6337 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6338 } else if(var
!= "all") {
6340 ss
<< "hit set is not of type Bloom; " <<
6341 "invalid to get a false positive rate!";
6347 case USE_GMT_HITSET
:
6348 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6350 case TARGET_MAX_OBJECTS
:
6351 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6353 case TARGET_MAX_BYTES
:
6354 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6356 case CACHE_TARGET_DIRTY_RATIO
:
6357 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6358 p
->cache_target_dirty_ratio_micro
);
6359 f
->dump_float("cache_target_dirty_ratio",
6360 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6362 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6363 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6364 p
->cache_target_dirty_high_ratio_micro
);
6365 f
->dump_float("cache_target_dirty_high_ratio",
6366 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6368 case CACHE_TARGET_FULL_RATIO
:
6369 f
->dump_unsigned("cache_target_full_ratio_micro",
6370 p
->cache_target_full_ratio_micro
);
6371 f
->dump_float("cache_target_full_ratio",
6372 ((float)p
->cache_target_full_ratio_micro
/1000000));
6374 case CACHE_MIN_FLUSH_AGE
:
6375 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6377 case CACHE_MIN_EVICT_AGE
:
6378 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6380 case ERASURE_CODE_PROFILE
:
6381 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6383 case MIN_READ_RECENCY_FOR_PROMOTE
:
6384 f
->dump_int("min_read_recency_for_promote",
6385 p
->min_read_recency_for_promote
);
6387 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6388 f
->dump_int("min_write_recency_for_promote",
6389 p
->min_write_recency_for_promote
);
6392 f
->dump_int("fast_read", p
->fast_read
);
6394 case HIT_SET_GRADE_DECAY_RATE
:
6395 f
->dump_int("hit_set_grade_decay_rate",
6396 p
->hit_set_grade_decay_rate
);
6398 case HIT_SET_SEARCH_LAST_N
:
6399 f
->dump_int("hit_set_search_last_n",
6400 p
->hit_set_search_last_n
);
6402 case SCRUB_MIN_INTERVAL
:
6403 case SCRUB_MAX_INTERVAL
:
6404 case DEEP_SCRUB_INTERVAL
:
6405 case RECOVERY_PRIORITY
:
6406 case RECOVERY_OP_PRIORITY
:
6407 case SCRUB_PRIORITY
:
6408 case COMPRESSION_MODE
:
6409 case COMPRESSION_ALGORITHM
:
6410 case COMPRESSION_REQUIRED_RATIO
:
6411 case COMPRESSION_MAX_BLOB_SIZE
:
6412 case COMPRESSION_MIN_BLOB_SIZE
:
6414 case CSUM_MAX_BLOCK
:
6415 case CSUM_MIN_BLOCK
:
6416 case FINGERPRINT_ALGORITHM
:
6419 case TARGET_SIZE_BYTES
:
6420 case TARGET_SIZE_RATIO
:
6421 case PG_AUTOSCALE_BIAS
:
6423 case DEDUP_CHUNK_ALGORITHM
:
6424 case DEDUP_CDC_CHUNK_SIZE
:
6425 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6426 if (p
->opts
.is_set(key
)) {
6427 if(*it
== CSUM_TYPE
) {
6429 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6430 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6432 p
->opts
.dump(i
->first
, f
.get());
6441 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6442 it
!= selected_choices
.end(); ++it
) {
6443 choices_map_t::const_iterator i
;
6446 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6449 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6452 ss
<< "size: " << p
->get_size() << "\n";
6455 ss
<< "min_size: " << p
->get_min_size() << "\n";
6458 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6459 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6460 p
->get_crush_rule()) << "\n";
6462 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6465 case PG_AUTOSCALE_MODE
:
6466 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6467 p
->pg_autoscale_mode
) <<"\n";
6469 case HIT_SET_PERIOD
:
6470 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6473 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6476 ss
<< "hit_set_type: " <<
6477 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6481 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6482 BloomHitSet::Params
*bloomp
=
6483 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6484 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6485 } else if(var
!= "all") {
6486 ss
<< "hit set is not of type Bloom; " <<
6487 "invalid to get a false positive rate!";
6493 case USE_GMT_HITSET
:
6494 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6496 case TARGET_MAX_OBJECTS
:
6497 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6499 case TARGET_MAX_BYTES
:
6500 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6502 case CACHE_TARGET_DIRTY_RATIO
:
6503 ss
<< "cache_target_dirty_ratio: "
6504 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6506 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6507 ss
<< "cache_target_dirty_high_ratio: "
6508 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6510 case CACHE_TARGET_FULL_RATIO
:
6511 ss
<< "cache_target_full_ratio: "
6512 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6514 case CACHE_MIN_FLUSH_AGE
:
6515 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6517 case CACHE_MIN_EVICT_AGE
:
6518 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6520 case ERASURE_CODE_PROFILE
:
6521 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6523 case MIN_READ_RECENCY_FOR_PROMOTE
:
6524 ss
<< "min_read_recency_for_promote: " <<
6525 p
->min_read_recency_for_promote
<< "\n";
6527 case HIT_SET_GRADE_DECAY_RATE
:
6528 ss
<< "hit_set_grade_decay_rate: " <<
6529 p
->hit_set_grade_decay_rate
<< "\n";
6531 case HIT_SET_SEARCH_LAST_N
:
6532 ss
<< "hit_set_search_last_n: " <<
6533 p
->hit_set_search_last_n
<< "\n";
6536 ss
<< "allow_ec_overwrites: " <<
6537 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6546 case WRITE_FADVISE_DONTNEED
:
6549 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6550 if (i
->second
== *it
)
6553 ceph_assert(i
!= ALL_CHOICES
.end());
6554 ss
<< i
->first
<< ": " <<
6555 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6556 "true" : "false") << "\n";
6558 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6559 ss
<< "min_write_recency_for_promote: " <<
6560 p
->min_write_recency_for_promote
<< "\n";
6563 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6565 case SCRUB_MIN_INTERVAL
:
6566 case SCRUB_MAX_INTERVAL
:
6567 case DEEP_SCRUB_INTERVAL
:
6568 case RECOVERY_PRIORITY
:
6569 case RECOVERY_OP_PRIORITY
:
6570 case SCRUB_PRIORITY
:
6571 case COMPRESSION_MODE
:
6572 case COMPRESSION_ALGORITHM
:
6573 case COMPRESSION_REQUIRED_RATIO
:
6574 case COMPRESSION_MAX_BLOB_SIZE
:
6575 case COMPRESSION_MIN_BLOB_SIZE
:
6577 case CSUM_MAX_BLOCK
:
6578 case CSUM_MIN_BLOCK
:
6579 case FINGERPRINT_ALGORITHM
:
6582 case TARGET_SIZE_BYTES
:
6583 case TARGET_SIZE_RATIO
:
6584 case PG_AUTOSCALE_BIAS
:
6586 case DEDUP_CHUNK_ALGORITHM
:
6587 case DEDUP_CDC_CHUNK_SIZE
:
6588 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6589 if (i
->second
== *it
)
6592 ceph_assert(i
!= ALL_CHOICES
.end());
6594 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6595 if (p
->opts
.is_set(key
)) {
6596 if(key
== pool_opts_t::CSUM_TYPE
) {
6598 p
->opts
.get(key
, &val
);
6599 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6601 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6607 rdata
.append(ss
.str());
6612 } else if (prefix
== "osd pool get-quota") {
6614 cmd_getval(cmdmap
, "pool", pool_name
);
6616 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6618 ceph_assert(poolid
== -ENOENT
);
6619 ss
<< "unrecognized pool '" << pool_name
<< "'";
6623 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6624 const pool_stat_t
* pstat
= mon
.mgrstatmon()->get_pool_stat(poolid
);
6626 ss
<< "no stats for pool '" << pool_name
<< "'";
6630 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6632 f
->open_object_section("pool_quotas");
6633 f
->dump_string("pool_name", pool_name
);
6634 f
->dump_unsigned("pool_id", poolid
);
6635 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6636 f
->dump_int("current_num_objects", sum
.num_objects
);
6637 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6638 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6643 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6644 << " max objects: ";
6645 if (p
->quota_max_objects
== 0)
6648 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6649 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6653 if (p
->quota_max_bytes
== 0)
6656 rs
<< byte_u_t(p
->quota_max_bytes
);
6657 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6659 rdata
.append(rs
.str());
6663 } else if (prefix
== "osd crush rule list" ||
6664 prefix
== "osd crush rule ls") {
6666 f
->open_array_section("rules");
6667 osdmap
.crush
->list_rules(f
.get());
6672 osdmap
.crush
->list_rules(&ss
);
6673 rdata
.append(ss
.str());
6675 } else if (prefix
== "osd crush rule ls-by-class") {
6677 cmd_getval(cmdmap
, "class", class_name
);
6678 if (class_name
.empty()) {
6679 ss
<< "no class specified";
6684 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6686 ss
<< "failed to get rules by class '" << class_name
<< "'";
6690 f
->open_array_section("rules");
6691 for (auto &rule
: rules
) {
6692 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6698 for (auto &rule
: rules
) {
6699 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6701 rdata
.append(rs
.str());
6703 } else if (prefix
== "osd crush rule dump") {
6705 cmd_getval(cmdmap
, "name", name
);
6707 cmd_getval(cmdmap
, "format", format
);
6708 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6710 f
->open_array_section("rules");
6711 osdmap
.crush
->dump_rules(f
.get());
6714 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6716 ss
<< "unknown crush rule '" << name
<< "'";
6720 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6725 rdata
.append(rs
.str());
6726 } else if (prefix
== "osd crush dump") {
6728 cmd_getval(cmdmap
, "format", format
);
6729 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6730 f
->open_object_section("crush_map");
6731 osdmap
.crush
->dump(f
.get());
6736 rdata
.append(rs
.str());
6737 } else if (prefix
== "osd crush show-tunables") {
6739 cmd_getval(cmdmap
, "format", format
);
6740 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6741 f
->open_object_section("crush_map_tunables");
6742 osdmap
.crush
->dump_tunables(f
.get());
6747 rdata
.append(rs
.str());
6748 } else if (prefix
== "osd crush tree") {
6749 bool show_shadow
= false;
6750 if (!cmd_getval_compat_cephbool(cmdmap
, "show_shadow", show_shadow
)) {
6752 if (cmd_getval(cmdmap
, "shadow", shadow
) &&
6753 shadow
== "--show-shadow") {
6757 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6759 f
->open_object_section("crush_tree");
6760 osdmap
.crush
->dump_tree(nullptr,
6762 osdmap
.get_pool_names(),
6768 osdmap
.crush
->dump_tree(&ss
,
6770 osdmap
.get_pool_names(),
6772 rdata
.append(ss
.str());
6774 } else if (prefix
== "osd crush ls") {
6776 if (!cmd_getval(cmdmap
, "node", name
)) {
6777 ss
<< "no node specified";
6781 if (!osdmap
.crush
->name_exists(name
)) {
6782 ss
<< "node '" << name
<< "' does not exist";
6786 int id
= osdmap
.crush
->get_item_id(name
);
6789 result
.push_back(id
);
6791 int num
= osdmap
.crush
->get_bucket_size(id
);
6792 for (int i
= 0; i
< num
; ++i
) {
6793 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6797 f
->open_array_section("items");
6798 for (auto i
: result
) {
6799 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6805 for (auto i
: result
) {
6806 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6808 rdata
.append(ss
.str());
6811 } else if (prefix
== "osd crush class ls") {
6812 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6813 f
->open_array_section("crush_classes");
6814 for (auto i
: osdmap
.crush
->class_name
)
6815 f
->dump_string("class", i
.second
);
6818 } else if (prefix
== "osd crush class ls-osd") {
6820 cmd_getval(cmdmap
, "class", name
);
6822 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6824 f
->open_array_section("osds");
6825 for (auto &osd
: osds
)
6826 f
->dump_int("osd", osd
);
6831 for (auto &osd
: osds
) {
6839 } else if (prefix
== "osd crush get-device-class") {
6840 vector
<string
> idvec
;
6841 cmd_getval(cmdmap
, "ids", idvec
);
6842 map
<int, string
> class_by_osd
;
6843 for (auto& id
: idvec
) {
6845 long osd
= parse_osd_id(id
.c_str(), &ts
);
6847 ss
<< "unable to parse osd id:'" << id
<< "'";
6851 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6853 class_by_osd
[osd
] = device_class
;
6855 class_by_osd
[osd
] = ""; // no class
6858 f
->open_array_section("osd_device_classes");
6859 for (auto& i
: class_by_osd
) {
6860 f
->open_object_section("osd_device_class");
6861 f
->dump_int("osd", i
.first
);
6862 f
->dump_string("device_class", i
.second
);
6868 if (class_by_osd
.size() == 1) {
6869 // for single input, make a clean output
6870 ds
<< class_by_osd
.begin()->second
;
6872 // note that we do not group osds by class here
6873 for (auto it
= class_by_osd
.begin();
6874 it
!= class_by_osd
.end();
6876 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6877 if (next(it
) != class_by_osd
.end())
6883 } else if (prefix
== "osd erasure-code-profile ls") {
6884 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6886 f
->open_array_section("erasure-code-profiles");
6887 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6889 f
->dump_string("profile", i
->first
.c_str());
6891 rdata
.append(i
->first
+ "\n");
6898 rdata
.append(rs
.str());
6900 } else if (prefix
== "osd crush weight-set ls") {
6901 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6903 f
->open_array_section("weight_sets");
6904 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6905 f
->dump_string("pool", "(compat)");
6907 for (auto& i
: osdmap
.crush
->choose_args
) {
6909 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6916 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6919 for (auto& i
: osdmap
.crush
->choose_args
) {
6921 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6924 rdata
.append(rs
.str());
6926 } else if (prefix
== "osd crush weight-set dump") {
6927 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6929 osdmap
.crush
->dump_choose_args(f
.get());
6931 } else if (prefix
== "osd erasure-code-profile get") {
6933 cmd_getval(cmdmap
, "name", name
);
6934 if (!osdmap
.has_erasure_code_profile(name
)) {
6935 ss
<< "unknown erasure code profile '" << name
<< "'";
6939 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6941 f
->open_object_section("profile");
6942 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6946 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6948 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6955 rdata
.append(rs
.str());
6957 } else if (prefix
== "osd pool application get") {
6958 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6961 cmd_getval(cmdmap
, "pool", pool_name
);
6963 cmd_getval(cmdmap
, "app", app
);
6965 cmd_getval(cmdmap
, "key", key
);
6967 if (pool_name
.empty()) {
6969 f
->open_object_section("pools");
6970 for (const auto &pool
: osdmap
.pools
) {
6971 std::string
name("<unknown>");
6972 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6973 if (pni
!= osdmap
.pool_name
.end())
6975 f
->open_object_section(name
.c_str());
6976 for (auto &app_pair
: pool
.second
.application_metadata
) {
6977 f
->open_object_section(app_pair
.first
.c_str());
6978 for (auto &kv_pair
: app_pair
.second
) {
6979 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6983 f
->close_section(); // name
6985 f
->close_section(); // pools
6988 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6990 ss
<< "unrecognized pool '" << pool_name
<< "'";
6994 auto p
= osdmap
.get_pg_pool(pool
);
6997 f
->open_object_section(pool_name
.c_str());
6998 for (auto &app_pair
: p
->application_metadata
) {
6999 f
->open_object_section(app_pair
.first
.c_str());
7000 for (auto &kv_pair
: app_pair
.second
) {
7001 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
7003 f
->close_section(); // application
7005 f
->close_section(); // pool_name
7010 auto app_it
= p
->application_metadata
.find(app
);
7011 if (app_it
== p
->application_metadata
.end()) {
7012 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
7016 // filter by pool + app
7018 f
->open_object_section(app_it
->first
.c_str());
7019 for (auto &kv_pair
: app_it
->second
) {
7020 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
7022 f
->close_section(); // application
7026 // filter by pool + app + key
7027 auto key_it
= app_it
->second
.find(key
);
7028 if (key_it
== app_it
->second
.end()) {
7029 ss
<< "application '" << app
<< "' on pool '" << pool_name
7030 << "' does not have key '" << key
<< "'";
7034 ss
<< key_it
->second
<< "\n";
7035 rdata
.append(ss
.str());
7038 } else if (prefix
== "osd get-require-min-compat-client") {
7039 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
7040 rdata
.append(ss
.str());
7043 } else if (prefix
== "osd pool application enable" ||
7044 prefix
== "osd pool application disable" ||
7045 prefix
== "osd pool application set" ||
7046 prefix
== "osd pool application rm") {
7047 bool changed
= false;
7048 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
7052 } else if (changed
) {
7053 // Valid mutation, proceed to prepare phase
7056 // Idempotent case, reply
7060 // try prepare update
7067 mon
.reply_command(op
, r
, rs
, rdata
, get_last_committed());
7071 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
7073 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7074 osdmap
.get_pg_pool(pool_id
));
7076 pool
->set_flag(flags
);
7079 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
7081 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7082 osdmap
.get_pg_pool(pool_id
));
7084 pool
->unset_flag(flags
);
7087 string
OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch
)
7090 snprintf(k
, sizeof(k
), "purged_epoch_%08lx", (unsigned long)epoch
);
7094 string
OSDMonitor::make_purged_snap_key(int64_t pool
, snapid_t snap
)
7097 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
7098 (unsigned long long)pool
, (unsigned long long)snap
);
7102 string
OSDMonitor::make_purged_snap_key_value(
7103 int64_t pool
, snapid_t snap
, snapid_t num
,
7104 epoch_t epoch
, bufferlist
*v
)
7106 // encode the *last* epoch in the key so that we can use forward
7107 // iteration only to search for an epoch in an interval.
7109 encode(snap
+ num
, *v
);
7111 return make_purged_snap_key(pool
, snap
+ num
- 1);
7115 int OSDMonitor::lookup_purged_snap(
7116 int64_t pool
, snapid_t snap
,
7117 snapid_t
*begin
, snapid_t
*end
)
7119 string k
= make_purged_snap_key(pool
, snap
);
7120 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
7123 dout(20) << __func__
7124 << " pool " << pool
<< " snap " << snap
7125 << " - key '" << k
<< "' not found" << dendl
;
7128 if (it
->key().find("purged_snap_") != 0) {
7129 dout(20) << __func__
7130 << " pool " << pool
<< " snap " << snap
7131 << " - key '" << k
<< "' got '" << it
->key()
7132 << "', wrong prefix" << dendl
;
7135 string gotk
= it
->key();
7136 const char *format
= "purged_snap_%llu_";
7137 long long int keypool
;
7138 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
7140 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
7143 if (pool
!= keypool
) {
7144 dout(20) << __func__
7145 << " pool " << pool
<< " snap " << snap
7146 << " - key '" << k
<< "' got '" << gotk
7147 << "', wrong pool " << keypool
7151 bufferlist v
= it
->value();
7152 auto p
= v
.cbegin();
7155 if (snap
< *begin
|| snap
>= *end
) {
7156 dout(20) << __func__
7157 << " pool " << pool
<< " snap " << snap
7158 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
7165 void OSDMonitor::insert_purged_snap_update(
7167 snapid_t start
, snapid_t end
,
7169 MonitorDBStore::TransactionRef t
)
7171 snapid_t before_begin
, before_end
;
7172 snapid_t after_begin
, after_end
;
7173 int b
= lookup_purged_snap(pool
, start
- 1,
7174 &before_begin
, &before_end
);
7175 int a
= lookup_purged_snap(pool
, end
,
7176 &after_begin
, &after_end
);
7178 dout(10) << __func__
7179 << " [" << start
<< "," << end
<< ") - joins ["
7180 << before_begin
<< "," << before_end
<< ") and ["
7181 << after_begin
<< "," << after_end
<< ")" << dendl
;
7182 // erase only the begin record; we'll overwrite the end one.
7183 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7185 string k
= make_purged_snap_key_value(pool
,
7186 before_begin
, after_end
- before_begin
,
7187 pending_inc
.epoch
, &v
);
7188 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7190 dout(10) << __func__
7191 << " [" << start
<< "," << end
<< ") - join with earlier ["
7192 << before_begin
<< "," << before_end
<< ")" << dendl
;
7193 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7195 string k
= make_purged_snap_key_value(pool
,
7196 before_begin
, end
- before_begin
,
7197 pending_inc
.epoch
, &v
);
7198 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7200 dout(10) << __func__
7201 << " [" << start
<< "," << end
<< ") - join with later ["
7202 << after_begin
<< "," << after_end
<< ")" << dendl
;
7203 // overwrite after record
7205 string k
= make_purged_snap_key_value(pool
,
7206 start
, after_end
- start
,
7207 pending_inc
.epoch
, &v
);
7208 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7210 dout(10) << __func__
7211 << " [" << start
<< "," << end
<< ") - new"
7214 string k
= make_purged_snap_key_value(pool
,
7216 pending_inc
.epoch
, &v
);
7217 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7221 bool OSDMonitor::try_prune_purged_snaps()
7223 if (!mon
.mgrstatmon()->is_readable()) {
7226 if (!pending_inc
.new_purged_snaps
.empty()) {
7227 return false; // we already pruned for this epoch
7230 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
7231 "mon_max_snap_prune_per_epoch");
7235 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
7237 unsigned actually_pruned
= 0;
7238 auto& purged_snaps
= mon
.mgrstatmon()->get_digest().purged_snaps
;
7239 for (auto& p
: osdmap
.get_pools()) {
7240 auto q
= purged_snaps
.find(p
.first
);
7241 if (q
== purged_snaps
.end()) {
7244 auto& purged
= q
->second
;
7245 if (purged
.empty()) {
7246 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
7249 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
7250 snap_interval_set_t to_prune
;
7251 unsigned maybe_pruned
= actually_pruned
;
7252 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
7253 snapid_t begin
= i
.get_start();
7254 auto end
= i
.get_start() + i
.get_len();
7255 snapid_t pbegin
= 0, pend
= 0;
7256 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
7259 // be a bit aggressive about backing off here, because the mon may
7260 // do a lot of work going through this set, and if we know the
7261 // purged set from the OSDs is at least *partly* stale we may as
7262 // well wait for it to be fresh.
7263 dout(20) << __func__
<< " we've already purged " << pbegin
7264 << "~" << (pend
- pbegin
) << dendl
;
7267 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
7268 // the tail of [begin,end) is purged; shorten the range
7271 to_prune
.insert(begin
, end
- begin
);
7272 maybe_pruned
+= end
- begin
;
7273 if (maybe_pruned
>= max_prune
) {
7277 if (!to_prune
.empty()) {
7278 // PGs may still be reporting things as purged that we have already
7279 // pruned from removed_snaps_queue.
7280 snap_interval_set_t actual
;
7281 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7282 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7283 actual
.intersection_of(to_prune
, r
->second
);
7285 actually_pruned
+= actual
.size();
7286 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7287 << ", actual pruned " << actual
<< dendl
;
7288 if (!actual
.empty()) {
7289 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7292 if (actually_pruned
>= max_prune
) {
7296 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7297 return !!actually_pruned
;
7300 bool OSDMonitor::update_pools_status()
7302 if (!mon
.mgrstatmon()->is_readable())
7307 auto& pools
= osdmap
.get_pools();
7308 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7309 const pool_stat_t
*pstat
= mon
.mgrstatmon()->get_pool_stat(it
->first
);
7312 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7313 const pg_pool_t
&pool
= it
->second
;
7314 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
7317 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7318 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
7320 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7324 mon
.clog
->info() << "pool '" << pool_name
7325 << "' no longer out of quota; removing NO_QUOTA flag";
7326 // below we cancel FLAG_FULL too, we'll set it again in
7327 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7328 clear_pool_flags(it
->first
,
7329 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7335 if (pool
.quota_max_bytes
> 0 &&
7336 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7337 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7338 << " (reached quota's max_bytes: "
7339 << byte_u_t(pool
.quota_max_bytes
) << ")";
7341 if (pool
.quota_max_objects
> 0 &&
7342 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7343 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7344 << " (reached quota's max_objects: "
7345 << pool
.quota_max_objects
<< ")";
7347 // set both FLAG_FULL_QUOTA and FLAG_FULL
7348 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7349 // since FLAG_FULL should always take precedence
7350 set_pool_flags(it
->first
,
7351 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7352 clear_pool_flags(it
->first
,
7353 pg_pool_t::FLAG_NEARFULL
|
7354 pg_pool_t::FLAG_BACKFILLFULL
);
7361 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7363 op
->mark_osdmon_event(__func__
);
7364 auto m
= op
->get_req
<MPoolOp
>();
7365 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7366 MonSession
*session
= op
->get_session();
7369 string erasure_code_profile
;
7374 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7375 0, 0, 0, 0, 0, 0, 0.0,
7376 erasure_code_profile
,
7377 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {}, bulk
,
7378 cct
->_conf
.get_val
<bool>("osd_pool_default_crimson"),
7382 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
7387 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7388 const string
& dstname
,
7393 // Avoid creating a pending crush if it does not already exists and
7394 // the rename would fail.
7396 if (!_have_pending_crush()) {
7397 ret
= _get_stable_crush().can_rename_bucket(srcname
,
7404 CrushWrapper newcrush
= _get_pending_crush();
7406 ret
= newcrush
.rename_bucket(srcname
,
7412 pending_inc
.crush
.clear();
7413 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7414 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7418 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7420 string replacement
= "";
7422 if (plugin
== "jerasure_generic" ||
7423 plugin
== "jerasure_sse3" ||
7424 plugin
== "jerasure_sse4" ||
7425 plugin
== "jerasure_neon") {
7426 replacement
= "jerasure";
7427 } else if (plugin
== "shec_generic" ||
7428 plugin
== "shec_sse3" ||
7429 plugin
== "shec_sse4" ||
7430 plugin
== "shec_neon") {
7431 replacement
= "shec";
7434 if (replacement
!= "") {
7435 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7436 << plugin
<< " that has been deprecated. Please use "
7437 << replacement
<< " instead." << dendl
;
7441 int OSDMonitor::normalize_profile(const string
& profilename
,
7442 ErasureCodeProfile
&profile
,
7446 ErasureCodeInterfaceRef erasure_code
;
7447 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7448 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7449 check_legacy_ec_plugin(plugin
->second
, profilename
);
7450 int err
= instance
.factory(plugin
->second
,
7451 g_conf().get_val
<std::string
>("erasure_code_dir"),
7452 profile
, &erasure_code
, ss
);
7457 err
= erasure_code
->init(profile
, ss
);
7462 auto it
= profile
.find("stripe_unit");
7463 if (it
!= profile
.end()) {
7465 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
, &err_str
);
7466 if (!err_str
.empty()) {
7467 *ss
<< "could not parse stripe_unit '" << it
->second
7468 << "': " << err_str
<< std::endl
;
7471 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7472 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7473 if (chunk_size
!= stripe_unit
) {
7474 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7475 << "alignment. Would be padded to " << chunk_size
7479 if ((stripe_unit
% 4096) != 0 && !force
) {
7480 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7481 << "use --force to override this check" << std::endl
;
7488 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7489 const string
&profile
,
7493 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7494 if (ruleid
!= -ENOENT
) {
7499 CrushWrapper newcrush
= _get_pending_crush();
7501 ruleid
= newcrush
.get_rule_id(name
);
7502 if (ruleid
!= -ENOENT
) {
7506 ErasureCodeInterfaceRef erasure_code
;
7507 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7509 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7513 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7514 erasure_code
.reset();
7518 pending_inc
.crush
.clear();
7519 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7524 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7525 ErasureCodeInterfaceRef
*erasure_code
,
7528 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7530 ErasureCodeProfile profile
=
7531 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7532 ErasureCodeProfile::const_iterator plugin
=
7533 profile
.find("plugin");
7534 if (plugin
== profile
.end()) {
7535 *ss
<< "cannot determine the erasure code plugin"
7536 << " because there is no 'plugin' entry in the erasure_code_profile "
7537 << profile
<< std::endl
;
7540 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7541 auto& instance
= ErasureCodePluginRegistry::instance();
7542 return instance
.factory(plugin
->second
,
7543 g_conf().get_val
<std::string
>("erasure_code_dir"),
7544 profile
, erasure_code
, ss
);
7547 int OSDMonitor::check_cluster_features(uint64_t features
,
7550 stringstream unsupported_ss
;
7551 int unsupported_count
= 0;
7552 if ((mon
.get_quorum_con_features() & features
) != features
) {
7553 unsupported_ss
<< "the monitor cluster";
7554 ++unsupported_count
;
7557 set
<int32_t> up_osds
;
7558 osdmap
.get_up_osds(up_osds
);
7559 for (set
<int32_t>::iterator it
= up_osds
.begin();
7560 it
!= up_osds
.end(); ++it
) {
7561 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7562 if ((xi
.features
& features
) != features
) {
7563 if (unsupported_count
> 0)
7564 unsupported_ss
<< ", ";
7565 unsupported_ss
<< "osd." << *it
;
7566 unsupported_count
++;
7570 if (unsupported_count
> 0) {
7571 ss
<< "features " << features
<< " unsupported by: "
7572 << unsupported_ss
.str();
7576 // check pending osd state, too!
7577 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7578 pending_inc
.new_xinfo
.begin();
7579 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7580 const osd_xinfo_t
&xi
= p
->second
;
7581 if ((xi
.features
& features
) != features
) {
7582 dout(10) << __func__
<< " pending osd." << p
->first
7583 << " features are insufficient; retry" << dendl
;
7591 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
7594 OSDMap::Incremental new_pending
= pending_inc
;
7595 encode(*newcrush
, new_pending
.crush
, mon
.get_quorum_con_features());
7597 newmap
.deepish_copy_from(osdmap
);
7598 newmap
.apply_incremental(new_pending
);
7601 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7602 auto mv
= newmap
.get_min_compat_client();
7603 if (mv
> newmap
.require_min_compat_client
) {
7604 ss
<< "new crush map requires client version " << mv
7605 << " but require_min_compat_client is "
7606 << newmap
.require_min_compat_client
;
7613 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7614 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7615 stringstream features_ss
;
7616 int r
= check_cluster_features(features
, features_ss
);
7618 ss
<< "Could not change CRUSH: " << features_ss
.str();
7625 bool OSDMonitor::erasure_code_profile_in_use(
7626 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7627 const string
&profile
,
7631 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
7634 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7635 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7640 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
7645 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7646 map
<string
,string
> *erasure_code_profile_map
,
7649 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7652 erasure_code_profile_map
,
7656 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7657 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7658 map
<string
,string
> user_map
;
7659 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7660 i
!= erasure_code_profile
.end();
7662 size_t equal
= i
->find('=');
7663 if (equal
== string::npos
) {
7664 user_map
[*i
] = string();
7665 (*erasure_code_profile_map
)[*i
] = string();
7667 const string key
= i
->substr(0, equal
);
7669 const string value
= i
->substr(equal
);
7670 if (key
.find("ruleset-") == 0) {
7671 *ss
<< "property '" << key
<< "' is no longer supported; try "
7672 << "'crush-" << key
.substr(8) << "' instead";
7675 user_map
[key
] = value
;
7676 (*erasure_code_profile_map
)[key
] = value
;
7680 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7681 (*erasure_code_profile_map
) = user_map
;
7686 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7687 const string
&erasure_code_profile
,
7689 unsigned *size
, unsigned *min_size
,
7693 bool set_min_size
= false;
7694 switch (pool_type
) {
7695 case pg_pool_t::TYPE_REPLICATED
:
7696 if (osdmap
.stretch_mode_enabled
) {
7698 repl_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
7699 if (repl_size
!= g_conf().get_val
<uint64_t>("mon_stretch_pool_size")) {
7700 *ss
<< "prepare_pool_size: we are in stretch mode but size "
7701 << repl_size
<< " does not match!";
7704 *min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
7705 set_min_size
= true;
7707 if (repl_size
== 0) {
7708 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7712 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7714 case pg_pool_t::TYPE_ERASURE
:
7716 if (osdmap
.stretch_mode_enabled
) {
7717 *ss
<< "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7720 ErasureCodeInterfaceRef erasure_code
;
7721 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7723 *size
= erasure_code
->get_chunk_count();
7725 erasure_code
->get_data_chunk_count() +
7726 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7727 assert(*min_size
<= *size
);
7728 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7733 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
7740 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7741 const string
&erasure_code_profile
,
7742 uint32_t *stripe_width
,
7746 switch (pool_type
) {
7747 case pg_pool_t::TYPE_REPLICATED
:
7750 case pg_pool_t::TYPE_ERASURE
:
7752 ErasureCodeProfile profile
=
7753 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7754 ErasureCodeInterfaceRef erasure_code
;
7755 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7758 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7759 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7760 auto it
= profile
.find("stripe_unit");
7761 if (it
!= profile
.end()) {
7763 stripe_unit
= strict_iecstrtoll(it
->second
, &err_str
);
7764 ceph_assert(err_str
.empty());
7766 *stripe_width
= data_chunks
*
7767 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7771 *ss
<< "prepare_pool_stripe_width: "
7772 << pool_type
<< " is not a known pool type";
7779 int OSDMonitor::get_replicated_stretch_crush_rule()
7781 /* we don't write down the stretch rule anywhere, so
7782 * we have to guess it. How? Look at all the pools
7783 * and count up how many times a given rule is used
7784 * on stretch pools and then return the one with
7787 map
<int,int> rule_counts
;
7788 for (const auto& pooli
: osdmap
.pools
) {
7789 const pg_pool_t
& p
= pooli
.second
;
7790 if (p
.is_replicated() && p
.is_stretch_pool()) {
7791 if (!rule_counts
.count(p
.crush_rule
)) {
7792 rule_counts
[p
.crush_rule
] = 1;
7794 ++rule_counts
[p
.crush_rule
];
7799 if (rule_counts
.empty()) {
7803 int most_used_count
= 0;
7804 int most_used_rule
= -1;
7805 for (auto i
: rule_counts
) {
7806 if (i
.second
> most_used_count
) {
7807 most_used_rule
= i
.first
;
7808 most_used_count
= i
.second
;
7811 ceph_assert(most_used_count
> 0);
7812 ceph_assert(most_used_rule
>= 0);
7813 return most_used_rule
;
7816 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7817 const string
&erasure_code_profile
,
7818 const string
&rule_name
,
7823 if (*crush_rule
< 0) {
7824 switch (pool_type
) {
7825 case pg_pool_t::TYPE_REPLICATED
:
7827 if (rule_name
== "") {
7828 if (osdmap
.stretch_mode_enabled
) {
7829 *crush_rule
= get_replicated_stretch_crush_rule();
7832 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_rule(cct
);
7834 if (*crush_rule
< 0) {
7835 // Errors may happen e.g. if no valid rule is available
7836 *ss
<< "No suitable CRUSH rule exists, check "
7837 << "'osd pool default crush *' config options";
7841 return get_crush_rule(rule_name
, crush_rule
, ss
);
7845 case pg_pool_t::TYPE_ERASURE
:
7847 int err
= crush_rule_create_erasure(rule_name
,
7848 erasure_code_profile
,
7852 dout(20) << "prepare_pool_crush_rule: rule "
7853 << rule_name
<< " try again" << dendl
;
7856 // need to wait for the crush rule to be proposed before proceeding
7867 *ss
<< "prepare_pool_crush_rule: " << pool_type
7868 << " is not a known pool type";
7872 if (!osdmap
.crush
->rule_exists(*crush_rule
)) {
7873 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
7881 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7886 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7887 if (ret
!= -ENOENT
) {
7891 CrushWrapper newcrush
= _get_pending_crush();
7893 ret
= newcrush
.get_rule_id(rule_name
);
7894 if (ret
!= -ENOENT
) {
7895 // found it, wait for it to be proposed
7896 dout(20) << __func__
<< ": rule " << rule_name
7897 << " try again" << dendl
;
7900 // Cannot find it , return error
7901 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
7909 * Get the number of 'in' osds according to the crush_rule,
7911 uint32_t OSDMonitor::get_osd_num_by_crush(int crush_rule
)
7914 set
<int> crush_in_osds
;
7916 CrushWrapper newcrush
= _get_pending_crush();
7917 newcrush
.find_takes_by_rule(crush_rule
, &roots
);
7918 for (auto root
: roots
) {
7919 const char *rootname
= newcrush
.get_item_name(root
);
7920 set
<int> crush_all_osds
;
7921 newcrush
.get_leaves(rootname
, &crush_all_osds
);
7922 std::set_difference(crush_all_osds
.begin(), crush_all_osds
.end(),
7923 out_osds
.begin(), out_osds
.end(),
7924 std::inserter(crush_in_osds
, crush_in_osds
.end()));
7926 return crush_in_osds
.size();
7929 int OSDMonitor::check_pg_num(int64_t pool
,
7935 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7936 uint64_t projected
= 0;
7937 uint32_t osd_num_by_crush
= 0;
7938 set
<int64_t> crush_pool_ids
;
7941 projected
+= pg_num
* size
;
7944 osd_num_by_crush
= get_osd_num_by_crush(crush_rule
);
7945 osdmap
.get_pool_ids_by_rule(crush_rule
, &crush_pool_ids
);
7947 for (const auto& [pool_id
, pool_info
] : osdmap
.get_pools()) {
7948 // Check only for pools affected by crush rule
7949 if (crush_pool_ids
.contains(pool_id
)) {
7950 if (pool_id
== pool
) {
7951 // Specified pool, use given pg_num and size values.
7952 projected
+= pg_num
* size
;
7954 // Use pg_num_target for evaluating the projected pg num
7955 projected
+= pool_info
.get_pg_num_target() * pool_info
.get_size();
7959 // assume min cluster size 3
7960 osd_num_by_crush
= std::max(osd_num_by_crush
, 3u);
7961 auto projected_pgs_per_osd
= projected
/ osd_num_by_crush
;
7963 if (projected_pgs_per_osd
> max_pgs_per_osd
) {
7965 *ss
<< "pool id " << pool
;
7967 *ss
<< " pg_num " << pg_num
7969 << " for this pool would result in "
7970 << projected_pgs_per_osd
7971 << " cumulative PGs per OSD (" << projected
7972 << " total PG replicas on " << osd_num_by_crush
7973 << " 'in' root OSDs by crush rule) "
7974 << "which exceeds the mon_max_pg_per_osd "
7975 << "value of " << max_pgs_per_osd
;
7982 * @param name The name of the new pool
7983 * @param crush_rule The crush rule to use. If <0, will use the system default
7984 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7985 * @param pg_num The pg_num to use. If set to 0, will use the system default
7986 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7987 * @param pg_num_min min pg_num
7988 * @param pg_num_max max pg_num
7989 * @param repl_size Replication factor, or 0 for default
7990 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7991 * @param pool_type TYPE_ERASURE, or TYPE_REP
7992 * @param expected_num_objects expected number of objects on the pool
7993 * @param fast_read fast read type.
7994 * @param pg_autoscale_mode autoscale mode, one of on, off, warn
7995 * @param bool bulk indicates whether pool should be a bulk pool
7996 * @param bool crimson indicates whether pool is a crimson pool
7997 * @param ss human readable error message, if any.
7999 * @return 0 on success, negative errno on failure.
// NOTE(review): this chunk is extraction-mangled -- each logical source line
// is split across several physical lines and some original lines are missing
// (the leading numbers are the original file's line numbers, and gaps in them
// mark dropped statements).  Code is kept byte-identical; only comments added.
//
// Creates a new pool in pending_inc: validates pg/pgp counts against config
// limits, enforces crimson-osd constraints, smoke-tests the crush rule, then
// fills in the new pg_pool_t and registers its name.
8001 int OSDMonitor::prepare_new_pool(string
& name
,
8003 const string
&crush_rule_name
,
8004 unsigned pg_num
, unsigned pgp_num
,
8005 unsigned pg_num_min
,
8006 unsigned pg_num_max
,
8007 const uint64_t repl_size
,
8008 const uint64_t target_size_bytes
,
8009 const float target_size_ratio
,
8010 const string
&erasure_code_profile
,
8011 const unsigned pool_type
,
8012 const uint64_t expected_num_objects
,
8013 FastReadType fast_read
,
8014 string pg_autoscale_mode
,
// NOTE(review): trailing parameters (presumably bool bulk, bool crimson,
// ostream *ss -- referenced below) were lost in extraction; confirm against
// the original file.
// crimson pools default the autoscaler off; a later check rejects any other
// explicitly-requested mode.
8019 if (crimson
&& pg_autoscale_mode
.empty()) {
8020 // default pg_autoscale_mode to off for crimson, we'll error out below if
8021 // the user tried to actually set pg_autoscale_mode to something other than
8023 pg_autoscale_mode
= "off";
8026 if (name
.length() == 0)
// When autoscaling is "on" a pool starts at 1 PG and grows; otherwise the
// configured default pg_num is used.
8030 auto pg_num_from_mode
=
8031 [pg_num
=g_conf().get_val
<uint64_t>("osd_pool_default_pg_num")]
8032 (const string
& mode
) {
8033 return mode
== "on" ? 1 : pg_num
;
8035 pg_num
= pg_num_from_mode(
8036 pg_autoscale_mode
.empty() ?
8037 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode") :
8041 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
// Bound checks: pg_num against mon_max_pool_pg_num, and pgp_num <= pg_num.
8044 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8045 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8046 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8047 << " (you may adjust 'mon max pool pg num' for higher values)";
8050 if (pgp_num
> pg_num
) {
8051 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
8052 << ", which in this case is " << pg_num
;
8057 /* crimson-osd requires that the pool be replicated and that pg_num/pgp_num
8058 * be static. User must also have specified set-allow-crimson */
8059 const auto *suffix
= " (--crimson specified or osd_pool_default_crimson set)";
8060 if (pool_type
!= pg_pool_t::TYPE_REPLICATED
) {
8061 *ss
<< "crimson-osd only supports replicated pools" << suffix
;
8063 } else if (pg_autoscale_mode
!= "off") {
8064 *ss
<< "crimson-osd does not support changing pg_num or pgp_num, "
8065 << "pg_autoscale_mode must be set to 'off'" << suffix
;
8067 } else if (!osdmap
.get_allow_crimson()) {
8068 *ss
<< "set-allow-crimson must be set to create a pool with the "
8069 << "crimson flag" << suffix
;
// fast_read is an erasure-coded-pool-only feature.
8074 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
8075 *ss
<< "'fast_read' can only apply to erasure coding pool";
8079 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
8080 crush_rule_name
, &crush_rule
, ss
);
8082 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
8085 unsigned size
, min_size
;
8086 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
8087 &size
, &min_size
, ss
);
8089 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
// Optional crush smoke test: map a small sample of inputs (x in [0,50])
// through the pending crush map in a forked child to catch broken rules
// before committing the pool.
8092 if (g_conf()->mon_osd_crush_smoke_test
) {
8093 CrushWrapper newcrush
= _get_pending_crush();
8095 CrushTester
tester(newcrush
, err
);
8096 tester
.set_min_x(0);
8097 tester
.set_max_x(50);
8098 tester
.set_rule(crush_rule
);
8099 tester
.set_num_rep(size
);
8100 auto start
= ceph::coarse_mono_clock::now();
8101 r
= tester
.test_with_fork(cct
, g_conf()->mon_lease
);
8102 dout(10) << __func__
<< " crush test_with_fork tester created " << dendl
;
8103 auto duration
= ceph::coarse_mono_clock::now() - start
;
8105 dout(10) << "tester.test_with_fork returns " << r
8106 << ": " << err
.str() << dendl
;
8107 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
8110 dout(10) << __func__
<< " crush smoke test duration: "
8111 << duration
<< dendl
;
// Projected PG-per-OSD check for the new pool (pool id -1 = not yet created).
8113 r
= check_pg_num(-1, pg_num
, size
, crush_rule
, ss
);
8115 dout(10) << "check_pg_num returns " << r
<< dendl
;
8119 if (osdmap
.crush
->get_rule_type(crush_rule
) != (int)pool_type
) {
8120 *ss
<< "crush rule " << crush_rule
<< " type does not match pool";
8124 uint32_t stripe_width
= 0;
8125 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
8127 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
// Resolve the effective fast_read flag for EC pools (default comes from
// osd_pool_default_ec_fast_read).
8132 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8133 switch (fast_read
) {
8140 case FAST_READ_DEFAULT
:
8141 fread
= g_conf()->osd_pool_default_ec_fast_read
;
8144 *ss
<< "invalid fast_read setting: " << fast_read
;
// Reject duplicate pool names already queued in this pending increment.
8149 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
8150 p
!= pending_inc
.new_pool_names
.end();
8152 if (p
->second
== name
)
8156 if (-1 == pending_inc
.new_pool_max
)
8157 pending_inc
.new_pool_max
= osdmap
.pool_max
;
8158 int64_t pool
= ++pending_inc
.new_pool_max
;
// Populate the new pg_pool_t with defaults and the validated parameters.
8160 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
8161 pi
->create_time
= ceph_clock_now();
8162 pi
->type
= pool_type
;
8163 pi
->fast_read
= fread
;
8164 pi
->flags
= g_conf()->osd_pool_default_flags
;
8166 pi
->set_flag(pg_pool_t::FLAG_BULK
);
8167 } else if (g_conf()->osd_pool_default_flag_bulk
) {
8168 pi
->set_flag(pg_pool_t::FLAG_BULK
);
8170 if (g_conf()->osd_pool_default_flag_hashpspool
)
8171 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
8172 if (g_conf()->osd_pool_default_flag_nodelete
)
8173 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
8174 if (g_conf()->osd_pool_default_flag_nopgchange
)
8175 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
8176 if (g_conf()->osd_pool_default_flag_nosizechange
)
8177 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
8178 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
8179 if (g_conf()->osd_pool_use_gmt_hitset
)
8180 pi
->use_gmt_hitset
= true;
8182 pi
->use_gmt_hitset
= false;
// crimson pools are marked and pinned to a fixed pg_num (NOPGCHANGE).
8184 pi
->set_flag(pg_pool_t::FLAG_CRIMSON
);
8185 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
8189 pi
->min_size
= min_size
;
8190 pi
->crush_rule
= crush_rule
;
8191 pi
->expected_num_objects
= expected_num_objects
;
8192 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
// Stretch-mode clusters seed the peering crush bucket constraints from the
// osdmap; in degraded stretch mode the counts (and size/min_size) are halved.
8193 if (osdmap
.stretch_mode_enabled
) {
8194 pi
->peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
8195 pi
->peering_crush_bucket_target
= osdmap
.stretch_bucket_count
;
8196 pi
->peering_crush_bucket_barrier
= osdmap
.stretch_mode_bucket
;
8197 pi
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
8198 if (osdmap
.degraded_stretch_mode
) {
8199 pi
->peering_crush_bucket_count
= osdmap
.degraded_stretch_mode
;
8200 pi
->peering_crush_bucket_target
= osdmap
.degraded_stretch_mode
;
8201 // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
8202 // TODO: drat, we don't record this ^ anywhere, though given that it
8203 // necessarily won't exist elsewhere it likely doesn't matter
8204 pi
->min_size
= pi
->min_size
/ 2;
8205 pi
->size
= pi
->size
/ 2; // only support 2 zones now
// Default autoscale mode from config (falls back to OFF when unknown).
8209 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8210 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
8211 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8212 pi
->pg_autoscale_mode
= m
;
8214 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
// Initial pg_num is capped by mon_osd_max_initial_pgs; the autoscaler/mgr
// later walks pg_num toward the target values set below.
8216 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
8218 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
8220 pi
->set_pg_num_pending(pi
->get_pg_num());
8221 pi
->set_pg_num_target(pg_num
);
8222 pi
->set_pgp_num(pi
->get_pg_num());
8223 pi
->set_pgp_num_target(pgp_num
);
// PG_NUM_MIN/MAX options only stored on sufficiently new clusters.
8224 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8226 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
8228 if (osdmap
.require_osd_release
>= ceph_release_t::quincy
&&
8230 pi
->opts
.set(pool_opts_t::PG_NUM_MAX
, static_cast<int64_t>(pg_num_max
));
// Caller-supplied autoscale mode overrides the config default when valid.
8232 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8233 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8234 pi
->pg_autoscale_mode
= m
;
8237 pi
->last_change
= pending_inc
.epoch
;
8240 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8241 pi
->erasure_code_profile
= erasure_code_profile
;
8243 pi
->erasure_code_profile
= "";
8245 pi
->stripe_width
= stripe_width
;
8247 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8248 target_size_bytes
) {
8249 // only store for nautilus+ because TARGET_SIZE_BYTES may be
8250 // larger than int32_t max.
8251 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
8253 if (target_size_ratio
> 0.0 &&
8254 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
8255 // only store for nautilus+, just to be consistent and tidy.
8256 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
// Cache-tiering defaults (ratios stored in micro units, hence * 1000000).
8259 pi
->cache_target_dirty_ratio_micro
=
8260 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
8261 pi
->cache_target_dirty_high_ratio_micro
=
8262 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
8263 pi
->cache_target_full_ratio_micro
=
8264 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
8265 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
8266 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
8268 pending_inc
.new_pool_names
[pool
] = name
;
// NOTE(review): extraction-mangled span (logical lines split, some lines
// elided -- e.g. the trailing return/brace).  Code kept byte-identical.
//
// Sets an OSDMap flag in the pending increment (seeding new_flags from the
// current osdmap flags on first use) and replies to the client once the
// proposal commits.
8272 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
8274 op
->mark_osdmon_event(__func__
);
// new_flags < 0 means "not yet touched this epoch"; start from current flags.
8276 if (pending_inc
.new_flags
< 0)
8277 pending_inc
.new_flags
= osdmap
.get_flags();
8278 pending_inc
.new_flags
|= flag
;
8279 ss
<< OSDMap::get_flag_string(flag
) << " is set";
// Reply is deferred until the pending map is committed.
8280 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8281 get_last_committed() + 1));
// NOTE(review): extraction-mangled span (logical lines split, some lines
// elided).  Code kept byte-identical.
//
// Mirror of prepare_set_flag: clears an OSDMap flag in the pending increment
// and replies to the client after the proposal commits.
8285 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
8287 op
->mark_osdmon_event(__func__
);
// Seed new_flags from the committed osdmap on first modification this epoch.
8289 if (pending_inc
.new_flags
< 0)
8290 pending_inc
.new_flags
= osdmap
.get_flags();
8291 pending_inc
.new_flags
&= ~flag
;
8292 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
8293 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8294 get_last_committed() + 1));
// NOTE(review): extraction-mangled span -- logical lines are split across
// physical lines and many original lines (returns, braces, some assignments)
// are missing; the embedded numbers are original file line numbers.  Code is
// kept byte-identical; only comments added.
//
// Implements "ceph osd pool set <pool> <var> <val>": looks up the pool,
// parses 'val' as int/float/SI/IEC as the variable requires, dispatches on
// 'var' through a long if/else chain, and stores the modified pg_pool_t in
// pending_inc.new_pools.
8298 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
8302 cmd_getval(cmdmap
, "pool", poolstr
);
8303 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
8305 ss
<< "unrecognized pool '" << poolstr
<< "'";
8309 cmd_getval(cmdmap
, "var", var
);
// Work on a copy of the pool, preferring any already-pending update.
8311 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8312 if (pending_inc
.new_pools
.count(pool
))
8313 p
= pending_inc
.new_pools
[pool
];
8315 // accept val as a json string in the normal case (current
8316 // generation monitor). parse out int or float values from the
8317 // string as needed. however, if it is not a string, try to pull
8318 // out an int, in case an older monitor with an older json schema is
8319 // forwarding a request.
8321 string interr
, floaterr
;
8324 int64_t uf
= 0; // micro-f
8325 cmd_getval(cmdmap
, "val", val
);
// Variables taking SI (base-10) vs IEC (base-2) suffixed sizes are parsed
// with the matching strict cast; everything else is tried as both int and
// float (uf = f scaled to micro units).
8328 "target_max_objects"
8330 auto iec_options
= {
8332 "target_size_bytes",
8333 "compression_max_blob_size",
8334 "compression_min_blob_size",
8338 if (count(begin(si_options
), end(si_options
), var
)) {
8339 n
= strict_si_cast
<int64_t>(val
, &interr
);
8340 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
8341 n
= strict_iec_cast
<int64_t>(val
, &interr
);
8343 // parse string as both int and float; different fields use different types.
8344 n
= strict_strtoll(val
.c_str(), 10, &interr
);
8345 f
= strict_strtod(val
.c_str(), &floaterr
);
8346 uf
= llrintl(f
* (double)1000000.0);
// Tier/hit-set related variables (guard condition; the full predicate around
// this list was partially lost in extraction).
8350 (var
== "hit_set_type" || var
== "hit_set_period" ||
8351 var
== "hit_set_count" || var
== "hit_set_fpp" ||
8352 var
== "target_max_objects" || var
== "target_max_bytes" ||
8353 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
8354 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
8355 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
8356 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
8357 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
// ---- size / min_size ----
8361 if (var
== "size") {
8362 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8363 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
8366 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
8367 ss
<< "can not change the size of an erasure-coded pool";
8370 if (interr
.length()) {
8371 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8374 if (n
<= 0 || n
> 10) {
8375 ss
<< "pool size must be between 1 and 10";
// size == 1 requires both a config opt-in and --yes-i-really-mean-it.
8379 if (!g_conf().get_val
<bool>("mon_allow_pool_size_one")) {
8380 ss
<< "configuring pool size as 1 is disabled by default.";
8384 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
8385 if (!sure
) { ss
<< "WARNING: setting pool size 1 could lead to data loss "
8386 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8387 "pass the flag --yes-i-really-mean-it.";
8391 if (osdmap
.crush
->get_rule_type(p
.get_crush_rule()) != (int)p
.type
) {
8392 ss
<< "crush rule " << p
.get_crush_rule() << " type does not match pool";
8396 // only when increasing pool size
8397 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, p
.get_crush_rule(), &ss
);
// min_size is re-derived from the new size via config policy.
8403 p
.min_size
= g_conf().get_osd_pool_default_min_size(p
.size
);
8404 } else if (var
== "min_size") {
8405 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8406 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8409 if (interr
.length()) {
8410 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8414 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
8415 if (n
< 1 || n
> p
.size
) {
8416 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
// For EC pools the lower bound is k (data chunk count) rather than 1.
8420 ErasureCodeInterfaceRef erasure_code
;
8423 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
8425 k
= erasure_code
->get_data_chunk_count();
8427 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
8431 if (n
< k
|| n
> p
.size
) {
8432 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
// ---- pg_num_actual: direct pg_num adjustment (single-step decrease only) ----
8437 } else if (var
== "pg_num_actual") {
8438 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8439 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8442 if (interr
.length()) {
8443 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8446 if (n
== (int)p
.get_pg_num()) {
8449 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8450 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8451 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8452 << " (you may adjust 'mon max pool pg num' for higher values)";
8455 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8456 ss
<< "cannot adjust pg_num while initial PGs are being created";
8459 if (n
> (int)p
.get_pg_num()) {
8460 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
8461 // force pre-nautilus clients to resend their ops, since they
8462 // don't understand pg_num_pending changes form a new interval
8463 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8467 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8468 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8471 if (n
< (int)p
.get_pgp_num()) {
8472 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8475 if (n
< (int)p
.get_pg_num() - 1) {
8476 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8477 << ") - 1; only single pg decrease is currently supported";
8480 p
.set_pg_num_pending(n
);
8481 // force pre-nautilus clients to resend their ops, since they
8482 // don't understand pg_num_pending changes form a new interval
8483 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8485 // force pre-luminous clients to resend their ops, since they
8486 // don't understand that split PGs now form a new interval.
8487 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
// ---- pg_num: sets the target; mgr converges pg_num_actual/pgp_num ----
8488 } else if (var
== "pg_num") {
8489 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8490 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8493 if (interr
.length()) {
8494 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8497 if (n
== (int)p
.get_pg_num_target()) {
8500 if (n
<= 0 || static_cast<uint64_t>(n
) >
8501 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8502 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8503 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8504 << " (you may adjust 'mon max pool pg num' for higher values)";
8507 if (n
> (int)p
.get_pg_num_target()) {
8508 int r
= check_pg_num(pool
, n
, p
.get_size(), p
.get_crush_rule(), &ss
);
8513 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8514 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8515 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8519 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8520 ss
<< "nautilus OSDs are required to decrease pg_num";
// Respect any PG_NUM_MIN/PG_NUM_MAX bounds stored on the pool.
8524 int64_t pg_min
= 0, pg_max
= 0;
8525 p
.opts
.get(pool_opts_t::PG_NUM_MIN
, &pg_min
);
8526 p
.opts
.get(pool_opts_t::PG_NUM_MAX
, &pg_max
);
8527 if (pg_min
&& n
< pg_min
) {
8528 ss
<< "specified pg_num " << n
8529 << " < pg_num_min " << pg_min
;
8532 if (pg_max
&& n
> pg_max
) {
8533 ss
<< "specified pg_num " << n
8534 << " < pg_num_max " << pg_max
;
8537 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8538 // pre-nautilus osdmap format; increase pg_num directly
8539 assert(n
> (int)p
.get_pg_num());
8540 // force pre-nautilus clients to resend their ops, since they
8541 // don't understand pg_num_target changes form a new interval
8542 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8543 // force pre-luminous clients to resend their ops, since they
8544 // don't understand that split PGs now form a new interval.
8545 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8548 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8549 // make pgp_num track pg_num if it already matches. if it is set
8550 // differently, leave it different and let the user control it
8552 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8553 p
.set_pgp_num_target(n
);
8555 p
.set_pg_num_target(n
);
// ---- pgp_num_actual / pgp_num ----
8557 } else if (var
== "pgp_num_actual") {
8558 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8559 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8562 if (interr
.length()) {
8563 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8567 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8570 if (n
> (int)p
.get_pg_num()) {
8571 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8574 if (n
> (int)p
.get_pg_num_pending()) {
8575 ss
<< "specified pgp_num " << n
8576 << " > pg_num_pending " << p
.get_pg_num_pending();
8580 } else if (var
== "pgp_num") {
8581 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8582 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8585 if (interr
.length()) {
8586 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8590 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8593 if (n
> (int)p
.get_pg_num_target()) {
8594 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8597 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8598 // pre-nautilus osdmap format; increase pgp_num directly
8601 p
.set_pgp_num_target(n
);
// ---- pg_autoscale_mode / crush_rule / boolean pool flags ----
8603 } else if (var
== "pg_autoscale_mode") {
8604 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8605 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8606 ss
<< "specified invalid mode " << val
;
8609 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8610 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8613 p
.pg_autoscale_mode
= m
;
8614 } else if (var
== "crush_rule") {
8615 int id
= osdmap
.crush
->get_rule_id(val
);
8616 if (id
== -ENOENT
) {
8617 ss
<< "crush rule " << val
<< " does not exist";
8621 ss
<< cpp_strerror(id
);
8624 if (osdmap
.crush
->get_rule_type(id
) != (int)p
.get_type()) {
8625 ss
<< "crush rule " << id
<< " type does not match pool";
8629 } else if (var
== "nodelete" || var
== "nopgchange" ||
8630 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8631 var
== "noscrub" || var
== "nodeep-scrub" || var
== "bulk") {
8632 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8633 // make sure we only compare against 'n' if we didn't receive a string
8634 if (val
== "true" || (interr
.empty() && n
== 1)) {
8636 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8637 if (flag
== pg_pool_t::FLAG_NOPGCHANGE
&& p
.is_crimson()) {
8638 ss
<< "cannot clear FLAG_NOPGCHANGE on a crimson pool";
8643 ss
<< "expecting value 'true', 'false', '0', or '1'";
8646 } else if (var
== "eio") {
8647 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8649 // make sure we only compare against 'n' if we didn't receive a string
8650 if (val
== "true" || (interr
.empty() && n
== 1)) {
8652 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8655 ss
<< "expecting value 'true', 'false', '0', or '1'";
8658 } else if (var
== "hashpspool") {
8659 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8661 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8664 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8665 " this triggers large data movement,"
8666 " pass --yes-i-really-mean-it if you really do.";
8669 // make sure we only compare against 'n' if we didn't receive a string
8670 if (val
== "true" || (interr
.empty() && n
== 1)) {
8672 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8675 ss
<< "expecting value 'true', 'false', '0', or '1'";
// ---- hit-set (cache tiering) configuration ----
8678 } else if (var
== "hit_set_type") {
8680 p
.hit_set_params
= HitSet::Params();
8682 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8685 if (val
== "bloom") {
8686 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8687 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8688 p
.hit_set_params
= HitSet::Params(bsp
);
8689 } else if (val
== "explicit_hash")
8690 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8691 else if (val
== "explicit_object")
8692 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8694 ss
<< "unrecognized hit_set type '" << val
<< "'";
8698 } else if (var
== "hit_set_period") {
8699 if (interr
.length()) {
8700 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8703 ss
<< "hit_set_period should be non-negative";
8706 p
.hit_set_period
= n
;
8707 } else if (var
== "hit_set_count") {
8708 if (interr
.length()) {
8709 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8712 ss
<< "hit_set_count should be non-negative";
8715 p
.hit_set_count
= n
;
8716 } else if (var
== "hit_set_fpp") {
8717 if (floaterr
.length()) {
8718 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8720 } else if (f
< 0 || f
> 1.0) {
8721 ss
<< "hit_set_fpp should be in the range 0..1";
8724 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8725 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8728 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
8730 } else if (var
== "use_gmt_hitset") {
8731 if (val
== "true" || (interr
.empty() && n
== 1)) {
8732 p
.use_gmt_hitset
= true;
8734 ss
<< "expecting value 'true' or '1'";
// ---- EC overwrites and cache-tier targets ----
8737 } else if (var
== "allow_ec_overwrites") {
8738 if (!p
.is_erasure()) {
8739 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8743 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8744 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8745 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8748 if (val
== "true" || (interr
.empty() && n
== 1)) {
8749 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8750 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8751 ss
<< "ec overwrites cannot be disabled once enabled";
8754 ss
<< "expecting value 'true', 'false', '0', or '1'";
8757 } else if (var
== "target_max_objects") {
8758 if (interr
.length()) {
8759 ss
<< "error parsing int '" << val
<< "': " << interr
;
8762 p
.target_max_objects
= n
;
8763 } else if (var
== "target_max_bytes") {
8764 if (interr
.length()) {
8765 ss
<< "error parsing int '" << val
<< "': " << interr
;
8768 p
.target_max_bytes
= n
;
8769 } else if (var
== "cache_target_dirty_ratio") {
8770 if (floaterr
.length()) {
8771 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8774 if (f
< 0 || f
> 1.0) {
8775 ss
<< "value must be in the range 0..1";
8778 p
.cache_target_dirty_ratio_micro
= uf
;
8779 } else if (var
== "cache_target_dirty_high_ratio") {
8780 if (floaterr
.length()) {
8781 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8784 if (f
< 0 || f
> 1.0) {
8785 ss
<< "value must be in the range 0..1";
8788 p
.cache_target_dirty_high_ratio_micro
= uf
;
8789 } else if (var
== "cache_target_full_ratio") {
8790 if (floaterr
.length()) {
8791 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8794 if (f
< 0 || f
> 1.0) {
8795 ss
<< "value must be in the range 0..1";
8798 p
.cache_target_full_ratio_micro
= uf
;
8799 } else if (var
== "cache_min_flush_age") {
8800 if (interr
.length()) {
8801 ss
<< "error parsing int '" << val
<< "': " << interr
;
8804 p
.cache_min_flush_age
= n
;
8805 } else if (var
== "cache_min_evict_age") {
8806 if (interr
.length()) {
8807 ss
<< "error parsing int '" << val
<< "': " << interr
;
8810 p
.cache_min_evict_age
= n
;
8811 } else if (var
== "min_read_recency_for_promote") {
8812 if (interr
.length()) {
8813 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8816 p
.min_read_recency_for_promote
= n
;
8817 } else if (var
== "hit_set_grade_decay_rate") {
8818 if (interr
.length()) {
8819 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8822 if (n
> 100 || n
< 0) {
8823 ss
<< "value out of range,valid range is 0 - 100";
8826 p
.hit_set_grade_decay_rate
= n
;
8827 } else if (var
== "hit_set_search_last_n") {
8828 if (interr
.length()) {
8829 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8832 if (n
> p
.hit_set_count
|| n
< 0) {
8833 ss
<< "value out of range,valid range is 0 - hit_set_count";
8836 p
.hit_set_search_last_n
= n
;
8837 } else if (var
== "min_write_recency_for_promote") {
8838 if (interr
.length()) {
8839 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8842 p
.min_write_recency_for_promote
= n
;
8843 } else if (var
== "fast_read") {
8844 if (p
.is_replicated()) {
8845 ss
<< "fast read is not supported in replication pool";
8848 if (val
== "true" || (interr
.empty() && n
== 1)) {
8850 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8851 p
.fast_read
= false;
8853 ss
<< "expecting value 'true', 'false', '0', or '1'";
// ---- generic pool_opts_t options (compression, csum, dedup, sizing) ----
8856 } else if (pool_opts_t::is_opt_name(var
)) {
8857 bool unset
= val
== "unset";
8858 if (var
== "compression_mode") {
8860 auto cmode
= Compressor::get_comp_mode_type(val
);
8862 ss
<< "unrecognized compression mode '" << val
<< "'";
8866 } else if (var
== "compression_algorithm") {
8868 auto alg
= Compressor::get_comp_alg_type(val
);
8870 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8874 } else if (var
== "compression_required_ratio") {
8875 if (floaterr
.length()) {
8876 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8879 if (f
< 0 || f
> 1) {
8880 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8883 } else if (var
== "csum_type") {
8884 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8886 ss
<< "unrecognized csum_type '" << val
<< "'";
8889 //preserve csum_type numeric value
8892 } else if (var
== "compression_max_blob_size" ||
8893 var
== "compression_min_blob_size" ||
8894 var
== "csum_max_block" ||
8895 var
== "csum_min_block") {
8896 if (interr
.length()) {
8897 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8900 } else if (var
== "fingerprint_algorithm") {
8902 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8904 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8908 } else if (var
== "target_size_bytes") {
8909 if (interr
.length()) {
8910 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8913 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8914 ss
<< "must set require_osd_release to nautilus or "
8915 << "later before setting target_size_bytes";
8918 } else if (var
== "target_size_ratio") {
8920 ss
<< "target_size_ratio cannot be negative";
8923 } else if (var
== "pg_num_min") {
8924 if (interr
.length()) {
8925 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8928 if (n
> (int)p
.get_pg_num_target()) {
8929 ss
<< "specified pg_num_min " << n
8930 << " > pg_num " << p
.get_pg_num_target();
8933 } else if (var
== "pg_num_max") {
8934 if (interr
.length()) {
8935 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8938 if (n
&& n
< (int)p
.get_pg_num_target()) {
8939 ss
<< "specified pg_num_max " << n
8940 << " < pg_num " << p
.get_pg_num_target();
8943 } else if (var
== "recovery_priority") {
8944 if (interr
.length()) {
8945 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8948 if (!g_conf()->debug_allow_any_pool_priority
) {
8949 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8950 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8951 << " and " << OSD_POOL_PRIORITY_MAX
;
8955 } else if (var
== "pg_autoscale_bias") {
8956 if (f
< 0.0 || f
> 1000.0) {
8957 ss
<< "pg_autoscale_bias must be between 0 and 1000";
8960 } else if (var
== "dedup_tier") {
8961 if (interr
.empty()) {
8962 ss
<< "expecting value 'pool name'";
8965 // Current base tier in dedup does not support ec pool
8966 if (p
.is_erasure()) {
8967 ss
<< "pool '" << poolstr
8968 << "' is an ec pool, which cannot be a base tier";
8971 int64_t lowtierpool_id
= osdmap
.lookup_pg_pool_name(val
);
8972 if (lowtierpool_id
< 0) {
8973 ss
<< "unrecognized pool '" << val
<< "'";
8976 const pg_pool_t
*tp
= osdmap
.get_pg_pool(lowtierpool_id
);
8979 // The original input is string (pool name), but we convert it to int64_t.
8982 } else if (var
== "dedup_chunk_algorithm") {
8984 auto alg
= pg_pool_t::get_dedup_chunk_algorithm_from_str(val
);
8986 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8990 } else if (var
== "dedup_cdc_chunk_size") {
8991 if (interr
.length()) {
8992 ss
<< "error parsing int value '" << val
<< "': " << interr
;
// Store the option into p.opts with the type declared by its descriptor;
// the literal "unset" removes it.
8997 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8998 switch (desc
.type
) {
8999 case pool_opts_t::STR
:
9001 p
.opts
.unset(desc
.key
);
9003 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
9006 case pool_opts_t::INT
:
9007 if (interr
.length()) {
9008 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
9012 p
.opts
.unset(desc
.key
);
9014 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
9017 case pool_opts_t::DOUBLE
:
9018 if (floaterr
.length()) {
9019 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
9023 p
.opts
.unset(desc
.key
);
9025 p
.opts
.set(desc
.key
, static_cast<double>(f
));
9029 ceph_assert(!"unknown type");
9032 ss
<< "unrecognized variable '" << var
<< "'";
9035 if (val
!= "unset") {
9036 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
9038 ss
<< "unset pool " << pool
<< " " << var
;
// Commit the modified pool into the pending increment.
9040 p
.last_change
= pending_inc
.epoch
;
9041 pending_inc
.new_pools
[pool
] = p
;
// NOTE(review): extraction-mangled span; code kept byte-identical.
//
// Prepare-phase entry for pool application tag commands: delegates to
// _command_pool_application with preparing=true (no 'modified' out-param).
9045 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
9046 const cmdmap_t
& cmdmap
,
9049 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
// NOTE(review): extraction-mangled span; code kept byte-identical.
//
// Preprocess-phase entry for pool application tag commands: delegates to
// _command_pool_application with preparing=false, reporting via 'modified'
// whether the command would change state.
9052 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
9053 const cmdmap_t
& cmdmap
,
9057 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
9062 * Common logic for preprocess and prepare phases of pool application
9063 * tag commands. In preprocess mode we're only detecting invalid
9064 * commands, and determining whether it was a modification or a no-op.
9065 * In prepare mode we're actually updating the pending state.
9067 int OSDMonitor::_command_pool_application(const string
&prefix
,
9068 const cmdmap_t
& cmdmap
,
9074 cmd_getval(cmdmap
, "pool", pool_name
);
9075 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
9077 ss
<< "unrecognized pool '" << pool_name
<< "'";
9081 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
9083 if (pending_inc
.new_pools
.count(pool
)) {
9084 p
= pending_inc
.new_pools
[pool
];
9089 cmd_getval(cmdmap
, "app", app
);
9090 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
9093 cmd_getval(cmdmap
, "key", key
);
9095 ss
<< "key cannot be 'all'";
9100 cmd_getval(cmdmap
, "value", value
);
9101 if (value
== "all") {
9102 ss
<< "value cannot be 'all'";
9106 if (boost::algorithm::ends_with(prefix
, "enable")) {
9108 ss
<< "application name must be provided";
9113 ss
<< "application must be enabled on base tier";
9118 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
9120 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
9121 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
9122 << "application; pass --yes-i-really-mean-it to proceed anyway";
9126 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
9127 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
9128 << "max " << MAX_POOL_APPLICATIONS
;
9132 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9133 ss
<< "application name '" << app
<< "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH
;
9139 p
.application_metadata
[app
] = {};
9141 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
9143 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
9145 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
9148 ss
<< "Are you SURE? Disabling an application within a pool might result "
9149 << "in loss of application functionality; pass "
9150 << "--yes-i-really-mean-it to proceed anyway";
9155 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9157 return 0; // idempotent
9160 p
.application_metadata
.erase(app
);
9161 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
9163 } else if (boost::algorithm::ends_with(prefix
, "set")) {
9165 ss
<< "application metadata must be set on base tier";
9170 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9176 cmd_getval(cmdmap
, "key", key
);
9179 ss
<< "key must be provided";
9183 auto &app_keys
= p
.application_metadata
[app
];
9184 if (app_keys
.count(key
) == 0 &&
9185 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
9186 ss
<< "too many keys set for application '" << app
<< "' on pool '"
9187 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
9191 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9192 ss
<< "key '" << app
<< "' too long; max length "
9193 << MAX_POOL_APPLICATION_LENGTH
;
9198 cmd_getval(cmdmap
, "value", value
);
9199 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9200 ss
<< "value '" << value
<< "' too long; max length "
9201 << MAX_POOL_APPLICATION_LENGTH
;
9205 p
.application_metadata
[app
][key
] = value
;
9206 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
9207 << value
<< "' on pool '" << pool_name
<< "'";
9208 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
9210 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9216 cmd_getval(cmdmap
, "key", key
);
9217 auto it
= p
.application_metadata
[app
].find(key
);
9218 if (it
== p
.application_metadata
[app
].end()) {
9219 ss
<< "application '" << app
<< "' on pool '" << pool_name
9220 << "' does not have key '" << key
<< "'";
9221 return 0; // idempotent
9224 p
.application_metadata
[app
].erase(it
);
9225 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
9226 << pool_name
<< "'";
9232 p
.last_change
= pending_inc
.epoch
;
9233 pending_inc
.new_pools
[pool
] = p
;
9236 // Because we fell through this far, we didn't hit no-op cases,
9237 // so pool was definitely modified
9238 if (modified
!= nullptr) {
9245 int OSDMonitor::_prepare_command_osd_crush_remove(
9246 CrushWrapper
&newcrush
,
9255 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
9258 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
9263 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
9265 pending_inc
.crush
.clear();
9266 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9269 int OSDMonitor::prepare_command_osd_crush_remove(
9270 CrushWrapper
&newcrush
,
9276 int err
= _prepare_command_osd_crush_remove(
9277 newcrush
, id
, ancestor
,
9278 has_ancestor
, unlink_only
);
9283 ceph_assert(err
== 0);
9284 do_osd_crush_remove(newcrush
);
9289 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
9291 if (osdmap
.is_up(id
)) {
9295 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
9296 pending_inc
.new_uuid
[id
] = uuid_d();
9297 pending_metadata_rm
.insert(id
);
9298 pending_metadata
.erase(id
);
9303 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
9305 ceph_assert(existing_id
);
9308 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
9309 if (!osdmap
.exists(i
) &&
9310 pending_inc
.new_up_client
.count(i
) == 0 &&
9311 (pending_inc
.new_state
.count(i
) == 0 ||
9312 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
9318 if (pending_inc
.new_max_osd
< 0) {
9319 return osdmap
.get_max_osd();
9321 return pending_inc
.new_max_osd
;
9324 void OSDMonitor::do_osd_create(
9327 const string
& device_class
,
9330 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
9331 ceph_assert(new_id
);
9333 // We presume validation has been performed prior to calling this
9334 // function. We assert with prejudice.
9336 int32_t allocated_id
= -1; // declare here so we can jump
9337 int32_t existing_id
= -1;
9338 if (!uuid
.is_zero()) {
9339 existing_id
= osdmap
.identify_osd(uuid
);
9340 if (existing_id
>= 0) {
9341 ceph_assert(id
< 0 || id
== existing_id
);
9342 *new_id
= existing_id
;
9344 } else if (id
>= 0) {
9345 // uuid does not exist, and id has been provided, so just create
9352 // allocate a new id
9353 allocated_id
= _allocate_osd_id(&existing_id
);
9354 dout(10) << __func__
<< " allocated id " << allocated_id
9355 << " existing id " << existing_id
<< dendl
;
9356 if (existing_id
>= 0) {
9357 ceph_assert(existing_id
< osdmap
.get_max_osd());
9358 ceph_assert(allocated_id
< 0);
9359 *new_id
= existing_id
;
9360 } else if (allocated_id
>= 0) {
9361 ceph_assert(existing_id
< 0);
9363 if (pending_inc
.new_max_osd
< 0) {
9364 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
9366 ++pending_inc
.new_max_osd
;
9368 *new_id
= pending_inc
.new_max_osd
- 1;
9369 ceph_assert(*new_id
== allocated_id
);
9371 ceph_abort_msg("unexpected condition");
9375 if (device_class
.size()) {
9376 CrushWrapper newcrush
= _get_pending_crush();
9377 if (newcrush
.get_max_devices() < *new_id
+ 1) {
9378 newcrush
.set_max_devices(*new_id
+ 1);
9380 string name
= string("osd.") + stringify(*new_id
);
9381 if (!newcrush
.item_exists(*new_id
)) {
9382 newcrush
.set_item_name(*new_id
, name
);
9385 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
9387 derr
<< __func__
<< " failed to set " << name
<< " device_class "
9388 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
9390 // non-fatal... this might be a replay and we want to be idempotent.
9392 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
9394 pending_inc
.crush
.clear();
9395 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9398 dout(20) << __func__
<< " no device_class" << dendl
;
9401 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
9402 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
9403 pending_inc
.new_max_osd
= *new_id
+ 1;
9406 pending_inc
.new_weight
[*new_id
] = CEPH_OSD_IN
;
9407 // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
9408 // set it for us. (ugh.)
9409 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_NEW
;
9410 if (!uuid
.is_zero())
9411 pending_inc
.new_uuid
[*new_id
] = uuid
;
9414 int OSDMonitor::validate_osd_create(
9417 const bool check_osd_exists
,
9418 int32_t* existing_id
,
9422 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
9423 << " check_osd_exists " << check_osd_exists
<< dendl
;
9425 ceph_assert(existing_id
);
9427 if (id
< 0 && uuid
.is_zero()) {
9428 // we have nothing to validate
9431 } else if (uuid
.is_zero()) {
9432 // we have an id but we will ignore it - because that's what
9433 // `osd create` does.
9438 * This function will be used to validate whether we are able to
9439 * create a new osd when the `uuid` is specified.
9441 * It will be used by both `osd create` and `osd new`, as the checks
9442 * are basically the same when it pertains to osd id and uuid validation.
9443 * However, `osd create` presumes an `uuid` is optional, for legacy
9444 * reasons, while `osd new` requires the `uuid` to be provided. This
9445 * means that `osd create` will not be idempotent if an `uuid` is not
9446 * provided, but we will always guarantee the idempotency of `osd new`.
9449 ceph_assert(!uuid
.is_zero());
9450 if (pending_inc
.identify_osd(uuid
) >= 0) {
9451 // osd is about to exist
9455 int32_t i
= osdmap
.identify_osd(uuid
);
9457 // osd already exists
9458 if (id
>= 0 && i
!= id
) {
9459 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
9462 // return a positive errno to distinguish between a blocking error
9463 // and an error we consider to not be a problem (i.e., this would be
9464 // an idempotent operation).
9470 if (pending_inc
.new_state
.count(id
)) {
9471 // osd is about to exist
9474 // we may not care if an osd exists if we are recreating a previously
9476 if (check_osd_exists
&& osdmap
.exists(id
)) {
9477 ss
<< "id " << id
<< " already in use and does not match uuid "
9485 int OSDMonitor::prepare_command_osd_create(
9488 int32_t* existing_id
,
9491 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9492 ceph_assert(existing_id
);
9493 if (osdmap
.is_destroyed(id
)) {
9494 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
9499 if (uuid
.is_zero()) {
9500 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
9503 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
9506 int OSDMonitor::prepare_command_osd_new(
9508 const cmdmap_t
& cmdmap
,
9509 const map
<string
,string
>& params
,
9517 ceph_assert(paxos
.is_plugged());
9519 dout(10) << __func__
<< " " << op
<< dendl
;
9521 /* validate command. abort now if something's wrong. */
9523 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9525 * If `id` is not specified, we will identify any existing osd based
9526 * on `uuid`. Operation will be idempotent iff secrets match.
9528 * If `id` is specified, we will identify any existing osd based on
9529 * `uuid` and match against `id`. If they match, operation will be
9530 * idempotent iff secrets match.
9532 * `-i secrets.json` will be optional. If supplied, will be used
9533 * to check for idempotency when `id` and `uuid` match.
9535 * If `id` is not specified, and `uuid` does not exist, an id will
9536 * be found or allocated for the osd.
9538 * If `id` is specified, and the osd has been previously marked
9539 * as destroyed, then the `id` will be reused.
9541 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
9542 ss
<< "requires the OSD's UUID to be specified.";
9544 } else if (!uuid
.parse(uuidstr
.c_str())) {
9545 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9549 if (cmd_getval(cmdmap
, "id", id
) &&
9551 ss
<< "invalid OSD id; must be greater or equal than zero.";
9555 // are we running an `osd create`-like command, or recreating
9556 // a previously destroyed osd?
9558 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9560 // we will care about `id` to assess whether osd is `destroyed`, or
9561 // to create a new osd.
9562 // we will need an `id` by the time we reach auth.
9564 int32_t existing_id
= -1;
9565 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9568 bool may_be_idempotent
= false;
9569 if (err
== EEXIST
) {
9570 // this is idempotent from the osdmon's point-of-view
9571 may_be_idempotent
= true;
9572 ceph_assert(existing_id
>= 0);
9574 } else if (err
< 0) {
9578 if (!may_be_idempotent
) {
9579 // idempotency is out of the window. We are either creating a new
9580 // osd or recreating a destroyed osd.
9582 // We now need to figure out if we have an `id` (and if it's valid),
9583 // of find an `id` if we don't have one.
9585 // NOTE: we need to consider the case where the `id` is specified for
9586 // `osd create`, and we must honor it. So this means checking if
9587 // the `id` is destroyed, and if so assume the destroy; otherwise,
9588 // check if it `exists` - in which case we complain about not being
9589 // `destroyed`. In the end, if nothing fails, we must allow the
9590 // creation, so that we are compatible with `create`.
9591 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9592 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9593 ss
<< "OSD " << id
<< " has not yet been destroyed";
9595 } else if (id
< 0) {
9597 id
= _allocate_osd_id(&existing_id
);
9599 ceph_assert(existing_id
>= 0);
9602 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9603 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9604 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9606 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9609 ceph_assert(id
>= 0);
9610 ceph_assert(osdmap
.exists(id
));
9613 // we are now able to either create a brand new osd or reuse an existing
9614 // osd that has been previously destroyed.
9616 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9618 if (may_be_idempotent
&& params
.empty()) {
9619 // nothing to do, really.
9620 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9621 ceph_assert(id
>= 0);
9623 f
->open_object_section("created_osd");
9624 f
->dump_int("osdid", id
);
9632 string device_class
;
9633 auto p
= params
.find("crush_device_class");
9634 if (p
!= params
.end()) {
9635 device_class
= p
->second
;
9636 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9638 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9639 bool has_lockbox
= false;
9640 bool has_secrets
= params
.count("cephx_secret")
9641 || params
.count("cephx_lockbox_secret")
9642 || params
.count("dmcrypt_key");
9644 KVMonitor
*svc
= nullptr;
9645 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9648 if (params
.count("cephx_secret") == 0) {
9649 ss
<< "requires a cephx secret.";
9652 cephx_secret
= params
.at("cephx_secret");
9654 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9655 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9657 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9658 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9660 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9662 lockbox_secret
= params
.at("cephx_lockbox_secret");
9663 dmcrypt_key
= params
.at("dmcrypt_key");
9664 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9665 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9669 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9671 err
= mon
.authmon()->validate_osd_new(id
, uuid
,
9679 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9680 // for this to be idempotent, `id` should already be >= 0; no need
9681 // to use validate_id.
9682 ceph_assert(id
>= 0);
9683 ss
<< "osd." << id
<< " exists but secrets do not match";
9689 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9692 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9693 ceph_assert(id
>= 0);
9694 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9699 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9700 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9702 if (may_be_idempotent
) {
9703 // we have nothing to do for either the osdmon or the authmon,
9704 // and we have no lockbox - so the config key service will not be
9705 // touched. This is therefore an idempotent operation, and we can
9706 // just return right away.
9707 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9708 ceph_assert(id
>= 0);
9710 f
->open_object_section("created_osd");
9711 f
->dump_int("osdid", id
);
9718 ceph_assert(!may_be_idempotent
);
9722 ceph_assert(!cephx_secret
.empty());
9723 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9724 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9726 err
= mon
.authmon()->do_osd_new(cephx_entity
,
9729 ceph_assert(0 == err
);
9732 ceph_assert(nullptr != svc
);
9733 svc
->do_osd_new(uuid
, dmcrypt_key
);
9737 if (is_recreate_destroyed
) {
9738 ceph_assert(id
>= 0);
9739 ceph_assert(osdmap
.is_destroyed(id
));
9740 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9741 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9742 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9744 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9745 // due to http://tracker.ceph.com/issues/20751 some clusters may
9746 // have UP set for non-existent OSDs; make sure it is cleared
9747 // for a newly created osd.
9748 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9750 pending_inc
.new_uuid
[id
] = uuid
;
9752 ceph_assert(id
>= 0);
9753 int32_t new_id
= -1;
9754 do_osd_create(id
, uuid
, device_class
, &new_id
);
9755 ceph_assert(new_id
>= 0);
9756 ceph_assert(id
== new_id
);
9760 f
->open_object_section("created_osd");
9761 f
->dump_int("osdid", id
);
9770 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9772 op
->mark_osdmon_event(__func__
);
9773 auto m
= op
->get_req
<MMonCommand
>();
9776 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9777 string rs
= ss
.str();
9778 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
9782 MonSession
*session
= op
->get_session();
9784 derr
<< __func__
<< " no session" << dendl
;
9785 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
9789 return prepare_command_impl(op
, cmdmap
);
9792 static int parse_reweights(CephContext
*cct
,
9793 const cmdmap_t
& cmdmap
,
9794 const OSDMap
& osdmap
,
9795 map
<int32_t, uint32_t>* weights
)
9798 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9801 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9802 json_spirit::mValue json_value
;
9803 if (!json_spirit::read(weights_str
, json_value
)) {
9806 if (json_value
.type() != json_spirit::obj_type
) {
9809 const auto obj
= json_value
.get_obj();
9811 for (auto& osd_weight
: obj
) {
9812 auto osd_id
= std::stoi(osd_weight
.first
);
9813 if (!osdmap
.exists(osd_id
)) {
9816 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9819 auto weight
= std::stoul(osd_weight
.second
.get_str());
9820 weights
->insert({osd_id
, weight
});
9822 } catch (const std::logic_error
& e
) {
9828 int OSDMonitor::prepare_command_osd_destroy(
9832 ceph_assert(paxos
.is_plugged());
9834 // we check if the osd exists for the benefit of `osd purge`, which may
9835 // have previously removed the osd. If the osd does not exist, return
9836 // -ENOENT to convey this, and let the caller deal with it.
9838 // we presume that all auth secrets and config keys were removed prior
9839 // to this command being called. if they exist by now, we also assume
9840 // they must have been created by some other command and do not pertain
9841 // to this non-existent osd.
9842 if (!osdmap
.exists(id
)) {
9843 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9847 uuid_d uuid
= osdmap
.get_uuid(id
);
9848 dout(10) << __func__
<< " destroying osd." << id
9849 << " uuid " << uuid
<< dendl
;
9851 // if it has been destroyed, we assume our work here is done.
9852 if (osdmap
.is_destroyed(id
)) {
9853 ss
<< "destroyed osd." << id
;
9857 EntityName cephx_entity
, lockbox_entity
;
9858 bool idempotent_auth
= false, idempotent_cks
= false;
9860 int err
= mon
.authmon()->validate_osd_destroy(id
, uuid
,
9865 if (err
== -ENOENT
) {
9866 idempotent_auth
= true;
9872 auto svc
= mon
.kvmon();
9873 err
= svc
->validate_osd_destroy(id
, uuid
);
9875 ceph_assert(err
== -ENOENT
);
9877 idempotent_cks
= true;
9880 if (!idempotent_auth
) {
9881 err
= mon
.authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9882 ceph_assert(0 == err
);
9885 if (!idempotent_cks
) {
9886 svc
->do_osd_destroy(id
, uuid
);
9889 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9890 pending_inc
.new_uuid
[id
] = uuid_d();
9892 // we can only propose_pending() once per service, otherwise we'll be
9893 // defying PaxosService and all laws of nature. Therefore, as we may
9894 // be used during 'osd purge', let's keep the caller responsible for
9896 ceph_assert(err
== 0);
9900 int OSDMonitor::prepare_command_osd_purge(
9904 ceph_assert(paxos
.is_plugged());
9905 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9907 ceph_assert(!osdmap
.is_up(id
));
9910 * This may look a bit weird, but this is what's going to happen:
9912 * 1. we make sure that removing from crush works
9913 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9914 * error, then we abort the whole operation, as no updates
9915 * have been made. However, we this function will have
9916 * side-effects, thus we need to make sure that all operations
9917 * performed henceforth will *always* succeed.
9918 * 3. we call `prepare_command_osd_remove()`. Although this
9919 * function can return an error, it currently only checks if the
9920 * osd is up - and we have made sure that it is not so, so there
9921 * is no conflict, and it is effectively an update.
9922 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9923 * the crush update we delayed from before.
9926 CrushWrapper newcrush
= _get_pending_crush();
9928 bool may_be_idempotent
= false;
9930 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9931 if (err
== -ENOENT
) {
9933 may_be_idempotent
= true;
9934 } else if (err
< 0) {
9935 ss
<< "error removing osd." << id
<< " from crush";
9939 // no point destroying the osd again if it has already been marked destroyed
9940 if (!osdmap
.is_destroyed(id
)) {
9941 err
= prepare_command_osd_destroy(id
, ss
);
9943 if (err
== -ENOENT
) {
9949 may_be_idempotent
= false;
9952 ceph_assert(0 == err
);
9954 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9955 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9956 << "we are idempotent." << dendl
;
9960 err
= prepare_command_osd_remove(id
);
9961 // we should not be busy, as we should have made sure this id is not up.
9962 ceph_assert(0 == err
);
9964 do_osd_crush_remove(newcrush
);
9968 int OSDMonitor::parse_pgid(const cmdmap_t
& cmdmap
, stringstream
&ss
,
9969 /* out */ pg_t
&pgid
, std::optional
<string
> pgids
) {
9971 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
9972 ss
<< "unable to parse 'pgid' value '"
9973 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
9976 if (!pgid
.parse(pgidstr
.c_str())) {
9977 ss
<< "invalid pgid '" << pgidstr
<< "'";
9980 if (!osdmap
.pg_exists(pgid
)) {
9981 ss
<< "pgid '" << pgid
<< "' does not exist";
9984 if (pgids
.has_value())
9985 pgids
.value() = pgidstr
;
9989 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9990 const cmdmap_t
& cmdmap
)
9992 op
->mark_osdmon_event(__func__
);
9993 auto m
= op
->get_req
<MMonCommand
>();
10000 string format
= cmd_getval_or
<string
>(cmdmap
, "format", "plain");
10001 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
10004 cmd_getval(cmdmap
, "prefix", prefix
);
10008 bool osdid_present
= false;
10009 if (prefix
!= "osd pg-temp" &&
10010 prefix
!= "osd pg-upmap" &&
10011 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
10012 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
10014 if (osdid_present
) {
10016 oss
<< "osd." << osdid
;
10017 osd_name
= oss
.str();
10020 // Even if there's a pending state with changes that could affect
10021 // a command, considering that said state isn't yet committed, we
10022 // just don't care about those changes if the command currently being
10023 // handled acts as a no-op against the current committed state.
10024 // In a nutshell, we assume this command happens *before*.
10026 // Let me make this clearer:
10028 // - If we have only one client, and that client issues some
10029 // operation that would conflict with this operation but is
10030 // still on the pending state, then we would be sure that said
10031 // operation wouldn't have returned yet, so the client wouldn't
10032 // issue this operation (unless the client didn't wait for the
10033 // operation to finish, and that would be the client's own fault).
10035 // - If we have more than one client, each client will observe
10036 // whatever is the state at the moment of the commit. So, if we
10037 // have two clients, one issuing an unlink and another issuing a
10038 // link, and if the link happens while the unlink is still on the
10039 // pending state, from the link's point-of-view this is a no-op.
10040 // If different clients are issuing conflicting operations and
10041 // they care about that, then the clients should make sure they
10042 // enforce some kind of concurrency mechanism -- from our
10043 // perspective that's what Douglas Adams would call an SEP.
10045 // This should be used as a general guideline for most commands handled
10046 // in this function. Adapt as you see fit, but please bear in mind that
10047 // this is the expected behavior.
10050 if (prefix
== "osd setcrushmap" ||
10051 (prefix
== "osd crush set" && !osdid_present
)) {
10052 if (pending_inc
.crush
.length()) {
10053 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
10054 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
10057 dout(10) << "prepare_command setting new crush map" << dendl
;
10058 bufferlist
data(m
->get_data());
10059 CrushWrapper crush
;
10061 auto bl
= data
.cbegin();
10064 catch (const std::exception
&e
) {
10066 ss
<< "Failed to parse crushmap: " << e
.what();
10070 int64_t prior_version
= 0;
10071 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
10072 if (prior_version
== osdmap
.get_crush_version() - 1) {
10073 // see if we are a resend of the last update. this is imperfect
10074 // (multiple racing updaters may not both get reliable success)
10075 // but we expect crush updaters (via this interface) to be rare-ish.
10076 bufferlist current
, proposed
;
10077 osdmap
.crush
->encode(current
, mon
.get_quorum_con_features());
10078 crush
.encode(proposed
, mon
.get_quorum_con_features());
10079 if (current
.contents_equal(proposed
)) {
10080 dout(10) << __func__
10081 << " proposed matches current and version equals previous"
10084 ss
<< osdmap
.get_crush_version();
10088 if (prior_version
!= osdmap
.get_crush_version()) {
10090 ss
<< "prior_version " << prior_version
<< " != crush version "
10091 << osdmap
.get_crush_version();
10096 if (!validate_crush_against_features(&crush
, ss
)) {
10101 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
10106 if (g_conf()->mon_osd_crush_smoke_test
) {
10107 // sanity check: test some inputs to make sure this map isn't
10109 dout(10) << " testing map" << dendl
;
10111 CrushTester
tester(crush
, ess
);
10112 tester
.set_min_x(0);
10113 tester
.set_max_x(50);
10114 tester
.set_num_rep(3); // arbitrary
10115 auto start
= ceph::coarse_mono_clock::now();
10116 int r
= tester
.test_with_fork(cct
, g_conf()->mon_lease
);
10117 auto duration
= ceph::coarse_mono_clock::now() - start
;
10119 dout(10) << " tester.test_with_fork returns " << r
10120 << ": " << ess
.str() << dendl
;
10121 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
10125 dout(10) << __func__
<< " crush somke test duration: "
10126 << duration
<< ", result: " << ess
.str() << dendl
;
10129 pending_inc
.crush
= data
;
10130 ss
<< osdmap
.get_crush_version() + 1;
10133 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
10134 CrushWrapper newcrush
= _get_pending_crush();
10135 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
10137 if (newcrush
.bucket_exists(bid
) &&
10138 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
10139 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
10140 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
10143 if (!validate_crush_against_features(&newcrush
, ss
)) {
10147 pending_inc
.crush
.clear();
10148 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10149 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10150 get_last_committed() + 1));
10152 } else if (prefix
== "osd crush set-device-class") {
10153 string device_class
;
10154 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10155 err
= -EINVAL
; // no value!
10160 vector
<string
> idvec
;
10161 cmd_getval(cmdmap
, "ids", idvec
);
10162 CrushWrapper newcrush
= _get_pending_crush();
10164 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10168 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10169 osdmap
.get_all_osds(osds
);
10172 // try traditional single osd way
10173 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10175 // ss has reason for failure
10176 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
10183 for (auto &osd
: osds
) {
10184 if (!osdmap
.exists(osd
)) {
10185 ss
<< "osd." << osd
<< " does not exist. ";
10190 oss
<< "osd." << osd
;
10191 string name
= oss
.str();
10193 if (newcrush
.get_max_devices() < osd
+ 1) {
10194 newcrush
.set_max_devices(osd
+ 1);
10197 if (newcrush
.item_exists(osd
)) {
10198 action
= "updating";
10200 action
= "creating";
10201 newcrush
.set_item_name(osd
, name
);
10204 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
10205 << "' device_class '" << device_class
<< "'"
10207 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
10211 if (err
== 0 && !_have_pending_crush()) {
10213 // for single osd only, wildcard makes too much noise
10214 ss
<< "set-device-class item id " << osd
<< " name '" << name
10215 << "' device_class '" << device_class
<< "': no change. ";
10218 updated
.insert(osd
);
10223 pending_inc
.crush
.clear();
10224 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10225 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
10227 wait_for_finished_proposal(
10229 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10231 } else if (prefix
== "osd crush rm-device-class") {
10233 vector
<string
> idvec
;
10234 cmd_getval(cmdmap
, "ids", idvec
);
10235 CrushWrapper newcrush
= _get_pending_crush();
10238 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10243 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10244 osdmap
.get_all_osds(osds
);
10247 // try traditional single osd way
10248 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10250 // ss has reason for failure
10251 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
10258 for (auto &osd
: osds
) {
10259 if (!osdmap
.exists(osd
)) {
10260 ss
<< "osd." << osd
<< " does not exist. ";
10264 auto class_name
= newcrush
.get_item_class(osd
);
10266 ss
<< "osd." << osd
<< " belongs to no class, ";
10269 // note that we do not verify if class_is_in_use here
10270 // in case the device is misclassified and user wants
10271 // to overridely reset...
10273 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
10275 // ss has reason for failure
10278 updated
.insert(osd
);
10282 pending_inc
.crush
.clear();
10283 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10284 ss
<< "done removing class of osd(s): " << updated
;
10286 wait_for_finished_proposal(
10288 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10290 } else if (prefix
== "osd crush class create") {
10291 string device_class
;
10292 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10293 err
= -EINVAL
; // no value!
10296 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10297 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10298 << "luminous' before using crush device classes";
10302 if (!_have_pending_crush() &&
10303 _get_stable_crush().class_exists(device_class
)) {
10304 ss
<< "class '" << device_class
<< "' already exists";
10307 CrushWrapper newcrush
= _get_pending_crush();
10308 if (newcrush
.class_exists(device_class
)) {
10309 ss
<< "class '" << device_class
<< "' already exists";
10312 int class_id
= newcrush
.get_or_create_class_id(device_class
);
10313 pending_inc
.crush
.clear();
10314 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10315 ss
<< "created class " << device_class
<< " with id " << class_id
10316 << " to crush map";
10318 } else if (prefix
== "osd crush class rm") {
10319 string device_class
;
10320 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10321 err
= -EINVAL
; // no value!
10324 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10325 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10326 << "luminous' before using crush device classes";
10331 if (!osdmap
.crush
->class_exists(device_class
)) {
10336 CrushWrapper newcrush
= _get_pending_crush();
10337 if (!newcrush
.class_exists(device_class
)) {
10338 err
= 0; // make command idempotent
10341 int class_id
= newcrush
.get_class_id(device_class
);
10343 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
10345 ss
<< "class '" << device_class
<< "' " << ts
.str();
10349 // check if class is used by any erasure-code-profiles
10350 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
10351 osdmap
.get_erasure_code_profiles();
10352 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
10353 #ifdef HAVE_STDLIB_MAP_SPLICING
10354 ec_profiles
.merge(old_ec_profiles
);
10356 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
10357 make_move_iterator(end(old_ec_profiles
)));
10359 list
<string
> referenced_by
;
10360 for (auto &i
: ec_profiles
) {
10361 for (auto &j
: i
.second
) {
10362 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
10363 referenced_by
.push_back(i
.first
);
10367 if (!referenced_by
.empty()) {
10369 ss
<< "class '" << device_class
10370 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
10375 newcrush
.get_devices_by_class(device_class
, &osds
);
10376 for (auto& p
: osds
) {
10377 err
= newcrush
.remove_device_class(cct
, p
, &ss
);
10379 // ss has reason for failure
10384 if (osds
.empty()) {
10385 // empty class, remove directly
10386 err
= newcrush
.remove_class_name(device_class
);
10388 ss
<< "class '" << device_class
<< "' cannot be removed '"
10389 << cpp_strerror(err
) << "'";
10394 pending_inc
.crush
.clear();
10395 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10396 ss
<< "removed class " << device_class
<< " with id " << class_id
10397 << " from crush map";
10399 } else if (prefix
== "osd crush class rename") {
10400 string srcname
, dstname
;
10401 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
10405 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
10410 CrushWrapper newcrush
= _get_pending_crush();
10411 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
10412 // suppose this is a replay and return success
10413 // so command is idempotent
10414 ss
<< "already renamed to '" << dstname
<< "'";
10419 err
= newcrush
.rename_class(srcname
, dstname
);
10421 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
10422 << cpp_strerror(err
);
10426 pending_inc
.crush
.clear();
10427 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10428 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
10430 } else if (prefix
== "osd crush add-bucket") {
10431 // os crush add-bucket <name> <type>
10432 string name
, typestr
;
10433 vector
<string
> argvec
;
10434 cmd_getval(cmdmap
, "name", name
);
10435 cmd_getval(cmdmap
, "type", typestr
);
10436 cmd_getval(cmdmap
, "args", argvec
);
10437 map
<string
,string
> loc
;
10438 if (!argvec
.empty()) {
10439 CrushWrapper::parse_loc_map(argvec
, &loc
);
10440 dout(0) << "will create and move bucket '" << name
10441 << "' to location " << loc
<< dendl
;
10444 if (!_have_pending_crush() &&
10445 _get_stable_crush().name_exists(name
)) {
10446 ss
<< "bucket '" << name
<< "' already exists";
10450 CrushWrapper newcrush
= _get_pending_crush();
10452 if (newcrush
.name_exists(name
)) {
10453 ss
<< "bucket '" << name
<< "' already exists";
10456 int type
= newcrush
.get_type_id(typestr
);
10458 ss
<< "type '" << typestr
<< "' does not exist";
10463 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
10468 err
= newcrush
.add_bucket(0, 0,
10469 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
10472 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
10475 err
= newcrush
.set_item_name(bucketno
, name
);
10477 ss
<< "error setting bucket name to '" << name
<< "'";
10481 if (!loc
.empty()) {
10482 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
10484 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
10486 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
10490 ss
<< "no need to move item id " << bucketno
<< " name '" << name
10491 << "' to location " << loc
<< " in crush map";
10495 pending_inc
.crush
.clear();
10496 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10498 ss
<< "added bucket " << name
<< " type " << typestr
10499 << " to crush map";
10501 ss
<< "added bucket " << name
<< " type " << typestr
10502 << " to location " << loc
;
10505 } else if (prefix
== "osd crush rename-bucket") {
10506 string srcname
, dstname
;
10507 cmd_getval(cmdmap
, "srcname", srcname
);
10508 cmd_getval(cmdmap
, "dstname", dstname
);
10510 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
10511 if (err
== -EALREADY
) // equivalent to success for idempotency
10517 } else if (prefix
== "osd crush weight-set create" ||
10518 prefix
== "osd crush weight-set create-compat") {
10519 if (_have_pending_crush()) {
10520 dout(10) << " first waiting for pending crush changes to commit" << dendl
;
10523 CrushWrapper newcrush
= _get_pending_crush();
10526 if (newcrush
.has_non_straw2_buckets()) {
10527 ss
<< "crush map contains one or more bucket(s) that are not straw2";
10531 if (prefix
== "osd crush weight-set create") {
10532 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
10533 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
10534 ss
<< "require_min_compat_client "
10535 << osdmap
.require_min_compat_client
10536 << " < luminous, which is required for per-pool weight-sets. "
10537 << "Try 'ceph osd set-require-min-compat-client luminous' "
10538 << "before using the new interface";
10542 string poolname
, mode
;
10543 cmd_getval(cmdmap
, "pool", poolname
);
10544 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10546 ss
<< "pool '" << poolname
<< "' not found";
10550 cmd_getval(cmdmap
, "mode", mode
);
10551 if (mode
!= "flat" && mode
!= "positional") {
10552 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10556 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10558 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10561 if (!newcrush
.create_choose_args(pool
, positions
)) {
10562 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10563 ss
<< "compat weight-set already created";
10565 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10566 << "' already created";
10570 pending_inc
.crush
.clear();
10571 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10574 } else if (prefix
== "osd crush weight-set rm" ||
10575 prefix
== "osd crush weight-set rm-compat") {
10576 CrushWrapper newcrush
= _get_pending_crush();
10578 if (prefix
== "osd crush weight-set rm") {
10580 cmd_getval(cmdmap
, "pool", poolname
);
10581 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10583 ss
<< "pool '" << poolname
<< "' not found";
10588 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10590 newcrush
.rm_choose_args(pool
);
10591 pending_inc
.crush
.clear();
10592 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10595 } else if (prefix
== "osd crush weight-set reweight" ||
10596 prefix
== "osd crush weight-set reweight-compat") {
10597 string poolname
, item
;
10598 vector
<double> weight
;
10599 cmd_getval(cmdmap
, "pool", poolname
);
10600 cmd_getval(cmdmap
, "item", item
);
10601 cmd_getval(cmdmap
, "weight", weight
);
10602 CrushWrapper newcrush
= _get_pending_crush();
10604 if (prefix
== "osd crush weight-set reweight") {
10605 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10607 ss
<< "pool '" << poolname
<< "' not found";
10611 if (!newcrush
.have_choose_args(pool
)) {
10612 ss
<< "no weight-set for pool '" << poolname
<< "'";
10616 auto arg_map
= newcrush
.choose_args_get(pool
);
10617 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10618 if (weight
.size() != (size_t)positions
) {
10619 ss
<< "must specify exact " << positions
<< " weight values";
10624 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10625 if (!newcrush
.have_choose_args(pool
)) {
10626 ss
<< "no backward-compatible weight-set";
10631 if (!newcrush
.name_exists(item
)) {
10632 ss
<< "item '" << item
<< "' does not exist";
10636 err
= newcrush
.choose_args_adjust_item_weightf(
10638 newcrush
.choose_args_get(pool
),
10639 newcrush
.get_item_id(item
),
10646 pending_inc
.crush
.clear();
10647 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10649 } else if (osdid_present
&&
10650 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10651 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10652 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10653 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10655 if (!osdmap
.exists(osdid
)) {
10658 << " does not exist. Create it before updating the crush map";
10663 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10664 ss
<< "unable to parse weight value '"
10665 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10671 vector
<string
> argvec
;
10672 cmd_getval(cmdmap
, "args", argvec
);
10673 map
<string
,string
> loc
;
10674 CrushWrapper::parse_loc_map(argvec
, &loc
);
10676 if (prefix
== "osd crush set"
10677 && !_get_stable_crush().item_exists(osdid
)) {
10679 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10680 << "' weight " << weight
<< " at location " << loc
10681 << ": does not exist";
10685 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10686 << osd_name
<< "' weight " << weight
<< " at location "
10688 CrushWrapper newcrush
= _get_pending_crush();
10691 if (prefix
== "osd crush set" ||
10692 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10694 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10697 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10705 if (err
== 0 && !_have_pending_crush()) {
10706 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10707 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10711 pending_inc
.crush
.clear();
10712 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10713 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10714 << weight
<< " at location " << loc
<< " to crush map";
10716 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10717 get_last_committed() + 1));
10720 } else if (prefix
== "osd crush create-or-move") {
10722 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10723 if (!osdmap
.exists(osdid
)) {
10726 << " does not exist. create it before updating the crush map";
10731 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10732 ss
<< "unable to parse weight value '"
10733 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10739 vector
<string
> argvec
;
10740 cmd_getval(cmdmap
, "args", argvec
);
10741 map
<string
,string
> loc
;
10742 CrushWrapper::parse_loc_map(argvec
, &loc
);
10744 dout(0) << "create-or-move crush item name '" << osd_name
10745 << "' initial_weight " << weight
<< " at location " << loc
10748 CrushWrapper newcrush
= _get_pending_crush();
10750 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10751 g_conf()->osd_crush_update_weight_set
);
10753 ss
<< "create-or-move updated item name '" << osd_name
10754 << "' weight " << weight
10755 << " at location " << loc
<< " to crush map";
10759 pending_inc
.crush
.clear();
10760 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10761 ss
<< "create-or-move updating item name '" << osd_name
10762 << "' weight " << weight
10763 << " at location " << loc
<< " to crush map";
10765 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10766 get_last_committed() + 1));
10771 } else if (prefix
== "osd crush move") {
10773 // osd crush move <name> <loc1> [<loc2> ...]
10775 vector
<string
> argvec
;
10776 cmd_getval(cmdmap
, "name", name
);
10777 cmd_getval(cmdmap
, "args", argvec
);
10778 map
<string
,string
> loc
;
10779 CrushWrapper::parse_loc_map(argvec
, &loc
);
10781 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10782 CrushWrapper newcrush
= _get_pending_crush();
10784 if (!newcrush
.name_exists(name
)) {
10786 ss
<< "item " << name
<< " does not exist";
10789 int id
= newcrush
.get_item_id(name
);
10791 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10793 err
= newcrush
.create_or_move_item(
10794 cct
, id
, 0, name
, loc
,
10795 g_conf()->osd_crush_update_weight_set
);
10797 err
= newcrush
.move_bucket(cct
, id
, loc
);
10800 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10801 pending_inc
.crush
.clear();
10802 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10804 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10805 get_last_committed() + 1));
10809 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10813 } else if (prefix
== "osd crush swap-bucket") {
10814 string source
, dest
;
10815 cmd_getval(cmdmap
, "source", source
);
10816 cmd_getval(cmdmap
, "dest", dest
);
10818 bool force
= false;
10819 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10821 CrushWrapper newcrush
= _get_pending_crush();
10822 if (!newcrush
.name_exists(source
)) {
10823 ss
<< "source item " << source
<< " does not exist";
10827 if (!newcrush
.name_exists(dest
)) {
10828 ss
<< "dest item " << dest
<< " does not exist";
10832 int sid
= newcrush
.get_item_id(source
);
10833 int did
= newcrush
.get_item_id(dest
);
10835 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10836 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10840 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10842 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10843 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10844 << "; pass --yes-i-really-mean-it to proceed anyway";
10848 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10850 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10854 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10855 pending_inc
.crush
.clear();
10856 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10857 wait_for_finished_proposal(op
,
10858 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10859 get_last_committed() + 1));
10861 } else if (prefix
== "osd crush link") {
10862 // osd crush link <name> <loc1> [<loc2> ...]
10864 cmd_getval(cmdmap
, "name", name
);
10865 vector
<string
> argvec
;
10866 cmd_getval(cmdmap
, "args", argvec
);
10867 map
<string
,string
> loc
;
10868 CrushWrapper::parse_loc_map(argvec
, &loc
);
10870 // Need an explicit check for name_exists because get_item_id returns
10872 int id
= osdmap
.crush
->get_item_id(name
);
10873 if (!osdmap
.crush
->name_exists(name
)) {
10875 ss
<< "item " << name
<< " does not exist";
10878 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10880 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10881 ss
<< "no need to move item id " << id
<< " name '" << name
10882 << "' to location " << loc
<< " in crush map";
10887 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10888 CrushWrapper newcrush
= _get_pending_crush();
10890 if (!newcrush
.name_exists(name
)) {
10892 ss
<< "item " << name
<< " does not exist";
10895 int id
= newcrush
.get_item_id(name
);
10896 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10897 err
= newcrush
.link_bucket(cct
, id
, loc
);
10899 ss
<< "linked item id " << id
<< " name '" << name
10900 << "' to location " << loc
<< " in crush map";
10901 pending_inc
.crush
.clear();
10902 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10904 ss
<< "cannot link item id " << id
<< " name '" << name
10905 << "' to location " << loc
;
10909 ss
<< "no need to move item id " << id
<< " name '" << name
10910 << "' to location " << loc
<< " in crush map";
10914 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10915 get_last_committed() + 1));
10917 } else if (prefix
== "osd crush rm" ||
10918 prefix
== "osd crush remove" ||
10919 prefix
== "osd crush unlink") {
10921 // osd crush rm <id> [ancestor]
10922 CrushWrapper newcrush
= _get_pending_crush();
10925 cmd_getval(cmdmap
, "name", name
);
10927 if (!osdmap
.crush
->name_exists(name
)) {
10929 ss
<< "device '" << name
<< "' does not appear in the crush map";
10932 if (!newcrush
.name_exists(name
)) {
10934 ss
<< "device '" << name
<< "' does not appear in the crush map";
10936 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10937 get_last_committed() + 1));
10940 int id
= newcrush
.get_item_id(name
);
10943 bool unlink_only
= prefix
== "osd crush unlink";
10944 string ancestor_str
;
10945 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10946 if (!newcrush
.name_exists(ancestor_str
)) {
10948 ss
<< "ancestor item '" << ancestor_str
10949 << "' does not appear in the crush map";
10952 ancestor
= newcrush
.get_item_id(ancestor_str
);
10955 err
= prepare_command_osd_crush_remove(
10958 (ancestor
< 0), unlink_only
);
10960 if (err
== -ENOENT
) {
10961 ss
<< "item " << id
<< " does not appear in that position";
10967 pending_inc
.new_crush_node_flags
[id
] = 0;
10968 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10970 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10971 get_last_committed() + 1));
10976 } else if (prefix
== "osd crush reweight-all") {
10977 CrushWrapper newcrush
= _get_pending_crush();
10979 newcrush
.reweight(cct
);
10980 pending_inc
.crush
.clear();
10981 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10982 ss
<< "reweighted crush hierarchy";
10984 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10985 get_last_committed() + 1));
10987 } else if (prefix
== "osd crush reweight") {
10988 // osd crush reweight <name> <weight>
10989 CrushWrapper newcrush
= _get_pending_crush();
10992 cmd_getval(cmdmap
, "name", name
);
10993 if (!newcrush
.name_exists(name
)) {
10995 ss
<< "device '" << name
<< "' does not appear in the crush map";
10999 int id
= newcrush
.get_item_id(name
);
11001 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
11006 if (!cmd_getval(cmdmap
, "weight", w
)) {
11007 ss
<< "unable to parse weight value '"
11008 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11013 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
11014 g_conf()->osd_crush_update_weight_set
);
11017 pending_inc
.crush
.clear();
11018 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11019 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
11020 << " in crush map";
11022 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11023 get_last_committed() + 1));
11025 } else if (prefix
== "osd crush reweight-subtree") {
11026 // osd crush reweight <name> <weight>
11027 CrushWrapper newcrush
= _get_pending_crush();
11030 cmd_getval(cmdmap
, "name", name
);
11031 if (!newcrush
.name_exists(name
)) {
11033 ss
<< "device '" << name
<< "' does not appear in the crush map";
11037 int id
= newcrush
.get_item_id(name
);
11039 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
11044 if (!cmd_getval(cmdmap
, "weight", w
)) {
11045 ss
<< "unable to parse weight value '"
11046 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11051 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
11052 g_conf()->osd_crush_update_weight_set
);
11055 pending_inc
.crush
.clear();
11056 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11057 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
11058 << " in crush map";
11060 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11061 get_last_committed() + 1));
11063 } else if (prefix
== "osd crush tunables") {
11064 CrushWrapper newcrush
= _get_pending_crush();
11068 cmd_getval(cmdmap
, "profile", profile
);
11069 if (profile
== "legacy" || profile
== "argonaut") {
11070 newcrush
.set_tunables_legacy();
11071 } else if (profile
== "bobtail") {
11072 newcrush
.set_tunables_bobtail();
11073 } else if (profile
== "firefly") {
11074 newcrush
.set_tunables_firefly();
11075 } else if (profile
== "hammer") {
11076 newcrush
.set_tunables_hammer();
11077 } else if (profile
== "jewel") {
11078 newcrush
.set_tunables_jewel();
11079 } else if (profile
== "optimal") {
11080 newcrush
.set_tunables_optimal();
11081 } else if (profile
== "default") {
11082 newcrush
.set_tunables_default();
11084 ss
<< "unrecognized profile '" << profile
<< "'";
11089 if (!validate_crush_against_features(&newcrush
, ss
)) {
11094 pending_inc
.crush
.clear();
11095 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11096 ss
<< "adjusted tunables profile to " << profile
;
11098 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11099 get_last_committed() + 1));
11101 } else if (prefix
== "osd crush set-tunable") {
11102 CrushWrapper newcrush
= _get_pending_crush();
11106 cmd_getval(cmdmap
, "tunable", tunable
);
11108 int64_t value
= -1;
11109 if (!cmd_getval(cmdmap
, "value", value
)) {
11111 ss
<< "failed to parse integer value "
11112 << cmd_vartype_stringify(cmdmap
.at("value"));
11116 if (tunable
== "straw_calc_version") {
11117 if (value
!= 0 && value
!= 1) {
11118 ss
<< "value must be 0 or 1; got " << value
;
11122 newcrush
.set_straw_calc_version(value
);
11124 ss
<< "unrecognized tunable '" << tunable
<< "'";
11129 if (!validate_crush_against_features(&newcrush
, ss
)) {
11134 pending_inc
.crush
.clear();
11135 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11136 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
11138 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11139 get_last_committed() + 1));
11142 } else if (prefix
== "osd crush rule create-simple") {
11143 string name
, root
, type
, mode
;
11144 cmd_getval(cmdmap
, "name", name
);
11145 cmd_getval(cmdmap
, "root", root
);
11146 cmd_getval(cmdmap
, "type", type
);
11147 cmd_getval(cmdmap
, "mode", mode
);
11151 if (osdmap
.crush
->rule_exists(name
)) {
11152 // The name is uniquely associated to a ruleid and the rule it contains
11153 // From the user point of view, the rule is more meaningfull.
11154 ss
<< "rule " << name
<< " already exists";
11159 CrushWrapper newcrush
= _get_pending_crush();
11161 if (newcrush
.rule_exists(name
)) {
11162 // The name is uniquely associated to a ruleid and the rule it contains
11163 // From the user point of view, the rule is more meaningfull.
11164 ss
<< "rule " << name
<< " already exists";
11167 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
11168 pg_pool_t::TYPE_REPLICATED
, &ss
);
11174 pending_inc
.crush
.clear();
11175 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11178 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11179 get_last_committed() + 1));
11182 } else if (prefix
== "osd crush rule create-replicated") {
11183 string name
, root
, type
, device_class
;
11184 cmd_getval(cmdmap
, "name", name
);
11185 cmd_getval(cmdmap
, "root", root
);
11186 cmd_getval(cmdmap
, "type", type
);
11187 cmd_getval(cmdmap
, "class", device_class
);
11189 if (osdmap
.crush
->rule_exists(name
)) {
11190 // The name is uniquely associated to a ruleid and the rule it contains
11191 // From the user point of view, the rule is more meaningfull.
11192 ss
<< "rule " << name
<< " already exists";
11197 CrushWrapper newcrush
= _get_pending_crush();
11199 if (newcrush
.rule_exists(name
)) {
11200 // The name is uniquely associated to a ruleid and the rule it contains
11201 // From the user point of view, the rule is more meaningfull.
11202 ss
<< "rule " << name
<< " already exists";
11205 int ruleno
= newcrush
.add_simple_rule(
11206 name
, root
, type
, device_class
,
11207 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
11213 pending_inc
.crush
.clear();
11214 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11217 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11218 get_last_committed() + 1));
11221 } else if (prefix
== "osd erasure-code-profile rm") {
11223 cmd_getval(cmdmap
, "name", name
);
11225 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
11228 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
11233 if (osdmap
.has_erasure_code_profile(name
) ||
11234 pending_inc
.new_erasure_code_profiles
.count(name
)) {
11235 if (osdmap
.has_erasure_code_profile(name
)) {
11236 pending_inc
.old_erasure_code_profiles
.push_back(name
);
11238 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
11239 pending_inc
.new_erasure_code_profiles
.erase(name
);
11243 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11244 get_last_committed() + 1));
11247 ss
<< "erasure-code-profile " << name
<< " does not exist";
11252 } else if (prefix
== "osd erasure-code-profile set") {
11254 cmd_getval(cmdmap
, "name", name
);
11255 vector
<string
> profile
;
11256 cmd_getval(cmdmap
, "profile", profile
);
11258 bool force
= false;
11259 cmd_getval(cmdmap
, "force", force
);
11261 map
<string
,string
> profile_map
;
11262 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
11265 if (auto found
= profile_map
.find("crush-failure-domain");
11266 found
!= profile_map
.end()) {
11267 const auto& failure_domain
= found
->second
;
11268 int failure_domain_type
= osdmap
.crush
->get_type_id(failure_domain
);
11269 if (failure_domain_type
< 0) {
11270 ss
<< "erasure-code-profile " << profile_map
11271 << " contains an invalid failure-domain " << std::quoted(failure_domain
);
11277 if (profile_map
.find("plugin") == profile_map
.end()) {
11278 ss
<< "erasure-code-profile " << profile_map
11279 << " must contain a plugin entry" << std::endl
;
11283 string plugin
= profile_map
["plugin"];
11285 if (pending_inc
.has_erasure_code_profile(name
)) {
11286 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
11289 err
= normalize_profile(name
, profile_map
, force
, &ss
);
11293 if (osdmap
.has_erasure_code_profile(name
)) {
11294 ErasureCodeProfile existing_profile_map
=
11295 osdmap
.get_erasure_code_profile(name
);
11296 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
11300 if (existing_profile_map
== profile_map
) {
11306 ss
<< "will not override erasure code profile " << name
11307 << " because the existing profile "
11308 << existing_profile_map
11309 << " is different from the proposed profile "
11315 dout(20) << "erasure code profile set " << name
<< "="
11316 << profile_map
<< dendl
;
11317 pending_inc
.set_erasure_code_profile(name
, profile_map
);
11321 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11322 get_last_committed() + 1));
11325 } else if (prefix
== "osd crush rule create-erasure") {
11326 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
11327 if (err
== -EAGAIN
)
11331 string name
, poolstr
;
11332 cmd_getval(cmdmap
, "name", name
);
11334 cmd_getval(cmdmap
, "profile", profile
);
11336 profile
= "default";
11337 if (profile
== "default") {
11338 if (!osdmap
.has_erasure_code_profile(profile
)) {
11339 if (pending_inc
.has_erasure_code_profile(profile
)) {
11340 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
11344 map
<string
,string
> profile_map
;
11345 err
= osdmap
.get_erasure_code_profile_default(cct
,
11350 err
= normalize_profile(name
, profile_map
, true, &ss
);
11353 dout(20) << "erasure code profile set " << profile
<< "="
11354 << profile_map
<< dendl
;
11355 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
11361 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
11364 case -EEXIST
: // return immediately
11365 ss
<< "rule " << name
<< " already exists";
11369 case -EALREADY
: // wait for pending to be proposed
11370 ss
<< "rule " << name
<< " already exists";
11373 default: // non recoverable error
11378 ss
<< "created rule " << name
<< " at " << rule
;
11382 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11383 get_last_committed() + 1));
11386 } else if (prefix
== "osd crush rule rm") {
11388 cmd_getval(cmdmap
, "name", name
);
11390 if (!osdmap
.crush
->rule_exists(name
)) {
11391 ss
<< "rule " << name
<< " does not exist";
11396 CrushWrapper newcrush
= _get_pending_crush();
11398 if (!newcrush
.rule_exists(name
)) {
11399 ss
<< "rule " << name
<< " does not exist";
11402 int ruleno
= newcrush
.get_rule_id(name
);
11403 ceph_assert(ruleno
>= 0);
11405 // make sure it is not in use.
11406 // FIXME: this is ok in some situations, but let's not bother with that
11408 if (osdmap
.crush_rule_in_use(ruleno
)) {
11409 ss
<< "crush rule " << name
<< " (" << ruleno
<< ") is in use";
11414 err
= newcrush
.remove_rule(ruleno
);
11419 pending_inc
.crush
.clear();
11420 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11423 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11424 get_last_committed() + 1));
11427 } else if (prefix
== "osd crush rule rename") {
11430 cmd_getval(cmdmap
, "srcname", srcname
);
11431 cmd_getval(cmdmap
, "dstname", dstname
);
11432 if (srcname
.empty() || dstname
.empty()) {
11433 ss
<< "must specify both source rule name and destination rule name";
11437 if (srcname
== dstname
) {
11438 ss
<< "destination rule name is equal to source rule name";
11443 CrushWrapper newcrush
= _get_pending_crush();
11444 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
11445 // srcname does not exist and dstname already exists
11446 // suppose this is a replay and return success
11447 // (so this command is idempotent)
11448 ss
<< "already renamed to '" << dstname
<< "'";
11453 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
11455 // ss has reason for failure
11458 pending_inc
.crush
.clear();
11459 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11461 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11462 get_last_committed() + 1));
11465 } else if (prefix
== "osd setmaxosd") {
11467 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
11468 ss
<< "unable to parse 'newmax' value '"
11469 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
11474 if (newmax
> g_conf()->mon_max_osd
) {
11476 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
11477 << g_conf()->mon_max_osd
<< ")";
11481 // Don't allow shrinking OSD number as this will cause data loss
11482 // and may cause kernel crashes.
11483 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11484 if (newmax
< osdmap
.get_max_osd()) {
11485 // Check if the OSDs exist between current max and new value.
11486 // If there are any OSDs exist, then don't allow shrinking number
11488 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
11489 if (osdmap
.exists(i
)) {
11491 ss
<< "cannot shrink max_osd to " << newmax
11492 << " because osd." << i
<< " (and possibly others) still in use";
11498 pending_inc
.new_max_osd
= newmax
;
11499 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
11501 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11502 get_last_committed() + 1));
11505 } else if (prefix
== "osd set-full-ratio" ||
11506 prefix
== "osd set-backfillfull-ratio" ||
11507 prefix
== "osd set-nearfull-ratio") {
11509 if (!cmd_getval(cmdmap
, "ratio", n
)) {
11510 ss
<< "unable to parse 'ratio' value '"
11511 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
11515 if (prefix
== "osd set-full-ratio")
11516 pending_inc
.new_full_ratio
= n
;
11517 else if (prefix
== "osd set-backfillfull-ratio")
11518 pending_inc
.new_backfillfull_ratio
= n
;
11519 else if (prefix
== "osd set-nearfull-ratio")
11520 pending_inc
.new_nearfull_ratio
= n
;
11521 ss
<< prefix
<< " " << n
;
11523 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11524 get_last_committed() + 1));
11526 } else if (prefix
== "osd set-require-min-compat-client") {
11528 cmd_getval(cmdmap
, "version", v
);
11529 ceph_release_t vno
= ceph_release_from_name(v
);
11531 ss
<< "version " << v
<< " is not recognized";
11536 newmap
.deepish_copy_from(osdmap
);
11537 newmap
.apply_incremental(pending_inc
);
11538 newmap
.require_min_compat_client
= vno
;
11539 auto mvno
= newmap
.get_min_compat_client();
11541 ss
<< "osdmap current utilizes features that require " << mvno
11542 << "; cannot set require_min_compat_client below that to " << vno
;
11547 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11550 mon
.get_combined_feature_map(&m
);
11551 uint64_t features
= ceph_release_features(to_integer
<int>(vno
));
11555 CEPH_ENTITY_TYPE_CLIENT
,
11556 CEPH_ENTITY_TYPE_MDS
,
11557 CEPH_ENTITY_TYPE_MGR
}) {
11558 auto p
= m
.m
.find(type
);
11559 if (p
== m
.m
.end()) {
11562 for (auto& q
: p
->second
) {
11563 uint64_t missing
= ~q
.first
& features
;
11566 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11571 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11572 << "(s) look like " << ceph_release_name(
11573 ceph_release_from_features(q
.first
))
11574 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11580 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11585 ss
<< "set require_min_compat_client to " << vno
;
11586 pending_inc
.new_require_min_compat_client
= vno
;
11588 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11589 get_last_committed() + 1));
11591 } else if (prefix
== "osd pause") {
11592 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11594 } else if (prefix
== "osd unpause") {
11595 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11597 } else if (prefix
== "osd set") {
11599 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11602 cmd_getval(cmdmap
, "key", key
);
11603 if (key
== "pause")
11604 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11605 else if (key
== "noup")
11606 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11607 else if (key
== "nodown")
11608 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11609 else if (key
== "noout")
11610 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11611 else if (key
== "noin")
11612 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11613 else if (key
== "nobackfill")
11614 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11615 else if (key
== "norebalance")
11616 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11617 else if (key
== "norecover")
11618 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11619 else if (key
== "noscrub")
11620 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11621 else if (key
== "nodeep-scrub")
11622 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11623 else if (key
== "notieragent")
11624 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11625 else if (key
== "nosnaptrim")
11626 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11627 else if (key
== "pglog_hardlimit") {
11628 if (!osdmap
.get_num_up_osds() && !sure
) {
11629 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11630 << "--yes-i-really-mean-it if you really wish to continue.";
11634 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11635 // we are reusing a jewel feature bit that was retired in luminous.
11636 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11637 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11639 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11641 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11646 ss
<< "unrecognized flag '" << key
<< "'";
11650 } else if (prefix
== "osd unset") {
11652 cmd_getval(cmdmap
, "key", key
);
11653 if (key
== "pause")
11654 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11655 else if (key
== "noup")
11656 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11657 else if (key
== "nodown")
11658 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11659 else if (key
== "noout")
11660 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11661 else if (key
== "noin")
11662 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11663 else if (key
== "nobackfill")
11664 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11665 else if (key
== "norebalance")
11666 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11667 else if (key
== "norecover")
11668 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11669 else if (key
== "noscrub")
11670 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11671 else if (key
== "nodeep-scrub")
11672 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11673 else if (key
== "notieragent")
11674 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11675 else if (key
== "nosnaptrim")
11676 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11678 ss
<< "unrecognized flag '" << key
<< "'";
11682 } else if (prefix
== "osd require-osd-release") {
11684 cmd_getval(cmdmap
, "release", release
);
11686 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11687 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11689 ss
<< "unrecognized release " << release
;
11693 if (rel
== osdmap
.require_osd_release
) {
11698 if (osdmap
.require_osd_release
< ceph_release_t::pacific
&& !sure
) {
11699 ss
<< "Not advisable to continue since current 'require_osd_release' "
11700 << "refers to a very old Ceph release. Pass "
11701 << "--yes-i-really-mean-it if you really wish to continue.";
11705 if (!osdmap
.get_num_up_osds() && !sure
) {
11706 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11707 << "--yes-i-really-mean-it if you really wish to continue.";
11711 if (rel
== ceph_release_t::pacific
) {
11712 if (!mon
.monmap
->get_required_features().contains_all(
11713 ceph::features::mon::FEATURE_PACIFIC
)) {
11714 ss
<< "not all mons are pacific";
11718 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_PACIFIC
))
11720 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11724 } else if (rel
== ceph_release_t::quincy
) {
11725 if (!mon
.monmap
->get_required_features().contains_all(
11726 ceph::features::mon::FEATURE_QUINCY
)) {
11727 ss
<< "not all mons are quincy";
11731 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_QUINCY
))
11733 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11737 } else if (rel
== ceph_release_t::reef
) {
11738 if (!mon
.monmap
->get_required_features().contains_all(
11739 ceph::features::mon::FEATURE_REEF
)) {
11740 ss
<< "not all mons are reef";
11744 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_REEF
))
11746 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_REEF feature";
11751 ss
<< "not supported for this release";
11755 if (rel
< osdmap
.require_osd_release
) {
11756 ss
<< "require_osd_release cannot be lowered once it has been set";
11760 pending_inc
.new_require_osd_release
= rel
;
11762 } else if (prefix
== "osd down" ||
11763 prefix
== "osd out" ||
11764 prefix
== "osd in" ||
11765 prefix
== "osd rm" ||
11766 prefix
== "osd stop") {
11770 bool verbose
= true;
11771 bool definitely_dead
= false;
11773 vector
<string
> idvec
;
11774 cmd_getval(cmdmap
, "ids", idvec
);
11775 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11776 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11777 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11782 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11783 if (prefix
== "osd in") {
11784 // touch out osds only
11785 osdmap
.get_out_existing_osds(osds
);
11787 osdmap
.get_all_osds(osds
);
11790 verbose
= false; // so the output is less noisy.
11792 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11794 ss
<< "invalid osd id" << osd
;
11797 } else if (!osdmap
.exists(osd
)) {
11798 ss
<< "osd." << osd
<< " does not exist. ";
11805 for (auto &osd
: osds
) {
11806 if (prefix
== "osd down") {
11807 if (osdmap
.is_down(osd
)) {
11809 ss
<< "osd." << osd
<< " is already down. ";
11811 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11812 ss
<< "marked down osd." << osd
<< ". ";
11815 if (definitely_dead
) {
11816 if (!pending_inc
.new_xinfo
.count(osd
)) {
11817 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11819 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11822 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11824 } else if (prefix
== "osd out") {
11825 if (osdmap
.is_out(osd
)) {
11827 ss
<< "osd." << osd
<< " is already out. ";
11829 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11830 if (osdmap
.osd_weight
[osd
]) {
11831 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11832 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11834 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11836 ss
<< "marked out osd." << osd
<< ". ";
11837 std::ostringstream msg
;
11838 msg
<< "Client " << op
->get_session()->entity_name
11839 << " marked osd." << osd
<< " out";
11840 if (osdmap
.is_up(osd
)) {
11841 msg
<< ", while it was still marked up";
11843 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11844 msg
<< ", after it was down for " << int(period
.sec())
11848 mon
.clog
->info() << msg
.str();
11851 } else if (prefix
== "osd in") {
11852 if (osdmap
.is_in(osd
)) {
11854 ss
<< "osd." << osd
<< " is already in. ";
11856 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11857 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11858 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11859 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11861 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11863 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11865 ss
<< "marked in osd." << osd
<< ". ";
11868 } else if (prefix
== "osd rm") {
11869 err
= prepare_command_osd_remove(osd
);
11871 if (err
== -EBUSY
) {
11874 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11876 ceph_assert(err
== 0);
11878 ss
<< ", osd." << osd
;
11880 ss
<< "removed osd." << osd
;
11884 } else if (prefix
== "osd stop") {
11885 if (osdmap
.is_stop(osd
)) {
11887 ss
<< "osd." << osd
<< " is already stopped. ";
11888 } else if (osdmap
.is_down(osd
)) {
11889 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11890 ss
<< "stop down osd." << osd
<< ". ";
11893 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11894 ss
<< "stop osd." << osd
<< ". ";
11902 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11903 get_last_committed() + 1));
11906 } else if (prefix
== "osd set-group" ||
11907 prefix
== "osd unset-group" ||
11908 prefix
== "osd add-noup" ||
11909 prefix
== "osd add-nodown" ||
11910 prefix
== "osd add-noin" ||
11911 prefix
== "osd add-noout" ||
11912 prefix
== "osd rm-noup" ||
11913 prefix
== "osd rm-nodown" ||
11914 prefix
== "osd rm-noin" ||
11915 prefix
== "osd rm-noout") {
11916 bool do_set
= prefix
== "osd set-group" ||
11917 prefix
.find("add") != string::npos
;
11919 unsigned flags
= 0;
11920 vector
<string
> who
;
11921 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11922 cmd_getval(cmdmap
, "flags", flag_str
);
11923 cmd_getval(cmdmap
, "who", who
);
11924 vector
<string
> raw_flags
;
11925 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11926 for (auto& f
: raw_flags
) {
11928 flags
|= CEPH_OSD_NOUP
;
11929 else if (f
== "nodown")
11930 flags
|= CEPH_OSD_NODOWN
;
11931 else if (f
== "noin")
11932 flags
|= CEPH_OSD_NOIN
;
11933 else if (f
== "noout")
11934 flags
|= CEPH_OSD_NOOUT
;
11936 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11937 << "{noup,nodown,noin,noout}";
11943 cmd_getval(cmdmap
, "ids", who
);
11944 if (prefix
.find("noup") != string::npos
)
11945 flags
= CEPH_OSD_NOUP
;
11946 else if (prefix
.find("nodown") != string::npos
)
11947 flags
= CEPH_OSD_NODOWN
;
11948 else if (prefix
.find("noin") != string::npos
)
11949 flags
= CEPH_OSD_NOIN
;
11950 else if (prefix
.find("noout") != string::npos
)
11951 flags
= CEPH_OSD_NOOUT
;
11953 ceph_assert(0 == "Unreachable!");
11956 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11961 ss
<< "must specify at least one or more targets to set/unset";
11966 set
<int> crush_nodes
;
11967 set
<int> device_classes
;
11968 for (auto& w
: who
) {
11969 if (w
== "any" || w
== "all" || w
== "*") {
11970 osdmap
.get_all_osds(osds
);
11973 std::stringstream ts
;
11974 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11976 } else if (osdmap
.crush
->name_exists(w
)) {
11977 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11978 } else if (osdmap
.crush
->class_exists(w
)) {
11979 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11981 ss
<< "unable to parse osd id or crush node or device class: "
11982 << "\"" << w
<< "\". ";
11985 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11986 // ss has reason for failure
11991 for (auto osd
: osds
) {
11992 if (!osdmap
.exists(osd
)) {
11993 ss
<< "osd." << osd
<< " does not exist. ";
11997 if (flags
& CEPH_OSD_NOUP
) {
11998 any
|= osdmap
.is_noup_by_osd(osd
) ?
11999 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
12000 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
12002 if (flags
& CEPH_OSD_NODOWN
) {
12003 any
|= osdmap
.is_nodown_by_osd(osd
) ?
12004 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
12005 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
12007 if (flags
& CEPH_OSD_NOIN
) {
12008 any
|= osdmap
.is_noin_by_osd(osd
) ?
12009 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
12010 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
12012 if (flags
& CEPH_OSD_NOOUT
) {
12013 any
|= osdmap
.is_noout_by_osd(osd
) ?
12014 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
12015 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
12018 if (flags
& CEPH_OSD_NOUP
) {
12019 any
|= osdmap
.is_noup_by_osd(osd
) ?
12020 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
12021 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
12023 if (flags
& CEPH_OSD_NODOWN
) {
12024 any
|= osdmap
.is_nodown_by_osd(osd
) ?
12025 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
12026 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
12028 if (flags
& CEPH_OSD_NOIN
) {
12029 any
|= osdmap
.is_noin_by_osd(osd
) ?
12030 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
12031 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
12033 if (flags
& CEPH_OSD_NOOUT
) {
12034 any
|= osdmap
.is_noout_by_osd(osd
) ?
12035 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
12036 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
12040 for (auto& id
: crush_nodes
) {
12041 auto old_flags
= osdmap
.get_crush_node_flags(id
);
12042 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
12043 pending_flags
|= old_flags
; // adopt existing flags first!
12045 pending_flags
|= flags
;
12047 pending_flags
&= ~flags
;
12051 for (auto& id
: device_classes
) {
12052 auto old_flags
= osdmap
.get_device_class_flags(id
);
12053 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
12054 pending_flags
|= old_flags
;
12056 pending_flags
|= flags
;
12058 pending_flags
&= ~flags
;
12064 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
12065 get_last_committed() + 1));
12068 } else if (prefix
== "osd pg-temp") {
12070 err
= parse_pgid(cmdmap
, ss
, pgid
);
12073 if (pending_inc
.new_pg_temp
.count(pgid
)) {
12074 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
12075 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12079 vector
<int64_t> id_vec
;
12080 vector
<int32_t> new_pg_temp
;
12081 cmd_getval(cmdmap
, "id", id_vec
);
12082 if (id_vec
.empty()) {
12083 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
12084 ss
<< "done cleaning up pg_temp of " << pgid
;
12087 for (auto osd
: id_vec
) {
12088 if (!osdmap
.exists(osd
)) {
12089 ss
<< "osd." << osd
<< " does not exist";
12093 new_pg_temp
.push_back(osd
);
12096 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
12097 if ((int)new_pg_temp
.size() < pool_min_size
) {
12098 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
12099 << pool_min_size
<< ")";
12104 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12105 if ((int)new_pg_temp
.size() > pool_size
) {
12106 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
12107 << pool_size
<< ")";
12112 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
12113 new_pg_temp
.begin(), new_pg_temp
.end());
12114 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
12116 } else if (prefix
== "osd primary-temp" ||
12117 prefix
== "osd rm-primary-temp") {
12119 err
= parse_pgid(cmdmap
, ss
, pgid
);
12124 if (prefix
== "osd primary-temp") {
12125 if (!cmd_getval(cmdmap
, "id", osd
)) {
12126 ss
<< "unable to parse 'id' value '"
12127 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12131 if (!osdmap
.exists(osd
)) {
12132 ss
<< "osd." << osd
<< " does not exist";
12137 else if (prefix
== "osd rm-primary-temp") {
12141 ceph_assert(0 == "Unreachable!");
12144 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12145 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12146 ss
<< "require_min_compat_client "
12147 << osdmap
.require_min_compat_client
12148 << " < firefly, which is required for primary-temp";
12153 pending_inc
.new_primary_temp
[pgid
] = osd
;
12154 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
12156 } else if (prefix
== "pg repeer") {
12158 err
= parse_pgid(cmdmap
, ss
, pgid
);
12161 vector
<int> acting
;
12163 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
12166 ss
<< "pg currently has no primary";
12169 if (acting
.size() > 1) {
12170 // map to just primary; it will map back to what it wants
12171 pending_inc
.new_pg_temp
[pgid
] = { primary
};
12173 // hmm, pick another arbitrary osd to induce a change. Note
12174 // that this won't work if there is only one suitable OSD in the cluster.
12177 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
12178 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
12181 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
12187 ss
<< "not enough up OSDs in the cluster to force repeer";
12192 } else if (prefix
== "osd pg-upmap" ||
12193 prefix
== "osd rm-pg-upmap" ||
12194 prefix
== "osd pg-upmap-items" ||
12195 prefix
== "osd rm-pg-upmap-items" ||
12196 prefix
== "osd pg-upmap-primary" ||
12197 prefix
== "osd rm-pg-upmap-primary") {
12202 OP_RM_PG_UPMAP_ITEMS
,
12203 OP_PG_UPMAP_PRIMARY
,
12204 OP_RM_PG_UPMAP_PRIMARY
,
12207 if (prefix
== "osd pg-upmap") {
12208 upmap_option
= OP_PG_UPMAP
;
12209 } else if (prefix
== "osd rm-pg-upmap") {
12210 upmap_option
= OP_RM_PG_UPMAP
;
12211 } else if (prefix
== "osd pg-upmap-items") {
12212 upmap_option
= OP_PG_UPMAP_ITEMS
;
12213 } else if (prefix
== "osd rm-pg-upmap-items") {
12214 upmap_option
= OP_RM_PG_UPMAP_ITEMS
;
12215 } else if (prefix
== "osd pg-upmap-primary") {
12216 upmap_option
= OP_PG_UPMAP_PRIMARY
;
12217 } else if (prefix
== "osd rm-pg-upmap-primary") {
12218 upmap_option
= OP_RM_PG_UPMAP_PRIMARY
;
12220 ceph_abort_msg("invalid upmap option");
12223 ceph_release_t min_release
= ceph_release_t::unknown
;
12224 string feature_name
= "unknown";
12225 switch (upmap_option
) {
12226 case OP_PG_UPMAP
: // fall through
12227 case OP_RM_PG_UPMAP
: // fall through
12228 case OP_PG_UPMAP_ITEMS
: // fall through
12229 case OP_RM_PG_UPMAP_ITEMS
:
12230 min_release
= ceph_release_t::luminous
;
12231 feature_name
= "pg-upmap";
12234 case OP_PG_UPMAP_PRIMARY
: // fall through
12235 case OP_RM_PG_UPMAP_PRIMARY
:
12236 min_release
= ceph_release_t::reef
;
12237 feature_name
= "pg-upmap-primary";
12241 ceph_abort_msg("invalid upmap option");
12243 uint64_t min_feature
= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
12244 string min_release_name
= ceph_release_name(static_cast<int>(min_release
));
12246 if (osdmap
.require_min_compat_client
< min_release
) {
12247 ss
<< "min_compat_client "
12248 << osdmap
.require_min_compat_client
12249 << " < " << min_release_name
<< ", which is required for " << feature_name
<< ". "
12250 << "Try 'ceph osd set-require-min-compat-client " << min_release_name
<< "' "
12251 << "before using the new interface";
12256 //TODO: Should I add feature and test for upmap-primary?
12257 err
= check_cluster_features(min_feature
, ss
);
12258 if (err
== -EAGAIN
)
12263 err
= parse_pgid(cmdmap
, ss
, pgid
);
12266 if (pending_inc
.old_pools
.count(pgid
.pool())) {
12267 ss
<< "pool of " << pgid
<< " is pending removal";
12270 wait_for_finished_proposal(op
,
12271 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
12275 // check pending upmap changes
12276 switch (upmap_option
) {
12277 case OP_PG_UPMAP
: // fall through
12278 case OP_RM_PG_UPMAP
:
12279 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
12280 pending_inc
.old_pg_upmap
.count(pgid
)) {
12281 dout(10) << __func__
<< " waiting for pending update on "
12283 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12288 case OP_PG_UPMAP_PRIMARY
: // fall through
12289 case OP_RM_PG_UPMAP_PRIMARY
:
12291 const pg_pool_t
*pt
= osdmap
.get_pg_pool(pgid
.pool());
12292 if (! pt
->is_replicated()) {
12293 ss
<< "pg-upmap-primary is only supported for replicated pools";
12299 case OP_PG_UPMAP_ITEMS
: // fall through
12300 case OP_RM_PG_UPMAP_ITEMS
: // fall through
12301 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
12302 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
12303 dout(10) << __func__
<< " waiting for pending update on "
12305 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12311 ceph_abort_msg("invalid upmap option");
12314 switch (upmap_option
) {
12317 vector
<int64_t> id_vec
;
12318 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12319 ss
<< "unable to parse 'id' value(s) '"
12320 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12325 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
12326 if ((int)id_vec
.size() < pool_min_size
) {
12327 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
12328 << pool_min_size
<< ")";
12333 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12334 if ((int)id_vec
.size() > pool_size
) {
12335 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
12336 << pool_size
<< ")";
12341 vector
<int32_t> new_pg_upmap
;
12342 for (auto osd
: id_vec
) {
12343 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
12344 ss
<< "osd." << osd
<< " does not exist";
12348 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
12349 if (it
!= new_pg_upmap
.end()) {
12350 ss
<< "osd." << osd
<< " already exists, ";
12353 new_pg_upmap
.push_back(osd
);
12356 if (new_pg_upmap
.empty()) {
12357 ss
<< "no valid upmap items(pairs) is specified";
12362 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
12363 new_pg_upmap
.begin(), new_pg_upmap
.end());
12364 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
12368 case OP_RM_PG_UPMAP
:
12370 pending_inc
.old_pg_upmap
.insert(pgid
);
12371 ss
<< "clear " << pgid
<< " pg_upmap mapping";
12375 case OP_PG_UPMAP_ITEMS
:
12377 vector
<int64_t> id_vec
;
12378 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12379 ss
<< "unable to parse 'id' value(s) '"
12380 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12385 if (id_vec
.size() % 2) {
12386 ss
<< "you must specify pairs of osd ids to be remapped";
12391 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12392 if ((int)(id_vec
.size() / 2) > pool_size
) {
12393 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
12394 << pool_size
<< ")";
12399 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
12400 ostringstream items
;
12402 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
12406 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
12409 if (!osdmap
.exists(from
)) {
12410 ss
<< "osd." << from
<< " does not exist";
12414 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
12415 ss
<< "osd." << to
<< " does not exist";
12419 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
12420 auto it
= std::find(new_pg_upmap_items
.begin(),
12421 new_pg_upmap_items
.end(), entry
);
12422 if (it
!= new_pg_upmap_items
.end()) {
12423 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
12426 new_pg_upmap_items
.push_back(entry
);
12427 items
<< from
<< "->" << to
<< ",";
12429 string
out(items
.str());
12430 out
.resize(out
.size() - 1); // drop last ','
12433 if (new_pg_upmap_items
.empty()) {
12434 ss
<< "no valid upmap items(pairs) is specified";
12439 pending_inc
.new_pg_upmap_items
[pgid
] =
12440 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
12441 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
12442 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
12446 case OP_RM_PG_UPMAP_ITEMS
:
12448 pending_inc
.old_pg_upmap_items
.insert(pgid
);
12449 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
12453 case OP_PG_UPMAP_PRIMARY
:
12456 if (!cmd_getval(cmdmap
, "id", id
)) {
12457 ss
<< "invalid osd id value '"
12458 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12462 if (id
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(id
)) {
12463 ss
<< "osd." << id
<< " does not exist";
12467 vector
<int> acting
;
12469 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
12470 if (id
== primary
) {
12471 ss
<< "osd." << id
<< " is already primary for pg " << pgid
;
12476 for (int i
= 1 ; i
< (int)acting
.size(); i
++) { // skip 0 on purpose
12477 if (acting
[i
] == id
) {
12482 if (found_idx
== 0) {
12483 ss
<< "osd." << id
<< " is not in acting set for pg " << pgid
;
12487 vector
<int> new_acting(acting
);
12488 new_acting
[found_idx
] = new_acting
[0];
12489 new_acting
[0] = id
;
12490 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12491 if (osdmap
.crush
->verify_upmap(cct
, osdmap
.get_pg_pool_crush_rule(pgid
),
12492 pool_size
, new_acting
) >= 0) {
12493 ss
<< "change primary for pg " << pgid
<< " to osd." << id
;
12496 ss
<< "can't change primary for pg " << pgid
<< " to osd." << id
12497 << " - illegal pg after the change";
12501 pending_inc
.new_pg_upmap_primary
[pgid
] = id
;
12503 ldout(cct
, 20) << "pg " << pgid
<< ": set pg_upmap_primary to " << id
<< dendl
;
12507 case OP_RM_PG_UPMAP_PRIMARY
:
12509 pending_inc
.old_pg_upmap_primary
.insert(pgid
);
12510 ss
<< "clear " << pgid
<< " pg_upmap_primary mapping";
12515 ceph_abort_msg("invalid upmap option");
12519 } else if (prefix
== "osd primary-affinity") {
12521 if (!cmd_getval(cmdmap
, "id", id
)) {
12522 ss
<< "invalid osd id value '"
12523 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12528 if (!cmd_getval(cmdmap
, "weight", w
)) {
12529 ss
<< "unable to parse 'weight' value '"
12530 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12534 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
12536 ss
<< "weight must be >= 0";
12540 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12541 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12542 ss
<< "require_min_compat_client "
12543 << osdmap
.require_min_compat_client
12544 << " < firefly, which is required for primary-affinity";
12548 if (osdmap
.exists(id
)) {
12549 pending_inc
.new_primary_affinity
[id
] = ww
;
12550 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << std::ios::hex
<< ww
<< std::ios::dec
<< ")";
12552 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12553 get_last_committed() + 1));
12556 ss
<< "osd." << id
<< " does not exist";
12560 } else if (prefix
== "osd reweight") {
12562 if (!cmd_getval(cmdmap
, "id", id
)) {
12563 ss
<< "unable to parse osd id value '"
12564 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12569 if (!cmd_getval(cmdmap
, "weight", w
)) {
12570 ss
<< "unable to parse weight value '"
12571 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12575 long ww
= (int)((double)CEPH_OSD_IN
*w
);
12577 ss
<< "weight must be >= 0";
12581 if (osdmap
.exists(id
)) {
12582 pending_inc
.new_weight
[id
] = ww
;
12583 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
12585 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12586 get_last_committed() + 1));
12589 ss
<< "osd." << id
<< " does not exist";
12593 } else if (prefix
== "osd reweightn") {
12594 map
<int32_t, uint32_t> weights
;
12595 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
12597 ss
<< "unable to parse 'weights' value '"
12598 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
12601 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
12602 wait_for_finished_proposal(
12604 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
12606 } else if (prefix
== "osd lost") {
12608 if (!cmd_getval(cmdmap
, "id", id
)) {
12609 ss
<< "unable to parse osd id value '"
12610 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12615 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12617 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
12618 "--yes-i-really-mean-it if you really do.";
12621 } else if (!osdmap
.exists(id
)) {
12622 ss
<< "osd." << id
<< " does not exist";
12625 } else if (!osdmap
.is_down(id
)) {
12626 ss
<< "osd." << id
<< " is not down";
12630 epoch_t e
= osdmap
.get_info(id
).down_at
;
12631 pending_inc
.new_lost
[id
] = e
;
12632 ss
<< "marked osd lost in epoch " << e
;
12634 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12635 get_last_committed() + 1));
12639 } else if (prefix
== "osd destroy-actual" ||
12640 prefix
== "osd purge-actual" ||
12641 prefix
== "osd purge-new") {
12642 /* Destroying an OSD means that we don't expect to further make use of
12643 * the OSDs data (which may even become unreadable after this operation),
12644 * and that we are okay with scrubbing all its cephx keys and config-key
12645 * data (which may include lockbox keys, thus rendering the osd's data
12648 * The OSD will not be removed. Instead, we will mark it as destroyed,
12649 * such that a subsequent call to `create` will not reuse the osd id.
12650 * This will play into being able to recreate the OSD, at the same
12651 * crush location, with minimal data movement.
12654 // make sure authmon is writeable.
12655 if (!mon
.authmon()->is_writeable()) {
12656 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12657 << "osd destroy" << dendl
;
12658 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12663 if (!cmd_getval(cmdmap
, "id", id
)) {
12664 auto p
= cmdmap
.find("id");
12665 if (p
== cmdmap
.end()) {
12666 ss
<< "no osd id specified";
12668 ss
<< "unable to parse osd id value '"
12669 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12675 bool is_destroy
= (prefix
== "osd destroy-actual");
12677 ceph_assert("osd purge-actual" == prefix
||
12678 "osd purge-new" == prefix
);
12682 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12684 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12685 << "This will mean real, permanent data loss, as well "
12686 << "as deletion of cephx and lockbox keys. "
12687 << "Pass --yes-i-really-mean-it if you really do.";
12690 } else if (!osdmap
.exists(id
)) {
12691 ss
<< "osd." << id
<< " does not exist";
12692 err
= 0; // idempotent
12694 } else if (osdmap
.is_up(id
)) {
12695 ss
<< "osd." << id
<< " is not `down`.";
12698 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12699 ss
<< "destroyed osd." << id
;
12704 if (prefix
== "osd purge-new" &&
12705 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12706 ss
<< "osd." << id
<< " is not new";
12711 bool goto_reply
= false;
12715 err
= prepare_command_osd_destroy(id
, ss
);
12716 // we checked above that it should exist.
12717 ceph_assert(err
!= -ENOENT
);
12719 err
= prepare_command_osd_purge(id
, ss
);
12720 if (err
== -ENOENT
) {
12722 ss
<< "osd." << id
<< " does not exist.";
12728 if (err
< 0 || goto_reply
) {
12733 ss
<< "destroyed osd." << id
;
12735 ss
<< "purged osd." << id
;
12739 wait_for_finished_proposal(op
,
12740 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12741 force_immediate_propose();
12744 } else if (prefix
== "osd new") {
12746 // make sure authmon is writeable.
12747 if (!mon
.authmon()->is_writeable()) {
12748 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12749 << "osd new" << dendl
;
12750 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12754 // make sure kvmon is writeable.
12755 if (!mon
.kvmon()->is_writeable()) {
12756 dout(10) << __func__
<< " waiting for kv mon to be writeable for "
12757 << "osd new" << dendl
;
12758 mon
.kvmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12762 map
<string
,string
> param_map
;
12764 bufferlist bl
= m
->get_data();
12765 string param_json
= bl
.to_str();
12766 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12768 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12772 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12775 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12788 if (err
== EEXIST
) {
12789 // idempotent operation
12794 wait_for_finished_proposal(op
,
12795 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12796 get_last_committed() + 1));
12797 force_immediate_propose();
12800 } else if (prefix
== "osd create") {
12802 // optional id provided?
12803 int64_t id
= -1, cmd_id
= -1;
12804 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12806 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12810 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12815 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12816 if (!uuid
.parse(uuidstr
.c_str())) {
12817 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12821 // we only care about the id if we also have the uuid, to
12822 // ensure the operation's idempotency.
12826 int32_t new_id
= -1;
12827 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12829 if (err
== -EAGAIN
) {
12830 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12833 // a check has failed; reply to the user.
12836 } else if (err
== EEXIST
) {
12837 // this is an idempotent operation; we can go ahead and reply.
12839 f
->open_object_section("created_osd");
12840 f
->dump_int("osdid", new_id
);
12841 f
->close_section();
12851 string empty_device_class
;
12852 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12855 f
->open_object_section("created_osd");
12856 f
->dump_int("osdid", new_id
);
12857 f
->close_section();
12863 wait_for_finished_proposal(op
,
12864 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12865 get_last_committed() + 1));
12868 } else if (prefix
== "osd blocklist clear" ||
12869 prefix
== "osd blacklist clear") {
12870 pending_inc
.new_blocklist
.clear();
12871 std::list
<std::pair
<entity_addr_t
,utime_t
> > blocklist
;
12872 std::list
<std::pair
<entity_addr_t
,utime_t
> > range_b
;
12873 osdmap
.get_blocklist(&blocklist
, &range_b
);
12874 for (const auto &entry
: blocklist
) {
12875 pending_inc
.old_blocklist
.push_back(entry
.first
);
12877 for (const auto &entry
: range_b
) {
12878 pending_inc
.old_range_blocklist
.push_back(entry
.first
);
12880 ss
<< " removed all blocklist entries";
12882 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12883 get_last_committed() + 1));
12885 } else if (prefix
== "osd blocklist" ||
12886 prefix
== "osd blacklist") {
12887 string addrstr
, rangestr
;
12888 bool range
= false;
12889 cmd_getval(cmdmap
, "addr", addrstr
);
12890 if (cmd_getval(cmdmap
, "range", rangestr
)) {
12891 if (rangestr
== "range") {
12894 ss
<< "Did you mean to specify \"osd blocklist range\"?";
12899 entity_addr_t addr
;
12900 if (!addr
.parse(addrstr
)) {
12901 ss
<< "unable to parse address " << addrstr
;
12907 if (!addr
.maybe_cidr()) {
12908 ss
<< "You specified a range command, but " << addr
12909 << " does not parse as a CIDR range";
12913 addr
.type
= entity_addr_t::TYPE_CIDR
;
12914 err
= check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST
, ss
);
12918 if ((addr
.is_ipv4() && addr
.get_nonce() > 32) ||
12919 (addr
.is_ipv6() && addr
.get_nonce() > 128)) {
12920 ss
<< "Too many bits in range for that protocol!";
12925 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12926 // always blocklist type ANY
12927 addr
.set_type(entity_addr_t::TYPE_ANY
);
12929 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12933 string blocklistop
;
12934 if (!cmd_getval(cmdmap
, "blocklistop", blocklistop
)) {
12935 cmd_getval(cmdmap
, "blacklistop", blocklistop
);
12937 if (blocklistop
== "add") {
12938 utime_t expires
= ceph_clock_now();
12939 // default one hour
12940 double d
= cmd_getval_or
<double>(cmdmap
, "expire",
12941 g_conf()->mon_osd_blocklist_default_expire
);
12944 auto add_to_pending_blocklists
= [](auto& nb
, auto& ob
,
12946 const auto& expires
) {
12947 nb
[addr
] = expires
;
12948 // cancel any pending un-blocklisting request too
12949 auto it
= std::find(ob
.begin(),
12951 if (it
!= ob
.end()) {
12956 add_to_pending_blocklists(pending_inc
.new_range_blocklist
,
12957 pending_inc
.old_range_blocklist
,
12961 add_to_pending_blocklists(pending_inc
.new_blocklist
,
12962 pending_inc
.old_blocklist
,
12966 ss
<< "blocklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12968 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12969 get_last_committed() + 1));
12971 } else if (blocklistop
== "rm") {
12972 auto rm_from_pending_blocklists
= [](const auto& addr
,
12974 auto& ob
, auto& pb
) {
12975 if (blocklist
.count(addr
)) {
12976 ob
.push_back(addr
);
12978 } else if (pb
.count(addr
)) {
12984 if ((!range
&& rm_from_pending_blocklists(addr
, osdmap
.blocklist
,
12985 pending_inc
.old_blocklist
,
12986 pending_inc
.new_blocklist
)) ||
12987 (range
&& rm_from_pending_blocklists(addr
, osdmap
.range_blocklist
,
12988 pending_inc
.old_range_blocklist
,
12989 pending_inc
.new_range_blocklist
))) {
12990 ss
<< "un-blocklisting " << addr
;
12992 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12993 get_last_committed() + 1));
12996 ss
<< addr
<< " isn't blocklisted";
13001 } else if (prefix
== "osd pool mksnap") {
13003 cmd_getval(cmdmap
, "pool", poolstr
);
13004 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
13006 ss
<< "unrecognized pool '" << poolstr
<< "'";
13011 cmd_getval(cmdmap
, "snap", snapname
);
13012 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
13013 if (p
->is_unmanaged_snaps_mode()) {
13014 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
13017 } else if (p
->snap_exists(snapname
.c_str())) {
13018 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
13021 } else if (p
->is_tier()) {
13022 ss
<< "pool " << poolstr
<< " is a cache tier";
13027 if (pending_inc
.new_pools
.count(pool
))
13028 pp
= &pending_inc
.new_pools
[pool
];
13030 pp
= &pending_inc
.new_pools
[pool
];
13033 if (pp
->snap_exists(snapname
.c_str())) {
13034 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
13036 if (const auto& fsmap
= mon
.mdsmon()->get_fsmap(); fsmap
.pool_in_use(pool
)) {
13037 dout(20) << "pool-level snapshots have been disabled for pools "
13038 "attached to an fs - poolid:" << pool
<< dendl
;
13042 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
13043 pp
->set_snap_epoch(pending_inc
.epoch
);
13044 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
13047 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13048 get_last_committed() + 1));
13050 } else if (prefix
== "osd pool rmsnap") {
13052 cmd_getval(cmdmap
, "pool", poolstr
);
13053 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
13055 ss
<< "unrecognized pool '" << poolstr
<< "'";
13060 cmd_getval(cmdmap
, "snap", snapname
);
13061 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
13062 if (p
->is_unmanaged_snaps_mode()) {
13063 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
13066 } else if (!p
->snap_exists(snapname
.c_str())) {
13067 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
13072 if (pending_inc
.new_pools
.count(pool
))
13073 pp
= &pending_inc
.new_pools
[pool
];
13075 pp
= &pending_inc
.new_pools
[pool
];
13078 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
13080 pp
->remove_snap(sn
);
13081 pp
->set_snap_epoch(pending_inc
.epoch
);
13082 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
13084 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
13087 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13088 get_last_committed() + 1));
13090 } else if (prefix
== "osd pool create") {
13091 int64_t pg_num
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num", 0);
13092 int64_t pg_num_min
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num_min", 0);
13093 int64_t pg_num_max
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num_max", 0);
13094 int64_t pgp_num
= cmd_getval_or
<int64_t>(cmdmap
, "pgp_num", pg_num
);
13095 string pool_type_str
;
13096 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
13097 if (pool_type_str
.empty())
13098 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
13101 cmd_getval(cmdmap
, "pool", poolstr
);
13102 bool confirm
= false;
13103 //confirmation may be set to true only by internal operations.
13104 cmd_getval(cmdmap
, "yes_i_really_mean_it", confirm
);
13105 if (poolstr
[0] == '.' && !confirm
) {
13106 ss
<< "pool names beginning with . are not allowed";
13110 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13111 if (pool_id
>= 0) {
13112 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13113 if (pool_type_str
!= p
->get_type_name()) {
13114 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
13117 ss
<< "pool '" << poolstr
<< "' already exists";
13124 if (pool_type_str
== "replicated") {
13125 pool_type
= pg_pool_t::TYPE_REPLICATED
;
13126 } else if (pool_type_str
== "erasure") {
13127 pool_type
= pg_pool_t::TYPE_ERASURE
;
13129 ss
<< "unknown pool type '" << pool_type_str
<< "'";
13134 bool implicit_rule_creation
= false;
13135 int64_t expected_num_objects
= 0;
13137 cmd_getval(cmdmap
, "rule", rule_name
);
13138 string erasure_code_profile
;
13139 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
13141 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
13142 if (erasure_code_profile
== "")
13143 erasure_code_profile
= "default";
13144 //handle the erasure code profile
13145 if (erasure_code_profile
== "default") {
13146 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
13147 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
13148 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
13152 map
<string
,string
> profile_map
;
13153 err
= osdmap
.get_erasure_code_profile_default(cct
,
13158 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
13159 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
13163 if (rule_name
== "") {
13164 implicit_rule_creation
= true;
13165 if (erasure_code_profile
== "default") {
13166 rule_name
= "erasure-code";
13168 dout(1) << "implicitly use rule named after the pool: "
13169 << poolstr
<< dendl
;
13170 rule_name
= poolstr
;
13173 expected_num_objects
=
13174 cmd_getval_or
<int64_t>(cmdmap
, "expected_num_objects", 0);
13176 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
13177 // and put expected_num_objects to rule field
13178 if (erasure_code_profile
!= "") { // cmd is from CLI
13179 if (rule_name
!= "") {
13181 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
13182 if (interr
.length()) {
13183 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
13188 rule_name
= erasure_code_profile
;
13189 } else { // cmd is well-formed
13190 expected_num_objects
=
13191 cmd_getval_or
<int64_t>(cmdmap
, "expected_num_objects", 0);
13195 if (!implicit_rule_creation
&& rule_name
!= "") {
13197 err
= get_crush_rule(rule_name
, &rule
, &ss
);
13198 if (err
== -EAGAIN
) {
13199 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13206 if (expected_num_objects
< 0) {
13207 ss
<< "'expected_num_objects' must be non-negative";
13213 osdmap
.get_all_osds(osds
);
13214 bool has_filestore_osd
= std::any_of(osds
.begin(), osds
.end(), [this](int osd
) {
13216 if (!get_osd_objectstore_type(osd
, &type
)) {
13217 return type
== "filestore";
13223 if (has_filestore_osd
&&
13224 expected_num_objects
> 0 &&
13225 cct
->_conf
->filestore_merge_threshold
> 0) {
13226 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13231 if (has_filestore_osd
&&
13232 expected_num_objects
== 0 &&
13233 cct
->_conf
->filestore_merge_threshold
< 0) {
13234 int osds
= osdmap
.get_num_osds();
13236 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13237 if (!sure
&& osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
13238 ss
<< "For better initial performance on pools expected to store a "
13239 << "large number of objects, consider supplying the "
13240 << "expected_num_objects parameter when creating the pool."
13241 << " Pass --yes-i-really-mean-it to ignore it";
13247 int64_t fast_read_param
= cmd_getval_or
<int64_t>(cmdmap
, "fast_read", -1);
13248 FastReadType fast_read
= FAST_READ_DEFAULT
;
13249 if (fast_read_param
== 0)
13250 fast_read
= FAST_READ_OFF
;
13251 else if (fast_read_param
> 0)
13252 fast_read
= FAST_READ_ON
;
13254 int64_t repl_size
= 0;
13255 cmd_getval(cmdmap
, "size", repl_size
);
13256 int64_t target_size_bytes
= 0;
13257 double target_size_ratio
= 0.0;
13258 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
13259 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
13261 string pg_autoscale_mode
;
13262 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
13264 bool bulk
= cmd_getval_or
<bool>(cmdmap
, "bulk", 0);
13266 bool crimson
= cmd_getval_or
<bool>(cmdmap
, "crimson", false) ||
13267 cct
->_conf
.get_val
<bool>("osd_pool_default_crimson");
13269 err
= prepare_new_pool(poolstr
,
13270 -1, // default crush rule
13272 pg_num
, pgp_num
, pg_num_min
, pg_num_max
,
13273 repl_size
, target_size_bytes
, target_size_ratio
,
13274 erasure_code_profile
, pool_type
,
13275 (uint64_t)expected_num_objects
,
13284 ss
<< "pool '" << poolstr
<< "' already exists";
13287 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13296 ss
<< "pool '" << poolstr
<< "' created";
13299 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13300 get_last_committed() + 1));
13303 } else if (prefix
== "osd pool delete" ||
13304 prefix
== "osd pool rm") {
13305 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13306 string poolstr
, poolstr2
, sure
;
13307 cmd_getval(cmdmap
, "pool", poolstr
);
13308 cmd_getval(cmdmap
, "pool2", poolstr2
);
13309 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
13311 ss
<< "pool '" << poolstr
<< "' does not exist";
13316 bool force_no_fake
= false;
13317 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
13318 bool force
= false;
13319 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
13320 if (poolstr2
!= poolstr
||
13321 (!force
&& !force_no_fake
)) {
13322 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13323 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13324 << "followed by --yes-i-really-really-mean-it.";
13328 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
13329 if (err
== -EAGAIN
) {
13330 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13336 } else if (prefix
== "osd pool rename") {
13337 string srcpoolstr
, destpoolstr
;
13338 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
13339 cmd_getval(cmdmap
, "destpool", destpoolstr
);
13340 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
13341 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
13342 bool confirm
= false;
13343 //confirmation may be set to true only by internal operations.
13344 cmd_getval(cmdmap
, "yes_i_really_mean_it", confirm
);
13345 if (destpoolstr
[0] == '.' && !confirm
) {
13346 ss
<< "pool names beginning with . are not allowed";
13350 if (pool_src
< 0) {
13351 if (pool_dst
>= 0) {
13352 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13353 // of operations, assume this rename succeeded, as it is not changing
13354 // the current state. Make sure we output something understandable
13355 // for whoever is issuing the command, if they are paying attention,
13356 // in case it was not intentional; or to avoid a "wtf?" and a bug
13357 // report in case it was intentional, while expecting a failure.
13358 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
13359 << destpoolstr
<< "' does -- assuming successful rename";
13362 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
13366 } else if (pool_dst
>= 0) {
13367 // source pool exists and so does the destination pool
13368 ss
<< "pool '" << destpoolstr
<< "' already exists";
13373 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
13375 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
13377 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
13378 << cpp_strerror(ret
);
13381 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
13382 get_last_committed() + 1));
13385 } else if (prefix
== "osd pool set") {
13386 err
= prepare_command_pool_set(cmdmap
, ss
);
13387 if (err
== -EAGAIN
)
13393 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13394 get_last_committed() + 1));
13396 } else if (prefix
== "osd tier add") {
13397 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13398 if (err
== -EAGAIN
)
13403 cmd_getval(cmdmap
, "pool", poolstr
);
13404 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13406 ss
<< "unrecognized pool '" << poolstr
<< "'";
13410 string tierpoolstr
;
13411 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13412 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13413 if (tierpool_id
< 0) {
13414 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13418 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13420 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13423 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13427 // make sure new tier is empty
13428 bool force_nonempty
= false;
13429 cmd_getval_compat_cephbool(cmdmap
, "force_nonempty", force_nonempty
);
13430 const pool_stat_t
*pstats
= mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13431 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
13433 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
13437 if (tp
->is_erasure()) {
13438 ss
<< "tier pool '" << tierpoolstr
13439 << "' is an ec pool, which cannot be a tier";
13443 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
13444 (!force_nonempty
||
13445 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
)) {
13446 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
13451 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13452 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13453 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13454 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13457 np
->tiers
.insert(tierpool_id
);
13458 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13459 ntp
->tier_of
= pool_id
;
13460 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
13461 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13462 get_last_committed() + 1));
13464 } else if (prefix
== "osd tier remove" ||
13465 prefix
== "osd tier rm") {
13467 cmd_getval(cmdmap
, "pool", poolstr
);
13468 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13470 ss
<< "unrecognized pool '" << poolstr
<< "'";
13474 string tierpoolstr
;
13475 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13476 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13477 if (tierpool_id
< 0) {
13478 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13482 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13484 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13487 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
13491 if (p
->tiers
.count(tierpool_id
) == 0) {
13492 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13496 if (tp
->tier_of
!= pool_id
) {
13497 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
13498 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
13499 // be scary about it; this is an inconsistency and bells must go off
13500 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13504 if (p
->read_tier
== tierpool_id
) {
13505 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
13510 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13511 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13512 if (np
->tiers
.count(tierpool_id
) == 0 ||
13513 ntp
->tier_of
!= pool_id
||
13514 np
->read_tier
== tierpool_id
) {
13515 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13518 np
->tiers
.erase(tierpool_id
);
13520 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13521 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13522 get_last_committed() + 1));
13524 } else if (prefix
== "osd tier set-overlay") {
13525 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13526 if (err
== -EAGAIN
)
13531 cmd_getval(cmdmap
, "pool", poolstr
);
13532 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13534 ss
<< "unrecognized pool '" << poolstr
<< "'";
13538 string overlaypoolstr
;
13539 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
13540 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
13541 if (overlaypool_id
< 0) {
13542 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
13546 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13548 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
13549 ceph_assert(overlay_p
);
13550 if (p
->tiers
.count(overlaypool_id
) == 0) {
13551 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
13555 if (p
->read_tier
== overlaypool_id
) {
13557 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13560 if (p
->has_read_tier()) {
13561 ss
<< "pool '" << poolstr
<< "' has overlay '"
13562 << osdmap
.get_pool_name(p
->read_tier
)
13563 << "'; please remove-overlay first";
13569 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13570 np
->read_tier
= overlaypool_id
;
13571 np
->write_tier
= overlaypool_id
;
13572 np
->set_last_force_op_resend(pending_inc
.epoch
);
13573 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
13574 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
13575 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13576 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
13577 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
13578 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13579 get_last_committed() + 1));
13581 } else if (prefix
== "osd tier remove-overlay" ||
13582 prefix
== "osd tier rm-overlay") {
13584 cmd_getval(cmdmap
, "pool", poolstr
);
13585 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13587 ss
<< "unrecognized pool '" << poolstr
<< "'";
13591 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13593 if (!p
->has_read_tier()) {
13595 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13599 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
13604 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13605 if (np
->has_read_tier()) {
13606 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
13607 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
13608 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13610 if (np
->has_write_tier()) {
13611 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
13612 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
13613 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13615 np
->clear_read_tier();
13616 np
->clear_write_tier();
13617 np
->set_last_force_op_resend(pending_inc
.epoch
);
13618 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13619 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13620 get_last_committed() + 1));
13622 } else if (prefix
== "osd tier cache-mode") {
13623 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13624 if (err
== -EAGAIN
)
13629 cmd_getval(cmdmap
, "pool", poolstr
);
13630 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13632 ss
<< "unrecognized pool '" << poolstr
<< "'";
13636 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13638 if (!p
->is_tier()) {
13639 ss
<< "pool '" << poolstr
<< "' is not a tier";
13644 cmd_getval(cmdmap
, "mode", modestr
);
13645 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13646 if (int(mode
) < 0) {
13647 ss
<< "'" << modestr
<< "' is not a valid cache mode";
13653 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13655 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
13656 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
13657 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
13661 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13662 mode
!= pg_pool_t::CACHEMODE_NONE
&&
13663 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13664 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
13666 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
13667 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13672 // pool already has this cache-mode set and there are no pending changes
13673 if (p
->cache_mode
== mode
&&
13674 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
13675 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
13676 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
13677 << " to " << pg_pool_t::get_cache_mode_name(mode
);
13682 /* Mode description:
13684 * none: No cache-mode defined
13685 * forward: Forward all reads and writes to base pool [removed]
13686 * writeback: Cache writes, promote reads from base pool
13687 * readonly: Forward writes to base pool
13688 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13689 * proxy: Proxy all reads and writes to base pool
13690 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13692 * Hence, these are the allowed transitions:
13695 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13696 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13697 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13698 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13699 * writeback -> readproxy || proxy
13703 // We check if the transition is valid against the current pool mode, as
13704 // it is the only committed state thus far. We will blantly squash
13705 // whatever mode is on the pending state.
13707 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
13708 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13709 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
13710 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
13711 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
13712 << "' pool; only '"
13713 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
13715 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
13720 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
13721 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13722 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13723 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13725 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
13726 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13727 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13729 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13730 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13731 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13733 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13734 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13735 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13736 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13738 const pool_stat_t
* pstats
=
13739 mon
.mgrstatmon()->get_pool_stat(pool_id
);
13741 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13742 ss
<< "unable to set cache-mode '"
13743 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13744 << "': dirty objects found";
13750 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13751 np
->cache_mode
= mode
;
13752 // set this both when moving to and from cache_mode NONE. this is to
13753 // capture legacy pools that were set up before this flag existed.
13754 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13755 ss
<< "set cache-mode for pool '" << poolstr
13756 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13757 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13758 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13759 ceph_assert(base_pool
);
13760 if (base_pool
->read_tier
== pool_id
||
13761 base_pool
->write_tier
== pool_id
)
13762 ss
<<" (WARNING: pool is still configured as read or write tier)";
13764 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13765 get_last_committed() + 1));
13767 } else if (prefix
== "osd tier add-cache") {
13768 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13769 if (err
== -EAGAIN
)
13774 cmd_getval(cmdmap
, "pool", poolstr
);
13775 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13777 ss
<< "unrecognized pool '" << poolstr
<< "'";
13781 string tierpoolstr
;
13782 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13783 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13784 if (tierpool_id
< 0) {
13785 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13789 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13791 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13794 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13799 if (!cmd_getval(cmdmap
, "size", size
)) {
13800 ss
<< "unable to parse 'size' value '"
13801 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13805 // make sure new tier is empty
13806 const pool_stat_t
*pstats
=
13807 mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13808 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13809 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13813 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13814 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13815 if (int(mode
) < 0) {
13816 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13820 HitSet::Params hsp
;
13821 auto& cache_hit_set_type
=
13822 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13823 if (cache_hit_set_type
== "bloom") {
13824 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13825 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13826 hsp
= HitSet::Params(bsp
);
13827 } else if (cache_hit_set_type
== "explicit_hash") {
13828 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13829 } else if (cache_hit_set_type
== "explicit_object") {
13830 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13832 ss
<< "osd tier cache default hit set type '"
13833 << cache_hit_set_type
<< "' is not a known type";
13838 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13839 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13840 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13841 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13844 np
->tiers
.insert(tierpool_id
);
13845 np
->read_tier
= np
->write_tier
= tierpool_id
;
13846 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13847 np
->set_last_force_op_resend(pending_inc
.epoch
);
13848 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13849 ntp
->tier_of
= pool_id
;
13850 ntp
->cache_mode
= mode
;
13851 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13852 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13853 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13854 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13855 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13856 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13857 ntp
->hit_set_params
= hsp
;
13858 ntp
->target_max_bytes
= size
;
13859 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13860 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13861 get_last_committed() + 1));
13863 } else if (prefix
== "osd pool set-quota") {
13865 cmd_getval(cmdmap
, "pool", poolstr
);
13866 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13868 ss
<< "unrecognized pool '" << poolstr
<< "'";
13874 cmd_getval(cmdmap
, "field", field
);
13875 if (field
!= "max_objects" && field
!= "max_bytes") {
13876 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13881 // val could contain unit designations, so we treat as a string
13883 cmd_getval(cmdmap
, "val", val
);
13886 if (field
== "max_objects") {
13887 value
= strict_si_cast
<uint64_t>(val
, &tss
);
13888 } else if (field
== "max_bytes") {
13889 value
= strict_iecstrtoll(val
, &tss
);
13891 ceph_abort_msg("unrecognized option");
13893 if (!tss
.empty()) {
13894 ss
<< "error parsing value '" << val
<< "': " << tss
;
13899 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13900 if (field
== "max_objects") {
13901 pi
->quota_max_objects
= value
;
13902 } else if (field
== "max_bytes") {
13903 pi
->quota_max_bytes
= value
;
13905 ceph_abort_msg("unrecognized option");
13907 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13909 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13910 get_last_committed() + 1));
13912 } else if (prefix
== "osd pool application enable" ||
13913 prefix
== "osd pool application disable" ||
13914 prefix
== "osd pool application set" ||
13915 prefix
== "osd pool application rm") {
13916 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13917 if (err
== -EAGAIN
) {
13919 } else if (err
< 0) {
13924 } else if (prefix
== "osd force-create-pg") {
13927 err
= parse_pgid(cmdmap
, ss
, pgid
, pgidstr
);
13931 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13933 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13934 << "that the cluster will give up ever trying to recover the lost data. Do this "
13935 << "only if you are certain that all copies of the PG are in fact lost and you are "
13936 << "willing to accept that the data is permanently destroyed. Pass "
13937 << "--yes-i-really-mean-it to proceed.";
13943 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13944 auto emplaced
= creating_pgs
.pgs
.emplace(
13946 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13947 ceph_clock_now()));
13948 creating_now
= emplaced
.second
;
13950 if (creating_now
) {
13951 ss
<< "pg " << pgidstr
<< " now creating, ok";
13952 // set the pool's CREATING flag so that (1) the osd won't ignore our
13953 // create message and (2) we won't propose any future pg_num changes
13954 // until after the PG has been instantiated.
13955 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13956 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13958 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13962 ss
<< "pg " << pgid
<< " already creating";
13966 } else if (prefix
== "osd force_healthy_stretch_mode") {
13968 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13970 ss
<< "This command will require peering across multiple CRUSH buckets "
13971 "(probably two data centers or availability zones?) and may result in PGs "
13972 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13976 try_end_recovery_stretch_mode(true);
13977 ss
<< "Triggering healthy stretch mode";
13980 } else if (prefix
== "osd force_recovery_stretch_mode") {
13982 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13984 ss
<< "This command will increase pool sizes to try and spread them "
13985 "across multiple CRUSH buckets (probably two data centers or "
13986 "availability zones?) and should have happened automatically"
13987 "Pass --yes-i-really-mean-it to proceed.";
13991 mon
.go_recovery_stretch_mode();
13992 ss
<< "Triggering recovery stretch mode";
13995 } else if (prefix
== "osd set-allow-crimson") {
13998 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
14000 bool experimental_enabled
=
14001 g_ceph_context
->check_experimental_feature_enabled("crimson");
14002 if (!sure
|| !experimental_enabled
) {
14003 ss
<< "This command will allow usage of crimson-osd osd daemons. "
14004 << "crimson-osd is not considered stable and will likely cause "
14005 << "crashes or data corruption. At this time, crimson-osd is mainly "
14006 << "useful for performance evaluation, testing, and development. "
14007 << "If you are sure, add --yes-i-really-mean-it and add 'crimson' to "
14008 << "the experimental features config. This setting is irrevocable.";
14014 if (osdmap
.get_allow_crimson()) {
14017 pending_inc
.set_allow_crimson();
14026 if (err
< 0 && rs
.length() == 0)
14027 rs
= cpp_strerror(err
);
14028 mon
.reply_command(op
, err
, rs
, rdata
, get_last_committed());
14033 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
14034 get_last_committed() + 1));
14038 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
14042 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
14044 op
->mark_osdmon_event(__func__
);
14046 auto m
= op
->get_req
<MPoolOp
>();
14047 MonSession
*session
= op
->get_session();
14049 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
14054 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14055 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14057 const std::string
* pool_name
= nullptr;
14058 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
14059 if (pg_pool
!= nullptr) {
14060 pool_name
= &osdmap
.get_pool_name(m
->pool
);
14063 if (!is_unmanaged_snap_op_permitted(cct
, mon
.key_server
,
14064 session
->entity_name
, session
->caps
,
14065 session
->get_peer_socket_addr(),
14067 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
14068 << "privileges. message: " << *m
<< std::endl
14069 << "caps: " << session
->caps
<< dendl
;
14070 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
14076 if (!session
->is_capable("osd", MON_CAP_W
)) {
14077 dout(0) << "got pool op from entity with insufficient privileges. "
14078 << "message: " << *m
<< std::endl
14079 << "caps: " << session
->caps
<< dendl
;
14080 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
14089 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
14091 op
->mark_osdmon_event(__func__
);
14092 auto m
= op
->get_req
<MPoolOp
>();
14094 if (enforce_pool_op_caps(op
)) {
14098 if (m
->fsid
!= mon
.monmap
->fsid
) {
14099 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
14100 << " != " << mon
.monmap
->fsid
<< " for " << *m
<< dendl
;
14101 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
14105 if (m
->op
== POOL_OP_CREATE
)
14106 return preprocess_pool_op_create(op
);
14108 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
14109 if (p
== nullptr) {
14110 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
14111 if (m
->op
== POOL_OP_DELETE
) {
14112 _pool_op_reply(op
, 0, osdmap
.get_epoch());
14114 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
14119 // check if the snap and snapname exist
14120 bool snap_exists
= false;
14121 if (p
->snap_exists(m
->name
.c_str()))
14122 snap_exists
= true;
14125 case POOL_OP_CREATE_SNAP
:
14126 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
14127 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
14131 _pool_op_reply(op
, 0, osdmap
.get_epoch());
14135 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14136 if (p
->is_pool_snaps_mode()) {
14137 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
14141 case POOL_OP_DELETE_SNAP
:
14142 if (p
->is_unmanaged_snaps_mode()) {
14143 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
14146 if (!snap_exists
) {
14147 _pool_op_reply(op
, 0, osdmap
.get_epoch());
14151 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14152 if (p
->is_pool_snaps_mode()) {
14153 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
14156 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
14157 _pool_op_reply(op
, 0, osdmap
.get_epoch());
14161 case POOL_OP_DELETE
:
14162 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
14163 _pool_op_reply(op
, 0, osdmap
.get_epoch());
14167 case POOL_OP_AUID_CHANGE
:
14177 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
14179 if (!osdmap
.have_pg_pool(pool
)) {
14180 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14181 << " - pool dne" << dendl
;
14184 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
14185 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14186 << " - in osdmap removed_snaps_queue" << dendl
;
14189 snapid_t begin
, end
;
14190 int r
= lookup_purged_snap(pool
, snap
, &begin
, &end
);
14192 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14193 << " - purged, [" << begin
<< "," << end
<< ")" << dendl
;
14199 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
14201 if (pending_inc
.old_pools
.count(pool
)) {
14202 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14203 << " - pool pending deletion" << dendl
;
14206 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
14207 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14208 << " - in pending new_removed_snaps" << dendl
;
14214 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
14216 op
->mark_osdmon_event(__func__
);
14217 auto m
= op
->get_req
<MPoolOp
>();
14218 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
14220 _pool_op_reply(op
, 0, osdmap
.get_epoch());
14227 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
14229 op
->mark_osdmon_event(__func__
);
14230 auto m
= op
->get_req
<MPoolOp
>();
14231 dout(10) << "prepare_pool_op " << *m
<< dendl
;
14232 if (m
->op
== POOL_OP_CREATE
) {
14233 return prepare_pool_op_create(op
);
14234 } else if (m
->op
== POOL_OP_DELETE
) {
14235 return prepare_pool_op_delete(op
);
14239 bool changed
= false;
14241 if (!osdmap
.have_pg_pool(m
->pool
)) {
14242 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
14246 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
14248 if (m
->op
== POOL_OP_CREATE_SNAP
||
14249 m
->op
== POOL_OP_CREATE_UNMANAGED_SNAP
) {
14250 if (const auto& fsmap
= mon
.mdsmon()->get_fsmap(); fsmap
.pool_in_use(m
->pool
)) {
14251 dout(20) << "monitor-managed snapshots have been disabled for pools "
14252 " attached to an fs - pool:" << m
->pool
<< dendl
;
14253 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
14259 case POOL_OP_CREATE_SNAP
:
14260 if (pool
->is_tier()) {
14262 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
14264 } // else, fall through
14265 case POOL_OP_DELETE_SNAP
:
14266 if (!pool
->is_unmanaged_snaps_mode()) {
14267 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
14268 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
14269 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
14277 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
14280 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14281 // we won't allow removal of an unmanaged snapshot from a pool
14282 // not in unmanaged snaps mode.
14283 if (!pool
->is_unmanaged_snaps_mode()) {
14284 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
14288 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14289 // but we will allow creating an unmanaged snapshot on any pool
14290 // as long as it is not in 'pool' snaps mode.
14291 if (pool
->is_pool_snaps_mode()) {
14292 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
14297 // projected pool info
14299 if (pending_inc
.new_pools
.count(m
->pool
))
14300 pp
= pending_inc
.new_pools
[m
->pool
];
14302 pp
= *osdmap
.get_pg_pool(m
->pool
);
14304 bufferlist reply_data
;
14306 // pool snaps vs unmanaged snaps are mutually exclusive
14308 case POOL_OP_CREATE_SNAP
:
14309 case POOL_OP_DELETE_SNAP
:
14310 if (pp
.is_unmanaged_snaps_mode()) {
14316 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14317 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14318 if (pp
.is_pool_snaps_mode()) {
14325 case POOL_OP_CREATE_SNAP
:
14326 if (!pp
.snap_exists(m
->name
.c_str())) {
14327 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
14328 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
14329 << " seq " << pp
.get_snap_epoch() << dendl
;
14334 case POOL_OP_DELETE_SNAP
:
14336 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
14339 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
14345 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14347 uint64_t snapid
= pp
.add_unmanaged_snap(
14348 osdmap
.require_osd_release
< ceph_release_t::octopus
);
14349 encode(snapid
, reply_data
);
14354 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14355 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
14356 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
14357 if (m
->snapid
> pp
.get_snap_seq()) {
14358 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
14361 pp
.remove_unmanaged_snap(
14363 osdmap
.require_osd_release
< ceph_release_t::octopus
);
14364 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
14365 // also record the new seq as purged: this avoids a discontinuity
14366 // after all of the snaps have been purged, since the seq assigned
14367 // during removal lives in the same namespace as the actual snaps.
14368 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
14373 case POOL_OP_AUID_CHANGE
:
14374 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
14383 pp
.set_snap_epoch(pending_inc
.epoch
);
14384 pending_inc
.new_pools
[m
->pool
] = pp
;
14388 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
14392 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
14394 op
->mark_osdmon_event(__func__
);
14395 int err
= prepare_new_pool(op
);
14396 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
14400 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
14403 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
14405 // If the Pool is in use by CephFS, refuse to delete it
14406 FSMap
const &pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14407 if (pending_fsmap
.pool_in_use(pool_id
)) {
14408 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
14412 if (pool
.tier_of
>= 0) {
14413 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
14414 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
14417 if (!pool
.tiers
.empty()) {
14418 *ss
<< "pool '" << poolstr
<< "' has tiers";
14419 for(auto tier
: pool
.tiers
) {
14420 *ss
<< " " << osdmap
.get_pool_name(tier
);
14425 if (!g_conf()->mon_allow_pool_delete
) {
14426 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14430 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
14431 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
14435 *ss
<< "pool '" << poolstr
<< "' removed";
14440 * Check if it is safe to add a tier to a base pool
14443 * True if the operation should proceed, false if we should abort here
14444 * (abort doesn't necessarily mean error, could be idempotency)
14446 bool OSDMonitor::_check_become_tier(
14447 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
14448 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14452 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
14453 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14455 if (tier_pool
->is_crimson()) {
14456 *ss
<< "pool '" << tier_pool_name
<< "' is a crimson pool, tiering "
14457 << "features are not supported";
14461 if (base_pool
->is_crimson()) {
14462 *ss
<< "pool '" << base_pool_name
<< "' is a crimson pool, tiering "
14463 << "features are not supported";
14468 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14469 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
14470 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
14475 if (base_pool
->tiers
.count(tier_pool_id
)) {
14476 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
14478 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
14479 << base_pool_name
<< "'";
14483 if (base_pool
->is_tier()) {
14484 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
14485 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
14486 << "multiple tiers are not yet supported.";
14491 if (tier_pool
->has_tiers()) {
14492 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
14493 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
14494 it
!= tier_pool
->tiers
.end(); ++it
)
14495 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
14496 *ss
<< " multiple tiers are not yet supported.";
14501 if (tier_pool
->is_tier()) {
14502 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
14503 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
14514 * Check if it is safe to remove a tier from this base pool
14517 * True if the operation should proceed, false if we should abort here
14518 * (abort doesn't necessarily mean error, could be idempotency)
14520 bool OSDMonitor::_check_remove_tier(
14521 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14522 const pg_pool_t
*tier_pool
,
14523 int *err
, ostream
*ss
) const
14525 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14527 // Apply CephFS-specific checks
14528 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14529 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
14530 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
14531 // If the underlying pool is erasure coded and does not allow EC
14532 // overwrites, we can't permit the removal of the replicated tier that
14533 // CephFS relies on to access it
14534 *ss
<< "pool '" << base_pool_name
<<
14535 "' does not allow EC overwrites and is in use by CephFS"
14541 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
14542 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
14543 "tier is still in use as a writeback cache. Change the cache "
14544 "mode and flush the cache before removing it";
14554 int OSDMonitor::_prepare_remove_pool(
14555 int64_t pool
, ostream
*ss
, bool no_fake
)
14557 dout(10) << __func__
<< " " << pool
<< dendl
;
14558 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
14559 int r
= _check_remove_pool(pool
, *p
, ss
);
14563 auto new_pool
= pending_inc
.new_pools
.find(pool
);
14564 if (new_pool
!= pending_inc
.new_pools
.end()) {
14565 // if there is a problem with the pending info, wait and retry
14567 const auto& p
= new_pool
->second
;
14568 int r
= _check_remove_pool(pool
, p
, ss
);
14573 if (pending_inc
.old_pools
.count(pool
)) {
14574 dout(10) << __func__
<< " " << pool
<< " already pending removal"
14579 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
14580 string old_name
= osdmap
.get_pool_name(pool
);
14581 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
14582 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
14583 << old_name
<< " -> " << new_name
<< dendl
;
14584 pending_inc
.new_pool_names
[pool
] = new_name
;
14589 pending_inc
.old_pools
.insert(pool
);
14591 // remove any pg_temp mappings for this pool
14592 for (auto p
= osdmap
.pg_temp
->begin();
14593 p
!= osdmap
.pg_temp
->end();
14595 if (p
->first
.pool() == pool
) {
14596 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
14597 << p
->first
<< dendl
;
14598 pending_inc
.new_pg_temp
[p
->first
].clear();
14601 // remove any primary_temp mappings for this pool
14602 for (auto p
= osdmap
.primary_temp
->begin();
14603 p
!= osdmap
.primary_temp
->end();
14605 if (p
->first
.pool() == pool
) {
14606 dout(10) << __func__
<< " " << pool
14607 << " removing obsolete primary_temp" << p
->first
<< dendl
;
14608 pending_inc
.new_primary_temp
[p
->first
] = -1;
14611 // remove any pg_upmap mappings for this pool
14612 for (auto& p
: osdmap
.pg_upmap
) {
14613 if (p
.first
.pool() == pool
) {
14614 dout(10) << __func__
<< " " << pool
14615 << " removing obsolete pg_upmap "
14616 << p
.first
<< dendl
;
14617 pending_inc
.old_pg_upmap
.insert(p
.first
);
14620 // remove any pending pg_upmap mappings for this pool
14622 auto it
= pending_inc
.new_pg_upmap
.begin();
14623 while (it
!= pending_inc
.new_pg_upmap
.end()) {
14624 if (it
->first
.pool() == pool
) {
14625 dout(10) << __func__
<< " " << pool
14626 << " removing pending pg_upmap "
14627 << it
->first
<< dendl
;
14628 it
= pending_inc
.new_pg_upmap
.erase(it
);
14634 // remove any pg_upmap_items mappings for this pool
14635 for (auto& p
: osdmap
.pg_upmap_items
) {
14636 if (p
.first
.pool() == pool
) {
14637 dout(10) << __func__
<< " " << pool
14638 << " removing obsolete pg_upmap_items " << p
.first
14640 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
14643 // remove any pending pg_upmap mappings for this pool
14645 auto it
= pending_inc
.new_pg_upmap_items
.begin();
14646 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
14647 if (it
->first
.pool() == pool
) {
14648 dout(10) << __func__
<< " " << pool
14649 << " removing pending pg_upmap_items "
14650 << it
->first
<< dendl
;
14651 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
14658 // remove any choose_args for this pool
14659 CrushWrapper newcrush
= _get_pending_crush();
14660 if (newcrush
.have_choose_args(pool
)) {
14661 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
14662 newcrush
.rm_choose_args(pool
);
14663 pending_inc
.crush
.clear();
14664 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
14669 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
14671 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
14672 if (pending_inc
.old_pools
.count(pool
)) {
14673 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
14676 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
14677 p
!= pending_inc
.new_pool_names
.end();
14679 if (p
->second
== newname
&& p
->first
!= pool
) {
14684 pending_inc
.new_pool_names
[pool
] = newname
;
14688 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
14690 op
->mark_osdmon_event(__func__
);
14691 auto m
= op
->get_req
<MPoolOp
>();
14693 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
14694 if (ret
== -EAGAIN
) {
14695 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
14699 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
14700 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
14701 pending_inc
.epoch
));
14705 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
14706 int ret
, epoch_t epoch
, bufferlist
*blp
)
14708 op
->mark_osdmon_event(__func__
);
14709 auto m
= op
->get_req
<MPoolOp
>();
14710 dout(20) << "_pool_op_reply " << ret
<< dendl
;
14711 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
14712 ret
, epoch
, get_last_committed(), blp
);
14713 mon
.send_reply(op
, reply
);
14716 void OSDMonitor::convert_pool_priorities(void)
14718 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
14719 int64_t max_prio
= 0;
14720 int64_t min_prio
= 0;
14721 for (const auto &i
: osdmap
.get_pools()) {
14722 const auto &pool
= i
.second
;
14724 if (pool
.opts
.is_set(key
)) {
14726 pool
.opts
.get(key
, &prio
);
14727 if (prio
> max_prio
)
14729 if (prio
< min_prio
)
14733 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
14734 dout(20) << __func__
<< " nothing to fix" << dendl
;
14737 // Current pool priorities exceeds new maximum
14738 for (const auto &i
: osdmap
.get_pools()) {
14739 const auto pool_id
= i
.first
;
14740 pg_pool_t pool
= i
.second
;
14743 pool
.opts
.get(key
, &prio
);
14746 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
14747 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14748 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
14749 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
14750 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14751 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
14756 pool
.opts
.unset(key
);
14758 pool
.opts
.set(key
, static_cast<int64_t>(n
));
14760 dout(10) << __func__
<< " pool " << pool_id
14761 << " recovery_priority adjusted "
14762 << prio
<< " to " << n
<< dendl
;
14763 pool
.last_change
= pending_inc
.epoch
;
14764 pending_inc
.new_pools
[pool_id
] = pool
;
14768 void OSDMonitor::try_enable_stretch_mode_pools(stringstream
& ss
, bool *okay
,
14770 set
<pg_pool_t
*>* pools
,
14771 const string
& new_crush_rule
)
14773 dout(20) << __func__
<< dendl
;
14775 int new_crush_rule_result
= osdmap
.crush
->get_rule_id(new_crush_rule
);
14776 if (new_crush_rule_result
< 0) {
14777 ss
<< "unrecognized crush rule " << new_crush_rule_result
;
14778 *errcode
= new_crush_rule_result
;
14781 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
14782 for (const auto& pooli
: osdmap
.pools
) {
14783 int64_t poolid
= pooli
.first
;
14784 const pg_pool_t
*p
= &pooli
.second
;
14785 if (!p
->is_replicated()) {
14786 ss
<< "stretched pools must be replicated; '" << osdmap
.pool_name
[poolid
] << "' is erasure-coded";
14787 *errcode
= -EINVAL
;
14790 uint8_t default_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
14791 if ((p
->get_size() != default_size
||
14792 (p
->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size
))) &&
14793 (p
->get_crush_rule() != new_rule
)) {
14794 ss
<< "we currently require stretch mode pools start out with the"
14795 " default size/min_size, which '" << osdmap
.pool_name
[poolid
] << "' does not";
14796 *errcode
= -EINVAL
;
14799 pg_pool_t
*pp
= pending_inc
.get_new_pool(poolid
, p
);
14800 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14801 // the attempt may fail and then we have these pool updates...but they won't do anything
14802 // if there is a failure, so if it's hard to change the interface, no need to bother
14809 void OSDMonitor::try_enable_stretch_mode(stringstream
& ss
, bool *okay
,
14810 int *errcode
, bool commit
,
14811 const string
& dividing_bucket
,
14812 uint32_t bucket_count
,
14813 const set
<pg_pool_t
*>& pools
,
14814 const string
& new_crush_rule
)
14816 dout(20) << __func__
<< dendl
;
14818 CrushWrapper crush
= _get_pending_crush();
14819 int dividing_id
= -1;
14820 if (auto type_id
= crush
.get_validated_type_id(dividing_bucket
);
14821 !type_id
.has_value()) {
14822 ss
<< dividing_bucket
<< " is not a valid crush bucket type";
14823 *errcode
= -ENOENT
;
14824 ceph_assert(!commit
);
14827 dividing_id
= *type_id
;
14829 vector
<int> subtrees
;
14830 crush
.get_subtree_of_type(dividing_id
, &subtrees
);
14831 if (subtrees
.size() != 2) {
14832 ss
<< "there are " << subtrees
.size() << dividing_bucket
14833 << "'s in the cluster but stretch mode currently only works with 2!";
14834 *errcode
= -EINVAL
;
14835 ceph_assert(!commit
|| subtrees
.size() == 2);
14839 int new_crush_rule_result
= crush
.get_rule_id(new_crush_rule
);
14840 if (new_crush_rule_result
< 0) {
14841 ss
<< "unrecognized crush rule " << new_crush_rule
;
14842 *errcode
= new_crush_rule_result
;
14843 ceph_assert(!commit
|| (new_crush_rule_result
> 0));
14846 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
14848 int weight1
= crush
.get_item_weight(subtrees
[0]);
14849 int weight2
= crush
.get_item_weight(subtrees
[1]);
14850 if (weight1
!= weight2
) {
14851 // TODO: I'm really not sure this is a good idea?
14852 ss
<< "the 2 " << dividing_bucket
14853 << "instances in the cluster have differing weights "
14854 << weight1
<< " and " << weight2
14855 <<" but stretch mode currently requires they be the same!";
14856 *errcode
= -EINVAL
;
14857 ceph_assert(!commit
|| (weight1
== weight2
));
14860 if (bucket_count
!= 2) {
14861 ss
<< "currently we only support 2-site stretch clusters!";
14862 *errcode
= -EINVAL
;
14863 ceph_assert(!commit
|| bucket_count
== 2);
14866 // TODO: check CRUSH rules for pools so that we are appropriately divided
14868 for (auto pool
: pools
) {
14869 pool
->crush_rule
= new_rule
;
14870 pool
->peering_crush_bucket_count
= bucket_count
;
14871 pool
->peering_crush_bucket_target
= bucket_count
;
14872 pool
->peering_crush_bucket_barrier
= dividing_id
;
14873 pool
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14874 pool
->size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
14875 pool
->min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
14877 pending_inc
.change_stretch_mode
= true;
14878 pending_inc
.stretch_mode_enabled
= true;
14879 pending_inc
.new_stretch_bucket_count
= bucket_count
;
14880 pending_inc
.new_degraded_stretch_mode
= 0;
14881 pending_inc
.new_stretch_mode_bucket
= dividing_id
;
14887 bool OSDMonitor::check_for_dead_crush_zones(const map
<string
,set
<string
>>& dead_buckets
,
14888 set
<int> *really_down_buckets
,
14889 set
<string
> *really_down_mons
)
14891 dout(20) << __func__
<< " with dead mon zones " << dead_buckets
<< dendl
;
14892 ceph_assert(is_readable());
14893 if (dead_buckets
.empty()) return false;
14894 set
<int> down_cache
;
14895 bool really_down
= false;
14896 for (auto dbi
: dead_buckets
) {
14897 const string
& bucket_name
= dbi
.first
;
14898 ceph_assert(osdmap
.crush
->name_exists(bucket_name
));
14899 int bucket_id
= osdmap
.crush
->get_item_id(bucket_name
);
14900 dout(20) << "Checking " << bucket_name
<< " id " << bucket_id
14901 << " to see if OSDs are also down" << dendl
;
14902 bool subtree_down
= osdmap
.subtree_is_down(bucket_id
, &down_cache
);
14903 if (subtree_down
) {
14904 dout(20) << "subtree is down!" << dendl
;
14905 really_down
= true;
14906 really_down_buckets
->insert(bucket_id
);
14907 really_down_mons
->insert(dbi
.second
.begin(), dbi
.second
.end());
14910 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14911 << " and mons " << *really_down_mons
<< " are really down" << dendl
;
14912 return really_down
;
14915 void OSDMonitor::trigger_degraded_stretch_mode(const set
<int>& dead_buckets
,
14916 const set
<string
>& live_zones
)
14918 dout(20) << __func__
<< dendl
;
14919 stretch_recovery_triggered
.set_from_double(0); // reset this; we can't go clean now!
14920 // update the general OSDMap changes
14921 pending_inc
.change_stretch_mode
= true;
14922 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14923 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14924 int new_site_count
= osdmap
.stretch_bucket_count
- dead_buckets
.size();
14925 ceph_assert(new_site_count
== 1); // stretch count 2!
14926 pending_inc
.new_degraded_stretch_mode
= new_site_count
;
14927 pending_inc
.new_recovering_stretch_mode
= 0;
14928 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14930 // and then apply them to all the pg_pool_ts
14931 ceph_assert(live_zones
.size() == 1); // only support 2 zones now
14932 const string
& remaining_site_name
= *(live_zones
.begin());
14933 ceph_assert(osdmap
.crush
->name_exists(remaining_site_name
));
14934 int remaining_site
= osdmap
.crush
->get_item_id(remaining_site_name
);
14935 for (auto pgi
: osdmap
.pools
) {
14936 if (pgi
.second
.peering_crush_bucket_count
) {
14937 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14938 newp
.peering_crush_bucket_count
= new_site_count
;
14939 newp
.peering_crush_mandatory_member
= remaining_site
;
14940 newp
.min_size
= pgi
.second
.min_size
/ 2; // only support 2 zones now
14941 newp
.set_last_force_op_resend(pending_inc
.epoch
);
14947 void OSDMonitor::trigger_recovery_stretch_mode()
14949 dout(20) << __func__
<< dendl
;
14950 stretch_recovery_triggered
.set_from_double(0); // reset this so we don't go full-active prematurely
14951 pending_inc
.change_stretch_mode
= true;
14952 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14953 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14954 pending_inc
.new_degraded_stretch_mode
= osdmap
.degraded_stretch_mode
;
14955 pending_inc
.new_recovering_stretch_mode
= 1;
14956 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14958 for (auto pgi
: osdmap
.pools
) {
14959 if (pgi
.second
.peering_crush_bucket_count
) {
14960 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14961 newp
.set_last_force_op_resend(pending_inc
.epoch
);
14967 void OSDMonitor::set_degraded_stretch_mode()
14969 stretch_recovery_triggered
.set_from_double(0);
14972 void OSDMonitor::set_recovery_stretch_mode()
14974 if (stretch_recovery_triggered
.is_zero()) {
14975 stretch_recovery_triggered
= ceph_clock_now();
14979 void OSDMonitor::set_healthy_stretch_mode()
14981 stretch_recovery_triggered
.set_from_double(0);
14984 void OSDMonitor::notify_new_pg_digest()
14986 dout(20) << __func__
<< dendl
;
14987 if (!stretch_recovery_triggered
.is_zero()) {
14988 try_end_recovery_stretch_mode(false);
14992 struct CMonExitRecovery
: public Context
{
14995 CMonExitRecovery(OSDMonitor
*mon
, bool f
) : m(mon
), force(f
) {}
14996 void finish(int r
) {
14997 m
->try_end_recovery_stretch_mode(force
);
15001 void OSDMonitor::try_end_recovery_stretch_mode(bool force
)
15003 dout(20) << __func__
<< dendl
;
15004 if (!mon
.is_leader()) return;
15005 if (!mon
.is_degraded_stretch_mode()) return;
15006 if (!mon
.is_recovering_stretch_mode()) return;
15007 if (!is_readable()) {
15008 wait_for_readable_ctx(new CMonExitRecovery(this, force
));
15012 if (osdmap
.recovering_stretch_mode
&&
15013 ((!stretch_recovery_triggered
.is_zero() &&
15014 ceph_clock_now() - g_conf().get_val
<double>("mon_stretch_recovery_min_wait") >
15015 stretch_recovery_triggered
) ||
15017 if (!mon
.mgrstatmon()->is_readable()) {
15018 mon
.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force
));
15021 const PGMapDigest
& pgd
= mon
.mgrstatmon()->get_digest();
15022 double misplaced
, degraded
, inactive
, unknown
;
15023 pgd
.get_recovery_stats(&misplaced
, °raded
, &inactive
, &unknown
);
15024 if (force
|| (degraded
== 0.0 && inactive
== 0.0 && unknown
== 0.0)) {
15025 // we can exit degraded stretch mode!
15026 mon
.trigger_healthy_stretch_mode();
15031 void OSDMonitor::trigger_healthy_stretch_mode()
15033 ceph_assert(is_writeable());
15034 stretch_recovery_triggered
.set_from_double(0);
15035 pending_inc
.change_stretch_mode
= true;
15036 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
15037 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
15038 pending_inc
.new_degraded_stretch_mode
= 0; // turn off degraded mode...
15039 pending_inc
.new_recovering_stretch_mode
= 0; //...and recovering mode!
15040 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
15041 for (auto pgi
: osdmap
.pools
) {
15042 if (pgi
.second
.peering_crush_bucket_count
) {
15043 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
15044 newp
.peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
15045 newp
.peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
15046 newp
.min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
15047 newp
.set_last_force_op_resend(pending_inc
.epoch
);