// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MRoute.h"
57 #include "messages/MMonGetPurgedSnaps.h"
58 #include "messages/MMonGetPurgedSnapsReply.h"
60 #include "common/TextTable.h"
61 #include "common/Timer.h"
62 #include "common/ceph_argparse.h"
63 #include "common/perf_counters.h"
64 #include "common/PriorityCache.h"
65 #include "common/strtol.h"
66 #include "common/numa.h"
68 #include "common/config.h"
69 #include "common/errno.h"
71 #include "erasure-code/ErasureCodePlugin.h"
72 #include "compressor/Compressor.h"
73 #include "common/Checksummer.h"
75 #include "include/compat.h"
76 #include "include/ceph_assert.h"
77 #include "include/stringify.h"
78 #include "include/util.h"
79 #include "common/cmdparse.h"
80 #include "include/str_list.h"
81 #include "include/str_map.h"
82 #include "include/scope_guard.h"
83 #include "perfglue/heap_profiler.h"
85 #include "auth/cephx/CephxKeyServer.h"
86 #include "osd/OSDCap.h"
88 #include "json_spirit/json_spirit_reader.h"
90 #include <boost/algorithm/string/predicate.hpp>
97 using std::ostringstream
;
101 using std::stringstream
;
102 using std::to_string
;
105 using ceph::bufferlist
;
108 using ceph::ErasureCodeInterfaceRef
;
109 using ceph::ErasureCodePluginRegistry
;
110 using ceph::ErasureCodeProfile
;
111 using ceph::Formatter
;
112 using ceph::JSONFormatter
;
113 using ceph::make_message
;
#define dout_subsys ceph_subsys_mon

// Prefixes used by the OSD monitor when persisting state in the mon's
// key/value store (see OSDMonitor::get_store_prefixes()).
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
/*

  OSD snapshot metadata
  ---------------------

  -- starting with mimic, removed in octopus --

  "removed_epoch_%llu_%08lx" % (pool, epoch)
  -> interval_set<snapid_t>

  "removed_snap_%llu_%016llx" % (pool, last_snap)
  -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)


  -- starting with mimic --

  "purged_snap_%llu_%016llx" % (pool, last_snap)
  -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)

  - note that the {removed,purged}_snap put the last snap in the key so
    that we can use forward iteration only to search for an epoch in an
    interval. e.g., to test if epoch N is removed/purged, we'll find a key
    >= N that either does or doesn't contain the given snap.


  -- starting with octopus --

  "purged_epoch_%08lx" % epoch
  -> map<int64_t,interval_set<snapid_t>>

 */
151 using namespace TOPNSPC::common
;
154 struct OSDMemCache
: public PriorityCache::PriCache
{
156 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
157 int64_t committed_bytes
= 0;
158 double cache_ratio
= 0;
160 OSDMemCache(OSDMonitor
*m
) : osdmon(m
) {};
162 virtual uint64_t _get_used_bytes() const = 0;
164 virtual int64_t request_cache_bytes(
165 PriorityCache::Priority pri
, uint64_t total_cache
) const {
166 int64_t assigned
= get_cache_bytes(pri
);
169 // All cache items are currently set to have PRI1 priority
170 case PriorityCache::Priority::PRI1
:
172 int64_t request
= _get_used_bytes();
173 return (request
> assigned
) ? request
- assigned
: 0;
181 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
182 return cache_bytes
[pri
];
185 virtual int64_t get_cache_bytes() const {
188 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
189 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
190 total
+= get_cache_bytes(pri
);
195 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
196 cache_bytes
[pri
] = bytes
;
198 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
199 cache_bytes
[pri
] += bytes
;
201 virtual int64_t commit_cache_size(uint64_t total_cache
) {
202 committed_bytes
= PriorityCache::get_chunk(
203 get_cache_bytes(), total_cache
);
204 return committed_bytes
;
206 virtual int64_t get_committed_size() const {
207 return committed_bytes
;
209 virtual double get_cache_ratio() const {
212 virtual void set_cache_ratio(double ratio
) {
215 virtual void shift_bins() {
217 virtual void import_bins(const std::vector
<uint64_t> &bins
) {
219 virtual void set_bins(PriorityCache::Priority pri
, uint64_t end_bin
) {
221 virtual uint64_t get_bins(PriorityCache::Priority pri
) const {
225 virtual string
get_cache_name() const = 0;
228 struct IncCache
: public OSDMemCache
{
229 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
231 virtual uint64_t _get_used_bytes() const {
232 return osdmon
->inc_osd_cache
.get_bytes();
235 virtual string
get_cache_name() const {
236 return "OSDMap Inc Cache";
239 uint64_t _get_num_osdmaps() const {
240 return osdmon
->inc_osd_cache
.get_size();
244 struct FullCache
: public OSDMemCache
{
245 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
247 virtual uint64_t _get_used_bytes() const {
248 return osdmon
->full_osd_cache
.get_bytes();
251 virtual string
get_cache_name() const {
252 return "OSDMap Full Cache";
255 uint64_t _get_num_osdmaps() const {
256 return osdmon
->full_osd_cache
.get_size();
260 std::shared_ptr
<IncCache
> inc_cache
;
261 std::shared_ptr
<FullCache
> full_cache
;
263 const uint32_t MAX_POOL_APPLICATIONS
= 4;
264 const uint32_t MAX_POOL_APPLICATION_KEYS
= 64;
265 const uint32_t MAX_POOL_APPLICATION_LENGTH
= 128;
267 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
268 // Note: this doesn't include support for the application tag match
269 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
270 auto& match
= grant
.match
;
271 if (match
.is_match_all()) {
273 } else if (pool_name
!= nullptr &&
274 !match
.pool_namespace
.pool_name
.empty() &&
275 match
.pool_namespace
.pool_name
== *pool_name
) {
282 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
283 const KeyServer
& key_server
,
284 const EntityName
& entity_name
,
285 const MonCap
& mon_caps
,
286 const entity_addr_t
& peer_socket_addr
,
287 const std::string
* pool_name
)
289 typedef std::map
<std::string
, std::string
> CommandArgs
;
291 if (mon_caps
.is_capable(
292 cct
, entity_name
, "osd",
293 "osd pool op unmanaged-snap",
294 (pool_name
== nullptr ?
295 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
296 CommandArgs
{{"poolname", *pool_name
}}),
302 AuthCapsInfo caps_info
;
303 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
305 dout(10) << "unable to locate OSD cap data for " << entity_name
306 << " in auth db" << dendl
;
311 if (caps_info
.caps
.length() > 0) {
312 auto p
= caps_info
.caps
.cbegin();
315 } catch (const ceph::buffer::error
&err
) {
316 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
323 if (!osd_cap
.parse(caps_str
, nullptr)) {
324 dout(10) << "unable to parse OSD cap data for " << entity_name
325 << " in auth db" << dendl
;
329 // if the entity has write permissions in one or all pools, permit
330 // usage of unmanaged-snapshots
331 if (osd_cap
.allow_all()) {
335 for (auto& grant
: osd_cap
.grants
) {
336 if (grant
.profile
.is_valid()) {
337 for (auto& profile_grant
: grant
.profile_grants
) {
338 if (is_osd_writable(profile_grant
, pool_name
)) {
342 } else if (is_osd_writable(grant
, pool_name
)) {
350 } // anonymous namespace
352 void LastEpochClean::Lec::report(unsigned pg_num
, ps_t ps
,
353 epoch_t last_epoch_clean
)
359 epoch_by_pg
.resize(pg_num
, 0);
360 const auto old_lec
= epoch_by_pg
[ps
];
361 if (old_lec
>= last_epoch_clean
) {
365 epoch_by_pg
[ps
] = last_epoch_clean
;
366 if (last_epoch_clean
< floor
) {
367 floor
= last_epoch_clean
;
368 } else if (last_epoch_clean
> floor
) {
369 if (old_lec
== floor
) {
370 // probably should increase floor?
371 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
372 std::end(epoch_by_pg
));
376 if (ps
!= next_missing
) {
379 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
380 if (epoch_by_pg
[next_missing
] == 0) {
386 void LastEpochClean::remove_pool(uint64_t pool
)
388 report_by_pool
.erase(pool
);
391 void LastEpochClean::report(unsigned pg_num
, const pg_t
& pg
,
392 epoch_t last_epoch_clean
)
394 auto& lec
= report_by_pool
[pg
.pool()];
395 return lec
.report(pg_num
, pg
.ps(), last_epoch_clean
);
398 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
400 auto floor
= latest
.get_epoch();
401 for (auto& pool
: latest
.get_pools()) {
402 auto reported
= report_by_pool
.find(pool
.first
);
403 if (reported
== report_by_pool
.end()) {
406 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
409 if (reported
->second
.floor
< floor
) {
410 floor
= reported
->second
.floor
;
416 void LastEpochClean::dump(Formatter
*f
) const
418 f
->open_array_section("per_pool");
420 for (auto& [pool
, lec
] : report_by_pool
) {
421 f
->open_object_section("pool");
422 f
->dump_unsigned("poolid", pool
);
423 f
->dump_unsigned("floor", lec
.floor
);
430 class C_UpdateCreatingPGs
: public Context
{
435 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
436 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
437 void finish(int r
) override
{
439 utime_t end
= ceph_clock_now();
440 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
441 << (end
- start
) << " seconds" << dendl
;
442 osdmon
->update_creating_pgs();
443 osdmon
->check_pg_creates_subs();
449 #define dout_prefix _prefix(_dout, mon, osdmap)
450 static ostream
& _prefix(std::ostream
*_dout
, Monitor
&mon
, const OSDMap
& osdmap
) {
451 return *_dout
<< "mon." << mon
.name
<< "@" << mon
.rank
452 << "(" << mon
.get_state_name()
453 << ").osd e" << osdmap
.get_epoch() << " ";
456 OSDMonitor::OSDMonitor(
460 const string
& service_name
)
461 : PaxosService(mn
, p
, service_name
),
463 inc_osd_cache(g_conf()->mon_osd_cache_size
),
464 full_osd_cache(g_conf()->mon_osd_cache_size
),
465 has_osdmap_manifest(false),
466 mapper(mn
.cct
, &mn
.cpu_tp
)
468 inc_cache
= std::make_shared
<IncCache
>(this);
469 full_cache
= std::make_shared
<FullCache
>(this);
470 cct
->_conf
.add_observer(this);
471 int r
= _set_cache_sizes();
473 derr
<< __func__
<< " using default osd cache size - mon_osd_cache_size ("
474 << g_conf()->mon_osd_cache_size
475 << ") without priority cache management"
480 const char **OSDMonitor::get_tracked_conf_keys() const
482 static const char* KEYS
[] = {
484 "mon_memory_autotune",
485 "rocksdb_cache_size",
491 void OSDMonitor::handle_conf_change(const ConfigProxy
& conf
,
492 const std::set
<std::string
> &changed
)
494 dout(10) << __func__
<< " " << changed
<< dendl
;
496 if (changed
.count("mon_memory_autotune")) {
497 _set_cache_autotuning();
499 if (changed
.count("mon_memory_target") ||
500 changed
.count("rocksdb_cache_size")) {
501 int r
= _update_mon_cache_settings();
503 derr
<< __func__
<< " mon_memory_target:"
504 << g_conf()->mon_memory_target
505 << " rocksdb_cache_size:"
506 << g_conf()->rocksdb_cache_size
507 << ". Unable to update cache size."
513 void OSDMonitor::_set_cache_autotuning()
515 if (!g_conf()->mon_memory_autotune
&& pcm
!= nullptr) {
516 // Disable cache autotuning
517 std::lock_guard
l(balancer_lock
);
521 if (g_conf()->mon_memory_autotune
&& pcm
== nullptr) {
522 int r
= register_cache_with_pcm();
525 << " Error while registering osdmon caches with pcm."
526 << " Cache auto tuning not enabled."
528 mon_memory_autotune
= false;
530 mon_memory_autotune
= true;
535 int OSDMonitor::_update_mon_cache_settings()
537 if (g_conf()->mon_memory_target
<= 0 ||
538 g_conf()->mon_memory_target
< mon_memory_min
||
539 g_conf()->rocksdb_cache_size
<= 0) {
543 if (pcm
== nullptr && rocksdb_binned_kv_cache
== nullptr) {
544 derr
<< __func__
<< " not using pcm and rocksdb" << dendl
;
548 uint64_t old_mon_memory_target
= mon_memory_target
;
549 uint64_t old_rocksdb_cache_size
= rocksdb_cache_size
;
551 // Set the new pcm memory cache sizes
552 mon_memory_target
= g_conf()->mon_memory_target
;
553 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
555 uint64_t base
= mon_memory_base
;
556 double fragmentation
= mon_memory_fragmentation
;
557 uint64_t target
= mon_memory_target
;
558 uint64_t min
= mon_memory_min
;
561 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
562 if (ltarget
> base
+ min
) {
563 max
= ltarget
- base
;
566 int r
= _set_cache_ratios();
568 derr
<< __func__
<< " Cache ratios for pcm could not be set."
569 << " Review the kv (rocksdb) and mon_memory_target sizes."
571 mon_memory_target
= old_mon_memory_target
;
572 rocksdb_cache_size
= old_rocksdb_cache_size
;
576 if (mon_memory_autotune
&& pcm
!= nullptr) {
577 std::lock_guard
l(balancer_lock
);
578 // set pcm cache levels
579 pcm
->set_target_memory(target
);
580 pcm
->set_min_memory(min
);
581 pcm
->set_max_memory(max
);
582 // tune memory based on new values
585 _set_new_cache_sizes();
586 dout(1) << __func__
<< " Updated mon cache setting."
587 << " target: " << target
595 int OSDMonitor::_set_cache_sizes()
597 if (g_conf()->mon_memory_autotune
) {
598 // set the new osdmon cache targets to be managed by pcm
599 mon_osd_cache_size
= g_conf()->mon_osd_cache_size
;
600 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
601 mon_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
602 mon_memory_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
603 mon_memory_target
= g_conf()->mon_memory_target
;
604 mon_memory_min
= g_conf()->mon_osd_cache_size_min
;
605 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
606 derr
<< __func__
<< " mon_memory_target:" << mon_memory_target
607 << " mon_memory_min:" << mon_memory_min
608 << ". Invalid size option(s) provided."
612 // Set the initial inc and full LRU cache sizes
613 inc_osd_cache
.set_bytes(mon_memory_min
);
614 full_osd_cache
.set_bytes(mon_memory_min
);
615 mon_memory_autotune
= g_conf()->mon_memory_autotune
;
620 bool OSDMonitor::_have_pending_crush()
622 return pending_inc
.crush
.length() > 0;
625 CrushWrapper
&OSDMonitor::_get_stable_crush()
627 return *osdmap
.crush
;
630 CrushWrapper
OSDMonitor::_get_pending_crush()
633 if (pending_inc
.crush
.length())
634 bl
= pending_inc
.crush
;
636 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
638 auto p
= bl
.cbegin();
644 void OSDMonitor::create_initial()
646 dout(10) << "create_initial for " << mon
.monmap
->fsid
<< dendl
;
651 mon
.store
->get("mkfs", "osdmap", bl
);
655 newmap
.set_fsid(mon
.monmap
->fsid
);
657 newmap
.build_simple(cct
, 0, mon
.monmap
->fsid
, 0);
660 newmap
.created
= newmap
.modified
= ceph_clock_now();
662 // new clusters should sort bitwise by default.
663 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
666 CEPH_OSDMAP_RECOVERY_DELETES
|
667 CEPH_OSDMAP_PURGED_SNAPDIRS
|
668 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
669 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
670 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
671 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
672 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
673 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
674 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
676 // new cluster should require latest by default
677 if (g_conf().get_val
<bool>("mon_debug_no_require_reef")) {
678 if (g_conf().get_val
<bool>("mon_debug_no_require_quincy")) {
679 derr
<< __func__
<< " mon_debug_no_require_reef and quincy=true" << dendl
;
680 newmap
.require_osd_release
= ceph_release_t::pacific
;
682 derr
<< __func__
<< " mon_debug_no_require_reef=true" << dendl
;
683 newmap
.require_osd_release
= ceph_release_t::quincy
;
686 newmap
.require_osd_release
= ceph_release_t::reef
;
689 ceph_release_t r
= ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client
);
691 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
693 newmap
.require_min_compat_client
= r
;
695 // encode into pending incremental
696 uint64_t features
= newmap
.get_encoding_features();
697 newmap
.encode(pending_inc
.fullmap
,
698 features
| CEPH_FEATURE_RESERVED
);
699 pending_inc
.full_crc
= newmap
.get_crc();
700 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
703 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
705 s
.insert(service_name
);
706 s
.insert(OSD_PG_CREATING_PREFIX
);
707 s
.insert(OSD_METADATA_PREFIX
);
708 s
.insert(OSD_SNAP_PREFIX
);
711 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
713 // we really don't care if the version has been updated, because we may
714 // have trimmed without having increased the last committed; yet, we may
715 // need to update the in-memory manifest.
716 load_osdmap_manifest();
718 version_t version
= get_last_committed();
719 if (version
== osdmap
.epoch
)
721 ceph_assert(version
> osdmap
.epoch
);
723 dout(15) << "update_from_paxos paxos e " << version
724 << ", my e " << osdmap
.epoch
<< dendl
;
726 int prev_num_up_osd
= osdmap
.num_up_osd
;
729 if (!mapping_job
->is_done()) {
730 dout(1) << __func__
<< " mapping job "
731 << mapping_job
.get() << " did not complete, "
732 << mapping_job
->shards
<< " left, canceling" << dendl
;
733 mapping_job
->abort();
741 * We will possibly have a stashed latest that *we* wrote, and we will
742 * always be sure to have the oldest full map in the first..last range
743 * due to encode_trim_extra(), which includes the oldest full map in the trim
746 * encode_trim_extra() does not however write the full map's
747 * version to 'full_latest'. This is only done when we are building the
748 * full maps from the incremental versions. But don't panic! We make sure
749 * that the following conditions find whichever full map version is newer.
751 version_t latest_full
= get_version_latest_full();
752 if (latest_full
== 0 && get_first_committed() > 1)
753 latest_full
= get_first_committed();
755 if (get_first_committed() > 1 &&
756 latest_full
< get_first_committed()) {
757 // the monitor could be just sync'ed with its peer, and the latest_full key
758 // is not encoded in the paxos commits in encode_pending(), so we need to
759 // make sure we get it pointing to a proper version.
760 version_t lc
= get_last_committed();
761 version_t fc
= get_first_committed();
763 dout(10) << __func__
<< " looking for valid full map in interval"
764 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
767 for (version_t v
= lc
; v
>= fc
; v
--) {
768 string full_key
= "full_" + stringify(v
);
769 if (mon
.store
->exists(get_service_name(), full_key
)) {
770 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
776 ceph_assert(latest_full
> 0);
777 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
778 put_version_latest_full(t
, latest_full
);
779 mon
.store
->apply_transaction(t
);
780 dout(10) << __func__
<< " updated the on-disk full map version to "
781 << latest_full
<< dendl
;
784 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
785 bufferlist latest_bl
;
786 get_version_full(latest_full
, latest_bl
);
787 ceph_assert(latest_bl
.length() != 0);
788 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
790 osdmap
.decode(latest_bl
);
794 if (!mon
.store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
795 auto p
= bl
.cbegin();
796 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
797 creating_pgs
.decode(p
);
798 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
799 << creating_pgs
.last_scan_epoch
800 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
802 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
806 // walk through incrementals
807 MonitorDBStore::TransactionRef t
;
809 while (version
> osdmap
.epoch
) {
811 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
812 ceph_assert(err
== 0);
813 ceph_assert(inc_bl
.length());
814 // set priority cache manager levels if the osdmap is
815 // being populated for the first time.
816 if (mon_memory_autotune
&& pcm
== nullptr) {
817 int r
= register_cache_with_pcm();
820 << " Error while registering osdmon caches with pcm."
821 << " Proceeding without cache auto tuning."
826 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
828 OSDMap::Incremental
inc(inc_bl
);
829 err
= osdmap
.apply_incremental(inc
);
830 ceph_assert(err
== 0);
833 t
.reset(new MonitorDBStore::Transaction
);
835 // Write out the full map for all past epochs. Encode the full
836 // map with the same features as the incremental. If we don't
837 // know, use the quorum features. If we don't know those either,
838 // encode with all features.
839 uint64_t f
= inc
.encode_features
;
841 f
= mon
.get_quorum_con_features();
845 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
846 tx_size
+= full_bl
.length();
848 bufferlist orig_full_bl
;
849 get_version_full(osdmap
.epoch
, orig_full_bl
);
850 if (orig_full_bl
.length()) {
851 // the primary provided the full map
852 ceph_assert(inc
.have_crc
);
853 if (inc
.full_crc
!= osdmap
.crc
) {
854 // This will happen if the mons were running mixed versions in
855 // the past or some other circumstance made the full encoded
856 // maps divergent. Reloading here will bring us back into
857 // sync with the primary for this and all future maps. OSDs
858 // will also be brought back into sync when they discover the
859 // crc mismatch and request a full map from a mon.
860 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
863 dout(20) << __func__
<< " my (bad) full osdmap:\n";
864 JSONFormatter
jf(true);
865 jf
.dump_object("osdmap", osdmap
);
867 *_dout
<< "\nhexdump:\n";
868 full_bl
.hexdump(*_dout
);
872 osdmap
.decode(orig_full_bl
);
874 dout(20) << __func__
<< " canonical full osdmap:\n";
875 JSONFormatter
jf(true);
876 jf
.dump_object("osdmap", osdmap
);
878 *_dout
<< "\nhexdump:\n";
879 orig_full_bl
.hexdump(*_dout
);
883 ceph_assert(!inc
.have_crc
);
884 put_version_full(t
, osdmap
.epoch
, full_bl
);
886 put_version_latest_full(t
, osdmap
.epoch
);
889 dout(1) << osdmap
<< dendl
;
891 if (osdmap
.epoch
== 1) {
892 t
->erase("mkfs", "osdmap");
895 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
896 mon
.store
->apply_transaction(t
);
897 t
= MonitorDBStore::TransactionRef();
900 for (auto [osd
, state
] : inc
.new_state
) {
901 if (state
& CEPH_OSD_UP
) {
902 // could be marked up *or* down, but we're too lazy to check which
903 last_osd_report
.erase(osd
);
906 for (auto [osd
, weight
] : inc
.new_weight
) {
907 if (weight
== CEPH_OSD_OUT
) {
908 // manually marked out, so drop it
909 osd_epochs
.erase(osd
);
915 mon
.store
->apply_transaction(t
);
918 bool marked_osd_down
= false;
919 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
920 if (osdmap
.is_out(o
))
922 auto found
= down_pending_out
.find(o
);
923 if (osdmap
.is_down(o
)) {
924 // populate down -> out map
925 if (found
== down_pending_out
.end()) {
926 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
927 down_pending_out
[o
] = ceph_clock_now();
928 marked_osd_down
= true;
931 if (found
!= down_pending_out
.end()) {
932 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
933 down_pending_out
.erase(found
);
937 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
940 check_pg_creates_subs();
942 share_map_with_random_osd();
946 // make sure our feature bits reflect the latest map
947 update_msgr_features();
949 if (!mon
.is_leader()) {
950 // will be called by on_active() on the leader, avoid doing so twice
953 if (osdmap
.stretch_mode_enabled
) {
954 dout(20) << "Stretch mode enabled in this map" << dendl
;
955 mon
.try_engage_stretch_mode();
956 if (osdmap
.degraded_stretch_mode
) {
957 dout(20) << "Degraded stretch mode set in this map" << dendl
;
958 if (!osdmap
.recovering_stretch_mode
) {
959 mon
.set_degraded_stretch_mode();
960 dout(20) << "prev_num_up_osd: " << prev_num_up_osd
<< dendl
;
961 dout(20) << "osdmap.num_up_osd: " << osdmap
.num_up_osd
<< dendl
;
962 dout(20) << "osdmap.num_osd: " << osdmap
.num_osd
<< dendl
;
963 dout(20) << "mon_stretch_cluster_recovery_ratio: " << cct
->_conf
.get_val
<double>("mon_stretch_cluster_recovery_ratio") << dendl
;
964 if (prev_num_up_osd
< osdmap
.num_up_osd
&&
965 (osdmap
.num_up_osd
/ (double)osdmap
.num_osd
) >
966 cct
->_conf
.get_val
<double>("mon_stretch_cluster_recovery_ratio") &&
967 mon
.dead_mon_buckets
.size() == 0) {
968 // TODO: This works for 2-site clusters when the OSD maps are appropriately
969 // trimmed and everything is "normal" but not if you have a lot of out OSDs
970 // you're ignoring or in some really degenerate failure cases
972 dout(10) << "Enabling recovery stretch mode in this map" << dendl
;
973 mon
.go_recovery_stretch_mode();
976 mon
.set_recovery_stretch_mode();
979 mon
.set_healthy_stretch_mode();
981 if (marked_osd_down
&&
982 (!osdmap
.degraded_stretch_mode
|| osdmap
.recovering_stretch_mode
)) {
983 dout(20) << "Checking degraded stretch mode due to osd changes" << dendl
;
984 mon
.maybe_go_degraded_stretch_mode();
989 int OSDMonitor::register_cache_with_pcm()
991 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
992 derr
<< __func__
<< " Invalid memory size specified for mon caches."
993 << " Caches will not be auto-tuned."
997 uint64_t base
= mon_memory_base
;
998 double fragmentation
= mon_memory_fragmentation
;
999 // For calculating total target memory, consider rocksdb cache size.
1000 uint64_t target
= mon_memory_target
;
1001 uint64_t min
= mon_memory_min
;
1004 // Apply the same logic as in bluestore to set the max amount
1005 // of memory to use for cache. Assume base memory for OSDMaps
1006 // and then add in some overhead for fragmentation.
1007 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
1008 if (ltarget
> base
+ min
) {
1009 max
= ltarget
- base
;
1012 rocksdb_binned_kv_cache
= mon
.store
->get_priority_cache();
1013 if (!rocksdb_binned_kv_cache
) {
1014 derr
<< __func__
<< " not using rocksdb" << dendl
;
1018 int r
= _set_cache_ratios();
1020 derr
<< __func__
<< " Cache ratios for pcm could not be set."
1021 << " Review the kv (rocksdb) and mon_memory_target sizes."
1026 pcm
= std::make_shared
<PriorityCache::Manager
>(
1027 cct
, min
, max
, target
, true);
1028 pcm
->insert("kv", rocksdb_binned_kv_cache
, true);
1029 pcm
->insert("inc", inc_cache
, true);
1030 pcm
->insert("full", full_cache
, true);
1031 dout(1) << __func__
<< " pcm target: " << target
1032 << " pcm max: " << max
1033 << " pcm min: " << min
1034 << " inc_osd_cache size: " << inc_osd_cache
.get_size()
1039 int OSDMonitor::_set_cache_ratios()
1041 double old_cache_kv_ratio
= cache_kv_ratio
;
1043 // Set the cache ratios for kv(rocksdb), inc and full caches
1044 cache_kv_ratio
= (double)rocksdb_cache_size
/ (double)mon_memory_target
;
1045 if (cache_kv_ratio
>= 1.0) {
1046 derr
<< __func__
<< " Cache kv ratio (" << cache_kv_ratio
1047 << ") must be in range [0,<1.0]."
1049 cache_kv_ratio
= old_cache_kv_ratio
;
1052 rocksdb_binned_kv_cache
->set_cache_ratio(cache_kv_ratio
);
1053 cache_inc_ratio
= cache_full_ratio
= (1.0 - cache_kv_ratio
) / 2;
1054 inc_cache
->set_cache_ratio(cache_inc_ratio
);
1055 full_cache
->set_cache_ratio(cache_full_ratio
);
1057 dout(1) << __func__
<< " kv ratio " << cache_kv_ratio
1058 << " inc ratio " << cache_inc_ratio
1059 << " full ratio " << cache_full_ratio
1064 void OSDMonitor::start_mapping()
1066 // initiate mapping job
1068 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1070 mapping_job
->abort();
1072 if (!osdmap
.get_pools().empty()) {
1073 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
1074 mapping_job
= mapping
.start_update(osdmap
, mapper
,
1075 g_conf()->mon_osd_mapping_pgs_per_chunk
);
1076 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
1077 << " at " << fin
->start
<< dendl
;
1078 mapping_job
->set_finish_event(fin
);
1080 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
1081 mapping_job
= nullptr;
1085 void OSDMonitor::update_msgr_features()
1087 const int types
[] = {
1088 entity_name_t::TYPE_OSD
,
1089 entity_name_t::TYPE_CLIENT
,
1090 entity_name_t::TYPE_MDS
,
1091 entity_name_t::TYPE_MON
1093 for (int type
: types
) {
1095 uint64_t features
= osdmap
.get_features(type
, &mask
);
1096 if ((mon
.messenger
->get_policy(type
).features_required
& mask
) != features
) {
1097 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1098 ceph::net::Policy p
= mon
.messenger
->get_policy(type
);
1099 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1100 mon
.messenger
->set_policy(type
, p
);
1105 void OSDMonitor::on_active()
1109 if (mon
.is_leader()) {
1110 mon
.clog
->debug() << "osdmap " << osdmap
;
1111 if (!priority_convert
) {
1112 // Only do this once at start-up
1113 convert_pool_priorities();
1114 priority_convert
= true;
1117 list
<MonOpRequestRef
> ls
;
1118 take_all_failures(ls
);
1119 while (!ls
.empty()) {
1120 MonOpRequestRef op
= ls
.front();
1121 op
->mark_osdmon_event(__func__
);
1129 void OSDMonitor::on_restart()
1131 last_osd_report
.clear();
1134 void OSDMonitor::on_shutdown()
1136 dout(10) << __func__
<< dendl
;
1138 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1140 mapping_job
->abort();
1143 // discard failure info, waiters
1144 list
<MonOpRequestRef
> ls
;
1145 take_all_failures(ls
);
1149 void OSDMonitor::update_logger()
1151 dout(10) << "update_logger" << dendl
;
1153 mon
.cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1154 mon
.cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1155 mon
.cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1156 mon
.cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
1159 void OSDMonitor::create_pending()
1161 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
1162 pending_inc
.fsid
= mon
.monmap
->fsid
;
1163 pending_metadata
.clear();
1164 pending_metadata_rm
.clear();
1165 pending_pseudo_purged_snaps
.clear();
1167 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
1169 // safety checks (this shouldn't really happen)
1171 if (osdmap
.backfillfull_ratio
<= 0) {
1172 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
1173 if (pending_inc
.new_backfillfull_ratio
> 1.0)
1174 pending_inc
.new_backfillfull_ratio
/= 100;
1175 dout(1) << __func__
<< " setting backfillfull_ratio = "
1176 << pending_inc
.new_backfillfull_ratio
<< dendl
;
1178 if (osdmap
.full_ratio
<= 0) {
1179 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
1180 if (pending_inc
.new_full_ratio
> 1.0)
1181 pending_inc
.new_full_ratio
/= 100;
1182 dout(1) << __func__
<< " setting full_ratio = "
1183 << pending_inc
.new_full_ratio
<< dendl
;
1185 if (osdmap
.nearfull_ratio
<= 0) {
1186 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
1187 if (pending_inc
.new_nearfull_ratio
> 1.0)
1188 pending_inc
.new_nearfull_ratio
/= 100;
1189 dout(1) << __func__
<< " setting nearfull_ratio = "
1190 << pending_inc
.new_nearfull_ratio
<< dendl
;
1196 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1197 const OSDMap
& nextmap
)
1199 dout(10) << __func__
<< dendl
;
1200 creating_pgs_t pending_creatings
;
1202 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1203 pending_creatings
= creating_pgs
;
1205 // check for new or old pools
1206 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1207 unsigned queued
= 0;
1208 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1211 &pending_creatings
);
1212 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1215 &pending_creatings
);
1216 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1217 for (auto deleted_pool
: inc
.old_pools
) {
1218 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1219 dout(10) << __func__
<< " " << removed
1220 << " pg removed because containing pool deleted: "
1221 << deleted_pool
<< dendl
;
1222 last_epoch_clean
.remove_pool(deleted_pool
);
1224 // pgmon updates its creating_pgs in check_osd_map() which is called by
1225 // on_active() and check_osd_map() could be delayed if lease expires, so its
1226 // creating_pgs could be stale in comparison with the one of osdmon. let's
1227 // trim them here. otherwise, they will be added back after being erased.
1228 unsigned removed
= 0;
1229 for (auto& pg
: pending_created_pgs
) {
1230 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1231 pending_creatings
.created_pools
.insert(pg
.pool());
1232 removed
+= pending_creatings
.pgs
.erase(pg
);
1234 pending_created_pgs
.clear();
1235 dout(10) << __func__
<< " " << removed
1236 << " pgs removed because they're created" << dendl
;
1237 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1240 // filter out any pgs that shouldn't exist.
1242 auto i
= pending_creatings
.pgs
.begin();
1243 while (i
!= pending_creatings
.pgs
.end()) {
1244 if (!nextmap
.pg_exists(i
->first
)) {
1245 dout(10) << __func__
<< " removing pg " << i
->first
1246 << " which should not exist" << dendl
;
1247 i
= pending_creatings
.pgs
.erase(i
);
1255 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1256 const auto total
= pending_creatings
.pgs
.size();
1257 while (pending_creatings
.pgs
.size() < max
&&
1258 !pending_creatings
.queue
.empty()) {
1259 auto p
= pending_creatings
.queue
.begin();
1260 int64_t poolid
= p
->first
;
1261 dout(10) << __func__
<< " pool " << poolid
1262 << " created " << p
->second
.created
1263 << " modified " << p
->second
.modified
1264 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1266 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1267 p
->second
.end
- p
->second
.start
);
1268 ps_t first
= p
->second
.start
;
1269 ps_t end
= first
+ n
;
1270 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1271 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1272 // NOTE: use the *current* epoch as the PG creation epoch so that the
1273 // OSD does not have to generate a long set of PastIntervals.
1274 pending_creatings
.pgs
.emplace(
1276 creating_pgs_t::pg_create_info(inc
.epoch
,
1277 p
->second
.modified
));
1278 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1280 p
->second
.start
= end
;
1281 if (p
->second
.done()) {
1282 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1283 pending_creatings
.queue
.erase(p
);
1285 dout(10) << __func__
<< " pool " << poolid
1286 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1290 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1291 << " pools" << dendl
;
1293 if (mon
.monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1294 // walk creating pgs' history and past_intervals forward
1295 for (auto& i
: pending_creatings
.pgs
) {
1296 // this mirrors PG::start_peering_interval()
1297 pg_t pgid
= i
.first
;
1299 // this is a bit imprecise, but sufficient?
1300 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1301 const pg_pool_t
*pi
;
1302 bool operator()(const set
<pg_shard_t
> &have
) const {
1303 return have
.size() >= pi
->min_size
;
1305 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1306 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1308 vector
<int> up
, acting
;
1309 int up_primary
, acting_primary
;
1310 nextmap
.pg_to_up_acting_osds(
1311 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1312 if (i
.second
.history
.epoch_created
== 0) {
1313 // new pg entry, set it up
1315 i
.second
.acting
= acting
;
1316 i
.second
.up_primary
= up_primary
;
1317 i
.second
.acting_primary
= acting_primary
;
1318 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1319 i
.second
.create_stamp
);
1320 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1321 << " up " << i
.second
.up
1322 << " p " << i
.second
.up_primary
1323 << " acting " << i
.second
.acting
1324 << " p " << i
.second
.acting_primary
1325 << " history " << i
.second
.history
1326 << " past_intervals " << i
.second
.past_intervals
1329 std::stringstream debug
;
1330 if (PastIntervals::check_new_interval(
1331 i
.second
.acting_primary
, acting_primary
,
1332 i
.second
.acting
, acting
,
1333 i
.second
.up_primary
, up_primary
,
1335 i
.second
.history
.same_interval_since
,
1336 i
.second
.history
.last_epoch_clean
,
1341 &i
.second
.past_intervals
,
1343 epoch_t e
= inc
.epoch
;
1344 i
.second
.history
.same_interval_since
= e
;
1345 if (i
.second
.up
!= up
) {
1346 i
.second
.history
.same_up_since
= e
;
1348 if (i
.second
.acting_primary
!= acting_primary
) {
1349 i
.second
.history
.same_primary_since
= e
;
1352 osdmap
.get_pg_num(pgid
.pool()),
1353 nextmap
.get_pg_num(pgid
.pool()),
1355 i
.second
.history
.last_epoch_split
= e
;
1357 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1358 << " up " << i
.second
.up
<< " -> " << up
1359 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1360 << " acting " << i
.second
.acting
<< " -> " << acting
1361 << " p " << i
.second
.acting_primary
<< " -> "
1363 << " history " << i
.second
.history
1364 << " past_intervals " << i
.second
.past_intervals
1366 dout(20) << " debug: " << debug
.str() << dendl
;
1368 i
.second
.acting
= acting
;
1369 i
.second
.up_primary
= up_primary
;
1370 i
.second
.acting_primary
= acting_primary
;
1375 dout(10) << __func__
1376 << " " << (pending_creatings
.pgs
.size() - total
)
1377 << "/" << pending_creatings
.pgs
.size()
1378 << " pgs added from queued pools" << dendl
;
1379 return pending_creatings
;
1382 void OSDMonitor::maybe_prime_pg_temp()
1385 if (pending_inc
.crush
.length()) {
1386 dout(10) << __func__
<< " new crush map, all" << dendl
;
1390 if (!pending_inc
.new_up_client
.empty()) {
1391 dout(10) << __func__
<< " new up osds, all" << dendl
;
1395 // check for interesting OSDs
1397 for (auto p
= pending_inc
.new_state
.begin();
1398 !all
&& p
!= pending_inc
.new_state
.end();
1400 if ((p
->second
& CEPH_OSD_UP
) &&
1401 osdmap
.is_up(p
->first
)) {
1402 osds
.insert(p
->first
);
1405 for (auto p
= pending_inc
.new_weight
.begin();
1406 !all
&& p
!= pending_inc
.new_weight
.end();
1408 if (osdmap
.exists(p
->first
) && p
->second
< osdmap
.get_weight(p
->first
)) {
1410 osds
.insert(p
->first
);
1412 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1418 if (!all
&& osds
.empty())
1423 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1424 if (estimate
> mapping
.get_num_pgs() *
1425 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1426 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1427 << osds
.size() << " osds >= "
1428 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1429 << mapping
.get_num_pgs() << " pgs, all"
1433 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1434 << osds
.size() << " osds" << dendl
;
1439 next
.deepish_copy_from(osdmap
);
1440 next
.apply_incremental(pending_inc
);
1442 if (next
.get_pools().empty()) {
1443 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1445 PrimeTempJob
job(next
, this);
1446 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1447 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1448 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1450 dout(10) << __func__
<< " did not finish in "
1451 << g_conf()->mon_osd_prime_pg_temp_max_time
1452 << ", stopping" << dendl
;
1456 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1457 utime_t stop
= ceph_clock_now();
1458 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1459 const int chunk
= 1000;
1461 std::unordered_set
<pg_t
> did_pgs
;
1462 for (auto osd
: osds
) {
1463 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1464 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1465 for (auto pgid
: pgs
) {
1466 if (!did_pgs
.insert(pgid
).second
) {
1469 prime_pg_temp(next
, pgid
);
1472 if (ceph_clock_now() > stop
) {
1473 dout(10) << __func__
<< " consumed more than "
1474 << g_conf()->mon_osd_prime_pg_temp_max_time
1475 << " seconds, stopping"
1485 void OSDMonitor::prime_pg_temp(
1489 // TODO: remove this creating_pgs direct access?
1490 if (creating_pgs
.pgs
.count(pgid
)) {
1493 if (!osdmap
.pg_exists(pgid
)) {
1497 vector
<int> up
, acting
;
1498 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1500 vector
<int> next_up
, next_acting
;
1501 int next_up_primary
, next_acting_primary
;
1502 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1503 &next_acting
, &next_acting_primary
);
1504 if (acting
== next_acting
&&
1505 !(up
!= acting
&& next_up
== next_acting
))
1506 return; // no change since last epoch
1509 return; // if previously empty now we can be no worse off
1510 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1511 if (pool
&& acting
.size() < pool
->min_size
)
1512 return; // can be no worse off than before
1514 if (next_up
== next_acting
) {
1516 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1520 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1521 << " -> " << next_up
<< "/" << next_acting
1522 << ", priming " << acting
1525 std::lock_guard
l(prime_pg_temp_lock
);
1526 // do not touch a mapping if a change is pending
1527 pending_inc
.new_pg_temp
.emplace(
1529 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1534 * @note receiving a transaction in this function gives a fair amount of
1535 * freedom to the service implementation if it does need it. It shouldn't.
1537 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1539 dout(10) << "encode_pending e " << pending_inc
.epoch
1543 dout(1) << __func__
<< " osdmap full prune encoded e"
1544 << pending_inc
.epoch
<< dendl
;
1547 // finalize up pending_inc
1548 pending_inc
.modified
= ceph_clock_now();
1550 int r
= pending_inc
.propagate_base_properties_to_tiers(cct
, osdmap
);
1551 ceph_assert(r
== 0);
1554 if (!mapping_job
->is_done()) {
1555 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1556 << mapping_job
.get() << " did not complete, "
1557 << mapping_job
->shards
<< " left" << dendl
;
1558 mapping_job
->abort();
1559 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1560 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1561 << mapping_job
.get() << " is prior epoch "
1562 << mapping
.get_epoch() << dendl
;
1564 if (g_conf()->mon_osd_prime_pg_temp
) {
1565 maybe_prime_pg_temp();
1568 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1569 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1572 mapping_job
.reset();
1574 // ensure we don't have blank new_state updates. these are interrpeted as
1575 // CEPH_OSD_UP (and almost certainly not what we want!).
1576 auto p
= pending_inc
.new_state
.begin();
1577 while (p
!= pending_inc
.new_state
.end()) {
1578 if (p
->second
== 0) {
1579 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1580 p
= pending_inc
.new_state
.erase(p
);
1582 if (p
->second
& CEPH_OSD_UP
) {
1583 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1588 if (!pending_inc
.new_up_client
.empty()) {
1589 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1591 for (auto& i
: pending_inc
.new_weight
) {
1592 if (i
.first
>= osdmap
.max_osd
) {
1594 // new osd is already marked in
1595 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1598 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1599 // existing osd marked in or out
1600 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1607 tmp
.deepish_copy_from(osdmap
);
1608 tmp
.apply_incremental(pending_inc
);
1610 // clean pg_temp mappings
1611 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1613 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1615 // check every upmapped pg for now
1616 // until we could reliably identify certain cases to ignore,
1617 // which is obviously the hard part TBD..
1618 vector
<pg_t
> pgs_to_check
;
1619 tmp
.get_upmap_pgs(&pgs_to_check
);
1620 if (pgs_to_check
.size() <
1621 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1622 // not enough pgs, do it inline
1623 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1625 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1626 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1631 // update creating pgs first so that we can remove the created pgid and
1632 // process the pool flag removal below in the same osdmap epoch.
1633 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1634 bufferlist creatings_bl
;
1635 uint64_t features
= CEPH_FEATURES_ALL
;
1636 if (mon
.monmap
->min_mon_release
< ceph_release_t::octopus
) {
1637 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1639 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1641 encode(pending_creatings
, creatings_bl
, features
);
1642 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1644 // remove any old (or incompat) POOL_CREATING flags
1645 for (auto& i
: tmp
.get_pools()) {
1646 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1647 // pre-nautilus OSDMaps shouldn't get this flag.
1648 if (pending_inc
.new_pools
.count(i
.first
)) {
1649 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1652 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1653 !pending_creatings
.still_creating_pool(i
.first
)) {
1654 dout(10) << __func__
<< " done creating pool " << i
.first
1655 << ", clearing CREATING flag" << dendl
;
1656 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1657 pending_inc
.new_pools
[i
.first
] = i
.second
;
1659 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1663 // collect which pools are currently affected by
1664 // the near/backfill/full osd(s),
1665 // and set per-pool near/backfill/full flag instead
1666 set
<int64_t> full_pool_ids
;
1667 set
<int64_t> backfillfull_pool_ids
;
1668 set
<int64_t> nearfull_pool_ids
;
1669 tmp
.get_full_pools(cct
,
1671 &backfillfull_pool_ids
,
1672 &nearfull_pool_ids
);
1673 if (full_pool_ids
.empty() ||
1674 backfillfull_pool_ids
.empty() ||
1675 nearfull_pool_ids
.empty()) {
1676 // normal case - no nearfull, backfillfull or full osds
1677 // try cancel any improper nearfull/backfillfull/full pool
1679 for (auto &pool
: tmp
.get_pools()) {
1680 auto p
= pool
.first
;
1681 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1682 nearfull_pool_ids
.empty()) {
1683 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1684 << "'s nearfull flag" << dendl
;
1685 if (pending_inc
.new_pools
.count(p
) == 0) {
1686 // load original pool info first!
1687 pending_inc
.new_pools
[p
] = pool
.second
;
1689 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1691 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1692 backfillfull_pool_ids
.empty()) {
1693 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1694 << "'s backfillfull flag" << dendl
;
1695 if (pending_inc
.new_pools
.count(p
) == 0) {
1696 pending_inc
.new_pools
[p
] = pool
.second
;
1698 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1700 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1701 full_pool_ids
.empty()) {
1702 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1703 // set by EQUOTA, skipping
1706 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1707 << "'s full flag" << dendl
;
1708 if (pending_inc
.new_pools
.count(p
) == 0) {
1709 pending_inc
.new_pools
[p
] = pool
.second
;
1711 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1715 if (!full_pool_ids
.empty()) {
1716 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1717 << " as full" << dendl
;
1718 for (auto &p
: full_pool_ids
) {
1719 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1722 if (pending_inc
.new_pools
.count(p
) == 0) {
1723 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1725 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1726 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1727 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1729 // cancel FLAG_FULL for pools which are no longer full too
1730 for (auto &pool
: tmp
.get_pools()) {
1731 auto p
= pool
.first
;
1732 if (full_pool_ids
.count(p
)) {
1733 // skip pools we have just marked as full above
1736 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1737 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1738 // don't touch if currently is not full
1739 // or is running out of quota (and hence considered as full)
1742 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1743 << "'s full flag" << dendl
;
1744 if (pending_inc
.new_pools
.count(p
) == 0) {
1745 pending_inc
.new_pools
[p
] = pool
.second
;
1747 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1750 if (!backfillfull_pool_ids
.empty()) {
1751 for (auto &p
: backfillfull_pool_ids
) {
1752 if (full_pool_ids
.count(p
)) {
1753 // skip pools we have already considered as full above
1756 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1757 // make sure FLAG_FULL is truly set, so we are safe not
1758 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1759 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1762 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1763 // don't bother if pool is already marked as backfillfull
1766 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1767 << "'s as backfillfull" << dendl
;
1768 if (pending_inc
.new_pools
.count(p
) == 0) {
1769 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1771 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1772 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1774 // cancel FLAG_BACKFILLFULL for pools
1775 // which are no longer backfillfull too
1776 for (auto &pool
: tmp
.get_pools()) {
1777 auto p
= pool
.first
;
1778 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1779 // skip pools we have just marked as backfillfull/full above
1782 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1783 // and don't touch if currently is not backfillfull
1786 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1787 << "'s backfillfull flag" << dendl
;
1788 if (pending_inc
.new_pools
.count(p
) == 0) {
1789 pending_inc
.new_pools
[p
] = pool
.second
;
1791 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1794 if (!nearfull_pool_ids
.empty()) {
1795 for (auto &p
: nearfull_pool_ids
) {
1796 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1799 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1800 // make sure FLAG_FULL is truly set, so we are safe not
1801 // to set a extra (redundant) FLAG_NEARFULL flag
1802 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1805 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1806 // don't bother if pool is already marked as nearfull
1809 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1810 << "'s as nearfull" << dendl
;
1811 if (pending_inc
.new_pools
.count(p
) == 0) {
1812 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1814 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1816 // cancel FLAG_NEARFULL for pools
1817 // which are no longer nearfull too
1818 for (auto &pool
: tmp
.get_pools()) {
1819 auto p
= pool
.first
;
1820 if (full_pool_ids
.count(p
) ||
1821 backfillfull_pool_ids
.count(p
) ||
1822 nearfull_pool_ids
.count(p
)) {
1823 // skip pools we have just marked as
1824 // nearfull/backfillfull/full above
1827 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1828 // and don't touch if currently is not nearfull
1831 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1832 << "'s nearfull flag" << dendl
;
1833 if (pending_inc
.new_pools
.count(p
) == 0) {
1834 pending_inc
.new_pools
[p
] = pool
.second
;
1836 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1840 // min_compat_client?
1841 if (!tmp
.require_min_compat_client
) {
1842 auto mv
= tmp
.get_min_compat_client();
1843 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1844 << "required " << mv
<< dendl
;
1845 mon
.clog
->info() << "setting require_min_compat_client to currently "
1846 << "required " << mv
;
1847 pending_inc
.new_require_min_compat_client
= mv
;
1850 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1851 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1852 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1853 // add creating flags?
1854 for (auto& i
: tmp
.get_pools()) {
1855 if (pending_creatings
.still_creating_pool(i
.first
)) {
1856 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1858 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1859 pending_inc
.new_pools
[i
.first
] = i
.second
;
1861 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1864 // adjust blocklist items to all be TYPE_ANY
1865 for (auto& i
: tmp
.blocklist
) {
1867 a
.set_type(entity_addr_t::TYPE_ANY
);
1868 pending_inc
.new_blocklist
[a
] = i
.second
;
1869 pending_inc
.old_blocklist
.push_back(i
.first
);
1873 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1874 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1875 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1877 // adjust obsoleted cache modes
1878 for (auto& [poolid
, pi
] : tmp
.pools
) {
1879 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1880 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1881 pending_inc
.new_pools
[poolid
] = pi
;
1883 dout(10) << __func__
<< " switching pool " << poolid
1884 << " cachemode from forward -> proxy" << dendl
;
1885 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1887 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1888 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1889 pending_inc
.new_pools
[poolid
] = pi
;
1891 dout(10) << __func__
<< " switching pool " << poolid
1892 << " cachemode from readforward -> readproxy" << dendl
;
1893 pending_inc
.new_pools
[poolid
].cache_mode
=
1894 pg_pool_t::CACHEMODE_READPROXY
;
1898 // clear removed_snaps for every pool
1899 for (auto& [poolid
, pi
] : tmp
.pools
) {
1900 if (pi
.removed_snaps
.empty()) {
1903 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1904 pending_inc
.new_pools
[poolid
] = pi
;
1906 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1908 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1911 // create a combined purged snap epoch key for all purged snaps
1912 // prior to this epoch, and store it in the current epoch (i.e.,
1913 // the last pre-octopus epoch, just prior to the one we're
1915 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
1916 it
->lower_bound("purged_snap_");
1917 map
<int64_t,snap_interval_set_t
> combined
;
1918 while (it
->valid()) {
1919 if (it
->key().find("purged_snap_") != 0) {
1922 string k
= it
->key();
1923 long long unsigned pool
;
1924 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1926 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1928 bufferlist v
= it
->value();
1929 auto p
= v
.cbegin();
1930 snapid_t begin
, end
;
1931 ceph::decode(begin
, p
);
1932 ceph::decode(end
, p
);
1933 combined
[pool
].insert(begin
, end
- begin
);
1937 if (!combined
.empty()) {
1938 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1940 ceph::encode(combined
, v
);
1941 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1942 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1943 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1946 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1950 // clean out the old removed_snap_ and removed_epoch keys
1951 // ('`' is ASCII '_' + 1)
1952 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1953 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1958 for (auto i
= pending_inc
.new_state
.begin();
1959 i
!= pending_inc
.new_state
.end();
1961 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1962 if (s
& CEPH_OSD_UP
) {
1963 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1964 // Reset laggy parameters if failure interval exceeds a threshold.
1965 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(i
->first
);
1966 if ((xi
.laggy_probability
|| xi
.laggy_interval
) && xi
.down_stamp
.sec()) {
1967 int last_failure_interval
= pending_inc
.modified
.sec() - xi
.down_stamp
.sec();
1968 if (grace_interval_threshold_exceeded(last_failure_interval
)) {
1969 set_default_laggy_params(i
->first
);
1973 if (s
& CEPH_OSD_EXISTS
)
1974 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1976 for (auto i
= pending_inc
.new_up_client
.begin();
1977 i
!= pending_inc
.new_up_client
.end();
1979 //FIXME: insert cluster addresses too
1980 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1982 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1983 i
!= pending_inc
.new_weight
.end();
1985 if (i
->second
== CEPH_OSD_OUT
) {
1986 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1987 } else if (i
->second
== CEPH_OSD_IN
) {
1988 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1990 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1994 // features for osdmap and its incremental
1997 // encode full map and determine its crc
2000 tmp
.deepish_copy_from(osdmap
);
2001 tmp
.apply_incremental(pending_inc
);
2003 // determine appropriate features
2004 features
= tmp
.get_encoding_features();
2005 dout(10) << __func__
<< " encoding full map with "
2006 << tmp
.require_osd_release
2007 << " features " << features
<< dendl
;
2009 // the features should be a subset of the mon quorum's features!
2010 ceph_assert((features
& ~mon
.get_quorum_con_features()) == 0);
2013 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
2014 pending_inc
.full_crc
= tmp
.get_crc();
2016 // include full map in the txn. note that old monitors will
2017 // overwrite this. new ones will now skip the local full map
2018 // encode and reload from this.
2019 put_version_full(t
, pending_inc
.epoch
, fullbl
);
2023 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
2025 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
2027 dout(20) << " full_crc " << tmp
.get_crc()
2028 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
2030 /* put everything in the transaction */
2031 put_version(t
, pending_inc
.epoch
, bl
);
2032 put_last_committed(t
, pending_inc
.epoch
);
2035 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
2036 p
!= pending_metadata
.end();
2039 auto mp
= p
->second
.cbegin();
2041 auto it
= m
.find("osd_objectstore");
2042 if (it
!= m
.end()) {
2043 if (it
->second
== "filestore") {
2044 filestore_osds
.insert(p
->first
);
2046 filestore_osds
.erase(p
->first
);
2049 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
2051 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
2052 p
!= pending_metadata_rm
.end();
2054 filestore_osds
.erase(*p
);
2055 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
2057 pending_metadata
.clear();
2058 pending_metadata_rm
.clear();
2061 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
2062 !pending_inc
.new_purged_snaps
.empty()) {
2063 // all snaps purged this epoch (across all pools)
2064 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
2066 encode(pending_inc
.new_purged_snaps
, v
);
2067 t
->put(OSD_SNAP_PREFIX
, k
, v
);
2069 for (auto& i
: pending_inc
.new_purged_snaps
) {
2070 for (auto q
= i
.second
.begin();
2071 q
!= i
.second
.end();
2073 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
2078 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
2079 for (auto snap
: snaps
) {
2080 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2087 health_check_map_t next
;
2088 tmp
.check_health(cct
, &next
);
2090 check_for_filestore_osds(&next
);
2091 encode_health(next
, t
);
2094 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2097 int r
= mon
.store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2101 auto p
= bl
.cbegin();
2104 catch (ceph::buffer::error
& e
) {
2106 *err
<< "osd." << osd
<< " metadata is corrupt";
2112 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2114 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2115 if (osdmap
.is_up(osd
)) {
2116 map
<string
,string
> meta
;
2117 load_metadata(osd
, meta
, nullptr);
2118 auto p
= meta
.find(field
);
2119 if (p
== meta
.end()) {
2120 (*out
)["unknown"]++;
2122 (*out
)[p
->second
]++;
2128 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2130 map
<string
,int> by_val
;
2131 count_metadata(field
, &by_val
);
2132 f
->open_object_section(field
.c_str());
2133 for (auto& p
: by_val
) {
2134 f
->dump_int(p
.first
.c_str(), p
.second
);
2139 void OSDMonitor::get_versions(std::map
<string
, list
<string
>> &versions
)
2141 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2142 if (osdmap
.is_up(osd
)) {
2143 map
<string
,string
> meta
;
2144 load_metadata(osd
, meta
, nullptr);
2145 auto p
= meta
.find("ceph_version_short");
2146 if (p
== meta
.end()) continue;
2147 versions
[p
->second
].push_back(string("osd.") + stringify(osd
));
2152 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2154 map
<string
, string
> metadata
;
2155 int r
= load_metadata(osd
, metadata
, nullptr);
2159 auto it
= metadata
.find("osd_objectstore");
2160 if (it
== metadata
.end())
2166 void OSDMonitor::get_filestore_osd_list()
2168 for (unsigned osd
= 0; osd
< osdmap
.get_num_osds(); ++osd
) {
2169 string objectstore_type
;
2170 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2171 if (r
== 0 && objectstore_type
== "filestore") {
2172 filestore_osds
.insert(osd
);
2177 void OSDMonitor::check_for_filestore_osds(health_check_map_t
*checks
)
2179 if (g_conf()->mon_warn_on_filestore_osds
&&
2180 filestore_osds
.size() > 0) {
2181 ostringstream ss
, deprecated_tip
;
2182 list
<string
> detail
;
2183 ss
<< filestore_osds
.size()
2185 << (filestore_osds
.size() == 1 ? "is" : "are")
2186 << " running Filestore";
2187 deprecated_tip
<< ss
.str();
2188 ss
<< " [Deprecated]";
2189 auto& d
= checks
->add("OSD_FILESTORE", HEALTH_WARN
, ss
.str(),
2190 filestore_osds
.size());
2191 deprecated_tip
<< ", which has been deprecated and"
2192 << " not been optimized for QoS"
2193 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2194 detail
.push_back(deprecated_tip
.str());
2195 d
.detail
.swap(detail
);
2199 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2200 const pg_pool_t
&pool
,
2203 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2204 // since filestore osds could always join the pool later
2205 set
<int> checked_osds
;
2206 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2207 vector
<int> up
, acting
;
2208 pg_t
pgid(ps
, pool_id
);
2209 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2210 for (int osd
: up
) {
2211 if (checked_osds
.find(osd
) != checked_osds
.end())
2213 string objectstore_type
;
2214 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2215 // allow with missing metadata, e.g. due to an osd never booting yet
2216 if (r
< 0 || objectstore_type
== "bluestore") {
2217 checked_osds
.insert(osd
);
2220 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2227 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2229 map
<string
,string
> m
;
2230 if (int r
= load_metadata(osd
, m
, err
))
2232 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2233 f
->dump_string(p
->first
.c_str(), p
->second
);
2237 void OSDMonitor::print_nodes(Formatter
*f
)
2239 // group OSDs by their hosts
2240 map
<string
, list
<int> > osds
; // hostname => osd
2241 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2242 map
<string
, string
> m
;
2243 if (load_metadata(osd
, m
, NULL
)) {
2246 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2247 if (hostname
== m
.end()) {
2248 // not likely though
2251 osds
[hostname
->second
].push_back(osd
);
2254 dump_services(f
, osds
, "osd");
2257 void OSDMonitor::share_map_with_random_osd()
2259 if (osdmap
.get_num_up_osds() == 0) {
2260 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2264 MonSession
*s
= mon
.session_map
.get_random_osd_session(&osdmap
);
2266 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2270 dout(10) << "committed, telling random " << s
->name
2271 << " all about it" << dendl
;
2273 // get feature of the peer
2274 // use quorum_con_features, if it's an anonymous connection.
2275 uint64_t features
= s
->con_features
? s
->con_features
:
2276 mon
.get_quorum_con_features();
2277 // whatev, they'll request more if they need it
2278 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2279 s
->con
->send_message(m
);
2280 // NOTE: do *not* record osd has up to this epoch (as we do
2281 // elsewhere) as they may still need to request older values.
2284 version_t
OSDMonitor::get_trim_to() const
2286 if (mon
.get_quorum().empty()) {
2287 dout(10) << __func__
<< " quorum not formed, trim_to = 0" << dendl
;
2292 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2293 if (!creating_pgs
.pgs
.empty()) {
2294 dout(10) << __func__
<< " pgs creating, trim_to = 0" << dendl
;
2299 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2301 << " blocking osdmap trim"
2302 << " ('mon_debug_block_osdmap_trim' set to 'true')"
2303 << " trim_to = 0" << dendl
;
2308 epoch_t floor
= get_min_last_epoch_clean();
2309 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2310 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2311 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2312 floor
= g_conf()->mon_osd_force_trim_to
;
2313 dout(10) << __func__
2314 << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2316 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2317 if (floor
+ min
> get_last_committed()) {
2318 if (min
< get_last_committed())
2319 floor
= get_last_committed() - min
;
2323 if (floor
> get_first_committed()) {
2324 dout(10) << __func__
<< " trim_to = " << floor
<< dendl
;
2328 dout(10) << __func__
<< " trim_to = 0" << dendl
;
2332 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2334 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2335 // also scan osd epochs
2336 // don't trim past the oldest reported osd epoch
2337 for (auto [osd
, epoch
] : osd_epochs
) {
2338 if (epoch
< floor
) {
2345 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2348 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2350 get_version_full(first
, bl
);
2351 put_version_full(tx
, first
, bl
);
2353 if (has_osdmap_manifest
&&
2354 first
> osdmap_manifest
.get_first_pinned()) {
2355 _prune_update_trimmed(tx
, first
);
2360 /* full osdmap prune
2362 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2365 void OSDMonitor::load_osdmap_manifest()
2367 bool store_has_manifest
=
2368 mon
.store
->exists(get_service_name(), "osdmap_manifest");
2370 if (!store_has_manifest
) {
2371 if (!has_osdmap_manifest
) {
2375 dout(20) << __func__
2376 << " dropping osdmap manifest from memory." << dendl
;
2377 osdmap_manifest
= osdmap_manifest_t();
2378 has_osdmap_manifest
= false;
2382 dout(20) << __func__
2383 << " osdmap manifest detected in store; reload." << dendl
;
2385 bufferlist manifest_bl
;
2386 int r
= get_value("osdmap_manifest", manifest_bl
);
2388 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2389 ceph_abort_msg("error reading manifest");
2391 osdmap_manifest
.decode(manifest_bl
);
2392 has_osdmap_manifest
= true;
2394 dout(10) << __func__
<< " store osdmap manifest pinned ("
2395 << osdmap_manifest
.get_first_pinned()
2397 << osdmap_manifest
.get_last_pinned()
2402 bool OSDMonitor::should_prune() const
2404 version_t first
= get_first_committed();
2405 version_t last
= get_last_committed();
2406 version_t min_osdmap_epochs
=
2407 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2408 version_t prune_min
=
2409 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2410 version_t prune_interval
=
2411 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2412 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2413 version_t last_to_pin
= last
- min_osdmap_epochs
;
2415 // Make it or break it constraints.
2417 // If any of these conditions fails, we will not prune, regardless of
2418 // whether we have an on-disk manifest with an on-going pruning state.
2420 if ((last
- first
) <= min_osdmap_epochs
) {
2421 // between the first and last committed epochs, we don't have
2422 // enough epochs to trim, much less to prune.
2423 dout(10) << __func__
2424 << " currently holding only " << (last
- first
)
2425 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2426 << "); do not prune."
2430 } else if ((last_to_pin
- first
) < prune_min
) {
2431 // between the first committed epoch and the last epoch we would prune,
2432 // we simply don't have enough versions over the minimum to prune maps.
2433 dout(10) << __func__
2434 << " could only prune " << (last_to_pin
- first
)
2435 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2436 " is less than the required minimum (" << prune_min
<< ")"
2440 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2441 dout(10) << __func__
2442 << " we have pruned as far as we can; do not prune."
2446 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2447 dout(10) << __func__
2448 << " not enough epochs to form an interval (last pinned: "
2449 << last_pinned
<< ", last to pin: "
2450 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2455 dout(15) << __func__
2456 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2457 << " lc (" << first
<< ".." << last
<< ")"
2462 void OSDMonitor::_prune_update_trimmed(
2463 MonitorDBStore::TransactionRef tx
,
2466 dout(10) << __func__
2467 << " first " << first
2468 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2471 osdmap_manifest_t manifest
= osdmap_manifest
;
2473 if (!manifest
.is_pinned(first
)) {
2474 manifest
.pin(first
);
2477 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2478 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2479 manifest
.pinned
.erase(p
, p_end
);
2480 ceph_assert(manifest
.get_first_pinned() == first
);
2482 if (manifest
.get_last_pinned() == first
+1 ||
2483 manifest
.pinned
.size() == 1) {
2484 // we reached the end of the line, as pinned maps go; clean up our
2485 // manifest, and let `should_prune()` decide whether we should prune
2487 tx
->erase(get_service_name(), "osdmap_manifest");
2492 manifest
.encode(bl
);
2493 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2496 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2498 dout(1) << __func__
<< dendl
;
2500 version_t pin_first
;
2502 // verify constrainsts on stable in-memory state
2503 if (!has_osdmap_manifest
) {
2504 // we must have never pruned, OR if we pruned the state must no longer
2505 // be relevant (i.e., the state must have been removed alongside with
2506 // the trim that *must* have removed past the last pinned map in a
2508 ceph_assert(osdmap_manifest
.pinned
.empty());
2509 ceph_assert(!mon
.store
->exists(get_service_name(), "osdmap_manifest"));
2510 pin_first
= get_first_committed();
2513 // we must have pruned in the past AND its state is still relevant
2514 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2515 // and thus we still hold a manifest in the store).
2516 ceph_assert(!osdmap_manifest
.pinned
.empty());
2517 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2518 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2520 dout(10) << __func__
2521 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2522 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2525 pin_first
= osdmap_manifest
.get_last_pinned();
2528 manifest
.pin(pin_first
);
2531 bool OSDMonitor::_prune_sanitize_options() const
2533 uint64_t prune_interval
=
2534 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2535 uint64_t prune_min
=
2536 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2538 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2542 if (prune_interval
== 0) {
2544 << " prune is enabled BUT prune interval is zero; abort."
2547 } else if (prune_interval
== 1) {
2549 << " prune interval is equal to one, which essentially means"
2550 " no pruning; abort."
2554 if (prune_min
== 0) {
2556 << " prune is enabled BUT prune min is zero; abort."
2560 if (prune_interval
> prune_min
) {
2562 << " impossible to ascertain proper prune interval because"
2563 << " it is greater than the minimum prune epochs"
2564 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2569 if (txsize
< prune_interval
- 1) {
2571 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2572 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2573 << "); abort." << dendl
;
2579 bool OSDMonitor::is_prune_enabled() const {
2580 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2583 bool OSDMonitor::is_prune_supported() const {
2584 return mon
.get_required_mon_features().contains_any(
2585 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2590 * @returns true if has side-effects; false otherwise.
2592 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2594 bool enabled
= is_prune_enabled();
2596 dout(1) << __func__
<< " osdmap full prune "
2597 << ( enabled
? "enabled" : "disabled")
2600 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2604 // we are beyond the minimum prune versions, we need to remove maps because
2605 // otherwise the store will grow unbounded and we may end up having issues
2606 // with available disk space or store hangs.
2608 // we will not pin all versions. We will leave a buffer number of versions.
2609 // this allows us the monitor to trim maps without caring too much about
2610 // pinned maps, and then allow us to use another ceph-mon without these
2611 // capabilities, without having to repair the store.
2613 osdmap_manifest_t manifest
= osdmap_manifest
;
2615 version_t first
= get_first_committed();
2616 version_t last
= get_last_committed();
2618 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2619 version_t last_pinned
= manifest
.get_last_pinned();
2620 uint64_t prune_interval
=
2621 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2623 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2625 prune_init(manifest
);
2627 // we need to get rid of some osdmaps
2630 << " lc (" << first
<< " .. " << last
<< ")"
2631 << " last_pinned " << last_pinned
2632 << " interval " << prune_interval
2633 << " last_to_pin " << last_to_pin
2636 // We will be erasing maps as we go.
2638 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2640 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2641 // we stop pruning. We could prune the maps between `next_to_pin` and
2642 // `last_to_pin`, but by not doing it we end up with neater pruned
2643 // intervals, aligned with `prune_interval`. Besides, this should not be a
2644 // problem as long as `prune_interval` is set to a sane value, instead of
2645 // hundreds or thousands of maps.
2647 auto map_exists
= [this](version_t v
) {
2648 string k
= mon
.store
->combine_strings("full", v
);
2649 return mon
.store
->exists(get_service_name(), k
);
2652 // 'interval' represents the number of maps from the last pinned
2653 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2654 // version 11 next; all intermediate versions will be removed.
2656 // 'txsize' represents the maximum number of versions we'll be removing in
2657 // this iteration. If 'txsize' is large enough to perform multiple passes
2658 // pinning and removing maps, we will do so; if not, we'll do at least one
2659 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2660 // ensure that we never go *over* the maximum.
2662 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2663 uint64_t removal_interval
= prune_interval
- 1;
2665 if (txsize
< removal_interval
) {
2667 << " setting txsize to removal interval size ("
2668 << removal_interval
<< " versions"
2670 txsize
= removal_interval
;
2672 ceph_assert(removal_interval
> 0);
2674 uint64_t num_pruned
= 0;
2675 while (num_pruned
+ removal_interval
<= txsize
) {
2676 last_pinned
= manifest
.get_last_pinned();
2678 if (last_pinned
+ prune_interval
> last_to_pin
) {
2681 ceph_assert(last_pinned
< last_to_pin
);
2683 version_t next_pinned
= last_pinned
+ prune_interval
;
2684 ceph_assert(next_pinned
<= last_to_pin
);
2685 manifest
.pin(next_pinned
);
2687 dout(20) << __func__
2688 << " last_pinned " << last_pinned
2689 << " next_pinned " << next_pinned
2690 << " num_pruned " << num_pruned
2691 << " removal interval (" << (last_pinned
+1)
2692 << ".." << (next_pinned
-1) << ")"
2693 << " txsize " << txsize
<< dendl
;
2695 ceph_assert(map_exists(last_pinned
));
2696 ceph_assert(map_exists(next_pinned
));
2698 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2699 ceph_assert(!manifest
.is_pinned(v
));
2701 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2702 string full_key
= mon
.store
->combine_strings("full", v
);
2703 tx
->erase(get_service_name(), full_key
);
2708 ceph_assert(num_pruned
> 0);
2711 manifest
.encode(bl
);
2712 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2720 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2722 op
->mark_osdmon_event(__func__
);
2723 Message
*m
= op
->get_req();
2724 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2726 switch (m
->get_type()) {
2728 case MSG_MON_COMMAND
:
2730 return preprocess_command(op
);
2731 } catch (const bad_cmd_get
& e
) {
2733 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2736 case CEPH_MSG_MON_GET_OSDMAP
:
2737 return preprocess_get_osdmap(op
);
2740 case MSG_OSD_MARK_ME_DOWN
:
2741 return preprocess_mark_me_down(op
);
2742 case MSG_OSD_MARK_ME_DEAD
:
2743 return preprocess_mark_me_dead(op
);
2745 return preprocess_full(op
);
2746 case MSG_OSD_FAILURE
:
2747 return preprocess_failure(op
);
2749 return preprocess_boot(op
);
2751 return preprocess_alive(op
);
2752 case MSG_OSD_PG_CREATED
:
2753 return preprocess_pg_created(op
);
2754 case MSG_OSD_PG_READY_TO_MERGE
:
2755 return preprocess_pg_ready_to_merge(op
);
2756 case MSG_OSD_PGTEMP
:
2757 return preprocess_pgtemp(op
);
2758 case MSG_OSD_BEACON
:
2759 return preprocess_beacon(op
);
2761 case CEPH_MSG_POOLOP
:
2762 return preprocess_pool_op(op
);
2764 case MSG_REMOVE_SNAPS
:
2765 return preprocess_remove_snaps(op
);
2767 case MSG_MON_GET_PURGED_SNAPS
:
2768 return preprocess_get_purged_snaps(op
);
2776 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2778 op
->mark_osdmon_event(__func__
);
2779 Message
*m
= op
->get_req();
2780 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2782 switch (m
->get_type()) {
2784 case MSG_OSD_MARK_ME_DOWN
:
2785 return prepare_mark_me_down(op
);
2786 case MSG_OSD_MARK_ME_DEAD
:
2787 return prepare_mark_me_dead(op
);
2789 return prepare_full(op
);
2790 case MSG_OSD_FAILURE
:
2791 return prepare_failure(op
);
2793 return prepare_boot(op
);
2795 return prepare_alive(op
);
2796 case MSG_OSD_PG_CREATED
:
2797 return prepare_pg_created(op
);
2798 case MSG_OSD_PGTEMP
:
2799 return prepare_pgtemp(op
);
2800 case MSG_OSD_PG_READY_TO_MERGE
:
2801 return prepare_pg_ready_to_merge(op
);
2802 case MSG_OSD_BEACON
:
2803 return prepare_beacon(op
);
2805 case MSG_MON_COMMAND
:
2807 return prepare_command(op
);
2808 } catch (const bad_cmd_get
& e
) {
2810 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2811 return false; /* nothing to propose */
2814 case CEPH_MSG_POOLOP
:
2815 return prepare_pool_op(op
);
2817 case MSG_REMOVE_SNAPS
:
2818 return prepare_remove_snaps(op
);
2828 bool OSDMonitor::should_propose(double& delay
)
2830 dout(10) << "should_propose" << dendl
;
2832 // if full map, propose immediately! any subsequent changes will be clobbered.
2833 if (pending_inc
.fullmap
.length())
2836 // adjust osd weights?
2837 if (!osd_weight
.empty() &&
2838 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2839 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2840 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2846 return PaxosService::should_propose(delay
);
2851 // ---------------------------
2854 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2856 op
->mark_osdmon_event(__func__
);
2857 auto m
= op
->get_req
<MMonGetOSDMap
>();
2859 uint64_t features
= mon
.get_quorum_con_features();
2860 if (op
->get_session() && op
->get_session()->con_features
)
2861 features
= op
->get_session()->con_features
;
2863 dout(10) << __func__
<< " " << *m
<< dendl
;
2864 MOSDMap
*reply
= new MOSDMap(mon
.monmap
->fsid
, features
);
2865 epoch_t first
= get_first_committed();
2866 epoch_t last
= osdmap
.get_epoch();
2867 int max
= g_conf()->osd_map_message_max
;
2868 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2869 for (epoch_t e
= std::max(first
, m
->get_full_first());
2870 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2872 bufferlist
& bl
= reply
->maps
[e
];
2873 int r
= get_version_full(e
, features
, bl
);
2874 ceph_assert(r
>= 0);
2875 max_bytes
-= bl
.length();
2877 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2878 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2880 bufferlist
& bl
= reply
->incremental_maps
[e
];
2881 int r
= get_version(e
, features
, bl
);
2882 ceph_assert(r
>= 0);
2883 max_bytes
-= bl
.length();
2885 reply
->cluster_osdmap_trim_lower_bound
= first
;
2886 reply
->newest_map
= last
;
2887 mon
.send_reply(op
, reply
);
2892 // ---------------------------
2897 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2898 // check permissions
2899 MonSession
*session
= op
->get_session();
2902 if (!session
->is_capable("osd", MON_CAP_X
)) {
2903 dout(0) << "got MOSDFailure from entity with insufficient caps "
2904 << session
->caps
<< dendl
;
2907 if (fsid
!= mon
.monmap
->fsid
) {
2908 dout(0) << "check_source: on fsid " << fsid
2909 << " != " << mon
.monmap
->fsid
<< dendl
;
2916 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2918 op
->mark_osdmon_event(__func__
);
2919 auto m
= op
->get_req
<MOSDFailure
>();
2920 // who is target_osd
2921 int badboy
= m
->get_target_osd();
2923 // check permissions
2924 if (check_source(op
, m
->fsid
))
2927 // first, verify the reporting host is valid
2928 if (m
->get_orig_source().is_osd()) {
2929 int from
= m
->get_orig_source().num();
2930 if (!osdmap
.exists(from
) ||
2931 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2932 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2933 dout(5) << "preprocess_failure from dead osd." << from
2934 << ", ignoring" << dendl
;
2935 send_incremental(op
, m
->get_epoch()+1);
2942 if (osdmap
.is_down(badboy
)) {
2943 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2944 << " " << m
->get_target_addrs()
2945 << ", from " << m
->get_orig_source() << dendl
;
2946 if (m
->get_epoch() < osdmap
.get_epoch())
2947 send_incremental(op
, m
->get_epoch()+1);
2950 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2951 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2952 << " " << m
->get_target_addrs()
2953 << " != map's " << osdmap
.get_addrs(badboy
)
2954 << ", from " << m
->get_orig_source() << dendl
;
2955 if (m
->get_epoch() < osdmap
.get_epoch())
2956 send_incremental(op
, m
->get_epoch()+1);
2960 // already reported?
2961 if (osdmap
.is_down(badboy
) ||
2962 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2963 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2964 << " " << m
->get_target_addrs()
2965 << ", from " << m
->get_orig_source() << dendl
;
2966 if (m
->get_epoch() < osdmap
.get_epoch())
2967 send_incremental(op
, m
->get_epoch()+1);
2971 if (!can_mark_down(badboy
)) {
2972 dout(5) << "preprocess_failure ignoring report of osd."
2973 << m
->get_target_osd() << " " << m
->get_target_addrs()
2974 << " from " << m
->get_orig_source() << dendl
;
2978 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2979 << " " << m
->get_target_addrs()
2980 << ", from " << m
->get_orig_source() << dendl
;
2988 class C_AckMarkedDown
: public C_MonOp
{
2994 : C_MonOp(op
), osdmon(osdmon
) {}
2996 void _finish(int r
) override
{
2998 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2999 osdmon
->mon
.send_reply(
3006 false)); // ACK itself does not request an ack
3007 } else if (r
== -EAGAIN
) {
3008 osdmon
->dispatch(op
);
3010 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
3013 ~C_AckMarkedDown() override
{
3017 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
3019 op
->mark_osdmon_event(__func__
);
3020 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3021 int from
= m
->target_osd
;
3023 // check permissions
3024 if (check_source(op
, m
->fsid
))
3027 // first, verify the reporting host is valid
3028 if (!m
->get_orig_source().is_osd())
3031 if (!osdmap
.exists(from
) ||
3032 osdmap
.is_down(from
) ||
3033 osdmap
.get_addrs(from
) != m
->target_addrs
) {
3034 dout(5) << "preprocess_mark_me_down from dead osd."
3035 << from
<< ", ignoring" << dendl
;
3036 send_incremental(op
, m
->get_epoch()+1);
3040 // no down might be set
3041 if (!can_mark_down(from
))
3044 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
3045 << " " << m
->target_addrs
<< dendl
;
3049 if (m
->request_ack
) {
3050 Context
*c(new C_AckMarkedDown(this, op
));
3056 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
3058 op
->mark_osdmon_event(__func__
);
3059 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3060 int target_osd
= m
->target_osd
;
3062 ceph_assert(osdmap
.is_up(target_osd
));
3063 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
3065 mon
.clog
->info() << "osd." << target_osd
<< " marked itself " << ((m
->down_and_dead
) ? "down and dead" : "down");
3066 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3067 if (m
->down_and_dead
) {
3068 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3069 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3071 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3074 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
3078 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
3080 op
->mark_osdmon_event(__func__
);
3081 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3082 int from
= m
->target_osd
;
3084 // check permissions
3085 if (check_source(op
, m
->fsid
)) {
3090 // first, verify the reporting host is valid
3091 if (!m
->get_orig_source().is_osd()) {
3096 if (!osdmap
.exists(from
) ||
3097 !osdmap
.is_down(from
)) {
3098 dout(5) << __func__
<< " from nonexistent or up osd." << from
3099 << ", ignoring" << dendl
;
3100 send_incremental(op
, m
->get_epoch()+1);
3108 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
3110 op
->mark_osdmon_event(__func__
);
3111 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3112 int target_osd
= m
->target_osd
;
3114 ceph_assert(osdmap
.is_down(target_osd
));
3116 mon
.clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
3118 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3119 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3121 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3122 wait_for_finished_proposal(
3125 [op
, this] (int r
) {
3127 mon
.no_reply(op
); // ignore on success
3134 bool OSDMonitor::can_mark_down(int i
)
3136 if (osdmap
.is_nodown(i
)) {
3137 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3138 << "will not mark it down" << dendl
;
3142 int num_osds
= osdmap
.get_num_osds();
3143 if (num_osds
== 0) {
3144 dout(5) << __func__
<< " no osds" << dendl
;
3147 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3148 float up_ratio
= (float)up
/ (float)num_osds
;
3149 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3150 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3151 << g_conf()->mon_osd_min_up_ratio
3152 << ", will not mark osd." << i
<< " down" << dendl
;
3158 bool OSDMonitor::can_mark_up(int i
)
3160 if (osdmap
.is_noup(i
)) {
3161 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3162 << "will not mark it up" << dendl
;
3170 * @note the parameter @p i apparently only exists here so we can output the
3171 * osd's id on messages.
3173 bool OSDMonitor::can_mark_out(int i
)
3175 if (osdmap
.is_noout(i
)) {
3176 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3177 << "will not mark it out" << dendl
;
3181 int num_osds
= osdmap
.get_num_osds();
3182 if (num_osds
== 0) {
3183 dout(5) << __func__
<< " no osds" << dendl
;
3186 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3187 float in_ratio
= (float)in
/ (float)num_osds
;
3188 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3190 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3191 << g_conf()->mon_osd_min_in_ratio
3192 << ", will not mark osd." << i
<< " out" << dendl
;
3194 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3195 << g_conf()->mon_osd_min_in_ratio
3196 << ", will not mark osds out" << dendl
;
3203 bool OSDMonitor::can_mark_in(int i
)
3205 if (osdmap
.is_noin(i
)) {
3206 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3207 << "will not mark it in" << dendl
;
3214 bool OSDMonitor::check_failures(utime_t now
)
3216 bool found_failure
= false;
3217 auto p
= failure_info
.begin();
3218 while (p
!= failure_info
.end()) {
3219 auto& [target_osd
, fi
] = *p
;
3220 if (can_mark_down(target_osd
) &&
3221 check_failure(now
, target_osd
, fi
)) {
3222 found_failure
= true;
3224 } else if (is_failure_stale(now
, fi
)) {
3225 dout(10) << " dropping stale failure_info for osd." << target_osd
3226 << " from " << fi
.reporters
.size() << " reporters"
3228 p
= failure_info
.erase(p
);
3233 return found_failure
;
3236 utime_t
OSDMonitor::get_grace_time(utime_t now
,
3238 failure_info_t
& fi
) const
3240 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3241 if (!g_conf()->mon_osd_adjust_heartbeat_grace
) {
3244 utime_t grace
= orig_grace
;
3245 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3246 double decay_k
= ::log(.5) / halflife
;
3248 // scale grace period based on historical probability of 'lagginess'
3249 // (false positive failures due to slowness).
3250 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3251 const utime_t failed_for
= now
- fi
.get_failed_since();
3252 double decay
= exp((double)failed_for
* decay_k
);
3253 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3254 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3255 double my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3258 // consider the peers reporting a failure a proxy for a potential
3259 // 'subcluster' over the overall cluster that is similarly
3260 // laggy. this is clearly not true in all cases, but will sometimes
3261 // help us localize the grace correction to a subset of the system
3262 // (say, a rack with a bad switch) that is unhappy.
3263 double peer_grace
= 0;
3264 for (auto& [reporter
, report
] : fi
.reporters
) {
3265 if (osdmap
.exists(reporter
)) {
3266 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(reporter
);
3267 utime_t elapsed
= now
- xi
.down_stamp
;
3268 double decay
= exp((double)elapsed
* decay_k
);
3269 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3272 peer_grace
/= (double)fi
.reporters
.size();
3273 grace
+= peer_grace
;
3274 dout(10) << " osd." << target_osd
<< " has "
3275 << fi
.reporters
.size() << " reporters, "
3276 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3277 << " + " << peer_grace
<< "), max_failed_since " << fi
.get_failed_since()
3283 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3285 // already pending failure?
3286 if (pending_inc
.new_state
.count(target_osd
) &&
3287 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3288 dout(10) << " already pending failure" << dendl
;
3292 set
<string
> reporters_by_subtree
;
3293 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3294 ceph_assert(fi
.reporters
.size());
3295 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3296 // get the parent bucket whose type matches with "reporter_subtree_level".
3297 // fall back to OSD if the level doesn't exist.
3298 if (osdmap
.exists(p
->first
)) {
3299 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3300 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3301 iter
== reporter_loc
.end()) {
3302 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3304 reporters_by_subtree
.insert(iter
->second
);
3308 fi
.cancel_report(p
->first
);;
3309 p
= fi
.reporters
.erase(p
);
3312 if (reporters_by_subtree
.size() < g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3315 const utime_t failed_for
= now
- fi
.get_failed_since();
3316 const utime_t grace
= get_grace_time(now
, target_osd
, fi
);
3317 if (failed_for
>= grace
) {
3318 dout(1) << " we have enough reporters to mark osd." << target_osd
3319 << " down" << dendl
;
3320 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3322 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3323 << osdmap
.crush
->get_full_location_ordered_string(
3326 << (int)reporters_by_subtree
.size()
3327 << " reporters from different "
3328 << reporter_subtree_level
<< " after "
3329 << failed_for
<< " >= grace " << grace
<< ")";
3335 bool OSDMonitor::is_failure_stale(utime_t now
, failure_info_t
& fi
) const
3337 // if it takes too long to either cancel the report to mark the osd down,
3338 // some reporters must have failed to cancel their reports. let's just
3339 // forget these reports.
3340 const utime_t failed_for
= now
- fi
.get_failed_since();
3341 auto heartbeat_grace
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_grace");
3342 auto heartbeat_stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3343 return failed_for
>= (heartbeat_grace
+ heartbeat_stale
);
3346 void OSDMonitor::force_failure(int target_osd
, int by
)
3348 // already pending failure?
3349 if (pending_inc
.new_state
.count(target_osd
) &&
3350 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3351 dout(10) << " already pending failure" << dendl
;
3355 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
3356 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3357 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3358 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3360 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3362 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3363 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3364 << ") (connection refused reported by osd." << by
<< ")";
3368 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3370 op
->mark_osdmon_event(__func__
);
3371 auto m
= op
->get_req
<MOSDFailure
>();
3372 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3373 << " " << m
->get_target_addrs()
3374 << " from " << m
->get_orig_source()
3375 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3377 int target_osd
= m
->get_target_osd();
3378 int reporter
= m
->get_orig_source().num();
3379 ceph_assert(osdmap
.is_up(target_osd
));
3380 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3384 if (m
->if_osd_failed()) {
3385 // calculate failure time
3386 utime_t now
= ceph_clock_now();
3387 utime_t failed_since
=
3388 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3391 if (m
->is_immediate()) {
3392 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3393 << " reported immediately failed by "
3394 << m
->get_orig_source();
3395 force_failure(target_osd
, reporter
);
3398 mon
.clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3399 << m
->get_orig_source();
3401 failure_info_t
& fi
= failure_info
[target_osd
];
3402 fi
.add_report(reporter
, failed_since
, op
);
3403 return check_failure(now
, target_osd
, fi
);
3405 // remove the report
3406 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3407 << " failure report canceled by "
3408 << m
->get_orig_source();
3409 if (failure_info
.count(target_osd
)) {
3410 failure_info_t
& fi
= failure_info
[target_osd
];
3411 fi
.cancel_report(reporter
);
3412 if (fi
.reporters
.empty()) {
3413 dout(10) << " removing last failure_info for osd." << target_osd
3415 failure_info
.erase(target_osd
);
3417 dout(10) << " failure_info for osd." << target_osd
<< " now "
3418 << fi
.reporters
.size() << " reporters" << dendl
;
3421 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
3428 void OSDMonitor::process_failures()
3430 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3431 while (p
!= failure_info
.end()) {
3432 if (osdmap
.is_up(p
->first
)) {
3435 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3436 list
<MonOpRequestRef
> ls
;
3437 p
->second
.take_report_messages(ls
);
3438 failure_info
.erase(p
++);
3440 while (!ls
.empty()) {
3441 MonOpRequestRef o
= ls
.front();
3443 o
->mark_event(__func__
);
3444 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3445 send_latest(o
, m
->get_epoch());
3454 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3456 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3458 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3459 p
!= failure_info
.end();
3461 p
->second
.take_report_messages(ls
);
3463 failure_info
.clear();
3466 int OSDMonitor::get_grace_interval_threshold()
3468 int halflife
= g_conf()->mon_osd_laggy_halflife
;
3469 // Scale the halflife period (default: 1_hr) by
3470 // a factor (48) to calculate the threshold.
3471 int grace_threshold_factor
= 48;
3472 return halflife
* grace_threshold_factor
;
3475 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval
)
3477 int grace_interval_threshold_secs
= get_grace_interval_threshold();
3478 if (last_failed_interval
> grace_interval_threshold_secs
) {
3479 dout(1) << " last_failed_interval " << last_failed_interval
3480 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3487 void OSDMonitor::set_default_laggy_params(int target_osd
)
3489 if (pending_inc
.new_xinfo
.count(target_osd
) == 0) {
3490 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3492 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[target_osd
];
3493 xi
.down_stamp
= pending_inc
.modified
;
3494 xi
.laggy_probability
= 0.0;
3495 xi
.laggy_interval
= 0;
3496 dout(20) << __func__
<< " reset laggy, now xi " << xi
<< dendl
;
3502 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3504 op
->mark_osdmon_event(__func__
);
3505 auto m
= op
->get_req
<MOSDBoot
>();
3506 int from
= m
->get_orig_source_inst().name
.num();
3508 // check permissions, ignore if failed (no response expected)
3509 MonSession
*session
= op
->get_session();
3512 if (!session
->is_capable("osd", MON_CAP_X
)) {
3513 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3514 << session
->caps
<< dendl
;
3518 if (m
->sb
.cluster_fsid
!= mon
.monmap
->fsid
) {
3519 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3520 << " != " << mon
.monmap
->fsid
<< dendl
;
3524 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3525 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3529 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3531 // lower bound of N-2
3532 if (!HAVE_FEATURE(m
->osd_features
, SERVER_PACIFIC
)) {
3533 mon
.clog
->info() << "disallowing boot of OSD "
3534 << m
->get_orig_source_inst()
3535 << " because the osd lacks CEPH_FEATURE_SERVER_PACIFIC";
3539 // make sure osd versions do not span more than 3 releases
3540 if (HAVE_FEATURE(m
->osd_features
, SERVER_QUINCY
) &&
3541 osdmap
.require_osd_release
< ceph_release_t::octopus
) {
3542 mon
.clog
->info() << "disallowing boot of quincy+ OSD "
3543 << m
->get_orig_source_inst()
3544 << " because require_osd_release < octopus";
3547 if (HAVE_FEATURE(m
->osd_features
, SERVER_REEF
) &&
3548 osdmap
.require_osd_release
< ceph_release_t::pacific
) {
3549 mon
.clog
->info() << "disallowing boot of reef+ OSD "
3550 << m
->get_orig_source_inst()
3551 << " because require_osd_release < pacific";
3555 // See crimson/osd/osd.cc: OSD::_send_boot
3556 if (auto type_iter
= m
->metadata
.find("osd_type");
3557 type_iter
!= m
->metadata
.end()) {
3558 const auto &otype
= type_iter
->second
;
3559 // m->metadata["osd_type"] must be "crimson", classic doesn't send osd_type
3560 if (otype
== "crimson") {
3561 if (!osdmap
.get_allow_crimson()) {
3563 << "Disallowing boot of crimson-osd without allow_crimson "
3564 << "OSDMap flag. Run ceph osd set_allow_crimson to set "
3565 << "allow_crimson flag. Note that crimson-osd is "
3566 << "considered unstable and may result in crashes or "
3567 << "data loss. Its usage should be restricted to "
3568 << "testing and development.";
3572 derr
<< __func__
<< ": osd " << m
->get_orig_source_inst()
3573 << " sent non-crimson osd_type field in MOSDBoot: "
3575 << " -- booting anyway"
3580 if (osdmap
.stretch_mode_enabled
&&
3581 !(m
->osd_features
& CEPH_FEATUREMASK_STRETCH_MODE
)) {
3582 mon
.clog
->info() << "disallowing boot of OSD "
3583 << m
->get_orig_source_inst()
3584 << " because stretch mode is on and OSD lacks support";
3589 if (osdmap
.is_up(from
) &&
3590 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3591 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3593 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3594 << " " << m
->get_orig_source_addrs()
3595 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3600 if (osdmap
.exists(from
) &&
3601 !osdmap
.get_uuid(from
).is_zero() &&
3602 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3603 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3604 << " clashes with existing osd: different fsid"
3605 << " (ours: " << osdmap
.get_uuid(from
)
3606 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3610 if (osdmap
.exists(from
) &&
3611 osdmap
.get_info(from
).up_from
> m
->version
&&
3612 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3613 m
->get_orig_source_addrs())) {
3614 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3615 send_latest(op
, m
->sb
.current_epoch
+1);
3620 if (!can_mark_up(from
)) {
3621 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3622 send_latest(op
, m
->sb
.current_epoch
+1);
3626 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
3633 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3635 op
->mark_osdmon_event(__func__
);
3636 auto m
= op
->get_req
<MOSDBoot
>();
3637 dout(7) << __func__
<< " from " << m
->get_source()
3639 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3640 << " cluster_addrs " << m
->cluster_addrs
3641 << " hb_back_addrs " << m
->hb_back_addrs
3642 << " hb_front_addrs " << m
->hb_front_addrs
3645 ceph_assert(m
->get_orig_source().is_osd());
3646 int from
= m
->get_orig_source().num();
3648 // does this osd exist?
3649 if (from
>= osdmap
.get_max_osd()) {
3650 dout(1) << "boot from osd." << from
<< " >= max_osd "
3651 << osdmap
.get_max_osd() << dendl
;
3655 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3656 if (pending_inc
.new_state
.count(from
))
3657 oldstate
^= pending_inc
.new_state
[from
];
3659 // already up? mark down first?
3660 if (osdmap
.is_up(from
)) {
3661 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3662 << osdmap
.get_addrs(from
) << dendl
;
3663 // preprocess should have caught these; if not, assert.
3664 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3665 m
->get_orig_source_addrs()) ||
3666 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3667 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3669 if (pending_inc
.new_state
.count(from
) == 0 ||
3670 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3671 // mark previous guy down
3672 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3674 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3675 } else if (pending_inc
.new_up_client
.count(from
)) {
3676 // already prepared, just wait
3677 dout(7) << __func__
<< " already prepared, waiting on "
3678 << m
->get_orig_source_addr() << dendl
;
3679 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3682 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3683 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3684 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3685 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3687 down_pending_out
.erase(from
); // if any
3690 osd_weight
[from
] = m
->sb
.weight
;
3693 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3695 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3696 // preprocess should have caught this; if not, assert.
3697 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3698 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3702 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3703 const osd_info_t
& i
= osdmap
.get_info(from
);
3704 if (i
.up_from
> i
.lost_at
) {
3705 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3706 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3711 bufferlist osd_metadata
;
3712 encode(m
->metadata
, osd_metadata
);
3713 pending_metadata
[from
] = osd_metadata
;
3714 pending_metadata_rm
.erase(from
);
3716 // adjust last clean unmount epoch?
3717 const osd_info_t
& info
= osdmap
.get_info(from
);
3718 dout(10) << " old osd_info: " << info
<< dendl
;
3719 if (m
->sb
.mounted
> info
.last_clean_begin
||
3720 (m
->sb
.mounted
== info
.last_clean_begin
&&
3721 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3722 epoch_t begin
= m
->sb
.mounted
;
3723 epoch_t end
= m
->sb
.clean_thru
;
3725 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3726 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3727 << ") -> [" << begin
<< "-" << end
<< ")"
3729 pending_inc
.new_last_clean_interval
[from
] =
3730 pair
<epoch_t
,epoch_t
>(begin
, end
);
3733 if (pending_inc
.new_xinfo
.count(from
) == 0)
3734 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3735 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
3736 if (m
->boot_epoch
== 0) {
3737 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3738 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3739 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3741 if (xi
.down_stamp
.sec()) {
3742 int interval
= ceph_clock_now().sec() -
3743 xi
.down_stamp
.sec();
3744 if (g_conf()->mon_osd_laggy_max_interval
&&
3745 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3746 interval
= g_conf()->mon_osd_laggy_max_interval
;
3749 interval
* g_conf()->mon_osd_laggy_weight
+
3750 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3752 xi
.laggy_probability
=
3753 g_conf()->mon_osd_laggy_weight
+
3754 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3755 dout(10) << " laggy, now xi " << xi
<< dendl
;
3758 // set features shared by the osd
3759 if (m
->osd_features
)
3760 xi
.features
= m
->osd_features
;
3762 xi
.features
= m
->get_connection()->get_features();
3765 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3766 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3767 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3768 (g_conf()->mon_osd_auto_mark_in
)) {
3769 if (can_mark_in(from
)) {
3770 if (xi
.old_weight
> 0) {
3771 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3774 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3777 dout(7) << __func__
<< " NOIN set, will not mark in "
3778 << m
->get_orig_source_addr() << dendl
;
3783 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3788 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3790 op
->mark_osdmon_event(__func__
);
3791 auto m
= op
->get_req
<MOSDBoot
>();
3792 dout(7) << "_booted " << m
->get_orig_source_inst()
3793 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3796 mon
.clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3800 send_latest(op
, m
->sb
.current_epoch
+1);
3807 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3809 op
->mark_osdmon_event(__func__
);
3810 auto m
= op
->get_req
<MOSDFull
>();
3811 int from
= m
->get_orig_source().num();
3813 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3815 // check permissions, ignore if failed
3816 MonSession
*session
= op
->get_session();
3819 if (!session
->is_capable("osd", MON_CAP_X
)) {
3820 dout(0) << "MOSDFull from entity with insufficient privileges:"
3821 << session
->caps
<< dendl
;
3825 // ignore a full message from the osd instance that already went down
3826 if (!osdmap
.exists(from
)) {
3827 dout(7) << __func__
<< " ignoring full message from nonexistent "
3828 << m
->get_orig_source_inst() << dendl
;
3831 if ((!osdmap
.is_up(from
) &&
3832 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3833 m
->get_orig_source_addrs())) ||
3834 (osdmap
.is_up(from
) &&
3835 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3836 dout(7) << __func__
<< " ignoring full message from down "
3837 << m
->get_orig_source_inst() << dendl
;
3841 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3843 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3844 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3845 << " " << m
->get_orig_source_inst() << dendl
;
3846 _reply_map(op
, m
->version
);
3850 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3851 << " " << m
->get_orig_source_inst() << dendl
;
3858 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3860 op
->mark_osdmon_event(__func__
);
3861 auto m
= op
->get_req
<MOSDFull
>();
3862 const int from
= m
->get_orig_source().num();
3864 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3865 const unsigned want_state
= m
->state
& mask
; // safety first
3867 unsigned cur_state
= osdmap
.get_state(from
);
3868 auto p
= pending_inc
.new_state
.find(from
);
3869 if (p
!= pending_inc
.new_state
.end()) {
3870 cur_state
^= p
->second
;
3874 set
<string
> want_state_set
, cur_state_set
;
3875 OSDMap::calc_state_set(want_state
, want_state_set
);
3876 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3878 if (cur_state
!= want_state
) {
3879 if (p
!= pending_inc
.new_state
.end()) {
3882 pending_inc
.new_state
[from
] = 0;
3884 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3885 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3886 << " -> " << want_state_set
<< dendl
;
3888 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3889 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3892 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3899 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3901 op
->mark_osdmon_event(__func__
);
3902 auto m
= op
->get_req
<MOSDAlive
>();
3903 int from
= m
->get_orig_source().num();
3905 // check permissions, ignore if failed
3906 MonSession
*session
= op
->get_session();
3909 if (!session
->is_capable("osd", MON_CAP_X
)) {
3910 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3911 << session
->caps
<< dendl
;
3915 if (!osdmap
.is_up(from
) ||
3916 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3917 dout(7) << "preprocess_alive ignoring alive message from down "
3918 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3923 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3925 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3926 _reply_map(op
, m
->version
);
3930 dout(10) << "preprocess_alive want up_thru " << m
->want
3931 << " from " << m
->get_orig_source_inst() << dendl
;
3938 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3940 op
->mark_osdmon_event(__func__
);
3941 auto m
= op
->get_req
<MOSDAlive
>();
3942 int from
= m
->get_orig_source().num();
3944 if (0) { // we probably don't care much about these
3945 mon
.clog
->debug() << m
->get_orig_source_inst() << " alive";
3948 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3949 << " from " << m
->get_orig_source_inst() << dendl
;
3951 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3952 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3956 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3958 op
->mark_osdmon_event(__func__
);
3959 dout(7) << "_reply_map " << e
3960 << " from " << op
->get_req()->get_orig_source_inst()
3966 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3968 op
->mark_osdmon_event(__func__
);
3969 auto m
= op
->get_req
<MOSDPGCreated
>();
3970 dout(10) << __func__
<< " " << *m
<< dendl
;
3971 auto session
= op
->get_session();
3974 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3977 if (!session
->is_capable("osd", MON_CAP_X
)) {
3978 derr
<< __func__
<< " received from entity "
3979 << "with insufficient privileges " << session
->caps
<< dendl
;
3982 // always forward the "created!" to the leader
3986 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3988 op
->mark_osdmon_event(__func__
);
3989 auto m
= op
->get_req
<MOSDPGCreated
>();
3990 dout(10) << __func__
<< " " << *m
<< dendl
;
3991 auto src
= m
->get_orig_source();
3992 auto from
= src
.num();
3993 if (!src
.is_osd() ||
3994 !mon
.osdmon()->osdmap
.is_up(from
) ||
3995 !mon
.osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3996 m
->get_orig_source_addrs())) {
3997 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
4000 pending_created_pgs
.push_back(m
->pgid
);
4004 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
4006 op
->mark_osdmon_event(__func__
);
4007 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
4008 dout(10) << __func__
<< " " << *m
<< dendl
;
4009 const pg_pool_t
*pi
;
4010 auto session
= op
->get_session();
4012 dout(10) << __func__
<< ": no monitor session!" << dendl
;
4015 if (!session
->is_capable("osd", MON_CAP_X
)) {
4016 derr
<< __func__
<< " received from entity "
4017 << "with insufficient privileges " << session
->caps
<< dendl
;
4020 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
4022 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
4025 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
4026 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
4029 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
4030 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
4033 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
4034 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
4044 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
4046 op
->mark_osdmon_event(__func__
);
4047 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
4048 dout(10) << __func__
<< " " << *m
<< dendl
;
4050 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
4051 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
4053 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
4054 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
4055 p
.get_pg_num_pending() > m
->pgid
.ps()) {
4056 dout(10) << __func__
4057 << " race with concurrent pg_num[_pending] update, will retry"
4059 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
4060 return false; /* nothing to propose, yet */
4064 p
.dec_pg_num(m
->pgid
,
4068 m
->last_epoch_started
,
4069 m
->last_epoch_clean
);
4070 p
.last_change
= pending_inc
.epoch
;
4072 // back off the merge attempt!
4073 p
.set_pg_num_pending(p
.get_pg_num());
4076 // force pre-nautilus clients to resend their ops, since they
4077 // don't understand pg_num_pending changes form a new interval
4078 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
4080 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
4082 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
4085 prob
> (double)(rand() % 1000)/1000.0) {
4086 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
4087 auto n
= new MMonCommand(mon
.monmap
->get_fsid());
4088 n
->set_connection(m
->get_connection());
4089 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4090 osdmap
.get_pool_name(m
->pgid
.pool()) +
4091 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4092 stringify(m
->pgid
.ps() + 1) + "\"}" };
4093 MonOpRequestRef nop
= mon
.op_tracker
.create_request
<MonOpRequest
>(n
);
4094 nop
->set_type_service();
4095 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
4097 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
4106 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
4108 auto m
= op
->get_req
<MOSDPGTemp
>();
4109 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
4110 mempool::osdmap::vector
<int> empty
;
4111 int from
= m
->get_orig_source().num();
4112 size_t ignore_cnt
= 0;
4115 MonSession
*session
= op
->get_session();
4118 if (!session
->is_capable("osd", MON_CAP_X
)) {
4119 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4120 << session
->caps
<< dendl
;
4124 if (!osdmap
.is_up(from
) ||
4125 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
4126 dout(7) << "ignoring pgtemp message from down "
4127 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
4136 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4137 dout(20) << " " << p
->first
4138 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
4139 << " -> " << p
->second
<< dendl
;
4141 // does the pool exist?
4142 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
4144 * 1. If the osdmap does not have the pool, it means the pool has been
4145 * removed in-between the osd sending this message and us handling it.
4146 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4147 * not exist in the pending either, as the osds would not send a
4148 * message about a pool they know nothing about (yet).
4149 * 3. However, if the pool does exist in the pending, then it must be a
4150 * new pool, and not relevant to this message (see 1).
4152 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4153 << ": pool has been removed" << dendl
;
4158 int acting_primary
= -1;
4159 osdmap
.pg_to_up_acting_osds(
4160 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
4161 if (acting_primary
!= from
) {
4162 /* If the source isn't the primary based on the current osdmap, we know
4163 * that the interval changed and that we can discard this message.
4164 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4165 * which of two pg temp mappings on the same pg is more recent.
4167 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4168 << ": primary has changed" << dendl
;
4174 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
4175 osdmap
.primary_temp
->count(p
->first
)))
4178 // NOTE: we assume that this will clear pg_primary, so consider
4179 // an existing pg_primary field to imply a change
4180 if (p
->second
.size() &&
4181 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
4182 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
4183 osdmap
.primary_temp
->count(p
->first
)))
4187 // should we ignore all the pgs?
4188 if (ignore_cnt
== m
->pg_temp
.size())
4191 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
4192 _reply_map(op
, m
->map_epoch
);
4200 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4202 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4203 auto ut
= pending_inc
.new_up_thru
.find(from
);
4204 if (ut
!= pending_inc
.new_up_thru
.end()) {
4205 old_up_thru
= ut
->second
;
4207 if (up_thru
> old_up_thru
) {
4208 // set up_thru too, so the osd doesn't have to ask again
4209 pending_inc
.new_up_thru
[from
] = up_thru
;
4213 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4215 op
->mark_osdmon_event(__func__
);
4216 auto m
= op
->get_req
<MOSDPGTemp
>();
4217 int from
= m
->get_orig_source().num();
4218 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4219 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4220 uint64_t pool
= p
->first
.pool();
4221 if (pending_inc
.old_pools
.count(pool
)) {
4222 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4223 << ": pool pending removal" << dendl
;
4226 if (!osdmap
.have_pg_pool(pool
)) {
4227 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4228 << ": pool has been removed" << dendl
;
4231 pending_inc
.new_pg_temp
[p
->first
] =
4232 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4234 // unconditionally clear pg_primary (until this message can encode
4235 // a change for that, too.. at which point we need to also fix
4236 // preprocess_pg_temp)
4237 if (osdmap
.primary_temp
->count(p
->first
) ||
4238 pending_inc
.new_primary_temp
.count(p
->first
))
4239 pending_inc
.new_primary_temp
[p
->first
] = -1;
4242 // set up_thru too, so the osd doesn't have to ask again
4243 update_up_thru(from
, m
->map_epoch
);
4245 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
4252 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4254 op
->mark_osdmon_event(__func__
);
4255 auto m
= op
->get_req
<MRemoveSnaps
>();
4256 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4258 // check privilege, ignore if failed
4259 MonSession
*session
= op
->get_session();
4263 if (!session
->caps
.is_capable(
4265 session
->entity_name
,
4266 "osd", "osd pool rmsnap", {}, true, true, false,
4267 session
->get_peer_socket_addr())) {
4268 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4269 << session
->caps
<< dendl
;
4273 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4274 q
!= m
->snaps
.end();
4276 if (!osdmap
.have_pg_pool(q
->first
)) {
4277 dout(10) << " ignoring removed_snaps " << q
->second
4278 << " on non-existent pool " << q
->first
<< dendl
;
4281 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4282 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4283 p
!= q
->second
.end();
4285 if (*p
> pi
->get_snap_seq() ||
4286 !_is_removed_snap(q
->first
, *p
)) {
4292 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4293 auto reply
= make_message
<MRemoveSnaps
>();
4294 reply
->snaps
= m
->snaps
;
4295 mon
.send_reply(op
, reply
.detach());
4302 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4304 op
->mark_osdmon_event(__func__
);
4305 auto m
= op
->get_req
<MRemoveSnaps
>();
4306 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4308 for (auto& [pool
, snaps
] : m
->snaps
) {
4309 if (!osdmap
.have_pg_pool(pool
)) {
4310 dout(10) << " ignoring removed_snaps " << snaps
4311 << " on non-existent pool " << pool
<< dendl
;
4315 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4316 for (auto s
: snaps
) {
4317 if (!_is_removed_snap(pool
, s
) &&
4318 (!pending_inc
.new_pools
.count(pool
) ||
4319 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4320 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4321 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4322 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4323 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4324 newpi
->removed_snaps
.insert(s
);
4325 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4326 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4328 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4329 if (s
> newpi
->get_snap_seq()) {
4330 dout(10) << " pool " << pool
<< " snap_seq "
4331 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4332 newpi
->set_snap_seq(s
);
4334 newpi
->set_snap_epoch(pending_inc
.epoch
);
4335 dout(10) << " added pool " << pool
<< " snap " << s
4336 << " to removed_snaps queue" << dendl
;
4337 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4342 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4343 auto reply
= make_message
<MRemoveSnaps
>();
4344 reply
->snaps
= m
->snaps
;
4345 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
4351 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4353 op
->mark_osdmon_event(__func__
);
4354 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4355 dout(7) << __func__
<< " " << *m
<< dendl
;
4357 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4359 string k
= make_purged_snap_epoch_key(m
->start
);
4360 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
4362 unsigned long epoch
= m
->last
;
4363 while (it
->valid()) {
4364 if (it
->key().find("purged_epoch_") != 0) {
4367 string k
= it
->key();
4368 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4370 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
4371 } else if (epoch
> m
->last
) {
4374 bufferlist bl
= it
->value();
4375 auto p
= bl
.cbegin();
4379 } catch (ceph::buffer::error
& e
) {
4380 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
4385 n
+= 4 + v
.size() * 16;
4388 // impose a semi-arbitrary limit to message size
4394 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4395 reply
->purged_snaps
.swap(r
);
4396 mon
.send_reply(op
, reply
.detach());
4402 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4404 op
->mark_osdmon_event(__func__
);
4406 auto session
= op
->get_session();
4409 dout(10) << __func__
<< " no monitor session!" << dendl
;
4412 if (!session
->is_capable("osd", MON_CAP_X
)) {
4413 derr
<< __func__
<< " received from entity "
4414 << "with insufficient privileges " << session
->caps
<< dendl
;
4417 // Always forward the beacon to the leader, even if they are the same as
4418 // the old one. The leader will mark as down osds that haven't sent
4419 // beacon for a few minutes.
4423 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4425 op
->mark_osdmon_event(__func__
);
4426 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4427 const auto src
= beacon
->get_orig_source();
4428 dout(10) << __func__
<< " " << *beacon
4429 << " from " << src
<< dendl
;
4430 int from
= src
.num();
4432 if (!src
.is_osd() ||
4433 !osdmap
.is_up(from
) ||
4434 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4435 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4436 // share some new maps with this guy in case it may not be
4437 // aware of its own deadness...
4438 send_latest(op
, beacon
->version
+1);
4440 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4441 return false; /* nothing to propose */
4444 last_osd_report
[from
].first
= ceph_clock_now();
4445 last_osd_report
[from
].second
= beacon
->osd_beacon_report_interval
;
4446 osd_epochs
[from
] = beacon
->version
;
4448 for (const auto& pg
: beacon
->pgs
) {
4449 if (auto* pool
= osdmap
.get_pg_pool(pg
.pool()); pool
!= nullptr) {
4450 unsigned pg_num
= pool
->get_pg_num();
4451 last_epoch_clean
.report(pg_num
, pg
, beacon
->min_last_epoch_clean
);
4455 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4456 beacon
->last_purged_snaps_scrub
) {
4457 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4458 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4460 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4461 beacon
->last_purged_snaps_scrub
;
4464 return false; /* nothing to propose */
4471 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4473 op
->mark_osdmon_event(__func__
);
4474 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4475 << " start " << start
<< dendl
;
4479 send_incremental(op
, start
);
4483 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
4485 MOSDMap
*r
= new MOSDMap(mon
.monmap
->fsid
, features
);
4486 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4487 r
->cluster_osdmap_trim_lower_bound
= get_first_committed();
4488 r
->newest_map
= osdmap
.get_epoch();
4492 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4494 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4495 << std::hex
<< features
<< std::dec
<< dendl
;
4496 MOSDMap
*m
= new MOSDMap(mon
.monmap
->fsid
, features
);
4497 m
->cluster_osdmap_trim_lower_bound
= get_first_committed();
4498 m
->newest_map
= osdmap
.get_epoch();
4500 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4502 int err
= get_version(e
, features
, bl
);
4504 ceph_assert(bl
.length());
4505 // if (get_version(e, bl) > 0) {
4506 dout(20) << "build_incremental inc " << e
<< " "
4507 << bl
.length() << " bytes" << dendl
;
4508 m
->incremental_maps
[e
] = bl
;
4510 ceph_assert(err
== -ENOENT
);
4511 ceph_assert(!bl
.length());
4512 get_version_full(e
, features
, bl
);
4513 if (bl
.length() > 0) {
4514 //else if (get_version("full", e, bl) > 0) {
4515 dout(20) << "build_incremental full " << e
<< " "
4516 << bl
.length() << " bytes" << dendl
;
4519 ceph_abort(); // we should have all maps.
4526 void OSDMonitor::send_full(MonOpRequestRef op
)
4528 op
->mark_osdmon_event(__func__
);
4529 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
4530 mon
.send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4533 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4535 op
->mark_osdmon_event(__func__
);
4537 MonSession
*s
= op
->get_session();
4541 // oh, we can tell the other mon to do it
4542 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4544 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4545 r
->send_osdmap_first
= first
;
4546 s
->proxy_con
->send_message(r
);
4547 op
->mark_event("reply: send routed send_osdmap_first reply");
4550 send_incremental(first
, s
, false, op
);
4554 void OSDMonitor::send_incremental(epoch_t first
,
4555 MonSession
*session
,
4557 MonOpRequestRef req
)
4559 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4560 << " to " << session
->name
<< dendl
;
4562 // get feature of the peer
4563 // use quorum_con_features, if it's an anonymous connection.
4564 uint64_t features
= session
->con_features
? session
->con_features
:
4565 mon
.get_quorum_con_features();
4567 if (first
<= session
->osd_epoch
) {
4568 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4569 << session
->osd_epoch
<< dendl
;
4570 first
= session
->osd_epoch
+ 1;
4573 if (first
< get_first_committed()) {
4574 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4575 m
->cluster_osdmap_trim_lower_bound
= get_first_committed();
4576 m
->newest_map
= osdmap
.get_epoch();
4578 first
= get_first_committed();
4580 int err
= get_version_full(first
, features
, bl
);
4581 ceph_assert(err
== 0);
4582 ceph_assert(bl
.length());
4583 dout(20) << "send_incremental starting with base full "
4584 << first
<< " " << bl
.length() << " bytes" << dendl
;
4585 m
->maps
[first
] = bl
;
4588 mon
.send_reply(req
, m
);
4589 session
->osd_epoch
= first
;
4592 session
->con
->send_message(m
);
4593 session
->osd_epoch
= first
;
4598 while (first
<= osdmap
.get_epoch()) {
4599 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4600 osdmap
.get_epoch());
4601 MOSDMap
*m
= build_incremental(first
, last
, features
);
4604 // send some maps. it may not be all of them, but it will get them
4606 mon
.send_reply(req
, m
);
4608 session
->con
->send_message(m
);
4611 session
->osd_epoch
= last
;
4617 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4619 return get_version(ver
, mon
.get_quorum_con_features(), bl
);
4622 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4624 OSDMap::Incremental inc
;
4625 auto q
= bl
.cbegin();
4627 // always encode with subset of osdmap's canonical features
4628 uint64_t f
= features
& inc
.encode_features
;
4629 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4632 if (inc
.fullmap
.length()) {
4633 // embedded full map?
4635 m
.decode(inc
.fullmap
);
4636 inc
.fullmap
.clear();
4637 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4639 if (inc
.crush
.length()) {
4640 // embedded crush map
4642 auto p
= inc
.crush
.cbegin();
4645 c
.encode(inc
.crush
, f
);
4647 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4650 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4653 auto q
= bl
.cbegin();
4655 // always encode with subset of osdmap's canonical features
4656 uint64_t f
= features
& m
.get_encoding_features();
4657 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4660 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4663 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
4665 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4666 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4669 int ret
= PaxosService::get_version(ver
, bl
);
4673 // NOTE: this check is imprecise; the OSDMap encoding features may
4674 // be a subset of the latest mon quorum features, but worst case we
4675 // reencode once and then cache the (identical) result under both
4677 if (significant_features
!=
4678 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4679 reencode_incremental_map(bl
, features
);
4681 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4685 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4688 int err
= get_version(ver
, inc_bl
);
4689 ceph_assert(err
== 0);
4690 ceph_assert(inc_bl
.length());
4692 auto p
= inc_bl
.cbegin();
4694 dout(10) << __func__
<< " "
4695 << " epoch " << inc
.epoch
4696 << " inc_crc " << inc
.inc_crc
4697 << " full_crc " << inc
.full_crc
4698 << " encode_features " << inc
.encode_features
<< dendl
;
4702 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4704 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4706 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4707 if (closest_pinned
== 0) {
4710 if (closest_pinned
> ver
) {
4711 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4713 ceph_assert(closest_pinned
<= ver
);
4715 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4717 // get osdmap incremental maps and apply on top of this one.
4719 bool has_cached_osdmap
= false;
4720 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4721 if (full_osd_cache
.lookup({v
, mon
.get_quorum_con_features()},
4723 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4725 has_cached_osdmap
= true;
4730 if (!has_cached_osdmap
) {
4731 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4733 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4734 << " not available! error: " << cpp_strerror(err
) << dendl
;
4736 ceph_assert(err
== 0);
4739 ceph_assert(osdm_bl
.length());
4742 osdm
.decode(osdm_bl
);
4744 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4745 << " e" << osdm
.epoch
4746 << " crc " << osdm
.get_crc()
4747 << " -- applying incremental maps." << dendl
;
4749 uint64_t encode_features
= 0;
4750 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4751 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4753 OSDMap::Incremental inc
;
4754 int err
= get_inc(v
, inc
);
4755 ceph_assert(err
== 0);
4757 encode_features
= inc
.encode_features
;
4759 err
= osdm
.apply_incremental(inc
);
4760 ceph_assert(err
== 0);
4762 // this block performs paranoid checks on map retrieval
4763 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4764 inc
.full_crc
!= 0) {
4766 uint64_t f
= encode_features
;
4768 f
= (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4771 // encode osdmap to force calculating crcs
4773 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4774 // decode osdmap to compare crcs with what's expected by incremental
4778 if (tosdm
.get_crc() != inc
.full_crc
) {
4780 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4781 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4782 ceph_abort_msg("osdmap crc mismatch");
4786 // note: we cannot add the recently computed map to the cache, as is,
4787 // because we have not encoded the map into a bl.
4790 if (!encode_features
) {
4791 dout(10) << __func__
4792 << " last incremental map didn't have features;"
4793 << " defaulting to quorum's or all" << dendl
;
4795 (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4797 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
4802 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4804 return get_version_full(ver
, mon
.get_quorum_con_features(), bl
);
4807 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4810 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4811 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4814 int ret
= PaxosService::get_version_full(ver
, bl
);
4815 if (ret
== -ENOENT
) {
4817 ret
= get_full_from_pinned_map(ver
, bl
);
4822 // NOTE: this check is imprecise; the OSDMap encoding features may
4823 // be a subset of the latest mon quorum features, but worst case we
4824 // reencode once and then cache the (identical) result under both
4826 if (significant_features
!=
4827 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4828 reencode_full_map(bl
, features
);
4830 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4834 epoch_t
OSDMonitor::blocklist(const entity_addrvec_t
& av
, utime_t until
)
4836 dout(10) << "blocklist " << av
<< " until " << until
<< dendl
;
4837 for (auto a
: av
.v
) {
4838 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4839 a
.set_type(entity_addr_t::TYPE_ANY
);
4841 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4843 pending_inc
.new_blocklist
[a
] = until
;
4845 return pending_inc
.epoch
;
4848 epoch_t
OSDMonitor::blocklist(entity_addr_t a
, utime_t until
)
4850 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4851 a
.set_type(entity_addr_t::TYPE_ANY
);
4853 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4855 dout(10) << "blocklist " << a
<< " until " << until
<< dendl
;
4856 pending_inc
.new_blocklist
[a
] = until
;
4857 return pending_inc
.epoch
;
4861 void OSDMonitor::check_osdmap_subs()
4863 dout(10) << __func__
<< dendl
;
4864 if (!osdmap
.get_epoch()) {
4867 auto osdmap_subs
= mon
.session_map
.subs
.find("osdmap");
4868 if (osdmap_subs
== mon
.session_map
.subs
.end()) {
4871 auto p
= osdmap_subs
->second
->begin();
4875 check_osdmap_sub(sub
);
4879 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4881 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4882 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4883 if (sub
->next
<= osdmap
.get_epoch()) {
4885 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4887 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4889 mon
.session_map
.remove_sub(sub
);
4891 sub
->next
= osdmap
.get_epoch() + 1;
4895 void OSDMonitor::check_pg_creates_subs()
4897 if (!osdmap
.get_num_up_osds()) {
4900 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4901 mon
.with_session_map([this](const MonSessionMap
& session_map
) {
4902 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4903 if (pg_creates_subs
== session_map
.subs
.end()) {
4906 for (auto sub
: *pg_creates_subs
->second
) {
4907 check_pg_creates_sub(sub
);
4912 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4914 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4915 ceph_assert(sub
->type
== "osd_pg_creates");
4916 // only send these if the OSD is up. we will check_subs() when they do
4917 // come up so they will get the creates then.
4918 if (sub
->session
->name
.is_osd() &&
4919 mon
.osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4920 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4921 sub
->session
->con
.get(),
4926 void OSDMonitor::do_application_enable(int64_t pool_id
,
4927 const std::string
&app_name
,
4928 const std::string
&app_key
,
4929 const std::string
&app_value
,
4932 ceph_assert(paxos
.is_plugged() && is_writeable());
4934 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4937 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4939 auto pp
= osdmap
.get_pg_pool(pool_id
);
4940 ceph_assert(pp
!= nullptr);
4943 if (pending_inc
.new_pools
.count(pool_id
)) {
4944 p
= pending_inc
.new_pools
[pool_id
];
4947 if (app_key
.empty()) {
4948 p
.application_metadata
.insert({app_name
, {}});
4951 p
.application_metadata
[app_name
][app_key
] = app_value
;
4953 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4956 p
.last_change
= pending_inc
.epoch
;
4957 pending_inc
.new_pools
[pool_id
] = p
;
4960 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4961 pool_opts_t::key_t opt
,
4962 pool_opts_t::value_t val
)
4964 dout(10) << __func__
<< " pool: " << pool_id
<< " option: " << opt
4965 << " val: " << val
<< dendl
;
4966 auto p
= pending_inc
.new_pools
.try_emplace(
4967 pool_id
, *osdmap
.get_pg_pool(pool_id
));
4968 p
.first
->second
.opts
.set(opt
, val
);
4971 unsigned OSDMonitor::scan_for_creating_pgs(
4972 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4973 const mempool::osdmap::set
<int64_t>& removed_pools
,
4975 creating_pgs_t
* creating_pgs
) const
4977 unsigned queued
= 0;
4978 for (auto& p
: pools
) {
4979 int64_t poolid
= p
.first
;
4980 if (creating_pgs
->created_pools
.count(poolid
)) {
4981 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4984 const pg_pool_t
& pool
= p
.second
;
4985 int ruleno
= pool
.get_crush_rule();
4986 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4989 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4990 const auto created
= pool
.get_last_change();
4991 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4992 dout(10) << __func__
<< " no change in pool " << poolid
4993 << " " << pool
<< dendl
;
4996 if (removed_pools
.count(poolid
)) {
4997 dout(10) << __func__
<< " pool is being removed: " << poolid
4998 << " " << pool
<< dendl
;
5001 dout(10) << __func__
<< " queueing pool create for " << poolid
5002 << " " << pool
<< dendl
;
5003 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
5010 void OSDMonitor::update_creating_pgs()
5012 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
5013 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
5014 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
5015 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
5016 for (const auto& pg
: creating_pgs
.pgs
) {
5017 int acting_primary
= -1;
5018 auto pgid
= pg
.first
;
5019 if (!osdmap
.pg_exists(pgid
)) {
5020 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
5024 auto mapped
= pg
.second
.create_epoch
;
5025 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
5027 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
5028 // check the previous creating_pgs, look for the target to whom the pg was
5029 // previously mapped
5030 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
5031 const auto last_acting_primary
= pgs_by_epoch
.first
;
5032 for (auto& pgs
: pgs_by_epoch
.second
) {
5033 if (pgs
.second
.count(spgid
)) {
5034 if (last_acting_primary
== acting_primary
) {
5037 dout(20) << __func__
<< " " << pgid
<< " "
5038 << " acting_primary:" << last_acting_primary
5039 << " -> " << acting_primary
<< dendl
;
5040 // note epoch if the target of the create message changed.
5041 mapped
= mapping
.get_epoch();
5046 mapped
= mapping
.get_epoch();
5050 dout(10) << __func__
<< " will instruct osd." << acting_primary
5051 << " to create " << pgid
<< "@" << mapped
<< dendl
;
5052 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
5054 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
5055 creating_pgs_epoch
= mapping
.get_epoch();
5058 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
5060 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
5061 << " " << creating_pgs_by_osd_epoch
<< dendl
;
5062 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
5063 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
5064 dout(20) << __func__
5065 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
5066 // the subscribers will be updated when the mapping is completed anyway
5069 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
5070 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
5072 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
5074 auto m
= make_message
<MOSDPGCreate2
>(creating_pgs_epoch
);
5077 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
5078 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
5079 auto epoch
= epoch_pgs
->first
;
5080 auto& pgs
= epoch_pgs
->second
;
5081 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5082 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
5084 for (auto& pg
: pgs
) {
5085 // Need the create time from the monitor using its clock to set
5086 // last_scrub_stamp upon pg creation.
5087 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
5088 ceph_assert(create
!= creating_pgs
.pgs
.end());
5089 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
5090 create
->second
.create_stamp
));
5091 if (create
->second
.history
.epoch_created
) {
5092 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
5093 << " " << create
->second
.past_intervals
<< dendl
;
5094 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
5095 create
->second
.past_intervals
));
5097 dout(20) << __func__
<< " will create " << pg
5098 << " at " << create
->second
.create_epoch
<< dendl
;
5101 if (!m
->pgs
.empty()) {
5102 con
->send_message2(std::move(m
));
5104 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5105 << " has nothing to send" << dendl
;
5109 // sub is current through last + 1
5116 void OSDMonitor::tick()
5118 if (!is_active()) return;
5120 dout(10) << osdmap
<< dendl
;
5122 // always update osdmap manifest, regardless of being the leader.
5123 load_osdmap_manifest();
5125 // always tune priority cache manager memory on leader and peons
5126 if (ceph_using_tcmalloc() && mon_memory_autotune
) {
5127 std::lock_guard
l(balancer_lock
);
5128 if (pcm
!= nullptr) {
5131 _set_new_cache_sizes();
5132 dout(10) << "tick balancer "
5133 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
5134 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
5135 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
5136 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
5138 dout(10) << "tick balancer "
5139 << " full cache_bytes: " << full_cache
->get_cache_bytes()
5140 << " full comtd_bytes: " << full_cache
->get_committed_size()
5141 << " full used_bytes: " << full_cache
->_get_used_bytes()
5142 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
5147 if (!mon
.is_leader()) return;
5149 bool do_propose
= false;
5150 utime_t now
= ceph_clock_now();
5152 if (handle_osd_timeouts(now
, last_osd_report
)) {
5157 if (check_failures(now
)) {
5161 // Force a proposal if we need to prune; pruning is performed on
5162 // ``encode_pending()``, hence why we need to regularly trigger a proposal
5163 // even if there's nothing going on.
5164 if (is_prune_enabled() && should_prune()) {
5168 // mark down osds out?
5170 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
5171 * influence at all. The decision is made based on the ratio of "in" osds,
5172 * and the function returns false if this ratio is lower that the minimum
5173 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
5175 if (can_mark_out(-1)) {
5176 string down_out_subtree_limit
= g_conf().get_val
<string
>(
5177 "mon_osd_down_out_subtree_limit");
5178 set
<int> down_cache
; // quick cache of down subtrees
5180 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
5181 while (i
!= down_pending_out
.end()) {
5187 if (osdmap
.is_down(o
) &&
5190 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
5191 utime_t grace
= orig_grace
;
5192 double my_grace
= 0.0;
5194 if (g_conf()->mon_osd_adjust_down_out_interval
) {
5195 // scale grace period the same way we do the heartbeat grace.
5196 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
5197 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
5198 double decay_k
= ::log(.5) / halflife
;
5199 double decay
= exp((double)down
* decay_k
);
5200 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
5201 << " down for " << down
<< " decay " << decay
<< dendl
;
5202 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
5206 // is this an entire large subtree down?
5207 if (down_out_subtree_limit
.length()) {
5208 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
5210 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
5211 dout(10) << "tick entire containing " << down_out_subtree_limit
5212 << " subtree for osd." << o
5213 << " is down; resetting timer" << dendl
;
5214 // reset timer, too.
5215 down_pending_out
[o
] = now
;
5221 bool down_out
= !osdmap
.is_destroyed(o
) &&
5222 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5223 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5224 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5225 // this is not precise enough as we did not make a note when this osd
5226 // was marked as destroyed, but let's not bother with that
5227 // complexity for now.
5228 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5229 if (down_out
|| destroyed_out
) {
5230 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5231 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5232 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5234 // set the AUTOOUT bit.
5235 if (pending_inc
.new_state
.count(o
) == 0)
5236 pending_inc
.new_state
[o
] = 0;
5237 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5239 // remember previous weight
5240 if (pending_inc
.new_xinfo
.count(o
) == 0)
5241 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5242 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5246 mon
.clog
->info() << "Marking osd." << o
<< " out (has been down for "
5247 << int(down
.sec()) << " seconds)";
5252 down_pending_out
.erase(o
);
5255 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5258 // expire blocklisted items?
5259 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
5260 p
!= osdmap
.blocklist
.end();
5262 if (p
->second
< now
) {
5263 dout(10) << "expiring blocklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5264 pending_inc
.old_blocklist
.push_back(p
->first
);
5268 for (auto p
= osdmap
.range_blocklist
.begin();
5269 p
!= osdmap
.range_blocklist
.end();
5271 if (p
->second
< now
) {
5272 dout(10) << "expiring range_blocklist item " << p
->first
5273 << " expired " << p
->second
<< " < now " << now
<< dendl
;
5274 pending_inc
.old_range_blocklist
.push_back(p
->first
);
5279 if (try_prune_purged_snaps()) {
5283 if (update_pools_status())
5287 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
5291 void OSDMonitor::_set_new_cache_sizes()
5293 uint64_t cache_size
= 0;
5294 int64_t inc_alloc
= 0;
5295 int64_t full_alloc
= 0;
5296 int64_t kv_alloc
= 0;
5298 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5299 cache_size
= pcm
->get_tuned_mem();
5300 inc_alloc
= inc_cache
->get_committed_size();
5301 full_alloc
= full_cache
->get_committed_size();
5302 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
5305 inc_osd_cache
.set_bytes(inc_alloc
);
5306 full_osd_cache
.set_bytes(full_alloc
);
5308 dout(1) << __func__
<< " cache_size:" << cache_size
5309 << " inc_alloc: " << inc_alloc
5310 << " full_alloc: " << full_alloc
5311 << " kv_alloc: " << kv_alloc
5315 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5316 std::map
<int, std::pair
<utime_t
, int>> &last_osd_report
)
5318 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
5319 if (now
- mon
.get_leader_since() < timeo
) {
5320 // We haven't been the leader for long enough to consider OSD timeouts
5324 int max_osd
= osdmap
.get_max_osd();
5325 bool new_down
= false;
5327 for (int i
=0; i
< max_osd
; ++i
) {
5328 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5329 if (!osdmap
.exists(i
)) {
5330 last_osd_report
.erase(i
); // if any
5333 if (!osdmap
.is_up(i
))
5335 const std::map
<int, std::pair
<utime_t
, int>>::const_iterator t
= last_osd_report
.find(i
);
5336 if (t
== last_osd_report
.end()) {
5337 // it wasn't in the map; start the timer.
5338 last_osd_report
[i
].first
= now
;
5339 last_osd_report
[i
].second
= 0;
5340 } else if (can_mark_down(i
)) {
5341 utime_t diff
= now
- t
->second
.first
;
5342 // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
5343 // to allow for the osd to miss a beacon.
5344 int mon_osd_report_timeout
= g_conf()->mon_osd_report_timeout
;
5345 utime_t
max_timeout(std::max(mon_osd_report_timeout
, 2 * t
->second
.second
), 0);
5346 if (diff
> max_timeout
) {
5347 mon
.clog
->info() << "osd." << i
<< " marked down after no beacon for "
5348 << diff
<< " seconds";
5349 derr
<< "no beacon from osd." << i
<< " since " << t
->second
.first
5350 << ", " << diff
<< " seconds ago. marking down" << dendl
;
5351 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
5359 static void dump_cpu_list(Formatter
*f
, const char *name
,
5360 const string
& strlist
)
5363 size_t cpu_set_size
;
5364 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
5367 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5368 f
->open_array_section(name
);
5369 for (auto cpu
: cpus
) {
5370 f
->dump_int("cpu", cpu
);
5375 void OSDMonitor::dump_info(Formatter
*f
)
5377 f
->open_object_section("osdmap");
5378 osdmap
.dump(f
, cct
);
5381 f
->open_array_section("osd_metadata");
5382 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5383 if (osdmap
.exists(i
)) {
5384 f
->open_object_section("osd");
5385 f
->dump_unsigned("id", i
);
5386 dump_osd_metadata(i
, f
, NULL
);
5392 f
->open_object_section("osdmap_clean_epochs");
5393 f
->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5395 f
->open_object_section("last_epoch_clean");
5396 last_epoch_clean
.dump(f
);
5399 f
->open_array_section("osd_epochs");
5400 for (auto& osd_epoch
: osd_epochs
) {
5401 f
->open_object_section("osd");
5402 f
->dump_unsigned("id", osd_epoch
.first
);
5403 f
->dump_unsigned("epoch", osd_epoch
.second
);
5406 f
->close_section(); // osd_epochs
5408 f
->close_section(); // osd_clean_epochs
5410 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5411 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5413 f
->open_object_section("crushmap");
5414 osdmap
.crush
->dump(f
);
5417 if (has_osdmap_manifest
) {
5418 f
->open_object_section("osdmap_manifest");
5419 osdmap_manifest
.dump(f
);
// Selector keys for "osd pool get <pool> <var>": one enumerator per
// queryable pool property.  The string -> enumerator mapping lives in
// ALL_CHOICES inside OSDMonitor::preprocess_command(); keep the two in
// sync when adding a property.
// NOTE(review): SIZE and MIN_SIZE restored here — the extracted
// original lost its first enumerator line, but MIN_SIZE is referenced
// by ALL_CHOICES ({"min_size", MIN_SIZE}) and both "size" and
// "min_size" are emitted by the dump code later in this file.
enum osd_pool_get_choices {
  SIZE, MIN_SIZE,
  PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
  NODELETE, NOPGCHANGE, NOSIZECHANGE,
  WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
  HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
  USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
  CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
  CACHE_TARGET_FULL_RATIO,
  CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
  ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
  MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
  HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
  SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
  RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
  COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
  COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
  CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
  PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
  PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
  DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX
};
5447 std::set
<osd_pool_get_choices
>
5448 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5449 const std::set
<osd_pool_get_choices
>& second
)
5451 std::set
<osd_pool_get_choices
> result
;
5452 std::set_difference(first
.begin(), first
.end(),
5453 second
.begin(), second
.end(),
5454 std::inserter(result
, result
.end()));
5460 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5462 op
->mark_osdmon_event(__func__
);
5463 auto m
= op
->get_req
<MMonCommand
>();
5466 stringstream ss
, ds
;
5469 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5470 string rs
= ss
.str();
5471 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
5475 MonSession
*session
= op
->get_session();
5477 derr
<< __func__
<< " no session" << dendl
;
5478 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
5483 cmd_getval(cmdmap
, "prefix", prefix
);
5485 string format
= cmd_getval_or
<string
>(cmdmap
, "format", "plain");
5486 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5488 if (prefix
== "osd stat") {
5490 f
->open_object_section("osdmap");
5491 osdmap
.print_summary(f
.get(), ds
, "", true);
5495 osdmap
.print_summary(nullptr, ds
, "", true);
5499 else if (prefix
== "osd dump" ||
5500 prefix
== "osd tree" ||
5501 prefix
== "osd tree-from" ||
5502 prefix
== "osd ls" ||
5503 prefix
== "osd getmap" ||
5504 prefix
== "osd getcrushmap" ||
5505 prefix
== "osd ls-tree" ||
5506 prefix
== "osd info") {
5508 epoch_t epoch
= cmd_getval_or
<int64_t>(cmdmap
, "epoch", osdmap
.get_epoch());
5509 bufferlist osdmap_bl
;
5510 int err
= get_version_full(epoch
, osdmap_bl
);
5511 if (err
== -ENOENT
) {
5513 ss
<< "there is no map for epoch " << epoch
;
5516 ceph_assert(err
== 0);
5517 ceph_assert(osdmap_bl
.length());
5520 if (epoch
== osdmap
.get_epoch()) {
5524 p
->decode(osdmap_bl
);
5527 auto sg
= make_scope_guard([&] {
5533 if (prefix
== "osd dump") {
5536 f
->open_object_section("osdmap");
5537 p
->dump(f
.get(), cct
);
5546 } else if (prefix
== "osd ls") {
5548 f
->open_array_section("osds");
5549 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5550 if (osdmap
.exists(i
)) {
5551 f
->dump_int("osd", i
);
5558 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5559 if (osdmap
.exists(i
)) {
5568 } else if (prefix
== "osd info") {
5570 bool do_single_osd
= true;
5571 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5572 do_single_osd
= false;
5575 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5576 ss
<< "osd." << osd_id
<< " does not exist";
5582 if (do_single_osd
) {
5583 osdmap
.dump_osd(osd_id
, f
.get());
5585 osdmap
.dump_osds(f
.get());
5589 if (do_single_osd
) {
5590 osdmap
.print_osd(osd_id
, ds
);
5592 osdmap
.print_osds(ds
);
5596 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5598 if (prefix
== "osd tree-from") {
5599 cmd_getval(cmdmap
, "bucket", bucket
);
5600 if (!osdmap
.crush
->name_exists(bucket
)) {
5601 ss
<< "bucket '" << bucket
<< "' does not exist";
5605 int id
= osdmap
.crush
->get_item_id(bucket
);
5607 ss
<< "\"" << bucket
<< "\" is not a bucket";
5613 vector
<string
> states
;
5614 cmd_getval(cmdmap
, "states", states
);
5615 unsigned filter
= 0;
5616 for (auto& s
: states
) {
5618 filter
|= OSDMap::DUMP_UP
;
5619 } else if (s
== "down") {
5620 filter
|= OSDMap::DUMP_DOWN
;
5621 } else if (s
== "in") {
5622 filter
|= OSDMap::DUMP_IN
;
5623 } else if (s
== "out") {
5624 filter
|= OSDMap::DUMP_OUT
;
5625 } else if (s
== "destroyed") {
5626 filter
|= OSDMap::DUMP_DESTROYED
;
5628 ss
<< "unrecognized state '" << s
<< "'";
5633 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5634 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5635 ss
<< "cannot specify both 'in' and 'out'";
5639 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5640 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5641 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5642 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5643 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5644 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5645 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5650 f
->open_object_section("tree");
5651 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5655 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5658 } else if (prefix
== "osd getmap") {
5659 rdata
.append(osdmap_bl
);
5660 ss
<< "got osdmap epoch " << p
->get_epoch();
5661 } else if (prefix
== "osd getcrushmap") {
5662 p
->crush
->encode(rdata
, mon
.get_quorum_con_features());
5663 ss
<< p
->get_crush_version();
5664 } else if (prefix
== "osd ls-tree") {
5666 cmd_getval(cmdmap
, "name", bucket_name
);
5668 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5670 ss
<< "\"" << bucket_name
<< "\" does not exist";
5673 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5678 f
->open_array_section("osds");
5679 for (auto &i
: osds
) {
5680 if (osdmap
.exists(i
)) {
5681 f
->dump_int("osd", i
);
5688 for (auto &i
: osds
) {
5689 if (osdmap
.exists(i
)) {
5700 } else if (prefix
== "osd getmaxosd") {
5702 f
->open_object_section("getmaxosd");
5703 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5704 f
->dump_int("max_osd", osdmap
.get_max_osd());
5708 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5711 } else if (prefix
== "osd utilization") {
5713 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5720 } else if (prefix
== "osd find") {
5722 if (!cmd_getval(cmdmap
, "id", osd
)) {
5723 ss
<< "unable to parse osd id value '"
5724 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5728 if (!osdmap
.exists(osd
)) {
5729 ss
<< "osd." << osd
<< " does not exist";
5734 cmd_getval(cmdmap
, "format", format
);
5735 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5736 f
->open_object_section("osd_location");
5737 f
->dump_int("osd", osd
);
5738 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5739 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5741 // try to identify host, pod/container name, etc.
5742 map
<string
,string
> m
;
5743 load_metadata(osd
, m
, nullptr);
5744 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5745 f
->dump_string("host", p
->second
);
5748 "pod_name", "pod_namespace", // set by rook
5749 "container_name" // set by cephadm, ceph-ansible
5751 if (auto p
= m
.find(k
); p
!= m
.end()) {
5752 f
->dump_string(k
, p
->second
);
5756 // crush is helpful too
5757 f
->open_object_section("crush_location");
5758 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5759 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5760 f
->dump_string(p
->first
.c_str(), p
->second
);
5764 } else if (prefix
== "osd metadata") {
5766 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5767 !cmd_getval(cmdmap
, "id", osd
)) {
5768 ss
<< "unable to parse osd id value '"
5769 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5773 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5774 ss
<< "osd." << osd
<< " does not exist";
5779 cmd_getval(cmdmap
, "format", format
);
5780 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5782 f
->open_object_section("osd_metadata");
5783 f
->dump_unsigned("id", osd
);
5784 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5790 f
->open_array_section("osd_metadata");
5791 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5792 if (osdmap
.exists(i
)) {
5793 f
->open_object_section("osd");
5794 f
->dump_unsigned("id", i
);
5795 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5796 if (r
== -EINVAL
|| r
== -ENOENT
) {
5797 // Drop error, continue to get other daemons' metadata
5798 dout(4) << "No metadata for osd." << i
<< dendl
;
5810 } else if (prefix
== "osd versions") {
5812 f
.reset(Formatter::create("json-pretty"));
5813 count_metadata("ceph_version", f
.get());
5816 } else if (prefix
== "osd count-metadata") {
5818 f
.reset(Formatter::create("json-pretty"));
5820 cmd_getval(cmdmap
, "property", field
);
5821 count_metadata(field
, f
.get());
5824 } else if (prefix
== "osd numa-status") {
5827 f
->open_array_section("osds");
5829 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5830 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5831 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5832 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5833 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5834 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5836 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5837 if (osdmap
.exists(i
)) {
5838 map
<string
,string
> m
;
5840 if (load_metadata(i
, m
, &err
) < 0) {
5844 auto p
= m
.find("hostname");
5849 f
->open_object_section("osd");
5850 f
->dump_int("osd", i
);
5851 f
->dump_string("host", host
);
5852 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5856 f
->dump_int(n
, atoi(p
->second
.c_str()));
5859 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5862 list
<string
> ls
= get_str_list(p
->second
, ",");
5863 f
->open_array_section(n
);
5864 for (auto node
: ls
) {
5865 f
->dump_int("node", atoi(node
.c_str()));
5870 for (auto n
: { "numa_node_cpus" }) {
5873 dump_cpu_list(f
.get(), n
, p
->second
);
5880 p
= m
.find("network_numa_nodes");
5886 p
= m
.find("objectstore_numa_nodes");
5892 p
= m
.find("numa_node");
5893 auto q
= m
.find("numa_node_cpus");
5894 if (p
!= m
.end() && q
!= m
.end()) {
5901 tbl
<< TextTable::endrow
;
5909 rdata
.append(stringify(tbl
));
5911 } else if (prefix
== "osd map") {
5912 string poolstr
, objstr
, namespacestr
;
5913 cmd_getval(cmdmap
, "pool", poolstr
);
5914 cmd_getval(cmdmap
, "object", objstr
);
5915 cmd_getval(cmdmap
, "nspace", namespacestr
);
5917 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5919 ss
<< "pool " << poolstr
<< " does not exist";
5923 object_locator_t
oloc(pool
, namespacestr
);
5924 object_t
oid(objstr
);
5925 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5926 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5927 vector
<int> up
, acting
;
5929 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5932 if (!namespacestr
.empty())
5933 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5935 fullobjname
= oid
.name
;
5937 f
->open_object_section("osd_map");
5938 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5939 f
->dump_string("pool", poolstr
);
5940 f
->dump_int("pool_id", pool
);
5941 f
->dump_stream("objname") << fullobjname
;
5942 f
->dump_stream("raw_pgid") << pgid
;
5943 f
->dump_stream("pgid") << mpgid
;
5944 f
->open_array_section("up");
5945 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5946 f
->dump_int("osd", *p
);
5948 f
->dump_int("up_primary", up_p
);
5949 f
->open_array_section("acting");
5950 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5951 f
->dump_int("osd", *p
);
5953 f
->dump_int("acting_primary", acting_p
);
5954 f
->close_section(); // osd_map
5957 ds
<< "osdmap e" << osdmap
.get_epoch()
5958 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5959 << " object '" << fullobjname
<< "' ->"
5960 << " pg " << pgid
<< " (" << mpgid
<< ")"
5961 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5962 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5966 } else if (prefix
== "pg map") {
5968 vector
<int> up
, acting
;
5969 r
= parse_pgid(cmdmap
, ss
, pgid
);
5972 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5973 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5975 f
->open_object_section("pg_map");
5976 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5977 f
->dump_stream("raw_pgid") << pgid
;
5978 f
->dump_stream("pgid") << mpgid
;
5979 f
->open_array_section("up");
5980 for (auto osd
: up
) {
5981 f
->dump_int("up_osd", osd
);
5984 f
->open_array_section("acting");
5985 for (auto osd
: acting
) {
5986 f
->dump_int("acting_osd", osd
);
5992 ds
<< "osdmap e" << osdmap
.get_epoch()
5993 << " pg " << pgid
<< " (" << mpgid
<< ")"
5994 << " -> up " << up
<< " acting " << acting
;
5999 } else if (prefix
== "osd lspools") {
6001 f
->open_array_section("pools");
6002 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
6003 p
!= osdmap
.pools
.end();
6006 f
->open_object_section("pool");
6007 f
->dump_int("poolnum", p
->first
);
6008 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
6011 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
6012 if (next(p
) != osdmap
.pools
.end()) {
6022 } else if (prefix
== "osd blocklist ls" ||
6023 prefix
== "osd blacklist ls") {
6025 f
->open_array_section("blocklist");
6027 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
6028 p
!= osdmap
.blocklist
.end();
6031 f
->open_object_section("entry");
6032 f
->dump_string("addr", p
->first
.get_legacy_str());
6033 f
->dump_stream("until") << p
->second
;
6038 ss
<< p
->first
<< " " << p
->second
;
6049 f
->open_array_section("range_blocklist");
6051 for (auto p
= osdmap
.range_blocklist
.begin();
6052 p
!= osdmap
.range_blocklist
.end();
6055 f
->open_object_section("entry");
6056 f
->dump_string("range", p
->first
.get_legacy_str());
6057 f
->dump_stream("until") << p
->second
;
6062 ss
<< p
->first
<< " " << p
->second
;
6072 ss
<< "listed " << osdmap
.blocklist
.size() + osdmap
.range_blocklist
.size() << " entries";
6074 } else if (prefix
== "osd pool ls") {
6076 cmd_getval(cmdmap
, "detail", detail
);
6077 if (!f
&& detail
== "detail") {
6079 osdmap
.print_pools(cct
, ss
);
6080 rdata
.append(ss
.str());
6083 f
->open_array_section("pools");
6084 for (auto &[pid
, pdata
] : osdmap
.get_pools()) {
6086 if (detail
== "detail") {
6087 f
->open_object_section("pool");
6088 f
->dump_int("pool_id", pid
);
6089 f
->dump_string("pool_name", osdmap
.get_pool_name(pid
));
6090 pdata
.dump(f
.get());
6091 osdmap
.dump_read_balance_score(cct
, pid
, pdata
, f
.get());
6094 f
->dump_string("pool_name", osdmap
.get_pool_name(pid
));
6097 rdata
.append(osdmap
.get_pool_name(pid
) + "\n");
6106 } else if (prefix
== "osd crush get-tunable") {
6108 cmd_getval(cmdmap
, "tunable", tunable
);
6111 f
->open_object_section("tunable");
6112 if (tunable
== "straw_calc_version") {
6114 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
6116 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
6125 rdata
.append(rss
.str());
6129 } else if (prefix
== "osd pool get") {
6131 cmd_getval(cmdmap
, "pool", poolstr
);
6132 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
6134 ss
<< "unrecognized pool '" << poolstr
<< "'";
6139 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
6141 cmd_getval(cmdmap
, "var", var
);
6143 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
6144 const choices_map_t ALL_CHOICES
= {
6146 {"min_size", MIN_SIZE
},
6147 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
6148 {"crush_rule", CRUSH_RULE
},
6149 {"hashpspool", HASHPSPOOL
},
6151 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
6152 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
6153 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
6154 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
6155 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
6156 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
6157 {"use_gmt_hitset", USE_GMT_HITSET
},
6158 {"target_max_objects", TARGET_MAX_OBJECTS
},
6159 {"target_max_bytes", TARGET_MAX_BYTES
},
6160 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
6161 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
6162 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
6163 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
6164 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
6165 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
6166 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
6167 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
6168 {"fast_read", FAST_READ
},
6169 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
6170 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
6171 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
6172 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
6173 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
6174 {"recovery_priority", RECOVERY_PRIORITY
},
6175 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
6176 {"scrub_priority", SCRUB_PRIORITY
},
6177 {"compression_mode", COMPRESSION_MODE
},
6178 {"compression_algorithm", COMPRESSION_ALGORITHM
},
6179 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
6180 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
6181 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
6182 {"csum_type", CSUM_TYPE
},
6183 {"csum_max_block", CSUM_MAX_BLOCK
},
6184 {"csum_min_block", CSUM_MIN_BLOCK
},
6185 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
6186 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
6187 {"pg_num_min", PG_NUM_MIN
},
6188 {"pg_num_max", PG_NUM_MAX
},
6189 {"target_size_bytes", TARGET_SIZE_BYTES
},
6190 {"target_size_ratio", TARGET_SIZE_RATIO
},
6191 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
6192 {"dedup_tier", DEDUP_TIER
},
6193 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM
},
6194 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE
},
6198 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
6200 const choices_set_t ONLY_TIER_CHOICES
= {
6201 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
6202 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
6203 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
6204 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
6205 MIN_READ_RECENCY_FOR_PROMOTE
,
6206 MIN_WRITE_RECENCY_FOR_PROMOTE
,
6207 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
6209 const choices_set_t ONLY_ERASURE_CHOICES
= {
6210 EC_OVERWRITES
, ERASURE_CODE_PROFILE
6213 choices_set_t selected_choices
;
6215 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
6216 it
!= ALL_CHOICES
.end(); ++it
) {
6217 selected_choices
.insert(it
->second
);
6221 selected_choices
= subtract_second_from_first(selected_choices
,
6225 if(!p
->is_erasure()) {
6226 selected_choices
= subtract_second_from_first(selected_choices
,
6227 ONLY_ERASURE_CHOICES
);
6229 } else /* var != "all" */ {
6230 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
6231 if (found
== ALL_CHOICES
.end()) {
6232 ss
<< "pool '" << poolstr
6233 << "': invalid variable: '" << var
<< "'";
6238 osd_pool_get_choices selected
= found
->second
;
6240 if (!p
->is_tier() &&
6241 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
6242 ss
<< "pool '" << poolstr
6243 << "' is not a tier pool: variable not applicable";
6248 if (!p
->is_erasure() &&
6249 ONLY_ERASURE_CHOICES
.find(selected
)
6250 != ONLY_ERASURE_CHOICES
.end()) {
6251 ss
<< "pool '" << poolstr
6252 << "' is not a erasure pool: variable not applicable";
6257 if (pool_opts_t::is_opt_name(var
) &&
6258 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6259 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6264 selected_choices
.insert(selected
);
6268 f
->open_object_section("pool");
6269 f
->dump_string("pool", poolstr
);
6270 f
->dump_int("pool_id", pool
);
6271 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6272 it
!= selected_choices
.end(); ++it
) {
6273 choices_map_t::const_iterator i
;
6274 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6275 if (i
->second
== *it
) {
6279 ceph_assert(i
!= ALL_CHOICES
.end());
6282 f
->dump_int("pg_num", p
->get_pg_num());
6285 f
->dump_int("pgp_num", p
->get_pgp_num());
6288 f
->dump_int("size", p
->get_size());
6291 f
->dump_int("min_size", p
->get_min_size());
6294 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6295 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6296 p
->get_crush_rule()));
6298 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6302 f
->dump_bool("allow_ec_overwrites",
6303 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6305 case PG_AUTOSCALE_MODE
:
6306 f
->dump_string("pg_autoscale_mode",
6307 pg_pool_t::get_pg_autoscale_mode_name(
6308 p
->pg_autoscale_mode
));
6316 case WRITE_FADVISE_DONTNEED
:
6319 f
->dump_bool(i
->first
.c_str(),
6320 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6322 case HIT_SET_PERIOD
:
6323 f
->dump_int("hit_set_period", p
->hit_set_period
);
6326 f
->dump_int("hit_set_count", p
->hit_set_count
);
6329 f
->dump_string("hit_set_type",
6330 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6334 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6335 BloomHitSet::Params
*bloomp
=
6336 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6337 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6338 } else if(var
!= "all") {
6340 ss
<< "hit set is not of type Bloom; " <<
6341 "invalid to get a false positive rate!";
6347 case USE_GMT_HITSET
:
6348 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6350 case TARGET_MAX_OBJECTS
:
6351 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6353 case TARGET_MAX_BYTES
:
6354 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6356 case CACHE_TARGET_DIRTY_RATIO
:
6357 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6358 p
->cache_target_dirty_ratio_micro
);
6359 f
->dump_float("cache_target_dirty_ratio",
6360 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6362 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6363 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6364 p
->cache_target_dirty_high_ratio_micro
);
6365 f
->dump_float("cache_target_dirty_high_ratio",
6366 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6368 case CACHE_TARGET_FULL_RATIO
:
6369 f
->dump_unsigned("cache_target_full_ratio_micro",
6370 p
->cache_target_full_ratio_micro
);
6371 f
->dump_float("cache_target_full_ratio",
6372 ((float)p
->cache_target_full_ratio_micro
/1000000));
6374 case CACHE_MIN_FLUSH_AGE
:
6375 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6377 case CACHE_MIN_EVICT_AGE
:
6378 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6380 case ERASURE_CODE_PROFILE
:
6381 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6383 case MIN_READ_RECENCY_FOR_PROMOTE
:
6384 f
->dump_int("min_read_recency_for_promote",
6385 p
->min_read_recency_for_promote
);
6387 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6388 f
->dump_int("min_write_recency_for_promote",
6389 p
->min_write_recency_for_promote
);
6392 f
->dump_int("fast_read", p
->fast_read
);
6394 case HIT_SET_GRADE_DECAY_RATE
:
6395 f
->dump_int("hit_set_grade_decay_rate",
6396 p
->hit_set_grade_decay_rate
);
6398 case HIT_SET_SEARCH_LAST_N
:
6399 f
->dump_int("hit_set_search_last_n",
6400 p
->hit_set_search_last_n
);
6402 case SCRUB_MIN_INTERVAL
:
6403 case SCRUB_MAX_INTERVAL
:
6404 case DEEP_SCRUB_INTERVAL
:
6405 case RECOVERY_PRIORITY
:
6406 case RECOVERY_OP_PRIORITY
:
6407 case SCRUB_PRIORITY
:
6408 case COMPRESSION_MODE
:
6409 case COMPRESSION_ALGORITHM
:
6410 case COMPRESSION_REQUIRED_RATIO
:
6411 case COMPRESSION_MAX_BLOB_SIZE
:
6412 case COMPRESSION_MIN_BLOB_SIZE
:
6414 case CSUM_MAX_BLOCK
:
6415 case CSUM_MIN_BLOCK
:
6416 case FINGERPRINT_ALGORITHM
:
6419 case TARGET_SIZE_BYTES
:
6420 case TARGET_SIZE_RATIO
:
6421 case PG_AUTOSCALE_BIAS
:
6423 case DEDUP_CHUNK_ALGORITHM
:
6424 case DEDUP_CDC_CHUNK_SIZE
:
6425 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6426 if (p
->opts
.is_set(key
)) {
6427 if(*it
== CSUM_TYPE
) {
6429 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6430 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6432 p
->opts
.dump(i
->first
, f
.get());
6441 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6442 it
!= selected_choices
.end(); ++it
) {
6443 choices_map_t::const_iterator i
;
6446 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6449 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6452 ss
<< "size: " << p
->get_size() << "\n";
6455 ss
<< "min_size: " << p
->get_min_size() << "\n";
6458 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6459 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6460 p
->get_crush_rule()) << "\n";
6462 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6465 case PG_AUTOSCALE_MODE
:
6466 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6467 p
->pg_autoscale_mode
) <<"\n";
6469 case HIT_SET_PERIOD
:
6470 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6473 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6476 ss
<< "hit_set_type: " <<
6477 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6481 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6482 BloomHitSet::Params
*bloomp
=
6483 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6484 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6485 } else if(var
!= "all") {
6486 ss
<< "hit set is not of type Bloom; " <<
6487 "invalid to get a false positive rate!";
6493 case USE_GMT_HITSET
:
6494 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6496 case TARGET_MAX_OBJECTS
:
6497 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6499 case TARGET_MAX_BYTES
:
6500 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6502 case CACHE_TARGET_DIRTY_RATIO
:
6503 ss
<< "cache_target_dirty_ratio: "
6504 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6506 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6507 ss
<< "cache_target_dirty_high_ratio: "
6508 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6510 case CACHE_TARGET_FULL_RATIO
:
6511 ss
<< "cache_target_full_ratio: "
6512 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6514 case CACHE_MIN_FLUSH_AGE
:
6515 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6517 case CACHE_MIN_EVICT_AGE
:
6518 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6520 case ERASURE_CODE_PROFILE
:
6521 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6523 case MIN_READ_RECENCY_FOR_PROMOTE
:
6524 ss
<< "min_read_recency_for_promote: " <<
6525 p
->min_read_recency_for_promote
<< "\n";
6527 case HIT_SET_GRADE_DECAY_RATE
:
6528 ss
<< "hit_set_grade_decay_rate: " <<
6529 p
->hit_set_grade_decay_rate
<< "\n";
6531 case HIT_SET_SEARCH_LAST_N
:
6532 ss
<< "hit_set_search_last_n: " <<
6533 p
->hit_set_search_last_n
<< "\n";
6536 ss
<< "allow_ec_overwrites: " <<
6537 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6546 case WRITE_FADVISE_DONTNEED
:
6549 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6550 if (i
->second
== *it
)
6553 ceph_assert(i
!= ALL_CHOICES
.end());
6554 ss
<< i
->first
<< ": " <<
6555 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6556 "true" : "false") << "\n";
6558 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6559 ss
<< "min_write_recency_for_promote: " <<
6560 p
->min_write_recency_for_promote
<< "\n";
6563 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6565 case SCRUB_MIN_INTERVAL
:
6566 case SCRUB_MAX_INTERVAL
:
6567 case DEEP_SCRUB_INTERVAL
:
6568 case RECOVERY_PRIORITY
:
6569 case RECOVERY_OP_PRIORITY
:
6570 case SCRUB_PRIORITY
:
6571 case COMPRESSION_MODE
:
6572 case COMPRESSION_ALGORITHM
:
6573 case COMPRESSION_REQUIRED_RATIO
:
6574 case COMPRESSION_MAX_BLOB_SIZE
:
6575 case COMPRESSION_MIN_BLOB_SIZE
:
6577 case CSUM_MAX_BLOCK
:
6578 case CSUM_MIN_BLOCK
:
6579 case FINGERPRINT_ALGORITHM
:
6582 case TARGET_SIZE_BYTES
:
6583 case TARGET_SIZE_RATIO
:
6584 case PG_AUTOSCALE_BIAS
:
6586 case DEDUP_CHUNK_ALGORITHM
:
6587 case DEDUP_CDC_CHUNK_SIZE
:
6588 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6589 if (i
->second
== *it
)
6592 ceph_assert(i
!= ALL_CHOICES
.end());
6594 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6595 if (p
->opts
.is_set(key
)) {
6596 if(key
== pool_opts_t::CSUM_TYPE
) {
6598 p
->opts
.get(key
, &val
);
6599 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6601 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6607 rdata
.append(ss
.str());
6612 } else if (prefix
== "osd pool get-quota") {
6614 cmd_getval(cmdmap
, "pool", pool_name
);
6616 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6618 ceph_assert(poolid
== -ENOENT
);
6619 ss
<< "unrecognized pool '" << pool_name
<< "'";
6623 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6624 const pool_stat_t
* pstat
= mon
.mgrstatmon()->get_pool_stat(poolid
);
6626 ss
<< "no stats for pool '" << pool_name
<< "'";
6630 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6632 f
->open_object_section("pool_quotas");
6633 f
->dump_string("pool_name", pool_name
);
6634 f
->dump_unsigned("pool_id", poolid
);
6635 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6636 f
->dump_int("current_num_objects", sum
.num_objects
);
6637 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6638 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6643 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6644 << " max objects: ";
6645 if (p
->quota_max_objects
== 0)
6648 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6649 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6653 if (p
->quota_max_bytes
== 0)
6656 rs
<< byte_u_t(p
->quota_max_bytes
);
6657 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6659 rdata
.append(rs
.str());
6663 } else if (prefix
== "osd crush rule list" ||
6664 prefix
== "osd crush rule ls") {
6666 f
->open_array_section("rules");
6667 osdmap
.crush
->list_rules(f
.get());
6672 osdmap
.crush
->list_rules(&ss
);
6673 rdata
.append(ss
.str());
6675 } else if (prefix
== "osd crush rule ls-by-class") {
6677 cmd_getval(cmdmap
, "class", class_name
);
6678 if (class_name
.empty()) {
6679 ss
<< "no class specified";
6684 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6686 ss
<< "failed to get rules by class '" << class_name
<< "'";
6690 f
->open_array_section("rules");
6691 for (auto &rule
: rules
) {
6692 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6698 for (auto &rule
: rules
) {
6699 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6701 rdata
.append(rs
.str());
6703 } else if (prefix
== "osd crush rule dump") {
6705 cmd_getval(cmdmap
, "name", name
);
6707 cmd_getval(cmdmap
, "format", format
);
6708 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6710 f
->open_array_section("rules");
6711 osdmap
.crush
->dump_rules(f
.get());
6714 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6716 ss
<< "unknown crush rule '" << name
<< "'";
6720 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6725 rdata
.append(rs
.str());
6726 } else if (prefix
== "osd crush dump") {
6728 cmd_getval(cmdmap
, "format", format
);
6729 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6730 f
->open_object_section("crush_map");
6731 osdmap
.crush
->dump(f
.get());
6736 rdata
.append(rs
.str());
6737 } else if (prefix
== "osd crush show-tunables") {
6739 cmd_getval(cmdmap
, "format", format
);
6740 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6741 f
->open_object_section("crush_map_tunables");
6742 osdmap
.crush
->dump_tunables(f
.get());
6747 rdata
.append(rs
.str());
6748 } else if (prefix
== "osd crush tree") {
6749 bool show_shadow
= false;
6750 if (!cmd_getval_compat_cephbool(cmdmap
, "show_shadow", show_shadow
)) {
6752 if (cmd_getval(cmdmap
, "shadow", shadow
) &&
6753 shadow
== "--show-shadow") {
6757 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6759 f
->open_object_section("crush_tree");
6760 osdmap
.crush
->dump_tree(nullptr,
6762 osdmap
.get_pool_names(),
6768 osdmap
.crush
->dump_tree(&ss
,
6770 osdmap
.get_pool_names(),
6772 rdata
.append(ss
.str());
6774 } else if (prefix
== "osd crush ls") {
6776 if (!cmd_getval(cmdmap
, "node", name
)) {
6777 ss
<< "no node specified";
6781 if (!osdmap
.crush
->name_exists(name
)) {
6782 ss
<< "node '" << name
<< "' does not exist";
6786 int id
= osdmap
.crush
->get_item_id(name
);
6789 result
.push_back(id
);
6791 int num
= osdmap
.crush
->get_bucket_size(id
);
6792 for (int i
= 0; i
< num
; ++i
) {
6793 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6797 f
->open_array_section("items");
6798 for (auto i
: result
) {
6799 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6805 for (auto i
: result
) {
6806 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6808 rdata
.append(ss
.str());
6811 } else if (prefix
== "osd crush class ls") {
6812 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6813 f
->open_array_section("crush_classes");
6814 for (auto i
: osdmap
.crush
->class_name
)
6815 f
->dump_string("class", i
.second
);
6818 } else if (prefix
== "osd crush class ls-osd") {
6820 cmd_getval(cmdmap
, "class", name
);
6822 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6824 f
->open_array_section("osds");
6825 for (auto &osd
: osds
)
6826 f
->dump_int("osd", osd
);
6831 for (auto &osd
: osds
) {
6839 } else if (prefix
== "osd crush get-device-class") {
6840 vector
<string
> idvec
;
6841 cmd_getval(cmdmap
, "ids", idvec
);
6842 map
<int, string
> class_by_osd
;
6843 for (auto& id
: idvec
) {
6845 long osd
= parse_osd_id(id
.c_str(), &ts
);
6847 ss
<< "unable to parse osd id:'" << id
<< "'";
6851 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6853 class_by_osd
[osd
] = device_class
;
6855 class_by_osd
[osd
] = ""; // no class
6858 f
->open_array_section("osd_device_classes");
6859 for (auto& i
: class_by_osd
) {
6860 f
->open_object_section("osd_device_class");
6861 f
->dump_int("osd", i
.first
);
6862 f
->dump_string("device_class", i
.second
);
6868 if (class_by_osd
.size() == 1) {
6869 // for single input, make a clean output
6870 ds
<< class_by_osd
.begin()->second
;
6872 // note that we do not group osds by class here
6873 for (auto it
= class_by_osd
.begin();
6874 it
!= class_by_osd
.end();
6876 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6877 if (next(it
) != class_by_osd
.end())
6883 } else if (prefix
== "osd erasure-code-profile ls") {
6884 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6886 f
->open_array_section("erasure-code-profiles");
6887 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6889 f
->dump_string("profile", i
->first
.c_str());
6891 rdata
.append(i
->first
+ "\n");
6898 rdata
.append(rs
.str());
6900 } else if (prefix
== "osd crush weight-set ls") {
6901 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6903 f
->open_array_section("weight_sets");
6904 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6905 f
->dump_string("pool", "(compat)");
6907 for (auto& i
: osdmap
.crush
->choose_args
) {
6909 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6916 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6919 for (auto& i
: osdmap
.crush
->choose_args
) {
6921 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6924 rdata
.append(rs
.str());
6926 } else if (prefix
== "osd crush weight-set dump") {
6927 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6929 osdmap
.crush
->dump_choose_args(f
.get());
6931 } else if (prefix
== "osd erasure-code-profile get") {
6933 cmd_getval(cmdmap
, "name", name
);
6934 if (!osdmap
.has_erasure_code_profile(name
)) {
6935 ss
<< "unknown erasure code profile '" << name
<< "'";
6939 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6941 f
->open_object_section("profile");
6942 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6946 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6948 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6955 rdata
.append(rs
.str());
6957 } else if (prefix
== "osd pool application get") {
6958 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6961 cmd_getval(cmdmap
, "pool", pool_name
);
6963 cmd_getval(cmdmap
, "app", app
);
6965 cmd_getval(cmdmap
, "key", key
);
6967 if (pool_name
.empty()) {
6969 f
->open_object_section("pools");
6970 for (const auto &pool
: osdmap
.pools
) {
6971 std::string
name("<unknown>");
6972 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6973 if (pni
!= osdmap
.pool_name
.end())
6975 f
->open_object_section(name
.c_str());
6976 for (auto &app_pair
: pool
.second
.application_metadata
) {
6977 f
->open_object_section(app_pair
.first
.c_str());
6978 for (auto &kv_pair
: app_pair
.second
) {
6979 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6983 f
->close_section(); // name
6985 f
->close_section(); // pools
6988 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6990 ss
<< "unrecognized pool '" << pool_name
<< "'";
6994 auto p
= osdmap
.get_pg_pool(pool
);
6997 f
->open_object_section(pool_name
.c_str());
6998 for (auto &app_pair
: p
->application_metadata
) {
6999 f
->open_object_section(app_pair
.first
.c_str());
7000 for (auto &kv_pair
: app_pair
.second
) {
7001 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
7003 f
->close_section(); // application
7005 f
->close_section(); // pool_name
7010 auto app_it
= p
->application_metadata
.find(app
);
7011 if (app_it
== p
->application_metadata
.end()) {
7012 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
7016 // filter by pool + app
7018 f
->open_object_section(app_it
->first
.c_str());
7019 for (auto &kv_pair
: app_it
->second
) {
7020 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
7022 f
->close_section(); // application
7026 // filter by pool + app + key
7027 auto key_it
= app_it
->second
.find(key
);
7028 if (key_it
== app_it
->second
.end()) {
7029 ss
<< "application '" << app
<< "' on pool '" << pool_name
7030 << "' does not have key '" << key
<< "'";
7034 ss
<< key_it
->second
<< "\n";
7035 rdata
.append(ss
.str());
7038 } else if (prefix
== "osd get-require-min-compat-client") {
7039 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
7040 rdata
.append(ss
.str());
7043 } else if (prefix
== "osd pool application enable" ||
7044 prefix
== "osd pool application disable" ||
7045 prefix
== "osd pool application set" ||
7046 prefix
== "osd pool application rm") {
7047 bool changed
= false;
7048 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
7052 } else if (changed
) {
7053 // Valid mutation, proceed to prepare phase
7056 // Idempotent case, reply
7060 // try prepare update
7067 mon
.reply_command(op
, r
, rs
, rdata
, get_last_committed());
7071 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
7073 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7074 osdmap
.get_pg_pool(pool_id
));
7076 pool
->set_flag(flags
);
7079 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
7081 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7082 osdmap
.get_pg_pool(pool_id
));
7084 pool
->unset_flag(flags
);
7087 string
OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch
)
7090 snprintf(k
, sizeof(k
), "purged_epoch_%08lx", (unsigned long)epoch
);
7094 string
OSDMonitor::make_purged_snap_key(int64_t pool
, snapid_t snap
)
7097 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
7098 (unsigned long long)pool
, (unsigned long long)snap
);
7102 string
OSDMonitor::make_purged_snap_key_value(
7103 int64_t pool
, snapid_t snap
, snapid_t num
,
7104 epoch_t epoch
, bufferlist
*v
)
7106 // encode the *last* epoch in the key so that we can use forward
7107 // iteration only to search for an epoch in an interval.
7109 encode(snap
+ num
, *v
);
7111 return make_purged_snap_key(pool
, snap
+ num
- 1);
7115 int OSDMonitor::lookup_purged_snap(
7116 int64_t pool
, snapid_t snap
,
7117 snapid_t
*begin
, snapid_t
*end
)
7119 string k
= make_purged_snap_key(pool
, snap
);
7120 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
7123 dout(20) << __func__
7124 << " pool " << pool
<< " snap " << snap
7125 << " - key '" << k
<< "' not found" << dendl
;
7128 if (it
->key().find("purged_snap_") != 0) {
7129 dout(20) << __func__
7130 << " pool " << pool
<< " snap " << snap
7131 << " - key '" << k
<< "' got '" << it
->key()
7132 << "', wrong prefix" << dendl
;
7135 string gotk
= it
->key();
7136 const char *format
= "purged_snap_%llu_";
7137 long long int keypool
;
7138 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
7140 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
7143 if (pool
!= keypool
) {
7144 dout(20) << __func__
7145 << " pool " << pool
<< " snap " << snap
7146 << " - key '" << k
<< "' got '" << gotk
7147 << "', wrong pool " << keypool
7151 bufferlist v
= it
->value();
7152 auto p
= v
.cbegin();
7155 if (snap
< *begin
|| snap
>= *end
) {
7156 dout(20) << __func__
7157 << " pool " << pool
<< " snap " << snap
7158 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
7165 void OSDMonitor::insert_purged_snap_update(
7167 snapid_t start
, snapid_t end
,
7169 MonitorDBStore::TransactionRef t
)
7171 snapid_t before_begin
, before_end
;
7172 snapid_t after_begin
, after_end
;
7173 int b
= lookup_purged_snap(pool
, start
- 1,
7174 &before_begin
, &before_end
);
7175 int a
= lookup_purged_snap(pool
, end
,
7176 &after_begin
, &after_end
);
7178 dout(10) << __func__
7179 << " [" << start
<< "," << end
<< ") - joins ["
7180 << before_begin
<< "," << before_end
<< ") and ["
7181 << after_begin
<< "," << after_end
<< ")" << dendl
;
7182 // erase only the begin record; we'll overwrite the end one.
7183 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7185 string k
= make_purged_snap_key_value(pool
,
7186 before_begin
, after_end
- before_begin
,
7187 pending_inc
.epoch
, &v
);
7188 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7190 dout(10) << __func__
7191 << " [" << start
<< "," << end
<< ") - join with earlier ["
7192 << before_begin
<< "," << before_end
<< ")" << dendl
;
7193 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7195 string k
= make_purged_snap_key_value(pool
,
7196 before_begin
, end
- before_begin
,
7197 pending_inc
.epoch
, &v
);
7198 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7200 dout(10) << __func__
7201 << " [" << start
<< "," << end
<< ") - join with later ["
7202 << after_begin
<< "," << after_end
<< ")" << dendl
;
7203 // overwrite after record
7205 string k
= make_purged_snap_key_value(pool
,
7206 start
, after_end
- start
,
7207 pending_inc
.epoch
, &v
);
7208 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7210 dout(10) << __func__
7211 << " [" << start
<< "," << end
<< ") - new"
7214 string k
= make_purged_snap_key_value(pool
,
7216 pending_inc
.epoch
, &v
);
7217 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7221 bool OSDMonitor::try_prune_purged_snaps()
7223 if (!mon
.mgrstatmon()->is_readable()) {
7226 if (!pending_inc
.new_purged_snaps
.empty()) {
7227 return false; // we already pruned for this epoch
7230 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
7231 "mon_max_snap_prune_per_epoch");
7235 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
7237 unsigned actually_pruned
= 0;
7238 auto& purged_snaps
= mon
.mgrstatmon()->get_digest().purged_snaps
;
7239 for (auto& p
: osdmap
.get_pools()) {
7240 auto q
= purged_snaps
.find(p
.first
);
7241 if (q
== purged_snaps
.end()) {
7244 auto& purged
= q
->second
;
7245 if (purged
.empty()) {
7246 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
7249 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
7250 snap_interval_set_t to_prune
;
7251 unsigned maybe_pruned
= actually_pruned
;
7252 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
7253 snapid_t begin
= i
.get_start();
7254 auto end
= i
.get_start() + i
.get_len();
7255 snapid_t pbegin
= 0, pend
= 0;
7256 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
7259 // be a bit aggressive about backing off here, because the mon may
7260 // do a lot of work going through this set, and if we know the
7261 // purged set from the OSDs is at least *partly* stale we may as
7262 // well wait for it to be fresh.
7263 dout(20) << __func__
<< " we've already purged " << pbegin
7264 << "~" << (pend
- pbegin
) << dendl
;
7267 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
7268 // the tail of [begin,end) is purged; shorten the range
7271 to_prune
.insert(begin
, end
- begin
);
7272 maybe_pruned
+= end
- begin
;
7273 if (maybe_pruned
>= max_prune
) {
7277 if (!to_prune
.empty()) {
7278 // PGs may still be reporting things as purged that we have already
7279 // pruned from removed_snaps_queue.
7280 snap_interval_set_t actual
;
7281 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7282 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7283 actual
.intersection_of(to_prune
, r
->second
);
7285 actually_pruned
+= actual
.size();
7286 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7287 << ", actual pruned " << actual
<< dendl
;
7288 if (!actual
.empty()) {
7289 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7292 if (actually_pruned
>= max_prune
) {
7296 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7297 return !!actually_pruned
;
7300 bool OSDMonitor::update_pools_status()
7302 if (!mon
.mgrstatmon()->is_readable())
7307 auto& pools
= osdmap
.get_pools();
7308 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7309 const pool_stat_t
*pstat
= mon
.mgrstatmon()->get_pool_stat(it
->first
);
7312 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7313 const pg_pool_t
&pool
= it
->second
;
7314 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
7317 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7318 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
7320 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7324 mon
.clog
->info() << "pool '" << pool_name
7325 << "' no longer out of quota; removing NO_QUOTA flag";
7326 // below we cancel FLAG_FULL too, we'll set it again in
7327 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7328 clear_pool_flags(it
->first
,
7329 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7335 if (pool
.quota_max_bytes
> 0 &&
7336 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7337 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7338 << " (reached quota's max_bytes: "
7339 << byte_u_t(pool
.quota_max_bytes
) << ")";
7341 if (pool
.quota_max_objects
> 0 &&
7342 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7343 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7344 << " (reached quota's max_objects: "
7345 << pool
.quota_max_objects
<< ")";
7347 // set both FLAG_FULL_QUOTA and FLAG_FULL
7348 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7349 // since FLAG_FULL should always take precedence
7350 set_pool_flags(it
->first
,
7351 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7352 clear_pool_flags(it
->first
,
7353 pg_pool_t::FLAG_NEARFULL
|
7354 pg_pool_t::FLAG_BACKFILLFULL
);
7361 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7363 op
->mark_osdmon_event(__func__
);
7364 auto m
= op
->get_req
<MPoolOp
>();
7365 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7366 MonSession
*session
= op
->get_session();
7369 string erasure_code_profile
;
7374 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7375 0, 0, 0, 0, 0, 0, 0.0,
7376 erasure_code_profile
,
7377 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {}, bulk
,
7378 cct
->_conf
.get_val
<bool>("osd_pool_default_crimson"),
7382 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
7387 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7388 const string
& dstname
,
7393 // Avoid creating a pending crush if it does not already exists and
7394 // the rename would fail.
7396 if (!_have_pending_crush()) {
7397 ret
= _get_stable_crush().can_rename_bucket(srcname
,
7404 CrushWrapper newcrush
= _get_pending_crush();
7406 ret
= newcrush
.rename_bucket(srcname
,
7412 pending_inc
.crush
.clear();
7413 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7414 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7418 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7420 string replacement
= "";
7422 if (plugin
== "jerasure_generic" ||
7423 plugin
== "jerasure_sse3" ||
7424 plugin
== "jerasure_sse4" ||
7425 plugin
== "jerasure_neon") {
7426 replacement
= "jerasure";
7427 } else if (plugin
== "shec_generic" ||
7428 plugin
== "shec_sse3" ||
7429 plugin
== "shec_sse4" ||
7430 plugin
== "shec_neon") {
7431 replacement
= "shec";
7434 if (replacement
!= "") {
7435 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7436 << plugin
<< " that has been deprecated. Please use "
7437 << replacement
<< " instead." << dendl
;
7441 int OSDMonitor::normalize_profile(const string
& profilename
,
7442 ErasureCodeProfile
&profile
,
7446 ErasureCodeInterfaceRef erasure_code
;
7447 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7448 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7449 check_legacy_ec_plugin(plugin
->second
, profilename
);
7450 int err
= instance
.factory(plugin
->second
,
7451 g_conf().get_val
<std::string
>("erasure_code_dir"),
7452 profile
, &erasure_code
, ss
);
7457 err
= erasure_code
->init(profile
, ss
);
7462 auto it
= profile
.find("stripe_unit");
7463 if (it
!= profile
.end()) {
7465 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
, &err_str
);
7466 if (!err_str
.empty()) {
7467 *ss
<< "could not parse stripe_unit '" << it
->second
7468 << "': " << err_str
<< std::endl
;
7471 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7472 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7473 if (chunk_size
!= stripe_unit
) {
7474 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7475 << "alignment. Would be padded to " << chunk_size
7479 if ((stripe_unit
% 4096) != 0 && !force
) {
7480 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7481 << "use --force to override this check" << std::endl
;
7488 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7489 const string
&profile
,
7493 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7494 if (ruleid
!= -ENOENT
) {
7499 CrushWrapper newcrush
= _get_pending_crush();
7501 ruleid
= newcrush
.get_rule_id(name
);
7502 if (ruleid
!= -ENOENT
) {
7506 ErasureCodeInterfaceRef erasure_code
;
7507 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7509 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7513 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7514 erasure_code
.reset();
7518 pending_inc
.crush
.clear();
7519 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7524 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7525 ErasureCodeInterfaceRef
*erasure_code
,
7528 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7530 ErasureCodeProfile profile
=
7531 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7532 ErasureCodeProfile::const_iterator plugin
=
7533 profile
.find("plugin");
7534 if (plugin
== profile
.end()) {
7535 *ss
<< "cannot determine the erasure code plugin"
7536 << " because there is no 'plugin' entry in the erasure_code_profile "
7537 << profile
<< std::endl
;
7540 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7541 auto& instance
= ErasureCodePluginRegistry::instance();
7542 return instance
.factory(plugin
->second
,
7543 g_conf().get_val
<std::string
>("erasure_code_dir"),
7544 profile
, erasure_code
, ss
);
7547 int OSDMonitor::check_cluster_features(uint64_t features
,
7550 stringstream unsupported_ss
;
7551 int unsupported_count
= 0;
7552 if ((mon
.get_quorum_con_features() & features
) != features
) {
7553 unsupported_ss
<< "the monitor cluster";
7554 ++unsupported_count
;
7557 set
<int32_t> up_osds
;
7558 osdmap
.get_up_osds(up_osds
);
7559 for (set
<int32_t>::iterator it
= up_osds
.begin();
7560 it
!= up_osds
.end(); ++it
) {
7561 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7562 if ((xi
.features
& features
) != features
) {
7563 if (unsupported_count
> 0)
7564 unsupported_ss
<< ", ";
7565 unsupported_ss
<< "osd." << *it
;
7566 unsupported_count
++;
7570 if (unsupported_count
> 0) {
7571 ss
<< "features " << features
<< " unsupported by: "
7572 << unsupported_ss
.str();
7576 // check pending osd state, too!
7577 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7578 pending_inc
.new_xinfo
.begin();
7579 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7580 const osd_xinfo_t
&xi
= p
->second
;
7581 if ((xi
.features
& features
) != features
) {
7582 dout(10) << __func__
<< " pending osd." << p
->first
7583 << " features are insufficient; retry" << dendl
;
7591 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
7594 OSDMap::Incremental new_pending
= pending_inc
;
7595 encode(*newcrush
, new_pending
.crush
, mon
.get_quorum_con_features());
7597 newmap
.deepish_copy_from(osdmap
);
7598 newmap
.apply_incremental(new_pending
);
7601 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7602 auto mv
= newmap
.get_min_compat_client();
7603 if (mv
> newmap
.require_min_compat_client
) {
7604 ss
<< "new crush map requires client version " << mv
7605 << " but require_min_compat_client is "
7606 << newmap
.require_min_compat_client
;
7613 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7614 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7615 stringstream features_ss
;
7616 int r
= check_cluster_features(features
, features_ss
);
7618 ss
<< "Could not change CRUSH: " << features_ss
.str();
7625 bool OSDMonitor::erasure_code_profile_in_use(
7626 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7627 const string
&profile
,
7631 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
7634 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7635 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7640 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
7645 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7646 map
<string
,string
> *erasure_code_profile_map
,
7649 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7652 erasure_code_profile_map
,
7656 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7657 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7658 map
<string
,string
> user_map
;
7659 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7660 i
!= erasure_code_profile
.end();
7662 size_t equal
= i
->find('=');
7663 if (equal
== string::npos
) {
7664 user_map
[*i
] = string();
7665 (*erasure_code_profile_map
)[*i
] = string();
7667 const string key
= i
->substr(0, equal
);
7669 const string value
= i
->substr(equal
);
7670 if (key
.find("ruleset-") == 0) {
7671 *ss
<< "property '" << key
<< "' is no longer supported; try "
7672 << "'crush-" << key
.substr(8) << "' instead";
7675 user_map
[key
] = value
;
7676 (*erasure_code_profile_map
)[key
] = value
;
7680 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7681 (*erasure_code_profile_map
) = user_map
;
7686 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7687 const string
&erasure_code_profile
,
7689 unsigned *size
, unsigned *min_size
,
7693 bool set_min_size
= false;
7694 switch (pool_type
) {
7695 case pg_pool_t::TYPE_REPLICATED
:
7696 if (osdmap
.stretch_mode_enabled
) {
7698 repl_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
7699 if (repl_size
!= g_conf().get_val
<uint64_t>("mon_stretch_pool_size")) {
7700 *ss
<< "prepare_pool_size: we are in stretch mode but size "
7701 << repl_size
<< " does not match!";
7704 *min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
7705 set_min_size
= true;
7707 if (repl_size
== 0) {
7708 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7712 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7714 case pg_pool_t::TYPE_ERASURE
:
7716 if (osdmap
.stretch_mode_enabled
) {
7717 *ss
<< "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7720 ErasureCodeInterfaceRef erasure_code
;
7721 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7723 *size
= erasure_code
->get_chunk_count();
7725 erasure_code
->get_data_chunk_count() +
7726 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7727 assert(*min_size
<= *size
);
7728 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7733 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
7740 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7741 const string
&erasure_code_profile
,
7742 uint32_t *stripe_width
,
7746 switch (pool_type
) {
7747 case pg_pool_t::TYPE_REPLICATED
:
7750 case pg_pool_t::TYPE_ERASURE
:
7752 ErasureCodeProfile profile
=
7753 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7754 ErasureCodeInterfaceRef erasure_code
;
7755 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7758 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7759 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7760 auto it
= profile
.find("stripe_unit");
7761 if (it
!= profile
.end()) {
7763 stripe_unit
= strict_iecstrtoll(it
->second
, &err_str
);
7764 ceph_assert(err_str
.empty());
7766 *stripe_width
= data_chunks
*
7767 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7771 *ss
<< "prepare_pool_stripe_width: "
7772 << pool_type
<< " is not a known pool type";
7779 int OSDMonitor::get_replicated_stretch_crush_rule()
7781 /* we don't write down the stretch rule anywhere, so
7782 * we have to guess it. How? Look at all the pools
7783 * and count up how many times a given rule is used
7784 * on stretch pools and then return the one with
7787 map
<int,int> rule_counts
;
7788 for (const auto& pooli
: osdmap
.pools
) {
7789 const pg_pool_t
& p
= pooli
.second
;
7790 if (p
.is_replicated() && p
.is_stretch_pool()) {
7791 if (!rule_counts
.count(p
.crush_rule
)) {
7792 rule_counts
[p
.crush_rule
] = 1;
7794 ++rule_counts
[p
.crush_rule
];
7799 if (rule_counts
.empty()) {
7803 int most_used_count
= 0;
7804 int most_used_rule
= -1;
7805 for (auto i
: rule_counts
) {
7806 if (i
.second
> most_used_count
) {
7807 most_used_rule
= i
.first
;
7808 most_used_count
= i
.second
;
7811 ceph_assert(most_used_count
> 0);
7812 ceph_assert(most_used_rule
>= 0);
7813 return most_used_rule
;
7816 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7817 const string
&erasure_code_profile
,
7818 const string
&rule_name
,
7823 if (*crush_rule
< 0) {
7824 switch (pool_type
) {
7825 case pg_pool_t::TYPE_REPLICATED
:
7827 if (rule_name
== "") {
7828 if (osdmap
.stretch_mode_enabled
) {
7829 *crush_rule
= get_replicated_stretch_crush_rule();
7832 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_rule(cct
);
7834 if (*crush_rule
< 0) {
7835 // Errors may happen e.g. if no valid rule is available
7836 *ss
<< "No suitable CRUSH rule exists, check "
7837 << "'osd pool default crush *' config options";
7841 return get_crush_rule(rule_name
, crush_rule
, ss
);
7845 case pg_pool_t::TYPE_ERASURE
:
7847 int err
= crush_rule_create_erasure(rule_name
,
7848 erasure_code_profile
,
7852 dout(20) << "prepare_pool_crush_rule: rule "
7853 << rule_name
<< " try again" << dendl
;
7856 // need to wait for the crush rule to be proposed before proceeding
7867 *ss
<< "prepare_pool_crush_rule: " << pool_type
7868 << " is not a known pool type";
7872 if (!osdmap
.crush
->rule_exists(*crush_rule
)) {
7873 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
7881 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7886 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7887 if (ret
!= -ENOENT
) {
7891 CrushWrapper newcrush
= _get_pending_crush();
7893 ret
= newcrush
.get_rule_id(rule_name
);
7894 if (ret
!= -ENOENT
) {
7895 // found it, wait for it to be proposed
7896 dout(20) << __func__
<< ": rule " << rule_name
7897 << " try again" << dendl
;
7900 // Cannot find it , return error
7901 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
7909 * Get the number of 'in' osds according to the crush_rule,
7911 uint32_t OSDMonitor::get_osd_num_by_crush(int crush_rule
)
7914 set
<int> crush_in_osds
;
7916 CrushWrapper newcrush
= _get_pending_crush();
7917 newcrush
.find_takes_by_rule(crush_rule
, &roots
);
7918 for (auto root
: roots
) {
7919 const char *rootname
= newcrush
.get_item_name(root
);
7920 set
<int> crush_all_osds
;
7921 newcrush
.get_leaves(rootname
, &crush_all_osds
);
7922 std::set_difference(crush_all_osds
.begin(), crush_all_osds
.end(),
7923 out_osds
.begin(), out_osds
.end(),
7924 std::inserter(crush_in_osds
, crush_in_osds
.end()));
7926 return crush_in_osds
.size();
7929 int OSDMonitor::check_pg_num(int64_t pool
,
7935 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7936 uint64_t projected
= 0;
7937 uint32_t osd_num_by_crush
= 0;
7938 set
<int64_t> crush_pool_ids
;
7941 projected
+= pg_num
* size
;
7944 osd_num_by_crush
= get_osd_num_by_crush(crush_rule
);
7945 osdmap
.get_pool_ids_by_rule(crush_rule
, &crush_pool_ids
);
7947 for (const auto& [pool_id
, pool_info
] : osdmap
.get_pools()) {
7948 // Check only for pools affected by crush rule
7949 if (crush_pool_ids
.contains(pool_id
)) {
7950 if (pool_id
== pool
) {
7951 // Specified pool, use given pg_num and size values.
7952 projected
+= pg_num
* size
;
7954 // Use pg_num_target for evaluating the projected pg num
7955 projected
+= pool_info
.get_pg_num_target() * pool_info
.get_size();
7959 // assume min cluster size 3
7960 osd_num_by_crush
= std::max(osd_num_by_crush
, 3u);
7961 auto projected_pgs_per_osd
= projected
/ osd_num_by_crush
;
7963 if (projected_pgs_per_osd
> max_pgs_per_osd
) {
7965 *ss
<< "pool id " << pool
;
7967 *ss
<< " pg_num " << pg_num
7969 << " for this pool would result in "
7970 << projected_pgs_per_osd
7971 << " cumulative PGs per OSD (" << projected
7972 << " total PG replicas on " << osd_num_by_crush
7973 << " 'in' root OSDs by crush rule) "
7974 << "which exceeds the mon_max_pg_per_osd "
7975 << "value of " << max_pgs_per_osd
;
7982 * @param name The name of the new pool
7983 * @param crush_rule The crush rule to use. If <0, will use the system default
7984 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7985 * @param pg_num The pg_num to use. If set to 0, will use the system default
7986 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7987 * @param pg_num_min min pg_num
7988 * @param pg_num_max max pg_num
7989 * @param repl_size Replication factor, or 0 for default
7990 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7991 * @param pool_type TYPE_ERASURE, or TYPE_REP
7992 * @param expected_num_objects expected number of objects on the pool
7993 * @param fast_read fast read type.
7994 * @param pg_autoscale_mode autoscale mode, one of on, off, warn
7995 * @param bulk Indicates whether the pool should be a bulk pool
7996 * @param crimson Indicates whether the pool is a crimson pool
7997 * @param ss human readable error message, if any.
7999 * @return 0 on success, negative errno on failure.
8001 int OSDMonitor::prepare_new_pool(string
& name
,
8003 const string
&crush_rule_name
,
8004 unsigned pg_num
, unsigned pgp_num
,
8005 unsigned pg_num_min
,
8006 unsigned pg_num_max
,
8007 const uint64_t repl_size
,
8008 const uint64_t target_size_bytes
,
8009 const float target_size_ratio
,
8010 const string
&erasure_code_profile
,
8011 const unsigned pool_type
,
8012 const uint64_t expected_num_objects
,
8013 FastReadType fast_read
,
8014 string pg_autoscale_mode
,
8019 if (crimson
&& pg_autoscale_mode
.empty()) {
8020 // default pg_autoscale_mode to off for crimson, we'll error out below if
8021 // the user tried to actually set pg_autoscale_mode to something other than
8023 pg_autoscale_mode
= "off";
8026 if (name
.length() == 0)
8030 auto pg_num_from_mode
=
8031 [pg_num
=g_conf().get_val
<uint64_t>("osd_pool_default_pg_num")]
8032 (const string
& mode
) {
8033 return mode
== "on" ? 1 : pg_num
;
8035 pg_num
= pg_num_from_mode(
8036 pg_autoscale_mode
.empty() ?
8037 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode") :
8041 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
8044 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8045 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8046 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8047 << " (you may adjust 'mon max pool pg num' for higher values)";
8050 if (pgp_num
> pg_num
) {
8051 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
8052 << ", which in this case is " << pg_num
;
8057 /* crimson-osd requires that the pool be replicated and that pg_num/pgp_num
8058 * be static. User must also have specified set-allow-crimson */
8059 const auto *suffix
= " (--crimson specified or osd_pool_default_crimson set)";
8060 if (pool_type
!= pg_pool_t::TYPE_REPLICATED
) {
8061 *ss
<< "crimson-osd only supports replicated pools" << suffix
;
8063 } else if (pg_autoscale_mode
!= "off") {
8064 *ss
<< "crimson-osd does not support changing pg_num or pgp_num, "
8065 << "pg_autoscale_mode must be set to 'off'" << suffix
;
8067 } else if (!osdmap
.get_allow_crimson()) {
8068 *ss
<< "set-allow-crimson must be set to create a pool with the "
8069 << "crimson flag" << suffix
;
8074 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
8075 *ss
<< "'fast_read' can only apply to erasure coding pool";
8079 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
8080 crush_rule_name
, &crush_rule
, ss
);
8082 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
8085 unsigned size
, min_size
;
8086 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
8087 &size
, &min_size
, ss
);
8089 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
8092 if (g_conf()->mon_osd_crush_smoke_test
) {
8093 CrushWrapper newcrush
= _get_pending_crush();
8095 CrushTester
tester(newcrush
, err
);
8096 tester
.set_min_x(0);
8097 tester
.set_max_x(50);
8098 tester
.set_rule(crush_rule
);
8099 tester
.set_num_rep(size
);
8100 auto start
= ceph::coarse_mono_clock::now();
8101 r
= tester
.test_with_fork(cct
, g_conf()->mon_lease
);
8102 dout(10) << __func__
<< " crush test_with_fork tester created " << dendl
;
8103 auto duration
= ceph::coarse_mono_clock::now() - start
;
8105 dout(10) << "tester.test_with_fork returns " << r
8106 << ": " << err
.str() << dendl
;
8107 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
8110 dout(10) << __func__
<< " crush smoke test duration: "
8111 << duration
<< dendl
;
8113 r
= check_pg_num(-1, pg_num
, size
, crush_rule
, ss
);
8115 dout(10) << "check_pg_num returns " << r
<< dendl
;
8119 if (osdmap
.crush
->get_rule_type(crush_rule
) != (int)pool_type
) {
8120 *ss
<< "crush rule " << crush_rule
<< " type does not match pool";
8124 uint32_t stripe_width
= 0;
8125 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
8127 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
8132 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8133 switch (fast_read
) {
8140 case FAST_READ_DEFAULT
:
8141 fread
= g_conf()->osd_pool_default_ec_fast_read
;
8144 *ss
<< "invalid fast_read setting: " << fast_read
;
8149 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
8150 p
!= pending_inc
.new_pool_names
.end();
8152 if (p
->second
== name
)
8156 if (-1 == pending_inc
.new_pool_max
)
8157 pending_inc
.new_pool_max
= osdmap
.pool_max
;
8158 int64_t pool
= ++pending_inc
.new_pool_max
;
8160 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
8161 pi
->create_time
= ceph_clock_now();
8162 pi
->type
= pool_type
;
8163 pi
->fast_read
= fread
;
8164 pi
->flags
= g_conf()->osd_pool_default_flags
;
8166 pi
->set_flag(pg_pool_t::FLAG_BULK
);
8167 } else if (g_conf()->osd_pool_default_flag_bulk
) {
8168 pi
->set_flag(pg_pool_t::FLAG_BULK
);
8170 if (g_conf()->osd_pool_default_flag_hashpspool
)
8171 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
8172 if (g_conf()->osd_pool_default_flag_nodelete
)
8173 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
8174 if (g_conf()->osd_pool_default_flag_nopgchange
)
8175 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
8176 if (g_conf()->osd_pool_default_flag_nosizechange
)
8177 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
8178 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
8179 if (g_conf()->osd_pool_use_gmt_hitset
)
8180 pi
->use_gmt_hitset
= true;
8182 pi
->use_gmt_hitset
= false;
8184 pi
->set_flag(pg_pool_t::FLAG_CRIMSON
);
8185 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
8189 pi
->min_size
= min_size
;
8190 pi
->crush_rule
= crush_rule
;
8191 pi
->expected_num_objects
= expected_num_objects
;
8192 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
8193 if (osdmap
.stretch_mode_enabled
) {
8194 pi
->peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
8195 pi
->peering_crush_bucket_target
= osdmap
.stretch_bucket_count
;
8196 pi
->peering_crush_bucket_barrier
= osdmap
.stretch_mode_bucket
;
8197 pi
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
8198 if (osdmap
.degraded_stretch_mode
) {
8199 pi
->peering_crush_bucket_count
= osdmap
.degraded_stretch_mode
;
8200 pi
->peering_crush_bucket_target
= osdmap
.degraded_stretch_mode
;
8201 // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
8202 // TODO: drat, we don't record this ^ anywhere, though given that it
8203 // necessarily won't exist elsewhere it likely doesn't matter
8204 pi
->min_size
= pi
->min_size
/ 2;
8205 pi
->size
= pi
->size
/ 2; // only support 2 zones now
8209 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8210 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
8211 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8212 pi
->pg_autoscale_mode
= m
;
8214 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
8216 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
8218 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
8220 pi
->set_pg_num_pending(pi
->get_pg_num());
8221 pi
->set_pg_num_target(pg_num
);
8222 pi
->set_pgp_num(pi
->get_pg_num());
8223 pi
->set_pgp_num_target(pgp_num
);
8224 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8226 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
8228 if (osdmap
.require_osd_release
>= ceph_release_t::quincy
&&
8230 pi
->opts
.set(pool_opts_t::PG_NUM_MAX
, static_cast<int64_t>(pg_num_max
));
8232 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8233 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8234 pi
->pg_autoscale_mode
= m
;
8237 pi
->last_change
= pending_inc
.epoch
;
8240 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8241 pi
->erasure_code_profile
= erasure_code_profile
;
8243 pi
->erasure_code_profile
= "";
8245 pi
->stripe_width
= stripe_width
;
8247 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8248 target_size_bytes
) {
8249 // only store for nautilus+ because TARGET_SIZE_BYTES may be
8250 // larger than int32_t max.
8251 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
8253 if (target_size_ratio
> 0.0 &&
8254 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
8255 // only store for nautilus+, just to be consistent and tidy.
8256 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
8259 pi
->cache_target_dirty_ratio_micro
=
8260 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
8261 pi
->cache_target_dirty_high_ratio_micro
=
8262 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
8263 pi
->cache_target_full_ratio_micro
=
8264 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
8265 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
8266 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
8268 pending_inc
.new_pool_names
[pool
] = name
;
8272 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
8274 op
->mark_osdmon_event(__func__
);
8276 if (pending_inc
.new_flags
< 0)
8277 pending_inc
.new_flags
= osdmap
.get_flags();
8278 pending_inc
.new_flags
|= flag
;
8279 ss
<< OSDMap::get_flag_string(flag
) << " is set";
8280 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8281 get_last_committed() + 1));
8285 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
8287 op
->mark_osdmon_event(__func__
);
8289 if (pending_inc
.new_flags
< 0)
8290 pending_inc
.new_flags
= osdmap
.get_flags();
8291 pending_inc
.new_flags
&= ~flag
;
8292 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
8293 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8294 get_last_committed() + 1));
8298 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
8302 cmd_getval(cmdmap
, "pool", poolstr
);
8303 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
8305 ss
<< "unrecognized pool '" << poolstr
<< "'";
8309 cmd_getval(cmdmap
, "var", var
);
8311 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8312 if (pending_inc
.new_pools
.count(pool
))
8313 p
= pending_inc
.new_pools
[pool
];
8315 // accept val as a json string in the normal case (current
8316 // generation monitor). parse out int or float values from the
8317 // string as needed. however, if it is not a string, try to pull
8318 // out an int, in case an older monitor with an older json schema is
8319 // forwarding a request.
8321 string interr
, floaterr
;
8324 int64_t uf
= 0; // micro-f
8325 cmd_getval(cmdmap
, "val", val
);
8328 "target_max_objects"
8330 auto iec_options
= {
8332 "target_size_bytes",
8333 "compression_max_blob_size",
8334 "compression_min_blob_size",
8338 if (count(begin(si_options
), end(si_options
), var
)) {
8339 n
= strict_si_cast
<int64_t>(val
, &interr
);
8340 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
8341 n
= strict_iec_cast
<int64_t>(val
, &interr
);
8343 // parse string as both int and float; different fields use different types.
8344 n
= strict_strtoll(val
.c_str(), 10, &interr
);
8345 f
= strict_strtod(val
.c_str(), &floaterr
);
8346 uf
= llrintl(f
* (double)1000000.0);
8350 (var
== "hit_set_type" || var
== "hit_set_period" ||
8351 var
== "hit_set_count" || var
== "hit_set_fpp" ||
8352 var
== "target_max_objects" || var
== "target_max_bytes" ||
8353 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
8354 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
8355 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
8356 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
8357 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
8361 if (var
== "size") {
8362 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8363 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
8366 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
8367 ss
<< "can not change the size of an erasure-coded pool";
8370 if (interr
.length()) {
8371 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8374 if (n
<= 0 || n
> 10) {
8375 ss
<< "pool size must be between 1 and 10";
8379 if (!g_conf().get_val
<bool>("mon_allow_pool_size_one")) {
8380 ss
<< "configuring pool size as 1 is disabled by default.";
8384 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
8385 if (!sure
) { ss
<< "WARNING: setting pool size 1 could lead to data loss "
8386 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8387 "pass the flag --yes-i-really-mean-it.";
8391 if (osdmap
.crush
->get_rule_type(p
.get_crush_rule()) != (int)p
.type
) {
8392 ss
<< "crush rule " << p
.get_crush_rule() << " type does not match pool";
8396 // only when increasing pool size
8397 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, p
.get_crush_rule(), &ss
);
8403 p
.min_size
= g_conf().get_osd_pool_default_min_size(p
.size
);
8404 } else if (var
== "min_size") {
8405 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8406 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8409 if (interr
.length()) {
8410 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8414 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
8415 if (n
< 1 || n
> p
.size
) {
8416 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
8420 ErasureCodeInterfaceRef erasure_code
;
8423 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
8425 k
= erasure_code
->get_data_chunk_count();
8427 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
8431 if (n
< k
|| n
> p
.size
) {
8432 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
8437 } else if (var
== "pg_num_actual") {
8438 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8439 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8442 if (interr
.length()) {
8443 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8446 if (n
== (int)p
.get_pg_num()) {
8449 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8450 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8451 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8452 << " (you may adjust 'mon max pool pg num' for higher values)";
8455 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8456 ss
<< "cannot adjust pg_num while initial PGs are being created";
8459 if (n
> (int)p
.get_pg_num()) {
8460 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
8461 // force pre-nautilus clients to resend their ops, since they
8462 // don't understand pg_num_pending changes form a new interval
8463 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8467 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8468 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8471 if (n
< (int)p
.get_pgp_num()) {
8472 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8475 if (n
< (int)p
.get_pg_num() - 1) {
8476 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8477 << ") - 1; only single pg decrease is currently supported";
8480 p
.set_pg_num_pending(n
);
8481 // force pre-nautilus clients to resend their ops, since they
8482 // don't understand pg_num_pending changes form a new interval
8483 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8485 // force pre-luminous clients to resend their ops, since they
8486 // don't understand that split PGs now form a new interval.
8487 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8488 } else if (var
== "pg_num") {
8489 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8490 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8493 if (interr
.length()) {
8494 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8497 if (n
== (int)p
.get_pg_num_target()) {
8500 if (n
<= 0 || static_cast<uint64_t>(n
) >
8501 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8502 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8503 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8504 << " (you may adjust 'mon max pool pg num' for higher values)";
8507 if (n
> (int)p
.get_pg_num_target()) {
8508 int r
= check_pg_num(pool
, n
, p
.get_size(), p
.get_crush_rule(), &ss
);
8513 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8514 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8515 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8519 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8520 ss
<< "nautilus OSDs are required to decrease pg_num";
8524 int64_t pg_min
= 0, pg_max
= 0;
8525 p
.opts
.get(pool_opts_t::PG_NUM_MIN
, &pg_min
);
8526 p
.opts
.get(pool_opts_t::PG_NUM_MAX
, &pg_max
);
8527 if (pg_min
&& n
< pg_min
) {
8528 ss
<< "specified pg_num " << n
8529 << " < pg_num_min " << pg_min
;
8532 if (pg_max
&& n
> pg_max
) {
8533 ss
<< "specified pg_num " << n
8534 << " < pg_num_max " << pg_max
;
8537 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8538 // pre-nautilus osdmap format; increase pg_num directly
8539 assert(n
> (int)p
.get_pg_num());
8540 // force pre-nautilus clients to resend their ops, since they
8541 // don't understand pg_num_target changes form a new interval
8542 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8543 // force pre-luminous clients to resend their ops, since they
8544 // don't understand that split PGs now form a new interval.
8545 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8548 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8549 // make pgp_num track pg_num if it already matches. if it is set
8550 // differently, leave it different and let the user control it
8552 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8553 p
.set_pgp_num_target(n
);
8555 p
.set_pg_num_target(n
);
8557 } else if (var
== "pgp_num_actual") {
8558 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8559 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8562 if (interr
.length()) {
8563 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8567 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8570 if (n
> (int)p
.get_pg_num()) {
8571 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8574 if (n
> (int)p
.get_pg_num_pending()) {
8575 ss
<< "specified pgp_num " << n
8576 << " > pg_num_pending " << p
.get_pg_num_pending();
8580 } else if (var
== "pgp_num") {
8581 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8582 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8585 if (interr
.length()) {
8586 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8590 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8593 if (n
> (int)p
.get_pg_num_target()) {
8594 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8597 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8598 // pre-nautilus osdmap format; increase pgp_num directly
8601 p
.set_pgp_num_target(n
);
8603 } else if (var
== "pg_autoscale_mode") {
8604 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8605 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8606 ss
<< "specified invalid mode " << val
;
8609 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8610 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8613 p
.pg_autoscale_mode
= m
;
8614 } else if (var
== "crush_rule") {
8615 int id
= osdmap
.crush
->get_rule_id(val
);
8616 if (id
== -ENOENT
) {
8617 ss
<< "crush rule " << val
<< " does not exist";
8621 ss
<< cpp_strerror(id
);
8624 if (osdmap
.crush
->get_rule_type(id
) != (int)p
.get_type()) {
8625 ss
<< "crush rule " << id
<< " type does not match pool";
8629 } else if (var
== "nodelete" || var
== "nopgchange" ||
8630 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8631 var
== "noscrub" || var
== "nodeep-scrub" || var
== "bulk") {
8632 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8633 // make sure we only compare against 'n' if we didn't receive a string
8634 if (val
== "true" || (interr
.empty() && n
== 1)) {
8636 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8637 if (flag
== pg_pool_t::FLAG_NOPGCHANGE
&& p
.is_crimson()) {
8638 ss
<< "cannot clear FLAG_NOPGCHANGE on a crimson pool";
8643 ss
<< "expecting value 'true', 'false', '0', or '1'";
8646 } else if (var
== "eio") {
8647 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8649 // make sure we only compare against 'n' if we didn't receive a string
8650 if (val
== "true" || (interr
.empty() && n
== 1)) {
8652 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8655 ss
<< "expecting value 'true', 'false', '0', or '1'";
8658 } else if (var
== "hashpspool") {
8659 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8661 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8664 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8665 " this triggers large data movement,"
8666 " pass --yes-i-really-mean-it if you really do.";
8669 // make sure we only compare against 'n' if we didn't receive a string
8670 if (val
== "true" || (interr
.empty() && n
== 1)) {
8672 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8675 ss
<< "expecting value 'true', 'false', '0', or '1'";
8678 } else if (var
== "hit_set_type") {
8680 p
.hit_set_params
= HitSet::Params();
8682 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8685 if (val
== "bloom") {
8686 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8687 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8688 p
.hit_set_params
= HitSet::Params(bsp
);
8689 } else if (val
== "explicit_hash")
8690 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8691 else if (val
== "explicit_object")
8692 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8694 ss
<< "unrecognized hit_set type '" << val
<< "'";
8698 } else if (var
== "hit_set_period") {
8699 if (interr
.length()) {
8700 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8703 ss
<< "hit_set_period should be non-negative";
8706 p
.hit_set_period
= n
;
8707 } else if (var
== "hit_set_count") {
8708 if (interr
.length()) {
8709 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8712 ss
<< "hit_set_count should be non-negative";
8715 p
.hit_set_count
= n
;
8716 } else if (var
== "hit_set_fpp") {
8717 if (floaterr
.length()) {
8718 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8720 } else if (f
< 0 || f
> 1.0) {
8721 ss
<< "hit_set_fpp should be in the range 0..1";
8724 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8725 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8728 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
8730 } else if (var
== "use_gmt_hitset") {
8731 if (val
== "true" || (interr
.empty() && n
== 1)) {
8732 p
.use_gmt_hitset
= true;
8734 ss
<< "expecting value 'true' or '1'";
8737 } else if (var
== "allow_ec_overwrites") {
8738 if (!p
.is_erasure()) {
8739 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8743 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8744 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8745 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8748 if (val
== "true" || (interr
.empty() && n
== 1)) {
8749 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8750 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8751 ss
<< "ec overwrites cannot be disabled once enabled";
8754 ss
<< "expecting value 'true', 'false', '0', or '1'";
8757 } else if (var
== "target_max_objects") {
8758 if (interr
.length()) {
8759 ss
<< "error parsing int '" << val
<< "': " << interr
;
8762 p
.target_max_objects
= n
;
8763 } else if (var
== "target_max_bytes") {
8764 if (interr
.length()) {
8765 ss
<< "error parsing int '" << val
<< "': " << interr
;
8768 p
.target_max_bytes
= n
;
8769 } else if (var
== "cache_target_dirty_ratio") {
8770 if (floaterr
.length()) {
8771 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8774 if (f
< 0 || f
> 1.0) {
8775 ss
<< "value must be in the range 0..1";
8778 p
.cache_target_dirty_ratio_micro
= uf
;
8779 } else if (var
== "cache_target_dirty_high_ratio") {
8780 if (floaterr
.length()) {
8781 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8784 if (f
< 0 || f
> 1.0) {
8785 ss
<< "value must be in the range 0..1";
8788 p
.cache_target_dirty_high_ratio_micro
= uf
;
8789 } else if (var
== "cache_target_full_ratio") {
8790 if (floaterr
.length()) {
8791 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8794 if (f
< 0 || f
> 1.0) {
8795 ss
<< "value must be in the range 0..1";
8798 p
.cache_target_full_ratio_micro
= uf
;
8799 } else if (var
== "cache_min_flush_age") {
8800 if (interr
.length()) {
8801 ss
<< "error parsing int '" << val
<< "': " << interr
;
8804 p
.cache_min_flush_age
= n
;
8805 } else if (var
== "cache_min_evict_age") {
8806 if (interr
.length()) {
8807 ss
<< "error parsing int '" << val
<< "': " << interr
;
8810 p
.cache_min_evict_age
= n
;
8811 } else if (var
== "min_read_recency_for_promote") {
8812 if (interr
.length()) {
8813 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8816 p
.min_read_recency_for_promote
= n
;
8817 } else if (var
== "hit_set_grade_decay_rate") {
8818 if (interr
.length()) {
8819 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8822 if (n
> 100 || n
< 0) {
8823 ss
<< "value out of range,valid range is 0 - 100";
8826 p
.hit_set_grade_decay_rate
= n
;
8827 } else if (var
== "hit_set_search_last_n") {
8828 if (interr
.length()) {
8829 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8832 if (n
> p
.hit_set_count
|| n
< 0) {
8833 ss
<< "value out of range,valid range is 0 - hit_set_count";
8836 p
.hit_set_search_last_n
= n
;
8837 } else if (var
== "min_write_recency_for_promote") {
8838 if (interr
.length()) {
8839 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8842 p
.min_write_recency_for_promote
= n
;
8843 } else if (var
== "fast_read") {
8844 if (p
.is_replicated()) {
8845 ss
<< "fast read is not supported in replication pool";
8848 if (val
== "true" || (interr
.empty() && n
== 1)) {
8850 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8851 p
.fast_read
= false;
8853 ss
<< "expecting value 'true', 'false', '0', or '1'";
8856 } else if (pool_opts_t::is_opt_name(var
)) {
8857 bool unset
= val
== "unset";
8858 if (var
== "compression_mode") {
8860 auto cmode
= Compressor::get_comp_mode_type(val
);
8862 ss
<< "unrecognized compression mode '" << val
<< "'";
8866 } else if (var
== "compression_algorithm") {
8868 auto alg
= Compressor::get_comp_alg_type(val
);
8870 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8874 } else if (var
== "compression_required_ratio") {
8875 if (floaterr
.length()) {
8876 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8879 if (f
< 0 || f
> 1) {
8880 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8883 } else if (var
== "csum_type") {
8884 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8886 ss
<< "unrecognized csum_type '" << val
<< "'";
8889 //preserve csum_type numeric value
8892 } else if (var
== "compression_max_blob_size" ||
8893 var
== "compression_min_blob_size" ||
8894 var
== "csum_max_block" ||
8895 var
== "csum_min_block") {
8896 if (interr
.length()) {
8897 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8900 } else if (var
== "fingerprint_algorithm") {
8902 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8904 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8908 } else if (var
== "target_size_bytes") {
8909 if (interr
.length()) {
8910 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8913 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8914 ss
<< "must set require_osd_release to nautilus or "
8915 << "later before setting target_size_bytes";
8918 } else if (var
== "target_size_ratio") {
8920 ss
<< "target_size_ratio cannot be negative";
8923 } else if (var
== "pg_num_min") {
8924 if (interr
.length()) {
8925 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8928 if (n
> (int)p
.get_pg_num_target()) {
8929 ss
<< "specified pg_num_min " << n
8930 << " > pg_num " << p
.get_pg_num_target();
8933 } else if (var
== "pg_num_max") {
8934 if (interr
.length()) {
8935 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8938 if (n
&& n
< (int)p
.get_pg_num_target()) {
8939 ss
<< "specified pg_num_max " << n
8940 << " < pg_num " << p
.get_pg_num_target();
8943 } else if (var
== "recovery_priority") {
8944 if (interr
.length()) {
8945 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8948 if (!g_conf()->debug_allow_any_pool_priority
) {
8949 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8950 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8951 << " and " << OSD_POOL_PRIORITY_MAX
;
8955 } else if (var
== "pg_autoscale_bias") {
8956 if (f
< 0.0 || f
> 1000.0) {
8957 ss
<< "pg_autoscale_bias must be between 0 and 1000";
8960 } else if (var
== "dedup_tier") {
8961 if (interr
.empty()) {
8962 ss
<< "expecting value 'pool name'";
8965 // Current base tier in dedup does not support ec pool
8966 if (p
.is_erasure()) {
8967 ss
<< "pool '" << poolstr
8968 << "' is an ec pool, which cannot be a base tier";
8971 int64_t lowtierpool_id
= osdmap
.lookup_pg_pool_name(val
);
8972 if (lowtierpool_id
< 0) {
8973 ss
<< "unrecognized pool '" << val
<< "'";
8976 const pg_pool_t
*tp
= osdmap
.get_pg_pool(lowtierpool_id
);
8979 // The original input is string (pool name), but we convert it to int64_t.
8982 } else if (var
== "dedup_chunk_algorithm") {
8984 auto alg
= pg_pool_t::get_dedup_chunk_algorithm_from_str(val
);
8986 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8990 } else if (var
== "dedup_cdc_chunk_size") {
8991 if (interr
.length()) {
8992 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8997 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8998 switch (desc
.type
) {
8999 case pool_opts_t::STR
:
9001 p
.opts
.unset(desc
.key
);
9003 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
9006 case pool_opts_t::INT
:
9007 if (interr
.length()) {
9008 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
9012 p
.opts
.unset(desc
.key
);
9014 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
9017 case pool_opts_t::DOUBLE
:
9018 if (floaterr
.length()) {
9019 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
9023 p
.opts
.unset(desc
.key
);
9025 p
.opts
.set(desc
.key
, static_cast<double>(f
));
9029 ceph_assert(!"unknown type");
9032 ss
<< "unrecognized variable '" << var
<< "'";
9035 if (val
!= "unset") {
9036 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
9038 ss
<< "unset pool " << pool
<< " " << var
;
9040 p
.last_change
= pending_inc
.epoch
;
9041 pending_inc
.new_pools
[pool
] = p
;
9045 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
9046 const cmdmap_t
& cmdmap
,
9049 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
9052 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
9053 const cmdmap_t
& cmdmap
,
9057 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
9062 * Common logic for preprocess and prepare phases of pool application
9063 * tag commands. In preprocess mode we're only detecting invalid
9064 * commands, and determining whether it was a modification or a no-op.
9065 * In prepare mode we're actually updating the pending state.
9067 int OSDMonitor::_command_pool_application(const string
&prefix
,
9068 const cmdmap_t
& cmdmap
,
9074 cmd_getval(cmdmap
, "pool", pool_name
);
9075 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
9077 ss
<< "unrecognized pool '" << pool_name
<< "'";
9081 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
9083 if (pending_inc
.new_pools
.count(pool
)) {
9084 p
= pending_inc
.new_pools
[pool
];
9089 cmd_getval(cmdmap
, "app", app
);
9090 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
9093 cmd_getval(cmdmap
, "key", key
);
9095 ss
<< "key cannot be 'all'";
9100 cmd_getval(cmdmap
, "value", value
);
9101 if (value
== "all") {
9102 ss
<< "value cannot be 'all'";
9106 if (boost::algorithm::ends_with(prefix
, "enable")) {
9108 ss
<< "application name must be provided";
9113 ss
<< "application must be enabled on base tier";
9118 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
9120 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
9121 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
9122 << "application; pass --yes-i-really-mean-it to proceed anyway";
9126 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
9127 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
9128 << "max " << MAX_POOL_APPLICATIONS
;
9132 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9133 ss
<< "application name '" << app
<< "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH
;
9139 p
.application_metadata
[app
] = {};
9141 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
9143 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
9145 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
9148 ss
<< "Are you SURE? Disabling an application within a pool might result "
9149 << "in loss of application functionality; pass "
9150 << "--yes-i-really-mean-it to proceed anyway";
9155 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9157 return 0; // idempotent
9160 p
.application_metadata
.erase(app
);
9161 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
9163 } else if (boost::algorithm::ends_with(prefix
, "set")) {
9165 ss
<< "application metadata must be set on base tier";
9170 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9176 cmd_getval(cmdmap
, "key", key
);
9179 ss
<< "key must be provided";
9183 auto &app_keys
= p
.application_metadata
[app
];
9184 if (app_keys
.count(key
) == 0 &&
9185 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
9186 ss
<< "too many keys set for application '" << app
<< "' on pool '"
9187 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
9191 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9192 ss
<< "key '" << app
<< "' too long; max length "
9193 << MAX_POOL_APPLICATION_LENGTH
;
9198 cmd_getval(cmdmap
, "value", value
);
9199 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
9200 ss
<< "value '" << value
<< "' too long; max length "
9201 << MAX_POOL_APPLICATION_LENGTH
;
9205 p
.application_metadata
[app
][key
] = value
;
9206 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
9207 << value
<< "' on pool '" << pool_name
<< "'";
9208 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
9210 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
9216 cmd_getval(cmdmap
, "key", key
);
9217 auto it
= p
.application_metadata
[app
].find(key
);
9218 if (it
== p
.application_metadata
[app
].end()) {
9219 ss
<< "application '" << app
<< "' on pool '" << pool_name
9220 << "' does not have key '" << key
<< "'";
9221 return 0; // idempotent
9224 p
.application_metadata
[app
].erase(it
);
9225 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
9226 << pool_name
<< "'";
9232 p
.last_change
= pending_inc
.epoch
;
9233 pending_inc
.new_pools
[pool
] = p
;
9236 // Because we fell through this far, we didn't hit no-op cases,
9237 // so pool was definitely modified
9238 if (modified
!= nullptr) {
9245 int OSDMonitor::_prepare_command_osd_crush_remove(
9246 CrushWrapper
&newcrush
,
9255 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
9258 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
9263 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
9265 pending_inc
.crush
.clear();
9266 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9269 int OSDMonitor::prepare_command_osd_crush_remove(
9270 CrushWrapper
&newcrush
,
9276 int err
= _prepare_command_osd_crush_remove(
9277 newcrush
, id
, ancestor
,
9278 has_ancestor
, unlink_only
);
9283 ceph_assert(err
== 0);
9284 do_osd_crush_remove(newcrush
);
9289 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
9291 if (osdmap
.is_up(id
)) {
9295 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
9296 pending_inc
.new_uuid
[id
] = uuid_d();
9297 pending_metadata_rm
.insert(id
);
9298 pending_metadata
.erase(id
);
9303 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
9305 ceph_assert(existing_id
);
9308 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
9309 if (!osdmap
.exists(i
) &&
9310 pending_inc
.new_up_client
.count(i
) == 0 &&
9311 (pending_inc
.new_state
.count(i
) == 0 ||
9312 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
9318 if (pending_inc
.new_max_osd
< 0) {
9319 return osdmap
.get_max_osd();
9321 return pending_inc
.new_max_osd
;
9324 void OSDMonitor::do_osd_create(
9327 const string
& device_class
,
9330 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
9331 ceph_assert(new_id
);
9333 // We presume validation has been performed prior to calling this
9334 // function. We assert with prejudice.
9336 int32_t allocated_id
= -1; // declare here so we can jump
9337 int32_t existing_id
= -1;
9338 if (!uuid
.is_zero()) {
9339 existing_id
= osdmap
.identify_osd(uuid
);
9340 if (existing_id
>= 0) {
9341 ceph_assert(id
< 0 || id
== existing_id
);
9342 *new_id
= existing_id
;
9344 } else if (id
>= 0) {
9345 // uuid does not exist, and id has been provided, so just create
9352 // allocate a new id
9353 allocated_id
= _allocate_osd_id(&existing_id
);
9354 dout(10) << __func__
<< " allocated id " << allocated_id
9355 << " existing id " << existing_id
<< dendl
;
9356 if (existing_id
>= 0) {
9357 ceph_assert(existing_id
< osdmap
.get_max_osd());
9358 ceph_assert(allocated_id
< 0);
9359 *new_id
= existing_id
;
9360 } else if (allocated_id
>= 0) {
9361 ceph_assert(existing_id
< 0);
9363 if (pending_inc
.new_max_osd
< 0) {
9364 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
9366 ++pending_inc
.new_max_osd
;
9368 *new_id
= pending_inc
.new_max_osd
- 1;
9369 ceph_assert(*new_id
== allocated_id
);
9371 ceph_abort_msg("unexpected condition");
9375 if (device_class
.size()) {
9376 CrushWrapper newcrush
= _get_pending_crush();
9377 if (newcrush
.get_max_devices() < *new_id
+ 1) {
9378 newcrush
.set_max_devices(*new_id
+ 1);
9380 string name
= string("osd.") + stringify(*new_id
);
9381 if (!newcrush
.item_exists(*new_id
)) {
9382 newcrush
.set_item_name(*new_id
, name
);
9385 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
9387 derr
<< __func__
<< " failed to set " << name
<< " device_class "
9388 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
9390 // non-fatal... this might be a replay and we want to be idempotent.
9392 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
9394 pending_inc
.crush
.clear();
9395 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9398 dout(20) << __func__
<< " no device_class" << dendl
;
9401 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
9402 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
9403 pending_inc
.new_max_osd
= *new_id
+ 1;
9406 pending_inc
.new_weight
[*new_id
] = CEPH_OSD_IN
;
9407 // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
9408 // set it for us. (ugh.)
9409 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_NEW
;
9410 if (!uuid
.is_zero())
9411 pending_inc
.new_uuid
[*new_id
] = uuid
;
9414 int OSDMonitor::validate_osd_create(
9417 const bool check_osd_exists
,
9418 int32_t* existing_id
,
9422 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
9423 << " check_osd_exists " << check_osd_exists
<< dendl
;
9425 ceph_assert(existing_id
);
9427 if (id
< 0 && uuid
.is_zero()) {
9428 // we have nothing to validate
9431 } else if (uuid
.is_zero()) {
9432 // we have an id but we will ignore it - because that's what
9433 // `osd create` does.
9438 * This function will be used to validate whether we are able to
9439 * create a new osd when the `uuid` is specified.
9441 * It will be used by both `osd create` and `osd new`, as the checks
9442 * are basically the same when it pertains to osd id and uuid validation.
9443 * However, `osd create` presumes an `uuid` is optional, for legacy
9444 * reasons, while `osd new` requires the `uuid` to be provided. This
9445 * means that `osd create` will not be idempotent if an `uuid` is not
9446 * provided, but we will always guarantee the idempotency of `osd new`.
9449 ceph_assert(!uuid
.is_zero());
9450 if (pending_inc
.identify_osd(uuid
) >= 0) {
9451 // osd is about to exist
9455 int32_t i
= osdmap
.identify_osd(uuid
);
9457 // osd already exists
9458 if (id
>= 0 && i
!= id
) {
9459 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
9462 // return a positive errno to distinguish between a blocking error
9463 // and an error we consider to not be a problem (i.e., this would be
9464 // an idempotent operation).
9470 if (pending_inc
.new_state
.count(id
)) {
9471 // osd is about to exist
9474 // we may not care if an osd exists if we are recreating a previously
9476 if (check_osd_exists
&& osdmap
.exists(id
)) {
9477 ss
<< "id " << id
<< " already in use and does not match uuid "
9485 int OSDMonitor::prepare_command_osd_create(
9488 int32_t* existing_id
,
9491 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9492 ceph_assert(existing_id
);
9493 if (osdmap
.is_destroyed(id
)) {
9494 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
9499 if (uuid
.is_zero()) {
9500 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
9503 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
9506 int OSDMonitor::prepare_command_osd_new(
9508 const cmdmap_t
& cmdmap
,
9509 const map
<string
,string
>& params
,
9517 ceph_assert(paxos
.is_plugged());
9519 dout(10) << __func__
<< " " << op
<< dendl
;
9521 /* validate command. abort now if something's wrong. */
9523 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9525 * If `id` is not specified, we will identify any existing osd based
9526 * on `uuid`. Operation will be idempotent iff secrets match.
9528 * If `id` is specified, we will identify any existing osd based on
9529 * `uuid` and match against `id`. If they match, operation will be
9530 * idempotent iff secrets match.
9532 * `-i secrets.json` will be optional. If supplied, will be used
9533 * to check for idempotency when `id` and `uuid` match.
9535 * If `id` is not specified, and `uuid` does not exist, an id will
9536 * be found or allocated for the osd.
9538 * If `id` is specified, and the osd has been previously marked
9539 * as destroyed, then the `id` will be reused.
9541 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
9542 ss
<< "requires the OSD's UUID to be specified.";
9544 } else if (!uuid
.parse(uuidstr
.c_str())) {
9545 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9549 if (cmd_getval(cmdmap
, "id", id
) &&
9551 ss
<< "invalid OSD id; must be greater or equal than zero.";
9555 // are we running an `osd create`-like command, or recreating
9556 // a previously destroyed osd?
9558 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9560 // we will care about `id` to assess whether osd is `destroyed`, or
9561 // to create a new osd.
9562 // we will need an `id` by the time we reach auth.
9564 int32_t existing_id
= -1;
9565 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9568 bool may_be_idempotent
= false;
9569 if (err
== EEXIST
) {
9570 // this is idempotent from the osdmon's point-of-view
9571 may_be_idempotent
= true;
9572 ceph_assert(existing_id
>= 0);
9574 } else if (err
< 0) {
9578 if (!may_be_idempotent
) {
9579 // idempotency is out of the window. We are either creating a new
9580 // osd or recreating a destroyed osd.
9582 // We now need to figure out if we have an `id` (and if it's valid),
9583 // of find an `id` if we don't have one.
9585 // NOTE: we need to consider the case where the `id` is specified for
9586 // `osd create`, and we must honor it. So this means checking if
9587 // the `id` is destroyed, and if so assume the destroy; otherwise,
9588 // check if it `exists` - in which case we complain about not being
9589 // `destroyed`. In the end, if nothing fails, we must allow the
9590 // creation, so that we are compatible with `create`.
9591 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9592 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9593 ss
<< "OSD " << id
<< " has not yet been destroyed";
9595 } else if (id
< 0) {
9597 id
= _allocate_osd_id(&existing_id
);
9599 ceph_assert(existing_id
>= 0);
9602 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9603 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9604 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9606 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9609 ceph_assert(id
>= 0);
9610 ceph_assert(osdmap
.exists(id
));
9613 // we are now able to either create a brand new osd or reuse an existing
9614 // osd that has been previously destroyed.
9616 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9618 if (may_be_idempotent
&& params
.empty()) {
9619 // nothing to do, really.
9620 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9621 ceph_assert(id
>= 0);
9623 f
->open_object_section("created_osd");
9624 f
->dump_int("osdid", id
);
9632 string device_class
;
9633 auto p
= params
.find("crush_device_class");
9634 if (p
!= params
.end()) {
9635 device_class
= p
->second
;
9636 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9638 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9639 bool has_lockbox
= false;
9640 bool has_secrets
= params
.count("cephx_secret")
9641 || params
.count("cephx_lockbox_secret")
9642 || params
.count("dmcrypt_key");
9644 KVMonitor
*svc
= nullptr;
9645 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9648 if (params
.count("cephx_secret") == 0) {
9649 ss
<< "requires a cephx secret.";
9652 cephx_secret
= params
.at("cephx_secret");
9654 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9655 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9657 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9658 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9660 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9662 lockbox_secret
= params
.at("cephx_lockbox_secret");
9663 dmcrypt_key
= params
.at("dmcrypt_key");
9664 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9665 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9669 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9671 err
= mon
.authmon()->validate_osd_new(id
, uuid
,
9679 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9680 // for this to be idempotent, `id` should already be >= 0; no need
9681 // to use validate_id.
9682 ceph_assert(id
>= 0);
9683 ss
<< "osd." << id
<< " exists but secrets do not match";
9689 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9692 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9693 ceph_assert(id
>= 0);
9694 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9699 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9700 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9702 if (may_be_idempotent
) {
9703 // we have nothing to do for either the osdmon or the authmon,
9704 // and we have no lockbox - so the config key service will not be
9705 // touched. This is therefore an idempotent operation, and we can
9706 // just return right away.
9707 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9708 ceph_assert(id
>= 0);
9710 f
->open_object_section("created_osd");
9711 f
->dump_int("osdid", id
);
9718 ceph_assert(!may_be_idempotent
);
9722 ceph_assert(!cephx_secret
.empty());
9723 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9724 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9726 err
= mon
.authmon()->do_osd_new(cephx_entity
,
9729 ceph_assert(0 == err
);
9732 ceph_assert(nullptr != svc
);
9733 svc
->do_osd_new(uuid
, dmcrypt_key
);
9737 if (is_recreate_destroyed
) {
9738 ceph_assert(id
>= 0);
9739 ceph_assert(osdmap
.is_destroyed(id
));
9740 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9741 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9742 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9744 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9745 // due to http://tracker.ceph.com/issues/20751 some clusters may
9746 // have UP set for non-existent OSDs; make sure it is cleared
9747 // for a newly created osd.
9748 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9750 pending_inc
.new_uuid
[id
] = uuid
;
9752 ceph_assert(id
>= 0);
9753 int32_t new_id
= -1;
9754 do_osd_create(id
, uuid
, device_class
, &new_id
);
9755 ceph_assert(new_id
>= 0);
9756 ceph_assert(id
== new_id
);
9760 f
->open_object_section("created_osd");
9761 f
->dump_int("osdid", id
);
9770 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9772 op
->mark_osdmon_event(__func__
);
9773 auto m
= op
->get_req
<MMonCommand
>();
9776 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9777 string rs
= ss
.str();
9778 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
9779 return false; /* nothing to propose */
9782 MonSession
*session
= op
->get_session();
9784 derr
<< __func__
<< " no session" << dendl
;
9785 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
9786 return false; /* nothing to propose */
9789 return prepare_command_impl(op
, cmdmap
);
9792 static int parse_reweights(CephContext
*cct
,
9793 const cmdmap_t
& cmdmap
,
9794 const OSDMap
& osdmap
,
9795 map
<int32_t, uint32_t>* weights
)
9798 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9801 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9802 json_spirit::mValue json_value
;
9803 if (!json_spirit::read(weights_str
, json_value
)) {
9806 if (json_value
.type() != json_spirit::obj_type
) {
9809 const auto obj
= json_value
.get_obj();
9811 for (auto& osd_weight
: obj
) {
9812 auto osd_id
= std::stoi(osd_weight
.first
);
9813 if (!osdmap
.exists(osd_id
)) {
9816 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9819 auto weight
= std::stoul(osd_weight
.second
.get_str());
9820 weights
->insert({osd_id
, weight
});
9822 } catch (const std::logic_error
& e
) {
9828 int OSDMonitor::prepare_command_osd_destroy(
9832 ceph_assert(paxos
.is_plugged());
9834 // we check if the osd exists for the benefit of `osd purge`, which may
9835 // have previously removed the osd. If the osd does not exist, return
9836 // -ENOENT to convey this, and let the caller deal with it.
9838 // we presume that all auth secrets and config keys were removed prior
9839 // to this command being called. if they exist by now, we also assume
9840 // they must have been created by some other command and do not pertain
9841 // to this non-existent osd.
9842 if (!osdmap
.exists(id
)) {
9843 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9847 uuid_d uuid
= osdmap
.get_uuid(id
);
9848 dout(10) << __func__
<< " destroying osd." << id
9849 << " uuid " << uuid
<< dendl
;
9851 // if it has been destroyed, we assume our work here is done.
9852 if (osdmap
.is_destroyed(id
)) {
9853 ss
<< "destroyed osd." << id
;
9857 EntityName cephx_entity
, lockbox_entity
;
9858 bool idempotent_auth
= false, idempotent_cks
= false;
9860 int err
= mon
.authmon()->validate_osd_destroy(id
, uuid
,
9865 if (err
== -ENOENT
) {
9866 idempotent_auth
= true;
9872 auto svc
= mon
.kvmon();
9873 err
= svc
->validate_osd_destroy(id
, uuid
);
9875 ceph_assert(err
== -ENOENT
);
9877 idempotent_cks
= true;
9880 if (!idempotent_auth
) {
9881 err
= mon
.authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9882 ceph_assert(0 == err
);
9885 if (!idempotent_cks
) {
9886 svc
->do_osd_destroy(id
, uuid
);
9889 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9890 pending_inc
.new_uuid
[id
] = uuid_d();
9892 // we can only propose_pending() once per service, otherwise we'll be
9893 // defying PaxosService and all laws of nature. Therefore, as we may
9894 // be used during 'osd purge', let's keep the caller responsible for
9896 ceph_assert(err
== 0);
9900 int OSDMonitor::prepare_command_osd_purge(
9904 ceph_assert(paxos
.is_plugged());
9905 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9907 ceph_assert(!osdmap
.is_up(id
));
9910 * This may look a bit weird, but this is what's going to happen:
9912 * 1. we make sure that removing from crush works
9913 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9914 * error, then we abort the whole operation, as no updates
9915 * have been made. However, we this function will have
9916 * side-effects, thus we need to make sure that all operations
9917 * performed henceforth will *always* succeed.
9918 * 3. we call `prepare_command_osd_remove()`. Although this
9919 * function can return an error, it currently only checks if the
9920 * osd is up - and we have made sure that it is not so, so there
9921 * is no conflict, and it is effectively an update.
9922 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9923 * the crush update we delayed from before.
9926 CrushWrapper newcrush
= _get_pending_crush();
9928 bool may_be_idempotent
= false;
9930 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9931 if (err
== -ENOENT
) {
9933 may_be_idempotent
= true;
9934 } else if (err
< 0) {
9935 ss
<< "error removing osd." << id
<< " from crush";
9939 // no point destroying the osd again if it has already been marked destroyed
9940 if (!osdmap
.is_destroyed(id
)) {
9941 err
= prepare_command_osd_destroy(id
, ss
);
9943 if (err
== -ENOENT
) {
9949 may_be_idempotent
= false;
9952 ceph_assert(0 == err
);
9954 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9955 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9956 << "we are idempotent." << dendl
;
9960 err
= prepare_command_osd_remove(id
);
9961 // we should not be busy, as we should have made sure this id is not up.
9962 ceph_assert(0 == err
);
9964 do_osd_crush_remove(newcrush
);
9968 int OSDMonitor::parse_pgid(const cmdmap_t
& cmdmap
, stringstream
&ss
,
9969 /* out */ pg_t
&pgid
, std::optional
<string
> pgids
) {
9971 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
9972 ss
<< "unable to parse 'pgid' value '"
9973 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
9976 if (!pgid
.parse(pgidstr
.c_str())) {
9977 ss
<< "invalid pgid '" << pgidstr
<< "'";
9980 if (!osdmap
.pg_exists(pgid
)) {
9981 ss
<< "pgid '" << pgid
<< "' does not exist";
9984 if (pgids
.has_value())
9985 pgids
.value() = pgidstr
;
9989 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9990 const cmdmap_t
& cmdmap
)
9992 op
->mark_osdmon_event(__func__
);
9993 auto m
= op
->get_req
<MMonCommand
>();
9999 string format
= cmd_getval_or
<string
>(cmdmap
, "format", "plain");
10000 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
10003 cmd_getval(cmdmap
, "prefix", prefix
);
10007 bool osdid_present
= false;
10008 if (prefix
!= "osd pg-temp" &&
10009 prefix
!= "osd pg-upmap" &&
10010 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
10011 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
10013 if (osdid_present
) {
10015 oss
<< "osd." << osdid
;
10016 osd_name
= oss
.str();
10019 // Even if there's a pending state with changes that could affect
10020 // a command, considering that said state isn't yet committed, we
10021 // just don't care about those changes if the command currently being
10022 // handled acts as a no-op against the current committed state.
10023 // In a nutshell, we assume this command happens *before*.
10025 // Let me make this clearer:
10027 // - If we have only one client, and that client issues some
10028 // operation that would conflict with this operation but is
10029 // still on the pending state, then we would be sure that said
10030 // operation wouldn't have returned yet, so the client wouldn't
10031 // issue this operation (unless the client didn't wait for the
10032 // operation to finish, and that would be the client's own fault).
10034 // - If we have more than one client, each client will observe
10035 // whatever is the state at the moment of the commit. So, if we
10036 // have two clients, one issuing an unlink and another issuing a
10037 // link, and if the link happens while the unlink is still on the
10038 // pending state, from the link's point-of-view this is a no-op.
10039 // If different clients are issuing conflicting operations and
10040 // they care about that, then the clients should make sure they
10041 // enforce some kind of concurrency mechanism -- from our
10042 // perspective that's what Douglas Adams would call an SEP.
10044 // This should be used as a general guideline for most commands handled
10045 // in this function. Adapt as you see fit, but please bear in mind that
10046 // this is the expected behavior.
10049 if (prefix
== "osd setcrushmap" ||
10050 (prefix
== "osd crush set" && !osdid_present
)) {
10051 if (pending_inc
.crush
.length()) {
10052 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
10055 dout(10) << "prepare_command setting new crush map" << dendl
;
10056 bufferlist
data(m
->get_data());
10057 CrushWrapper crush
;
10059 auto bl
= data
.cbegin();
10062 catch (const std::exception
&e
) {
10064 ss
<< "Failed to parse crushmap: " << e
.what();
10065 goto reply_no_propose
;
10068 int64_t prior_version
= 0;
10069 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
10070 if (prior_version
== osdmap
.get_crush_version() - 1) {
10071 // see if we are a resend of the last update. this is imperfect
10072 // (multiple racing updaters may not both get reliable success)
10073 // but we expect crush updaters (via this interface) to be rare-ish.
10074 bufferlist current
, proposed
;
10075 osdmap
.crush
->encode(current
, mon
.get_quorum_con_features());
10076 crush
.encode(proposed
, mon
.get_quorum_con_features());
10077 if (current
.contents_equal(proposed
)) {
10078 dout(10) << __func__
10079 << " proposed matches current and version equals previous"
10082 ss
<< osdmap
.get_crush_version();
10083 goto reply_no_propose
;
10086 if (prior_version
!= osdmap
.get_crush_version()) {
10088 ss
<< "prior_version " << prior_version
<< " != crush version "
10089 << osdmap
.get_crush_version();
10090 goto reply_no_propose
;
10094 if (!validate_crush_against_features(&crush
, ss
)) {
10096 goto reply_no_propose
;
10099 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
10101 goto reply_no_propose
;
10104 if (g_conf()->mon_osd_crush_smoke_test
) {
10105 // sanity check: test some inputs to make sure this map isn't
10107 dout(10) << " testing map" << dendl
;
10109 CrushTester
tester(crush
, ess
);
10110 tester
.set_min_x(0);
10111 tester
.set_max_x(50);
10112 tester
.set_num_rep(3); // arbitrary
10113 auto start
= ceph::coarse_mono_clock::now();
10114 int r
= tester
.test_with_fork(cct
, g_conf()->mon_lease
);
10115 auto duration
= ceph::coarse_mono_clock::now() - start
;
10117 dout(10) << " tester.test_with_fork returns " << r
10118 << ": " << ess
.str() << dendl
;
10119 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
10121 goto reply_no_propose
;
10123 dout(10) << __func__
<< " crush somke test duration: "
10124 << duration
<< ", result: " << ess
.str() << dendl
;
10127 pending_inc
.crush
= data
;
10128 ss
<< osdmap
.get_crush_version() + 1;
10131 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
10132 CrushWrapper newcrush
= _get_pending_crush();
10133 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
10135 if (newcrush
.bucket_exists(bid
) &&
10136 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
10137 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
10138 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
10141 if (!validate_crush_against_features(&newcrush
, ss
)) {
10143 goto reply_no_propose
;
10145 pending_inc
.crush
.clear();
10146 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10147 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10148 get_last_committed() + 1));
10150 } else if (prefix
== "osd crush set-device-class") {
10151 string device_class
;
10152 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10153 err
= -EINVAL
; // no value!
10154 goto reply_no_propose
;
10158 vector
<string
> idvec
;
10159 cmd_getval(cmdmap
, "ids", idvec
);
10160 CrushWrapper newcrush
= _get_pending_crush();
10162 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10166 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10167 osdmap
.get_all_osds(osds
);
10170 // try traditional single osd way
10171 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10173 // ss has reason for failure
10174 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
10181 for (auto &osd
: osds
) {
10182 if (!osdmap
.exists(osd
)) {
10183 ss
<< "osd." << osd
<< " does not exist. ";
10188 oss
<< "osd." << osd
;
10189 string name
= oss
.str();
10191 if (newcrush
.get_max_devices() < osd
+ 1) {
10192 newcrush
.set_max_devices(osd
+ 1);
10195 if (newcrush
.item_exists(osd
)) {
10196 action
= "updating";
10198 action
= "creating";
10199 newcrush
.set_item_name(osd
, name
);
10202 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
10203 << "' device_class '" << device_class
<< "'"
10205 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
10207 goto reply_no_propose
;
10209 if (err
== 0 && !_have_pending_crush()) {
10211 // for single osd only, wildcard makes too much noise
10212 ss
<< "set-device-class item id " << osd
<< " name '" << name
10213 << "' device_class '" << device_class
<< "': no change. ";
10216 updated
.insert(osd
);
10221 pending_inc
.crush
.clear();
10222 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10223 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
10225 wait_for_finished_proposal(
10227 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10229 } else if (prefix
== "osd crush rm-device-class") {
10231 vector
<string
> idvec
;
10232 cmd_getval(cmdmap
, "ids", idvec
);
10233 CrushWrapper newcrush
= _get_pending_crush();
10236 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
10241 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
10242 osdmap
.get_all_osds(osds
);
10245 // try traditional single osd way
10246 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
10248 // ss has reason for failure
10249 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
10251 goto reply_no_propose
;
10256 for (auto &osd
: osds
) {
10257 if (!osdmap
.exists(osd
)) {
10258 ss
<< "osd." << osd
<< " does not exist. ";
10262 auto class_name
= newcrush
.get_item_class(osd
);
10264 ss
<< "osd." << osd
<< " belongs to no class, ";
10267 // note that we do not verify if class_is_in_use here
10268 // in case the device is misclassified and user wants
10269 // to overridely reset...
10271 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
10273 // ss has reason for failure
10274 goto reply_no_propose
;
10276 updated
.insert(osd
);
10280 pending_inc
.crush
.clear();
10281 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10282 ss
<< "done removing class of osd(s): " << updated
;
10284 wait_for_finished_proposal(
10286 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10288 } else if (prefix
== "osd crush class create") {
10289 string device_class
;
10290 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10291 err
= -EINVAL
; // no value!
10292 goto reply_no_propose
;
10294 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10295 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10296 << "luminous' before using crush device classes";
10298 goto reply_no_propose
;
10300 if (!_have_pending_crush() &&
10301 _get_stable_crush().class_exists(device_class
)) {
10302 ss
<< "class '" << device_class
<< "' already exists";
10303 goto reply_no_propose
;
10305 CrushWrapper newcrush
= _get_pending_crush();
10306 if (newcrush
.class_exists(device_class
)) {
10307 ss
<< "class '" << device_class
<< "' already exists";
10310 int class_id
= newcrush
.get_or_create_class_id(device_class
);
10311 pending_inc
.crush
.clear();
10312 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10313 ss
<< "created class " << device_class
<< " with id " << class_id
10314 << " to crush map";
10316 } else if (prefix
== "osd crush class rm") {
10317 string device_class
;
10318 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10319 err
= -EINVAL
; // no value!
10320 goto reply_no_propose
;
10322 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10323 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10324 << "luminous' before using crush device classes";
10326 goto reply_no_propose
;
10329 if (!osdmap
.crush
->class_exists(device_class
)) {
10331 goto reply_no_propose
;
10334 CrushWrapper newcrush
= _get_pending_crush();
10335 if (!newcrush
.class_exists(device_class
)) {
10336 err
= 0; // make command idempotent
10339 int class_id
= newcrush
.get_class_id(device_class
);
10341 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
10343 ss
<< "class '" << device_class
<< "' " << ts
.str();
10344 goto reply_no_propose
;
10347 // check if class is used by any erasure-code-profiles
10348 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
10349 osdmap
.get_erasure_code_profiles();
10350 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
10351 #ifdef HAVE_STDLIB_MAP_SPLICING
10352 ec_profiles
.merge(old_ec_profiles
);
10354 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
10355 make_move_iterator(end(old_ec_profiles
)));
10357 list
<string
> referenced_by
;
10358 for (auto &i
: ec_profiles
) {
10359 for (auto &j
: i
.second
) {
10360 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
10361 referenced_by
.push_back(i
.first
);
10365 if (!referenced_by
.empty()) {
10367 ss
<< "class '" << device_class
10368 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
10369 goto reply_no_propose
;
10373 newcrush
.get_devices_by_class(device_class
, &osds
);
10374 for (auto& p
: osds
) {
10375 err
= newcrush
.remove_device_class(cct
, p
, &ss
);
10377 // ss has reason for failure
10378 goto reply_no_propose
;
10382 if (osds
.empty()) {
10383 // empty class, remove directly
10384 err
= newcrush
.remove_class_name(device_class
);
10386 ss
<< "class '" << device_class
<< "' cannot be removed '"
10387 << cpp_strerror(err
) << "'";
10388 goto reply_no_propose
;
10392 pending_inc
.crush
.clear();
10393 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10394 ss
<< "removed class " << device_class
<< " with id " << class_id
10395 << " from crush map";
10397 } else if (prefix
== "osd crush class rename") {
10398 string srcname
, dstname
;
10399 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
10401 goto reply_no_propose
;
10403 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
10405 goto reply_no_propose
;
10408 CrushWrapper newcrush
= _get_pending_crush();
10409 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
10410 // suppose this is a replay and return success
10411 // so command is idempotent
10412 ss
<< "already renamed to '" << dstname
<< "'";
10414 goto reply_no_propose
;
10417 err
= newcrush
.rename_class(srcname
, dstname
);
10419 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
10420 << cpp_strerror(err
);
10421 goto reply_no_propose
;
10424 pending_inc
.crush
.clear();
10425 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10426 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
10428 } else if (prefix
== "osd crush add-bucket") {
10429 // os crush add-bucket <name> <type>
10430 string name
, typestr
;
10431 vector
<string
> argvec
;
10432 cmd_getval(cmdmap
, "name", name
);
10433 cmd_getval(cmdmap
, "type", typestr
);
10434 cmd_getval(cmdmap
, "args", argvec
);
10435 map
<string
,string
> loc
;
10436 if (!argvec
.empty()) {
10437 CrushWrapper::parse_loc_map(argvec
, &loc
);
10438 dout(0) << "will create and move bucket '" << name
10439 << "' to location " << loc
<< dendl
;
10442 if (!_have_pending_crush() &&
10443 _get_stable_crush().name_exists(name
)) {
10444 ss
<< "bucket '" << name
<< "' already exists";
10445 goto reply_no_propose
;
10448 CrushWrapper newcrush
= _get_pending_crush();
10450 if (newcrush
.name_exists(name
)) {
10451 ss
<< "bucket '" << name
<< "' already exists";
10454 int type
= newcrush
.get_type_id(typestr
);
10456 ss
<< "type '" << typestr
<< "' does not exist";
10458 goto reply_no_propose
;
10461 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
10463 goto reply_no_propose
;
10466 err
= newcrush
.add_bucket(0, 0,
10467 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
10470 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
10471 goto reply_no_propose
;
10473 err
= newcrush
.set_item_name(bucketno
, name
);
10475 ss
<< "error setting bucket name to '" << name
<< "'";
10476 goto reply_no_propose
;
10479 if (!loc
.empty()) {
10480 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
10482 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
10484 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
10485 goto reply_no_propose
;
10488 ss
<< "no need to move item id " << bucketno
<< " name '" << name
10489 << "' to location " << loc
<< " in crush map";
10493 pending_inc
.crush
.clear();
10494 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10496 ss
<< "added bucket " << name
<< " type " << typestr
10497 << " to crush map";
10499 ss
<< "added bucket " << name
<< " type " << typestr
10500 << " to location " << loc
;
10503 } else if (prefix
== "osd crush rename-bucket") {
10504 string srcname
, dstname
;
10505 cmd_getval(cmdmap
, "srcname", srcname
);
10506 cmd_getval(cmdmap
, "dstname", dstname
);
10508 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
10510 // equivalent to success for idempotency
10511 if (err
== -EALREADY
) {
10514 goto reply_no_propose
;
10518 } else if (prefix
== "osd crush weight-set create" ||
10519 prefix
== "osd crush weight-set create-compat") {
10520 if (_have_pending_crush()) {
10521 dout(10) << " first waiting for pending crush changes to commit" << dendl
;
10524 CrushWrapper newcrush
= _get_pending_crush();
10527 if (newcrush
.has_non_straw2_buckets()) {
10528 ss
<< "crush map contains one or more bucket(s) that are not straw2";
10530 goto reply_no_propose
;
10532 if (prefix
== "osd crush weight-set create") {
10533 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
10534 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
10535 ss
<< "require_min_compat_client "
10536 << osdmap
.require_min_compat_client
10537 << " < luminous, which is required for per-pool weight-sets. "
10538 << "Try 'ceph osd set-require-min-compat-client luminous' "
10539 << "before using the new interface";
10541 goto reply_no_propose
;
10543 string poolname
, mode
;
10544 cmd_getval(cmdmap
, "pool", poolname
);
10545 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10547 ss
<< "pool '" << poolname
<< "' not found";
10549 goto reply_no_propose
;
10551 cmd_getval(cmdmap
, "mode", mode
);
10552 if (mode
!= "flat" && mode
!= "positional") {
10553 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10555 goto reply_no_propose
;
10557 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10559 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10562 if (!newcrush
.create_choose_args(pool
, positions
)) {
10563 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10564 ss
<< "compat weight-set already created";
10566 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10567 << "' already created";
10569 goto reply_no_propose
;
10571 pending_inc
.crush
.clear();
10572 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10575 } else if (prefix
== "osd crush weight-set rm" ||
10576 prefix
== "osd crush weight-set rm-compat") {
10577 CrushWrapper newcrush
= _get_pending_crush();
10579 if (prefix
== "osd crush weight-set rm") {
10581 cmd_getval(cmdmap
, "pool", poolname
);
10582 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10584 ss
<< "pool '" << poolname
<< "' not found";
10586 goto reply_no_propose
;
10589 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10591 newcrush
.rm_choose_args(pool
);
10592 pending_inc
.crush
.clear();
10593 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10596 } else if (prefix
== "osd crush weight-set reweight" ||
10597 prefix
== "osd crush weight-set reweight-compat") {
10598 string poolname
, item
;
10599 vector
<double> weight
;
10600 cmd_getval(cmdmap
, "pool", poolname
);
10601 cmd_getval(cmdmap
, "item", item
);
10602 cmd_getval(cmdmap
, "weight", weight
);
10603 CrushWrapper newcrush
= _get_pending_crush();
10605 if (prefix
== "osd crush weight-set reweight") {
10606 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10608 ss
<< "pool '" << poolname
<< "' not found";
10610 goto reply_no_propose
;
10612 if (!newcrush
.have_choose_args(pool
)) {
10613 ss
<< "no weight-set for pool '" << poolname
<< "'";
10615 goto reply_no_propose
;
10617 auto arg_map
= newcrush
.choose_args_get(pool
);
10618 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10619 if (weight
.size() != (size_t)positions
) {
10620 ss
<< "must specify exact " << positions
<< " weight values";
10622 goto reply_no_propose
;
10625 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10626 if (!newcrush
.have_choose_args(pool
)) {
10627 ss
<< "no backward-compatible weight-set";
10629 goto reply_no_propose
;
10632 if (!newcrush
.name_exists(item
)) {
10633 ss
<< "item '" << item
<< "' does not exist";
10635 goto reply_no_propose
;
10637 err
= newcrush
.choose_args_adjust_item_weightf(
10639 newcrush
.choose_args_get(pool
),
10640 newcrush
.get_item_id(item
),
10644 goto reply_no_propose
;
10647 pending_inc
.crush
.clear();
10648 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10650 } else if (osdid_present
&&
10651 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10652 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10653 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10654 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10656 if (!osdmap
.exists(osdid
)) {
10659 << " does not exist. Create it before updating the crush map";
10660 goto reply_no_propose
;
10664 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10665 ss
<< "unable to parse weight value '"
10666 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10668 goto reply_no_propose
;
10672 vector
<string
> argvec
;
10673 cmd_getval(cmdmap
, "args", argvec
);
10674 map
<string
,string
> loc
;
10675 CrushWrapper::parse_loc_map(argvec
, &loc
);
10677 if (prefix
== "osd crush set"
10678 && !_get_stable_crush().item_exists(osdid
)) {
10680 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10681 << "' weight " << weight
<< " at location " << loc
10682 << ": does not exist";
10683 goto reply_no_propose
;
10686 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10687 << osd_name
<< "' weight " << weight
<< " at location "
10689 CrushWrapper newcrush
= _get_pending_crush();
10692 if (prefix
== "osd crush set" ||
10693 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10695 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10698 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10704 goto reply_no_propose
;
10706 if (err
== 0 && !_have_pending_crush()) {
10707 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10708 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10709 goto reply_no_propose
;
10712 pending_inc
.crush
.clear();
10713 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10714 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10715 << weight
<< " at location " << loc
<< " to crush map";
10717 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10718 get_last_committed() + 1));
10721 } else if (prefix
== "osd crush create-or-move") {
10723 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10724 if (!osdmap
.exists(osdid
)) {
10727 << " does not exist. create it before updating the crush map";
10728 goto reply_no_propose
;
10732 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10733 ss
<< "unable to parse weight value '"
10734 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10736 goto reply_no_propose
;
10740 vector
<string
> argvec
;
10741 cmd_getval(cmdmap
, "args", argvec
);
10742 map
<string
,string
> loc
;
10743 CrushWrapper::parse_loc_map(argvec
, &loc
);
10745 dout(0) << "create-or-move crush item name '" << osd_name
10746 << "' initial_weight " << weight
<< " at location " << loc
10749 CrushWrapper newcrush
= _get_pending_crush();
10751 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10752 g_conf()->osd_crush_update_weight_set
);
10754 ss
<< "create-or-move updated item name '" << osd_name
10755 << "' weight " << weight
10756 << " at location " << loc
<< " to crush map";
10760 pending_inc
.crush
.clear();
10761 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10762 ss
<< "create-or-move updating item name '" << osd_name
10763 << "' weight " << weight
10764 << " at location " << loc
<< " to crush map";
10766 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10767 get_last_committed() + 1));
10772 } else if (prefix
== "osd crush move") {
10774 // osd crush move <name> <loc1> [<loc2> ...]
10776 vector
<string
> argvec
;
10777 cmd_getval(cmdmap
, "name", name
);
10778 cmd_getval(cmdmap
, "args", argvec
);
10779 map
<string
,string
> loc
;
10780 CrushWrapper::parse_loc_map(argvec
, &loc
);
10782 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10783 CrushWrapper newcrush
= _get_pending_crush();
10785 if (!newcrush
.name_exists(name
)) {
10787 ss
<< "item " << name
<< " does not exist";
10790 int id
= newcrush
.get_item_id(name
);
10792 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10794 err
= newcrush
.create_or_move_item(
10795 cct
, id
, 0, name
, loc
,
10796 g_conf()->osd_crush_update_weight_set
);
10798 err
= newcrush
.move_bucket(cct
, id
, loc
);
10801 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10802 pending_inc
.crush
.clear();
10803 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10805 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10806 get_last_committed() + 1));
10810 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10814 } else if (prefix
== "osd crush swap-bucket") {
10815 string source
, dest
;
10816 cmd_getval(cmdmap
, "source", source
);
10817 cmd_getval(cmdmap
, "dest", dest
);
10819 bool force
= false;
10820 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10822 CrushWrapper newcrush
= _get_pending_crush();
10823 if (!newcrush
.name_exists(source
)) {
10824 ss
<< "source item " << source
<< " does not exist";
10826 goto reply_no_propose
;
10828 if (!newcrush
.name_exists(dest
)) {
10829 ss
<< "dest item " << dest
<< " does not exist";
10831 goto reply_no_propose
;
10833 int sid
= newcrush
.get_item_id(source
);
10834 int did
= newcrush
.get_item_id(dest
);
10836 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10837 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10839 goto reply_no_propose
;
10841 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10843 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10844 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10845 << "; pass --yes-i-really-mean-it to proceed anyway";
10847 goto reply_no_propose
;
10849 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10851 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10853 goto reply_no_propose
;
10855 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10856 pending_inc
.crush
.clear();
10857 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10858 wait_for_finished_proposal(op
,
10859 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10860 get_last_committed() + 1));
10862 } else if (prefix
== "osd crush link") {
10863 // osd crush link <name> <loc1> [<loc2> ...]
10865 cmd_getval(cmdmap
, "name", name
);
10866 vector
<string
> argvec
;
10867 cmd_getval(cmdmap
, "args", argvec
);
10868 map
<string
,string
> loc
;
10869 CrushWrapper::parse_loc_map(argvec
, &loc
);
10871 // Need an explicit check for name_exists because get_item_id returns
10873 int id
= osdmap
.crush
->get_item_id(name
);
10874 if (!osdmap
.crush
->name_exists(name
)) {
10876 ss
<< "item " << name
<< " does not exist";
10877 goto reply_no_propose
;
10879 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10881 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10882 ss
<< "no need to move item id " << id
<< " name '" << name
10883 << "' to location " << loc
<< " in crush map";
10885 goto reply_no_propose
;
10888 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10889 CrushWrapper newcrush
= _get_pending_crush();
10891 if (!newcrush
.name_exists(name
)) {
10893 ss
<< "item " << name
<< " does not exist";
10894 goto reply_no_propose
;
10896 int id
= newcrush
.get_item_id(name
);
10897 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10898 err
= newcrush
.link_bucket(cct
, id
, loc
);
10900 ss
<< "linked item id " << id
<< " name '" << name
10901 << "' to location " << loc
<< " in crush map";
10902 pending_inc
.crush
.clear();
10903 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10905 ss
<< "cannot link item id " << id
<< " name '" << name
10906 << "' to location " << loc
;
10907 goto reply_no_propose
;
10910 ss
<< "no need to move item id " << id
<< " name '" << name
10911 << "' to location " << loc
<< " in crush map";
10915 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10916 get_last_committed() + 1));
10918 } else if (prefix
== "osd crush rm" ||
10919 prefix
== "osd crush remove" ||
10920 prefix
== "osd crush unlink") {
10922 // osd crush rm <id> [ancestor]
10923 CrushWrapper newcrush
= _get_pending_crush();
10926 cmd_getval(cmdmap
, "name", name
);
10928 if (!osdmap
.crush
->name_exists(name
)) {
10930 ss
<< "device '" << name
<< "' does not appear in the crush map";
10933 if (!newcrush
.name_exists(name
)) {
10935 ss
<< "device '" << name
<< "' does not appear in the crush map";
10937 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10938 get_last_committed() + 1));
10941 int id
= newcrush
.get_item_id(name
);
10944 bool unlink_only
= prefix
== "osd crush unlink";
10945 string ancestor_str
;
10946 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10947 if (!newcrush
.name_exists(ancestor_str
)) {
10949 ss
<< "ancestor item '" << ancestor_str
10950 << "' does not appear in the crush map";
10953 ancestor
= newcrush
.get_item_id(ancestor_str
);
10956 err
= prepare_command_osd_crush_remove(
10959 (ancestor
< 0), unlink_only
);
10961 if (err
== -ENOENT
) {
10962 ss
<< "item " << id
<< " does not appear in that position";
10968 pending_inc
.new_crush_node_flags
[id
] = 0;
10969 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10971 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10972 get_last_committed() + 1));
10977 } else if (prefix
== "osd crush reweight-all") {
10978 CrushWrapper newcrush
= _get_pending_crush();
10980 newcrush
.reweight(cct
);
10981 pending_inc
.crush
.clear();
10982 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10983 ss
<< "reweighted crush hierarchy";
10985 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10986 get_last_committed() + 1));
10988 } else if (prefix
== "osd crush reweight") {
10989 // osd crush reweight <name> <weight>
10990 CrushWrapper newcrush
= _get_pending_crush();
10993 cmd_getval(cmdmap
, "name", name
);
10994 if (!newcrush
.name_exists(name
)) {
10996 ss
<< "device '" << name
<< "' does not appear in the crush map";
10997 goto reply_no_propose
;
11000 int id
= newcrush
.get_item_id(name
);
11002 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
11004 goto reply_no_propose
;
11007 if (!cmd_getval(cmdmap
, "weight", w
)) {
11008 ss
<< "unable to parse weight value '"
11009 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11011 goto reply_no_propose
;
11014 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
11015 g_conf()->osd_crush_update_weight_set
);
11017 goto reply_no_propose
;
11018 pending_inc
.crush
.clear();
11019 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11020 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
11021 << " in crush map";
11023 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11024 get_last_committed() + 1));
11026 } else if (prefix
== "osd crush reweight-subtree") {
11027 // osd crush reweight <name> <weight>
11028 CrushWrapper newcrush
= _get_pending_crush();
11031 cmd_getval(cmdmap
, "name", name
);
11032 if (!newcrush
.name_exists(name
)) {
11034 ss
<< "device '" << name
<< "' does not appear in the crush map";
11035 goto reply_no_propose
;
11038 int id
= newcrush
.get_item_id(name
);
11040 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
11042 goto reply_no_propose
;
11045 if (!cmd_getval(cmdmap
, "weight", w
)) {
11046 ss
<< "unable to parse weight value '"
11047 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11049 goto reply_no_propose
;
11052 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
11053 g_conf()->osd_crush_update_weight_set
);
11055 goto reply_no_propose
;
11056 pending_inc
.crush
.clear();
11057 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11058 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
11059 << " in crush map";
11061 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11062 get_last_committed() + 1));
11064 } else if (prefix
== "osd crush tunables") {
11065 CrushWrapper newcrush
= _get_pending_crush();
11069 cmd_getval(cmdmap
, "profile", profile
);
11070 if (profile
== "legacy" || profile
== "argonaut") {
11071 newcrush
.set_tunables_legacy();
11072 } else if (profile
== "bobtail") {
11073 newcrush
.set_tunables_bobtail();
11074 } else if (profile
== "firefly") {
11075 newcrush
.set_tunables_firefly();
11076 } else if (profile
== "hammer") {
11077 newcrush
.set_tunables_hammer();
11078 } else if (profile
== "jewel") {
11079 newcrush
.set_tunables_jewel();
11080 } else if (profile
== "optimal") {
11081 newcrush
.set_tunables_optimal();
11082 } else if (profile
== "default") {
11083 newcrush
.set_tunables_default();
11085 ss
<< "unrecognized profile '" << profile
<< "'";
11087 goto reply_no_propose
;
11090 if (!validate_crush_against_features(&newcrush
, ss
)) {
11092 goto reply_no_propose
;
11095 pending_inc
.crush
.clear();
11096 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11097 ss
<< "adjusted tunables profile to " << profile
;
11099 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11100 get_last_committed() + 1));
11102 } else if (prefix
== "osd crush set-tunable") {
11103 CrushWrapper newcrush
= _get_pending_crush();
11107 cmd_getval(cmdmap
, "tunable", tunable
);
11109 int64_t value
= -1;
11110 if (!cmd_getval(cmdmap
, "value", value
)) {
11112 ss
<< "failed to parse integer value "
11113 << cmd_vartype_stringify(cmdmap
.at("value"));
11114 goto reply_no_propose
;
11117 if (tunable
== "straw_calc_version") {
11118 if (value
!= 0 && value
!= 1) {
11119 ss
<< "value must be 0 or 1; got " << value
;
11121 goto reply_no_propose
;
11123 newcrush
.set_straw_calc_version(value
);
11125 ss
<< "unrecognized tunable '" << tunable
<< "'";
11127 goto reply_no_propose
;
11130 if (!validate_crush_against_features(&newcrush
, ss
)) {
11132 goto reply_no_propose
;
11135 pending_inc
.crush
.clear();
11136 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11137 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
11139 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11140 get_last_committed() + 1));
11143 } else if (prefix
== "osd crush rule create-simple") {
11144 string name
, root
, type
, mode
;
11145 cmd_getval(cmdmap
, "name", name
);
11146 cmd_getval(cmdmap
, "root", root
);
11147 cmd_getval(cmdmap
, "type", type
);
11148 cmd_getval(cmdmap
, "mode", mode
);
11152 if (osdmap
.crush
->rule_exists(name
)) {
11153 // The name is uniquely associated to a ruleid and the rule it contains
11154 // From the user point of view, the rule is more meaningfull.
11155 ss
<< "rule " << name
<< " already exists";
11157 goto reply_no_propose
;
11160 CrushWrapper newcrush
= _get_pending_crush();
11162 if (newcrush
.rule_exists(name
)) {
11163 // The name is uniquely associated to a ruleid and the rule it contains
11164 // From the user point of view, the rule is more meaningfull.
11165 ss
<< "rule " << name
<< " already exists";
11168 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
11169 pg_pool_t::TYPE_REPLICATED
, &ss
);
11172 goto reply_no_propose
;
11175 pending_inc
.crush
.clear();
11176 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11179 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11180 get_last_committed() + 1));
11183 } else if (prefix
== "osd crush rule create-replicated") {
11184 string name
, root
, type
, device_class
;
11185 cmd_getval(cmdmap
, "name", name
);
11186 cmd_getval(cmdmap
, "root", root
);
11187 cmd_getval(cmdmap
, "type", type
);
11188 cmd_getval(cmdmap
, "class", device_class
);
11190 if (osdmap
.crush
->rule_exists(name
)) {
11191 // The name is uniquely associated to a ruleid and the rule it contains
11192 // From the user point of view, the rule is more meaningfull.
11193 ss
<< "rule " << name
<< " already exists";
11195 goto reply_no_propose
;
11198 CrushWrapper newcrush
= _get_pending_crush();
11200 if (newcrush
.rule_exists(name
)) {
11201 // The name is uniquely associated to a ruleid and the rule it contains
11202 // From the user point of view, the rule is more meaningfull.
11203 ss
<< "rule " << name
<< " already exists";
11206 int ruleno
= newcrush
.add_simple_rule(
11207 name
, root
, type
, device_class
,
11208 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
11211 goto reply_no_propose
;
11214 pending_inc
.crush
.clear();
11215 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11218 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11219 get_last_committed() + 1));
11222 } else if (prefix
== "osd erasure-code-profile rm") {
11224 cmd_getval(cmdmap
, "name", name
);
11226 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
11229 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
11231 goto reply_no_propose
;
11234 if (osdmap
.has_erasure_code_profile(name
) ||
11235 pending_inc
.new_erasure_code_profiles
.count(name
)) {
11236 if (osdmap
.has_erasure_code_profile(name
)) {
11237 pending_inc
.old_erasure_code_profiles
.push_back(name
);
11239 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
11240 pending_inc
.new_erasure_code_profiles
.erase(name
);
11244 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11245 get_last_committed() + 1));
11248 ss
<< "erasure-code-profile " << name
<< " does not exist";
11250 goto reply_no_propose
;
11253 } else if (prefix
== "osd erasure-code-profile set") {
11255 cmd_getval(cmdmap
, "name", name
);
11256 vector
<string
> profile
;
11257 cmd_getval(cmdmap
, "profile", profile
);
11259 bool force
= false;
11260 cmd_getval(cmdmap
, "force", force
);
11262 map
<string
,string
> profile_map
;
11263 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
11265 goto reply_no_propose
;
11266 if (auto found
= profile_map
.find("crush-failure-domain");
11267 found
!= profile_map
.end()) {
11268 const auto& failure_domain
= found
->second
;
11269 int failure_domain_type
= osdmap
.crush
->get_type_id(failure_domain
);
11270 if (failure_domain_type
< 0) {
11271 ss
<< "erasure-code-profile " << profile_map
11272 << " contains an invalid failure-domain " << std::quoted(failure_domain
);
11274 goto reply_no_propose
;
11278 if (profile_map
.find("plugin") == profile_map
.end()) {
11279 ss
<< "erasure-code-profile " << profile_map
11280 << " must contain a plugin entry" << std::endl
;
11282 goto reply_no_propose
;
11284 string plugin
= profile_map
["plugin"];
11286 if (pending_inc
.has_erasure_code_profile(name
)) {
11287 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
11290 err
= normalize_profile(name
, profile_map
, force
, &ss
);
11292 goto reply_no_propose
;
11294 if (osdmap
.has_erasure_code_profile(name
)) {
11295 ErasureCodeProfile existing_profile_map
=
11296 osdmap
.get_erasure_code_profile(name
);
11297 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
11299 goto reply_no_propose
;
11301 if (existing_profile_map
== profile_map
) {
11303 goto reply_no_propose
;
11307 ss
<< "will not override erasure code profile " << name
11308 << " because the existing profile "
11309 << existing_profile_map
11310 << " is different from the proposed profile "
11312 goto reply_no_propose
;
11316 dout(20) << "erasure code profile set " << name
<< "="
11317 << profile_map
<< dendl
;
11318 pending_inc
.set_erasure_code_profile(name
, profile_map
);
11322 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11323 get_last_committed() + 1));
11326 } else if (prefix
== "osd crush rule create-erasure") {
11327 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
11328 if (err
== -EAGAIN
)
11331 goto reply_no_propose
;
11332 string name
, poolstr
;
11333 cmd_getval(cmdmap
, "name", name
);
11335 cmd_getval(cmdmap
, "profile", profile
);
11337 profile
= "default";
11338 if (profile
== "default") {
11339 if (!osdmap
.has_erasure_code_profile(profile
)) {
11340 if (pending_inc
.has_erasure_code_profile(profile
)) {
11341 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
11345 map
<string
,string
> profile_map
;
11346 err
= osdmap
.get_erasure_code_profile_default(cct
,
11350 goto reply_no_propose
;
11351 err
= normalize_profile(name
, profile_map
, true, &ss
);
11353 goto reply_no_propose
;
11354 dout(20) << "erasure code profile set " << profile
<< "="
11355 << profile_map
<< dendl
;
11356 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
11362 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
11365 case -EEXIST
: // return immediately
11366 ss
<< "rule " << name
<< " already exists";
11368 goto reply_no_propose
;
11369 case -EALREADY
: // wait for pending to be proposed
11370 ss
<< "rule " << name
<< " already exists";
11373 default: // non recoverable error
11374 goto reply_no_propose
;
11377 ss
<< "created rule " << name
<< " at " << rule
;
11381 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11382 get_last_committed() + 1));
11385 } else if (prefix
== "osd crush rule rm") {
11387 cmd_getval(cmdmap
, "name", name
);
11389 if (!osdmap
.crush
->rule_exists(name
)) {
11390 ss
<< "rule " << name
<< " does not exist";
11392 goto reply_no_propose
;
11395 CrushWrapper newcrush
= _get_pending_crush();
11397 if (!newcrush
.rule_exists(name
)) {
11398 ss
<< "rule " << name
<< " does not exist";
11401 int ruleno
= newcrush
.get_rule_id(name
);
11402 ceph_assert(ruleno
>= 0);
11404 // make sure it is not in use.
11405 // FIXME: this is ok in some situations, but let's not bother with that
11407 if (osdmap
.crush_rule_in_use(ruleno
)) {
11408 ss
<< "crush rule " << name
<< " (" << ruleno
<< ") is in use";
11410 goto reply_no_propose
;
11413 err
= newcrush
.remove_rule(ruleno
);
11415 goto reply_no_propose
;
11418 pending_inc
.crush
.clear();
11419 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11422 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11423 get_last_committed() + 1));
11426 } else if (prefix
== "osd crush rule rename") {
11429 cmd_getval(cmdmap
, "srcname", srcname
);
11430 cmd_getval(cmdmap
, "dstname", dstname
);
11431 if (srcname
.empty() || dstname
.empty()) {
11432 ss
<< "must specify both source rule name and destination rule name";
11434 goto reply_no_propose
;
11436 if (srcname
== dstname
) {
11437 ss
<< "destination rule name is equal to source rule name";
11439 goto reply_no_propose
;
11442 CrushWrapper newcrush
= _get_pending_crush();
11443 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
11444 // srcname does not exist and dstname already exists
11445 // suppose this is a replay and return success
11446 // (so this command is idempotent)
11447 ss
<< "already renamed to '" << dstname
<< "'";
11449 goto reply_no_propose
;
11452 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
11454 // ss has reason for failure
11455 goto reply_no_propose
;
11457 pending_inc
.crush
.clear();
11458 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11460 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11461 get_last_committed() + 1));
11464 } else if (prefix
== "osd setmaxosd") {
11466 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
11467 ss
<< "unable to parse 'newmax' value '"
11468 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
11470 goto reply_no_propose
;
11473 if (newmax
> g_conf()->mon_max_osd
) {
11475 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
11476 << g_conf()->mon_max_osd
<< ")";
11477 goto reply_no_propose
;
11480 // Don't allow shrinking OSD number as this will cause data loss
11481 // and may cause kernel crashes.
11482 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11483 if (newmax
< osdmap
.get_max_osd()) {
11484 // Check if the OSDs exist between current max and new value.
11485 // If there are any OSDs exist, then don't allow shrinking number
11487 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
11488 if (osdmap
.exists(i
)) {
11490 ss
<< "cannot shrink max_osd to " << newmax
11491 << " because osd." << i
<< " (and possibly others) still in use";
11492 goto reply_no_propose
;
11497 pending_inc
.new_max_osd
= newmax
;
11498 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
11500 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11501 get_last_committed() + 1));
11504 } else if (prefix
== "osd set-full-ratio" ||
11505 prefix
== "osd set-backfillfull-ratio" ||
11506 prefix
== "osd set-nearfull-ratio") {
11508 if (!cmd_getval(cmdmap
, "ratio", n
)) {
11509 ss
<< "unable to parse 'ratio' value '"
11510 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
11512 goto reply_no_propose
;
11514 if (prefix
== "osd set-full-ratio")
11515 pending_inc
.new_full_ratio
= n
;
11516 else if (prefix
== "osd set-backfillfull-ratio")
11517 pending_inc
.new_backfillfull_ratio
= n
;
11518 else if (prefix
== "osd set-nearfull-ratio")
11519 pending_inc
.new_nearfull_ratio
= n
;
11520 ss
<< prefix
<< " " << n
;
11522 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11523 get_last_committed() + 1));
11525 } else if (prefix
== "osd set-require-min-compat-client") {
11527 cmd_getval(cmdmap
, "version", v
);
11528 ceph_release_t vno
= ceph_release_from_name(v
);
11530 ss
<< "version " << v
<< " is not recognized";
11532 goto reply_no_propose
;
11535 newmap
.deepish_copy_from(osdmap
);
11536 newmap
.apply_incremental(pending_inc
);
11537 newmap
.require_min_compat_client
= vno
;
11538 auto mvno
= newmap
.get_min_compat_client();
11540 ss
<< "osdmap current utilizes features that require " << mvno
11541 << "; cannot set require_min_compat_client below that to " << vno
;
11543 goto reply_no_propose
;
11546 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11549 mon
.get_combined_feature_map(&m
);
11550 uint64_t features
= ceph_release_features(to_integer
<int>(vno
));
11554 CEPH_ENTITY_TYPE_CLIENT
,
11555 CEPH_ENTITY_TYPE_MDS
,
11556 CEPH_ENTITY_TYPE_MGR
}) {
11557 auto p
= m
.m
.find(type
);
11558 if (p
== m
.m
.end()) {
11561 for (auto& q
: p
->second
) {
11562 uint64_t missing
= ~q
.first
& features
;
11565 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11570 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11571 << "(s) look like " << ceph_release_name(
11572 ceph_release_from_features(q
.first
))
11573 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11579 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11581 goto reply_no_propose
;
11584 ss
<< "set require_min_compat_client to " << vno
;
11585 pending_inc
.new_require_min_compat_client
= vno
;
11587 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11588 get_last_committed() + 1));
11590 } else if (prefix
== "osd pause") {
11591 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11593 } else if (prefix
== "osd unpause") {
11594 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11596 } else if (prefix
== "osd set") {
11598 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11601 cmd_getval(cmdmap
, "key", key
);
11602 if (key
== "pause")
11603 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11604 else if (key
== "noup")
11605 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11606 else if (key
== "nodown")
11607 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11608 else if (key
== "noout")
11609 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11610 else if (key
== "noin")
11611 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11612 else if (key
== "nobackfill")
11613 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11614 else if (key
== "norebalance")
11615 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11616 else if (key
== "norecover")
11617 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11618 else if (key
== "noscrub")
11619 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11620 else if (key
== "nodeep-scrub")
11621 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11622 else if (key
== "notieragent")
11623 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11624 else if (key
== "nosnaptrim")
11625 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11626 else if (key
== "pglog_hardlimit") {
11627 if (!osdmap
.get_num_up_osds() && !sure
) {
11628 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11629 << "--yes-i-really-mean-it if you really wish to continue.";
11631 goto reply_no_propose
;
11633 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11634 // we are reusing a jewel feature bit that was retired in luminous.
11635 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11636 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11638 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11640 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11642 goto reply_no_propose
;
11644 } else if (key
== "noautoscale") {
11645 return prepare_set_flag(op
, CEPH_OSDMAP_NOAUTOSCALE
);
11647 ss
<< "unrecognized flag '" << key
<< "'";
11651 } else if (prefix
== "osd unset") {
11653 cmd_getval(cmdmap
, "key", key
);
11654 if (key
== "pause")
11655 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11656 else if (key
== "noup")
11657 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11658 else if (key
== "nodown")
11659 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11660 else if (key
== "noout")
11661 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11662 else if (key
== "noin")
11663 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11664 else if (key
== "nobackfill")
11665 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11666 else if (key
== "norebalance")
11667 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11668 else if (key
== "norecover")
11669 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11670 else if (key
== "noscrub")
11671 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11672 else if (key
== "nodeep-scrub")
11673 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11674 else if (key
== "notieragent")
11675 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11676 else if (key
== "nosnaptrim")
11677 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11678 else if (key
== "noautoscale")
11679 return prepare_unset_flag(op
, CEPH_OSDMAP_NOAUTOSCALE
);
11681 ss
<< "unrecognized flag '" << key
<< "'";
11685 } else if (prefix
== "osd require-osd-release") {
11687 cmd_getval(cmdmap
, "release", release
);
11689 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11690 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11692 ss
<< "unrecognized release " << release
;
11694 goto reply_no_propose
;
11696 if (rel
== osdmap
.require_osd_release
) {
11699 goto reply_no_propose
;
11701 if (osdmap
.require_osd_release
< ceph_release_t::pacific
&& !sure
) {
11702 ss
<< "Not advisable to continue since current 'require_osd_release' "
11703 << "refers to a very old Ceph release. Pass "
11704 << "--yes-i-really-mean-it if you really wish to continue.";
11706 goto reply_no_propose
;
11708 if (!osdmap
.get_num_up_osds() && !sure
) {
11709 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11710 << "--yes-i-really-mean-it if you really wish to continue.";
11712 goto reply_no_propose
;
11714 if (rel
== ceph_release_t::pacific
) {
11715 if (!mon
.monmap
->get_required_features().contains_all(
11716 ceph::features::mon::FEATURE_PACIFIC
)) {
11717 ss
<< "not all mons are pacific";
11719 goto reply_no_propose
;
11721 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_PACIFIC
))
11723 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11725 goto reply_no_propose
;
11727 } else if (rel
== ceph_release_t::quincy
) {
11728 if (!mon
.monmap
->get_required_features().contains_all(
11729 ceph::features::mon::FEATURE_QUINCY
)) {
11730 ss
<< "not all mons are quincy";
11732 goto reply_no_propose
;
11734 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_QUINCY
))
11736 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11738 goto reply_no_propose
;
11740 } else if (rel
== ceph_release_t::reef
) {
11741 if (!mon
.monmap
->get_required_features().contains_all(
11742 ceph::features::mon::FEATURE_REEF
)) {
11743 ss
<< "not all mons are reef";
11745 goto reply_no_propose
;
11747 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_REEF
))
11749 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_REEF feature";
11751 goto reply_no_propose
;
11754 ss
<< "not supported for this release";
11756 goto reply_no_propose
;
11758 if (rel
< osdmap
.require_osd_release
) {
11759 ss
<< "require_osd_release cannot be lowered once it has been set";
11761 goto reply_no_propose
;
11763 pending_inc
.new_require_osd_release
= rel
;
11765 } else if (prefix
== "osd down" ||
11766 prefix
== "osd out" ||
11767 prefix
== "osd in" ||
11768 prefix
== "osd rm" ||
11769 prefix
== "osd stop") {
11773 bool verbose
= true;
11774 bool definitely_dead
= false;
11776 vector
<string
> idvec
;
11777 cmd_getval(cmdmap
, "ids", idvec
);
11778 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11779 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11780 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11785 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11786 if (prefix
== "osd in") {
11787 // touch out osds only
11788 osdmap
.get_out_existing_osds(osds
);
11790 osdmap
.get_all_osds(osds
);
11793 verbose
= false; // so the output is less noisy.
11795 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11797 ss
<< "invalid osd id" << osd
;
11800 } else if (!osdmap
.exists(osd
)) {
11801 ss
<< "osd." << osd
<< " does not exist. ";
11808 for (auto &osd
: osds
) {
11809 if (prefix
== "osd down") {
11810 if (osdmap
.is_down(osd
)) {
11812 ss
<< "osd." << osd
<< " is already down. ";
11814 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11815 ss
<< "marked down osd." << osd
<< ". ";
11818 if (definitely_dead
) {
11819 if (!pending_inc
.new_xinfo
.count(osd
)) {
11820 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11822 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11825 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11827 } else if (prefix
== "osd out") {
11828 if (osdmap
.is_out(osd
)) {
11830 ss
<< "osd." << osd
<< " is already out. ";
11832 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11833 if (osdmap
.osd_weight
[osd
]) {
11834 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11835 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11837 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11839 ss
<< "marked out osd." << osd
<< ". ";
11840 std::ostringstream msg
;
11841 msg
<< "Client " << op
->get_session()->entity_name
11842 << " marked osd." << osd
<< " out";
11843 if (osdmap
.is_up(osd
)) {
11844 msg
<< ", while it was still marked up";
11846 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11847 msg
<< ", after it was down for " << int(period
.sec())
11851 mon
.clog
->info() << msg
.str();
11854 } else if (prefix
== "osd in") {
11855 if (osdmap
.is_in(osd
)) {
11857 ss
<< "osd." << osd
<< " is already in. ";
11859 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11860 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11861 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11862 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11864 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11866 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11868 ss
<< "marked in osd." << osd
<< ". ";
11871 } else if (prefix
== "osd rm") {
11872 err
= prepare_command_osd_remove(osd
);
11874 if (err
== -EBUSY
) {
11877 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11879 ceph_assert(err
== 0);
11881 ss
<< ", osd." << osd
;
11883 ss
<< "removed osd." << osd
;
11887 } else if (prefix
== "osd stop") {
11888 if (osdmap
.is_stop(osd
)) {
11890 ss
<< "osd." << osd
<< " is already stopped. ";
11891 } else if (osdmap
.is_down(osd
)) {
11892 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11893 ss
<< "stop down osd." << osd
<< ". ";
11896 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11897 ss
<< "stop osd." << osd
<< ". ";
11905 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11906 get_last_committed() + 1));
11909 } else if (prefix
== "osd set-group" ||
11910 prefix
== "osd unset-group" ||
11911 prefix
== "osd add-noup" ||
11912 prefix
== "osd add-nodown" ||
11913 prefix
== "osd add-noin" ||
11914 prefix
== "osd add-noout" ||
11915 prefix
== "osd rm-noup" ||
11916 prefix
== "osd rm-nodown" ||
11917 prefix
== "osd rm-noin" ||
11918 prefix
== "osd rm-noout") {
11919 bool do_set
= prefix
== "osd set-group" ||
11920 prefix
.find("add") != string::npos
;
11922 unsigned flags
= 0;
11923 vector
<string
> who
;
11924 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11925 cmd_getval(cmdmap
, "flags", flag_str
);
11926 cmd_getval(cmdmap
, "who", who
);
11927 vector
<string
> raw_flags
;
11928 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11929 for (auto& f
: raw_flags
) {
11931 flags
|= CEPH_OSD_NOUP
;
11932 else if (f
== "nodown")
11933 flags
|= CEPH_OSD_NODOWN
;
11934 else if (f
== "noin")
11935 flags
|= CEPH_OSD_NOIN
;
11936 else if (f
== "noout")
11937 flags
|= CEPH_OSD_NOOUT
;
11939 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11940 << "{noup,nodown,noin,noout}";
11942 goto reply_no_propose
;
11946 cmd_getval(cmdmap
, "ids", who
);
11947 if (prefix
.find("noup") != string::npos
)
11948 flags
= CEPH_OSD_NOUP
;
11949 else if (prefix
.find("nodown") != string::npos
)
11950 flags
= CEPH_OSD_NODOWN
;
11951 else if (prefix
.find("noin") != string::npos
)
11952 flags
= CEPH_OSD_NOIN
;
11953 else if (prefix
.find("noout") != string::npos
)
11954 flags
= CEPH_OSD_NOOUT
;
11956 ceph_assert(0 == "Unreachable!");
11959 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11961 goto reply_no_propose
;
11964 ss
<< "must specify at least one or more targets to set/unset";
11966 goto reply_no_propose
;
11969 set
<int> crush_nodes
;
11970 set
<int> device_classes
;
11971 for (auto& w
: who
) {
11972 if (w
== "any" || w
== "all" || w
== "*") {
11973 osdmap
.get_all_osds(osds
);
11976 std::stringstream ts
;
11977 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11979 } else if (osdmap
.crush
->name_exists(w
)) {
11980 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11981 } else if (osdmap
.crush
->class_exists(w
)) {
11982 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11984 ss
<< "unable to parse osd id or crush node or device class: "
11985 << "\"" << w
<< "\". ";
11988 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11989 // ss has reason for failure
11991 goto reply_no_propose
;
11994 for (auto osd
: osds
) {
11995 if (!osdmap
.exists(osd
)) {
11996 ss
<< "osd." << osd
<< " does not exist. ";
12000 if (flags
& CEPH_OSD_NOUP
) {
12001 any
|= osdmap
.is_noup_by_osd(osd
) ?
12002 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
12003 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
12005 if (flags
& CEPH_OSD_NODOWN
) {
12006 any
|= osdmap
.is_nodown_by_osd(osd
) ?
12007 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
12008 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
12010 if (flags
& CEPH_OSD_NOIN
) {
12011 any
|= osdmap
.is_noin_by_osd(osd
) ?
12012 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
12013 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
12015 if (flags
& CEPH_OSD_NOOUT
) {
12016 any
|= osdmap
.is_noout_by_osd(osd
) ?
12017 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
12018 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
12021 if (flags
& CEPH_OSD_NOUP
) {
12022 any
|= osdmap
.is_noup_by_osd(osd
) ?
12023 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
12024 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
12026 if (flags
& CEPH_OSD_NODOWN
) {
12027 any
|= osdmap
.is_nodown_by_osd(osd
) ?
12028 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
12029 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
12031 if (flags
& CEPH_OSD_NOIN
) {
12032 any
|= osdmap
.is_noin_by_osd(osd
) ?
12033 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
12034 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
12036 if (flags
& CEPH_OSD_NOOUT
) {
12037 any
|= osdmap
.is_noout_by_osd(osd
) ?
12038 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
12039 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
12043 for (auto& id
: crush_nodes
) {
12044 auto old_flags
= osdmap
.get_crush_node_flags(id
);
12045 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
12046 pending_flags
|= old_flags
; // adopt existing flags first!
12048 pending_flags
|= flags
;
12050 pending_flags
&= ~flags
;
12054 for (auto& id
: device_classes
) {
12055 auto old_flags
= osdmap
.get_device_class_flags(id
);
12056 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
12057 pending_flags
|= old_flags
;
12059 pending_flags
|= flags
;
12061 pending_flags
&= ~flags
;
12067 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
12068 get_last_committed() + 1));
12071 } else if (prefix
== "osd pg-temp") {
12073 err
= parse_pgid(cmdmap
, ss
, pgid
);
12075 goto reply_no_propose
;
12076 if (pending_inc
.new_pg_temp
.count(pgid
)) {
12077 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
12081 vector
<int64_t> id_vec
;
12082 vector
<int32_t> new_pg_temp
;
12083 cmd_getval(cmdmap
, "id", id_vec
);
12084 if (id_vec
.empty()) {
12085 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
12086 ss
<< "done cleaning up pg_temp of " << pgid
;
12089 for (auto osd
: id_vec
) {
12090 if (!osdmap
.exists(osd
)) {
12091 ss
<< "osd." << osd
<< " does not exist";
12093 goto reply_no_propose
;
12095 new_pg_temp
.push_back(osd
);
12098 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
12099 if ((int)new_pg_temp
.size() < pool_min_size
) {
12100 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
12101 << pool_min_size
<< ")";
12103 goto reply_no_propose
;
12106 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12107 if ((int)new_pg_temp
.size() > pool_size
) {
12108 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
12109 << pool_size
<< ")";
12111 goto reply_no_propose
;
12114 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
12115 new_pg_temp
.begin(), new_pg_temp
.end());
12116 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
12118 } else if (prefix
== "osd primary-temp" ||
12119 prefix
== "osd rm-primary-temp") {
12121 err
= parse_pgid(cmdmap
, ss
, pgid
);
12123 goto reply_no_propose
;
12126 if (prefix
== "osd primary-temp") {
12127 if (!cmd_getval(cmdmap
, "id", osd
)) {
12128 ss
<< "unable to parse 'id' value '"
12129 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12131 goto reply_no_propose
;
12133 if (!osdmap
.exists(osd
)) {
12134 ss
<< "osd." << osd
<< " does not exist";
12136 goto reply_no_propose
;
12139 else if (prefix
== "osd rm-primary-temp") {
12143 ceph_assert(0 == "Unreachable!");
12146 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12147 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12148 ss
<< "require_min_compat_client "
12149 << osdmap
.require_min_compat_client
12150 << " < firefly, which is required for primary-temp";
12152 goto reply_no_propose
;
12155 pending_inc
.new_primary_temp
[pgid
] = osd
;
12156 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
12158 } else if (prefix
== "pg repeer") {
12160 err
= parse_pgid(cmdmap
, ss
, pgid
);
12162 goto reply_no_propose
;
12163 vector
<int> acting
;
12165 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
12168 ss
<< "pg currently has no primary";
12169 goto reply_no_propose
;
12171 if (acting
.size() > 1) {
12172 // map to just primary; it will map back to what it wants
12173 pending_inc
.new_pg_temp
[pgid
] = { primary
};
12175 // hmm, pick another arbitrary osd to induce a change. Note
12176 // that this won't work if there is only one suitable OSD in the cluster.
12179 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
12180 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
12183 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
12189 ss
<< "not enough up OSDs in the cluster to force repeer";
12190 goto reply_no_propose
;
12194 } else if (prefix
== "osd pg-upmap" ||
12195 prefix
== "osd rm-pg-upmap" ||
12196 prefix
== "osd pg-upmap-items" ||
12197 prefix
== "osd rm-pg-upmap-items" ||
12198 prefix
== "osd pg-upmap-primary" ||
12199 prefix
== "osd rm-pg-upmap-primary") {
12204 OP_RM_PG_UPMAP_ITEMS
,
12205 OP_PG_UPMAP_PRIMARY
,
12206 OP_RM_PG_UPMAP_PRIMARY
,
12209 if (prefix
== "osd pg-upmap") {
12210 upmap_option
= OP_PG_UPMAP
;
12211 } else if (prefix
== "osd rm-pg-upmap") {
12212 upmap_option
= OP_RM_PG_UPMAP
;
12213 } else if (prefix
== "osd pg-upmap-items") {
12214 upmap_option
= OP_PG_UPMAP_ITEMS
;
12215 } else if (prefix
== "osd rm-pg-upmap-items") {
12216 upmap_option
= OP_RM_PG_UPMAP_ITEMS
;
12217 } else if (prefix
== "osd pg-upmap-primary") {
12218 upmap_option
= OP_PG_UPMAP_PRIMARY
;
12219 } else if (prefix
== "osd rm-pg-upmap-primary") {
12220 upmap_option
= OP_RM_PG_UPMAP_PRIMARY
;
12222 ceph_abort_msg("invalid upmap option");
12225 ceph_release_t min_release
= ceph_release_t::unknown
;
12226 string feature_name
= "unknown";
12227 switch (upmap_option
) {
12228 case OP_PG_UPMAP
: // fall through
12229 case OP_RM_PG_UPMAP
: // fall through
12230 case OP_PG_UPMAP_ITEMS
: // fall through
12231 case OP_RM_PG_UPMAP_ITEMS
:
12232 min_release
= ceph_release_t::luminous
;
12233 feature_name
= "pg-upmap";
12236 case OP_PG_UPMAP_PRIMARY
: // fall through
12237 case OP_RM_PG_UPMAP_PRIMARY
:
12238 min_release
= ceph_release_t::reef
;
12239 feature_name
= "pg-upmap-primary";
12243 ceph_abort_msg("invalid upmap option");
12245 uint64_t min_feature
= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
12246 string min_release_name
= ceph_release_name(static_cast<int>(min_release
));
12248 if (osdmap
.require_min_compat_client
< min_release
) {
12249 ss
<< "min_compat_client "
12250 << osdmap
.require_min_compat_client
12251 << " < " << min_release_name
<< ", which is required for " << feature_name
<< ". "
12252 << "Try 'ceph osd set-require-min-compat-client " << min_release_name
<< "' "
12253 << "before using the new interface";
12255 goto reply_no_propose
;
12258 //TODO: Should I add feature and test for upmap-primary?
12259 err
= check_cluster_features(min_feature
, ss
);
12260 if (err
== -EAGAIN
)
12263 goto reply_no_propose
;
12265 err
= parse_pgid(cmdmap
, ss
, pgid
);
12267 goto reply_no_propose
;
12268 if (pending_inc
.old_pools
.count(pgid
.pool())) {
12269 ss
<< "pool of " << pgid
<< " is pending removal";
12272 wait_for_finished_proposal(op
,
12273 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
12277 // check pending upmap changes
12278 switch (upmap_option
) {
12279 case OP_PG_UPMAP
: // fall through
12280 case OP_RM_PG_UPMAP
:
12281 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
12282 pending_inc
.old_pg_upmap
.count(pgid
)) {
12283 dout(10) << __func__
<< " waiting for pending update on "
12289 case OP_PG_UPMAP_PRIMARY
: // fall through
12290 case OP_RM_PG_UPMAP_PRIMARY
:
12292 const pg_pool_t
*pt
= osdmap
.get_pg_pool(pgid
.pool());
12293 if (! pt
->is_replicated()) {
12294 ss
<< "pg-upmap-primary is only supported for replicated pools";
12296 goto reply_no_propose
;
12300 case OP_PG_UPMAP_ITEMS
: // fall through
12301 case OP_RM_PG_UPMAP_ITEMS
: // fall through
12302 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
12303 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
12304 dout(10) << __func__
<< " waiting for pending update on "
12311 ceph_abort_msg("invalid upmap option");
12314 switch (upmap_option
) {
12317 vector
<int64_t> id_vec
;
12318 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12319 ss
<< "unable to parse 'id' value(s) '"
12320 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12322 goto reply_no_propose
;
12325 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
12326 if ((int)id_vec
.size() < pool_min_size
) {
12327 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
12328 << pool_min_size
<< ")";
12330 goto reply_no_propose
;
12333 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12334 if ((int)id_vec
.size() > pool_size
) {
12335 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
12336 << pool_size
<< ")";
12338 goto reply_no_propose
;
12341 vector
<int32_t> new_pg_upmap
;
12342 for (auto osd
: id_vec
) {
12343 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
12344 ss
<< "osd." << osd
<< " does not exist";
12346 goto reply_no_propose
;
12348 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
12349 if (it
!= new_pg_upmap
.end()) {
12350 ss
<< "osd." << osd
<< " already exists, ";
12353 new_pg_upmap
.push_back(osd
);
12356 if (new_pg_upmap
.empty()) {
12357 ss
<< "no valid upmap items(pairs) is specified";
12359 goto reply_no_propose
;
12362 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
12363 new_pg_upmap
.begin(), new_pg_upmap
.end());
12364 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
12368 case OP_RM_PG_UPMAP
:
12370 pending_inc
.old_pg_upmap
.insert(pgid
);
12371 ss
<< "clear " << pgid
<< " pg_upmap mapping";
12375 case OP_PG_UPMAP_ITEMS
:
12377 vector
<int64_t> id_vec
;
12378 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12379 ss
<< "unable to parse 'id' value(s) '"
12380 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12382 goto reply_no_propose
;
12385 if (id_vec
.size() % 2) {
12386 ss
<< "you must specify pairs of osd ids to be remapped";
12388 goto reply_no_propose
;
12391 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12392 if ((int)(id_vec
.size() / 2) > pool_size
) {
12393 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
12394 << pool_size
<< ")";
12396 goto reply_no_propose
;
12399 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
12400 ostringstream items
;
12402 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
12406 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
12409 if (!osdmap
.exists(from
)) {
12410 ss
<< "osd." << from
<< " does not exist";
12412 goto reply_no_propose
;
12414 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
12415 ss
<< "osd." << to
<< " does not exist";
12417 goto reply_no_propose
;
12419 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
12420 auto it
= std::find(new_pg_upmap_items
.begin(),
12421 new_pg_upmap_items
.end(), entry
);
12422 if (it
!= new_pg_upmap_items
.end()) {
12423 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
12426 new_pg_upmap_items
.push_back(entry
);
12427 items
<< from
<< "->" << to
<< ",";
12429 string
out(items
.str());
12430 out
.resize(out
.size() - 1); // drop last ','
12433 if (new_pg_upmap_items
.empty()) {
12434 ss
<< "no valid upmap items(pairs) is specified";
12436 goto reply_no_propose
;
12439 pending_inc
.new_pg_upmap_items
[pgid
] =
12440 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
12441 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
12442 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
12446 case OP_RM_PG_UPMAP_ITEMS
:
12448 pending_inc
.old_pg_upmap_items
.insert(pgid
);
12449 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
12453 case OP_PG_UPMAP_PRIMARY
:
12456 if (!cmd_getval(cmdmap
, "id", id
)) {
12457 ss
<< "invalid osd id value '"
12458 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12460 goto reply_no_propose
;
12462 if (id
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(id
)) {
12463 ss
<< "osd." << id
<< " does not exist";
12465 goto reply_no_propose
;
12467 vector
<int> acting
;
12469 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
12470 if (id
== primary
) {
12471 ss
<< "osd." << id
<< " is already primary for pg " << pgid
;
12473 goto reply_no_propose
;
12476 for (int i
= 1 ; i
< (int)acting
.size(); i
++) { // skip 0 on purpose
12477 if (acting
[i
] == id
) {
12482 if (found_idx
== 0) {
12483 ss
<< "osd." << id
<< " is not in acting set for pg " << pgid
;
12485 goto reply_no_propose
;
12487 vector
<int> new_acting(acting
);
12488 new_acting
[found_idx
] = new_acting
[0];
12489 new_acting
[0] = id
;
12490 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12491 if (osdmap
.crush
->verify_upmap(cct
, osdmap
.get_pg_pool_crush_rule(pgid
),
12492 pool_size
, new_acting
) >= 0) {
12493 ss
<< "change primary for pg " << pgid
<< " to osd." << id
;
12496 ss
<< "can't change primary for pg " << pgid
<< " to osd." << id
12497 << " - illegal pg after the change";
12499 goto reply_no_propose
;
12501 pending_inc
.new_pg_upmap_primary
[pgid
] = id
;
12503 ldout(cct
, 20) << "pg " << pgid
<< ": set pg_upmap_primary to " << id
<< dendl
;
12507 case OP_RM_PG_UPMAP_PRIMARY
:
12509 pending_inc
.old_pg_upmap_primary
.insert(pgid
);
12510 ss
<< "clear " << pgid
<< " pg_upmap_primary mapping";
12515 ceph_abort_msg("invalid upmap option");
12519 } else if (prefix
== "osd primary-affinity") {
12521 if (!cmd_getval(cmdmap
, "id", id
)) {
12522 ss
<< "invalid osd id value '"
12523 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12525 goto reply_no_propose
;
12528 if (!cmd_getval(cmdmap
, "weight", w
)) {
12529 ss
<< "unable to parse 'weight' value '"
12530 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12532 goto reply_no_propose
;
12534 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
12536 ss
<< "weight must be >= 0";
12538 goto reply_no_propose
;
12540 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12541 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12542 ss
<< "require_min_compat_client "
12543 << osdmap
.require_min_compat_client
12544 << " < firefly, which is required for primary-affinity";
12546 goto reply_no_propose
;
12548 if (osdmap
.exists(id
)) {
12549 pending_inc
.new_primary_affinity
[id
] = ww
;
12550 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << std::ios::hex
<< ww
<< std::ios::dec
<< ")";
12552 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12553 get_last_committed() + 1));
12556 ss
<< "osd." << id
<< " does not exist";
12558 goto reply_no_propose
;
12560 } else if (prefix
== "osd reweight") {
12562 if (!cmd_getval(cmdmap
, "id", id
)) {
12563 ss
<< "unable to parse osd id value '"
12564 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12566 goto reply_no_propose
;
12569 if (!cmd_getval(cmdmap
, "weight", w
)) {
12570 ss
<< "unable to parse weight value '"
12571 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12573 goto reply_no_propose
;
12575 long ww
= (int)((double)CEPH_OSD_IN
*w
);
12577 ss
<< "weight must be >= 0";
12579 goto reply_no_propose
;
12581 if (osdmap
.exists(id
)) {
12582 pending_inc
.new_weight
[id
] = ww
;
12583 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
12585 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12586 get_last_committed() + 1));
12589 ss
<< "osd." << id
<< " does not exist";
12591 goto reply_no_propose
;
12593 } else if (prefix
== "osd reweightn") {
12594 map
<int32_t, uint32_t> weights
;
12595 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
12597 ss
<< "unable to parse 'weights' value '"
12598 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
12599 goto reply_no_propose
;
12601 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
12602 wait_for_finished_proposal(
12604 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
12606 } else if (prefix
== "osd lost") {
12608 if (!cmd_getval(cmdmap
, "id", id
)) {
12609 ss
<< "unable to parse osd id value '"
12610 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12612 goto reply_no_propose
;
12615 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12617 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
12618 "--yes-i-really-mean-it if you really do.";
12620 goto reply_no_propose
;
12621 } else if (!osdmap
.exists(id
)) {
12622 ss
<< "osd." << id
<< " does not exist";
12624 goto reply_no_propose
;
12625 } else if (!osdmap
.is_down(id
)) {
12626 ss
<< "osd." << id
<< " is not down";
12628 goto reply_no_propose
;
12630 epoch_t e
= osdmap
.get_info(id
).down_at
;
12631 pending_inc
.new_lost
[id
] = e
;
12632 ss
<< "marked osd lost in epoch " << e
;
12634 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12635 get_last_committed() + 1));
12639 } else if (prefix
== "osd destroy-actual" ||
12640 prefix
== "osd purge-actual" ||
12641 prefix
== "osd purge-new") {
12642 /* Destroying an OSD means that we don't expect to further make use of
12643 * the OSDs data (which may even become unreadable after this operation),
12644 * and that we are okay with scrubbing all its cephx keys and config-key
12645 * data (which may include lockbox keys, thus rendering the osd's data
12648 * The OSD will not be removed. Instead, we will mark it as destroyed,
12649 * such that a subsequent call to `create` will not reuse the osd id.
12650 * This will play into being able to recreate the OSD, at the same
12651 * crush location, with minimal data movement.
12654 // make sure authmon is writeable.
12655 if (!mon
.authmon()->is_writeable()) {
12656 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12657 << "osd destroy" << dendl
;
12658 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12663 if (!cmd_getval(cmdmap
, "id", id
)) {
12664 auto p
= cmdmap
.find("id");
12665 if (p
== cmdmap
.end()) {
12666 ss
<< "no osd id specified";
12668 ss
<< "unable to parse osd id value '"
12669 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12672 goto reply_no_propose
;
12675 bool is_destroy
= (prefix
== "osd destroy-actual");
12677 ceph_assert("osd purge-actual" == prefix
||
12678 "osd purge-new" == prefix
);
12682 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12684 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12685 << "This will mean real, permanent data loss, as well "
12686 << "as deletion of cephx and lockbox keys. "
12687 << "Pass --yes-i-really-mean-it if you really do.";
12689 goto reply_no_propose
;
12690 } else if (!osdmap
.exists(id
)) {
12691 ss
<< "osd." << id
<< " does not exist";
12692 err
= 0; // idempotent
12693 goto reply_no_propose
;
12694 } else if (osdmap
.is_up(id
)) {
12695 ss
<< "osd." << id
<< " is not `down`.";
12697 goto reply_no_propose
;
12698 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12699 ss
<< "destroyed osd." << id
;
12701 goto reply_no_propose
;
12704 if (prefix
== "osd purge-new" &&
12705 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12706 ss
<< "osd." << id
<< " is not new";
12708 goto reply_no_propose
;
12711 bool goto_reply
= false;
12715 err
= prepare_command_osd_destroy(id
, ss
);
12716 // we checked above that it should exist.
12717 ceph_assert(err
!= -ENOENT
);
12719 err
= prepare_command_osd_purge(id
, ss
);
12720 if (err
== -ENOENT
) {
12722 ss
<< "osd." << id
<< " does not exist.";
12728 if (err
< 0 || goto_reply
) {
12729 goto reply_no_propose
;
12733 ss
<< "destroyed osd." << id
;
12735 ss
<< "purged osd." << id
;
12739 wait_for_finished_proposal(op
,
12740 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12741 force_immediate_propose();
12744 } else if (prefix
== "osd new") {
12746 // make sure authmon is writeable.
12747 if (!mon
.authmon()->is_writeable()) {
12748 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12749 << "osd new" << dendl
;
12750 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12754 // make sure kvmon is writeable.
12755 if (!mon
.kvmon()->is_writeable()) {
12756 dout(10) << __func__
<< " waiting for kv mon to be writeable for "
12757 << "osd new" << dendl
;
12758 mon
.kvmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12762 map
<string
,string
> param_map
;
12764 bufferlist bl
= m
->get_data();
12765 string param_json
= bl
.to_str();
12766 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12768 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12770 goto reply_no_propose
;
12772 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12775 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12779 goto reply_no_propose
;
12788 if (err
== EEXIST
) {
12789 // idempotent operation
12791 goto reply_no_propose
;
12794 wait_for_finished_proposal(op
,
12795 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12796 get_last_committed() + 1));
12797 force_immediate_propose();
12800 } else if (prefix
== "osd create") {
12802 // optional id provided?
12803 int64_t id
= -1, cmd_id
= -1;
12804 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12806 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12808 goto reply_no_propose
;
12810 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12815 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12816 if (!uuid
.parse(uuidstr
.c_str())) {
12817 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12819 goto reply_no_propose
;
12821 // we only care about the id if we also have the uuid, to
12822 // ensure the operation's idempotency.
12826 int32_t new_id
= -1;
12827 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12829 if (err
== -EAGAIN
) {
12832 // a check has failed; reply to the user.
12833 goto reply_no_propose
;
12835 } else if (err
== EEXIST
) {
12836 // this is an idempotent operation; we can go ahead and reply.
12838 f
->open_object_section("created_osd");
12839 f
->dump_int("osdid", new_id
);
12840 f
->close_section();
12847 goto reply_no_propose
;
12850 string empty_device_class
;
12851 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12854 f
->open_object_section("created_osd");
12855 f
->dump_int("osdid", new_id
);
12856 f
->close_section();
12862 wait_for_finished_proposal(op
,
12863 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12864 get_last_committed() + 1));
12867 } else if (prefix
== "osd blocklist clear" ||
12868 prefix
== "osd blacklist clear") {
12869 pending_inc
.new_blocklist
.clear();
12870 std::list
<std::pair
<entity_addr_t
,utime_t
> > blocklist
;
12871 std::list
<std::pair
<entity_addr_t
,utime_t
> > range_b
;
12872 osdmap
.get_blocklist(&blocklist
, &range_b
);
12873 for (const auto &entry
: blocklist
) {
12874 pending_inc
.old_blocklist
.push_back(entry
.first
);
12876 for (const auto &entry
: range_b
) {
12877 pending_inc
.old_range_blocklist
.push_back(entry
.first
);
12879 ss
<< " removed all blocklist entries";
12881 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12882 get_last_committed() + 1));
12884 } else if (prefix
== "osd blocklist" ||
12885 prefix
== "osd blacklist") {
12886 string addrstr
, rangestr
;
12887 bool range
= false;
12888 cmd_getval(cmdmap
, "addr", addrstr
);
12889 if (cmd_getval(cmdmap
, "range", rangestr
)) {
12890 if (rangestr
== "range") {
12893 ss
<< "Did you mean to specify \"osd blocklist range\"?";
12895 goto reply_no_propose
;
12898 entity_addr_t addr
;
12899 if (!addr
.parse(addrstr
)) {
12900 ss
<< "unable to parse address " << addrstr
;
12902 goto reply_no_propose
;
12906 if (!addr
.maybe_cidr()) {
12907 ss
<< "You specified a range command, but " << addr
12908 << " does not parse as a CIDR range";
12910 goto reply_no_propose
;
12912 addr
.type
= entity_addr_t::TYPE_CIDR
;
12913 err
= check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST
, ss
);
12915 goto reply_no_propose
;
12917 if ((addr
.is_ipv4() && addr
.get_nonce() > 32) ||
12918 (addr
.is_ipv6() && addr
.get_nonce() > 128)) {
12919 ss
<< "Too many bits in range for that protocol!";
12921 goto reply_no_propose
;
12924 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12925 // always blocklist type ANY
12926 addr
.set_type(entity_addr_t::TYPE_ANY
);
12928 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12932 string blocklistop
;
12933 if (!cmd_getval(cmdmap
, "blocklistop", blocklistop
)) {
12934 cmd_getval(cmdmap
, "blacklistop", blocklistop
);
12936 if (blocklistop
== "add") {
12937 utime_t expires
= ceph_clock_now();
12938 // default one hour
12939 double d
= cmd_getval_or
<double>(cmdmap
, "expire",
12940 g_conf()->mon_osd_blocklist_default_expire
);
12943 auto add_to_pending_blocklists
= [](auto& nb
, auto& ob
,
12945 const auto& expires
) {
12946 nb
[addr
] = expires
;
12947 // cancel any pending un-blocklisting request too
12948 auto it
= std::find(ob
.begin(),
12950 if (it
!= ob
.end()) {
12955 add_to_pending_blocklists(pending_inc
.new_range_blocklist
,
12956 pending_inc
.old_range_blocklist
,
12960 add_to_pending_blocklists(pending_inc
.new_blocklist
,
12961 pending_inc
.old_blocklist
,
12965 ss
<< "blocklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12967 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12968 get_last_committed() + 1));
12970 } else if (blocklistop
== "rm") {
12971 auto rm_from_pending_blocklists
= [](const auto& addr
,
12973 auto& ob
, auto& pb
) {
12974 if (blocklist
.count(addr
)) {
12975 ob
.push_back(addr
);
12977 } else if (pb
.count(addr
)) {
12983 if ((!range
&& rm_from_pending_blocklists(addr
, osdmap
.blocklist
,
12984 pending_inc
.old_blocklist
,
12985 pending_inc
.new_blocklist
)) ||
12986 (range
&& rm_from_pending_blocklists(addr
, osdmap
.range_blocklist
,
12987 pending_inc
.old_range_blocklist
,
12988 pending_inc
.new_range_blocklist
))) {
12989 ss
<< "un-blocklisting " << addr
;
12991 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12992 get_last_committed() + 1));
12995 ss
<< addr
<< " isn't blocklisted";
12997 goto reply_no_propose
;
13000 } else if (prefix
== "osd pool mksnap") {
13002 cmd_getval(cmdmap
, "pool", poolstr
);
13003 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
13005 ss
<< "unrecognized pool '" << poolstr
<< "'";
13007 goto reply_no_propose
;
13010 cmd_getval(cmdmap
, "snap", snapname
);
13011 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
13012 if (p
->is_unmanaged_snaps_mode()) {
13013 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
13015 goto reply_no_propose
;
13016 } else if (p
->snap_exists(snapname
.c_str())) {
13017 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
13019 goto reply_no_propose
;
13020 } else if (p
->is_tier()) {
13021 ss
<< "pool " << poolstr
<< " is a cache tier";
13023 goto reply_no_propose
;
13026 if (pending_inc
.new_pools
.count(pool
))
13027 pp
= &pending_inc
.new_pools
[pool
];
13029 pp
= &pending_inc
.new_pools
[pool
];
13032 if (pp
->snap_exists(snapname
.c_str())) {
13033 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
13035 if (const auto& fsmap
= mon
.mdsmon()->get_fsmap(); fsmap
.pool_in_use(pool
)) {
13036 dout(20) << "pool-level snapshots have been disabled for pools "
13037 "attached to an fs - poolid:" << pool
<< dendl
;
13039 goto reply_no_propose
;
13041 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
13042 pp
->set_snap_epoch(pending_inc
.epoch
);
13043 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
13046 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13047 get_last_committed() + 1));
13049 } else if (prefix
== "osd pool rmsnap") {
13051 cmd_getval(cmdmap
, "pool", poolstr
);
13052 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
13054 ss
<< "unrecognized pool '" << poolstr
<< "'";
13056 goto reply_no_propose
;
13059 cmd_getval(cmdmap
, "snap", snapname
);
13060 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
13061 if (p
->is_unmanaged_snaps_mode()) {
13062 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
13064 goto reply_no_propose
;
13065 } else if (!p
->snap_exists(snapname
.c_str())) {
13066 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
13068 goto reply_no_propose
;
13071 if (pending_inc
.new_pools
.count(pool
))
13072 pp
= &pending_inc
.new_pools
[pool
];
13074 pp
= &pending_inc
.new_pools
[pool
];
13077 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
13079 pp
->remove_snap(sn
);
13080 pp
->set_snap_epoch(pending_inc
.epoch
);
13081 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
13083 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
13086 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13087 get_last_committed() + 1));
13089 } else if (prefix
== "osd pool create") {
13090 int64_t pg_num
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num", 0);
13091 int64_t pg_num_min
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num_min", 0);
13092 int64_t pg_num_max
= cmd_getval_or
<int64_t>(cmdmap
, "pg_num_max", 0);
13093 int64_t pgp_num
= cmd_getval_or
<int64_t>(cmdmap
, "pgp_num", pg_num
);
13094 string pool_type_str
;
13095 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
13096 if (pool_type_str
.empty())
13097 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
13100 cmd_getval(cmdmap
, "pool", poolstr
);
13101 bool confirm
= false;
13102 //confirmation may be set to true only by internal operations.
13103 cmd_getval(cmdmap
, "yes_i_really_mean_it", confirm
);
13104 if (poolstr
[0] == '.' && !confirm
) {
13105 ss
<< "pool names beginning with . are not allowed";
13107 goto reply_no_propose
;
13109 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13110 if (pool_id
>= 0) {
13111 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13112 if (pool_type_str
!= p
->get_type_name()) {
13113 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
13116 ss
<< "pool '" << poolstr
<< "' already exists";
13119 goto reply_no_propose
;
13123 if (pool_type_str
== "replicated") {
13124 pool_type
= pg_pool_t::TYPE_REPLICATED
;
13125 } else if (pool_type_str
== "erasure") {
13126 pool_type
= pg_pool_t::TYPE_ERASURE
;
13128 ss
<< "unknown pool type '" << pool_type_str
<< "'";
13130 goto reply_no_propose
;
13133 bool implicit_rule_creation
= false;
13134 int64_t expected_num_objects
= 0;
13136 cmd_getval(cmdmap
, "rule", rule_name
);
13137 string erasure_code_profile
;
13138 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
13140 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
13141 if (erasure_code_profile
== "")
13142 erasure_code_profile
= "default";
13143 //handle the erasure code profile
13144 if (erasure_code_profile
== "default") {
13145 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
13146 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
13147 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
13151 map
<string
,string
> profile_map
;
13152 err
= osdmap
.get_erasure_code_profile_default(cct
,
13156 goto reply_no_propose
;
13157 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
13158 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
13162 if (rule_name
== "") {
13163 implicit_rule_creation
= true;
13164 if (erasure_code_profile
== "default") {
13165 rule_name
= "erasure-code";
13167 dout(1) << "implicitly use rule named after the pool: "
13168 << poolstr
<< dendl
;
13169 rule_name
= poolstr
;
13172 expected_num_objects
=
13173 cmd_getval_or
<int64_t>(cmdmap
, "expected_num_objects", 0);
13175 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
13176 // and put expected_num_objects to rule field
13177 if (erasure_code_profile
!= "") { // cmd is from CLI
13178 if (rule_name
!= "") {
13180 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
13181 if (interr
.length()) {
13182 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
13184 goto reply_no_propose
;
13187 rule_name
= erasure_code_profile
;
13188 } else { // cmd is well-formed
13189 expected_num_objects
=
13190 cmd_getval_or
<int64_t>(cmdmap
, "expected_num_objects", 0);
13194 if (!implicit_rule_creation
&& rule_name
!= "") {
13196 err
= get_crush_rule(rule_name
, &rule
, &ss
);
13197 if (err
== -EAGAIN
) {
13201 goto reply_no_propose
;
13204 if (expected_num_objects
< 0) {
13205 ss
<< "'expected_num_objects' must be non-negative";
13207 goto reply_no_propose
;
13211 osdmap
.get_all_osds(osds
);
13212 bool has_filestore_osd
= std::any_of(osds
.begin(), osds
.end(), [this](int osd
) {
13214 if (!get_osd_objectstore_type(osd
, &type
)) {
13215 return type
== "filestore";
13221 if (has_filestore_osd
&&
13222 expected_num_objects
> 0 &&
13223 cct
->_conf
->filestore_merge_threshold
> 0) {
13224 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13226 goto reply_no_propose
;
13229 if (has_filestore_osd
&&
13230 expected_num_objects
== 0 &&
13231 cct
->_conf
->filestore_merge_threshold
< 0) {
13232 int osds
= osdmap
.get_num_osds();
13234 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13235 if (!sure
&& osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
13236 ss
<< "For better initial performance on pools expected to store a "
13237 << "large number of objects, consider supplying the "
13238 << "expected_num_objects parameter when creating the pool."
13239 << " Pass --yes-i-really-mean-it to ignore it";
13241 goto reply_no_propose
;
13245 int64_t fast_read_param
= cmd_getval_or
<int64_t>(cmdmap
, "fast_read", -1);
13246 FastReadType fast_read
= FAST_READ_DEFAULT
;
13247 if (fast_read_param
== 0)
13248 fast_read
= FAST_READ_OFF
;
13249 else if (fast_read_param
> 0)
13250 fast_read
= FAST_READ_ON
;
13252 int64_t repl_size
= 0;
13253 cmd_getval(cmdmap
, "size", repl_size
);
13254 int64_t target_size_bytes
= 0;
13255 double target_size_ratio
= 0.0;
13256 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
13257 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
13259 string pg_autoscale_mode
;
13260 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
13262 bool bulk
= cmd_getval_or
<bool>(cmdmap
, "bulk", 0);
13264 bool crimson
= cmd_getval_or
<bool>(cmdmap
, "crimson", false) ||
13265 cct
->_conf
.get_val
<bool>("osd_pool_default_crimson");
13267 err
= prepare_new_pool(poolstr
,
13268 -1, // default crush rule
13270 pg_num
, pgp_num
, pg_num_min
, pg_num_max
,
13271 repl_size
, target_size_bytes
, target_size_ratio
,
13272 erasure_code_profile
, pool_type
,
13273 (uint64_t)expected_num_objects
,
13282 ss
<< "pool '" << poolstr
<< "' already exists";
13284 goto reply_no_propose
;
13288 goto reply_no_propose
;
13290 goto reply_no_propose
;
13293 ss
<< "pool '" << poolstr
<< "' created";
13296 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13297 get_last_committed() + 1));
13300 } else if (prefix
== "osd pool delete" ||
13301 prefix
== "osd pool rm") {
13302 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13303 string poolstr
, poolstr2
, sure
;
13304 cmd_getval(cmdmap
, "pool", poolstr
);
13305 cmd_getval(cmdmap
, "pool2", poolstr2
);
13306 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
13308 ss
<< "pool '" << poolstr
<< "' does not exist";
13310 goto reply_no_propose
;
13313 bool force_no_fake
= false;
13314 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
13315 bool force
= false;
13316 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
13317 if (poolstr2
!= poolstr
||
13318 (!force
&& !force_no_fake
)) {
13319 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13320 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13321 << "followed by --yes-i-really-really-mean-it.";
13323 goto reply_no_propose
;
13325 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
13326 if (err
== -EAGAIN
) {
13330 goto reply_no_propose
;
13332 } else if (prefix
== "osd pool rename") {
13333 string srcpoolstr
, destpoolstr
;
13334 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
13335 cmd_getval(cmdmap
, "destpool", destpoolstr
);
13336 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
13337 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
13338 bool confirm
= false;
13339 //confirmation may be set to true only by internal operations.
13340 cmd_getval(cmdmap
, "yes_i_really_mean_it", confirm
);
13341 if (destpoolstr
[0] == '.' && !confirm
) {
13342 ss
<< "pool names beginning with . are not allowed";
13344 goto reply_no_propose
;
13346 if (pool_src
< 0) {
13347 if (pool_dst
>= 0) {
13348 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13349 // of operations, assume this rename succeeded, as it is not changing
13350 // the current state. Make sure we output something understandable
13351 // for whoever is issuing the command, if they are paying attention,
13352 // in case it was not intentional; or to avoid a "wtf?" and a bug
13353 // report in case it was intentional, while expecting a failure.
13354 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
13355 << destpoolstr
<< "' does -- assuming successful rename";
13358 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
13361 goto reply_no_propose
;
13362 } else if (pool_dst
>= 0) {
13363 // source pool exists and so does the destination pool
13364 ss
<< "pool '" << destpoolstr
<< "' already exists";
13366 goto reply_no_propose
;
13369 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
13371 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
13373 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
13374 << cpp_strerror(ret
);
13377 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
13378 get_last_committed() + 1));
13381 } else if (prefix
== "osd pool set") {
13382 err
= prepare_command_pool_set(cmdmap
, ss
);
13383 if (err
== -EAGAIN
)
13386 goto reply_no_propose
;
13389 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13390 get_last_committed() + 1));
13392 } else if (prefix
== "osd tier add") {
13393 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13394 if (err
== -EAGAIN
)
13397 goto reply_no_propose
;
13399 cmd_getval(cmdmap
, "pool", poolstr
);
13400 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13402 ss
<< "unrecognized pool '" << poolstr
<< "'";
13404 goto reply_no_propose
;
13406 string tierpoolstr
;
13407 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13408 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13409 if (tierpool_id
< 0) {
13410 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13412 goto reply_no_propose
;
13414 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13416 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13419 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13420 goto reply_no_propose
;
13423 // make sure new tier is empty
13424 bool force_nonempty
= false;
13425 cmd_getval_compat_cephbool(cmdmap
, "force_nonempty", force_nonempty
);
13426 const pool_stat_t
*pstats
= mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13427 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
13429 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
13431 goto reply_no_propose
;
13433 if (tp
->is_erasure()) {
13434 ss
<< "tier pool '" << tierpoolstr
13435 << "' is an ec pool, which cannot be a tier";
13437 goto reply_no_propose
;
13439 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
13440 (!force_nonempty
||
13441 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
)) {
13442 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
13444 goto reply_no_propose
;
13447 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13448 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13449 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13452 np
->tiers
.insert(tierpool_id
);
13453 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13454 ntp
->tier_of
= pool_id
;
13455 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
13456 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13457 get_last_committed() + 1));
13459 } else if (prefix
== "osd tier remove" ||
13460 prefix
== "osd tier rm") {
13462 cmd_getval(cmdmap
, "pool", poolstr
);
13463 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13465 ss
<< "unrecognized pool '" << poolstr
<< "'";
13467 goto reply_no_propose
;
13469 string tierpoolstr
;
13470 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13471 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13472 if (tierpool_id
< 0) {
13473 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13475 goto reply_no_propose
;
13477 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13479 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13482 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
13483 goto reply_no_propose
;
13486 if (p
->tiers
.count(tierpool_id
) == 0) {
13487 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13489 goto reply_no_propose
;
13491 if (tp
->tier_of
!= pool_id
) {
13492 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
13493 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
13494 // be scary about it; this is an inconsistency and bells must go off
13495 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13497 goto reply_no_propose
;
13499 if (p
->read_tier
== tierpool_id
) {
13500 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
13502 goto reply_no_propose
;
13505 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13506 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13507 if (np
->tiers
.count(tierpool_id
) == 0 ||
13508 ntp
->tier_of
!= pool_id
||
13509 np
->read_tier
== tierpool_id
) {
13512 np
->tiers
.erase(tierpool_id
);
13514 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13515 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13516 get_last_committed() + 1));
13518 } else if (prefix
== "osd tier set-overlay") {
13519 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13520 if (err
== -EAGAIN
)
13523 goto reply_no_propose
;
13525 cmd_getval(cmdmap
, "pool", poolstr
);
13526 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13528 ss
<< "unrecognized pool '" << poolstr
<< "'";
13530 goto reply_no_propose
;
13532 string overlaypoolstr
;
13533 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
13534 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
13535 if (overlaypool_id
< 0) {
13536 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
13538 goto reply_no_propose
;
13540 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13542 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
13543 ceph_assert(overlay_p
);
13544 if (p
->tiers
.count(overlaypool_id
) == 0) {
13545 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
13547 goto reply_no_propose
;
13549 if (p
->read_tier
== overlaypool_id
) {
13551 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13552 goto reply_no_propose
;
13554 if (p
->has_read_tier()) {
13555 ss
<< "pool '" << poolstr
<< "' has overlay '"
13556 << osdmap
.get_pool_name(p
->read_tier
)
13557 << "'; please remove-overlay first";
13559 goto reply_no_propose
;
13563 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13564 np
->read_tier
= overlaypool_id
;
13565 np
->write_tier
= overlaypool_id
;
13566 np
->set_last_force_op_resend(pending_inc
.epoch
);
13567 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
13568 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
13569 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13570 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
13571 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
13572 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13573 get_last_committed() + 1));
13575 } else if (prefix
== "osd tier remove-overlay" ||
13576 prefix
== "osd tier rm-overlay") {
13578 cmd_getval(cmdmap
, "pool", poolstr
);
13579 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13581 ss
<< "unrecognized pool '" << poolstr
<< "'";
13583 goto reply_no_propose
;
13585 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13587 if (!p
->has_read_tier()) {
13589 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13590 goto reply_no_propose
;
13593 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
13594 goto reply_no_propose
;
13598 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13599 if (np
->has_read_tier()) {
13600 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
13601 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
13602 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13604 if (np
->has_write_tier()) {
13605 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
13606 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
13607 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13609 np
->clear_read_tier();
13610 np
->clear_write_tier();
13611 np
->set_last_force_op_resend(pending_inc
.epoch
);
13612 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13613 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13614 get_last_committed() + 1));
13616 } else if (prefix
== "osd tier cache-mode") {
13617 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13618 if (err
== -EAGAIN
)
13621 goto reply_no_propose
;
13623 cmd_getval(cmdmap
, "pool", poolstr
);
13624 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13626 ss
<< "unrecognized pool '" << poolstr
<< "'";
13628 goto reply_no_propose
;
13630 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13632 if (!p
->is_tier()) {
13633 ss
<< "pool '" << poolstr
<< "' is not a tier";
13635 goto reply_no_propose
;
13638 cmd_getval(cmdmap
, "mode", modestr
);
13639 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13640 if (int(mode
) < 0) {
13641 ss
<< "'" << modestr
<< "' is not a valid cache mode";
13643 goto reply_no_propose
;
13647 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13649 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
13650 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
13651 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
13653 goto reply_no_propose
;
13655 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13656 mode
!= pg_pool_t::CACHEMODE_NONE
&&
13657 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13658 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
13660 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
13661 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13663 goto reply_no_propose
;
13666 // pool already has this cache-mode set and there are no pending changes
13667 if (p
->cache_mode
== mode
&&
13668 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
13669 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
13670 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
13671 << " to " << pg_pool_t::get_cache_mode_name(mode
);
13673 goto reply_no_propose
;
13676 /* Mode description:
13678 * none: No cache-mode defined
13679 * forward: Forward all reads and writes to base pool [removed]
13680 * writeback: Cache writes, promote reads from base pool
13681 * readonly: Forward writes to base pool
13682 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13683 * proxy: Proxy all reads and writes to base pool
13684 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13686 * Hence, these are the allowed transitions:
13689 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13690 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13691 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13692 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13693 * writeback -> readproxy || proxy
13697 // We check if the transition is valid against the current pool mode, as
13698 // it is the only committed state thus far. We will blantly squash
13699 // whatever mode is on the pending state.
13701 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
13702 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13703 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
13704 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
13705 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
13706 << "' pool; only '"
13707 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
13709 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
13712 goto reply_no_propose
;
13714 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
13715 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13716 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13717 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13719 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
13720 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13721 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13723 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13724 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13725 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13727 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13728 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13729 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13730 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13732 const pool_stat_t
* pstats
=
13733 mon
.mgrstatmon()->get_pool_stat(pool_id
);
13735 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13736 ss
<< "unable to set cache-mode '"
13737 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13738 << "': dirty objects found";
13740 goto reply_no_propose
;
13744 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13745 np
->cache_mode
= mode
;
13746 // set this both when moving to and from cache_mode NONE. this is to
13747 // capture legacy pools that were set up before this flag existed.
13748 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13749 ss
<< "set cache-mode for pool '" << poolstr
13750 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13751 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13752 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13753 ceph_assert(base_pool
);
13754 if (base_pool
->read_tier
== pool_id
||
13755 base_pool
->write_tier
== pool_id
)
13756 ss
<<" (WARNING: pool is still configured as read or write tier)";
13758 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13759 get_last_committed() + 1));
13761 } else if (prefix
== "osd tier add-cache") {
13762 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13763 if (err
== -EAGAIN
)
13766 goto reply_no_propose
;
13768 cmd_getval(cmdmap
, "pool", poolstr
);
13769 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13771 ss
<< "unrecognized pool '" << poolstr
<< "'";
13773 goto reply_no_propose
;
13775 string tierpoolstr
;
13776 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13777 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13778 if (tierpool_id
< 0) {
13779 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13781 goto reply_no_propose
;
13783 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13785 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13788 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13789 goto reply_no_propose
;
13793 if (!cmd_getval(cmdmap
, "size", size
)) {
13794 ss
<< "unable to parse 'size' value '"
13795 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13797 goto reply_no_propose
;
13799 // make sure new tier is empty
13800 const pool_stat_t
*pstats
=
13801 mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13802 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13803 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13805 goto reply_no_propose
;
13807 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13808 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13809 if (int(mode
) < 0) {
13810 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13812 goto reply_no_propose
;
13814 HitSet::Params hsp
;
13815 auto& cache_hit_set_type
=
13816 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13817 if (cache_hit_set_type
== "bloom") {
13818 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13819 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13820 hsp
= HitSet::Params(bsp
);
13821 } else if (cache_hit_set_type
== "explicit_hash") {
13822 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13823 } else if (cache_hit_set_type
== "explicit_object") {
13824 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13826 ss
<< "osd tier cache default hit set type '"
13827 << cache_hit_set_type
<< "' is not a known type";
13829 goto reply_no_propose
;
13832 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13833 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13834 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13837 np
->tiers
.insert(tierpool_id
);
13838 np
->read_tier
= np
->write_tier
= tierpool_id
;
13839 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13840 np
->set_last_force_op_resend(pending_inc
.epoch
);
13841 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13842 ntp
->tier_of
= pool_id
;
13843 ntp
->cache_mode
= mode
;
13844 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13845 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13846 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13847 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13848 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13849 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13850 ntp
->hit_set_params
= hsp
;
13851 ntp
->target_max_bytes
= size
;
13852 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13853 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13854 get_last_committed() + 1));
13856 } else if (prefix
== "osd pool set-quota") {
13858 cmd_getval(cmdmap
, "pool", poolstr
);
13859 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13861 ss
<< "unrecognized pool '" << poolstr
<< "'";
13863 goto reply_no_propose
;
13867 cmd_getval(cmdmap
, "field", field
);
13868 if (field
!= "max_objects" && field
!= "max_bytes") {
13869 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13871 goto reply_no_propose
;
13874 // val could contain unit designations, so we treat as a string
13876 cmd_getval(cmdmap
, "val", val
);
13879 if (field
== "max_objects") {
13880 value
= strict_si_cast
<uint64_t>(val
, &tss
);
13881 } else if (field
== "max_bytes") {
13882 value
= strict_iecstrtoll(val
, &tss
);
13884 ceph_abort_msg("unrecognized option");
13886 if (!tss
.empty()) {
13887 ss
<< "error parsing value '" << val
<< "': " << tss
;
13889 goto reply_no_propose
;
13892 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13893 if (field
== "max_objects") {
13894 pi
->quota_max_objects
= value
;
13895 } else if (field
== "max_bytes") {
13896 pi
->quota_max_bytes
= value
;
13898 ceph_abort_msg("unrecognized option");
13900 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13902 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13903 get_last_committed() + 1));
13905 } else if (prefix
== "osd pool application enable" ||
13906 prefix
== "osd pool application disable" ||
13907 prefix
== "osd pool application set" ||
13908 prefix
== "osd pool application rm") {
13909 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13910 if (err
== -EAGAIN
) {
13912 } else if (err
< 0) {
13913 goto reply_no_propose
;
13917 } else if (prefix
== "osd force-create-pg") {
13920 err
= parse_pgid(cmdmap
, ss
, pgid
, pgidstr
);
13922 goto reply_no_propose
;
13924 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13926 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13927 << "that the cluster will give up ever trying to recover the lost data. Do this "
13928 << "only if you are certain that all copies of the PG are in fact lost and you are "
13929 << "willing to accept that the data is permanently destroyed. Pass "
13930 << "--yes-i-really-mean-it to proceed.";
13932 goto reply_no_propose
;
13936 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13937 auto emplaced
= creating_pgs
.pgs
.emplace(
13939 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13940 ceph_clock_now()));
13941 creating_now
= emplaced
.second
;
13943 if (creating_now
) {
13944 ss
<< "pg " << pgidstr
<< " now creating, ok";
13945 // set the pool's CREATING flag so that (1) the osd won't ignore our
13946 // create message and (2) we won't propose any future pg_num changes
13947 // until after the PG has been instantiated.
13948 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13949 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13951 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13955 ss
<< "pg " << pgid
<< " already creating";
13957 goto reply_no_propose
;
13959 } else if (prefix
== "osd force_healthy_stretch_mode") {
13961 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13963 ss
<< "This command will require peering across multiple CRUSH buckets "
13964 "(probably two data centers or availability zones?) and may result in PGs "
13965 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13967 goto reply_no_propose
;
13969 try_end_recovery_stretch_mode(true);
13970 ss
<< "Triggering healthy stretch mode";
13972 goto reply_no_propose
;
13973 } else if (prefix
== "osd force_recovery_stretch_mode") {
13975 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13977 ss
<< "This command will increase pool sizes to try and spread them "
13978 "across multiple CRUSH buckets (probably two data centers or "
13979 "availability zones?) and should have happened automatically"
13980 "Pass --yes-i-really-mean-it to proceed.";
13982 goto reply_no_propose
;
13984 mon
.go_recovery_stretch_mode();
13985 ss
<< "Triggering recovery stretch mode";
13987 goto reply_no_propose
;
13988 } else if (prefix
== "osd set-allow-crimson") {
13991 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13993 bool experimental_enabled
=
13994 g_ceph_context
->check_experimental_feature_enabled("crimson");
13995 if (!sure
|| !experimental_enabled
) {
13996 ss
<< "This command will allow usage of crimson-osd osd daemons. "
13997 << "crimson-osd is not considered stable and will likely cause "
13998 << "crashes or data corruption. At this time, crimson-osd is mainly "
13999 << "useful for performance evaluation, testing, and development. "
14000 << "If you are sure, add --yes-i-really-mean-it and add 'crimson' to "
14001 << "the experimental features config. This setting is irrevocable.";
14003 goto reply_no_propose
;
14007 if (osdmap
.get_allow_crimson()) {
14008 goto reply_no_propose
;
14010 pending_inc
.set_allow_crimson();
14019 if (err
< 0 && rs
.length() == 0)
14020 rs
= cpp_strerror(err
);
14021 mon
.reply_command(op
, err
, rs
, rdata
, get_last_committed());
14022 return false; /* nothing to propose */
14026 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
14027 get_last_committed() + 1));
14032 // Some osd commands split changes across two epochs.
14033 // It seems this is mostly for crush rule changes. It doesn't need
14034 // to be this way but it's a bit of work to fix that. For now,
14035 // trigger a proposal by returning true and then retry the command
14036 // to complete the operation.
14037 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// enforce_pool_op_caps: capability gate for an incoming MPoolOp request.
// Unmanaged-snapshot create/delete ops go through a dedicated policy check
// (is_unmanaged_snap_op_permitted); every other pool op only requires
// generic write caps on the "osd" service.  On failure the request is
// answered with -EPERM via _pool_op_reply().
// NOTE(review): this extract is lossy -- the embedded original line numbers
// jump (e.g. 14046 -> 14048), so enclosing if/switch braces and return
// statements are not visible here; do not infer control flow from line
// adjacency alone.
14041 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
14043 op
->mark_osdmon_event(__func__
);
// Typed view of the request payload.
14045 auto m
= op
->get_req
<MPoolOp
>();
14046 MonSession
*session
= op
->get_session();
// Reply -EPERM -- presumably guarded by a "no session" check whose line is
// missing from this extract; TODO(review) confirm against the full source.
14048 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Unmanaged-snapshot ops: stricter, snapshot-specific permission check.
14053 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14054 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14056 const std::string
* pool_name
= nullptr;
14057 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
// Resolve the pool name only when the pool still exists in the osdmap.
14058 if (pg_pool
!= nullptr) {
14059 pool_name
= &osdmap
.get_pool_name(m
->pool
);
// Policy check combining entity name, caps, and the peer's socket address.
14062 if (!is_unmanaged_snap_op_permitted(cct
, mon
.key_server
,
14063 session
->entity_name
, session
->caps
,
14064 session
->get_peer_socket_addr(),
// Log at level 0: permission failures should always be operator-visible.
14066 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
14067 << "privileges. message: " << *m
<< std::endl
14068 << "caps: " << session
->caps
<< dendl
;
14069 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Default path: any other pool op needs MON_CAP_W on the "osd" service.
14075 if (!session
->is_capable("osd", MON_CAP_W
)) {
14076 dout(0) << "got pool op from entity with insufficient privileges. "
14077 << "message: " << *m
<< std::endl
14078 << "caps: " << session
->caps
<< dendl
;
14079 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// preprocess_pool_op: read-only fast path for MPoolOp messages.  Answers
// requests that need no map change (wrong fsid, missing pool, idempotent
// repeats such as deleting an already-deleted snap) directly from the
// committed osdmap; anything that requires a map update falls through to
// prepare_pool_op().
// NOTE(review): lossy extract -- the switch(m->op) header, returns, and
// closing braces between the visible case labels were dropped; consult the
// full source before reasoning about fallthrough.
14088 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
14090 op
->mark_osdmon_event(__func__
);
14091 auto m
= op
->get_req
<MPoolOp
>();
// Capability check first; enforce_pool_op_caps replies on failure.
14093 if (enforce_pool_op_caps(op
)) {
// Drop requests addressed to a different cluster (fsid mismatch).
14097 if (m
->fsid
!= mon
.monmap
->fsid
) {
14098 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
14099 << " != " << mon
.monmap
->fsid
<< " for " << *m
<< dendl
;
14100 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Pool creation has its own preprocess path.
14104 if (m
->op
== POOL_OP_CREATE
)
14105 return preprocess_pool_op_create(op
);
14107 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
// Operating on a nonexistent pool: deleting it is a no-op success
// (idempotent); anything else is -ENOENT.
14108 if (p
== nullptr) {
14109 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
14110 if (m
->op
== POOL_OP_DELETE
) {
14111 _pool_op_reply(op
, 0, osdmap
.get_epoch());
14113 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
14118 // check if the snap and snapname exist
14119 bool snap_exists
= false;
14120 if (p
->snap_exists(m
->name
.c_str()))
14121 snap_exists
= true;
// Pool-managed snap creation is invalid on unmanaged-snaps pools and tiers.
14124 case POOL_OP_CREATE_SNAP
:
14125 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
14126 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Reply 0 -- presumably when the snap already exists (guard line missing
// from this extract; TODO(review) confirm).
14130 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// The two snapshot modes are mutually exclusive per pool.
14134 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14135 if (p
->is_pool_snaps_mode()) {
14136 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
14140 case POOL_OP_DELETE_SNAP
:
14141 if (p
->is_unmanaged_snaps_mode()) {
14142 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Deleting a snap that doesn't exist is an idempotent success.
14145 if (!snap_exists
) {
14146 _pool_op_reply(op
, 0, osdmap
.get_epoch());
14150 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14151 if (p
->is_pool_snaps_mode()) {
14152 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Already removed/purged: answer success without proposing anything.
14155 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
14156 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Reply 0 here -- presumably only when the name does NOT resolve (i.e. the
// pool is already gone); the negated condition line is missing from this
// extract.  TODO(review) confirm the ">= 0" branch semantics.
14160 case POOL_OP_DELETE
:
14161 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
14162 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Legacy op retained in the enum; handling not visible in this extract.
14166 case POOL_OP_AUID_CHANGE
:
// _is_removed_snap: true iff (pool, snap) is already gone from the
// COMMITTED osdmap -- the pool no longer exists, the snap sits in the
// osdmap's removed_snaps_queue, or it falls in a purged-snap interval
// recorded in the mon store (lookup_purged_snap).  Pending (uncommitted)
// removals are covered by _is_pending_removed_snap() instead.
// NOTE(review): the return statements were dropped by this extract; each
// dout() below presumably precedes a "return true" (and the final r-check a
// return based on lookup_purged_snap's result) -- confirm in full source.
14176 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
// Pool no longer exists at all.
14178 if (!osdmap
.have_pg_pool(pool
)) {
14179 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14180 << " - pool dne" << dendl
;
// Snap is queued for removal in the committed map.
14183 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
14184 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14185 << " - in osdmap removed_snaps_queue" << dendl
;
// Check the purged-snaps record; [begin, end) is the purged interval that
// contains the snap when the lookup succeeds.
14188 snapid_t begin
, end
;
14189 int r
= lookup_purged_snap(pool
, snap
, &begin
, &end
);
14191 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14192 << " - purged, [" << begin
<< "," << end
<< ")" << dendl
;
// _is_pending_removed_snap: true iff (pool, snap) is being removed by the
// PENDING (not yet committed) increment -- either the whole pool is queued
// for deletion (pending_inc.old_pools) or the snap is in the pending
// new_removed_snaps set.  Complements _is_removed_snap(), which checks the
// committed map.
// NOTE(review): return statements are missing from this extract; each
// dout() presumably precedes a "return true" -- confirm in full source.
14198 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
// Whole pool queued for deletion in the pending increment.
14200 if (pending_inc
.old_pools
.count(pool
)) {
14201 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14202 << " - pool pending deletion" << dendl
;
// Snap individually queued for removal in the pending increment.
14205 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
14206 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
14207 << " - in pending new_removed_snaps" << dendl
;
// preprocess_pool_op_create: read-only fast path for POOL_OP_CREATE.  If a
// pool with the requested name already exists, reply success immediately
// (pool creation is idempotent); otherwise the request falls through to the
// prepare path.
// NOTE(review): the "pool >= 0" guard and the return statements were
// dropped by this extract -- the reply below presumably fires only when the
// name lookup succeeds; confirm in full source.
14213 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
14215 op
->mark_osdmon_event(__func__
);
14216 auto m
= op
->get_req
<MPoolOp
>();
// Resolve the requested pool name against the committed osdmap.
14217 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
14219 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// prepare_pool_op: update path for MPoolOp messages.  Dispatches pool
// create/delete to their dedicated prepare functions, validates snapshot
// ops against the committed pool state, then applies the snapshot change to
// a PROJECTED copy of the pool (pp) which is staged into
// pending_inc.new_pools and answered once the proposal commits
// (C_PoolOp callback).
// NOTE(review): lossy extract -- switch(m->op) headers, several guards,
// returns, and closing braces were dropped (embedded line numbers jump);
// the comments below hedge where control flow is not visible.
14226 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
14228 op
->mark_osdmon_event(__func__
);
14229 auto m
= op
->get_req
<MPoolOp
>();
14230 dout(10) << "prepare_pool_op " << *m
<< dendl
;
// Pool create/delete have their own prepare paths.
14231 if (m
->op
== POOL_OP_CREATE
) {
14232 return prepare_pool_op_create(op
);
14233 } else if (m
->op
== POOL_OP_DELETE
) {
14234 return prepare_pool_op_delete(op
);
// Tracks whether the projected pool was actually modified below.
14238 bool changed
= false;
// The target pool must exist in the committed map.
14240 if (!osdmap
.have_pg_pool(m
->pool
)) {
14241 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
14245 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
// Snapshot creation is refused for pools attached to a CephFS filesystem.
14247 if (m
->op
== POOL_OP_CREATE_SNAP
||
14248 m
->op
== POOL_OP_CREATE_UNMANAGED_SNAP
) {
14249 if (const auto& fsmap
= mon
.mdsmon()->get_fsmap(); fsmap
.pool_in_use(m
->pool
)) {
14250 dout(20) << "monitor-managed snapshots have been disabled for pools "
14251 " attached to an fs - pool:" << m
->pool
<< dendl
;
14252 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
// --- Validation against the COMMITTED pool state ---------------------
// Pool-managed snaps cannot be created on a cache tier.
14258 case POOL_OP_CREATE_SNAP
:
14259 if (pool
->is_tier()) {
14261 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
14263 } // else, fall through
14264 case POOL_OP_DELETE_SNAP
:
// Idempotency: re-creating an existing snap or deleting a missing one
// is answered directly (with ret) instead of proposing a change.
14265 if (!pool
->is_unmanaged_snaps_mode()) {
14266 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
14267 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
14268 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
14276 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
14279 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14280 // we won't allow removal of an unmanaged snapshot from a pool
14281 // not in unmanaged snaps mode.
14282 if (!pool
->is_unmanaged_snaps_mode()) {
14283 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
14287 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14288 // but we will allow creating an unmanaged snapshot on any pool
14289 // as long as it is not in 'pool' snaps mode.
14290 if (pool
->is_pool_snaps_mode()) {
14291 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// --- Build the projected pool (pending copy if one exists, else the
// committed pool) and apply the mutation to it ------------------------
14296 // projected pool info
14298 if (pending_inc
.new_pools
.count(m
->pool
))
14299 pp
= pending_inc
.new_pools
[m
->pool
];
14301 pp
= *osdmap
.get_pg_pool(m
->pool
);
// Payload returned to the client with the reply (e.g. the new snapid).
14303 bufferlist reply_data
;
14305 // pool snaps vs unmanaged snaps are mutually exclusive
// Re-check the mode constraints against the PROJECTED state too.
14307 case POOL_OP_CREATE_SNAP
:
14308 case POOL_OP_DELETE_SNAP
:
14309 if (pp
.is_unmanaged_snaps_mode()) {
14315 case POOL_OP_CREATE_UNMANAGED_SNAP
:
14316 case POOL_OP_DELETE_UNMANAGED_SNAP
:
14317 if (pp
.is_pool_snaps_mode()) {
// --- Apply the operation to the projected pool -----------------------
14324 case POOL_OP_CREATE_SNAP
:
14325 if (!pp
.snap_exists(m
->name
.c_str())) {
14326 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
14327 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
14328 << " seq " << pp
.get_snap_epoch() << dendl
;
14333 case POOL_OP_DELETE_SNAP
:
// Resolve the snap by name, then queue it for removal in the increment.
14335 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
14338 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
14344 case POOL_OP_CREATE_UNMANAGED_SNAP
:
// The pre-octopus flag selects the legacy snapid-allocation behavior;
// the new snapid is encoded into the reply payload for the client.
14346 uint64_t snapid
= pp
.add_unmanaged_snap(
14347 osdmap
.require_osd_release
< ceph_release_t::octopus
);
14348 encode(snapid
, reply_data
);
14353 case POOL_OP_DELETE_UNMANAGED_SNAP
:
// Skip work if the snap is already removed (committed or pending).
14354 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
14355 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
// A snapid beyond the pool's snap_seq was never allocated.
14356 if (m
->snapid
> pp
.get_snap_seq()) {
14357 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
14360 pp
.remove_unmanaged_snap(
14362 osdmap
.require_osd_release
< ceph_release_t::octopus
);
14363 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
14364 // also record the new seq as purged: this avoids a discontinuity
14365 // after all of the snaps have been purged, since the seq assigned
14366 // during removal lives in the same namespace as the actual snaps.
14367 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
// AUID changes are no longer supported.
14372 case POOL_OP_AUID_CHANGE
:
14373 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
// --- Stage the projected pool into the pending increment -------------
// (presumably guarded by `changed`; the guard line is not visible here).
14382 pp
.set_snap_epoch(pending_inc
.epoch
);
14383 pending_inc
.new_pools
[m
->pool
] = pp
;
// Reply only after the proposal commits, at pending_inc.epoch.
14387 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
// prepare_pool_op_create: update path for POOL_OP_CREATE.  Stages the new
// pool via prepare_new_pool() and defers the client reply (carrying err and
// the pending epoch) until the proposal commits.
// NOTE(review): the trailing "return true" / closing brace were dropped by
// this extract.
14391 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
14393 op
->mark_osdmon_event(__func__
);
// prepare_new_pool does the validation and stages the pool in pending_inc;
// its result code is what the client eventually receives.
14394 int err
= prepare_new_pool(op
);
14395 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
// _check_remove_pool: validates that a pool may be deleted.  Refuses when
// the pool is in use by CephFS, is a cache tier (tier_of >= 0), still has
// tiers attached, when mon_allow_pool_delete is false, or when the pool
// carries the NODELETE flag; writes a human-readable reason into *ss in
// each case.  On success *ss gets the "removed" message.
// NOTE(review): lossy extract -- the return statements (error codes) and
// part of the signature (the *ss parameter declaration, line 14400) were
// dropped; confirm the exact return values in the full source.
14399 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
14402 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
14404 // If the Pool is in use by CephFS, refuse to delete it
// Uses the PENDING fsmap so an in-flight fs attach also blocks deletion.
14405 FSMap
const &pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14406 if (pending_fsmap
.pool_in_use(pool_id
)) {
14407 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
// A cache tier must be detached from its base pool before deletion.
14411 if (pool
.tier_of
>= 0) {
14412 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
14413 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
// A base pool with attached tiers cannot be deleted; list the tiers.
14416 if (!pool
.tiers
.empty()) {
14417 *ss
<< "pool '" << poolstr
<< "' has tiers";
14418 for(auto tier
: pool
.tiers
) {
14419 *ss
<< " " << osdmap
.get_pool_name(tier
);
// Global safety switch: deletion must be explicitly enabled by config.
14424 if (!g_conf()->mon_allow_pool_delete
) {
14425 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
// Per-pool safety flag.
14429 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
14430 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
// All checks passed: success message for the caller.
14434 *ss
<< "pool '" << poolstr
<< "' removed";
// NOTE(review): line-shattered extraction — statements are split across
// physical lines and gaps in the embedded original line numbers show that
// lines (returns, braces) are missing.  Code kept byte-identical; only
// comments are added.
14439 * Check if it is safe to add a tier to a base pool
14442 * True if the operation should proceed, false if we should abort here
14443 * (abort doesn't necessarily mean error, could be idempotency)
14445 bool OSDMonitor::_check_become_tier(
14446 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
14447 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
// (trailing parameters — presumably `int *err, ostream *ss` given the
// `*ss <<` writes below — are on missing lines; verify against upstream)
14451 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
14452 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
// crimson (Seastar-based OSD) pools do not support tiering, either side
14454 if (tier_pool
->is_crimson()) {
14455 *ss
<< "pool '" << tier_pool_name
<< "' is a crimson pool, tiering "
14456 << "features are not supported";
14460 if (base_pool
->is_crimson()) {
14461 *ss
<< "pool '" << base_pool_name
<< "' is a crimson pool, tiering "
14462 << "features are not supported";
// a pool used directly by CephFS cannot become a cache tier
14467 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14468 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
14469 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
// idempotency: already a tier of this very base pool
14474 if (base_pool
->tiers
.count(tier_pool_id
)) {
14475 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
14477 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
14478 << base_pool_name
<< "'";
// multi-level tiering is not supported in any direction
14482 if (base_pool
->is_tier()) {
14483 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
14484 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
14485 << "multiple tiers are not yet supported.";
14490 if (tier_pool
->has_tiers()) {
14491 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
14492 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
14493 it
!= tier_pool
->tiers
.end(); ++it
)
14494 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
14495 *ss
<< " multiple tiers are not yet supported.";
14500 if (tier_pool
->is_tier()) {
14501 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
14502 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
// NOTE(review): line-shattered extraction — statements split across lines,
// gaps in the embedded line numbers (e.g. 14534 -> 14540) mean returns and
// braces are missing.  Code kept byte-identical; comments only.
14513 * Check if it is safe to remove a tier from this base pool
14516 * True if the operation should proceed, false if we should abort here
14517 * (abort doesn't necessarily mean error, could be idempotency)
14519 bool OSDMonitor::_check_remove_tier(
14520 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14521 const pg_pool_t
*tier_pool
,
14522 int *err
, ostream
*ss
) const
14524 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14526 // Apply CephFS-specific checks
14527 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14528 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
14529 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
14530 // If the underlying pool is erasure coded and does not allow EC
14531 // overwrites, we can't permit the removal of the replicated tier that
14532 // CephFS relies on to access it
14533 *ss
<< "pool '" << base_pool_name
<<
14534 "' does not allow EC overwrites and is in use by CephFS"
// a writeback cache tier in front of a CephFS pool must be flushed and
// its cache mode changed before it can be removed
14540 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
14541 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
14542 "tier is still in use as a writeback cache. Change the cache "
14543 "mode and flush the cache before removing it";
// NOTE(review): line-shattered extraction — statements split across lines;
// gaps in the embedded original line numbers (e.g. 14558 -> 14562) show
// that early-return/brace lines are missing.  Code kept byte-identical;
// only comments are added or improved.
//
// _prepare_remove_pool: stages removal of `pool` in pending_inc.  Visible
// steps: re-check deletability (committed and pending pool state), honor
// mon_fake_pool_delete (rename to "<name>.<id>.DELETED" instead of
// deleting, unless no_fake), then scrub every map structure that can
// reference the pool: pg_temp, primary_temp, pg_upmap(_items) — both the
// committed osdmap and the pending increment — and CRUSH choose_args.
14553 int OSDMonitor::_prepare_remove_pool(
14554 int64_t pool
, ostream
*ss
, bool no_fake
)
14556 dout(10) << __func__
<< " " << pool
<< dendl
;
14557 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
14558 int r
= _check_remove_pool(pool
, *p
, ss
);
// also validate against the *pending* version of the pool, if one exists
14562 auto new_pool
= pending_inc
.new_pools
.find(pool
);
14563 if (new_pool
!= pending_inc
.new_pools
.end()) {
14564 // if there is a problem with the pending info, wait and retry
14566 const auto& p
= new_pool
->second
;
14567 int r
= _check_remove_pool(pool
, p
, ss
);
// idempotency: removal already staged in this increment
14572 if (pending_inc
.old_pools
.count(pool
)) {
14573 dout(10) << __func__
<< " " << pool
<< " already pending removal"
// fake deletion: rename instead of destroying data (safety net config)
14578 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
14579 string old_name
= osdmap
.get_pool_name(pool
);
14580 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
14581 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
14582 << old_name
<< " -> " << new_name
<< dendl
;
14583 pending_inc
.new_pool_names
[pool
] = new_name
;
// real deletion: record the pool id in the increment's old_pools set
14588 pending_inc
.old_pools
.insert(pool
);
14590 // remove any pg_temp mappings for this pool
14591 for (auto p
= osdmap
.pg_temp
->begin();
14592 p
!= osdmap
.pg_temp
->end();
14594 if (p
->first
.pool() == pool
) {
14595 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
14596 << p
->first
<< dendl
;
14597 pending_inc
.new_pg_temp
[p
->first
].clear();
14600 // remove any primary_temp mappings for this pool
14601 for (auto p
= osdmap
.primary_temp
->begin();
14602 p
!= osdmap
.primary_temp
->end();
14604 if (p
->first
.pool() == pool
) {
14605 dout(10) << __func__
<< " " << pool
14606 << " removing obsolete primary_temp" << p
->first
<< dendl
;
14607 pending_inc
.new_primary_temp
[p
->first
] = -1;
14610 // remove any pg_upmap mappings for this pool
14611 for (auto& p
: osdmap
.pg_upmap
) {
14612 if (p
.first
.pool() == pool
) {
14613 dout(10) << __func__
<< " " << pool
14614 << " removing obsolete pg_upmap "
14615 << p
.first
<< dendl
;
14616 pending_inc
.old_pg_upmap
.insert(p
.first
);
14619 // remove any pending pg_upmap mappings for this pool
14621 auto it
= pending_inc
.new_pg_upmap
.begin();
14622 while (it
!= pending_inc
.new_pg_upmap
.end()) {
14623 if (it
->first
.pool() == pool
) {
14624 dout(10) << __func__
<< " " << pool
14625 << " removing pending pg_upmap "
14626 << it
->first
<< dendl
;
14627 it
= pending_inc
.new_pg_upmap
.erase(it
);
14633 // remove any pg_upmap_items mappings for this pool
14634 for (auto& p
: osdmap
.pg_upmap_items
) {
14635 if (p
.first
.pool() == pool
) {
14636 dout(10) << __func__
<< " " << pool
14637 << " removing obsolete pg_upmap_items " << p
.first
14639 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
14642 // remove any pending pg_upmap_items mappings for this pool
14644 auto it
= pending_inc
.new_pg_upmap_items
.begin();
14645 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
14646 if (it
->first
.pool() == pool
) {
14647 dout(10) << __func__
<< " " << pool
14648 << " removing pending pg_upmap_items "
14649 << it
->first
<< dendl
;
14650 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
14657 // remove any choose_args for this pool
14658 CrushWrapper newcrush
= _get_pending_crush();
14659 if (newcrush
.have_choose_args(pool
)) {
14660 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
14661 newcrush
.rm_choose_args(pool
);
14662 pending_inc
.crush
.clear();
14663 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
// NOTE(review): line-shattered extraction — gaps in the embedded line
// numbers (14672 -> 14675, 14678 -> 14683) hide this function's return
// statements (error codes are NOT visible here — do not guess them;
// restore from upstream).  Code kept byte-identical; comments only.
//
// _prepare_rename_pool: stages a rename of `pool` to `newname` in
// pending_inc.new_pool_names, refusing when the pool is pending removal
// or when another pool already has `newname` pending.
14668 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
14670 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
// refuse to rename a pool that is already queued for deletion
14671 if (pending_inc
.old_pools
.count(pool
)) {
14672 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
// reject if a *different* pool already has this name pending
14675 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
14676 p
!= pending_inc
.new_pool_names
.end();
14678 if (p
->second
== newname
&& p
->first
!= pool
) {
14683 pending_inc
.new_pool_names
[pool
] = newname
;
14687 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
14689 op
->mark_osdmon_event(__func__
);
14690 auto m
= op
->get_req
<MPoolOp
>();
14692 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
14693 if (ret
== -EAGAIN
) {
14694 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
14698 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
14699 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
14700 pending_inc
.epoch
));
14704 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
14705 int ret
, epoch_t epoch
, bufferlist
*blp
)
14707 op
->mark_osdmon_event(__func__
);
14708 auto m
= op
->get_req
<MPoolOp
>();
14709 dout(20) << "_pool_op_reply " << ret
<< dendl
;
14710 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
14711 ret
, epoch
, get_last_committed(), blp
);
14712 mon
.send_reply(op
, reply
);
// NOTE(review): line-shattered extraction — gaps in the embedded line
// numbers (e.g. 14721 -> 14723, 14739 -> 14742) hide declarations (the
// `prio`/`n` locals), early continues/returns and braces.  Code kept
// byte-identical; comments only.
//
// convert_pool_priorities: rescales per-pool "recovery_priority" values
// so they fit within [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX].
// Pass 1 finds the extremes across all pools; if they already fit,
// nothing is changed.  Pass 2 linearly scales each out-of-range priority
// and stages the updated pool in pending_inc.new_pools.
14715 void OSDMonitor::convert_pool_priorities(void)
14717 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
14718 int64_t max_prio
= 0;
14719 int64_t min_prio
= 0;
// pass 1: find the most extreme priorities currently set
14720 for (const auto &i
: osdmap
.get_pools()) {
14721 const auto &pool
= i
.second
;
14723 if (pool
.opts
.is_set(key
)) {
14725 pool
.opts
.get(key
, &prio
);
14726 if (prio
> max_prio
)
14728 if (prio
< min_prio
)
// everything already within the allowed range — nothing to do
14732 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
14733 dout(20) << __func__
<< " nothing to fix" << dendl
;
14736 // Current pool priorities exceeds new maximum
// pass 2: rescale each pool's priority proportionally into range
14737 for (const auto &i
: osdmap
.get_pools()) {
14738 const auto pool_id
= i
.first
;
14739 pg_pool_t pool
= i
.second
;
14742 pool
.opts
.get(key
, &prio
);
14745 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
14746 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14747 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
14748 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
14749 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14750 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
// a scaled value of 0 unsets the option entirely; otherwise store it
14755 pool
.opts
.unset(key
);
14757 pool
.opts
.set(key
, static_cast<int64_t>(n
));
14759 dout(10) << __func__
<< " pool " << pool_id
14760 << " recovery_priority adjusted "
14761 << prio
<< " to " << n
<< dendl
;
14762 pool
.last_change
= pending_inc
.epoch
;
14763 pending_inc
.new_pools
[pool_id
] = pool
;
// NOTE(review): line-shattered extraction — gaps in the embedded line
// numbers (e.g. 14777 -> 14780, 14795 -> 14798, and the tail past 14801)
// hide early returns, braces and the function's conclusion (presumably
// inserting pp into *pools and setting *okay).  Code kept byte-identical;
// comments only.
//
// try_enable_stretch_mode_pools: pre-validates every pool for stretch
// mode (must be replicated; must either still have default size/min_size
// or already use the new crush rule) and copies each pool into
// pending_inc, collecting the mutable pg_pool_t pointers for the caller.
// On failure it writes a reason to ss and an error to *errcode (the
// `int *errcode` parameter itself is on a missing line — verify).
14767 void OSDMonitor::try_enable_stretch_mode_pools(stringstream
& ss
, bool *okay
,
14769 set
<pg_pool_t
*>* pools
,
14770 const string
& new_crush_rule
)
14772 dout(20) << __func__
<< dendl
;
14774 int new_crush_rule_result
= osdmap
.crush
->get_rule_id(new_crush_rule
);
14775 if (new_crush_rule_result
< 0) {
14776 ss
<< "unrecognized crush rule " << new_crush_rule_result
;
14777 *errcode
= new_crush_rule_result
;
14780 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
// validate every pool in the committed map
14781 for (const auto& pooli
: osdmap
.pools
) {
14782 int64_t poolid
= pooli
.first
;
14783 const pg_pool_t
*p
= &pooli
.second
;
14784 if (!p
->is_replicated()) {
14785 ss
<< "stretched pools must be replicated; '" << osdmap
.pool_name
[poolid
] << "' is erasure-coded";
14786 *errcode
= -EINVAL
;
// pools must be untouched (default size/min_size) unless they already
// use the target stretch rule
14789 uint8_t default_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
14790 if ((p
->get_size() != default_size
||
14791 (p
->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size
))) &&
14792 (p
->get_crush_rule() != new_rule
)) {
14793 ss
<< "we currently require stretch mode pools start out with the"
14794 " default size/min_size, which '" << osdmap
.pool_name
[poolid
] << "' does not";
14795 *errcode
= -EINVAL
;
14798 pg_pool_t
*pp
= pending_inc
.get_new_pool(poolid
, p
);
14799 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14800 // the attempt may fail and then we have these pool updates...but they won't do anything
14801 // if there is a failure, so if it's hard to change the interface, no need to bother
// NOTE(review): line-shattered extraction — gaps in the embedded line
// numbers (e.g. 14823 -> 14826, 14881+) hide early returns, braces and
// the function tail (presumably `*okay = true;`).  Code kept
// byte-identical; comments only.
//
// try_enable_stretch_mode: validates the cluster topology for stretch
// mode (a valid dividing bucket type with exactly two equally-weighted
// subtrees, a valid crush rule, bucket_count == 2) and — visible at the
// end — rewrites each candidate pool's crush rule, peering bucket
// parameters and size/min_size, then flips the stretch-mode fields of
// pending_inc.  When `commit` is true, validation failures assert.
14808 void OSDMonitor::try_enable_stretch_mode(stringstream
& ss
, bool *okay
,
14809 int *errcode
, bool commit
,
14810 const string
& dividing_bucket
,
14811 uint32_t bucket_count
,
14812 const set
<pg_pool_t
*>& pools
,
14813 const string
& new_crush_rule
)
14815 dout(20) << __func__
<< dendl
;
14817 CrushWrapper crush
= _get_pending_crush();
14818 int dividing_id
= -1;
// the dividing bucket name must be a known crush bucket type
14819 if (auto type_id
= crush
.get_validated_type_id(dividing_bucket
);
14820 !type_id
.has_value()) {
14821 ss
<< dividing_bucket
<< " is not a valid crush bucket type";
14822 *errcode
= -ENOENT
;
14823 ceph_assert(!commit
);
14826 dividing_id
= *type_id
;
// there must be exactly two subtrees of that type (2-site stretch only)
14828 vector
<int> subtrees
;
14829 crush
.get_subtree_of_type(dividing_id
, &subtrees
);
14830 if (subtrees
.size() != 2) {
14831 ss
<< "there are " << subtrees
.size() << dividing_bucket
14832 << "'s in the cluster but stretch mode currently only works with 2!";
14833 *errcode
= -EINVAL
;
14834 ceph_assert(!commit
|| subtrees
.size() == 2);
14838 int new_crush_rule_result
= crush
.get_rule_id(new_crush_rule
);
14839 if (new_crush_rule_result
< 0) {
14840 ss
<< "unrecognized crush rule " << new_crush_rule
;
14841 *errcode
= new_crush_rule_result
;
14842 ceph_assert(!commit
|| (new_crush_rule_result
> 0));
14845 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
// both sites must carry the same crush weight
14847 int weight1
= crush
.get_item_weight(subtrees
[0]);
14848 int weight2
= crush
.get_item_weight(subtrees
[1]);
14849 if (weight1
!= weight2
) {
14850 // TODO: I'm really not sure this is a good idea?
14851 ss
<< "the 2 " << dividing_bucket
14852 << "instances in the cluster have differing weights "
14853 << weight1
<< " and " << weight2
14854 <<" but stretch mode currently requires they be the same!";
14855 *errcode
= -EINVAL
;
14856 ceph_assert(!commit
|| (weight1
== weight2
));
14859 if (bucket_count
!= 2) {
14860 ss
<< "currently we only support 2-site stretch clusters!";
14861 *errcode
= -EINVAL
;
14862 ceph_assert(!commit
|| bucket_count
== 2);
14865 // TODO: check CRUSH rules for pools so that we are appropriately divided
// commit path: rewrite every candidate pool for stretch peering
14867 for (auto pool
: pools
) {
14868 pool
->crush_rule
= new_rule
;
14869 pool
->peering_crush_bucket_count
= bucket_count
;
14870 pool
->peering_crush_bucket_target
= bucket_count
;
14871 pool
->peering_crush_bucket_barrier
= dividing_id
;
14872 pool
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14873 pool
->size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
14874 pool
->min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
// and record the cluster-wide stretch state in the pending increment
14876 pending_inc
.change_stretch_mode
= true;
14877 pending_inc
.stretch_mode_enabled
= true;
14878 pending_inc
.new_stretch_bucket_count
= bucket_count
;
14879 pending_inc
.new_degraded_stretch_mode
= 0;
14880 pending_inc
.new_stretch_mode_bucket
= dividing_id
;
// NOTE(review): line-shattered extraction — gaps in the embedded line
// numbers hide braces.  Code kept byte-identical; comments only.
//
// check_for_dead_crush_zones: for each reported dead bucket (bucket name
// -> set of mon names), confirms via osdmap.subtree_is_down() that the
// whole CRUSH subtree is actually down.  Confirmed bucket ids go into
// *really_down_buckets and their mons into *really_down_mons.  Returns
// true iff at least one subtree was confirmed down.
14886 bool OSDMonitor::check_for_dead_crush_zones(const map
<string
,set
<string
>>& dead_buckets
,
14887 set
<int> *really_down_buckets
,
14888 set
<string
> *really_down_mons
)
14890 dout(20) << __func__
<< " with dead mon zones " << dead_buckets
<< dendl
;
14891 ceph_assert(is_readable());
14892 if (dead_buckets
.empty()) return false;
// down_cache memoizes subtree_is_down() lookups across buckets
14893 set
<int> down_cache
;
14894 bool really_down
= false;
// NOTE(review): `auto dbi` copies each map entry (pair with a set) per
// iteration; `const auto&` would avoid the copy — style nit for upstream.
14895 for (auto dbi
: dead_buckets
) {
14896 const string
& bucket_name
= dbi
.first
;
14897 ceph_assert(osdmap
.crush
->name_exists(bucket_name
));
14898 int bucket_id
= osdmap
.crush
->get_item_id(bucket_name
);
14899 dout(20) << "Checking " << bucket_name
<< " id " << bucket_id
14900 << " to see if OSDs are also down" << dendl
;
14901 bool subtree_down
= osdmap
.subtree_is_down(bucket_id
, &down_cache
);
14902 if (subtree_down
) {
14903 dout(20) << "subtree is down!" << dendl
;
14904 really_down
= true;
14905 really_down_buckets
->insert(bucket_id
);
14906 really_down_mons
->insert(dbi
.second
.begin(), dbi
.second
.end());
14909 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14910 << " and mons " << *really_down_mons
<< " are really down" << dendl
;
14911 return really_down
;
// NOTE(review): line-shattered extraction — gaps hide braces and blanks.
// Code kept byte-identical; comments only.
//
// trigger_degraded_stretch_mode: enters degraded stretch mode after one
// site is confirmed dead.  Updates the stretch fields of pending_inc,
// asserts the 2-site invariant (exactly one site left, exactly one live
// zone), then for every stretch pool pins peering to the surviving site,
// halves min_size, and forces an op resend.
14914 void OSDMonitor::trigger_degraded_stretch_mode(const set
<int>& dead_buckets
,
14915 const set
<string
>& live_zones
)
14917 dout(20) << __func__
<< dendl
;
14918 stretch_recovery_triggered
.set_from_double(0); // reset this; we can't go clean now!
14919 // update the general OSDMap changes
14920 pending_inc
.change_stretch_mode
= true;
14921 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14922 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14923 int new_site_count
= osdmap
.stretch_bucket_count
- dead_buckets
.size();
14924 ceph_assert(new_site_count
== 1); // stretch count 2!
14925 pending_inc
.new_degraded_stretch_mode
= new_site_count
;
14926 pending_inc
.new_recovering_stretch_mode
= 0;
14927 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14929 // and then apply them to all the pg_pool_ts
14930 ceph_assert(live_zones
.size() == 1); // only support 2 zones now
14931 const string
& remaining_site_name
= *(live_zones
.begin());
14932 ceph_assert(osdmap
.crush
->name_exists(remaining_site_name
));
14933 int remaining_site
= osdmap
.crush
->get_item_id(remaining_site_name
);
// rewrite every stretch-enabled pool to peer only with the live site
14934 for (auto pgi
: osdmap
.pools
) {
14935 if (pgi
.second
.peering_crush_bucket_count
) {
14936 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14937 newp
.peering_crush_bucket_count
= new_site_count
;
14938 newp
.peering_crush_mandatory_member
= remaining_site
;
14939 newp
.min_size
= pgi
.second
.min_size
/ 2; // only support 2 zones now
14940 newp
.set_last_force_op_resend(pending_inc
.epoch
);
// NOTE(review): line-shattered extraction — gaps hide braces.  Code kept
// byte-identical; comments only.
//
// trigger_recovery_stretch_mode: moves the cluster from degraded stretch
// mode into recovering stretch mode once the dead site returns.  Carries
// the existing stretch state into pending_inc, sets
// new_recovering_stretch_mode = 1, and forces an op resend on every
// stretch-enabled pool.
14946 void OSDMonitor::trigger_recovery_stretch_mode()
14948 dout(20) << __func__
<< dendl
;
14949 stretch_recovery_triggered
.set_from_double(0); // reset this so we don't go full-active prematurely
14950 pending_inc
.change_stretch_mode
= true;
14951 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14952 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14953 pending_inc
.new_degraded_stretch_mode
= osdmap
.degraded_stretch_mode
;
14954 pending_inc
.new_recovering_stretch_mode
= 1;
14955 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
// force an op resend so clients re-target under the new mode
14957 for (auto pgi
: osdmap
.pools
) {
14958 if (pgi
.second
.peering_crush_bucket_count
) {
14959 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14960 newp
.set_last_force_op_resend(pending_inc
.epoch
);
14966 void OSDMonitor::set_degraded_stretch_mode()
14968 stretch_recovery_triggered
.set_from_double(0);
14971 void OSDMonitor::set_recovery_stretch_mode()
14973 if (stretch_recovery_triggered
.is_zero()) {
14974 stretch_recovery_triggered
= ceph_clock_now();
14978 void OSDMonitor::set_healthy_stretch_mode()
14980 stretch_recovery_triggered
.set_from_double(0);
14983 void OSDMonitor::notify_new_pg_digest()
14985 dout(20) << __func__
<< dendl
;
14986 if (!stretch_recovery_triggered
.is_zero()) {
14987 try_end_recovery_stretch_mode(false);
14991 struct CMonExitRecovery
: public Context
{
14994 CMonExitRecovery(OSDMonitor
*mon
, bool f
) : m(mon
), force(f
) {}
14995 void finish(int r
) {
14996 m
->try_end_recovery_stretch_mode(force
);
// NOTE(review): line-shattered extraction — gaps (e.g. 15007 -> 15011,
// 15014 -> 15016) hide returns, braces and part of the time/force
// condition.  Code kept byte-identical; comments only.
//
// try_end_recovery_stretch_mode: if we are the leader and in recovering
// (degraded) stretch mode, check — once both this monitor and the mgr
// stat service are readable, re-queueing itself via CMonExitRecovery
// otherwise — whether the minimum recovery wait has elapsed and PGs are
// fully clean (or `force` is set), and if so trigger healthy stretch
// mode.
15000 void OSDMonitor::try_end_recovery_stretch_mode(bool force
)
15002 dout(20) << __func__
<< dendl
;
15003 if (!mon
.is_leader()) return;
15004 if (!mon
.is_degraded_stretch_mode()) return;
15005 if (!mon
.is_recovering_stretch_mode()) return;
15006 if (!is_readable()) {
15007 wait_for_readable_ctx(new CMonExitRecovery(this, force
));
// proceed only after the configured minimum recovery wait (or force;
// the rest of this condition, original line 15015, is missing)
15011 if (osdmap
.recovering_stretch_mode
&&
15012 ((!stretch_recovery_triggered
.is_zero() &&
15013 ceph_clock_now() - g_conf().get_val
<double>("mon_stretch_recovery_min_wait") >
15014 stretch_recovery_triggered
) ||
15016 if (!mon
.mgrstatmon()->is_readable()) {
15017 mon
.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force
));
15020 const PGMapDigest
& pgd
= mon
.mgrstatmon()->get_digest();
15021 double misplaced
, degraded
, inactive
, unknown
;
15022 pgd
.get_recovery_stats(&misplaced
// NOTE(review): "°raded" on the next line is a mis-decoded "&degraded"
// (HTML entity "&deg;" swallowed "&d"); restore "&degraded" upstream.
, °raded
, &inactive
, &unknown
);
15023 if (force
|| (degraded
== 0.0 && inactive
== 0.0 && unknown
== 0.0)) {
15024 // we can exit degraded stretch mode!
15025 mon
.trigger_healthy_stretch_mode();
15030 void OSDMonitor::trigger_healthy_stretch_mode()
15032 ceph_assert(is_writeable());
15033 stretch_recovery_triggered
.set_from_double(0);
15034 pending_inc
.change_stretch_mode
= true;
15035 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
15036 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
15037 pending_inc
.new_degraded_stretch_mode
= 0; // turn off degraded mode...
15038 pending_inc
.new_recovering_stretch_mode
= 0; //...and recovering mode!
15039 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
15040 for (auto pgi
: osdmap
.pools
) {
15041 if (pgi
.second
.peering_crush_bucket_count
) {
15042 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
15043 newp
.peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
15044 newp
.peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
15045 newp
.min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
15046 newp
.set_last_force_op_resend(pending_inc
.epoch
);