// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
70 #include "common/config.h"
71 #include "common/errno.h"
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
90 #include "json_spirit/json_spirit_reader.h"
92 #include <boost/algorithm/string/predicate.hpp>
99 using std::ostringstream
;
103 using std::stringstream
;
104 using std::to_string
;
107 using ceph::bufferlist
;
110 using ceph::ErasureCodeInterfaceRef
;
111 using ceph::ErasureCodePluginRegistry
;
112 using ceph::ErasureCodeProfile
;
113 using ceph::Formatter
;
114 using ceph::JSONFormatter
;
115 using ceph::make_message
;
#define dout_subsys ceph_subsys_mon

// Key/value-store prefixes under which the OSDMonitor persists its state:
// in-flight PG creations, per-OSD metadata, and snapshot bookkeeping.
static const std::string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const std::string OSD_METADATA_PREFIX("osd_metadata");
static const std::string OSD_SNAP_PREFIX("osd_snap");
/* OSD snapshot metadata
   ---------------------

   -- starting with mimic, removed in octopus --

   "removed_epoch_%llu_%08lx" % (pool, epoch)
   -> interval_set<snapid_t>

   "removed_snap_%llu_%016llx" % (pool, last_snap)
   -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)


   -- starting with mimic --

   "purged_snap_%llu_%016llx" % (pool, last_snap)
   -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)

   - note that the {removed,purged}_snap put the last snap in the key so
     that we can use forward iteration only to search for an epoch in an
     interval. e.g., to test if epoch N is removed/purged, we'll find a key
     >= N that either does or doesn't contain the given snap.


   -- starting with octopus --

   "purged_epoch_%08lx" % epoch
   -> map<int64_t,interval_set<snapid_t>>
*/
153 using namespace TOPNSPC::common
;
156 struct OSDMemCache
: public PriorityCache::PriCache
{
158 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
159 int64_t committed_bytes
= 0;
160 double cache_ratio
= 0;
162 OSDMemCache(OSDMonitor
*m
) : osdmon(m
) {};
164 virtual uint64_t _get_used_bytes() const = 0;
166 virtual int64_t request_cache_bytes(
167 PriorityCache::Priority pri
, uint64_t total_cache
) const {
168 int64_t assigned
= get_cache_bytes(pri
);
171 // All cache items are currently set to have PRI1 priority
172 case PriorityCache::Priority::PRI1
:
174 int64_t request
= _get_used_bytes();
175 return (request
> assigned
) ? request
- assigned
: 0;
183 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
184 return cache_bytes
[pri
];
187 virtual int64_t get_cache_bytes() const {
190 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
191 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
192 total
+= get_cache_bytes(pri
);
197 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
198 cache_bytes
[pri
] = bytes
;
200 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
201 cache_bytes
[pri
] += bytes
;
203 virtual int64_t commit_cache_size(uint64_t total_cache
) {
204 committed_bytes
= PriorityCache::get_chunk(
205 get_cache_bytes(), total_cache
);
206 return committed_bytes
;
208 virtual int64_t get_committed_size() const {
209 return committed_bytes
;
211 virtual double get_cache_ratio() const {
214 virtual void set_cache_ratio(double ratio
) {
217 virtual string
get_cache_name() const = 0;
220 struct IncCache
: public OSDMemCache
{
221 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
223 virtual uint64_t _get_used_bytes() const {
224 return osdmon
->inc_osd_cache
.get_bytes();
227 virtual string
get_cache_name() const {
228 return "OSDMap Inc Cache";
231 uint64_t _get_num_osdmaps() const {
232 return osdmon
->inc_osd_cache
.get_size();
236 struct FullCache
: public OSDMemCache
{
237 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
239 virtual uint64_t _get_used_bytes() const {
240 return osdmon
->full_osd_cache
.get_bytes();
243 virtual string
get_cache_name() const {
244 return "OSDMap Full Cache";
247 uint64_t _get_num_osdmaps() const {
248 return osdmon
->full_osd_cache
.get_size();
252 std::shared_ptr
<IncCache
> inc_cache
;
253 std::shared_ptr
<FullCache
> full_cache
;
255 const uint32_t MAX_POOL_APPLICATIONS
= 4;
256 const uint32_t MAX_POOL_APPLICATION_KEYS
= 64;
257 const uint32_t MAX_POOL_APPLICATION_LENGTH
= 128;
259 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
260 // Note: this doesn't include support for the application tag match
261 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
262 auto& match
= grant
.match
;
263 if (match
.is_match_all()) {
265 } else if (pool_name
!= nullptr &&
266 !match
.pool_namespace
.pool_name
.empty() &&
267 match
.pool_namespace
.pool_name
== *pool_name
) {
274 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
275 const KeyServer
& key_server
,
276 const EntityName
& entity_name
,
277 const MonCap
& mon_caps
,
278 const entity_addr_t
& peer_socket_addr
,
279 const std::string
* pool_name
)
281 typedef std::map
<std::string
, std::string
> CommandArgs
;
283 if (mon_caps
.is_capable(
284 cct
, entity_name
, "osd",
285 "osd pool op unmanaged-snap",
286 (pool_name
== nullptr ?
287 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
288 CommandArgs
{{"poolname", *pool_name
}}),
294 AuthCapsInfo caps_info
;
295 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
297 dout(10) << "unable to locate OSD cap data for " << entity_name
298 << " in auth db" << dendl
;
303 if (caps_info
.caps
.length() > 0) {
304 auto p
= caps_info
.caps
.cbegin();
307 } catch (const ceph::buffer::error
&err
) {
308 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
315 if (!osd_cap
.parse(caps_str
, nullptr)) {
316 dout(10) << "unable to parse OSD cap data for " << entity_name
317 << " in auth db" << dendl
;
321 // if the entity has write permissions in one or all pools, permit
322 // usage of unmanaged-snapshots
323 if (osd_cap
.allow_all()) {
327 for (auto& grant
: osd_cap
.grants
) {
328 if (grant
.profile
.is_valid()) {
329 for (auto& profile_grant
: grant
.profile_grants
) {
330 if (is_osd_writable(profile_grant
, pool_name
)) {
334 } else if (is_osd_writable(grant
, pool_name
)) {
342 } // anonymous namespace
344 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
346 if (epoch_by_pg
.size() <= ps
) {
347 epoch_by_pg
.resize(ps
+ 1, 0);
349 const auto old_lec
= epoch_by_pg
[ps
];
350 if (old_lec
>= last_epoch_clean
) {
354 epoch_by_pg
[ps
] = last_epoch_clean
;
355 if (last_epoch_clean
< floor
) {
356 floor
= last_epoch_clean
;
357 } else if (last_epoch_clean
> floor
) {
358 if (old_lec
== floor
) {
359 // probably should increase floor?
360 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
361 std::end(epoch_by_pg
));
365 if (ps
!= next_missing
) {
368 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
369 if (epoch_by_pg
[next_missing
] == 0) {
375 void LastEpochClean::remove_pool(uint64_t pool
)
377 report_by_pool
.erase(pool
);
380 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
382 auto& lec
= report_by_pool
[pg
.pool()];
383 return lec
.report(pg
.ps(), last_epoch_clean
);
386 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
388 auto floor
= latest
.get_epoch();
389 for (auto& pool
: latest
.get_pools()) {
390 auto reported
= report_by_pool
.find(pool
.first
);
391 if (reported
== report_by_pool
.end()) {
394 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
397 if (reported
->second
.floor
< floor
) {
398 floor
= reported
->second
.floor
;
404 void LastEpochClean::dump(Formatter
*f
) const
406 f
->open_array_section("per_pool");
408 for (auto& [pool
, lec
] : report_by_pool
) {
409 f
->open_object_section("pool");
410 f
->dump_unsigned("poolid", pool
);
411 f
->dump_unsigned("floor", lec
.floor
);
418 class C_UpdateCreatingPGs
: public Context
{
423 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
424 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
425 void finish(int r
) override
{
427 utime_t end
= ceph_clock_now();
428 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
429 << (end
- start
) << " seconds" << dendl
;
430 osdmon
->update_creating_pgs();
431 osdmon
->check_pg_creates_subs();
437 #define dout_prefix _prefix(_dout, mon, osdmap)
438 static ostream
& _prefix(std::ostream
*_dout
, Monitor
&mon
, const OSDMap
& osdmap
) {
439 return *_dout
<< "mon." << mon
.name
<< "@" << mon
.rank
440 << "(" << mon
.get_state_name()
441 << ").osd e" << osdmap
.get_epoch() << " ";
444 OSDMonitor::OSDMonitor(
448 const string
& service_name
)
449 : PaxosService(mn
, p
, service_name
),
451 inc_osd_cache(g_conf()->mon_osd_cache_size
),
452 full_osd_cache(g_conf()->mon_osd_cache_size
),
453 has_osdmap_manifest(false),
454 mapper(mn
.cct
, &mn
.cpu_tp
)
456 inc_cache
= std::make_shared
<IncCache
>(this);
457 full_cache
= std::make_shared
<FullCache
>(this);
458 cct
->_conf
.add_observer(this);
459 int r
= _set_cache_sizes();
461 derr
<< __func__
<< " using default osd cache size - mon_osd_cache_size ("
462 << g_conf()->mon_osd_cache_size
463 << ") without priority cache management"
468 const char **OSDMonitor::get_tracked_conf_keys() const
470 static const char* KEYS
[] = {
472 "mon_memory_autotune",
473 "rocksdb_cache_size",
479 void OSDMonitor::handle_conf_change(const ConfigProxy
& conf
,
480 const std::set
<std::string
> &changed
)
482 dout(10) << __func__
<< " " << changed
<< dendl
;
484 if (changed
.count("mon_memory_autotune")) {
485 _set_cache_autotuning();
487 if (changed
.count("mon_memory_target") ||
488 changed
.count("rocksdb_cache_size")) {
489 int r
= _update_mon_cache_settings();
491 derr
<< __func__
<< " mon_memory_target:"
492 << g_conf()->mon_memory_target
493 << " rocksdb_cache_size:"
494 << g_conf()->rocksdb_cache_size
495 << ". Unable to update cache size."
501 void OSDMonitor::_set_cache_autotuning()
503 if (!g_conf()->mon_memory_autotune
&& pcm
!= nullptr) {
504 // Disable cache autotuning
505 std::lock_guard
l(balancer_lock
);
509 if (g_conf()->mon_memory_autotune
&& pcm
== nullptr) {
510 int r
= register_cache_with_pcm();
513 << " Error while registering osdmon caches with pcm."
514 << " Cache auto tuning not enabled."
516 mon_memory_autotune
= false;
518 mon_memory_autotune
= true;
523 int OSDMonitor::_update_mon_cache_settings()
525 if (g_conf()->mon_memory_target
<= 0 ||
526 g_conf()->mon_memory_target
< mon_memory_min
||
527 g_conf()->rocksdb_cache_size
<= 0) {
531 if (pcm
== nullptr && rocksdb_binned_kv_cache
== nullptr) {
532 derr
<< __func__
<< " not using pcm and rocksdb" << dendl
;
536 uint64_t old_mon_memory_target
= mon_memory_target
;
537 uint64_t old_rocksdb_cache_size
= rocksdb_cache_size
;
539 // Set the new pcm memory cache sizes
540 mon_memory_target
= g_conf()->mon_memory_target
;
541 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
543 uint64_t base
= mon_memory_base
;
544 double fragmentation
= mon_memory_fragmentation
;
545 uint64_t target
= mon_memory_target
;
546 uint64_t min
= mon_memory_min
;
549 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
550 if (ltarget
> base
+ min
) {
551 max
= ltarget
- base
;
554 int r
= _set_cache_ratios();
556 derr
<< __func__
<< " Cache ratios for pcm could not be set."
557 << " Review the kv (rocksdb) and mon_memory_target sizes."
559 mon_memory_target
= old_mon_memory_target
;
560 rocksdb_cache_size
= old_rocksdb_cache_size
;
564 if (mon_memory_autotune
&& pcm
!= nullptr) {
565 std::lock_guard
l(balancer_lock
);
566 // set pcm cache levels
567 pcm
->set_target_memory(target
);
568 pcm
->set_min_memory(min
);
569 pcm
->set_max_memory(max
);
570 // tune memory based on new values
573 _set_new_cache_sizes();
574 dout(1) << __func__
<< " Updated mon cache setting."
575 << " target: " << target
583 int OSDMonitor::_set_cache_sizes()
585 if (g_conf()->mon_memory_autotune
) {
586 // set the new osdmon cache targets to be managed by pcm
587 mon_osd_cache_size
= g_conf()->mon_osd_cache_size
;
588 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
589 mon_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
590 mon_memory_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
591 mon_memory_target
= g_conf()->mon_memory_target
;
592 mon_memory_min
= g_conf()->mon_osd_cache_size_min
;
593 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
594 derr
<< __func__
<< " mon_memory_target:" << mon_memory_target
595 << " mon_memory_min:" << mon_memory_min
596 << ". Invalid size option(s) provided."
600 // Set the initial inc and full LRU cache sizes
601 inc_osd_cache
.set_bytes(mon_memory_min
);
602 full_osd_cache
.set_bytes(mon_memory_min
);
603 mon_memory_autotune
= g_conf()->mon_memory_autotune
;
608 bool OSDMonitor::_have_pending_crush()
610 return pending_inc
.crush
.length() > 0;
613 CrushWrapper
&OSDMonitor::_get_stable_crush()
615 return *osdmap
.crush
;
618 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
621 if (pending_inc
.crush
.length())
622 bl
= pending_inc
.crush
;
624 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
626 auto p
= bl
.cbegin();
630 void OSDMonitor::create_initial()
632 dout(10) << "create_initial for " << mon
.monmap
->fsid
<< dendl
;
637 mon
.store
->get("mkfs", "osdmap", bl
);
641 newmap
.set_fsid(mon
.monmap
->fsid
);
643 newmap
.build_simple(cct
, 0, mon
.monmap
->fsid
, 0);
646 newmap
.created
= newmap
.modified
= ceph_clock_now();
648 // new clusters should sort bitwise by default.
649 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
652 CEPH_OSDMAP_RECOVERY_DELETES
|
653 CEPH_OSDMAP_PURGED_SNAPDIRS
|
654 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
655 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
656 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
657 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
658 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
659 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
660 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
662 // new cluster should require latest by default
663 if (g_conf().get_val
<bool>("mon_debug_no_require_pacific")) {
664 if (g_conf().get_val
<bool>("mon_debug_no_require_octopus")) {
665 derr
<< __func__
<< " mon_debug_no_require_pacific and octopus=true" << dendl
;
666 newmap
.require_osd_release
= ceph_release_t::nautilus
;
668 derr
<< __func__
<< " mon_debug_no_require_pacific=true" << dendl
;
669 newmap
.require_osd_release
= ceph_release_t::octopus
;
672 newmap
.require_osd_release
= ceph_release_t::pacific
;
675 if (newmap
.require_osd_release
>= ceph_release_t::octopus
) {
676 ceph_release_t r
= ceph_release_from_name(
677 g_conf()->mon_osd_initial_require_min_compat_client
);
679 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
681 newmap
.require_min_compat_client
= r
;
684 // encode into pending incremental
685 uint64_t features
= newmap
.get_encoding_features();
686 newmap
.encode(pending_inc
.fullmap
,
687 features
| CEPH_FEATURE_RESERVED
);
688 pending_inc
.full_crc
= newmap
.get_crc();
689 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
692 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
694 s
.insert(service_name
);
695 s
.insert(OSD_PG_CREATING_PREFIX
);
696 s
.insert(OSD_METADATA_PREFIX
);
697 s
.insert(OSD_SNAP_PREFIX
);
700 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
702 // we really don't care if the version has been updated, because we may
703 // have trimmed without having increased the last committed; yet, we may
704 // need to update the in-memory manifest.
705 load_osdmap_manifest();
707 version_t version
= get_last_committed();
708 if (version
== osdmap
.epoch
)
710 ceph_assert(version
> osdmap
.epoch
);
712 dout(15) << "update_from_paxos paxos e " << version
713 << ", my e " << osdmap
.epoch
<< dendl
;
715 int prev_num_up_osd
= osdmap
.num_up_osd
;
718 if (!mapping_job
->is_done()) {
719 dout(1) << __func__
<< " mapping job "
720 << mapping_job
.get() << " did not complete, "
721 << mapping_job
->shards
<< " left, canceling" << dendl
;
722 mapping_job
->abort();
730 * We will possibly have a stashed latest that *we* wrote, and we will
731 * always be sure to have the oldest full map in the first..last range
732 * due to encode_trim_extra(), which includes the oldest full map in the trim
735 * encode_trim_extra() does not however write the full map's
736 * version to 'full_latest'. This is only done when we are building the
737 * full maps from the incremental versions. But don't panic! We make sure
738 * that the following conditions find whichever full map version is newer.
740 version_t latest_full
= get_version_latest_full();
741 if (latest_full
== 0 && get_first_committed() > 1)
742 latest_full
= get_first_committed();
744 if (get_first_committed() > 1 &&
745 latest_full
< get_first_committed()) {
746 // the monitor could be just sync'ed with its peer, and the latest_full key
747 // is not encoded in the paxos commits in encode_pending(), so we need to
748 // make sure we get it pointing to a proper version.
749 version_t lc
= get_last_committed();
750 version_t fc
= get_first_committed();
752 dout(10) << __func__
<< " looking for valid full map in interval"
753 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
756 for (version_t v
= lc
; v
>= fc
; v
--) {
757 string full_key
= "full_" + stringify(v
);
758 if (mon
.store
->exists(get_service_name(), full_key
)) {
759 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
765 ceph_assert(latest_full
> 0);
766 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
767 put_version_latest_full(t
, latest_full
);
768 mon
.store
->apply_transaction(t
);
769 dout(10) << __func__
<< " updated the on-disk full map version to "
770 << latest_full
<< dendl
;
773 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
774 bufferlist latest_bl
;
775 get_version_full(latest_full
, latest_bl
);
776 ceph_assert(latest_bl
.length() != 0);
777 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
779 osdmap
.decode(latest_bl
);
783 if (!mon
.store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
784 auto p
= bl
.cbegin();
785 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
786 creating_pgs
.decode(p
);
787 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
788 << creating_pgs
.last_scan_epoch
789 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
791 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
795 // walk through incrementals
796 MonitorDBStore::TransactionRef t
;
798 while (version
> osdmap
.epoch
) {
800 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
801 ceph_assert(err
== 0);
802 ceph_assert(inc_bl
.length());
803 // set priority cache manager levels if the osdmap is
804 // being populated for the first time.
805 if (mon_memory_autotune
&& pcm
== nullptr) {
806 int r
= register_cache_with_pcm();
809 << " Error while registering osdmon caches with pcm."
810 << " Proceeding without cache auto tuning."
815 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
817 OSDMap::Incremental
inc(inc_bl
);
818 err
= osdmap
.apply_incremental(inc
);
819 ceph_assert(err
== 0);
822 t
.reset(new MonitorDBStore::Transaction
);
824 // Write out the full map for all past epochs. Encode the full
825 // map with the same features as the incremental. If we don't
826 // know, use the quorum features. If we don't know those either,
827 // encode with all features.
828 uint64_t f
= inc
.encode_features
;
830 f
= mon
.get_quorum_con_features();
834 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
835 tx_size
+= full_bl
.length();
837 bufferlist orig_full_bl
;
838 get_version_full(osdmap
.epoch
, orig_full_bl
);
839 if (orig_full_bl
.length()) {
840 // the primary provided the full map
841 ceph_assert(inc
.have_crc
);
842 if (inc
.full_crc
!= osdmap
.crc
) {
843 // This will happen if the mons were running mixed versions in
844 // the past or some other circumstance made the full encoded
845 // maps divergent. Reloading here will bring us back into
846 // sync with the primary for this and all future maps. OSDs
847 // will also be brought back into sync when they discover the
848 // crc mismatch and request a full map from a mon.
849 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
852 dout(20) << __func__
<< " my (bad) full osdmap:\n";
853 JSONFormatter
jf(true);
854 jf
.dump_object("osdmap", osdmap
);
856 *_dout
<< "\nhexdump:\n";
857 full_bl
.hexdump(*_dout
);
861 osdmap
.decode(orig_full_bl
);
863 dout(20) << __func__
<< " canonical full osdmap:\n";
864 JSONFormatter
jf(true);
865 jf
.dump_object("osdmap", osdmap
);
867 *_dout
<< "\nhexdump:\n";
868 orig_full_bl
.hexdump(*_dout
);
872 ceph_assert(!inc
.have_crc
);
873 put_version_full(t
, osdmap
.epoch
, full_bl
);
875 put_version_latest_full(t
, osdmap
.epoch
);
878 dout(1) << osdmap
<< dendl
;
880 if (osdmap
.epoch
== 1) {
881 t
->erase("mkfs", "osdmap");
884 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
885 mon
.store
->apply_transaction(t
);
886 t
= MonitorDBStore::TransactionRef();
889 for (const auto [osd
, state
] : inc
.new_state
) {
890 if (state
& CEPH_OSD_UP
) {
891 // could be marked up *or* down, but we're too lazy to check which
892 last_osd_report
.erase(osd
);
894 if (state
& CEPH_OSD_OUT
) {
895 // could be marked in *or* out, but we can safely drop it
896 osd_epochs
.erase(osd
);
899 for (const auto [osd
, weight
] : inc
.new_weight
) {
900 if (weight
== CEPH_OSD_OUT
) {
901 // manually marked out, so drop it
902 osd_epochs
.erase(osd
);
908 mon
.store
->apply_transaction(t
);
911 bool marked_osd_down
= false;
912 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
913 if (osdmap
.is_out(o
))
915 auto found
= down_pending_out
.find(o
);
916 if (osdmap
.is_down(o
)) {
917 // populate down -> out map
918 if (found
== down_pending_out
.end()) {
919 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
920 down_pending_out
[o
] = ceph_clock_now();
921 marked_osd_down
= true;
924 if (found
!= down_pending_out
.end()) {
925 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
926 down_pending_out
.erase(found
);
930 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
933 check_pg_creates_subs();
935 share_map_with_random_osd();
939 // make sure our feature bits reflect the latest map
940 update_msgr_features();
942 if (!mon
.is_leader()) {
943 // will be called by on_active() on the leader, avoid doing so twice
946 if (osdmap
.stretch_mode_enabled
) {
947 dout(20) << "Stretch mode enabled in this map" << dendl
;
948 mon
.maybe_engage_stretch_mode();
949 if (osdmap
.degraded_stretch_mode
) {
950 dout(20) << "Degraded stretch mode set in this map" << dendl
;
951 if (!osdmap
.recovering_stretch_mode
) {
952 mon
.set_degraded_stretch_mode();
953 if (prev_num_up_osd
< osdmap
.num_up_osd
&&
954 (osdmap
.num_up_osd
/ (double)osdmap
.num_osd
) >
955 cct
->_conf
.get_val
<double>("mon_stretch_cluster_recovery_ratio")) {
956 // TODO: This works for 2-site clusters when the OSD maps are appropriately
957 // trimmed and everything is "normal" but not if you have a lot of out OSDs
958 // you're ignoring or in some really degenerate failure cases
959 dout(10) << "Enabling recovery stretch mode in this map" << dendl
;
960 mon
.go_recovery_stretch_mode();
964 if (marked_osd_down
&&
965 (!osdmap
.degraded_stretch_mode
|| osdmap
.recovering_stretch_mode
)) {
966 dout(20) << "Checking degraded stretch mode due to osd changes" << dendl
;
967 mon
.maybe_go_degraded_stretch_mode();
969 if (osdmap
.recovering_stretch_mode
&& stretch_recovery_triggered
.is_zero()) {
970 stretch_recovery_triggered
= ceph_clock_now();
975 int OSDMonitor::register_cache_with_pcm()
977 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
978 derr
<< __func__
<< " Invalid memory size specified for mon caches."
979 << " Caches will not be auto-tuned."
983 uint64_t base
= mon_memory_base
;
984 double fragmentation
= mon_memory_fragmentation
;
985 // For calculating total target memory, consider rocksdb cache size.
986 uint64_t target
= mon_memory_target
;
987 uint64_t min
= mon_memory_min
;
990 // Apply the same logic as in bluestore to set the max amount
991 // of memory to use for cache. Assume base memory for OSDMaps
992 // and then add in some overhead for fragmentation.
993 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
994 if (ltarget
> base
+ min
) {
995 max
= ltarget
- base
;
998 rocksdb_binned_kv_cache
= mon
.store
->get_priority_cache();
999 if (!rocksdb_binned_kv_cache
) {
1000 derr
<< __func__
<< " not using rocksdb" << dendl
;
1004 int r
= _set_cache_ratios();
1006 derr
<< __func__
<< " Cache ratios for pcm could not be set."
1007 << " Review the kv (rocksdb) and mon_memory_target sizes."
1012 pcm
= std::make_shared
<PriorityCache::Manager
>(
1013 cct
, min
, max
, target
, true);
1014 pcm
->insert("kv", rocksdb_binned_kv_cache
, true);
1015 pcm
->insert("inc", inc_cache
, true);
1016 pcm
->insert("full", full_cache
, true);
1017 dout(1) << __func__
<< " pcm target: " << target
1018 << " pcm max: " << max
1019 << " pcm min: " << min
1020 << " inc_osd_cache size: " << inc_osd_cache
.get_size()
1025 int OSDMonitor::_set_cache_ratios()
1027 double old_cache_kv_ratio
= cache_kv_ratio
;
1029 // Set the cache ratios for kv(rocksdb), inc and full caches
1030 cache_kv_ratio
= (double)rocksdb_cache_size
/ (double)mon_memory_target
;
1031 if (cache_kv_ratio
>= 1.0) {
1032 derr
<< __func__
<< " Cache kv ratio (" << cache_kv_ratio
1033 << ") must be in range [0,<1.0]."
1035 cache_kv_ratio
= old_cache_kv_ratio
;
1038 rocksdb_binned_kv_cache
->set_cache_ratio(cache_kv_ratio
);
1039 cache_inc_ratio
= cache_full_ratio
= (1.0 - cache_kv_ratio
) / 2;
1040 inc_cache
->set_cache_ratio(cache_inc_ratio
);
1041 full_cache
->set_cache_ratio(cache_full_ratio
);
1043 dout(1) << __func__
<< " kv ratio " << cache_kv_ratio
1044 << " inc ratio " << cache_inc_ratio
1045 << " full ratio " << cache_full_ratio
1050 void OSDMonitor::start_mapping()
1052 // initiate mapping job
1054 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1056 mapping_job
->abort();
1058 if (!osdmap
.get_pools().empty()) {
1059 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
1060 mapping_job
= mapping
.start_update(osdmap
, mapper
,
1061 g_conf()->mon_osd_mapping_pgs_per_chunk
);
1062 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
1063 << " at " << fin
->start
<< dendl
;
1064 mapping_job
->set_finish_event(fin
);
1066 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
1067 mapping_job
= nullptr;
1071 void OSDMonitor::update_msgr_features()
1073 const int types
[] = {
1074 entity_name_t::TYPE_OSD
,
1075 entity_name_t::TYPE_CLIENT
,
1076 entity_name_t::TYPE_MDS
,
1077 entity_name_t::TYPE_MON
1079 for (int type
: types
) {
1081 uint64_t features
= osdmap
.get_features(type
, &mask
);
1082 if ((mon
.messenger
->get_policy(type
).features_required
& mask
) != features
) {
1083 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1084 ceph::net::Policy p
= mon
.messenger
->get_policy(type
);
1085 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1086 mon
.messenger
->set_policy(type
, p
);
1091 void OSDMonitor::on_active()
1095 if (mon
.is_leader()) {
1096 mon
.clog
->debug() << "osdmap " << osdmap
;
1097 if (!priority_convert
) {
1098 // Only do this once at start-up
1099 convert_pool_priorities();
1100 priority_convert
= true;
1103 list
<MonOpRequestRef
> ls
;
1104 take_all_failures(ls
);
1105 while (!ls
.empty()) {
1106 MonOpRequestRef op
= ls
.front();
1107 op
->mark_osdmon_event(__func__
);
1115 void OSDMonitor::on_restart()
1117 last_osd_report
.clear();
1120 void OSDMonitor::on_shutdown()
1122 dout(10) << __func__
<< dendl
;
1124 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1126 mapping_job
->abort();
1129 // discard failure info, waiters
1130 list
<MonOpRequestRef
> ls
;
1131 take_all_failures(ls
);
1135 void OSDMonitor::update_logger()
1137 dout(10) << "update_logger" << dendl
;
1139 mon
.cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1140 mon
.cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1141 mon
.cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1142 mon
.cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
1145 void OSDMonitor::create_pending()
1147 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
1148 pending_inc
.fsid
= mon
.monmap
->fsid
;
1149 pending_metadata
.clear();
1150 pending_metadata_rm
.clear();
1151 pending_pseudo_purged_snaps
.clear();
1153 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
1155 // safety checks (this shouldn't really happen)
1157 if (osdmap
.backfillfull_ratio
<= 0) {
1158 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
1159 if (pending_inc
.new_backfillfull_ratio
> 1.0)
1160 pending_inc
.new_backfillfull_ratio
/= 100;
1161 dout(1) << __func__
<< " setting backfillfull_ratio = "
1162 << pending_inc
.new_backfillfull_ratio
<< dendl
;
1164 if (osdmap
.full_ratio
<= 0) {
1165 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
1166 if (pending_inc
.new_full_ratio
> 1.0)
1167 pending_inc
.new_full_ratio
/= 100;
1168 dout(1) << __func__
<< " setting full_ratio = "
1169 << pending_inc
.new_full_ratio
<< dendl
;
1171 if (osdmap
.nearfull_ratio
<= 0) {
1172 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
1173 if (pending_inc
.new_nearfull_ratio
> 1.0)
1174 pending_inc
.new_nearfull_ratio
/= 100;
1175 dout(1) << __func__
<< " setting nearfull_ratio = "
1176 << pending_inc
.new_nearfull_ratio
<< dendl
;
1180 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1182 if (osdmap
.crush
->has_legacy_rule_ids()) {
1183 CrushWrapper newcrush
;
1184 _get_pending_crush(newcrush
);
1186 // First, for all pools, work out which rule they really used
1187 // by resolving ruleset to rule.
1188 for (const auto &i
: osdmap
.get_pools()) {
1189 const auto pool_id
= i
.first
;
1190 const auto &pool
= i
.second
;
1191 int new_rule_id
= newcrush
.find_rule(pool
.crush_rule
,
1192 pool
.type
, pool
.size
);
1194 dout(1) << __func__
<< " rewriting pool "
1195 << osdmap
.get_pool_name(pool_id
) << " crush ruleset "
1196 << pool
.crush_rule
<< " -> rule id " << new_rule_id
<< dendl
;
1197 if (pending_inc
.new_pools
.count(pool_id
) == 0) {
1198 pending_inc
.new_pools
[pool_id
] = pool
;
1200 pending_inc
.new_pools
[pool_id
].crush_rule
= new_rule_id
;
1203 // Now, go ahead and renumber all the rules so that their
1204 // rule_id field corresponds to their position in the array
1205 auto old_to_new
= newcrush
.renumber_rules();
1206 dout(1) << __func__
<< " Rewrote " << old_to_new
<< " crush IDs:" << dendl
;
1207 for (const auto &i
: old_to_new
) {
1208 dout(1) << __func__
<< " " << i
.first
<< " -> " << i
.second
<< dendl
;
1210 pending_inc
.crush
.clear();
1211 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
1216 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1217 const OSDMap
& nextmap
)
1219 dout(10) << __func__
<< dendl
;
1220 creating_pgs_t pending_creatings
;
1222 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1223 pending_creatings
= creating_pgs
;
1225 // check for new or old pools
1226 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1227 unsigned queued
= 0;
1228 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1231 &pending_creatings
);
1232 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1235 &pending_creatings
);
1236 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1237 for (auto deleted_pool
: inc
.old_pools
) {
1238 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1239 dout(10) << __func__
<< " " << removed
1240 << " pg removed because containing pool deleted: "
1241 << deleted_pool
<< dendl
;
1242 last_epoch_clean
.remove_pool(deleted_pool
);
1244 // pgmon updates its creating_pgs in check_osd_map() which is called by
1245 // on_active() and check_osd_map() could be delayed if lease expires, so its
1246 // creating_pgs could be stale in comparison with the one of osdmon. let's
1247 // trim them here. otherwise, they will be added back after being erased.
1248 unsigned removed
= 0;
1249 for (auto& pg
: pending_created_pgs
) {
1250 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1251 pending_creatings
.created_pools
.insert(pg
.pool());
1252 removed
+= pending_creatings
.pgs
.erase(pg
);
1254 pending_created_pgs
.clear();
1255 dout(10) << __func__
<< " " << removed
1256 << " pgs removed because they're created" << dendl
;
1257 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1260 // filter out any pgs that shouldn't exist.
1262 auto i
= pending_creatings
.pgs
.begin();
1263 while (i
!= pending_creatings
.pgs
.end()) {
1264 if (!nextmap
.pg_exists(i
->first
)) {
1265 dout(10) << __func__
<< " removing pg " << i
->first
1266 << " which should not exist" << dendl
;
1267 i
= pending_creatings
.pgs
.erase(i
);
1275 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1276 const auto total
= pending_creatings
.pgs
.size();
1277 while (pending_creatings
.pgs
.size() < max
&&
1278 !pending_creatings
.queue
.empty()) {
1279 auto p
= pending_creatings
.queue
.begin();
1280 int64_t poolid
= p
->first
;
1281 dout(10) << __func__
<< " pool " << poolid
1282 << " created " << p
->second
.created
1283 << " modified " << p
->second
.modified
1284 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1286 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1287 p
->second
.end
- p
->second
.start
);
1288 ps_t first
= p
->second
.start
;
1289 ps_t end
= first
+ n
;
1290 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1291 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1292 // NOTE: use the *current* epoch as the PG creation epoch so that the
1293 // OSD does not have to generate a long set of PastIntervals.
1294 pending_creatings
.pgs
.emplace(
1296 creating_pgs_t::pg_create_info(inc
.epoch
,
1297 p
->second
.modified
));
1298 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1300 p
->second
.start
= end
;
1301 if (p
->second
.done()) {
1302 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1303 pending_creatings
.queue
.erase(p
);
1305 dout(10) << __func__
<< " pool " << poolid
1306 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1310 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1311 << " pools" << dendl
;
1313 if (mon
.monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1314 // walk creating pgs' history and past_intervals forward
1315 for (auto& i
: pending_creatings
.pgs
) {
1316 // this mirrors PG::start_peering_interval()
1317 pg_t pgid
= i
.first
;
1319 // this is a bit imprecise, but sufficient?
1320 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1321 const pg_pool_t
*pi
;
1322 bool operator()(const set
<pg_shard_t
> &have
) const {
1323 return have
.size() >= pi
->min_size
;
1325 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1326 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1328 vector
<int> up
, acting
;
1329 int up_primary
, acting_primary
;
1330 nextmap
.pg_to_up_acting_osds(
1331 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1332 if (i
.second
.history
.epoch_created
== 0) {
1333 // new pg entry, set it up
1335 i
.second
.acting
= acting
;
1336 i
.second
.up_primary
= up_primary
;
1337 i
.second
.acting_primary
= acting_primary
;
1338 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1339 i
.second
.create_stamp
);
1340 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1341 << " up " << i
.second
.up
1342 << " p " << i
.second
.up_primary
1343 << " acting " << i
.second
.acting
1344 << " p " << i
.second
.acting_primary
1345 << " history " << i
.second
.history
1346 << " past_intervals " << i
.second
.past_intervals
1349 std::stringstream debug
;
1350 if (PastIntervals::check_new_interval(
1351 i
.second
.acting_primary
, acting_primary
,
1352 i
.second
.acting
, acting
,
1353 i
.second
.up_primary
, up_primary
,
1355 i
.second
.history
.same_interval_since
,
1356 i
.second
.history
.last_epoch_clean
,
1361 &i
.second
.past_intervals
,
1363 epoch_t e
= inc
.epoch
;
1364 i
.second
.history
.same_interval_since
= e
;
1365 if (i
.second
.up
!= up
) {
1366 i
.second
.history
.same_up_since
= e
;
1368 if (i
.second
.acting_primary
!= acting_primary
) {
1369 i
.second
.history
.same_primary_since
= e
;
1372 osdmap
.get_pg_num(pgid
.pool()),
1373 nextmap
.get_pg_num(pgid
.pool()),
1375 i
.second
.history
.last_epoch_split
= e
;
1377 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1378 << " up " << i
.second
.up
<< " -> " << up
1379 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1380 << " acting " << i
.second
.acting
<< " -> " << acting
1381 << " p " << i
.second
.acting_primary
<< " -> "
1383 << " history " << i
.second
.history
1384 << " past_intervals " << i
.second
.past_intervals
1386 dout(20) << " debug: " << debug
.str() << dendl
;
1388 i
.second
.acting
= acting
;
1389 i
.second
.up_primary
= up_primary
;
1390 i
.second
.acting_primary
= acting_primary
;
1395 dout(10) << __func__
1396 << " " << (pending_creatings
.pgs
.size() - total
)
1397 << "/" << pending_creatings
.pgs
.size()
1398 << " pgs added from queued pools" << dendl
;
1399 return pending_creatings
;
1402 void OSDMonitor::maybe_prime_pg_temp()
1405 if (pending_inc
.crush
.length()) {
1406 dout(10) << __func__
<< " new crush map, all" << dendl
;
1410 if (!pending_inc
.new_up_client
.empty()) {
1411 dout(10) << __func__
<< " new up osds, all" << dendl
;
1415 // check for interesting OSDs
1417 for (auto p
= pending_inc
.new_state
.begin();
1418 !all
&& p
!= pending_inc
.new_state
.end();
1420 if ((p
->second
& CEPH_OSD_UP
) &&
1421 osdmap
.is_up(p
->first
)) {
1422 osds
.insert(p
->first
);
1425 for (auto p
= pending_inc
.new_weight
.begin();
1426 !all
&& p
!= pending_inc
.new_weight
.end();
1428 if (osdmap
.exists(p
->first
) && p
->second
< osdmap
.get_weight(p
->first
)) {
1430 osds
.insert(p
->first
);
1432 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1438 if (!all
&& osds
.empty())
1443 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1444 if (estimate
> mapping
.get_num_pgs() *
1445 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1446 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1447 << osds
.size() << " osds >= "
1448 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1449 << mapping
.get_num_pgs() << " pgs, all"
1453 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1454 << osds
.size() << " osds" << dendl
;
1459 next
.deepish_copy_from(osdmap
);
1460 next
.apply_incremental(pending_inc
);
1462 if (next
.get_pools().empty()) {
1463 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1465 PrimeTempJob
job(next
, this);
1466 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1467 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1468 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1470 dout(10) << __func__
<< " did not finish in "
1471 << g_conf()->mon_osd_prime_pg_temp_max_time
1472 << ", stopping" << dendl
;
1476 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1477 utime_t stop
= ceph_clock_now();
1478 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1479 const int chunk
= 1000;
1481 std::unordered_set
<pg_t
> did_pgs
;
1482 for (auto osd
: osds
) {
1483 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1484 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1485 for (auto pgid
: pgs
) {
1486 if (!did_pgs
.insert(pgid
).second
) {
1489 prime_pg_temp(next
, pgid
);
1492 if (ceph_clock_now() > stop
) {
1493 dout(10) << __func__
<< " consumed more than "
1494 << g_conf()->mon_osd_prime_pg_temp_max_time
1495 << " seconds, stopping"
1505 void OSDMonitor::prime_pg_temp(
1509 // TODO: remove this creating_pgs direct access?
1510 if (creating_pgs
.pgs
.count(pgid
)) {
1513 if (!osdmap
.pg_exists(pgid
)) {
1517 vector
<int> up
, acting
;
1518 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1520 vector
<int> next_up
, next_acting
;
1521 int next_up_primary
, next_acting_primary
;
1522 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1523 &next_acting
, &next_acting_primary
);
1524 if (acting
== next_acting
&&
1525 !(up
!= acting
&& next_up
== next_acting
))
1526 return; // no change since last epoch
1529 return; // if previously empty now we can be no worse off
1530 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1531 if (pool
&& acting
.size() < pool
->min_size
)
1532 return; // can be no worse off than before
1534 if (next_up
== next_acting
) {
1536 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1540 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1541 << " -> " << next_up
<< "/" << next_acting
1542 << ", priming " << acting
1545 std::lock_guard
l(prime_pg_temp_lock
);
1546 // do not touch a mapping if a change is pending
1547 pending_inc
.new_pg_temp
.emplace(
1549 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1554 * @note receiving a transaction in this function gives a fair amount of
1555 * freedom to the service implementation if it does need it. It shouldn't.
1557 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1559 dout(10) << "encode_pending e " << pending_inc
.epoch
1563 dout(1) << __func__
<< " osdmap full prune encoded e"
1564 << pending_inc
.epoch
<< dendl
;
1567 // finalize up pending_inc
1568 pending_inc
.modified
= ceph_clock_now();
1570 int r
= pending_inc
.propagate_base_properties_to_tiers(cct
, osdmap
);
1571 ceph_assert(r
== 0);
1574 if (!mapping_job
->is_done()) {
1575 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1576 << mapping_job
.get() << " did not complete, "
1577 << mapping_job
->shards
<< " left" << dendl
;
1578 mapping_job
->abort();
1579 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1580 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1581 << mapping_job
.get() << " is prior epoch "
1582 << mapping
.get_epoch() << dendl
;
1584 if (g_conf()->mon_osd_prime_pg_temp
) {
1585 maybe_prime_pg_temp();
1588 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1589 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1592 mapping_job
.reset();
1594 // ensure we don't have blank new_state updates. these are interrpeted as
1595 // CEPH_OSD_UP (and almost certainly not what we want!).
1596 auto p
= pending_inc
.new_state
.begin();
1597 while (p
!= pending_inc
.new_state
.end()) {
1598 if (p
->second
== 0) {
1599 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1600 p
= pending_inc
.new_state
.erase(p
);
1602 if (p
->second
& CEPH_OSD_UP
) {
1603 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1608 if (!pending_inc
.new_up_client
.empty()) {
1609 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1611 for (auto& i
: pending_inc
.new_weight
) {
1612 if (i
.first
>= osdmap
.max_osd
) {
1614 // new osd is already marked in
1615 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1618 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1619 // existing osd marked in or out
1620 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1627 tmp
.deepish_copy_from(osdmap
);
1628 tmp
.apply_incremental(pending_inc
);
1630 // clean pg_temp mappings
1631 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1633 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1635 // check every upmapped pg for now
1636 // until we could reliably identify certain cases to ignore,
1637 // which is obviously the hard part TBD..
1638 vector
<pg_t
> pgs_to_check
;
1639 tmp
.get_upmap_pgs(&pgs_to_check
);
1640 if (pgs_to_check
.size() <
1641 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1642 // not enough pgs, do it inline
1643 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1645 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1646 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1651 // update creating pgs first so that we can remove the created pgid and
1652 // process the pool flag removal below in the same osdmap epoch.
1653 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1654 bufferlist creatings_bl
;
1655 uint64_t features
= CEPH_FEATURES_ALL
;
1656 if (mon
.monmap
->min_mon_release
< ceph_release_t::octopus
) {
1657 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1659 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1661 encode(pending_creatings
, creatings_bl
, features
);
1662 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1664 // remove any old (or incompat) POOL_CREATING flags
1665 for (auto& i
: tmp
.get_pools()) {
1666 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1667 // pre-nautilus OSDMaps shouldn't get this flag.
1668 if (pending_inc
.new_pools
.count(i
.first
)) {
1669 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1672 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1673 !pending_creatings
.still_creating_pool(i
.first
)) {
1674 dout(10) << __func__
<< " done creating pool " << i
.first
1675 << ", clearing CREATING flag" << dendl
;
1676 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1677 pending_inc
.new_pools
[i
.first
] = i
.second
;
1679 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1683 // collect which pools are currently affected by
1684 // the near/backfill/full osd(s),
1685 // and set per-pool near/backfill/full flag instead
1686 set
<int64_t> full_pool_ids
;
1687 set
<int64_t> backfillfull_pool_ids
;
1688 set
<int64_t> nearfull_pool_ids
;
1689 tmp
.get_full_pools(cct
,
1691 &backfillfull_pool_ids
,
1692 &nearfull_pool_ids
);
1693 if (full_pool_ids
.empty() ||
1694 backfillfull_pool_ids
.empty() ||
1695 nearfull_pool_ids
.empty()) {
1696 // normal case - no nearfull, backfillfull or full osds
1697 // try cancel any improper nearfull/backfillfull/full pool
1699 for (auto &pool
: tmp
.get_pools()) {
1700 auto p
= pool
.first
;
1701 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1702 nearfull_pool_ids
.empty()) {
1703 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1704 << "'s nearfull flag" << dendl
;
1705 if (pending_inc
.new_pools
.count(p
) == 0) {
1706 // load original pool info first!
1707 pending_inc
.new_pools
[p
] = pool
.second
;
1709 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1711 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1712 backfillfull_pool_ids
.empty()) {
1713 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1714 << "'s backfillfull flag" << dendl
;
1715 if (pending_inc
.new_pools
.count(p
) == 0) {
1716 pending_inc
.new_pools
[p
] = pool
.second
;
1718 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1720 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1721 full_pool_ids
.empty()) {
1722 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1723 // set by EQUOTA, skipping
1726 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1727 << "'s full flag" << dendl
;
1728 if (pending_inc
.new_pools
.count(p
) == 0) {
1729 pending_inc
.new_pools
[p
] = pool
.second
;
1731 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1735 if (!full_pool_ids
.empty()) {
1736 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1737 << " as full" << dendl
;
1738 for (auto &p
: full_pool_ids
) {
1739 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1742 if (pending_inc
.new_pools
.count(p
) == 0) {
1743 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1745 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1746 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1747 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1749 // cancel FLAG_FULL for pools which are no longer full too
1750 for (auto &pool
: tmp
.get_pools()) {
1751 auto p
= pool
.first
;
1752 if (full_pool_ids
.count(p
)) {
1753 // skip pools we have just marked as full above
1756 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1757 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1758 // don't touch if currently is not full
1759 // or is running out of quota (and hence considered as full)
1762 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1763 << "'s full flag" << dendl
;
1764 if (pending_inc
.new_pools
.count(p
) == 0) {
1765 pending_inc
.new_pools
[p
] = pool
.second
;
1767 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1770 if (!backfillfull_pool_ids
.empty()) {
1771 for (auto &p
: backfillfull_pool_ids
) {
1772 if (full_pool_ids
.count(p
)) {
1773 // skip pools we have already considered as full above
1776 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1777 // make sure FLAG_FULL is truly set, so we are safe not
1778 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1779 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1782 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1783 // don't bother if pool is already marked as backfillfull
1786 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1787 << "'s as backfillfull" << dendl
;
1788 if (pending_inc
.new_pools
.count(p
) == 0) {
1789 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1791 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1792 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1794 // cancel FLAG_BACKFILLFULL for pools
1795 // which are no longer backfillfull too
1796 for (auto &pool
: tmp
.get_pools()) {
1797 auto p
= pool
.first
;
1798 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1799 // skip pools we have just marked as backfillfull/full above
1802 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1803 // and don't touch if currently is not backfillfull
1806 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1807 << "'s backfillfull flag" << dendl
;
1808 if (pending_inc
.new_pools
.count(p
) == 0) {
1809 pending_inc
.new_pools
[p
] = pool
.second
;
1811 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1814 if (!nearfull_pool_ids
.empty()) {
1815 for (auto &p
: nearfull_pool_ids
) {
1816 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1819 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1820 // make sure FLAG_FULL is truly set, so we are safe not
1821 // to set a extra (redundant) FLAG_NEARFULL flag
1822 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1825 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1826 // don't bother if pool is already marked as nearfull
1829 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1830 << "'s as nearfull" << dendl
;
1831 if (pending_inc
.new_pools
.count(p
) == 0) {
1832 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1834 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1836 // cancel FLAG_NEARFULL for pools
1837 // which are no longer nearfull too
1838 for (auto &pool
: tmp
.get_pools()) {
1839 auto p
= pool
.first
;
1840 if (full_pool_ids
.count(p
) ||
1841 backfillfull_pool_ids
.count(p
) ||
1842 nearfull_pool_ids
.count(p
)) {
1843 // skip pools we have just marked as
1844 // nearfull/backfillfull/full above
1847 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1848 // and don't touch if currently is not nearfull
1851 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1852 << "'s nearfull flag" << dendl
;
1853 if (pending_inc
.new_pools
.count(p
) == 0) {
1854 pending_inc
.new_pools
[p
] = pool
.second
;
1856 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1860 // min_compat_client?
1861 if (!tmp
.require_min_compat_client
) {
1862 auto mv
= tmp
.get_min_compat_client();
1863 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1864 << "required " << mv
<< dendl
;
1865 mon
.clog
->info() << "setting require_min_compat_client to currently "
1866 << "required " << mv
;
1867 pending_inc
.new_require_min_compat_client
= mv
;
1870 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1871 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1872 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1873 // add creating flags?
1874 for (auto& i
: tmp
.get_pools()) {
1875 if (pending_creatings
.still_creating_pool(i
.first
)) {
1876 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1878 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1879 pending_inc
.new_pools
[i
.first
] = i
.second
;
1881 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1884 // adjust blocklist items to all be TYPE_ANY
1885 for (auto& i
: tmp
.blocklist
) {
1887 a
.set_type(entity_addr_t::TYPE_ANY
);
1888 pending_inc
.new_blocklist
[a
] = i
.second
;
1889 pending_inc
.old_blocklist
.push_back(i
.first
);
1893 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1894 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1895 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1897 // adjust obsoleted cache modes
1898 for (auto& [poolid
, pi
] : tmp
.pools
) {
1899 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1900 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1901 pending_inc
.new_pools
[poolid
] = pi
;
1903 dout(10) << __func__
<< " switching pool " << poolid
1904 << " cachemode from forward -> proxy" << dendl
;
1905 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1907 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1908 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1909 pending_inc
.new_pools
[poolid
] = pi
;
1911 dout(10) << __func__
<< " switching pool " << poolid
1912 << " cachemode from readforward -> readproxy" << dendl
;
1913 pending_inc
.new_pools
[poolid
].cache_mode
=
1914 pg_pool_t::CACHEMODE_READPROXY
;
1918 // clear removed_snaps for every pool
1919 for (auto& [poolid
, pi
] : tmp
.pools
) {
1920 if (pi
.removed_snaps
.empty()) {
1923 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1924 pending_inc
.new_pools
[poolid
] = pi
;
1926 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1928 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1931 // create a combined purged snap epoch key for all purged snaps
1932 // prior to this epoch, and store it in the current epoch (i.e.,
1933 // the last pre-octopus epoch, just prior to the one we're
1935 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
1936 it
->lower_bound("purged_snap_");
1937 map
<int64_t,snap_interval_set_t
> combined
;
1938 while (it
->valid()) {
1939 if (it
->key().find("purged_snap_") != 0) {
1942 string k
= it
->key();
1943 long long unsigned pool
;
1944 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1946 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1948 bufferlist v
= it
->value();
1949 auto p
= v
.cbegin();
1950 snapid_t begin
, end
;
1951 ceph::decode(begin
, p
);
1952 ceph::decode(end
, p
);
1953 combined
[pool
].insert(begin
, end
- begin
);
1957 if (!combined
.empty()) {
1958 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1960 ceph::encode(combined
, v
);
1961 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1962 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1963 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1966 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1970 // clean out the old removed_snap_ and removed_epoch keys
1971 // ('`' is ASCII '_' + 1)
1972 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1973 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1978 for (auto i
= pending_inc
.new_state
.begin();
1979 i
!= pending_inc
.new_state
.end();
1981 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1982 if (s
& CEPH_OSD_UP
) {
1983 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1984 // Reset laggy parameters if failure interval exceeds a threshold.
1985 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(i
->first
);
1986 if ((xi
.laggy_probability
|| xi
.laggy_interval
) && xi
.down_stamp
.sec()) {
1987 int last_failure_interval
= pending_inc
.modified
.sec() - xi
.down_stamp
.sec();
1988 if (grace_interval_threshold_exceeded(last_failure_interval
)) {
1989 set_default_laggy_params(i
->first
);
1993 if (s
& CEPH_OSD_EXISTS
)
1994 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1996 for (auto i
= pending_inc
.new_up_client
.begin();
1997 i
!= pending_inc
.new_up_client
.end();
1999 //FIXME: insert cluster addresses too
2000 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
2002 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
2003 i
!= pending_inc
.new_weight
.end();
2005 if (i
->second
== CEPH_OSD_OUT
) {
2006 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
2007 } else if (i
->second
== CEPH_OSD_IN
) {
2008 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
2010 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
2014 // features for osdmap and its incremental
2017 // encode full map and determine its crc
2020 tmp
.deepish_copy_from(osdmap
);
2021 tmp
.apply_incremental(pending_inc
);
2023 // determine appropriate features
2024 features
= tmp
.get_encoding_features();
2025 dout(10) << __func__
<< " encoding full map with "
2026 << tmp
.require_osd_release
2027 << " features " << features
<< dendl
;
2029 // the features should be a subset of the mon quorum's features!
2030 ceph_assert((features
& ~mon
.get_quorum_con_features()) == 0);
2033 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
2034 pending_inc
.full_crc
= tmp
.get_crc();
2036 // include full map in the txn. note that old monitors will
2037 // overwrite this. new ones will now skip the local full map
2038 // encode and reload from this.
2039 put_version_full(t
, pending_inc
.epoch
, fullbl
);
2043 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
2045 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
2047 dout(20) << " full_crc " << tmp
.get_crc()
2048 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
2050 /* put everything in the transaction */
2051 put_version(t
, pending_inc
.epoch
, bl
);
2052 put_last_committed(t
, pending_inc
.epoch
);
2055 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
2056 p
!= pending_metadata
.end();
2058 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
2059 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
2060 p
!= pending_metadata_rm
.end();
2062 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
2063 pending_metadata
.clear();
2064 pending_metadata_rm
.clear();
2067 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
2068 !pending_inc
.new_purged_snaps
.empty()) {
2069 // all snaps purged this epoch (across all pools)
2070 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
2072 encode(pending_inc
.new_purged_snaps
, v
);
2073 t
->put(OSD_SNAP_PREFIX
, k
, v
);
2075 for (auto& i
: pending_inc
.new_purged_snaps
) {
2076 for (auto q
= i
.second
.begin();
2077 q
!= i
.second
.end();
2079 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
2084 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
2085 for (auto snap
: snaps
) {
2086 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2093 health_check_map_t next
;
2094 tmp
.check_health(cct
, &next
);
2095 encode_health(next
, t
);
2098 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2101 int r
= mon
.store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2105 auto p
= bl
.cbegin();
2108 catch (ceph::buffer::error
& e
) {
2110 *err
<< "osd." << osd
<< " metadata is corrupt";
2116 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2118 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2119 if (osdmap
.is_up(osd
)) {
2120 map
<string
,string
> meta
;
2121 load_metadata(osd
, meta
, nullptr);
2122 auto p
= meta
.find(field
);
2123 if (p
== meta
.end()) {
2124 (*out
)["unknown"]++;
2126 (*out
)[p
->second
]++;
2132 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2134 map
<string
,int> by_val
;
2135 count_metadata(field
, &by_val
);
2136 f
->open_object_section(field
.c_str());
2137 for (auto& p
: by_val
) {
2138 f
->dump_int(p
.first
.c_str(), p
.second
);
2143 void OSDMonitor::get_versions(std::map
<string
, list
<string
>> &versions
)
2145 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2146 if (osdmap
.is_up(osd
)) {
2147 map
<string
,string
> meta
;
2148 load_metadata(osd
, meta
, nullptr);
2149 auto p
= meta
.find("ceph_version_short");
2150 if (p
== meta
.end()) continue;
2151 versions
[p
->second
].push_back(string("osd.") + stringify(osd
));
2156 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2158 map
<string
, string
> metadata
;
2159 int r
= load_metadata(osd
, metadata
, nullptr);
2163 auto it
= metadata
.find("osd_objectstore");
2164 if (it
== metadata
.end())
2170 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2171 const pg_pool_t
&pool
,
2174 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2175 // since filestore osds could always join the pool later
2176 set
<int> checked_osds
;
2177 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2178 vector
<int> up
, acting
;
2179 pg_t
pgid(ps
, pool_id
);
2180 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2181 for (int osd
: up
) {
2182 if (checked_osds
.find(osd
) != checked_osds
.end())
2184 string objectstore_type
;
2185 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2186 // allow with missing metadata, e.g. due to an osd never booting yet
2187 if (r
< 0 || objectstore_type
== "bluestore") {
2188 checked_osds
.insert(osd
);
2191 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2198 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2200 map
<string
,string
> m
;
2201 if (int r
= load_metadata(osd
, m
, err
))
2203 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2204 f
->dump_string(p
->first
.c_str(), p
->second
);
2208 void OSDMonitor::print_nodes(Formatter
*f
)
2210 // group OSDs by their hosts
2211 map
<string
, list
<int> > osds
; // hostname => osd
2212 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2213 map
<string
, string
> m
;
2214 if (load_metadata(osd
, m
, NULL
)) {
2217 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2218 if (hostname
== m
.end()) {
2219 // not likely though
2222 osds
[hostname
->second
].push_back(osd
);
2225 dump_services(f
, osds
, "osd");
2228 void OSDMonitor::share_map_with_random_osd()
2230 if (osdmap
.get_num_up_osds() == 0) {
2231 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2235 MonSession
*s
= mon
.session_map
.get_random_osd_session(&osdmap
);
2237 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2241 dout(10) << "committed, telling random " << s
->name
2242 << " all about it" << dendl
;
2244 // get feature of the peer
2245 // use quorum_con_features, if it's an anonymous connection.
2246 uint64_t features
= s
->con_features
? s
->con_features
:
2247 mon
.get_quorum_con_features();
2248 // whatev, they'll request more if they need it
2249 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2250 s
->con
->send_message(m
);
2251 // NOTE: do *not* record osd has up to this epoch (as we do
2252 // elsewhere) as they may still need to request older values.
2255 version_t
OSDMonitor::get_trim_to() const
2257 if (mon
.get_quorum().empty()) {
2258 dout(10) << __func__
<< " quorum not formed, trim_to = 0" << dendl
;
2263 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2264 if (!creating_pgs
.pgs
.empty()) {
2265 dout(10) << __func__
<< " pgs creating, trim_to = 0" << dendl
;
2270 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2272 << " blocking osdmap trim"
2273 << " ('mon_debug_block_osdmap_trim' set to 'true')"
2274 << " trim_to = 0" << dendl
;
2279 epoch_t floor
= get_min_last_epoch_clean();
2280 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2281 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2282 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2283 floor
= g_conf()->mon_osd_force_trim_to
;
2284 dout(10) << __func__
2285 << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2287 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2288 if (floor
+ min
> get_last_committed()) {
2289 if (min
< get_last_committed())
2290 floor
= get_last_committed() - min
;
2294 if (floor
> get_first_committed()) {
2295 dout(10) << __func__
<< " trim_to = " << floor
<< dendl
;
2299 dout(10) << __func__
<< " trim_to = 0" << dendl
;
2303 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2305 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2306 // also scan osd epochs
2307 // don't trim past the oldest reported osd epoch
2308 for (auto [osd
, epoch
] : osd_epochs
) {
2309 if (epoch
< floor
) {
2316 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2319 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2321 get_version_full(first
, bl
);
2322 put_version_full(tx
, first
, bl
);
2324 if (has_osdmap_manifest
&&
2325 first
> osdmap_manifest
.get_first_pinned()) {
2326 _prune_update_trimmed(tx
, first
);
2331 /* full osdmap prune
2333 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2336 void OSDMonitor::load_osdmap_manifest()
2338 bool store_has_manifest
=
2339 mon
.store
->exists(get_service_name(), "osdmap_manifest");
2341 if (!store_has_manifest
) {
2342 if (!has_osdmap_manifest
) {
2346 dout(20) << __func__
2347 << " dropping osdmap manifest from memory." << dendl
;
2348 osdmap_manifest
= osdmap_manifest_t();
2349 has_osdmap_manifest
= false;
2353 dout(20) << __func__
2354 << " osdmap manifest detected in store; reload." << dendl
;
2356 bufferlist manifest_bl
;
2357 int r
= get_value("osdmap_manifest", manifest_bl
);
2359 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2360 ceph_abort_msg("error reading manifest");
2362 osdmap_manifest
.decode(manifest_bl
);
2363 has_osdmap_manifest
= true;
2365 dout(10) << __func__
<< " store osdmap manifest pinned ("
2366 << osdmap_manifest
.get_first_pinned()
2368 << osdmap_manifest
.get_last_pinned()
2373 bool OSDMonitor::should_prune() const
2375 version_t first
= get_first_committed();
2376 version_t last
= get_last_committed();
2377 version_t min_osdmap_epochs
=
2378 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2379 version_t prune_min
=
2380 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2381 version_t prune_interval
=
2382 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2383 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2384 version_t last_to_pin
= last
- min_osdmap_epochs
;
2386 // Make it or break it constraints.
2388 // If any of these conditions fails, we will not prune, regardless of
2389 // whether we have an on-disk manifest with an on-going pruning state.
2391 if ((last
- first
) <= min_osdmap_epochs
) {
2392 // between the first and last committed epochs, we don't have
2393 // enough epochs to trim, much less to prune.
2394 dout(10) << __func__
2395 << " currently holding only " << (last
- first
)
2396 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2397 << "); do not prune."
2401 } else if ((last_to_pin
- first
) < prune_min
) {
2402 // between the first committed epoch and the last epoch we would prune,
2403 // we simply don't have enough versions over the minimum to prune maps.
2404 dout(10) << __func__
2405 << " could only prune " << (last_to_pin
- first
)
2406 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2407 " is less than the required minimum (" << prune_min
<< ")"
2411 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2412 dout(10) << __func__
2413 << " we have pruned as far as we can; do not prune."
2417 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2418 dout(10) << __func__
2419 << " not enough epochs to form an interval (last pinned: "
2420 << last_pinned
<< ", last to pin: "
2421 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2426 dout(15) << __func__
2427 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2428 << " lc (" << first
<< ".." << last
<< ")"
2433 void OSDMonitor::_prune_update_trimmed(
2434 MonitorDBStore::TransactionRef tx
,
2437 dout(10) << __func__
2438 << " first " << first
2439 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2442 osdmap_manifest_t manifest
= osdmap_manifest
;
2444 if (!manifest
.is_pinned(first
)) {
2445 manifest
.pin(first
);
2448 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2449 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2450 manifest
.pinned
.erase(p
, p_end
);
2451 ceph_assert(manifest
.get_first_pinned() == first
);
2453 if (manifest
.get_last_pinned() == first
+1 ||
2454 manifest
.pinned
.size() == 1) {
2455 // we reached the end of the line, as pinned maps go; clean up our
2456 // manifest, and let `should_prune()` decide whether we should prune
2458 tx
->erase(get_service_name(), "osdmap_manifest");
2463 manifest
.encode(bl
);
2464 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2467 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2469 dout(1) << __func__
<< dendl
;
2471 version_t pin_first
;
2473 // verify constrainsts on stable in-memory state
2474 if (!has_osdmap_manifest
) {
2475 // we must have never pruned, OR if we pruned the state must no longer
2476 // be relevant (i.e., the state must have been removed alongside with
2477 // the trim that *must* have removed past the last pinned map in a
2479 ceph_assert(osdmap_manifest
.pinned
.empty());
2480 ceph_assert(!mon
.store
->exists(get_service_name(), "osdmap_manifest"));
2481 pin_first
= get_first_committed();
2484 // we must have pruned in the past AND its state is still relevant
2485 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2486 // and thus we still hold a manifest in the store).
2487 ceph_assert(!osdmap_manifest
.pinned
.empty());
2488 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2489 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2491 dout(10) << __func__
2492 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2493 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2496 pin_first
= osdmap_manifest
.get_last_pinned();
2499 manifest
.pin(pin_first
);
2502 bool OSDMonitor::_prune_sanitize_options() const
2504 uint64_t prune_interval
=
2505 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2506 uint64_t prune_min
=
2507 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2509 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2513 if (prune_interval
== 0) {
2515 << " prune is enabled BUT prune interval is zero; abort."
2518 } else if (prune_interval
== 1) {
2520 << " prune interval is equal to one, which essentially means"
2521 " no pruning; abort."
2525 if (prune_min
== 0) {
2527 << " prune is enabled BUT prune min is zero; abort."
2531 if (prune_interval
> prune_min
) {
2533 << " impossible to ascertain proper prune interval because"
2534 << " it is greater than the minimum prune epochs"
2535 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2540 if (txsize
< prune_interval
- 1) {
2542 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2543 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2544 << "); abort." << dendl
;
2550 bool OSDMonitor::is_prune_enabled() const {
2551 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2554 bool OSDMonitor::is_prune_supported() const {
2555 return mon
.get_required_mon_features().contains_any(
2556 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2561 * @returns true if has side-effects; false otherwise.
2563 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2565 bool enabled
= is_prune_enabled();
2567 dout(1) << __func__
<< " osdmap full prune "
2568 << ( enabled
? "enabled" : "disabled")
2571 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2575 // we are beyond the minimum prune versions, we need to remove maps because
2576 // otherwise the store will grow unbounded and we may end up having issues
2577 // with available disk space or store hangs.
2579 // we will not pin all versions. We will leave a buffer number of versions.
2580 // this allows us the monitor to trim maps without caring too much about
2581 // pinned maps, and then allow us to use another ceph-mon without these
2582 // capabilities, without having to repair the store.
2584 osdmap_manifest_t manifest
= osdmap_manifest
;
2586 version_t first
= get_first_committed();
2587 version_t last
= get_last_committed();
2589 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2590 version_t last_pinned
= manifest
.get_last_pinned();
2591 uint64_t prune_interval
=
2592 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2594 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2596 prune_init(manifest
);
2598 // we need to get rid of some osdmaps
2601 << " lc (" << first
<< " .. " << last
<< ")"
2602 << " last_pinned " << last_pinned
2603 << " interval " << prune_interval
2604 << " last_to_pin " << last_to_pin
2607 // We will be erasing maps as we go.
2609 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2611 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2612 // we stop pruning. We could prune the maps between `next_to_pin` and
2613 // `last_to_pin`, but by not doing it we end up with neater pruned
2614 // intervals, aligned with `prune_interval`. Besides, this should not be a
2615 // problem as long as `prune_interval` is set to a sane value, instead of
2616 // hundreds or thousands of maps.
2618 auto map_exists
= [this](version_t v
) {
2619 string k
= mon
.store
->combine_strings("full", v
);
2620 return mon
.store
->exists(get_service_name(), k
);
2623 // 'interval' represents the number of maps from the last pinned
2624 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2625 // version 11 next; all intermediate versions will be removed.
2627 // 'txsize' represents the maximum number of versions we'll be removing in
2628 // this iteration. If 'txsize' is large enough to perform multiple passes
2629 // pinning and removing maps, we will do so; if not, we'll do at least one
2630 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2631 // ensure that we never go *over* the maximum.
2633 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2634 uint64_t removal_interval
= prune_interval
- 1;
2636 if (txsize
< removal_interval
) {
2638 << " setting txsize to removal interval size ("
2639 << removal_interval
<< " versions"
2641 txsize
= removal_interval
;
2643 ceph_assert(removal_interval
> 0);
2645 uint64_t num_pruned
= 0;
2646 while (num_pruned
+ removal_interval
<= txsize
) {
2647 last_pinned
= manifest
.get_last_pinned();
2649 if (last_pinned
+ prune_interval
> last_to_pin
) {
2652 ceph_assert(last_pinned
< last_to_pin
);
2654 version_t next_pinned
= last_pinned
+ prune_interval
;
2655 ceph_assert(next_pinned
<= last_to_pin
);
2656 manifest
.pin(next_pinned
);
2658 dout(20) << __func__
2659 << " last_pinned " << last_pinned
2660 << " next_pinned " << next_pinned
2661 << " num_pruned " << num_pruned
2662 << " removal interval (" << (last_pinned
+1)
2663 << ".." << (next_pinned
-1) << ")"
2664 << " txsize " << txsize
<< dendl
;
2666 ceph_assert(map_exists(last_pinned
));
2667 ceph_assert(map_exists(next_pinned
));
2669 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2670 ceph_assert(!manifest
.is_pinned(v
));
2672 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2673 string full_key
= mon
.store
->combine_strings("full", v
);
2674 tx
->erase(get_service_name(), full_key
);
2679 ceph_assert(num_pruned
> 0);
2682 manifest
.encode(bl
);
2683 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2691 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2693 op
->mark_osdmon_event(__func__
);
2694 Message
*m
= op
->get_req();
2695 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2697 switch (m
->get_type()) {
2699 case MSG_MON_COMMAND
:
2701 return preprocess_command(op
);
2702 } catch (const bad_cmd_get
& e
) {
2704 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2707 case CEPH_MSG_MON_GET_OSDMAP
:
2708 return preprocess_get_osdmap(op
);
2711 case MSG_OSD_MARK_ME_DOWN
:
2712 return preprocess_mark_me_down(op
);
2713 case MSG_OSD_MARK_ME_DEAD
:
2714 return preprocess_mark_me_dead(op
);
2716 return preprocess_full(op
);
2717 case MSG_OSD_FAILURE
:
2718 return preprocess_failure(op
);
2720 return preprocess_boot(op
);
2722 return preprocess_alive(op
);
2723 case MSG_OSD_PG_CREATED
:
2724 return preprocess_pg_created(op
);
2725 case MSG_OSD_PG_READY_TO_MERGE
:
2726 return preprocess_pg_ready_to_merge(op
);
2727 case MSG_OSD_PGTEMP
:
2728 return preprocess_pgtemp(op
);
2729 case MSG_OSD_BEACON
:
2730 return preprocess_beacon(op
);
2732 case CEPH_MSG_POOLOP
:
2733 return preprocess_pool_op(op
);
2735 case MSG_REMOVE_SNAPS
:
2736 return preprocess_remove_snaps(op
);
2738 case MSG_MON_GET_PURGED_SNAPS
:
2739 return preprocess_get_purged_snaps(op
);
2747 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2749 op
->mark_osdmon_event(__func__
);
2750 Message
*m
= op
->get_req();
2751 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2753 switch (m
->get_type()) {
2755 case MSG_OSD_MARK_ME_DOWN
:
2756 return prepare_mark_me_down(op
);
2757 case MSG_OSD_MARK_ME_DEAD
:
2758 return prepare_mark_me_dead(op
);
2760 return prepare_full(op
);
2761 case MSG_OSD_FAILURE
:
2762 return prepare_failure(op
);
2764 return prepare_boot(op
);
2766 return prepare_alive(op
);
2767 case MSG_OSD_PG_CREATED
:
2768 return prepare_pg_created(op
);
2769 case MSG_OSD_PGTEMP
:
2770 return prepare_pgtemp(op
);
2771 case MSG_OSD_PG_READY_TO_MERGE
:
2772 return prepare_pg_ready_to_merge(op
);
2773 case MSG_OSD_BEACON
:
2774 return prepare_beacon(op
);
2776 case MSG_MON_COMMAND
:
2778 return prepare_command(op
);
2779 } catch (const bad_cmd_get
& e
) {
2781 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2785 case CEPH_MSG_POOLOP
:
2786 return prepare_pool_op(op
);
2788 case MSG_REMOVE_SNAPS
:
2789 return prepare_remove_snaps(op
);
2799 bool OSDMonitor::should_propose(double& delay
)
2801 dout(10) << "should_propose" << dendl
;
2803 // if full map, propose immediately! any subsequent changes will be clobbered.
2804 if (pending_inc
.fullmap
.length())
2807 // adjust osd weights?
2808 if (!osd_weight
.empty() &&
2809 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2810 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2811 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2817 return PaxosService::should_propose(delay
);
2822 // ---------------------------
2825 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2827 op
->mark_osdmon_event(__func__
);
2828 auto m
= op
->get_req
<MMonGetOSDMap
>();
2830 uint64_t features
= mon
.get_quorum_con_features();
2831 if (op
->get_session() && op
->get_session()->con_features
)
2832 features
= op
->get_session()->con_features
;
2834 dout(10) << __func__
<< " " << *m
<< dendl
;
2835 MOSDMap
*reply
= new MOSDMap(mon
.monmap
->fsid
, features
);
2836 epoch_t first
= get_first_committed();
2837 epoch_t last
= osdmap
.get_epoch();
2838 int max
= g_conf()->osd_map_message_max
;
2839 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2840 for (epoch_t e
= std::max(first
, m
->get_full_first());
2841 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2843 bufferlist
& bl
= reply
->maps
[e
];
2844 int r
= get_version_full(e
, features
, bl
);
2845 ceph_assert(r
>= 0);
2846 max_bytes
-= bl
.length();
2848 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2849 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2851 bufferlist
& bl
= reply
->incremental_maps
[e
];
2852 int r
= get_version(e
, features
, bl
);
2853 ceph_assert(r
>= 0);
2854 max_bytes
-= bl
.length();
2856 reply
->oldest_map
= first
;
2857 reply
->newest_map
= last
;
2858 mon
.send_reply(op
, reply
);
2863 // ---------------------------
2868 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2869 // check permissions
2870 MonSession
*session
= op
->get_session();
2873 if (!session
->is_capable("osd", MON_CAP_X
)) {
2874 dout(0) << "got MOSDFailure from entity with insufficient caps "
2875 << session
->caps
<< dendl
;
2878 if (fsid
!= mon
.monmap
->fsid
) {
2879 dout(0) << "check_source: on fsid " << fsid
2880 << " != " << mon
.monmap
->fsid
<< dendl
;
2887 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2889 op
->mark_osdmon_event(__func__
);
2890 auto m
= op
->get_req
<MOSDFailure
>();
2891 // who is target_osd
2892 int badboy
= m
->get_target_osd();
2894 // check permissions
2895 if (check_source(op
, m
->fsid
))
2898 // first, verify the reporting host is valid
2899 if (m
->get_orig_source().is_osd()) {
2900 int from
= m
->get_orig_source().num();
2901 if (!osdmap
.exists(from
) ||
2902 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2903 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2904 dout(5) << "preprocess_failure from dead osd." << from
2905 << ", ignoring" << dendl
;
2906 send_incremental(op
, m
->get_epoch()+1);
2913 if (osdmap
.is_down(badboy
)) {
2914 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2915 << " " << m
->get_target_addrs()
2916 << ", from " << m
->get_orig_source() << dendl
;
2917 if (m
->get_epoch() < osdmap
.get_epoch())
2918 send_incremental(op
, m
->get_epoch()+1);
2921 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2922 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2923 << " " << m
->get_target_addrs()
2924 << " != map's " << osdmap
.get_addrs(badboy
)
2925 << ", from " << m
->get_orig_source() << dendl
;
2926 if (m
->get_epoch() < osdmap
.get_epoch())
2927 send_incremental(op
, m
->get_epoch()+1);
2931 // already reported?
2932 if (osdmap
.is_down(badboy
) ||
2933 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2934 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2935 << " " << m
->get_target_addrs()
2936 << ", from " << m
->get_orig_source() << dendl
;
2937 if (m
->get_epoch() < osdmap
.get_epoch())
2938 send_incremental(op
, m
->get_epoch()+1);
2942 if (!can_mark_down(badboy
)) {
2943 dout(5) << "preprocess_failure ignoring report of osd."
2944 << m
->get_target_osd() << " " << m
->get_target_addrs()
2945 << " from " << m
->get_orig_source() << dendl
;
2949 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2950 << " " << m
->get_target_addrs()
2951 << ", from " << m
->get_orig_source() << dendl
;
2959 class C_AckMarkedDown
: public C_MonOp
{
2965 : C_MonOp(op
), osdmon(osdmon
) {}
2967 void _finish(int r
) override
{
2969 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2970 osdmon
->mon
.send_reply(
2977 false)); // ACK itself does not request an ack
2978 } else if (r
== -EAGAIN
) {
2979 osdmon
->dispatch(op
);
2981 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
2984 ~C_AckMarkedDown() override
{
2988 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
2990 op
->mark_osdmon_event(__func__
);
2991 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2992 int from
= m
->target_osd
;
2994 // check permissions
2995 if (check_source(op
, m
->fsid
))
2998 // first, verify the reporting host is valid
2999 if (!m
->get_orig_source().is_osd())
3002 if (!osdmap
.exists(from
) ||
3003 osdmap
.is_down(from
) ||
3004 osdmap
.get_addrs(from
) != m
->target_addrs
) {
3005 dout(5) << "preprocess_mark_me_down from dead osd."
3006 << from
<< ", ignoring" << dendl
;
3007 send_incremental(op
, m
->get_epoch()+1);
3011 // no down might be set
3012 if (!can_mark_down(from
))
3015 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
3016 << " " << m
->target_addrs
<< dendl
;
3020 if (m
->request_ack
) {
3021 Context
*c(new C_AckMarkedDown(this, op
));
3027 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
3029 op
->mark_osdmon_event(__func__
);
3030 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3031 int target_osd
= m
->target_osd
;
3033 ceph_assert(osdmap
.is_up(target_osd
));
3034 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
3036 mon
.clog
->info() << "osd." << target_osd
<< " marked itself down";
3037 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3039 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
3043 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
3045 op
->mark_osdmon_event(__func__
);
3046 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3047 int from
= m
->target_osd
;
3049 // check permissions
3050 if (check_source(op
, m
->fsid
)) {
3055 // first, verify the reporting host is valid
3056 if (!m
->get_orig_source().is_osd()) {
3061 if (!osdmap
.exists(from
) ||
3062 !osdmap
.is_down(from
)) {
3063 dout(5) << __func__
<< " from nonexistent or up osd." << from
3064 << ", ignoring" << dendl
;
3065 send_incremental(op
, m
->get_epoch()+1);
3073 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
3075 op
->mark_osdmon_event(__func__
);
3076 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3077 int target_osd
= m
->target_osd
;
3079 ceph_assert(osdmap
.is_down(target_osd
));
3081 mon
.clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
3083 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3084 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3086 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3087 wait_for_finished_proposal(
3090 [op
, this] (int r
) {
3092 mon
.no_reply(op
); // ignore on success
3099 bool OSDMonitor::can_mark_down(int i
)
3101 if (osdmap
.is_nodown(i
)) {
3102 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3103 << "will not mark it down" << dendl
;
3107 int num_osds
= osdmap
.get_num_osds();
3108 if (num_osds
== 0) {
3109 dout(5) << __func__
<< " no osds" << dendl
;
3112 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3113 float up_ratio
= (float)up
/ (float)num_osds
;
3114 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3115 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3116 << g_conf()->mon_osd_min_up_ratio
3117 << ", will not mark osd." << i
<< " down" << dendl
;
3123 bool OSDMonitor::can_mark_up(int i
)
3125 if (osdmap
.is_noup(i
)) {
3126 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3127 << "will not mark it up" << dendl
;
3135 * @note the parameter @p i apparently only exists here so we can output the
3136 * osd's id on messages.
3138 bool OSDMonitor::can_mark_out(int i
)
3140 if (osdmap
.is_noout(i
)) {
3141 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3142 << "will not mark it out" << dendl
;
3146 int num_osds
= osdmap
.get_num_osds();
3147 if (num_osds
== 0) {
3148 dout(5) << __func__
<< " no osds" << dendl
;
3151 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3152 float in_ratio
= (float)in
/ (float)num_osds
;
3153 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3155 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3156 << g_conf()->mon_osd_min_in_ratio
3157 << ", will not mark osd." << i
<< " out" << dendl
;
3159 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3160 << g_conf()->mon_osd_min_in_ratio
3161 << ", will not mark osds out" << dendl
;
3168 bool OSDMonitor::can_mark_in(int i
)
3170 if (osdmap
.is_noin(i
)) {
3171 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3172 << "will not mark it in" << dendl
;
3179 bool OSDMonitor::check_failures(utime_t now
)
3181 bool found_failure
= false;
3182 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3183 p
!= failure_info
.end();
3185 if (can_mark_down(p
->first
)) {
3186 found_failure
|= check_failure(now
, p
->first
, p
->second
);
3189 return found_failure
;
3192 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3194 // already pending failure?
3195 if (pending_inc
.new_state
.count(target_osd
) &&
3196 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3197 dout(10) << " already pending failure" << dendl
;
3201 set
<string
> reporters_by_subtree
;
3202 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3203 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3204 utime_t max_failed_since
= fi
.get_failed_since();
3205 utime_t failed_for
= now
- max_failed_since
;
3207 utime_t grace
= orig_grace
;
3208 double my_grace
= 0, peer_grace
= 0;
3210 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3211 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3212 decay_k
= ::log(.5) / halflife
;
3214 // scale grace period based on historical probability of 'lagginess'
3215 // (false positive failures due to slowness).
3216 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3217 double decay
= exp((double)failed_for
* decay_k
);
3218 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3219 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3220 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3224 // consider the peers reporting a failure a proxy for a potential
3225 // 'subcluster' over the overall cluster that is similarly
3226 // laggy. this is clearly not true in all cases, but will sometimes
3227 // help us localize the grace correction to a subset of the system
3228 // (say, a rack with a bad switch) that is unhappy.
3229 ceph_assert(fi
.reporters
.size());
3230 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3231 // get the parent bucket whose type matches with "reporter_subtree_level".
3232 // fall back to OSD if the level doesn't exist.
3233 if (osdmap
.exists(p
->first
)) {
3234 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3235 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3236 iter
== reporter_loc
.end()) {
3237 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3239 reporters_by_subtree
.insert(iter
->second
);
3241 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3242 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
3243 utime_t elapsed
= now
- xi
.down_stamp
;
3244 double decay
= exp((double)elapsed
* decay_k
);
3245 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3249 fi
.cancel_report(p
->first
);;
3250 p
= fi
.reporters
.erase(p
);
3254 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3255 peer_grace
/= (double)fi
.reporters
.size();
3256 grace
+= peer_grace
;
3259 dout(10) << " osd." << target_osd
<< " has "
3260 << fi
.reporters
.size() << " reporters, "
3261 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3262 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
3265 if (failed_for
>= grace
&&
3266 reporters_by_subtree
.size() >= g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3267 dout(1) << " we have enough reporters to mark osd." << target_osd
3268 << " down" << dendl
;
3269 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3271 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3272 << osdmap
.crush
->get_full_location_ordered_string(
3275 << (int)reporters_by_subtree
.size()
3276 << " reporters from different "
3277 << reporter_subtree_level
<< " after "
3278 << failed_for
<< " >= grace " << grace
<< ")";
3284 void OSDMonitor::force_failure(int target_osd
, int by
)
3286 // already pending failure?
3287 if (pending_inc
.new_state
.count(target_osd
) &&
3288 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3289 dout(10) << " already pending failure" << dendl
;
3293 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
3294 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3295 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3296 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3298 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3300 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3301 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3302 << ") (connection refused reported by osd." << by
<< ")";
3306 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3308 op
->mark_osdmon_event(__func__
);
3309 auto m
= op
->get_req
<MOSDFailure
>();
3310 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3311 << " " << m
->get_target_addrs()
3312 << " from " << m
->get_orig_source()
3313 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3315 int target_osd
= m
->get_target_osd();
3316 int reporter
= m
->get_orig_source().num();
3317 ceph_assert(osdmap
.is_up(target_osd
));
3318 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3322 if (m
->if_osd_failed()) {
3323 // calculate failure time
3324 utime_t now
= ceph_clock_now();
3325 utime_t failed_since
=
3326 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3329 if (m
->is_immediate()) {
3330 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3331 << " reported immediately failed by "
3332 << m
->get_orig_source();
3333 force_failure(target_osd
, reporter
);
3336 mon
.clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3337 << m
->get_orig_source();
3339 failure_info_t
& fi
= failure_info
[target_osd
];
3340 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
3342 mon
.no_reply(old_op
);
3345 return check_failure(now
, target_osd
, fi
);
3347 // remove the report
3348 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3349 << " failure report canceled by "
3350 << m
->get_orig_source();
3351 if (failure_info
.count(target_osd
)) {
3352 failure_info_t
& fi
= failure_info
[target_osd
];
3353 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
3355 mon
.no_reply(report_op
);
3357 if (fi
.reporters
.empty()) {
3358 dout(10) << " removing last failure_info for osd." << target_osd
3360 failure_info
.erase(target_osd
);
3362 dout(10) << " failure_info for osd." << target_osd
<< " now "
3363 << fi
.reporters
.size() << " reporters" << dendl
;
3366 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
3373 void OSDMonitor::process_failures()
3375 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3376 while (p
!= failure_info
.end()) {
3377 if (osdmap
.is_up(p
->first
)) {
3380 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3381 list
<MonOpRequestRef
> ls
;
3382 p
->second
.take_report_messages(ls
);
3383 failure_info
.erase(p
++);
3385 while (!ls
.empty()) {
3386 MonOpRequestRef o
= ls
.front();
3388 o
->mark_event(__func__
);
3389 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3390 send_latest(o
, m
->get_epoch());
3399 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3401 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3403 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3404 p
!= failure_info
.end();
3406 p
->second
.take_report_messages(ls
);
3408 failure_info
.clear();
3411 int OSDMonitor::get_grace_interval_threshold()
3413 int halflife
= g_conf()->mon_osd_laggy_halflife
;
3414 // Scale the halflife period (default: 1_hr) by
3415 // a factor (48) to calculate the threshold.
3416 int grace_threshold_factor
= 48;
3417 return halflife
* grace_threshold_factor
;
3420 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval
)
3422 int grace_interval_threshold_secs
= get_grace_interval_threshold();
3423 if (last_failed_interval
> grace_interval_threshold_secs
) {
3424 dout(1) << " last_failed_interval " << last_failed_interval
3425 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3432 void OSDMonitor::set_default_laggy_params(int target_osd
)
3434 if (pending_inc
.new_xinfo
.count(target_osd
) == 0) {
3435 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3437 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[target_osd
];
3438 xi
.down_stamp
= pending_inc
.modified
;
3439 xi
.laggy_probability
= 0.0;
3440 xi
.laggy_interval
= 0;
3441 dout(20) << __func__
<< " reset laggy, now xi " << xi
<< dendl
;
3447 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3449 op
->mark_osdmon_event(__func__
);
3450 auto m
= op
->get_req
<MOSDBoot
>();
3451 int from
= m
->get_orig_source_inst().name
.num();
3453 // check permissions, ignore if failed (no response expected)
3454 MonSession
*session
= op
->get_session();
3457 if (!session
->is_capable("osd", MON_CAP_X
)) {
3458 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3459 << session
->caps
<< dendl
;
3463 if (m
->sb
.cluster_fsid
!= mon
.monmap
->fsid
) {
3464 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3465 << " != " << mon
.monmap
->fsid
<< dendl
;
3469 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3470 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3474 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3476 // force all osds to have gone through luminous prior to upgrade to nautilus
3478 vector
<string
> missing
;
3479 if (!HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
3480 missing
.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
3482 if (!HAVE_FEATURE(m
->osd_features
, SERVER_JEWEL
)) {
3483 missing
.push_back("CEPH_FEATURE_SERVER_JEWEL");
3485 if (!HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
3486 missing
.push_back("CEPH_FEATURE_SERVER_KRAKEN");
3488 if (!HAVE_FEATURE(m
->osd_features
, OSD_RECOVERY_DELETES
)) {
3489 missing
.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
3492 if (!missing
.empty()) {
3493 using std::experimental::make_ostream_joiner
;
3496 copy(begin(missing
), end(missing
), make_ostream_joiner(ss
, ";"));
3498 mon
.clog
->info() << "disallowing boot of OSD "
3499 << m
->get_orig_source_inst()
3500 << " because the osd lacks " << ss
.str();
3505 // make sure osd versions do not span more than 3 releases
3506 if (HAVE_FEATURE(m
->osd_features
, SERVER_OCTOPUS
) &&
3507 osdmap
.require_osd_release
< ceph_release_t::mimic
) {
3508 mon
.clog
->info() << "disallowing boot of octopus+ OSD "
3509 << m
->get_orig_source_inst()
3510 << " because require_osd_release < mimic";
3513 if (HAVE_FEATURE(m
->osd_features
, SERVER_PACIFIC
) &&
3514 osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
3515 mon
.clog
->info() << "disallowing boot of pacific+ OSD "
3516 << m
->get_orig_source_inst()
3517 << " because require_osd_release < nautilus";
3521 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
3522 // we are reusing a jewel feature bit that was retired in luminous.
3523 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
3524 osdmap
.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT
) &&
3525 !(m
->osd_features
& CEPH_FEATURE_OSD_PGLOG_HARDLIMIT
)) {
3526 mon
.clog
->info() << "disallowing boot of OSD "
3527 << m
->get_orig_source_inst()
3528 << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
3532 if (osdmap
.stretch_mode_enabled
&&
3533 !(m
->osd_features
& CEPH_FEATUREMASK_STRETCH_MODE
)) {
3534 mon
.clog
->info() << "disallowing boot of OSD "
3535 << m
->get_orig_source_inst()
3536 << " because stretch mode is on and OSD lacks support";
3541 if (osdmap
.is_up(from
) &&
3542 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3543 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3545 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3546 << " " << m
->get_orig_source_addrs()
3547 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3552 if (osdmap
.exists(from
) &&
3553 !osdmap
.get_uuid(from
).is_zero() &&
3554 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3555 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3556 << " clashes with existing osd: different fsid"
3557 << " (ours: " << osdmap
.get_uuid(from
)
3558 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3562 if (osdmap
.exists(from
) &&
3563 osdmap
.get_info(from
).up_from
> m
->version
&&
3564 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3565 m
->get_orig_source_addrs())) {
3566 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3567 send_latest(op
, m
->sb
.current_epoch
+1);
3572 if (!can_mark_up(from
)) {
3573 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3574 send_latest(op
, m
->sb
.current_epoch
+1);
3578 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
3585 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3587 op
->mark_osdmon_event(__func__
);
3588 auto m
= op
->get_req
<MOSDBoot
>();
3589 dout(7) << __func__
<< " from " << m
->get_source()
3591 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3592 << " cluster_addrs " << m
->cluster_addrs
3593 << " hb_back_addrs " << m
->hb_back_addrs
3594 << " hb_front_addrs " << m
->hb_front_addrs
3597 ceph_assert(m
->get_orig_source().is_osd());
3598 int from
= m
->get_orig_source().num();
3600 // does this osd exist?
3601 if (from
>= osdmap
.get_max_osd()) {
3602 dout(1) << "boot from osd." << from
<< " >= max_osd "
3603 << osdmap
.get_max_osd() << dendl
;
3607 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3608 if (pending_inc
.new_state
.count(from
))
3609 oldstate
^= pending_inc
.new_state
[from
];
3611 // already up? mark down first?
3612 if (osdmap
.is_up(from
)) {
3613 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3614 << osdmap
.get_addrs(from
) << dendl
;
3615 // preprocess should have caught these; if not, assert.
3616 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3617 m
->get_orig_source_addrs()) ||
3618 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3619 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3621 if (pending_inc
.new_state
.count(from
) == 0 ||
3622 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3623 // mark previous guy down
3624 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3626 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3627 } else if (pending_inc
.new_up_client
.count(from
)) {
3628 // already prepared, just wait
3629 dout(7) << __func__
<< " already prepared, waiting on "
3630 << m
->get_orig_source_addr() << dendl
;
3631 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3634 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3635 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3636 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3637 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3639 down_pending_out
.erase(from
); // if any
3642 osd_weight
[from
] = m
->sb
.weight
;
3645 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3647 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3648 // preprocess should have caught this; if not, assert.
3649 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3650 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3654 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3655 const osd_info_t
& i
= osdmap
.get_info(from
);
3656 if (i
.up_from
> i
.lost_at
) {
3657 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3658 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3663 bufferlist osd_metadata
;
3664 encode(m
->metadata
, osd_metadata
);
3665 pending_metadata
[from
] = osd_metadata
;
3666 pending_metadata_rm
.erase(from
);
3668 // adjust last clean unmount epoch?
3669 const osd_info_t
& info
= osdmap
.get_info(from
);
3670 dout(10) << " old osd_info: " << info
<< dendl
;
3671 if (m
->sb
.mounted
> info
.last_clean_begin
||
3672 (m
->sb
.mounted
== info
.last_clean_begin
&&
3673 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3674 epoch_t begin
= m
->sb
.mounted
;
3675 epoch_t end
= m
->sb
.clean_thru
;
3677 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3678 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3679 << ") -> [" << begin
<< "-" << end
<< ")"
3681 pending_inc
.new_last_clean_interval
[from
] =
3682 pair
<epoch_t
,epoch_t
>(begin
, end
);
3685 if (pending_inc
.new_xinfo
.count(from
) == 0)
3686 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3687 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
3688 if (m
->boot_epoch
== 0) {
3689 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3690 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3691 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3693 if (xi
.down_stamp
.sec()) {
3694 int interval
= ceph_clock_now().sec() -
3695 xi
.down_stamp
.sec();
3696 if (g_conf()->mon_osd_laggy_max_interval
&&
3697 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3698 interval
= g_conf()->mon_osd_laggy_max_interval
;
3701 interval
* g_conf()->mon_osd_laggy_weight
+
3702 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3704 xi
.laggy_probability
=
3705 g_conf()->mon_osd_laggy_weight
+
3706 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3707 dout(10) << " laggy, now xi " << xi
<< dendl
;
3710 // set features shared by the osd
3711 if (m
->osd_features
)
3712 xi
.features
= m
->osd_features
;
3714 xi
.features
= m
->get_connection()->get_features();
3717 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3718 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3719 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3720 (g_conf()->mon_osd_auto_mark_in
)) {
3721 if (can_mark_in(from
)) {
3722 if (xi
.old_weight
> 0) {
3723 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3726 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3729 dout(7) << __func__
<< " NOIN set, will not mark in "
3730 << m
->get_orig_source_addr() << dendl
;
3735 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3740 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3742 op
->mark_osdmon_event(__func__
);
3743 auto m
= op
->get_req
<MOSDBoot
>();
3744 dout(7) << "_booted " << m
->get_orig_source_inst()
3745 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3748 mon
.clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3752 send_latest(op
, m
->sb
.current_epoch
+1);
3759 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3761 op
->mark_osdmon_event(__func__
);
3762 auto m
= op
->get_req
<MOSDFull
>();
3763 int from
= m
->get_orig_source().num();
3765 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3767 // check permissions, ignore if failed
3768 MonSession
*session
= op
->get_session();
3771 if (!session
->is_capable("osd", MON_CAP_X
)) {
3772 dout(0) << "MOSDFull from entity with insufficient privileges:"
3773 << session
->caps
<< dendl
;
3777 // ignore a full message from the osd instance that already went down
3778 if (!osdmap
.exists(from
)) {
3779 dout(7) << __func__
<< " ignoring full message from nonexistent "
3780 << m
->get_orig_source_inst() << dendl
;
3783 if ((!osdmap
.is_up(from
) &&
3784 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3785 m
->get_orig_source_addrs())) ||
3786 (osdmap
.is_up(from
) &&
3787 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3788 dout(7) << __func__
<< " ignoring full message from down "
3789 << m
->get_orig_source_inst() << dendl
;
3793 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3795 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3796 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3797 << " " << m
->get_orig_source_inst() << dendl
;
3798 _reply_map(op
, m
->version
);
3802 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3803 << " " << m
->get_orig_source_inst() << dendl
;
3810 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3812 op
->mark_osdmon_event(__func__
);
3813 auto m
= op
->get_req
<MOSDFull
>();
3814 const int from
= m
->get_orig_source().num();
3816 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3817 const unsigned want_state
= m
->state
& mask
; // safety first
3819 unsigned cur_state
= osdmap
.get_state(from
);
3820 auto p
= pending_inc
.new_state
.find(from
);
3821 if (p
!= pending_inc
.new_state
.end()) {
3822 cur_state
^= p
->second
;
3826 set
<string
> want_state_set
, cur_state_set
;
3827 OSDMap::calc_state_set(want_state
, want_state_set
);
3828 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3830 if (cur_state
!= want_state
) {
3831 if (p
!= pending_inc
.new_state
.end()) {
3834 pending_inc
.new_state
[from
] = 0;
3836 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3837 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3838 << " -> " << want_state_set
<< dendl
;
3840 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3841 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3844 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3851 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3853 op
->mark_osdmon_event(__func__
);
3854 auto m
= op
->get_req
<MOSDAlive
>();
3855 int from
= m
->get_orig_source().num();
3857 // check permissions, ignore if failed
3858 MonSession
*session
= op
->get_session();
3861 if (!session
->is_capable("osd", MON_CAP_X
)) {
3862 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3863 << session
->caps
<< dendl
;
3867 if (!osdmap
.is_up(from
) ||
3868 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3869 dout(7) << "preprocess_alive ignoring alive message from down "
3870 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3875 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3877 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3878 _reply_map(op
, m
->version
);
3882 dout(10) << "preprocess_alive want up_thru " << m
->want
3883 << " from " << m
->get_orig_source_inst() << dendl
;
3890 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3892 op
->mark_osdmon_event(__func__
);
3893 auto m
= op
->get_req
<MOSDAlive
>();
3894 int from
= m
->get_orig_source().num();
3896 if (0) { // we probably don't care much about these
3897 mon
.clog
->debug() << m
->get_orig_source_inst() << " alive";
3900 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3901 << " from " << m
->get_orig_source_inst() << dendl
;
3903 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3904 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3908 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3910 op
->mark_osdmon_event(__func__
);
3911 dout(7) << "_reply_map " << e
3912 << " from " << op
->get_req()->get_orig_source_inst()
3918 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3920 op
->mark_osdmon_event(__func__
);
3921 auto m
= op
->get_req
<MOSDPGCreated
>();
3922 dout(10) << __func__
<< " " << *m
<< dendl
;
3923 auto session
= op
->get_session();
3926 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3929 if (!session
->is_capable("osd", MON_CAP_X
)) {
3930 derr
<< __func__
<< " received from entity "
3931 << "with insufficient privileges " << session
->caps
<< dendl
;
3934 // always forward the "created!" to the leader
3938 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3940 op
->mark_osdmon_event(__func__
);
3941 auto m
= op
->get_req
<MOSDPGCreated
>();
3942 dout(10) << __func__
<< " " << *m
<< dendl
;
3943 auto src
= m
->get_orig_source();
3944 auto from
= src
.num();
3945 if (!src
.is_osd() ||
3946 !mon
.osdmon()->osdmap
.is_up(from
) ||
3947 !mon
.osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3948 m
->get_orig_source_addrs())) {
3949 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
3952 pending_created_pgs
.push_back(m
->pgid
);
3956 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3958 op
->mark_osdmon_event(__func__
);
3959 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3960 dout(10) << __func__
<< " " << *m
<< dendl
;
3961 const pg_pool_t
*pi
;
3962 auto session
= op
->get_session();
3964 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3967 if (!session
->is_capable("osd", MON_CAP_X
)) {
3968 derr
<< __func__
<< " received from entity "
3969 << "with insufficient privileges " << session
->caps
<< dendl
;
3972 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3974 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3977 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3978 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
3981 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
3982 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
3985 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
3986 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
3996 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
3998 op
->mark_osdmon_event(__func__
);
3999 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
4000 dout(10) << __func__
<< " " << *m
<< dendl
;
4002 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
4003 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
4005 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
4006 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
4007 p
.get_pg_num_pending() > m
->pgid
.ps()) {
4008 dout(10) << __func__
4009 << " race with concurrent pg_num[_pending] update, will retry"
4011 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
4016 p
.dec_pg_num(m
->pgid
,
4020 m
->last_epoch_started
,
4021 m
->last_epoch_clean
);
4022 p
.last_change
= pending_inc
.epoch
;
4024 // back off the merge attempt!
4025 p
.set_pg_num_pending(p
.get_pg_num());
4028 // force pre-nautilus clients to resend their ops, since they
4029 // don't understand pg_num_pending changes form a new interval
4030 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
4032 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
4034 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
4037 prob
> (double)(rand() % 1000)/1000.0) {
4038 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
4039 auto n
= new MMonCommand(mon
.monmap
->get_fsid());
4040 n
->set_connection(m
->get_connection());
4041 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4042 osdmap
.get_pool_name(m
->pgid
.pool()) +
4043 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4044 stringify(m
->pgid
.ps() + 1) + "\"}" };
4045 MonOpRequestRef nop
= mon
.op_tracker
.create_request
<MonOpRequest
>(n
);
4046 nop
->set_type_service();
4047 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
4049 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
4058 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
4060 auto m
= op
->get_req
<MOSDPGTemp
>();
4061 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
4062 mempool::osdmap::vector
<int> empty
;
4063 int from
= m
->get_orig_source().num();
4064 size_t ignore_cnt
= 0;
4067 MonSession
*session
= op
->get_session();
4070 if (!session
->is_capable("osd", MON_CAP_X
)) {
4071 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4072 << session
->caps
<< dendl
;
4076 if (!osdmap
.is_up(from
) ||
4077 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
4078 dout(7) << "ignoring pgtemp message from down "
4079 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
4088 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4089 dout(20) << " " << p
->first
4090 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
4091 << " -> " << p
->second
<< dendl
;
4093 // does the pool exist?
4094 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
4096 * 1. If the osdmap does not have the pool, it means the pool has been
4097 * removed in-between the osd sending this message and us handling it.
4098 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4099 * not exist in the pending either, as the osds would not send a
4100 * message about a pool they know nothing about (yet).
4101 * 3. However, if the pool does exist in the pending, then it must be a
4102 * new pool, and not relevant to this message (see 1).
4104 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4105 << ": pool has been removed" << dendl
;
4110 int acting_primary
= -1;
4111 osdmap
.pg_to_up_acting_osds(
4112 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
4113 if (acting_primary
!= from
) {
4114 /* If the source isn't the primary based on the current osdmap, we know
4115 * that the interval changed and that we can discard this message.
4116 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4117 * which of two pg temp mappings on the same pg is more recent.
4119 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4120 << ": primary has changed" << dendl
;
4126 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
4127 osdmap
.primary_temp
->count(p
->first
)))
4130 // NOTE: we assume that this will clear pg_primary, so consider
4131 // an existing pg_primary field to imply a change
4132 if (p
->second
.size() &&
4133 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
4134 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
4135 osdmap
.primary_temp
->count(p
->first
)))
4139 // should we ignore all the pgs?
4140 if (ignore_cnt
== m
->pg_temp
.size())
4143 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
4144 _reply_map(op
, m
->map_epoch
);
4152 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4154 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4155 auto ut
= pending_inc
.new_up_thru
.find(from
);
4156 if (ut
!= pending_inc
.new_up_thru
.end()) {
4157 old_up_thru
= ut
->second
;
4159 if (up_thru
> old_up_thru
) {
4160 // set up_thru too, so the osd doesn't have to ask again
4161 pending_inc
.new_up_thru
[from
] = up_thru
;
4165 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4167 op
->mark_osdmon_event(__func__
);
4168 auto m
= op
->get_req
<MOSDPGTemp
>();
4169 int from
= m
->get_orig_source().num();
4170 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4171 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4172 uint64_t pool
= p
->first
.pool();
4173 if (pending_inc
.old_pools
.count(pool
)) {
4174 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4175 << ": pool pending removal" << dendl
;
4178 if (!osdmap
.have_pg_pool(pool
)) {
4179 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4180 << ": pool has been removed" << dendl
;
4183 pending_inc
.new_pg_temp
[p
->first
] =
4184 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4186 // unconditionally clear pg_primary (until this message can encode
4187 // a change for that, too.. at which point we need to also fix
4188 // preprocess_pg_temp)
4189 if (osdmap
.primary_temp
->count(p
->first
) ||
4190 pending_inc
.new_primary_temp
.count(p
->first
))
4191 pending_inc
.new_primary_temp
[p
->first
] = -1;
4194 // set up_thru too, so the osd doesn't have to ask again
4195 update_up_thru(from
, m
->map_epoch
);
4197 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
4204 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4206 op
->mark_osdmon_event(__func__
);
4207 auto m
= op
->get_req
<MRemoveSnaps
>();
4208 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4210 // check privilege, ignore if failed
4211 MonSession
*session
= op
->get_session();
4215 if (!session
->caps
.is_capable(
4217 session
->entity_name
,
4218 "osd", "osd pool rmsnap", {}, true, true, false,
4219 session
->get_peer_socket_addr())) {
4220 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4221 << session
->caps
<< dendl
;
4225 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4226 q
!= m
->snaps
.end();
4228 if (!osdmap
.have_pg_pool(q
->first
)) {
4229 dout(10) << " ignoring removed_snaps " << q
->second
4230 << " on non-existent pool " << q
->first
<< dendl
;
4233 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4234 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4235 p
!= q
->second
.end();
4237 if (*p
> pi
->get_snap_seq() ||
4238 !_is_removed_snap(q
->first
, *p
)) {
4244 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4245 auto reply
= make_message
<MRemoveSnaps
>();
4246 reply
->snaps
= m
->snaps
;
4247 mon
.send_reply(op
, reply
.detach());
4254 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4256 op
->mark_osdmon_event(__func__
);
4257 auto m
= op
->get_req
<MRemoveSnaps
>();
4258 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4260 for (auto& [pool
, snaps
] : m
->snaps
) {
4261 if (!osdmap
.have_pg_pool(pool
)) {
4262 dout(10) << " ignoring removed_snaps " << snaps
4263 << " on non-existent pool " << pool
<< dendl
;
4267 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4268 for (auto s
: snaps
) {
4269 if (!_is_removed_snap(pool
, s
) &&
4270 (!pending_inc
.new_pools
.count(pool
) ||
4271 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4272 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4273 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4274 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4275 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4276 newpi
->removed_snaps
.insert(s
);
4277 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4278 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4280 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4281 if (s
> newpi
->get_snap_seq()) {
4282 dout(10) << " pool " << pool
<< " snap_seq "
4283 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4284 newpi
->set_snap_seq(s
);
4286 newpi
->set_snap_epoch(pending_inc
.epoch
);
4287 dout(10) << " added pool " << pool
<< " snap " << s
4288 << " to removed_snaps queue" << dendl
;
4289 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4294 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4295 auto reply
= make_message
<MRemoveSnaps
>();
4296 reply
->snaps
= m
->snaps
;
4297 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
4303 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4305 op
->mark_osdmon_event(__func__
);
4306 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4307 dout(7) << __func__
<< " " << *m
<< dendl
;
4309 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4311 string k
= make_purged_snap_epoch_key(m
->start
);
4312 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
4314 unsigned long epoch
= m
->last
;
4315 while (it
->valid()) {
4316 if (it
->key().find("purged_epoch_") != 0) {
4319 string k
= it
->key();
4320 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4322 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
4323 } else if (epoch
> m
->last
) {
4326 bufferlist bl
= it
->value();
4327 auto p
= bl
.cbegin();
4331 } catch (ceph::buffer::error
& e
) {
4332 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
4337 n
+= 4 + v
.size() * 16;
4340 // impose a semi-arbitrary limit to message size
4346 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4347 reply
->purged_snaps
.swap(r
);
4348 mon
.send_reply(op
, reply
.detach());
4354 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4356 op
->mark_osdmon_event(__func__
);
4358 auto session
= op
->get_session();
4361 dout(10) << __func__
<< " no monitor session!" << dendl
;
4364 if (!session
->is_capable("osd", MON_CAP_X
)) {
4365 derr
<< __func__
<< " received from entity "
4366 << "with insufficient privileges " << session
->caps
<< dendl
;
4369 // Always forward the beacon to the leader, even if they are the same as
4370 // the old one. The leader will mark as down osds that haven't sent
4371 // beacon for a few minutes.
4375 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4377 op
->mark_osdmon_event(__func__
);
4378 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4379 const auto src
= beacon
->get_orig_source();
4380 dout(10) << __func__
<< " " << *beacon
4381 << " from " << src
<< dendl
;
4382 int from
= src
.num();
4384 if (!src
.is_osd() ||
4385 !osdmap
.is_up(from
) ||
4386 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4387 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4388 // share some new maps with this guy in case it may not be
4389 // aware of its own deadness...
4390 send_latest(op
, beacon
->version
+1);
4392 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4396 last_osd_report
[from
].first
= ceph_clock_now();
4397 last_osd_report
[from
].second
= beacon
->osd_beacon_report_interval
;
4398 osd_epochs
[from
] = beacon
->version
;
4400 for (const auto& pg
: beacon
->pgs
) {
4401 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
4404 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4405 beacon
->last_purged_snaps_scrub
) {
4406 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4407 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4409 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4410 beacon
->last_purged_snaps_scrub
;
4420 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4422 op
->mark_osdmon_event(__func__
);
4423 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4424 << " start " << start
<< dendl
;
4428 send_incremental(op
, start
);
4432 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
4434 MOSDMap
*r
= new MOSDMap(mon
.monmap
->fsid
, features
);
4435 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4436 r
->oldest_map
= get_first_committed();
4437 r
->newest_map
= osdmap
.get_epoch();
4441 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4443 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4444 << std::hex
<< features
<< std::dec
<< dendl
;
4445 MOSDMap
*m
= new MOSDMap(mon
.monmap
->fsid
, features
);
4446 m
->oldest_map
= get_first_committed();
4447 m
->newest_map
= osdmap
.get_epoch();
4449 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4451 int err
= get_version(e
, features
, bl
);
4453 ceph_assert(bl
.length());
4454 // if (get_version(e, bl) > 0) {
4455 dout(20) << "build_incremental inc " << e
<< " "
4456 << bl
.length() << " bytes" << dendl
;
4457 m
->incremental_maps
[e
] = bl
;
4459 ceph_assert(err
== -ENOENT
);
4460 ceph_assert(!bl
.length());
4461 get_version_full(e
, features
, bl
);
4462 if (bl
.length() > 0) {
4463 //else if (get_version("full", e, bl) > 0) {
4464 dout(20) << "build_incremental full " << e
<< " "
4465 << bl
.length() << " bytes" << dendl
;
4468 ceph_abort(); // we should have all maps.
4475 void OSDMonitor::send_full(MonOpRequestRef op
)
4477 op
->mark_osdmon_event(__func__
);
4478 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
4479 mon
.send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4482 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4484 op
->mark_osdmon_event(__func__
);
4486 MonSession
*s
= op
->get_session();
4490 // oh, we can tell the other mon to do it
4491 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4493 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4494 r
->send_osdmap_first
= first
;
4495 s
->proxy_con
->send_message(r
);
4496 op
->mark_event("reply: send routed send_osdmap_first reply");
4499 send_incremental(first
, s
, false, op
);
4503 void OSDMonitor::send_incremental(epoch_t first
,
4504 MonSession
*session
,
4506 MonOpRequestRef req
)
4508 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4509 << " to " << session
->name
<< dendl
;
4511 // get feature of the peer
4512 // use quorum_con_features, if it's an anonymous connection.
4513 uint64_t features
= session
->con_features
? session
->con_features
:
4514 mon
.get_quorum_con_features();
4516 if (first
<= session
->osd_epoch
) {
4517 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4518 << session
->osd_epoch
<< dendl
;
4519 first
= session
->osd_epoch
+ 1;
4522 if (first
< get_first_committed()) {
4523 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4524 m
->oldest_map
= get_first_committed();
4525 m
->newest_map
= osdmap
.get_epoch();
4527 first
= get_first_committed();
4529 int err
= get_version_full(first
, features
, bl
);
4530 ceph_assert(err
== 0);
4531 ceph_assert(bl
.length());
4532 dout(20) << "send_incremental starting with base full "
4533 << first
<< " " << bl
.length() << " bytes" << dendl
;
4534 m
->maps
[first
] = bl
;
4537 mon
.send_reply(req
, m
);
4538 session
->osd_epoch
= first
;
4541 session
->con
->send_message(m
);
4542 session
->osd_epoch
= first
;
4547 while (first
<= osdmap
.get_epoch()) {
4548 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4549 osdmap
.get_epoch());
4550 MOSDMap
*m
= build_incremental(first
, last
, features
);
4553 // send some maps. it may not be all of them, but it will get them
4555 mon
.send_reply(req
, m
);
4557 session
->con
->send_message(m
);
4560 session
->osd_epoch
= last
;
4566 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4568 return get_version(ver
, mon
.get_quorum_con_features(), bl
);
4571 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4573 OSDMap::Incremental inc
;
4574 auto q
= bl
.cbegin();
4576 // always encode with subset of osdmap's canonical features
4577 uint64_t f
= features
& inc
.encode_features
;
4578 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4581 if (inc
.fullmap
.length()) {
4582 // embedded full map?
4584 m
.decode(inc
.fullmap
);
4585 inc
.fullmap
.clear();
4586 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4588 if (inc
.crush
.length()) {
4589 // embedded crush map
4591 auto p
= inc
.crush
.cbegin();
4594 c
.encode(inc
.crush
, f
);
4596 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4599 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4602 auto q
= bl
.cbegin();
4604 // always encode with subset of osdmap's canonical features
4605 uint64_t f
= features
& m
.get_encoding_features();
4606 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4609 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4612 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
4614 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4615 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4618 int ret
= PaxosService::get_version(ver
, bl
);
4622 // NOTE: this check is imprecise; the OSDMap encoding features may
4623 // be a subset of the latest mon quorum features, but worst case we
4624 // reencode once and then cache the (identical) result under both
4626 if (significant_features
!=
4627 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4628 reencode_incremental_map(bl
, features
);
4630 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4634 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4637 int err
= get_version(ver
, inc_bl
);
4638 ceph_assert(err
== 0);
4639 ceph_assert(inc_bl
.length());
4641 auto p
= inc_bl
.cbegin();
4643 dout(10) << __func__
<< " "
4644 << " epoch " << inc
.epoch
4645 << " inc_crc " << inc
.inc_crc
4646 << " full_crc " << inc
.full_crc
4647 << " encode_features " << inc
.encode_features
<< dendl
;
4651 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4653 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4655 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4656 if (closest_pinned
== 0) {
4659 if (closest_pinned
> ver
) {
4660 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4662 ceph_assert(closest_pinned
<= ver
);
4664 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4666 // get osdmap incremental maps and apply on top of this one.
4668 bool has_cached_osdmap
= false;
4669 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4670 if (full_osd_cache
.lookup({v
, mon
.get_quorum_con_features()},
4672 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4674 has_cached_osdmap
= true;
4679 if (!has_cached_osdmap
) {
4680 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4682 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4683 << " not available! error: " << cpp_strerror(err
) << dendl
;
4685 ceph_assert(err
== 0);
4688 ceph_assert(osdm_bl
.length());
4691 osdm
.decode(osdm_bl
);
4693 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4694 << " e" << osdm
.epoch
4695 << " crc " << osdm
.get_crc()
4696 << " -- applying incremental maps." << dendl
;
4698 uint64_t encode_features
= 0;
4699 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4700 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4702 OSDMap::Incremental inc
;
4703 int err
= get_inc(v
, inc
);
4704 ceph_assert(err
== 0);
4706 encode_features
= inc
.encode_features
;
4708 err
= osdm
.apply_incremental(inc
);
4709 ceph_assert(err
== 0);
4711 // this block performs paranoid checks on map retrieval
4712 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4713 inc
.full_crc
!= 0) {
4715 uint64_t f
= encode_features
;
4717 f
= (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4720 // encode osdmap to force calculating crcs
4722 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4723 // decode osdmap to compare crcs with what's expected by incremental
4727 if (tosdm
.get_crc() != inc
.full_crc
) {
4729 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4730 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4731 ceph_abort_msg("osdmap crc mismatch");
4735 // note: we cannot add the recently computed map to the cache, as is,
4736 // because we have not encoded the map into a bl.
4739 if (!encode_features
) {
4740 dout(10) << __func__
4741 << " last incremental map didn't have features;"
4742 << " defaulting to quorum's or all" << dendl
;
4744 (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4746 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
4751 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4753 return get_version_full(ver
, mon
.get_quorum_con_features(), bl
);
4756 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4759 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4760 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4763 int ret
= PaxosService::get_version_full(ver
, bl
);
4764 if (ret
== -ENOENT
) {
4766 ret
= get_full_from_pinned_map(ver
, bl
);
4771 // NOTE: this check is imprecise; the OSDMap encoding features may
4772 // be a subset of the latest mon quorum features, but worst case we
4773 // reencode once and then cache the (identical) result under both
4775 if (significant_features
!=
4776 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4777 reencode_full_map(bl
, features
);
4779 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4783 epoch_t
OSDMonitor::blocklist(const entity_addrvec_t
& av
, utime_t until
)
4785 dout(10) << "blocklist " << av
<< " until " << until
<< dendl
;
4786 for (auto a
: av
.v
) {
4787 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4788 a
.set_type(entity_addr_t::TYPE_ANY
);
4790 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4792 pending_inc
.new_blocklist
[a
] = until
;
4794 return pending_inc
.epoch
;
4797 epoch_t
OSDMonitor::blocklist(entity_addr_t a
, utime_t until
)
4799 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4800 a
.set_type(entity_addr_t::TYPE_ANY
);
4802 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4804 dout(10) << "blocklist " << a
<< " until " << until
<< dendl
;
4805 pending_inc
.new_blocklist
[a
] = until
;
4806 return pending_inc
.epoch
;
4810 void OSDMonitor::check_osdmap_subs()
4812 dout(10) << __func__
<< dendl
;
4813 if (!osdmap
.get_epoch()) {
4816 auto osdmap_subs
= mon
.session_map
.subs
.find("osdmap");
4817 if (osdmap_subs
== mon
.session_map
.subs
.end()) {
4820 auto p
= osdmap_subs
->second
->begin();
4824 check_osdmap_sub(sub
);
4828 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4830 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4831 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4832 if (sub
->next
<= osdmap
.get_epoch()) {
4834 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4836 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4838 mon
.session_map
.remove_sub(sub
);
4840 sub
->next
= osdmap
.get_epoch() + 1;
4844 void OSDMonitor::check_pg_creates_subs()
4846 if (!osdmap
.get_num_up_osds()) {
4849 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4850 mon
.with_session_map([this](const MonSessionMap
& session_map
) {
4851 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4852 if (pg_creates_subs
== session_map
.subs
.end()) {
4855 for (auto sub
: *pg_creates_subs
->second
) {
4856 check_pg_creates_sub(sub
);
4861 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4863 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4864 ceph_assert(sub
->type
== "osd_pg_creates");
4865 // only send these if the OSD is up. we will check_subs() when they do
4866 // come up so they will get the creates then.
4867 if (sub
->session
->name
.is_osd() &&
4868 mon
.osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4869 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4870 sub
->session
->con
.get(),
4875 void OSDMonitor::do_application_enable(int64_t pool_id
,
4876 const std::string
&app_name
,
4877 const std::string
&app_key
,
4878 const std::string
&app_value
,
4881 ceph_assert(paxos
.is_plugged() && is_writeable());
4883 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4886 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4888 auto pp
= osdmap
.get_pg_pool(pool_id
);
4889 ceph_assert(pp
!= nullptr);
4892 if (pending_inc
.new_pools
.count(pool_id
)) {
4893 p
= pending_inc
.new_pools
[pool_id
];
4896 if (app_key
.empty()) {
4897 p
.application_metadata
.insert({app_name
, {}});
4900 p
.application_metadata
[app_name
][app_key
] = app_value
;
4902 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4905 p
.last_change
= pending_inc
.epoch
;
4906 pending_inc
.new_pools
[pool_id
] = p
;
4909 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4910 pool_opts_t::key_t opt
,
4911 pool_opts_t::value_t val
)
4913 auto p
= pending_inc
.new_pools
.try_emplace(
4914 pool_id
, *osdmap
.get_pg_pool(pool_id
));
4915 p
.first
->second
.opts
.set(opt
, val
);
4918 unsigned OSDMonitor::scan_for_creating_pgs(
4919 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4920 const mempool::osdmap::set
<int64_t>& removed_pools
,
4922 creating_pgs_t
* creating_pgs
) const
4924 unsigned queued
= 0;
4925 for (auto& p
: pools
) {
4926 int64_t poolid
= p
.first
;
4927 if (creating_pgs
->created_pools
.count(poolid
)) {
4928 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4931 const pg_pool_t
& pool
= p
.second
;
4932 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
4933 pool
.get_type(), pool
.get_size());
4934 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4937 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4938 const auto created
= pool
.get_last_change();
4939 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4940 dout(10) << __func__
<< " no change in pool " << poolid
4941 << " " << pool
<< dendl
;
4944 if (removed_pools
.count(poolid
)) {
4945 dout(10) << __func__
<< " pool is being removed: " << poolid
4946 << " " << pool
<< dendl
;
4949 dout(10) << __func__
<< " queueing pool create for " << poolid
4950 << " " << pool
<< dendl
;
4951 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
4958 void OSDMonitor::update_creating_pgs()
4960 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4961 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4962 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
4963 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4964 for (const auto& pg
: creating_pgs
.pgs
) {
4965 int acting_primary
= -1;
4966 auto pgid
= pg
.first
;
4967 if (!osdmap
.pg_exists(pgid
)) {
4968 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4972 auto mapped
= pg
.second
.create_epoch
;
4973 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4975 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4976 // check the previous creating_pgs, look for the target to whom the pg was
4977 // previously mapped
4978 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
4979 const auto last_acting_primary
= pgs_by_epoch
.first
;
4980 for (auto& pgs
: pgs_by_epoch
.second
) {
4981 if (pgs
.second
.count(spgid
)) {
4982 if (last_acting_primary
== acting_primary
) {
4985 dout(20) << __func__
<< " " << pgid
<< " "
4986 << " acting_primary:" << last_acting_primary
4987 << " -> " << acting_primary
<< dendl
;
4988 // note epoch if the target of the create message changed.
4989 mapped
= mapping
.get_epoch();
4994 mapped
= mapping
.get_epoch();
4998 dout(10) << __func__
<< " will instruct osd." << acting_primary
4999 << " to create " << pgid
<< "@" << mapped
<< dendl
;
5000 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
5002 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
5003 creating_pgs_epoch
= mapping
.get_epoch();
5006 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
5008 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
5009 << " " << creating_pgs_by_osd_epoch
<< dendl
;
5010 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
5011 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
5012 dout(20) << __func__
5013 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
5014 // the subscribers will be updated when the mapping is completed anyway
5017 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
5018 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
5020 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
5022 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
5023 MOSDPGCreate2
*m
= nullptr;
5025 bool old
= osdmap
.require_osd_release
< ceph_release_t::nautilus
;
5028 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
5029 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
5030 auto epoch
= epoch_pgs
->first
;
5031 auto& pgs
= epoch_pgs
->second
;
5032 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5033 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
5035 for (auto& pg
: pgs
) {
5036 // Need the create time from the monitor using its clock to set
5037 // last_scrub_stamp upon pg creation.
5038 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
5039 ceph_assert(create
!= creating_pgs
.pgs
.end());
5042 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
5044 oldm
->mkpg
.emplace(pg
.pgid
,
5045 pg_create_t
{create
->second
.create_epoch
, pg
.pgid
, 0});
5046 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.create_stamp
);
5049 m
= new MOSDPGCreate2(creating_pgs_epoch
);
5051 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
5052 create
->second
.create_stamp
));
5053 if (create
->second
.history
.epoch_created
) {
5054 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
5055 << " " << create
->second
.past_intervals
<< dendl
;
5056 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
5057 create
->second
.past_intervals
));
5060 dout(20) << __func__
<< " will create " << pg
5061 << " at " << create
->second
.create_epoch
<< dendl
;
5065 con
->send_message(m
);
5067 con
->send_message(oldm
);
5069 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5070 << " has nothing to send" << dendl
;
5074 // sub is current through last + 1
5081 void OSDMonitor::tick()
5083 if (!is_active()) return;
5085 dout(10) << osdmap
<< dendl
;
5087 // always update osdmap manifest, regardless of being the leader.
5088 load_osdmap_manifest();
5090 // always tune priority cache manager memory on leader and peons
5091 if (ceph_using_tcmalloc() && mon_memory_autotune
) {
5092 std::lock_guard
l(balancer_lock
);
5093 if (pcm
!= nullptr) {
5096 _set_new_cache_sizes();
5097 dout(10) << "tick balancer "
5098 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
5099 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
5100 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
5101 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
5103 dout(10) << "tick balancer "
5104 << " full cache_bytes: " << full_cache
->get_cache_bytes()
5105 << " full comtd_bytes: " << full_cache
->get_committed_size()
5106 << " full used_bytes: " << full_cache
->_get_used_bytes()
5107 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
5112 if (!mon
.is_leader()) return;
5114 bool do_propose
= false;
5115 utime_t now
= ceph_clock_now();
5117 if (handle_osd_timeouts(now
, last_osd_report
)) {
5122 if (check_failures(now
)) {
5126 // Force a proposal if we need to prune; pruning is performed on
5127 // ``encode_pending()``, hence why we need to regularly trigger a proposal
5128 // even if there's nothing going on.
5129 if (is_prune_enabled() && should_prune()) {
5133 // mark down osds out?
5135 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
5136 * influence at all. The decision is made based on the ratio of "in" osds,
5137 * and the function returns false if this ratio is lower that the minimum
5138 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
5140 if (can_mark_out(-1)) {
5141 string down_out_subtree_limit
= g_conf().get_val
<string
>(
5142 "mon_osd_down_out_subtree_limit");
5143 set
<int> down_cache
; // quick cache of down subtrees
5145 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
5146 while (i
!= down_pending_out
.end()) {
5152 if (osdmap
.is_down(o
) &&
5155 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
5156 utime_t grace
= orig_grace
;
5157 double my_grace
= 0.0;
5159 if (g_conf()->mon_osd_adjust_down_out_interval
) {
5160 // scale grace period the same way we do the heartbeat grace.
5161 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
5162 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
5163 double decay_k
= ::log(.5) / halflife
;
5164 double decay
= exp((double)down
* decay_k
);
5165 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
5166 << " down for " << down
<< " decay " << decay
<< dendl
;
5167 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
5171 // is this an entire large subtree down?
5172 if (down_out_subtree_limit
.length()) {
5173 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
5175 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
5176 dout(10) << "tick entire containing " << down_out_subtree_limit
5177 << " subtree for osd." << o
5178 << " is down; resetting timer" << dendl
;
5179 // reset timer, too.
5180 down_pending_out
[o
] = now
;
5186 bool down_out
= !osdmap
.is_destroyed(o
) &&
5187 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5188 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5189 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5190 // this is not precise enough as we did not make a note when this osd
5191 // was marked as destroyed, but let's not bother with that
5192 // complexity for now.
5193 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5194 if (down_out
|| destroyed_out
) {
5195 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5196 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5197 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5199 // set the AUTOOUT bit.
5200 if (pending_inc
.new_state
.count(o
) == 0)
5201 pending_inc
.new_state
[o
] = 0;
5202 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5204 // remember previous weight
5205 if (pending_inc
.new_xinfo
.count(o
) == 0)
5206 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5207 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5211 mon
.clog
->info() << "Marking osd." << o
<< " out (has been down for "
5212 << int(down
.sec()) << " seconds)";
5217 down_pending_out
.erase(o
);
5220 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5223 // expire blocklisted items?
5224 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
5225 p
!= osdmap
.blocklist
.end();
5227 if (p
->second
< now
) {
5228 dout(10) << "expiring blocklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5229 pending_inc
.old_blocklist
.push_back(p
->first
);
5234 if (try_prune_purged_snaps()) {
5238 if (update_pools_status())
5242 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
5246 void OSDMonitor::_set_new_cache_sizes()
5248 uint64_t cache_size
= 0;
5249 int64_t inc_alloc
= 0;
5250 int64_t full_alloc
= 0;
5251 int64_t kv_alloc
= 0;
5253 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5254 cache_size
= pcm
->get_tuned_mem();
5255 inc_alloc
= inc_cache
->get_committed_size();
5256 full_alloc
= full_cache
->get_committed_size();
5257 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
5260 inc_osd_cache
.set_bytes(inc_alloc
);
5261 full_osd_cache
.set_bytes(full_alloc
);
5263 dout(1) << __func__
<< " cache_size:" << cache_size
5264 << " inc_alloc: " << inc_alloc
5265 << " full_alloc: " << full_alloc
5266 << " kv_alloc: " << kv_alloc
5270 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5271 std::map
<int, std::pair
<utime_t
, int>> &last_osd_report
)
5273 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
5274 if (now
- mon
.get_leader_since() < timeo
) {
5275 // We haven't been the leader for long enough to consider OSD timeouts
5279 int max_osd
= osdmap
.get_max_osd();
5280 bool new_down
= false;
5282 for (int i
=0; i
< max_osd
; ++i
) {
5283 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5284 if (!osdmap
.exists(i
)) {
5285 last_osd_report
.erase(i
); // if any
5288 if (!osdmap
.is_up(i
))
5290 const std::map
<int, std::pair
<utime_t
, int>>::const_iterator t
= last_osd_report
.find(i
);
5291 if (t
== last_osd_report
.end()) {
5292 // it wasn't in the map; start the timer.
5293 last_osd_report
[i
].first
= now
;
5294 last_osd_report
[i
].second
= 0;
5295 } else if (can_mark_down(i
)) {
5296 utime_t diff
= now
- t
->second
.first
;
5297 // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
5298 // to allow for the osd to miss a beacon.
5299 int mon_osd_report_timeout
= g_conf()->mon_osd_report_timeout
;
5300 utime_t
max_timeout(std::max(mon_osd_report_timeout
, 2 * t
->second
.second
), 0);
5301 if (diff
> max_timeout
) {
5302 mon
.clog
->info() << "osd." << i
<< " marked down after no beacon for "
5303 << diff
<< " seconds";
5304 derr
<< "no beacon from osd." << i
<< " since " << t
->second
.first
5305 << ", " << diff
<< " seconds ago. marking down" << dendl
;
5306 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
5314 static void dump_cpu_list(Formatter
*f
, const char *name
,
5315 const string
& strlist
)
5318 size_t cpu_set_size
;
5319 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
5322 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5323 f
->open_array_section(name
);
5324 for (auto cpu
: cpus
) {
5325 f
->dump_int("cpu", cpu
);
5330 void OSDMonitor::dump_info(Formatter
*f
)
5332 f
->open_object_section("osdmap");
5336 f
->open_array_section("osd_metadata");
5337 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5338 if (osdmap
.exists(i
)) {
5339 f
->open_object_section("osd");
5340 f
->dump_unsigned("id", i
);
5341 dump_osd_metadata(i
, f
, NULL
);
5347 f
->open_object_section("osdmap_clean_epochs");
5348 f
->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5350 f
->open_object_section("last_epoch_clean");
5351 last_epoch_clean
.dump(f
);
5354 f
->open_array_section("osd_epochs");
5355 for (auto& osd_epoch
: osd_epochs
) {
5356 f
->open_object_section("osd");
5357 f
->dump_unsigned("id", osd_epoch
.first
);
5358 f
->dump_unsigned("epoch", osd_epoch
.second
);
5361 f
->close_section(); // osd_epochs
5363 f
->close_section(); // osd_clean_epochs
5365 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5366 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5368 f
->open_object_section("crushmap");
5369 osdmap
.crush
->dump(f
);
5372 if (has_osdmap_manifest
) {
5373 f
->open_object_section("osdmap_manifest");
5374 osdmap_manifest
.dump(f
);
5380 enum osd_pool_get_choices
{
5382 PG_NUM
, PGP_NUM
, CRUSH_RULE
, HASHPSPOOL
, EC_OVERWRITES
,
5383 NODELETE
, NOPGCHANGE
, NOSIZECHANGE
,
5384 WRITE_FADVISE_DONTNEED
, NOSCRUB
, NODEEP_SCRUB
,
5385 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5386 USE_GMT_HITSET
, TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
,
5387 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5388 CACHE_TARGET_FULL_RATIO
,
5389 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5390 ERASURE_CODE_PROFILE
, MIN_READ_RECENCY_FOR_PROMOTE
,
5391 MIN_WRITE_RECENCY_FOR_PROMOTE
, FAST_READ
,
5392 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
,
5393 SCRUB_MIN_INTERVAL
, SCRUB_MAX_INTERVAL
, DEEP_SCRUB_INTERVAL
,
5394 RECOVERY_PRIORITY
, RECOVERY_OP_PRIORITY
, SCRUB_PRIORITY
,
5395 COMPRESSION_MODE
, COMPRESSION_ALGORITHM
, COMPRESSION_REQUIRED_RATIO
,
5396 COMPRESSION_MAX_BLOB_SIZE
, COMPRESSION_MIN_BLOB_SIZE
,
5397 CSUM_TYPE
, CSUM_MAX_BLOCK
, CSUM_MIN_BLOCK
, FINGERPRINT_ALGORITHM
,
5398 PG_AUTOSCALE_MODE
, PG_NUM_MIN
, TARGET_SIZE_BYTES
, TARGET_SIZE_RATIO
,
5399 PG_AUTOSCALE_BIAS
, DEDUP_TIER
, DEDUP_CHUNK_ALGORITHM
,
5400 DEDUP_CDC_CHUNK_SIZE
};
5402 std::set
<osd_pool_get_choices
>
5403 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5404 const std::set
<osd_pool_get_choices
>& second
)
5406 std::set
<osd_pool_get_choices
> result
;
5407 std::set_difference(first
.begin(), first
.end(),
5408 second
.begin(), second
.end(),
5409 std::inserter(result
, result
.end()));
5415 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5417 op
->mark_osdmon_event(__func__
);
5418 auto m
= op
->get_req
<MMonCommand
>();
5421 stringstream ss
, ds
;
5424 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5425 string rs
= ss
.str();
5426 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
5430 MonSession
*session
= op
->get_session();
5432 derr
<< __func__
<< " no session" << dendl
;
5433 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
5438 cmd_getval(cmdmap
, "prefix", prefix
);
5441 cmd_getval(cmdmap
, "format", format
, string("plain"));
5442 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5444 if (prefix
== "osd stat") {
5446 f
->open_object_section("osdmap");
5447 osdmap
.print_summary(f
.get(), ds
, "", true);
5451 osdmap
.print_summary(nullptr, ds
, "", true);
5455 else if (prefix
== "osd dump" ||
5456 prefix
== "osd tree" ||
5457 prefix
== "osd tree-from" ||
5458 prefix
== "osd ls" ||
5459 prefix
== "osd getmap" ||
5460 prefix
== "osd getcrushmap" ||
5461 prefix
== "osd ls-tree" ||
5462 prefix
== "osd info") {
5466 cmd_getval(cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
5469 bufferlist osdmap_bl
;
5470 int err
= get_version_full(epoch
, osdmap_bl
);
5471 if (err
== -ENOENT
) {
5473 ss
<< "there is no map for epoch " << epoch
;
5476 ceph_assert(err
== 0);
5477 ceph_assert(osdmap_bl
.length());
5480 if (epoch
== osdmap
.get_epoch()) {
5484 p
->decode(osdmap_bl
);
5487 auto sg
= make_scope_guard([&] {
5493 if (prefix
== "osd dump") {
5496 f
->open_object_section("osdmap");
5506 } else if (prefix
== "osd ls") {
5508 f
->open_array_section("osds");
5509 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5510 if (osdmap
.exists(i
)) {
5511 f
->dump_int("osd", i
);
5518 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5519 if (osdmap
.exists(i
)) {
5528 } else if (prefix
== "osd info") {
5530 bool do_single_osd
= true;
5531 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5532 do_single_osd
= false;
5535 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5536 ss
<< "osd." << osd_id
<< " does not exist";
5542 if (do_single_osd
) {
5543 osdmap
.dump_osd(osd_id
, f
.get());
5545 osdmap
.dump_osds(f
.get());
5549 if (do_single_osd
) {
5550 osdmap
.print_osd(osd_id
, ds
);
5552 osdmap
.print_osds(ds
);
5556 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5558 if (prefix
== "osd tree-from") {
5559 cmd_getval(cmdmap
, "bucket", bucket
);
5560 if (!osdmap
.crush
->name_exists(bucket
)) {
5561 ss
<< "bucket '" << bucket
<< "' does not exist";
5565 int id
= osdmap
.crush
->get_item_id(bucket
);
5567 ss
<< "\"" << bucket
<< "\" is not a bucket";
5573 vector
<string
> states
;
5574 cmd_getval(cmdmap
, "states", states
);
5575 unsigned filter
= 0;
5576 for (auto& s
: states
) {
5578 filter
|= OSDMap::DUMP_UP
;
5579 } else if (s
== "down") {
5580 filter
|= OSDMap::DUMP_DOWN
;
5581 } else if (s
== "in") {
5582 filter
|= OSDMap::DUMP_IN
;
5583 } else if (s
== "out") {
5584 filter
|= OSDMap::DUMP_OUT
;
5585 } else if (s
== "destroyed") {
5586 filter
|= OSDMap::DUMP_DESTROYED
;
5588 ss
<< "unrecognized state '" << s
<< "'";
5593 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5594 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5595 ss
<< "cannot specify both 'in' and 'out'";
5599 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5600 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5601 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5602 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5603 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5604 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5605 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5610 f
->open_object_section("tree");
5611 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5615 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5618 } else if (prefix
== "osd getmap") {
5619 rdata
.append(osdmap_bl
);
5620 ss
<< "got osdmap epoch " << p
->get_epoch();
5621 } else if (prefix
== "osd getcrushmap") {
5622 p
->crush
->encode(rdata
, mon
.get_quorum_con_features());
5623 ss
<< p
->get_crush_version();
5624 } else if (prefix
== "osd ls-tree") {
5626 cmd_getval(cmdmap
, "name", bucket_name
);
5628 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5630 ss
<< "\"" << bucket_name
<< "\" does not exist";
5633 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5638 f
->open_array_section("osds");
5639 for (auto &i
: osds
) {
5640 if (osdmap
.exists(i
)) {
5641 f
->dump_int("osd", i
);
5648 for (auto &i
: osds
) {
5649 if (osdmap
.exists(i
)) {
5660 } else if (prefix
== "osd getmaxosd") {
5662 f
->open_object_section("getmaxosd");
5663 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5664 f
->dump_int("max_osd", osdmap
.get_max_osd());
5668 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5671 } else if (prefix
== "osd utilization") {
5673 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5680 } else if (prefix
== "osd find") {
5682 if (!cmd_getval(cmdmap
, "id", osd
)) {
5683 ss
<< "unable to parse osd id value '"
5684 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5688 if (!osdmap
.exists(osd
)) {
5689 ss
<< "osd." << osd
<< " does not exist";
5694 cmd_getval(cmdmap
, "format", format
);
5695 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5696 f
->open_object_section("osd_location");
5697 f
->dump_int("osd", osd
);
5698 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5699 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5701 // try to identify host, pod/container name, etc.
5702 map
<string
,string
> m
;
5703 load_metadata(osd
, m
, nullptr);
5704 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5705 f
->dump_string("host", p
->second
);
5708 "pod_name", "pod_namespace", // set by rook
5709 "container_name" // set by cephadm, ceph-ansible
5711 if (auto p
= m
.find(k
); p
!= m
.end()) {
5712 f
->dump_string(k
, p
->second
);
5716 // crush is helpful too
5717 f
->open_object_section("crush_location");
5718 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5719 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5720 f
->dump_string(p
->first
.c_str(), p
->second
);
5724 } else if (prefix
== "osd metadata") {
5726 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5727 !cmd_getval(cmdmap
, "id", osd
)) {
5728 ss
<< "unable to parse osd id value '"
5729 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5733 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5734 ss
<< "osd." << osd
<< " does not exist";
5739 cmd_getval(cmdmap
, "format", format
);
5740 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5742 f
->open_object_section("osd_metadata");
5743 f
->dump_unsigned("id", osd
);
5744 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5750 f
->open_array_section("osd_metadata");
5751 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5752 if (osdmap
.exists(i
)) {
5753 f
->open_object_section("osd");
5754 f
->dump_unsigned("id", i
);
5755 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5756 if (r
== -EINVAL
|| r
== -ENOENT
) {
5757 // Drop error, continue to get other daemons' metadata
5758 dout(4) << "No metadata for osd." << i
<< dendl
;
5770 } else if (prefix
== "osd versions") {
5772 f
.reset(Formatter::create("json-pretty"));
5773 count_metadata("ceph_version", f
.get());
5776 } else if (prefix
== "osd count-metadata") {
5778 f
.reset(Formatter::create("json-pretty"));
5780 cmd_getval(cmdmap
, "property", field
);
5781 count_metadata(field
, f
.get());
5784 } else if (prefix
== "osd numa-status") {
5787 f
->open_array_section("osds");
5789 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5790 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5791 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5792 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5793 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5794 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5796 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5797 if (osdmap
.exists(i
)) {
5798 map
<string
,string
> m
;
5800 if (load_metadata(i
, m
, &err
) < 0) {
5804 auto p
= m
.find("hostname");
5809 f
->open_object_section("osd");
5810 f
->dump_int("osd", i
);
5811 f
->dump_string("host", host
);
5812 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5816 f
->dump_int(n
, atoi(p
->second
.c_str()));
5819 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5822 list
<string
> ls
= get_str_list(p
->second
, ",");
5823 f
->open_array_section(n
);
5824 for (auto node
: ls
) {
5825 f
->dump_int("node", atoi(node
.c_str()));
5830 for (auto n
: { "numa_node_cpus" }) {
5833 dump_cpu_list(f
.get(), n
, p
->second
);
5840 p
= m
.find("network_numa_nodes");
5846 p
= m
.find("objectstore_numa_nodes");
5852 p
= m
.find("numa_node");
5853 auto q
= m
.find("numa_node_cpus");
5854 if (p
!= m
.end() && q
!= m
.end()) {
5861 tbl
<< TextTable::endrow
;
5869 rdata
.append(stringify(tbl
));
5871 } else if (prefix
== "osd map") {
5872 string poolstr
, objstr
, namespacestr
;
5873 cmd_getval(cmdmap
, "pool", poolstr
);
5874 cmd_getval(cmdmap
, "object", objstr
);
5875 cmd_getval(cmdmap
, "nspace", namespacestr
);
5877 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5879 ss
<< "pool " << poolstr
<< " does not exist";
5883 object_locator_t
oloc(pool
, namespacestr
);
5884 object_t
oid(objstr
);
5885 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5886 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5887 vector
<int> up
, acting
;
5889 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5892 if (!namespacestr
.empty())
5893 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5895 fullobjname
= oid
.name
;
5897 f
->open_object_section("osd_map");
5898 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5899 f
->dump_string("pool", poolstr
);
5900 f
->dump_int("pool_id", pool
);
5901 f
->dump_stream("objname") << fullobjname
;
5902 f
->dump_stream("raw_pgid") << pgid
;
5903 f
->dump_stream("pgid") << mpgid
;
5904 f
->open_array_section("up");
5905 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5906 f
->dump_int("osd", *p
);
5908 f
->dump_int("up_primary", up_p
);
5909 f
->open_array_section("acting");
5910 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5911 f
->dump_int("osd", *p
);
5913 f
->dump_int("acting_primary", acting_p
);
5914 f
->close_section(); // osd_map
5917 ds
<< "osdmap e" << osdmap
.get_epoch()
5918 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5919 << " object '" << fullobjname
<< "' ->"
5920 << " pg " << pgid
<< " (" << mpgid
<< ")"
5921 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5922 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5926 } else if (prefix
== "pg map") {
5929 cmd_getval(cmdmap
, "pgid", pgidstr
);
5930 if (!pgid
.parse(pgidstr
.c_str())) {
5931 ss
<< "invalid pgid '" << pgidstr
<< "'";
5935 vector
<int> up
, acting
;
5936 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5937 ss
<< "pg '" << pgidstr
<< "' does not exist";
5941 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5942 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5944 f
->open_object_section("pg_map");
5945 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5946 f
->dump_stream("raw_pgid") << pgid
;
5947 f
->dump_stream("pgid") << mpgid
;
5948 f
->open_array_section("up");
5949 for (auto osd
: up
) {
5950 f
->dump_int("up_osd", osd
);
5953 f
->open_array_section("acting");
5954 for (auto osd
: acting
) {
5955 f
->dump_int("acting_osd", osd
);
5961 ds
<< "osdmap e" << osdmap
.get_epoch()
5962 << " pg " << pgid
<< " (" << mpgid
<< ")"
5963 << " -> up " << up
<< " acting " << acting
;
5968 } else if (prefix
== "osd lspools") {
5970 f
->open_array_section("pools");
5971 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
5972 p
!= osdmap
.pools
.end();
5975 f
->open_object_section("pool");
5976 f
->dump_int("poolnum", p
->first
);
5977 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
5980 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
5981 if (next(p
) != osdmap
.pools
.end()) {
5991 } else if (prefix
== "osd blocklist ls" ||
5992 prefix
== "osd blacklist ls") {
5994 f
->open_array_section("blocklist");
5996 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
5997 p
!= osdmap
.blocklist
.end();
6000 f
->open_object_section("entry");
6001 f
->dump_string("addr", p
->first
.get_legacy_str());
6002 f
->dump_stream("until") << p
->second
;
6007 ss
<< p
->first
<< " " << p
->second
;
6017 ss
<< "listed " << osdmap
.blocklist
.size() << " entries";
6019 } else if (prefix
== "osd pool ls") {
6021 cmd_getval(cmdmap
, "detail", detail
);
6022 if (!f
&& detail
== "detail") {
6024 osdmap
.print_pools(ss
);
6025 rdata
.append(ss
.str());
6028 f
->open_array_section("pools");
6029 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
6030 it
!= osdmap
.get_pools().end();
6033 if (detail
== "detail") {
6034 f
->open_object_section("pool");
6035 f
->dump_int("pool_id", it
->first
);
6036 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
6037 it
->second
.dump(f
.get());
6040 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
6043 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
6052 } else if (prefix
== "osd crush get-tunable") {
6054 cmd_getval(cmdmap
, "tunable", tunable
);
6057 f
->open_object_section("tunable");
6058 if (tunable
== "straw_calc_version") {
6060 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
6062 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
6071 rdata
.append(rss
.str());
6075 } else if (prefix
== "osd pool get") {
6077 cmd_getval(cmdmap
, "pool", poolstr
);
6078 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
6080 ss
<< "unrecognized pool '" << poolstr
<< "'";
6085 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
6087 cmd_getval(cmdmap
, "var", var
);
6089 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
6090 const choices_map_t ALL_CHOICES
= {
6092 {"min_size", MIN_SIZE
},
6093 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
6094 {"crush_rule", CRUSH_RULE
}, {"hashpspool", HASHPSPOOL
},
6095 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
6096 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
6097 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
6098 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
6099 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
6100 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
6101 {"use_gmt_hitset", USE_GMT_HITSET
},
6102 {"target_max_objects", TARGET_MAX_OBJECTS
},
6103 {"target_max_bytes", TARGET_MAX_BYTES
},
6104 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
6105 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
6106 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
6107 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
6108 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
6109 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
6110 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
6111 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
6112 {"fast_read", FAST_READ
},
6113 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
6114 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
6115 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
6116 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
6117 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
6118 {"recovery_priority", RECOVERY_PRIORITY
},
6119 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
6120 {"scrub_priority", SCRUB_PRIORITY
},
6121 {"compression_mode", COMPRESSION_MODE
},
6122 {"compression_algorithm", COMPRESSION_ALGORITHM
},
6123 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
6124 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
6125 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
6126 {"csum_type", CSUM_TYPE
},
6127 {"csum_max_block", CSUM_MAX_BLOCK
},
6128 {"csum_min_block", CSUM_MIN_BLOCK
},
6129 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
6130 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
6131 {"pg_num_min", PG_NUM_MIN
},
6132 {"target_size_bytes", TARGET_SIZE_BYTES
},
6133 {"target_size_ratio", TARGET_SIZE_RATIO
},
6134 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
6135 {"dedup_tier", DEDUP_TIER
},
6136 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM
},
6137 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE
},
6140 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
6142 const choices_set_t ONLY_TIER_CHOICES
= {
6143 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
6144 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
6145 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
6146 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
6147 MIN_READ_RECENCY_FOR_PROMOTE
,
6148 MIN_WRITE_RECENCY_FOR_PROMOTE
,
6149 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
6151 const choices_set_t ONLY_ERASURE_CHOICES
= {
6152 EC_OVERWRITES
, ERASURE_CODE_PROFILE
6155 choices_set_t selected_choices
;
6157 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
6158 it
!= ALL_CHOICES
.end(); ++it
) {
6159 selected_choices
.insert(it
->second
);
6163 selected_choices
= subtract_second_from_first(selected_choices
,
6167 if(!p
->is_erasure()) {
6168 selected_choices
= subtract_second_from_first(selected_choices
,
6169 ONLY_ERASURE_CHOICES
);
6171 } else /* var != "all" */ {
6172 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
6173 osd_pool_get_choices selected
= found
->second
;
6175 if (!p
->is_tier() &&
6176 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
6177 ss
<< "pool '" << poolstr
6178 << "' is not a tier pool: variable not applicable";
6183 if (!p
->is_erasure() &&
6184 ONLY_ERASURE_CHOICES
.find(selected
)
6185 != ONLY_ERASURE_CHOICES
.end()) {
6186 ss
<< "pool '" << poolstr
6187 << "' is not a erasure pool: variable not applicable";
6192 if (pool_opts_t::is_opt_name(var
) &&
6193 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6194 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6199 selected_choices
.insert(selected
);
6203 f
->open_object_section("pool");
6204 f
->dump_string("pool", poolstr
);
6205 f
->dump_int("pool_id", pool
);
6206 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6207 it
!= selected_choices
.end(); ++it
) {
6208 choices_map_t::const_iterator i
;
6209 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6210 if (i
->second
== *it
) {
6214 ceph_assert(i
!= ALL_CHOICES
.end());
6217 f
->dump_int("pg_num", p
->get_pg_num());
6220 f
->dump_int("pgp_num", p
->get_pgp_num());
6223 f
->dump_int("size", p
->get_size());
6226 f
->dump_int("min_size", p
->get_min_size());
6229 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6230 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6231 p
->get_crush_rule()));
6233 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6237 f
->dump_bool("allow_ec_overwrites",
6238 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6240 case PG_AUTOSCALE_MODE
:
6241 f
->dump_string("pg_autoscale_mode",
6242 pg_pool_t::get_pg_autoscale_mode_name(
6243 p
->pg_autoscale_mode
));
6249 case WRITE_FADVISE_DONTNEED
:
6252 f
->dump_bool(i
->first
.c_str(),
6253 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6255 case HIT_SET_PERIOD
:
6256 f
->dump_int("hit_set_period", p
->hit_set_period
);
6259 f
->dump_int("hit_set_count", p
->hit_set_count
);
6262 f
->dump_string("hit_set_type",
6263 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6267 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6268 BloomHitSet::Params
*bloomp
=
6269 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6270 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6271 } else if(var
!= "all") {
6273 ss
<< "hit set is not of type Bloom; " <<
6274 "invalid to get a false positive rate!";
6280 case USE_GMT_HITSET
:
6281 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6283 case TARGET_MAX_OBJECTS
:
6284 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6286 case TARGET_MAX_BYTES
:
6287 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6289 case CACHE_TARGET_DIRTY_RATIO
:
6290 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6291 p
->cache_target_dirty_ratio_micro
);
6292 f
->dump_float("cache_target_dirty_ratio",
6293 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6295 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6296 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6297 p
->cache_target_dirty_high_ratio_micro
);
6298 f
->dump_float("cache_target_dirty_high_ratio",
6299 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6301 case CACHE_TARGET_FULL_RATIO
:
6302 f
->dump_unsigned("cache_target_full_ratio_micro",
6303 p
->cache_target_full_ratio_micro
);
6304 f
->dump_float("cache_target_full_ratio",
6305 ((float)p
->cache_target_full_ratio_micro
/1000000));
6307 case CACHE_MIN_FLUSH_AGE
:
6308 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6310 case CACHE_MIN_EVICT_AGE
:
6311 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6313 case ERASURE_CODE_PROFILE
:
6314 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6316 case MIN_READ_RECENCY_FOR_PROMOTE
:
6317 f
->dump_int("min_read_recency_for_promote",
6318 p
->min_read_recency_for_promote
);
6320 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6321 f
->dump_int("min_write_recency_for_promote",
6322 p
->min_write_recency_for_promote
);
6325 f
->dump_int("fast_read", p
->fast_read
);
6327 case HIT_SET_GRADE_DECAY_RATE
:
6328 f
->dump_int("hit_set_grade_decay_rate",
6329 p
->hit_set_grade_decay_rate
);
6331 case HIT_SET_SEARCH_LAST_N
:
6332 f
->dump_int("hit_set_search_last_n",
6333 p
->hit_set_search_last_n
);
6335 case SCRUB_MIN_INTERVAL
:
6336 case SCRUB_MAX_INTERVAL
:
6337 case DEEP_SCRUB_INTERVAL
:
6338 case RECOVERY_PRIORITY
:
6339 case RECOVERY_OP_PRIORITY
:
6340 case SCRUB_PRIORITY
:
6341 case COMPRESSION_MODE
:
6342 case COMPRESSION_ALGORITHM
:
6343 case COMPRESSION_REQUIRED_RATIO
:
6344 case COMPRESSION_MAX_BLOB_SIZE
:
6345 case COMPRESSION_MIN_BLOB_SIZE
:
6347 case CSUM_MAX_BLOCK
:
6348 case CSUM_MIN_BLOCK
:
6349 case FINGERPRINT_ALGORITHM
:
6351 case TARGET_SIZE_BYTES
:
6352 case TARGET_SIZE_RATIO
:
6353 case PG_AUTOSCALE_BIAS
:
6355 case DEDUP_CHUNK_ALGORITHM
:
6356 case DEDUP_CDC_CHUNK_SIZE
:
6357 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6358 if (p
->opts
.is_set(key
)) {
6359 if(*it
== CSUM_TYPE
) {
6361 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6362 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6364 p
->opts
.dump(i
->first
, f
.get());
6373 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6374 it
!= selected_choices
.end(); ++it
) {
6375 choices_map_t::const_iterator i
;
6378 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6381 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6384 ss
<< "size: " << p
->get_size() << "\n";
6387 ss
<< "min_size: " << p
->get_min_size() << "\n";
6390 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6391 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6392 p
->get_crush_rule()) << "\n";
6394 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6397 case PG_AUTOSCALE_MODE
:
6398 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6399 p
->pg_autoscale_mode
) <<"\n";
6401 case HIT_SET_PERIOD
:
6402 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6405 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6408 ss
<< "hit_set_type: " <<
6409 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6413 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6414 BloomHitSet::Params
*bloomp
=
6415 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6416 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6417 } else if(var
!= "all") {
6418 ss
<< "hit set is not of type Bloom; " <<
6419 "invalid to get a false positive rate!";
6425 case USE_GMT_HITSET
:
6426 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6428 case TARGET_MAX_OBJECTS
:
6429 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6431 case TARGET_MAX_BYTES
:
6432 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6434 case CACHE_TARGET_DIRTY_RATIO
:
6435 ss
<< "cache_target_dirty_ratio: "
6436 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6438 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6439 ss
<< "cache_target_dirty_high_ratio: "
6440 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6442 case CACHE_TARGET_FULL_RATIO
:
6443 ss
<< "cache_target_full_ratio: "
6444 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6446 case CACHE_MIN_FLUSH_AGE
:
6447 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6449 case CACHE_MIN_EVICT_AGE
:
6450 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6452 case ERASURE_CODE_PROFILE
:
6453 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6455 case MIN_READ_RECENCY_FOR_PROMOTE
:
6456 ss
<< "min_read_recency_for_promote: " <<
6457 p
->min_read_recency_for_promote
<< "\n";
6459 case HIT_SET_GRADE_DECAY_RATE
:
6460 ss
<< "hit_set_grade_decay_rate: " <<
6461 p
->hit_set_grade_decay_rate
<< "\n";
6463 case HIT_SET_SEARCH_LAST_N
:
6464 ss
<< "hit_set_search_last_n: " <<
6465 p
->hit_set_search_last_n
<< "\n";
6468 ss
<< "allow_ec_overwrites: " <<
6469 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6476 case WRITE_FADVISE_DONTNEED
:
6479 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6480 if (i
->second
== *it
)
6483 ceph_assert(i
!= ALL_CHOICES
.end());
6484 ss
<< i
->first
<< ": " <<
6485 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6486 "true" : "false") << "\n";
6488 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6489 ss
<< "min_write_recency_for_promote: " <<
6490 p
->min_write_recency_for_promote
<< "\n";
6493 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6495 case SCRUB_MIN_INTERVAL
:
6496 case SCRUB_MAX_INTERVAL
:
6497 case DEEP_SCRUB_INTERVAL
:
6498 case RECOVERY_PRIORITY
:
6499 case RECOVERY_OP_PRIORITY
:
6500 case SCRUB_PRIORITY
:
6501 case COMPRESSION_MODE
:
6502 case COMPRESSION_ALGORITHM
:
6503 case COMPRESSION_REQUIRED_RATIO
:
6504 case COMPRESSION_MAX_BLOB_SIZE
:
6505 case COMPRESSION_MIN_BLOB_SIZE
:
6507 case CSUM_MAX_BLOCK
:
6508 case CSUM_MIN_BLOCK
:
6509 case FINGERPRINT_ALGORITHM
:
6511 case TARGET_SIZE_BYTES
:
6512 case TARGET_SIZE_RATIO
:
6513 case PG_AUTOSCALE_BIAS
:
6515 case DEDUP_CHUNK_ALGORITHM
:
6516 case DEDUP_CDC_CHUNK_SIZE
:
6517 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6518 if (i
->second
== *it
)
6521 ceph_assert(i
!= ALL_CHOICES
.end());
6523 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6524 if (p
->opts
.is_set(key
)) {
6525 if(key
== pool_opts_t::CSUM_TYPE
) {
6527 p
->opts
.get(key
, &val
);
6528 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6530 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6536 rdata
.append(ss
.str());
6541 } else if (prefix
== "osd pool get-quota") {
6543 cmd_getval(cmdmap
, "pool", pool_name
);
6545 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6547 ceph_assert(poolid
== -ENOENT
);
6548 ss
<< "unrecognized pool '" << pool_name
<< "'";
6552 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6553 const pool_stat_t
* pstat
= mon
.mgrstatmon()->get_pool_stat(poolid
);
6554 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6556 f
->open_object_section("pool_quotas");
6557 f
->dump_string("pool_name", pool_name
);
6558 f
->dump_unsigned("pool_id", poolid
);
6559 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6560 f
->dump_int("current_num_objects", sum
.num_objects
);
6561 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6562 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6567 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6568 << " max objects: ";
6569 if (p
->quota_max_objects
== 0)
6572 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6573 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6577 if (p
->quota_max_bytes
== 0)
6580 rs
<< byte_u_t(p
->quota_max_bytes
);
6581 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6583 rdata
.append(rs
.str());
6587 } else if (prefix
== "osd crush rule list" ||
6588 prefix
== "osd crush rule ls") {
6590 f
->open_array_section("rules");
6591 osdmap
.crush
->list_rules(f
.get());
6596 osdmap
.crush
->list_rules(&ss
);
6597 rdata
.append(ss
.str());
6599 } else if (prefix
== "osd crush rule ls-by-class") {
6601 cmd_getval(cmdmap
, "class", class_name
);
6602 if (class_name
.empty()) {
6603 ss
<< "no class specified";
6608 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6610 ss
<< "failed to get rules by class '" << class_name
<< "'";
6614 f
->open_array_section("rules");
6615 for (auto &rule
: rules
) {
6616 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6622 for (auto &rule
: rules
) {
6623 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6625 rdata
.append(rs
.str());
6627 } else if (prefix
== "osd crush rule dump") {
6629 cmd_getval(cmdmap
, "name", name
);
6631 cmd_getval(cmdmap
, "format", format
);
6632 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6634 f
->open_array_section("rules");
6635 osdmap
.crush
->dump_rules(f
.get());
6638 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6640 ss
<< "unknown crush rule '" << name
<< "'";
6644 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6649 rdata
.append(rs
.str());
6650 } else if (prefix
== "osd crush dump") {
6652 cmd_getval(cmdmap
, "format", format
);
6653 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6654 f
->open_object_section("crush_map");
6655 osdmap
.crush
->dump(f
.get());
6660 rdata
.append(rs
.str());
6661 } else if (prefix
== "osd crush show-tunables") {
6663 cmd_getval(cmdmap
, "format", format
);
6664 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6665 f
->open_object_section("crush_map_tunables");
6666 osdmap
.crush
->dump_tunables(f
.get());
6671 rdata
.append(rs
.str());
6672 } else if (prefix
== "osd crush tree") {
6674 cmd_getval(cmdmap
, "shadow", shadow
);
6675 bool show_shadow
= shadow
== "--show-shadow";
6676 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6678 f
->open_object_section("crush_tree");
6679 osdmap
.crush
->dump_tree(nullptr,
6681 osdmap
.get_pool_names(),
6687 osdmap
.crush
->dump_tree(&ss
,
6689 osdmap
.get_pool_names(),
6691 rdata
.append(ss
.str());
6693 } else if (prefix
== "osd crush ls") {
6695 if (!cmd_getval(cmdmap
, "node", name
)) {
6696 ss
<< "no node specified";
6700 if (!osdmap
.crush
->name_exists(name
)) {
6701 ss
<< "node '" << name
<< "' does not exist";
6705 int id
= osdmap
.crush
->get_item_id(name
);
6708 result
.push_back(id
);
6710 int num
= osdmap
.crush
->get_bucket_size(id
);
6711 for (int i
= 0; i
< num
; ++i
) {
6712 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6716 f
->open_array_section("items");
6717 for (auto i
: result
) {
6718 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6724 for (auto i
: result
) {
6725 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6727 rdata
.append(ss
.str());
6730 } else if (prefix
== "osd crush class ls") {
6731 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6732 f
->open_array_section("crush_classes");
6733 for (auto i
: osdmap
.crush
->class_name
)
6734 f
->dump_string("class", i
.second
);
6737 } else if (prefix
== "osd crush class ls-osd") {
6739 cmd_getval(cmdmap
, "class", name
);
6741 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6743 f
->open_array_section("osds");
6744 for (auto &osd
: osds
)
6745 f
->dump_int("osd", osd
);
6750 for (auto &osd
: osds
) {
6758 } else if (prefix
== "osd crush get-device-class") {
6759 vector
<string
> idvec
;
6760 cmd_getval(cmdmap
, "ids", idvec
);
6761 map
<int, string
> class_by_osd
;
6762 for (auto& id
: idvec
) {
6764 long osd
= parse_osd_id(id
.c_str(), &ts
);
6766 ss
<< "unable to parse osd id:'" << id
<< "'";
6770 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6772 class_by_osd
[osd
] = device_class
;
6774 class_by_osd
[osd
] = ""; // no class
6777 f
->open_array_section("osd_device_classes");
6778 for (auto& i
: class_by_osd
) {
6779 f
->open_object_section("osd_device_class");
6780 f
->dump_int("osd", i
.first
);
6781 f
->dump_string("device_class", i
.second
);
6787 if (class_by_osd
.size() == 1) {
6788 // for single input, make a clean output
6789 ds
<< class_by_osd
.begin()->second
;
6791 // note that we do not group osds by class here
6792 for (auto it
= class_by_osd
.begin();
6793 it
!= class_by_osd
.end();
6795 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6796 if (next(it
) != class_by_osd
.end())
6802 } else if (prefix
== "osd erasure-code-profile ls") {
6803 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6805 f
->open_array_section("erasure-code-profiles");
6806 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6808 f
->dump_string("profile", i
->first
.c_str());
6810 rdata
.append(i
->first
+ "\n");
6817 rdata
.append(rs
.str());
6819 } else if (prefix
== "osd crush weight-set ls") {
6820 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6822 f
->open_array_section("weight_sets");
6823 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6824 f
->dump_string("pool", "(compat)");
6826 for (auto& i
: osdmap
.crush
->choose_args
) {
6828 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6835 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6838 for (auto& i
: osdmap
.crush
->choose_args
) {
6840 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6843 rdata
.append(rs
.str());
6845 } else if (prefix
== "osd crush weight-set dump") {
6846 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6848 osdmap
.crush
->dump_choose_args(f
.get());
6850 } else if (prefix
== "osd erasure-code-profile get") {
6852 cmd_getval(cmdmap
, "name", name
);
6853 if (!osdmap
.has_erasure_code_profile(name
)) {
6854 ss
<< "unknown erasure code profile '" << name
<< "'";
6858 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6860 f
->open_object_section("profile");
6861 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6865 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6867 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6874 rdata
.append(rs
.str());
6876 } else if (prefix
== "osd pool application get") {
6877 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6880 cmd_getval(cmdmap
, "pool", pool_name
);
6882 cmd_getval(cmdmap
, "app", app
);
6884 cmd_getval(cmdmap
, "key", key
);
6886 if (pool_name
.empty()) {
6888 f
->open_object_section("pools");
6889 for (const auto &pool
: osdmap
.pools
) {
6890 std::string
name("<unknown>");
6891 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6892 if (pni
!= osdmap
.pool_name
.end())
6894 f
->open_object_section(name
.c_str());
6895 for (auto &app_pair
: pool
.second
.application_metadata
) {
6896 f
->open_object_section(app_pair
.first
.c_str());
6897 for (auto &kv_pair
: app_pair
.second
) {
6898 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6902 f
->close_section(); // name
6904 f
->close_section(); // pools
6907 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6909 ss
<< "unrecognized pool '" << pool_name
<< "'";
6913 auto p
= osdmap
.get_pg_pool(pool
);
6916 f
->open_object_section(pool_name
.c_str());
6917 for (auto &app_pair
: p
->application_metadata
) {
6918 f
->open_object_section(app_pair
.first
.c_str());
6919 for (auto &kv_pair
: app_pair
.second
) {
6920 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6922 f
->close_section(); // application
6924 f
->close_section(); // pool_name
6929 auto app_it
= p
->application_metadata
.find(app
);
6930 if (app_it
== p
->application_metadata
.end()) {
6931 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
6935 // filter by pool + app
6937 f
->open_object_section(app_it
->first
.c_str());
6938 for (auto &kv_pair
: app_it
->second
) {
6939 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6941 f
->close_section(); // application
6945 // filter by pool + app + key
6946 auto key_it
= app_it
->second
.find(key
);
6947 if (key_it
== app_it
->second
.end()) {
6948 ss
<< "application '" << app
<< "' on pool '" << pool_name
6949 << "' does not have key '" << key
<< "'";
6953 ss
<< key_it
->second
<< "\n";
6954 rdata
.append(ss
.str());
6957 } else if (prefix
== "osd get-require-min-compat-client") {
6958 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
6959 rdata
.append(ss
.str());
6962 } else if (prefix
== "osd pool application enable" ||
6963 prefix
== "osd pool application disable" ||
6964 prefix
== "osd pool application set" ||
6965 prefix
== "osd pool application rm") {
6966 bool changed
= false;
6967 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
6971 } else if (changed
) {
6972 // Valid mutation, proceed to prepare phase
6975 // Idempotent case, reply
6979 // try prepare update
6986 mon
.reply_command(op
, r
, rs
, rdata
, get_last_committed());
6990 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
6992 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6993 osdmap
.get_pg_pool(pool_id
));
6995 pool
->set_flag(flags
);
6998 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
7000 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7001 osdmap
.get_pg_pool(pool_id
));
7003 pool
->unset_flag(flags
);
7006 string
OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch
)
7009 snprintf(k
, sizeof(k
), "purged_epoch_%08lx", (unsigned long)epoch
);
7013 string
OSDMonitor::make_purged_snap_key(int64_t pool
, snapid_t snap
)
7016 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
7017 (unsigned long long)pool
, (unsigned long long)snap
);
7021 string
OSDMonitor::make_purged_snap_key_value(
7022 int64_t pool
, snapid_t snap
, snapid_t num
,
7023 epoch_t epoch
, bufferlist
*v
)
7025 // encode the *last* epoch in the key so that we can use forward
7026 // iteration only to search for an epoch in an interval.
7028 encode(snap
+ num
, *v
);
7030 return make_purged_snap_key(pool
, snap
+ num
- 1);
7034 int OSDMonitor::lookup_purged_snap(
7035 int64_t pool
, snapid_t snap
,
7036 snapid_t
*begin
, snapid_t
*end
)
7038 string k
= make_purged_snap_key(pool
, snap
);
7039 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
7042 dout(20) << __func__
7043 << " pool " << pool
<< " snap " << snap
7044 << " - key '" << k
<< "' not found" << dendl
;
7047 if (it
->key().find("purged_snap_") != 0) {
7048 dout(20) << __func__
7049 << " pool " << pool
<< " snap " << snap
7050 << " - key '" << k
<< "' got '" << it
->key()
7051 << "', wrong prefix" << dendl
;
7054 string gotk
= it
->key();
7055 const char *format
= "purged_snap_%llu_";
7056 long long int keypool
;
7057 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
7059 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
7062 if (pool
!= keypool
) {
7063 dout(20) << __func__
7064 << " pool " << pool
<< " snap " << snap
7065 << " - key '" << k
<< "' got '" << gotk
7066 << "', wrong pool " << keypool
7070 bufferlist v
= it
->value();
7071 auto p
= v
.cbegin();
7074 if (snap
< *begin
|| snap
>= *end
) {
7075 dout(20) << __func__
7076 << " pool " << pool
<< " snap " << snap
7077 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
7084 void OSDMonitor::insert_purged_snap_update(
7086 snapid_t start
, snapid_t end
,
7088 MonitorDBStore::TransactionRef t
)
7090 snapid_t before_begin
, before_end
;
7091 snapid_t after_begin
, after_end
;
7092 int b
= lookup_purged_snap(pool
, start
- 1,
7093 &before_begin
, &before_end
);
7094 int a
= lookup_purged_snap(pool
, end
,
7095 &after_begin
, &after_end
);
7097 dout(10) << __func__
7098 << " [" << start
<< "," << end
<< ") - joins ["
7099 << before_begin
<< "," << before_end
<< ") and ["
7100 << after_begin
<< "," << after_end
<< ")" << dendl
;
7101 // erase only the begin record; we'll overwrite the end one.
7102 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7104 string k
= make_purged_snap_key_value(pool
,
7105 before_begin
, after_end
- before_begin
,
7106 pending_inc
.epoch
, &v
);
7107 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7109 dout(10) << __func__
7110 << " [" << start
<< "," << end
<< ") - join with earlier ["
7111 << before_begin
<< "," << before_end
<< ")" << dendl
;
7112 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7114 string k
= make_purged_snap_key_value(pool
,
7115 before_begin
, end
- before_begin
,
7116 pending_inc
.epoch
, &v
);
7117 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7119 dout(10) << __func__
7120 << " [" << start
<< "," << end
<< ") - join with later ["
7121 << after_begin
<< "," << after_end
<< ")" << dendl
;
7122 // overwrite after record
7124 string k
= make_purged_snap_key_value(pool
,
7125 start
, after_end
- start
,
7126 pending_inc
.epoch
, &v
);
7127 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7129 dout(10) << __func__
7130 << " [" << start
<< "," << end
<< ") - new"
7133 string k
= make_purged_snap_key_value(pool
,
7135 pending_inc
.epoch
, &v
);
7136 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7140 bool OSDMonitor::try_prune_purged_snaps()
7142 if (!mon
.mgrstatmon()->is_readable()) {
7145 if (!pending_inc
.new_purged_snaps
.empty()) {
7146 return false; // we already pruned for this epoch
7149 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
7150 "mon_max_snap_prune_per_epoch");
7154 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
7156 unsigned actually_pruned
= 0;
7157 auto& purged_snaps
= mon
.mgrstatmon()->get_digest().purged_snaps
;
7158 for (auto& p
: osdmap
.get_pools()) {
7159 auto q
= purged_snaps
.find(p
.first
);
7160 if (q
== purged_snaps
.end()) {
7163 auto& purged
= q
->second
;
7164 if (purged
.empty()) {
7165 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
7168 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
7169 snap_interval_set_t to_prune
;
7170 unsigned maybe_pruned
= actually_pruned
;
7171 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
7172 snapid_t begin
= i
.get_start();
7173 auto end
= i
.get_start() + i
.get_len();
7174 snapid_t pbegin
= 0, pend
= 0;
7175 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
7178 // be a bit aggressive about backing off here, because the mon may
7179 // do a lot of work going through this set, and if we know the
7180 // purged set from the OSDs is at least *partly* stale we may as
7181 // well wait for it to be fresh.
7182 dout(20) << __func__
<< " we've already purged " << pbegin
7183 << "~" << (pend
- pbegin
) << dendl
;
7186 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
7187 // the tail of [begin,end) is purged; shorten the range
7190 to_prune
.insert(begin
, end
- begin
);
7191 maybe_pruned
+= end
- begin
;
7192 if (maybe_pruned
>= max_prune
) {
7196 if (!to_prune
.empty()) {
7197 // PGs may still be reporting things as purged that we have already
7198 // pruned from removed_snaps_queue.
7199 snap_interval_set_t actual
;
7200 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7201 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7202 actual
.intersection_of(to_prune
, r
->second
);
7204 actually_pruned
+= actual
.size();
7205 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7206 << ", actual pruned " << actual
<< dendl
;
7207 if (!actual
.empty()) {
7208 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7211 if (actually_pruned
>= max_prune
) {
7215 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7216 return !!actually_pruned
;
7219 bool OSDMonitor::update_pools_status()
7221 if (!mon
.mgrstatmon()->is_readable())
7226 auto& pools
= osdmap
.get_pools();
7227 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7228 const pool_stat_t
*pstat
= mon
.mgrstatmon()->get_pool_stat(it
->first
);
7231 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7232 const pg_pool_t
&pool
= it
->second
;
7233 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
7236 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7237 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
7239 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7243 mon
.clog
->info() << "pool '" << pool_name
7244 << "' no longer out of quota; removing NO_QUOTA flag";
7245 // below we cancel FLAG_FULL too, we'll set it again in
7246 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7247 clear_pool_flags(it
->first
,
7248 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7254 if (pool
.quota_max_bytes
> 0 &&
7255 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7256 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7257 << " (reached quota's max_bytes: "
7258 << byte_u_t(pool
.quota_max_bytes
) << ")";
7260 if (pool
.quota_max_objects
> 0 &&
7261 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7262 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7263 << " (reached quota's max_objects: "
7264 << pool
.quota_max_objects
<< ")";
7266 // set both FLAG_FULL_QUOTA and FLAG_FULL
7267 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7268 // since FLAG_FULL should always take precedence
7269 set_pool_flags(it
->first
,
7270 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7271 clear_pool_flags(it
->first
,
7272 pg_pool_t::FLAG_NEARFULL
|
7273 pg_pool_t::FLAG_BACKFILLFULL
);
7280 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7282 op
->mark_osdmon_event(__func__
);
7283 auto m
= op
->get_req
<MPoolOp
>();
7284 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7285 MonSession
*session
= op
->get_session();
7288 string erasure_code_profile
;
7292 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7294 erasure_code_profile
,
7295 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {},
7299 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
7304 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7305 const string
& dstname
,
7310 // Avoid creating a pending crush if it does not already exists and
7311 // the rename would fail.
7313 if (!_have_pending_crush()) {
7314 ret
= _get_stable_crush().can_rename_bucket(srcname
,
7321 CrushWrapper newcrush
;
7322 _get_pending_crush(newcrush
);
7324 ret
= newcrush
.rename_bucket(srcname
,
7330 pending_inc
.crush
.clear();
7331 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7332 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7336 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7338 string replacement
= "";
7340 if (plugin
== "jerasure_generic" ||
7341 plugin
== "jerasure_sse3" ||
7342 plugin
== "jerasure_sse4" ||
7343 plugin
== "jerasure_neon") {
7344 replacement
= "jerasure";
7345 } else if (plugin
== "shec_generic" ||
7346 plugin
== "shec_sse3" ||
7347 plugin
== "shec_sse4" ||
7348 plugin
== "shec_neon") {
7349 replacement
= "shec";
7352 if (replacement
!= "") {
7353 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7354 << plugin
<< " that has been deprecated. Please use "
7355 << replacement
<< " instead." << dendl
;
7359 int OSDMonitor::normalize_profile(const string
& profilename
,
7360 ErasureCodeProfile
&profile
,
7364 ErasureCodeInterfaceRef erasure_code
;
7365 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7366 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7367 check_legacy_ec_plugin(plugin
->second
, profilename
);
7368 int err
= instance
.factory(plugin
->second
,
7369 g_conf().get_val
<std::string
>("erasure_code_dir"),
7370 profile
, &erasure_code
, ss
);
7375 err
= erasure_code
->init(profile
, ss
);
7380 auto it
= profile
.find("stripe_unit");
7381 if (it
!= profile
.end()) {
7383 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7384 if (!err_str
.empty()) {
7385 *ss
<< "could not parse stripe_unit '" << it
->second
7386 << "': " << err_str
<< std::endl
;
7389 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7390 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7391 if (chunk_size
!= stripe_unit
) {
7392 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7393 << "alignment. Would be padded to " << chunk_size
7397 if ((stripe_unit
% 4096) != 0 && !force
) {
7398 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7399 << "use --force to override this check" << std::endl
;
7406 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7407 const string
&profile
,
7411 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7412 if (ruleid
!= -ENOENT
) {
7413 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
7417 CrushWrapper newcrush
;
7418 _get_pending_crush(newcrush
);
7420 ruleid
= newcrush
.get_rule_id(name
);
7421 if (ruleid
!= -ENOENT
) {
7422 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
7425 ErasureCodeInterfaceRef erasure_code
;
7426 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7428 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7432 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7433 erasure_code
.reset();
7437 pending_inc
.crush
.clear();
7438 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7443 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7444 ErasureCodeInterfaceRef
*erasure_code
,
7447 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7449 ErasureCodeProfile profile
=
7450 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7451 ErasureCodeProfile::const_iterator plugin
=
7452 profile
.find("plugin");
7453 if (plugin
== profile
.end()) {
7454 *ss
<< "cannot determine the erasure code plugin"
7455 << " because there is no 'plugin' entry in the erasure_code_profile "
7456 << profile
<< std::endl
;
7459 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7460 auto& instance
= ErasureCodePluginRegistry::instance();
7461 return instance
.factory(plugin
->second
,
7462 g_conf().get_val
<std::string
>("erasure_code_dir"),
7463 profile
, erasure_code
, ss
);
7466 int OSDMonitor::check_cluster_features(uint64_t features
,
7469 stringstream unsupported_ss
;
7470 int unsupported_count
= 0;
7471 if ((mon
.get_quorum_con_features() & features
) != features
) {
7472 unsupported_ss
<< "the monitor cluster";
7473 ++unsupported_count
;
7476 set
<int32_t> up_osds
;
7477 osdmap
.get_up_osds(up_osds
);
7478 for (set
<int32_t>::iterator it
= up_osds
.begin();
7479 it
!= up_osds
.end(); ++it
) {
7480 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7481 if ((xi
.features
& features
) != features
) {
7482 if (unsupported_count
> 0)
7483 unsupported_ss
<< ", ";
7484 unsupported_ss
<< "osd." << *it
;
7485 unsupported_count
++;
7489 if (unsupported_count
> 0) {
7490 ss
<< "features " << features
<< " unsupported by: "
7491 << unsupported_ss
.str();
7495 // check pending osd state, too!
7496 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7497 pending_inc
.new_xinfo
.begin();
7498 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7499 const osd_xinfo_t
&xi
= p
->second
;
7500 if ((xi
.features
& features
) != features
) {
7501 dout(10) << __func__
<< " pending osd." << p
->first
7502 << " features are insufficient; retry" << dendl
;
7510 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
7513 OSDMap::Incremental new_pending
= pending_inc
;
7514 encode(*newcrush
, new_pending
.crush
, mon
.get_quorum_con_features());
7516 newmap
.deepish_copy_from(osdmap
);
7517 newmap
.apply_incremental(new_pending
);
7520 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7521 auto mv
= newmap
.get_min_compat_client();
7522 if (mv
> newmap
.require_min_compat_client
) {
7523 ss
<< "new crush map requires client version " << mv
7524 << " but require_min_compat_client is "
7525 << newmap
.require_min_compat_client
;
7532 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7533 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7534 stringstream features_ss
;
7535 int r
= check_cluster_features(features
, features_ss
);
7537 ss
<< "Could not change CRUSH: " << features_ss
.str();
7544 bool OSDMonitor::erasure_code_profile_in_use(
7545 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7546 const string
&profile
,
7550 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
7553 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7554 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7559 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
7564 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7565 map
<string
,string
> *erasure_code_profile_map
,
7568 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7571 erasure_code_profile_map
,
7575 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7576 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7577 map
<string
,string
> user_map
;
7578 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7579 i
!= erasure_code_profile
.end();
7581 size_t equal
= i
->find('=');
7582 if (equal
== string::npos
) {
7583 user_map
[*i
] = string();
7584 (*erasure_code_profile_map
)[*i
] = string();
7586 const string key
= i
->substr(0, equal
);
7588 const string value
= i
->substr(equal
);
7589 if (key
.find("ruleset-") == 0) {
7590 *ss
<< "property '" << key
<< "' is no longer supported; try "
7591 << "'crush-" << key
.substr(8) << "' instead";
7594 user_map
[key
] = value
;
7595 (*erasure_code_profile_map
)[key
] = value
;
7599 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7600 (*erasure_code_profile_map
) = user_map
;
7605 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7606 const string
&erasure_code_profile
,
7608 unsigned *size
, unsigned *min_size
,
7612 bool set_min_size
= false;
7613 switch (pool_type
) {
7614 case pg_pool_t::TYPE_REPLICATED
:
7615 if (osdmap
.stretch_mode_enabled
) {
7617 repl_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
7618 if (repl_size
!= g_conf().get_val
<uint64_t>("mon_stretch_pool_size")) {
7619 *ss
<< "prepare_pool_size: we are in stretch mode but size "
7620 << repl_size
<< " does not match!";
7623 *min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
7624 set_min_size
= true;
7626 if (repl_size
== 0) {
7627 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7631 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7633 case pg_pool_t::TYPE_ERASURE
:
7635 if (osdmap
.stretch_mode_enabled
) {
7636 *ss
<< "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7639 ErasureCodeInterfaceRef erasure_code
;
7640 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7642 *size
= erasure_code
->get_chunk_count();
7644 erasure_code
->get_data_chunk_count() +
7645 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7646 assert(*min_size
<= *size
);
7647 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7652 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
7659 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7660 const string
&erasure_code_profile
,
7661 uint32_t *stripe_width
,
7665 switch (pool_type
) {
7666 case pg_pool_t::TYPE_REPLICATED
:
7669 case pg_pool_t::TYPE_ERASURE
:
7671 ErasureCodeProfile profile
=
7672 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7673 ErasureCodeInterfaceRef erasure_code
;
7674 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7677 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7678 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7679 auto it
= profile
.find("stripe_unit");
7680 if (it
!= profile
.end()) {
7682 stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7683 ceph_assert(err_str
.empty());
7685 *stripe_width
= data_chunks
*
7686 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7690 *ss
<< "prepare_pool_stripe_width: "
7691 << pool_type
<< " is not a known pool type";
7698 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7699 const string
&erasure_code_profile
,
7700 const string
&rule_name
,
7705 if (*crush_rule
< 0) {
7706 switch (pool_type
) {
7707 case pg_pool_t::TYPE_REPLICATED
:
7709 if (rule_name
== "") {
7711 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
7712 if (*crush_rule
< 0) {
7713 // Errors may happen e.g. if no valid rule is available
7714 *ss
<< "No suitable CRUSH rule exists, check "
7715 << "'osd pool default crush *' config options";
7719 return get_crush_rule(rule_name
, crush_rule
, ss
);
7723 case pg_pool_t::TYPE_ERASURE
:
7725 int err
= crush_rule_create_erasure(rule_name
,
7726 erasure_code_profile
,
7730 dout(20) << "prepare_pool_crush_rule: rule "
7731 << rule_name
<< " try again" << dendl
;
7734 // need to wait for the crush rule to be proposed before proceeding
7745 *ss
<< "prepare_pool_crush_rule: " << pool_type
7746 << " is not a known pool type";
7750 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
7751 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
7759 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7764 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7765 if (ret
!= -ENOENT
) {
7769 CrushWrapper newcrush
;
7770 _get_pending_crush(newcrush
);
7772 ret
= newcrush
.get_rule_id(rule_name
);
7773 if (ret
!= -ENOENT
) {
7774 // found it, wait for it to be proposed
7775 dout(20) << __func__
<< ": rule " << rule_name
7776 << " try again" << dendl
;
7779 // Cannot find it , return error
7780 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
7787 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
7789 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7790 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
7791 auto max_pgs
= max_pgs_per_osd
* num_osds
;
7792 uint64_t projected
= 0;
7794 projected
+= pg_num
* size
;
7796 for (const auto& i
: osdmap
.get_pools()) {
7797 if (i
.first
== pool
) {
7798 projected
+= pg_num
* size
;
7800 projected
+= i
.second
.get_pg_num_target() * i
.second
.get_size();
7803 if (projected
> max_pgs
) {
7805 *ss
<< "pool id " << pool
;
7807 *ss
<< " pg_num " << pg_num
<< " size " << size
7808 << " would mean " << projected
7809 << " total pgs, which exceeds max " << max_pgs
7810 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7811 << " * num_in_osds " << num_osds
<< ")";
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_ruleset <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param repl_size Replication factor, or 0 for default
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REPLICATED
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
7832 int OSDMonitor::prepare_new_pool(string
& name
,
7834 const string
&crush_rule_name
,
7835 unsigned pg_num
, unsigned pgp_num
,
7836 unsigned pg_num_min
,
7837 const uint64_t repl_size
,
7838 const uint64_t target_size_bytes
,
7839 const float target_size_ratio
,
7840 const string
&erasure_code_profile
,
7841 const unsigned pool_type
,
7842 const uint64_t expected_num_objects
,
7843 FastReadType fast_read
,
7844 const string
& pg_autoscale_mode
,
7847 if (name
.length() == 0)
7850 pg_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pg_num");
7852 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
7855 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7856 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7857 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7858 << " (you may adjust 'mon max pool pg num' for higher values)";
7861 if (pgp_num
> pg_num
) {
7862 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7863 << ", which in this case is " << pg_num
;
7866 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
7867 *ss
<< "'fast_read' can only apply to erasure coding pool";
7871 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
7872 crush_rule_name
, &crush_rule
, ss
);
7874 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
7877 if (g_conf()->mon_osd_crush_smoke_test
) {
7878 CrushWrapper newcrush
;
7879 _get_pending_crush(newcrush
);
7881 CrushTester
tester(newcrush
, err
);
7882 tester
.set_min_x(0);
7883 tester
.set_max_x(50);
7884 tester
.set_rule(crush_rule
);
7885 auto start
= ceph::coarse_mono_clock::now();
7886 r
= tester
.test_with_fork(g_conf()->mon_lease
);
7887 auto duration
= ceph::coarse_mono_clock::now() - start
;
7889 dout(10) << "tester.test_with_fork returns " << r
7890 << ": " << err
.str() << dendl
;
7891 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
7894 dout(10) << __func__
<< " crush smoke test duration: "
7895 << duration
<< dendl
;
7897 unsigned size
, min_size
;
7898 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
7899 &size
, &min_size
, ss
);
7901 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
7904 r
= check_pg_num(-1, pg_num
, size
, ss
);
7906 dout(10) << "check_pg_num returns " << r
<< dendl
;
7910 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
7914 uint32_t stripe_width
= 0;
7915 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
7917 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
7922 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7923 switch (fast_read
) {
7930 case FAST_READ_DEFAULT
:
7931 fread
= g_conf()->osd_pool_default_ec_fast_read
;
7934 *ss
<< "invalid fast_read setting: " << fast_read
;
7939 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
7940 p
!= pending_inc
.new_pool_names
.end();
7942 if (p
->second
== name
)
7946 if (-1 == pending_inc
.new_pool_max
)
7947 pending_inc
.new_pool_max
= osdmap
.pool_max
;
7948 int64_t pool
= ++pending_inc
.new_pool_max
;
7950 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
7951 pi
->create_time
= ceph_clock_now();
7952 pi
->type
= pool_type
;
7953 pi
->fast_read
= fread
;
7954 pi
->flags
= g_conf()->osd_pool_default_flags
;
7955 if (g_conf()->osd_pool_default_flag_hashpspool
)
7956 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
7957 if (g_conf()->osd_pool_default_flag_nodelete
)
7958 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
7959 if (g_conf()->osd_pool_default_flag_nopgchange
)
7960 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
7961 if (g_conf()->osd_pool_default_flag_nosizechange
)
7962 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
7963 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
7964 if (g_conf()->osd_pool_use_gmt_hitset
)
7965 pi
->use_gmt_hitset
= true;
7967 pi
->use_gmt_hitset
= false;
7970 pi
->min_size
= min_size
;
7971 pi
->crush_rule
= crush_rule
;
7972 pi
->expected_num_objects
= expected_num_objects
;
7973 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
7974 if (osdmap
.stretch_mode_enabled
) {
7975 pi
->peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
7976 pi
->peering_crush_bucket_target
= osdmap
.stretch_bucket_count
;
7977 pi
->peering_crush_bucket_barrier
= osdmap
.stretch_mode_bucket
;
7978 pi
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
7979 if (osdmap
.degraded_stretch_mode
) {
7980 pi
->peering_crush_bucket_count
= osdmap
.degraded_stretch_mode
;
7981 pi
->peering_crush_bucket_target
= osdmap
.degraded_stretch_mode
;
7982 // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
7983 // TODO: drat, we don't record this ^ anywhere, though given that it
7984 // necessarily won't exist elsewhere it likely doesn't matter
7985 pi
->min_size
= pi
->min_size
/ 2;
7986 pi
->size
= pi
->size
/ 2; // only support 2 zones now
7990 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7991 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
7992 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7993 pi
->pg_autoscale_mode
= m
;
7995 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
7997 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
7999 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
8001 pi
->set_pg_num_pending(pi
->get_pg_num());
8002 pi
->set_pg_num_target(pg_num
);
8003 pi
->set_pgp_num(pi
->get_pg_num());
8004 pi
->set_pgp_num_target(pgp_num
);
8005 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8007 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
8009 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8010 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8011 pi
->pg_autoscale_mode
= m
;
8014 pi
->last_change
= pending_inc
.epoch
;
8017 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8018 pi
->erasure_code_profile
= erasure_code_profile
;
8020 pi
->erasure_code_profile
= "";
8022 pi
->stripe_width
= stripe_width
;
8024 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8025 target_size_bytes
) {
8026 // only store for nautilus+ because TARGET_SIZE_BYTES may be
8027 // larger than int32_t max.
8028 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
8030 if (target_size_ratio
> 0.0 &&
8031 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
8032 // only store for nautilus+, just to be consistent and tidy.
8033 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
8036 pi
->cache_target_dirty_ratio_micro
=
8037 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
8038 pi
->cache_target_dirty_high_ratio_micro
=
8039 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
8040 pi
->cache_target_full_ratio_micro
=
8041 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
8042 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
8043 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
8045 pending_inc
.new_pool_names
[pool
] = name
;
// Set a cluster-wide OSDMap flag (e.g. noout/noup) in the pending
// incremental map and arrange for the command reply to be sent once the
// change is committed.
// NOTE(review): the stream `ss` used below is declared on a line dropped
// from this fragment (presumably a local ostringstream) — confirm upstream.
8049 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
// Record this op in the osdmon event trace for debugging.
8051 op
->mark_osdmon_event(__func__
);
// new_flags < 0 is the "not yet modified in this epoch" sentinel; seed it
// from the currently committed flag set before OR-ing in the new flag.
8053 if (pending_inc
.new_flags
< 0)
8054 pending_inc
.new_flags
= osdmap
.get_flags();
8055 pending_inc
.new_flags
|= flag
;
// Human-readable confirmation returned to the command issuer.
8056 ss
<< OSDMap::get_flag_string(flag
) << " is set";
// Defer the reply until the pending map is proposed and committed.
8057 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8058 get_last_committed() + 1));
// Clear a cluster-wide OSDMap flag in the pending incremental map; mirror
// image of prepare_set_flag() above (AND-NOT instead of OR).
// NOTE(review): `ss` is declared on a line dropped from this fragment.
8062 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
// Record this op in the osdmon event trace for debugging.
8064 op
->mark_osdmon_event(__func__
);
// Lazily seed new_flags from the committed map (same sentinel convention
// as prepare_set_flag) before clearing the requested bit.
8066 if (pending_inc
.new_flags
< 0)
8067 pending_inc
.new_flags
= osdmap
.get_flags();
8068 pending_inc
.new_flags
&= ~flag
;
8069 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
// Reply only after the pending map commits.
8070 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8071 get_last_committed() + 1));
8075 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
8079 cmd_getval(cmdmap
, "pool", poolstr
);
8080 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
8082 ss
<< "unrecognized pool '" << poolstr
<< "'";
8086 cmd_getval(cmdmap
, "var", var
);
8088 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8089 if (pending_inc
.new_pools
.count(pool
))
8090 p
= pending_inc
.new_pools
[pool
];
8092 // accept val as a json string in the normal case (current
8093 // generation monitor). parse out int or float values from the
8094 // string as needed. however, if it is not a string, try to pull
8095 // out an int, in case an older monitor with an older json schema is
8096 // forwarding a request.
8098 string interr
, floaterr
;
8101 int64_t uf
= 0; // micro-f
8102 cmd_getval(cmdmap
, "val", val
);
8105 "target_max_objects"
8107 auto iec_options
= {
8109 "target_size_bytes",
8110 "compression_max_blob_size",
8111 "compression_min_blob_size",
8115 if (count(begin(si_options
), end(si_options
), var
)) {
8116 n
= strict_si_cast
<int64_t>(val
.c_str(), &interr
);
8117 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
8118 n
= strict_iec_cast
<int64_t>(val
.c_str(), &interr
);
8120 // parse string as both int and float; different fields use different types.
8121 n
= strict_strtoll(val
.c_str(), 10, &interr
);
8122 f
= strict_strtod(val
.c_str(), &floaterr
);
8123 uf
= llrintl(f
* (double)1000000.0);
8127 (var
== "hit_set_type" || var
== "hit_set_period" ||
8128 var
== "hit_set_count" || var
== "hit_set_fpp" ||
8129 var
== "target_max_objects" || var
== "target_max_bytes" ||
8130 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
8131 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
8132 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
8133 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
8134 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
8138 if (var
== "size") {
8139 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8140 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
8143 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
8144 ss
<< "can not change the size of an erasure-coded pool";
8147 if (interr
.length()) {
8148 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8151 if (n
<= 0 || n
> 10) {
8152 ss
<< "pool size must be between 1 and 10";
8156 if (!g_conf().get_val
<bool>("mon_allow_pool_size_one")) {
8157 ss
<< "configuring pool size as 1 is disabled by default.";
8161 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
8162 if (!sure
) { ss
<< "WARNING: setting pool size 1 could lead to data loss "
8163 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8164 "pass the flag --yes-i-really-mean-it.";
8168 if (!osdmap
.crush
->check_crush_rule(p
.get_crush_rule(), p
.type
, n
, ss
)) {
8171 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
8176 p
.min_size
= g_conf().get_osd_pool_default_min_size(p
.size
);
8177 } else if (var
== "min_size") {
8178 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8179 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8182 if (interr
.length()) {
8183 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8187 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
8188 if (n
< 1 || n
> p
.size
) {
8189 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
8193 ErasureCodeInterfaceRef erasure_code
;
8196 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
8198 k
= erasure_code
->get_data_chunk_count();
8200 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
8204 if (n
< k
|| n
> p
.size
) {
8205 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
8210 } else if (var
== "pg_num_actual") {
8211 if (interr
.length()) {
8212 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8215 if (n
== (int)p
.get_pg_num()) {
8218 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8219 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8220 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8221 << " (you may adjust 'mon max pool pg num' for higher values)";
8224 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8225 ss
<< "cannot adjust pg_num while initial PGs are being created";
8228 if (n
> (int)p
.get_pg_num()) {
8229 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
8230 // force pre-nautilus clients to resend their ops, since they
8231 // don't understand pg_num_pending changes form a new interval
8232 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8236 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8237 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8240 if (n
< (int)p
.get_pgp_num()) {
8241 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8244 if (n
< (int)p
.get_pg_num() - 1) {
8245 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8246 << ") - 1; only single pg decrease is currently supported";
8249 p
.set_pg_num_pending(n
);
8250 // force pre-nautilus clients to resend their ops, since they
8251 // don't understand pg_num_pending changes form a new interval
8252 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8254 // force pre-luminous clients to resend their ops, since they
8255 // don't understand that split PGs now form a new interval.
8256 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8257 } else if (var
== "pg_num") {
8258 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8259 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8262 if (interr
.length()) {
8263 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8266 if (n
== (int)p
.get_pg_num_target()) {
8269 if (n
<= 0 || static_cast<uint64_t>(n
) >
8270 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8271 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8272 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8273 << " (you may adjust 'mon max pool pg num' for higher values)";
8276 if (n
> (int)p
.get_pg_num_target()) {
8277 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
8282 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8283 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8284 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8288 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8289 ss
<< "nautilus OSDs are required to decrease pg_num";
8293 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8294 // pre-nautilus osdmap format; increase pg_num directly
8295 assert(n
> (int)p
.get_pg_num());
8296 // force pre-nautilus clients to resend their ops, since they
8297 // don't understand pg_num_target changes form a new interval
8298 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8299 // force pre-luminous clients to resend their ops, since they
8300 // don't understand that split PGs now form a new interval.
8301 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8304 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8305 // make pgp_num track pg_num if it already matches. if it is set
8306 // differently, leave it different and let the user control it
8308 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8309 p
.set_pgp_num_target(n
);
8311 p
.set_pg_num_target(n
);
8313 } else if (var
== "pgp_num_actual") {
8314 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8315 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8318 if (interr
.length()) {
8319 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8323 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8326 if (n
> (int)p
.get_pg_num()) {
8327 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8330 if (n
> (int)p
.get_pg_num_pending()) {
8331 ss
<< "specified pgp_num " << n
8332 << " > pg_num_pending " << p
.get_pg_num_pending();
8336 } else if (var
== "pgp_num") {
8337 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8338 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8341 if (interr
.length()) {
8342 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8346 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8349 if (n
> (int)p
.get_pg_num_target()) {
8350 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8353 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8354 // pre-nautilus osdmap format; increase pgp_num directly
8357 p
.set_pgp_num_target(n
);
8359 } else if (var
== "pg_autoscale_mode") {
8360 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8361 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8362 ss
<< "specified invalid mode " << val
;
8365 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8366 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8369 p
.pg_autoscale_mode
= m
;
8370 } else if (var
== "crush_rule") {
8371 int id
= osdmap
.crush
->get_rule_id(val
);
8372 if (id
== -ENOENT
) {
8373 ss
<< "crush rule " << val
<< " does not exist";
8377 ss
<< cpp_strerror(id
);
8380 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
8384 } else if (var
== "nodelete" || var
== "nopgchange" ||
8385 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8386 var
== "noscrub" || var
== "nodeep-scrub") {
8387 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8388 // make sure we only compare against 'n' if we didn't receive a string
8389 if (val
== "true" || (interr
.empty() && n
== 1)) {
8391 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8394 ss
<< "expecting value 'true', 'false', '0', or '1'";
8397 } else if (var
== "hashpspool") {
8398 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8400 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8403 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8404 " this triggers large data movement,"
8405 " pass --yes-i-really-mean-it if you really do.";
8408 // make sure we only compare against 'n' if we didn't receive a string
8409 if (val
== "true" || (interr
.empty() && n
== 1)) {
8411 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8414 ss
<< "expecting value 'true', 'false', '0', or '1'";
8417 } else if (var
== "hit_set_type") {
8419 p
.hit_set_params
= HitSet::Params();
8421 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8424 if (val
== "bloom") {
8425 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8426 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8427 p
.hit_set_params
= HitSet::Params(bsp
);
8428 } else if (val
== "explicit_hash")
8429 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8430 else if (val
== "explicit_object")
8431 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8433 ss
<< "unrecognized hit_set type '" << val
<< "'";
8437 } else if (var
== "hit_set_period") {
8438 if (interr
.length()) {
8439 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8442 ss
<< "hit_set_period should be non-negative";
8445 p
.hit_set_period
= n
;
8446 } else if (var
== "hit_set_count") {
8447 if (interr
.length()) {
8448 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8451 ss
<< "hit_set_count should be non-negative";
8454 p
.hit_set_count
= n
;
8455 } else if (var
== "hit_set_fpp") {
8456 if (floaterr
.length()) {
8457 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8459 } else if (f
< 0 || f
> 1.0) {
8460 ss
<< "hit_set_fpp should be in the range 0..1";
8463 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8464 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8467 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
8469 } else if (var
== "use_gmt_hitset") {
8470 if (val
== "true" || (interr
.empty() && n
== 1)) {
8471 p
.use_gmt_hitset
= true;
8473 ss
<< "expecting value 'true' or '1'";
8476 } else if (var
== "allow_ec_overwrites") {
8477 if (!p
.is_erasure()) {
8478 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8482 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8483 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8484 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8487 if (val
== "true" || (interr
.empty() && n
== 1)) {
8488 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8489 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8490 ss
<< "ec overwrites cannot be disabled once enabled";
8493 ss
<< "expecting value 'true', 'false', '0', or '1'";
8496 } else if (var
== "target_max_objects") {
8497 if (interr
.length()) {
8498 ss
<< "error parsing int '" << val
<< "': " << interr
;
8501 p
.target_max_objects
= n
;
8502 } else if (var
== "target_max_bytes") {
8503 if (interr
.length()) {
8504 ss
<< "error parsing int '" << val
<< "': " << interr
;
8507 p
.target_max_bytes
= n
;
8508 } else if (var
== "cache_target_dirty_ratio") {
8509 if (floaterr
.length()) {
8510 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8513 if (f
< 0 || f
> 1.0) {
8514 ss
<< "value must be in the range 0..1";
8517 p
.cache_target_dirty_ratio_micro
= uf
;
8518 } else if (var
== "cache_target_dirty_high_ratio") {
8519 if (floaterr
.length()) {
8520 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8523 if (f
< 0 || f
> 1.0) {
8524 ss
<< "value must be in the range 0..1";
8527 p
.cache_target_dirty_high_ratio_micro
= uf
;
8528 } else if (var
== "cache_target_full_ratio") {
8529 if (floaterr
.length()) {
8530 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8533 if (f
< 0 || f
> 1.0) {
8534 ss
<< "value must be in the range 0..1";
8537 p
.cache_target_full_ratio_micro
= uf
;
8538 } else if (var
== "cache_min_flush_age") {
8539 if (interr
.length()) {
8540 ss
<< "error parsing int '" << val
<< "': " << interr
;
8543 p
.cache_min_flush_age
= n
;
8544 } else if (var
== "cache_min_evict_age") {
8545 if (interr
.length()) {
8546 ss
<< "error parsing int '" << val
<< "': " << interr
;
8549 p
.cache_min_evict_age
= n
;
8550 } else if (var
== "min_read_recency_for_promote") {
8551 if (interr
.length()) {
8552 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8555 p
.min_read_recency_for_promote
= n
;
8556 } else if (var
== "hit_set_grade_decay_rate") {
8557 if (interr
.length()) {
8558 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8561 if (n
> 100 || n
< 0) {
8562 ss
<< "value out of range,valid range is 0 - 100";
8565 p
.hit_set_grade_decay_rate
= n
;
8566 } else if (var
== "hit_set_search_last_n") {
8567 if (interr
.length()) {
8568 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8571 if (n
> p
.hit_set_count
|| n
< 0) {
8572 ss
<< "value out of range,valid range is 0 - hit_set_count";
8575 p
.hit_set_search_last_n
= n
;
8576 } else if (var
== "min_write_recency_for_promote") {
8577 if (interr
.length()) {
8578 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8581 p
.min_write_recency_for_promote
= n
;
8582 } else if (var
== "fast_read") {
8583 if (p
.is_replicated()) {
8584 ss
<< "fast read is not supported in replication pool";
8587 if (val
== "true" || (interr
.empty() && n
== 1)) {
8589 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8590 p
.fast_read
= false;
8592 ss
<< "expecting value 'true', 'false', '0', or '1'";
8595 } else if (pool_opts_t::is_opt_name(var
)) {
8596 bool unset
= val
== "unset";
8597 if (var
== "compression_mode") {
8599 auto cmode
= Compressor::get_comp_mode_type(val
);
8601 ss
<< "unrecognized compression mode '" << val
<< "'";
8605 } else if (var
== "compression_algorithm") {
8607 auto alg
= Compressor::get_comp_alg_type(val
);
8609 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8613 } else if (var
== "compression_required_ratio") {
8614 if (floaterr
.length()) {
8615 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8618 if (f
< 0 || f
> 1) {
8619 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8622 } else if (var
== "csum_type") {
8623 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8625 ss
<< "unrecognized csum_type '" << val
<< "'";
8628 //preserve csum_type numeric value
8631 } else if (var
== "compression_max_blob_size" ||
8632 var
== "compression_min_blob_size" ||
8633 var
== "csum_max_block" ||
8634 var
== "csum_min_block") {
8635 if (interr
.length()) {
8636 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8639 } else if (var
== "fingerprint_algorithm") {
8641 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8643 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8647 } else if (var
== "target_size_bytes") {
8648 if (interr
.length()) {
8649 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8652 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8653 ss
<< "must set require_osd_release to nautilus or "
8654 << "later before setting target_size_bytes";
8657 } else if (var
== "pg_num_min") {
8658 if (interr
.length()) {
8659 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8662 if (n
> (int)p
.get_pg_num_target()) {
8663 ss
<< "specified pg_num_min " << n
8664 << " > pg_num " << p
.get_pg_num_target();
8667 } else if (var
== "recovery_priority") {
8668 if (interr
.length()) {
8669 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8672 if (!g_conf()->debug_allow_any_pool_priority
) {
8673 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8674 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8675 << " and " << OSD_POOL_PRIORITY_MAX
;
8679 } else if (var
== "pg_autoscale_bias") {
8680 if (f
< 0.0 || f
> 1000.0) {
8681 ss
<< "pg_autoscale_bias must be between 0 and 1000";
8684 } else if (var
== "dedup_tier") {
8685 if (interr
.empty()) {
8686 ss
<< "expecting value 'pool name'";
8689 // Current base tier in dedup does not support ec pool
8690 if (p
.is_erasure()) {
8691 ss
<< "pool '" << poolstr
8692 << "' is an ec pool, which cannot be a base tier";
8695 int64_t lowtierpool_id
= osdmap
.lookup_pg_pool_name(val
);
8696 if (lowtierpool_id
< 0) {
8697 ss
<< "unrecognized pool '" << val
<< "'";
8700 const pg_pool_t
*tp
= osdmap
.get_pg_pool(lowtierpool_id
);
8703 // The original input is string (pool name), but we convert it to int64_t.
8706 } else if (var
== "dedup_chunk_algorithm") {
8708 auto alg
= pg_pool_t::get_dedup_chunk_algorithm_from_str(val
);
8710 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8714 } else if (var
== "dedup_cdc_chunk_size") {
8715 if (interr
.length()) {
8716 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8721 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8722 switch (desc
.type
) {
8723 case pool_opts_t::STR
:
8725 p
.opts
.unset(desc
.key
);
8727 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
8730 case pool_opts_t::INT
:
8731 if (interr
.length()) {
8732 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8736 p
.opts
.unset(desc
.key
);
8738 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
8741 case pool_opts_t::DOUBLE
:
8742 if (floaterr
.length()) {
8743 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8747 p
.opts
.unset(desc
.key
);
8749 p
.opts
.set(desc
.key
, static_cast<double>(f
));
8753 ceph_assert(!"unknown type");
8756 ss
<< "unrecognized variable '" << var
<< "'";
8759 if (val
!= "unset") {
8760 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
8762 ss
<< "unset pool " << pool
<< " " << var
;
8764 p
.last_change
= pending_inc
.epoch
;
8765 pending_inc
.new_pools
[pool
] = p
;
// Prepare-phase entry point for "osd pool application ..." commands.
// Per the shared-logic comment below, prepare mode actually updates the
// pending state: delegates with no modified-flag output (nullptr) and the
// final boolean set to true.
8769 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
8770 const cmdmap_t
& cmdmap
,
8773 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
// Preprocess-phase entry point for "osd pool application ..." commands.
// Per the shared-logic comment below, preprocess only validates and
// reports (via `modified`) whether the command would change anything:
// delegates with the final boolean set to false.
8776 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
8777 const cmdmap_t
& cmdmap
,
8781 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
8786 * Common logic for preprocess and prepare phases of pool application
8787 * tag commands. In preprocess mode we're only detecting invalid
8788 * commands, and determining whether it was a modification or a no-op.
8789 * In prepare mode we're actually updating the pending state.
// Shared implementation for "osd pool application {enable|disable|set|rm}".
// Dispatches on the command prefix suffix, validates inputs, mutates a
// working copy of the pool's application_metadata, and (in prepare mode)
// stages the result in pending_inc.
// NOTE(review): several original lines (returns, braces, declarations)
// were dropped from this fragment; hedged comments below mark the gaps.
8791 int OSDMonitor::_command_pool_application(const string
&prefix
,
8792 const cmdmap_t
& cmdmap
,
// Resolve the target pool by name; negative id means unknown pool.
8798 cmd_getval(cmdmap
, "pool", pool_name
);
8799 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
8801 ss
<< "unrecognized pool '" << pool_name
<< "'";
// Work on a copy of the pool, preferring any already-pending version so
// successive commands in one epoch compose.
8805 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8807 if (pending_inc
.new_pools
.count(pool
)) {
8808 p
= pending_inc
.new_pools
[pool
];
// Fetch the application name and whether it is already enabled.
8813 cmd_getval(cmdmap
, "app", app
);
8814 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
// "all" is reserved (used as a wildcard elsewhere) and rejected for both
// key and value.
8817 cmd_getval(cmdmap
, "key", key
);
8819 ss
<< "key cannot be 'all'";
8824 cmd_getval(cmdmap
, "value", value
);
8825 if (value
== "all") {
8826 ss
<< "value cannot be 'all'";
// ---- "... application enable <app>" ----
8830 if (boost::algorithm::ends_with(prefix
, "enable")) {
8832 ss
<< "application name must be provided";
8837 ss
<< "application must be enabled on base tier";
// Enabling a second application requires explicit confirmation.
8842 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8844 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
8845 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
8846 << "application; pass --yes-i-really-mean-it to proceed anyway";
8850 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
8851 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
8852 << "max " << MAX_POOL_APPLICATIONS
;
8856 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8857 ss
<< "application name '" << app
<< "' too long; max length "
8858 << MAX_POOL_APPLICATION_LENGTH
;
// Enable: create an (empty) key/value map for the application.
8863 p
.application_metadata
[app
] = {};
8865 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
// ---- "... application disable <app>" ----
8867 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
// Disabling may break clients of the pool, so require confirmation.
8869 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8872 ss
<< "Are you SURE? Disabling an application within a pool might result "
8873 << "in loss of application functionality; pass "
8874 << "--yes-i-really-mean-it to proceed anyway";
// Not enabled -> success without change (idempotent).
8879 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8881 return 0; // idempotent
8884 p
.application_metadata
.erase(app
);
8885 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
// ---- "... application set <app> <key> <value>" ----
8887 } else if (boost::algorithm::ends_with(prefix
, "set")) {
8889 ss
<< "application metadata must be set on base tier";
8894 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8900 cmd_getval(cmdmap
, "key", key
);
8903 ss
<< "key must be provided";
// Enforce per-application key-count and key/value length limits.
8907 auto &app_keys
= p
.application_metadata
[app
];
8908 if (app_keys
.count(key
) == 0 &&
8909 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
8910 ss
<< "too many keys set for application '" << app
<< "' on pool '"
8911 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
8915 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8916 ss
<< "key '" << app
<< "' too long; max length "
8917 << MAX_POOL_APPLICATION_LENGTH
;
8922 cmd_getval(cmdmap
, "value", value
);
8923 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8924 ss
<< "value '" << value
<< "' too long; max length "
8925 << MAX_POOL_APPLICATION_LENGTH
;
8929 p
.application_metadata
[app
][key
] = value
;
8930 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
8931 << value
<< "' on pool '" << pool_name
<< "'";
// ---- "... application rm <app> <key>" ----
8932 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
8934 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8940 cmd_getval(cmdmap
, "key", key
);
8941 auto it
= p
.application_metadata
[app
].find(key
);
// Missing key -> success without change (idempotent).
8942 if (it
== p
.application_metadata
[app
].end()) {
8943 ss
<< "application '" << app
<< "' on pool '" << pool_name
8944 << "' does not have key '" << key
<< "'";
8945 return 0; // idempotent
8948 p
.application_metadata
[app
].erase(it
);
8949 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
8950 << pool_name
<< "'";
// Stamp the change epoch and stage the modified pool in the pending map.
8956 p
.last_change
= pending_inc
.epoch
;
8957 pending_inc
.new_pools
[pool
] = p
;
8960 // Because we fell through this far, we didn't hit no-op cases,
8961 // so pool was definitely modified
8962 if (modified
!= nullptr) {
// Remove (or unlink) item `id` from the given CRUSH map copy. Two paths:
// a scoped removal restricted to descendants of `ancestor`, and a plain
// removal honoring `unlink_only`. The selecting condition (presumably
// `has_ancestor`) sits on a line dropped from this fragment — confirm
// upstream.
8969 int OSDMonitor::_prepare_command_osd_crush_remove(
8970 CrushWrapper
&newcrush
,
8979 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
8982 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
// Stage an already-modified CRUSH map in the pending incremental: discard
// any previously staged encoding and re-encode `newcrush` using only the
// features supported by the current monitor quorum.
8987 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
8989 pending_inc
.crush
.clear();
8990 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
// Remove item `id` from the CRUSH map and commit the result to the
// pending incremental. Error/early-return handling between the helper
// call and the assert was dropped from this fragment (presumably the
// non-zero / not-found cases return before reaching the assert).
8993 int OSDMonitor::prepare_command_osd_crush_remove(
8994 CrushWrapper
&newcrush
,
9000 int err
= _prepare_command_osd_crush_remove(
9001 newcrush
, id
, ancestor
,
9002 has_ancestor
, unlink_only
);
// Only a fully successful removal reaches this point.
9007 ceph_assert(err
== 0);
9008 do_osd_crush_remove(newcrush
);
// Stage removal of OSD `id` from the map. An OSD that is still up is
// handled first (the body of that branch was dropped from this fragment —
// presumably an error return; confirm upstream). Otherwise: record the
// current state bits in the pending map, reset the uuid, and schedule the
// OSD's stored metadata for deletion.
9013 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
9015 if (osdmap
.is_up(id
)) {
// Copy current state so apply_incremental can toggle EXISTS off.
9019 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
// Zero uuid disassociates the id from any daemon identity.
9020 pending_inc
.new_uuid
[id
] = uuid_d();
// Remove any stored metadata for this id; also drop it from the
// not-yet-committed metadata set.
9021 pending_metadata_rm
.insert(id
);
9022 pending_metadata
.erase(id
);
// Pick an OSD id for a new OSD. Scans [0, max_osd) for a hole: an id that
// does not exist in the committed map, is not pending boot, and is not
// pending creation (EXISTS bit) — the action taken on a match (presumably
// storing it via *existing_id and returning) is on lines dropped from
// this fragment. With no hole, return the next id past the current or
// pending max.
9027 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
9029 ceph_assert(existing_id
);
9032 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
9033 if (!osdmap
.exists(i
) &&
9034 pending_inc
.new_up_client
.count(i
) == 0 &&
9035 (pending_inc
.new_state
.count(i
) == 0 ||
9036 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
// No free slot: grow past whichever max is authoritative (new_max_osd < 0
// means the pending map has not changed it this epoch).
9042 if (pending_inc
.new_max_osd
< 0) {
9043 return osdmap
.get_max_osd();
9045 return pending_inc
.new_max_osd
;
// Create (or idempotently re-create) an OSD entry in the pending map:
// resolve the id (reuse by uuid, honor an explicit id, or allocate),
// optionally register a device class in CRUSH, then stage weight/state/
// uuid for the new id. Validation is assumed done by the caller (see the
// original comment below).
9048 void OSDMonitor::do_osd_create(
9051 const string
& device_class
,
9054 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
9055 ceph_assert(new_id
);
9057 // We presume validation has been performed prior to calling this
9058 // function. We assert with prejudice.
9060 int32_t allocated_id
= -1; // declare here so we can jump
9061 int32_t existing_id
= -1;
// Non-zero uuid: check whether this daemon identity already has an id and
// reuse it (replay / idempotent re-create).
9062 if (!uuid
.is_zero()) {
9063 existing_id
= osdmap
.identify_osd(uuid
);
9064 if (existing_id
>= 0) {
// If the caller also passed an id it must match the existing one.
9065 ceph_assert(id
< 0 || id
== existing_id
);
9066 *new_id
= existing_id
;
9068 } else if (id
>= 0) {
9069 // uuid does not exist, and id has been provided, so just create
9076 // allocate a new id
9077 allocated_id
= _allocate_osd_id(&existing_id
);
9078 dout(10) << __func__
<< " allocated id " << allocated_id
9079 << " existing id " << existing_id
<< dendl
;
// Exactly one of existing_id / allocated_id is valid here.
9080 if (existing_id
>= 0) {
9081 ceph_assert(existing_id
< osdmap
.get_max_osd());
9082 ceph_assert(allocated_id
< 0);
9083 *new_id
= existing_id
;
9084 } else if (allocated_id
>= 0) {
9085 ceph_assert(existing_id
< 0);
// Grow max_osd in the pending map and take the newly added slot.
9087 if (pending_inc
.new_max_osd
< 0) {
9088 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
9090 ++pending_inc
.new_max_osd
;
9092 *new_id
= pending_inc
.new_max_osd
- 1;
9093 ceph_assert(*new_id
== allocated_id
);
9095 ceph_abort_msg("unexpected condition");
// Optionally register the OSD's device class in a pending CRUSH copy.
9099 if (device_class
.size()) {
9100 CrushWrapper newcrush
;
9101 _get_pending_crush(newcrush
);
// Make sure the CRUSH device array covers the new id.
9102 if (newcrush
.get_max_devices() < *new_id
+ 1) {
9103 newcrush
.set_max_devices(*new_id
+ 1);
9105 string name
= string("osd.") + stringify(*new_id
);
9106 if (!newcrush
.item_exists(*new_id
)) {
9107 newcrush
.set_item_name(*new_id
, name
);
9110 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
// Failure to set the class is logged but deliberately not fatal (see the
// replay/idempotency comment below).
9112 derr
<< __func__
<< " failed to set " << name
<< " device_class "
9113 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
9115 // non-fatal... this might be a replay and we want to be idempotent.
9117 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
// Stage the updated CRUSH map in the pending incremental.
9119 pending_inc
.crush
.clear();
9120 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9123 dout(20) << __func__
<< " no device_class" << dendl
;
9126 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
// Ensure max_osd (committed or pending) covers the chosen id.
9127 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
9128 pending_inc
.new_max_osd
= *new_id
+ 1;
// New OSDs start "in" with the NEW state bit set.
9131 pending_inc
.new_weight
[*new_id
] = CEPH_OSD_IN
;
9132 // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
9133 // set it for us. (ugh.)
9134 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_NEW
;
// Bind the daemon identity to the id when one was supplied.
9135 if (!uuid
.is_zero())
9136 pending_inc
.new_uuid
[*new_id
] = uuid
;
9139 int OSDMonitor::validate_osd_create(
9142 const bool check_osd_exists
,
9143 int32_t* existing_id
,
9147 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
9148 << " check_osd_exists " << check_osd_exists
<< dendl
;
9150 ceph_assert(existing_id
);
9152 if (id
< 0 && uuid
.is_zero()) {
9153 // we have nothing to validate
9156 } else if (uuid
.is_zero()) {
9157 // we have an id but we will ignore it - because that's what
9158 // `osd create` does.
9163 * This function will be used to validate whether we are able to
9164 * create a new osd when the `uuid` is specified.
9166 * It will be used by both `osd create` and `osd new`, as the checks
9167 * are basically the same when it pertains to osd id and uuid validation.
9168 * However, `osd create` presumes an `uuid` is optional, for legacy
9169 * reasons, while `osd new` requires the `uuid` to be provided. This
9170 * means that `osd create` will not be idempotent if an `uuid` is not
9171 * provided, but we will always guarantee the idempotency of `osd new`.
9174 ceph_assert(!uuid
.is_zero());
9175 if (pending_inc
.identify_osd(uuid
) >= 0) {
9176 // osd is about to exist
9180 int32_t i
= osdmap
.identify_osd(uuid
);
9182 // osd already exists
9183 if (id
>= 0 && i
!= id
) {
9184 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
9187 // return a positive errno to distinguish between a blocking error
9188 // and an error we consider to not be a problem (i.e., this would be
9189 // an idempotent operation).
9195 if (pending_inc
.new_state
.count(id
)) {
9196 // osd is about to exist
9199 // we may not care if an osd exists if we are recreating a previously
9201 if (check_osd_exists
&& osdmap
.exists(id
)) {
9202 ss
<< "id " << id
<< " already in use and does not match uuid "
9210 int OSDMonitor::prepare_command_osd_create(
9213 int32_t* existing_id
,
9216 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9217 ceph_assert(existing_id
);
9218 if (osdmap
.is_destroyed(id
)) {
9219 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
9224 if (uuid
.is_zero()) {
9225 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
9228 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
9231 int OSDMonitor::prepare_command_osd_new(
9233 const cmdmap_t
& cmdmap
,
9234 const map
<string
,string
>& params
,
9242 ceph_assert(paxos
.is_plugged());
9244 dout(10) << __func__
<< " " << op
<< dendl
;
9246 /* validate command. abort now if something's wrong. */
9248 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9250 * If `id` is not specified, we will identify any existing osd based
9251 * on `uuid`. Operation will be idempotent iff secrets match.
9253 * If `id` is specified, we will identify any existing osd based on
9254 * `uuid` and match against `id`. If they match, operation will be
9255 * idempotent iff secrets match.
9257 * `-i secrets.json` will be optional. If supplied, will be used
9258 * to check for idempotency when `id` and `uuid` match.
9260 * If `id` is not specified, and `uuid` does not exist, an id will
9261 * be found or allocated for the osd.
9263 * If `id` is specified, and the osd has been previously marked
9264 * as destroyed, then the `id` will be reused.
9266 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
9267 ss
<< "requires the OSD's UUID to be specified.";
9269 } else if (!uuid
.parse(uuidstr
.c_str())) {
9270 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9274 if (cmd_getval(cmdmap
, "id", id
) &&
9276 ss
<< "invalid OSD id; must be greater or equal than zero.";
9280 // are we running an `osd create`-like command, or recreating
9281 // a previously destroyed osd?
9283 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9285 // we will care about `id` to assess whether osd is `destroyed`, or
9286 // to create a new osd.
9287 // we will need an `id` by the time we reach auth.
9289 int32_t existing_id
= -1;
9290 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9293 bool may_be_idempotent
= false;
9294 if (err
== EEXIST
) {
9295 // this is idempotent from the osdmon's point-of-view
9296 may_be_idempotent
= true;
9297 ceph_assert(existing_id
>= 0);
9299 } else if (err
< 0) {
9303 if (!may_be_idempotent
) {
9304 // idempotency is out of the window. We are either creating a new
9305 // osd or recreating a destroyed osd.
9307 // We now need to figure out if we have an `id` (and if it's valid),
9308 // of find an `id` if we don't have one.
9310 // NOTE: we need to consider the case where the `id` is specified for
9311 // `osd create`, and we must honor it. So this means checking if
9312 // the `id` is destroyed, and if so assume the destroy; otherwise,
9313 // check if it `exists` - in which case we complain about not being
9314 // `destroyed`. In the end, if nothing fails, we must allow the
9315 // creation, so that we are compatible with `create`.
9316 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9317 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9318 ss
<< "OSD " << id
<< " has not yet been destroyed";
9320 } else if (id
< 0) {
9322 id
= _allocate_osd_id(&existing_id
);
9324 ceph_assert(existing_id
>= 0);
9327 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9328 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9329 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9331 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9334 ceph_assert(id
>= 0);
9335 ceph_assert(osdmap
.exists(id
));
9338 // we are now able to either create a brand new osd or reuse an existing
9339 // osd that has been previously destroyed.
9341 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9343 if (may_be_idempotent
&& params
.empty()) {
9344 // nothing to do, really.
9345 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9346 ceph_assert(id
>= 0);
9348 f
->open_object_section("created_osd");
9349 f
->dump_int("osdid", id
);
9357 string device_class
;
9358 auto p
= params
.find("crush_device_class");
9359 if (p
!= params
.end()) {
9360 device_class
= p
->second
;
9361 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9363 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9364 bool has_lockbox
= false;
9365 bool has_secrets
= params
.count("cephx_secret")
9366 || params
.count("cephx_lockbox_secret")
9367 || params
.count("dmcrypt_key");
9369 KVMonitor
*svc
= nullptr;
9370 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9373 if (params
.count("cephx_secret") == 0) {
9374 ss
<< "requires a cephx secret.";
9377 cephx_secret
= params
.at("cephx_secret");
9379 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9380 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9382 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9383 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9385 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9387 lockbox_secret
= params
.at("cephx_lockbox_secret");
9388 dmcrypt_key
= params
.at("dmcrypt_key");
9389 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9390 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9394 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9396 err
= mon
.authmon()->validate_osd_new(id
, uuid
,
9404 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9405 // for this to be idempotent, `id` should already be >= 0; no need
9406 // to use validate_id.
9407 ceph_assert(id
>= 0);
9408 ss
<< "osd." << id
<< " exists but secrets do not match";
9414 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9417 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9418 ceph_assert(id
>= 0);
9419 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9424 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9425 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9427 if (may_be_idempotent
) {
9428 // we have nothing to do for either the osdmon or the authmon,
9429 // and we have no lockbox - so the config key service will not be
9430 // touched. This is therefore an idempotent operation, and we can
9431 // just return right away.
9432 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9433 ceph_assert(id
>= 0);
9435 f
->open_object_section("created_osd");
9436 f
->dump_int("osdid", id
);
9443 ceph_assert(!may_be_idempotent
);
9447 ceph_assert(!cephx_secret
.empty());
9448 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9449 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9451 err
= mon
.authmon()->do_osd_new(cephx_entity
,
9454 ceph_assert(0 == err
);
9457 ceph_assert(nullptr != svc
);
9458 svc
->do_osd_new(uuid
, dmcrypt_key
);
9462 if (is_recreate_destroyed
) {
9463 ceph_assert(id
>= 0);
9464 ceph_assert(osdmap
.is_destroyed(id
));
9465 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9466 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9467 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9469 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9470 // due to http://tracker.ceph.com/issues/20751 some clusters may
9471 // have UP set for non-existent OSDs; make sure it is cleared
9472 // for a newly created osd.
9473 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9475 pending_inc
.new_uuid
[id
] = uuid
;
9477 ceph_assert(id
>= 0);
9478 int32_t new_id
= -1;
9479 do_osd_create(id
, uuid
, device_class
, &new_id
);
9480 ceph_assert(new_id
>= 0);
9481 ceph_assert(id
== new_id
);
9485 f
->open_object_section("created_osd");
9486 f
->dump_int("osdid", id
);
9495 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9497 op
->mark_osdmon_event(__func__
);
9498 auto m
= op
->get_req
<MMonCommand
>();
9501 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9502 string rs
= ss
.str();
9503 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
9507 MonSession
*session
= op
->get_session();
9509 derr
<< __func__
<< " no session" << dendl
;
9510 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
9514 return prepare_command_impl(op
, cmdmap
);
9517 static int parse_reweights(CephContext
*cct
,
9518 const cmdmap_t
& cmdmap
,
9519 const OSDMap
& osdmap
,
9520 map
<int32_t, uint32_t>* weights
)
9523 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9526 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9527 json_spirit::mValue json_value
;
9528 if (!json_spirit::read(weights_str
, json_value
)) {
9531 if (json_value
.type() != json_spirit::obj_type
) {
9534 const auto obj
= json_value
.get_obj();
9536 for (auto& osd_weight
: obj
) {
9537 auto osd_id
= std::stoi(osd_weight
.first
);
9538 if (!osdmap
.exists(osd_id
)) {
9541 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9544 auto weight
= std::stoul(osd_weight
.second
.get_str());
9545 weights
->insert({osd_id
, weight
});
9547 } catch (const std::logic_error
& e
) {
9553 int OSDMonitor::prepare_command_osd_destroy(
9557 ceph_assert(paxos
.is_plugged());
9559 // we check if the osd exists for the benefit of `osd purge`, which may
9560 // have previously removed the osd. If the osd does not exist, return
9561 // -ENOENT to convey this, and let the caller deal with it.
9563 // we presume that all auth secrets and config keys were removed prior
9564 // to this command being called. if they exist by now, we also assume
9565 // they must have been created by some other command and do not pertain
9566 // to this non-existent osd.
9567 if (!osdmap
.exists(id
)) {
9568 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9572 uuid_d uuid
= osdmap
.get_uuid(id
);
9573 dout(10) << __func__
<< " destroying osd." << id
9574 << " uuid " << uuid
<< dendl
;
9576 // if it has been destroyed, we assume our work here is done.
9577 if (osdmap
.is_destroyed(id
)) {
9578 ss
<< "destroyed osd." << id
;
9582 EntityName cephx_entity
, lockbox_entity
;
9583 bool idempotent_auth
= false, idempotent_cks
= false;
9585 int err
= mon
.authmon()->validate_osd_destroy(id
, uuid
,
9590 if (err
== -ENOENT
) {
9591 idempotent_auth
= true;
9597 auto svc
= mon
.kvmon();
9598 err
= svc
->validate_osd_destroy(id
, uuid
);
9600 ceph_assert(err
== -ENOENT
);
9602 idempotent_cks
= true;
9605 if (!idempotent_auth
) {
9606 err
= mon
.authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9607 ceph_assert(0 == err
);
9610 if (!idempotent_cks
) {
9611 svc
->do_osd_destroy(id
, uuid
);
9614 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9615 pending_inc
.new_uuid
[id
] = uuid_d();
9617 // we can only propose_pending() once per service, otherwise we'll be
9618 // defying PaxosService and all laws of nature. Therefore, as we may
9619 // be used during 'osd purge', let's keep the caller responsible for
9621 ceph_assert(err
== 0);
9625 int OSDMonitor::prepare_command_osd_purge(
9629 ceph_assert(paxos
.is_plugged());
9630 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9632 ceph_assert(!osdmap
.is_up(id
));
9635 * This may look a bit weird, but this is what's going to happen:
9637 * 1. we make sure that removing from crush works
9638 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9639 * error, then we abort the whole operation, as no updates
9640 * have been made. However, we this function will have
9641 * side-effects, thus we need to make sure that all operations
9642 * performed henceforth will *always* succeed.
9643 * 3. we call `prepare_command_osd_remove()`. Although this
9644 * function can return an error, it currently only checks if the
9645 * osd is up - and we have made sure that it is not so, so there
9646 * is no conflict, and it is effectively an update.
9647 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9648 * the crush update we delayed from before.
9651 CrushWrapper newcrush
;
9652 _get_pending_crush(newcrush
);
9654 bool may_be_idempotent
= false;
9656 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9657 if (err
== -ENOENT
) {
9659 may_be_idempotent
= true;
9660 } else if (err
< 0) {
9661 ss
<< "error removing osd." << id
<< " from crush";
9665 // no point destroying the osd again if it has already been marked destroyed
9666 if (!osdmap
.is_destroyed(id
)) {
9667 err
= prepare_command_osd_destroy(id
, ss
);
9669 if (err
== -ENOENT
) {
9675 may_be_idempotent
= false;
9678 ceph_assert(0 == err
);
9680 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9681 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9682 << "we are idempotent." << dendl
;
9686 err
= prepare_command_osd_remove(id
);
9687 // we should not be busy, as we should have made sure this id is not up.
9688 ceph_assert(0 == err
);
9690 do_osd_crush_remove(newcrush
);
9694 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9695 const cmdmap_t
& cmdmap
)
9697 op
->mark_osdmon_event(__func__
);
9698 auto m
= op
->get_req
<MMonCommand
>();
9706 cmd_getval(cmdmap
, "format", format
, string("plain"));
9707 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
9710 cmd_getval(cmdmap
, "prefix", prefix
);
9714 bool osdid_present
= false;
9715 if (prefix
!= "osd pg-temp" &&
9716 prefix
!= "osd pg-upmap" &&
9717 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
9718 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
9720 if (osdid_present
) {
9722 oss
<< "osd." << osdid
;
9723 osd_name
= oss
.str();
9726 // Even if there's a pending state with changes that could affect
9727 // a command, considering that said state isn't yet committed, we
9728 // just don't care about those changes if the command currently being
9729 // handled acts as a no-op against the current committed state.
9730 // In a nutshell, we assume this command happens *before*.
9732 // Let me make this clearer:
9734 // - If we have only one client, and that client issues some
9735 // operation that would conflict with this operation but is
9736 // still on the pending state, then we would be sure that said
9737 // operation wouldn't have returned yet, so the client wouldn't
9738 // issue this operation (unless the client didn't wait for the
9739 // operation to finish, and that would be the client's own fault).
9741 // - If we have more than one client, each client will observe
9742 // whatever is the state at the moment of the commit. So, if we
9743 // have two clients, one issuing an unlink and another issuing a
9744 // link, and if the link happens while the unlink is still on the
9745 // pending state, from the link's point-of-view this is a no-op.
9746 // If different clients are issuing conflicting operations and
9747 // they care about that, then the clients should make sure they
9748 // enforce some kind of concurrency mechanism -- from our
9749 // perspective that's what Douglas Adams would call an SEP.
9751 // This should be used as a general guideline for most commands handled
9752 // in this function. Adapt as you see fit, but please bear in mind that
9753 // this is the expected behavior.
9756 if (prefix
== "osd setcrushmap" ||
9757 (prefix
== "osd crush set" && !osdid_present
)) {
9758 if (pending_inc
.crush
.length()) {
9759 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
9760 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9763 dout(10) << "prepare_command setting new crush map" << dendl
;
9764 bufferlist
data(m
->get_data());
9767 auto bl
= data
.cbegin();
9770 catch (const std::exception
&e
) {
9772 ss
<< "Failed to parse crushmap: " << e
.what();
9776 int64_t prior_version
= 0;
9777 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
9778 if (prior_version
== osdmap
.get_crush_version() - 1) {
9779 // see if we are a resend of the last update. this is imperfect
9780 // (multiple racing updaters may not both get reliable success)
9781 // but we expect crush updaters (via this interface) to be rare-ish.
9782 bufferlist current
, proposed
;
9783 osdmap
.crush
->encode(current
, mon
.get_quorum_con_features());
9784 crush
.encode(proposed
, mon
.get_quorum_con_features());
9785 if (current
.contents_equal(proposed
)) {
9786 dout(10) << __func__
9787 << " proposed matches current and version equals previous"
9790 ss
<< osdmap
.get_crush_version();
9794 if (prior_version
!= osdmap
.get_crush_version()) {
9796 ss
<< "prior_version " << prior_version
<< " != crush version "
9797 << osdmap
.get_crush_version();
9802 if (crush
.has_legacy_rule_ids()) {
9804 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
9807 if (!validate_crush_against_features(&crush
, ss
)) {
9812 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
9817 if (g_conf()->mon_osd_crush_smoke_test
) {
9818 // sanity check: test some inputs to make sure this map isn't
9820 dout(10) << " testing map" << dendl
;
9822 CrushTester
tester(crush
, ess
);
9823 tester
.set_min_x(0);
9824 tester
.set_max_x(50);
9825 auto start
= ceph::coarse_mono_clock::now();
9826 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
9827 auto duration
= ceph::coarse_mono_clock::now() - start
;
9829 dout(10) << " tester.test_with_fork returns " << r
9830 << ": " << ess
.str() << dendl
;
9831 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
9835 dout(10) << __func__
<< " crush somke test duration: "
9836 << duration
<< ", result: " << ess
.str() << dendl
;
9839 pending_inc
.crush
= data
;
9840 ss
<< osdmap
.get_crush_version() + 1;
9843 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
9844 CrushWrapper newcrush
;
9845 _get_pending_crush(newcrush
);
9846 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
9848 if (newcrush
.bucket_exists(bid
) &&
9849 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
9850 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
9851 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
9854 if (!validate_crush_against_features(&newcrush
, ss
)) {
9858 pending_inc
.crush
.clear();
9859 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9860 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9861 get_last_committed() + 1));
9863 } else if (prefix
== "osd crush set-device-class") {
9864 string device_class
;
9865 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9866 err
= -EINVAL
; // no value!
9871 vector
<string
> idvec
;
9872 cmd_getval(cmdmap
, "ids", idvec
);
9873 CrushWrapper newcrush
;
9874 _get_pending_crush(newcrush
);
9876 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9880 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9881 osdmap
.get_all_osds(osds
);
9884 // try traditional single osd way
9885 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9887 // ss has reason for failure
9888 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9895 for (auto &osd
: osds
) {
9896 if (!osdmap
.exists(osd
)) {
9897 ss
<< "osd." << osd
<< " does not exist. ";
9902 oss
<< "osd." << osd
;
9903 string name
= oss
.str();
9905 if (newcrush
.get_max_devices() < osd
+ 1) {
9906 newcrush
.set_max_devices(osd
+ 1);
9909 if (newcrush
.item_exists(osd
)) {
9910 action
= "updating";
9912 action
= "creating";
9913 newcrush
.set_item_name(osd
, name
);
9916 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
9917 << "' device_class '" << device_class
<< "'"
9919 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
9923 if (err
== 0 && !_have_pending_crush()) {
9925 // for single osd only, wildcard makes too much noise
9926 ss
<< "set-device-class item id " << osd
<< " name '" << name
9927 << "' device_class '" << device_class
<< "': no change. ";
9930 updated
.insert(osd
);
9935 pending_inc
.crush
.clear();
9936 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9937 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
9939 wait_for_finished_proposal(
9941 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9943 } else if (prefix
== "osd crush rm-device-class") {
9945 vector
<string
> idvec
;
9946 cmd_getval(cmdmap
, "ids", idvec
);
9947 CrushWrapper newcrush
;
9948 _get_pending_crush(newcrush
);
9951 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9956 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9957 osdmap
.get_all_osds(osds
);
9960 // try traditional single osd way
9961 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9963 // ss has reason for failure
9964 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9971 for (auto &osd
: osds
) {
9972 if (!osdmap
.exists(osd
)) {
9973 ss
<< "osd." << osd
<< " does not exist. ";
9977 auto class_name
= newcrush
.get_item_class(osd
);
9979 ss
<< "osd." << osd
<< " belongs to no class, ";
9982 // note that we do not verify if class_is_in_use here
9983 // in case the device is misclassified and user wants
9984 // to overridely reset...
9986 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
9988 // ss has reason for failure
9991 updated
.insert(osd
);
9995 pending_inc
.crush
.clear();
9996 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9997 ss
<< "done removing class of osd(s): " << updated
;
9999 wait_for_finished_proposal(
10001 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10003 } else if (prefix
== "osd crush class create") {
10004 string device_class
;
10005 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10006 err
= -EINVAL
; // no value!
10009 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10010 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10011 << "luminous' before using crush device classes";
10015 if (!_have_pending_crush() &&
10016 _get_stable_crush().class_exists(device_class
)) {
10017 ss
<< "class '" << device_class
<< "' already exists";
10020 CrushWrapper newcrush
;
10021 _get_pending_crush(newcrush
);
10022 if (newcrush
.class_exists(device_class
)) {
10023 ss
<< "class '" << device_class
<< "' already exists";
10026 int class_id
= newcrush
.get_or_create_class_id(device_class
);
10027 pending_inc
.crush
.clear();
10028 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10029 ss
<< "created class " << device_class
<< " with id " << class_id
10030 << " to crush map";
10032 } else if (prefix
== "osd crush class rm") {
10033 string device_class
;
10034 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10035 err
= -EINVAL
; // no value!
10038 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10039 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10040 << "luminous' before using crush device classes";
10045 if (!osdmap
.crush
->class_exists(device_class
)) {
10050 CrushWrapper newcrush
;
10051 _get_pending_crush(newcrush
);
10052 if (!newcrush
.class_exists(device_class
)) {
10053 err
= 0; // make command idempotent
10056 int class_id
= newcrush
.get_class_id(device_class
);
10058 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
10060 ss
<< "class '" << device_class
<< "' " << ts
.str();
10064 // check if class is used by any erasure-code-profiles
10065 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
10066 osdmap
.get_erasure_code_profiles();
10067 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
10068 #ifdef HAVE_STDLIB_MAP_SPLICING
10069 ec_profiles
.merge(old_ec_profiles
);
10071 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
10072 make_move_iterator(end(old_ec_profiles
)));
10074 list
<string
> referenced_by
;
10075 for (auto &i
: ec_profiles
) {
10076 for (auto &j
: i
.second
) {
10077 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
10078 referenced_by
.push_back(i
.first
);
10082 if (!referenced_by
.empty()) {
10084 ss
<< "class '" << device_class
10085 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
10090 newcrush
.get_devices_by_class(device_class
, &osds
);
10091 for (auto& p
: osds
) {
10092 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
10094 // ss has reason for failure
10099 if (osds
.empty()) {
10100 // empty class, remove directly
10101 err
= newcrush
.remove_class_name(device_class
);
10103 ss
<< "class '" << device_class
<< "' cannot be removed '"
10104 << cpp_strerror(err
) << "'";
10109 pending_inc
.crush
.clear();
10110 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10111 ss
<< "removed class " << device_class
<< " with id " << class_id
10112 << " from crush map";
10114 } else if (prefix
== "osd crush class rename") {
10115 string srcname
, dstname
;
10116 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
10120 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
10125 CrushWrapper newcrush
;
10126 _get_pending_crush(newcrush
);
10127 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
10128 // suppose this is a replay and return success
10129 // so command is idempotent
10130 ss
<< "already renamed to '" << dstname
<< "'";
10135 err
= newcrush
.rename_class(srcname
, dstname
);
10137 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
10138 << cpp_strerror(err
);
10142 pending_inc
.crush
.clear();
10143 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10144 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
10146 } else if (prefix
== "osd crush add-bucket") {
10147 // os crush add-bucket <name> <type>
10148 string name
, typestr
;
10149 vector
<string
> argvec
;
10150 cmd_getval(cmdmap
, "name", name
);
10151 cmd_getval(cmdmap
, "type", typestr
);
10152 cmd_getval(cmdmap
, "args", argvec
);
10153 map
<string
,string
> loc
;
10154 if (!argvec
.empty()) {
10155 CrushWrapper::parse_loc_map(argvec
, &loc
);
10156 dout(0) << "will create and move bucket '" << name
10157 << "' to location " << loc
<< dendl
;
10160 if (!_have_pending_crush() &&
10161 _get_stable_crush().name_exists(name
)) {
10162 ss
<< "bucket '" << name
<< "' already exists";
10166 CrushWrapper newcrush
;
10167 _get_pending_crush(newcrush
);
10169 if (newcrush
.name_exists(name
)) {
10170 ss
<< "bucket '" << name
<< "' already exists";
10173 int type
= newcrush
.get_type_id(typestr
);
10175 ss
<< "type '" << typestr
<< "' does not exist";
10180 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
10185 err
= newcrush
.add_bucket(0, 0,
10186 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
10189 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
10192 err
= newcrush
.set_item_name(bucketno
, name
);
10194 ss
<< "error setting bucket name to '" << name
<< "'";
10198 if (!loc
.empty()) {
10199 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
10201 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
10203 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
10207 ss
<< "no need to move item id " << bucketno
<< " name '" << name
10208 << "' to location " << loc
<< " in crush map";
10212 pending_inc
.crush
.clear();
10213 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10215 ss
<< "added bucket " << name
<< " type " << typestr
10216 << " to crush map";
10218 ss
<< "added bucket " << name
<< " type " << typestr
10219 << " to location " << loc
;
10222 } else if (prefix
== "osd crush rename-bucket") {
10223 string srcname
, dstname
;
10224 cmd_getval(cmdmap
, "srcname", srcname
);
10225 cmd_getval(cmdmap
, "dstname", dstname
);
10227 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
10228 if (err
== -EALREADY
) // equivalent to success for idempotency
10234 } else if (prefix
== "osd crush weight-set create" ||
10235 prefix
== "osd crush weight-set create-compat") {
10236 CrushWrapper newcrush
;
10237 _get_pending_crush(newcrush
);
10240 if (newcrush
.has_non_straw2_buckets()) {
10241 ss
<< "crush map contains one or more bucket(s) that are not straw2";
10245 if (prefix
== "osd crush weight-set create") {
10246 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
10247 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
10248 ss
<< "require_min_compat_client "
10249 << osdmap
.require_min_compat_client
10250 << " < luminous, which is required for per-pool weight-sets. "
10251 << "Try 'ceph osd set-require-min-compat-client luminous' "
10252 << "before using the new interface";
10256 string poolname
, mode
;
10257 cmd_getval(cmdmap
, "pool", poolname
);
10258 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10260 ss
<< "pool '" << poolname
<< "' not found";
10264 cmd_getval(cmdmap
, "mode", mode
);
10265 if (mode
!= "flat" && mode
!= "positional") {
10266 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10270 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10272 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10275 if (!newcrush
.create_choose_args(pool
, positions
)) {
10276 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10277 ss
<< "compat weight-set already created";
10279 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10280 << "' already created";
10284 pending_inc
.crush
.clear();
10285 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10288 } else if (prefix
== "osd crush weight-set rm" ||
10289 prefix
== "osd crush weight-set rm-compat") {
10290 CrushWrapper newcrush
;
10291 _get_pending_crush(newcrush
);
10293 if (prefix
== "osd crush weight-set rm") {
10295 cmd_getval(cmdmap
, "pool", poolname
);
10296 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10298 ss
<< "pool '" << poolname
<< "' not found";
10303 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10305 newcrush
.rm_choose_args(pool
);
10306 pending_inc
.crush
.clear();
10307 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10310 } else if (prefix
== "osd crush weight-set reweight" ||
10311 prefix
== "osd crush weight-set reweight-compat") {
10312 string poolname
, item
;
10313 vector
<double> weight
;
10314 cmd_getval(cmdmap
, "pool", poolname
);
10315 cmd_getval(cmdmap
, "item", item
);
10316 cmd_getval(cmdmap
, "weight", weight
);
10317 CrushWrapper newcrush
;
10318 _get_pending_crush(newcrush
);
10320 if (prefix
== "osd crush weight-set reweight") {
10321 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10323 ss
<< "pool '" << poolname
<< "' not found";
10327 if (!newcrush
.have_choose_args(pool
)) {
10328 ss
<< "no weight-set for pool '" << poolname
<< "'";
10332 auto arg_map
= newcrush
.choose_args_get(pool
);
10333 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10334 if (weight
.size() != (size_t)positions
) {
10335 ss
<< "must specify exact " << positions
<< " weight values";
10340 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10341 if (!newcrush
.have_choose_args(pool
)) {
10342 ss
<< "no backward-compatible weight-set";
10347 if (!newcrush
.name_exists(item
)) {
10348 ss
<< "item '" << item
<< "' does not exist";
10352 err
= newcrush
.choose_args_adjust_item_weightf(
10354 newcrush
.choose_args_get(pool
),
10355 newcrush
.get_item_id(item
),
10362 pending_inc
.crush
.clear();
10363 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10365 } else if (osdid_present
&&
10366 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10367 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10368 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10369 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10371 if (!osdmap
.exists(osdid
)) {
10374 << " does not exist. Create it before updating the crush map";
10379 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10380 ss
<< "unable to parse weight value '"
10381 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10387 vector
<string
> argvec
;
10388 cmd_getval(cmdmap
, "args", argvec
);
10389 map
<string
,string
> loc
;
10390 CrushWrapper::parse_loc_map(argvec
, &loc
);
10392 if (prefix
== "osd crush set"
10393 && !_get_stable_crush().item_exists(osdid
)) {
10395 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10396 << "' weight " << weight
<< " at location " << loc
10397 << ": does not exist";
10401 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10402 << osd_name
<< "' weight " << weight
<< " at location "
10404 CrushWrapper newcrush
;
10405 _get_pending_crush(newcrush
);
10408 if (prefix
== "osd crush set" ||
10409 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10411 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10414 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10422 if (err
== 0 && !_have_pending_crush()) {
10423 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10424 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10428 pending_inc
.crush
.clear();
10429 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10430 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10431 << weight
<< " at location " << loc
<< " to crush map";
10433 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10434 get_last_committed() + 1));
10437 } else if (prefix
== "osd crush create-or-move") {
10439 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10440 if (!osdmap
.exists(osdid
)) {
10443 << " does not exist. create it before updating the crush map";
10448 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10449 ss
<< "unable to parse weight value '"
10450 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10456 vector
<string
> argvec
;
10457 cmd_getval(cmdmap
, "args", argvec
);
10458 map
<string
,string
> loc
;
10459 CrushWrapper::parse_loc_map(argvec
, &loc
);
10461 dout(0) << "create-or-move crush item name '" << osd_name
10462 << "' initial_weight " << weight
<< " at location " << loc
10465 CrushWrapper newcrush
;
10466 _get_pending_crush(newcrush
);
10468 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10469 g_conf()->osd_crush_update_weight_set
);
10471 ss
<< "create-or-move updated item name '" << osd_name
10472 << "' weight " << weight
10473 << " at location " << loc
<< " to crush map";
10477 pending_inc
.crush
.clear();
10478 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10479 ss
<< "create-or-move updating item name '" << osd_name
10480 << "' weight " << weight
10481 << " at location " << loc
<< " to crush map";
10483 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10484 get_last_committed() + 1));
10489 } else if (prefix
== "osd crush move") {
10491 // osd crush move <name> <loc1> [<loc2> ...]
10493 vector
<string
> argvec
;
10494 cmd_getval(cmdmap
, "name", name
);
10495 cmd_getval(cmdmap
, "args", argvec
);
10496 map
<string
,string
> loc
;
10497 CrushWrapper::parse_loc_map(argvec
, &loc
);
10499 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10500 CrushWrapper newcrush
;
10501 _get_pending_crush(newcrush
);
10503 if (!newcrush
.name_exists(name
)) {
10505 ss
<< "item " << name
<< " does not exist";
10508 int id
= newcrush
.get_item_id(name
);
10510 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10512 err
= newcrush
.create_or_move_item(
10513 cct
, id
, 0, name
, loc
,
10514 g_conf()->osd_crush_update_weight_set
);
10516 err
= newcrush
.move_bucket(cct
, id
, loc
);
10519 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10520 pending_inc
.crush
.clear();
10521 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10523 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10524 get_last_committed() + 1));
10528 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10532 } else if (prefix
== "osd crush swap-bucket") {
10533 string source
, dest
;
10534 cmd_getval(cmdmap
, "source", source
);
10535 cmd_getval(cmdmap
, "dest", dest
);
10537 bool force
= false;
10538 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10540 CrushWrapper newcrush
;
10541 _get_pending_crush(newcrush
);
10542 if (!newcrush
.name_exists(source
)) {
10543 ss
<< "source item " << source
<< " does not exist";
10547 if (!newcrush
.name_exists(dest
)) {
10548 ss
<< "dest item " << dest
<< " does not exist";
10552 int sid
= newcrush
.get_item_id(source
);
10553 int did
= newcrush
.get_item_id(dest
);
10555 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10556 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10560 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10562 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10563 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10564 << "; pass --yes-i-really-mean-it to proceed anyway";
10568 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10570 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10574 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10575 pending_inc
.crush
.clear();
10576 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10577 wait_for_finished_proposal(op
,
10578 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10579 get_last_committed() + 1));
10581 } else if (prefix
== "osd crush link") {
10582 // osd crush link <name> <loc1> [<loc2> ...]
10584 cmd_getval(cmdmap
, "name", name
);
10585 vector
<string
> argvec
;
10586 cmd_getval(cmdmap
, "args", argvec
);
10587 map
<string
,string
> loc
;
10588 CrushWrapper::parse_loc_map(argvec
, &loc
);
10590 // Need an explicit check for name_exists because get_item_id returns
10592 int id
= osdmap
.crush
->get_item_id(name
);
10593 if (!osdmap
.crush
->name_exists(name
)) {
10595 ss
<< "item " << name
<< " does not exist";
10598 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10600 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10601 ss
<< "no need to move item id " << id
<< " name '" << name
10602 << "' to location " << loc
<< " in crush map";
10607 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10608 CrushWrapper newcrush
;
10609 _get_pending_crush(newcrush
);
10611 if (!newcrush
.name_exists(name
)) {
10613 ss
<< "item " << name
<< " does not exist";
10616 int id
= newcrush
.get_item_id(name
);
10617 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10618 err
= newcrush
.link_bucket(cct
, id
, loc
);
10620 ss
<< "linked item id " << id
<< " name '" << name
10621 << "' to location " << loc
<< " in crush map";
10622 pending_inc
.crush
.clear();
10623 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10625 ss
<< "cannot link item id " << id
<< " name '" << name
10626 << "' to location " << loc
;
10630 ss
<< "no need to move item id " << id
<< " name '" << name
10631 << "' to location " << loc
<< " in crush map";
10635 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10636 get_last_committed() + 1));
10638 } else if (prefix
== "osd crush rm" ||
10639 prefix
== "osd crush remove" ||
10640 prefix
== "osd crush unlink") {
10642 // osd crush rm <id> [ancestor]
10643 CrushWrapper newcrush
;
10644 _get_pending_crush(newcrush
);
10647 cmd_getval(cmdmap
, "name", name
);
10649 if (!osdmap
.crush
->name_exists(name
)) {
10651 ss
<< "device '" << name
<< "' does not appear in the crush map";
10654 if (!newcrush
.name_exists(name
)) {
10656 ss
<< "device '" << name
<< "' does not appear in the crush map";
10658 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10659 get_last_committed() + 1));
10662 int id
= newcrush
.get_item_id(name
);
10665 bool unlink_only
= prefix
== "osd crush unlink";
10666 string ancestor_str
;
10667 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10668 if (!newcrush
.name_exists(ancestor_str
)) {
10670 ss
<< "ancestor item '" << ancestor_str
10671 << "' does not appear in the crush map";
10674 ancestor
= newcrush
.get_item_id(ancestor_str
);
10677 err
= prepare_command_osd_crush_remove(
10680 (ancestor
< 0), unlink_only
);
10682 if (err
== -ENOENT
) {
10683 ss
<< "item " << id
<< " does not appear in that position";
10689 pending_inc
.new_crush_node_flags
[id
] = 0;
10690 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10692 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10693 get_last_committed() + 1));
10698 } else if (prefix
== "osd crush reweight-all") {
10699 CrushWrapper newcrush
;
10700 _get_pending_crush(newcrush
);
10702 newcrush
.reweight(cct
);
10703 pending_inc
.crush
.clear();
10704 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10705 ss
<< "reweighted crush hierarchy";
10707 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10708 get_last_committed() + 1));
10710 } else if (prefix
== "osd crush reweight") {
10711 // osd crush reweight <name> <weight>
10712 CrushWrapper newcrush
;
10713 _get_pending_crush(newcrush
);
10716 cmd_getval(cmdmap
, "name", name
);
10717 if (!newcrush
.name_exists(name
)) {
10719 ss
<< "device '" << name
<< "' does not appear in the crush map";
10723 int id
= newcrush
.get_item_id(name
);
10725 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
10730 if (!cmd_getval(cmdmap
, "weight", w
)) {
10731 ss
<< "unable to parse weight value '"
10732 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10737 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
10738 g_conf()->osd_crush_update_weight_set
);
10741 pending_inc
.crush
.clear();
10742 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10743 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
10744 << " in crush map";
10746 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10747 get_last_committed() + 1));
10749 } else if (prefix
== "osd crush reweight-subtree") {
10750 // osd crush reweight <name> <weight>
10751 CrushWrapper newcrush
;
10752 _get_pending_crush(newcrush
);
10755 cmd_getval(cmdmap
, "name", name
);
10756 if (!newcrush
.name_exists(name
)) {
10758 ss
<< "device '" << name
<< "' does not appear in the crush map";
10762 int id
= newcrush
.get_item_id(name
);
10764 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
10769 if (!cmd_getval(cmdmap
, "weight", w
)) {
10770 ss
<< "unable to parse weight value '"
10771 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10776 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
10777 g_conf()->osd_crush_update_weight_set
);
10780 pending_inc
.crush
.clear();
10781 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10782 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
10783 << " in crush map";
10785 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10786 get_last_committed() + 1));
10788 } else if (prefix
== "osd crush tunables") {
10789 CrushWrapper newcrush
;
10790 _get_pending_crush(newcrush
);
10794 cmd_getval(cmdmap
, "profile", profile
);
10795 if (profile
== "legacy" || profile
== "argonaut") {
10796 newcrush
.set_tunables_legacy();
10797 } else if (profile
== "bobtail") {
10798 newcrush
.set_tunables_bobtail();
10799 } else if (profile
== "firefly") {
10800 newcrush
.set_tunables_firefly();
10801 } else if (profile
== "hammer") {
10802 newcrush
.set_tunables_hammer();
10803 } else if (profile
== "jewel") {
10804 newcrush
.set_tunables_jewel();
10805 } else if (profile
== "optimal") {
10806 newcrush
.set_tunables_optimal();
10807 } else if (profile
== "default") {
10808 newcrush
.set_tunables_default();
10810 ss
<< "unrecognized profile '" << profile
<< "'";
10815 if (!validate_crush_against_features(&newcrush
, ss
)) {
10820 pending_inc
.crush
.clear();
10821 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10822 ss
<< "adjusted tunables profile to " << profile
;
10824 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10825 get_last_committed() + 1));
10827 } else if (prefix
== "osd crush set-tunable") {
10828 CrushWrapper newcrush
;
10829 _get_pending_crush(newcrush
);
10833 cmd_getval(cmdmap
, "tunable", tunable
);
10835 int64_t value
= -1;
10836 if (!cmd_getval(cmdmap
, "value", value
)) {
10838 ss
<< "failed to parse integer value "
10839 << cmd_vartype_stringify(cmdmap
.at("value"));
10843 if (tunable
== "straw_calc_version") {
10844 if (value
!= 0 && value
!= 1) {
10845 ss
<< "value must be 0 or 1; got " << value
;
10849 newcrush
.set_straw_calc_version(value
);
10851 ss
<< "unrecognized tunable '" << tunable
<< "'";
10856 if (!validate_crush_against_features(&newcrush
, ss
)) {
10861 pending_inc
.crush
.clear();
10862 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10863 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
10865 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10866 get_last_committed() + 1));
10869 } else if (prefix
== "osd crush rule create-simple") {
10870 string name
, root
, type
, mode
;
10871 cmd_getval(cmdmap
, "name", name
);
10872 cmd_getval(cmdmap
, "root", root
);
10873 cmd_getval(cmdmap
, "type", type
);
10874 cmd_getval(cmdmap
, "mode", mode
);
10878 if (osdmap
.crush
->rule_exists(name
)) {
10879 // The name is uniquely associated to a ruleid and the rule it contains
10880 // From the user point of view, the rule is more meaningfull.
10881 ss
<< "rule " << name
<< " already exists";
10886 CrushWrapper newcrush
;
10887 _get_pending_crush(newcrush
);
10889 if (newcrush
.rule_exists(name
)) {
10890 // The name is uniquely associated to a ruleid and the rule it contains
10891 // From the user point of view, the rule is more meaningfull.
10892 ss
<< "rule " << name
<< " already exists";
10895 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
10896 pg_pool_t::TYPE_REPLICATED
, &ss
);
10902 pending_inc
.crush
.clear();
10903 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10906 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10907 get_last_committed() + 1));
10910 } else if (prefix
== "osd crush rule create-replicated") {
10911 string name
, root
, type
, device_class
;
10912 cmd_getval(cmdmap
, "name", name
);
10913 cmd_getval(cmdmap
, "root", root
);
10914 cmd_getval(cmdmap
, "type", type
);
10915 cmd_getval(cmdmap
, "class", device_class
);
10917 if (osdmap
.crush
->rule_exists(name
)) {
10918 // The name is uniquely associated to a ruleid and the rule it contains
10919 // From the user point of view, the rule is more meaningfull.
10920 ss
<< "rule " << name
<< " already exists";
10925 CrushWrapper newcrush
;
10926 _get_pending_crush(newcrush
);
10928 if (newcrush
.rule_exists(name
)) {
10929 // The name is uniquely associated to a ruleid and the rule it contains
10930 // From the user point of view, the rule is more meaningfull.
10931 ss
<< "rule " << name
<< " already exists";
10934 int ruleno
= newcrush
.add_simple_rule(
10935 name
, root
, type
, device_class
,
10936 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
10942 pending_inc
.crush
.clear();
10943 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10946 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10947 get_last_committed() + 1));
10950 } else if (prefix
== "osd erasure-code-profile rm") {
10952 cmd_getval(cmdmap
, "name", name
);
10954 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
10957 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
10962 if (osdmap
.has_erasure_code_profile(name
) ||
10963 pending_inc
.new_erasure_code_profiles
.count(name
)) {
10964 if (osdmap
.has_erasure_code_profile(name
)) {
10965 pending_inc
.old_erasure_code_profiles
.push_back(name
);
10967 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
10968 pending_inc
.new_erasure_code_profiles
.erase(name
);
10972 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10973 get_last_committed() + 1));
10976 ss
<< "erasure-code-profile " << name
<< " does not exist";
10981 } else if (prefix
== "osd erasure-code-profile set") {
10983 cmd_getval(cmdmap
, "name", name
);
10984 vector
<string
> profile
;
10985 cmd_getval(cmdmap
, "profile", profile
);
10987 bool force
= false;
10988 cmd_getval(cmdmap
, "force", force
);
10990 map
<string
,string
> profile_map
;
10991 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
10994 if (auto found
= profile_map
.find("crush-failure-domain");
10995 found
!= profile_map
.end()) {
10996 const auto& failure_domain
= found
->second
;
10997 int failure_domain_type
= osdmap
.crush
->get_type_id(failure_domain
);
10998 if (failure_domain_type
< 0) {
10999 ss
<< "erasure-code-profile " << profile_map
11000 << " contains an invalid failure-domain " << std::quoted(failure_domain
);
11006 if (profile_map
.find("plugin") == profile_map
.end()) {
11007 ss
<< "erasure-code-profile " << profile_map
11008 << " must contain a plugin entry" << std::endl
;
11012 string plugin
= profile_map
["plugin"];
11014 if (pending_inc
.has_erasure_code_profile(name
)) {
11015 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
11018 err
= normalize_profile(name
, profile_map
, force
, &ss
);
11022 if (osdmap
.has_erasure_code_profile(name
)) {
11023 ErasureCodeProfile existing_profile_map
=
11024 osdmap
.get_erasure_code_profile(name
);
11025 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
11029 if (existing_profile_map
== profile_map
) {
11035 ss
<< "will not override erasure code profile " << name
11036 << " because the existing profile "
11037 << existing_profile_map
11038 << " is different from the proposed profile "
11044 dout(20) << "erasure code profile set " << name
<< "="
11045 << profile_map
<< dendl
;
11046 pending_inc
.set_erasure_code_profile(name
, profile_map
);
11050 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11051 get_last_committed() + 1));
11054 } else if (prefix
== "osd crush rule create-erasure") {
11055 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
11056 if (err
== -EAGAIN
)
11060 string name
, poolstr
;
11061 cmd_getval(cmdmap
, "name", name
);
11063 cmd_getval(cmdmap
, "profile", profile
);
11065 profile
= "default";
11066 if (profile
== "default") {
11067 if (!osdmap
.has_erasure_code_profile(profile
)) {
11068 if (pending_inc
.has_erasure_code_profile(profile
)) {
11069 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
11073 map
<string
,string
> profile_map
;
11074 err
= osdmap
.get_erasure_code_profile_default(cct
,
11079 err
= normalize_profile(name
, profile_map
, true, &ss
);
11082 dout(20) << "erasure code profile set " << profile
<< "="
11083 << profile_map
<< dendl
;
11084 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
11090 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
11093 case -EEXIST
: // return immediately
11094 ss
<< "rule " << name
<< " already exists";
11098 case -EALREADY
: // wait for pending to be proposed
11099 ss
<< "rule " << name
<< " already exists";
11102 default: // non recoverable error
11107 ss
<< "created rule " << name
<< " at " << rule
;
11111 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11112 get_last_committed() + 1));
11115 } else if (prefix
== "osd crush rule rm") {
11117 cmd_getval(cmdmap
, "name", name
);
11119 if (!osdmap
.crush
->rule_exists(name
)) {
11120 ss
<< "rule " << name
<< " does not exist";
11125 CrushWrapper newcrush
;
11126 _get_pending_crush(newcrush
);
11128 if (!newcrush
.rule_exists(name
)) {
11129 ss
<< "rule " << name
<< " does not exist";
11132 int ruleno
= newcrush
.get_rule_id(name
);
11133 ceph_assert(ruleno
>= 0);
11135 // make sure it is not in use.
11136 // FIXME: this is ok in some situations, but let's not bother with that
11138 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
11139 if (osdmap
.crush_rule_in_use(ruleset
)) {
11140 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
11145 err
= newcrush
.remove_rule(ruleno
);
11150 pending_inc
.crush
.clear();
11151 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11154 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11155 get_last_committed() + 1));
11158 } else if (prefix
== "osd crush rule rename") {
11161 cmd_getval(cmdmap
, "srcname", srcname
);
11162 cmd_getval(cmdmap
, "dstname", dstname
);
11163 if (srcname
.empty() || dstname
.empty()) {
11164 ss
<< "must specify both source rule name and destination rule name";
11168 if (srcname
== dstname
) {
11169 ss
<< "destination rule name is equal to source rule name";
11174 CrushWrapper newcrush
;
11175 _get_pending_crush(newcrush
);
11176 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
11177 // srcname does not exist and dstname already exists
11178 // suppose this is a replay and return success
11179 // (so this command is idempotent)
11180 ss
<< "already renamed to '" << dstname
<< "'";
11185 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
11187 // ss has reason for failure
11190 pending_inc
.crush
.clear();
11191 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11193 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11194 get_last_committed() + 1));
11197 } else if (prefix
== "osd setmaxosd") {
11199 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
11200 ss
<< "unable to parse 'newmax' value '"
11201 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
11206 if (newmax
> g_conf()->mon_max_osd
) {
11208 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
11209 << g_conf()->mon_max_osd
<< ")";
11213 // Don't allow shrinking OSD number as this will cause data loss
11214 // and may cause kernel crashes.
11215 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11216 if (newmax
< osdmap
.get_max_osd()) {
11217 // Check if the OSDs exist between current max and new value.
11218 // If there are any OSDs exist, then don't allow shrinking number
11220 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
11221 if (osdmap
.exists(i
)) {
11223 ss
<< "cannot shrink max_osd to " << newmax
11224 << " because osd." << i
<< " (and possibly others) still in use";
11230 pending_inc
.new_max_osd
= newmax
;
11231 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
11233 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11234 get_last_committed() + 1));
11237 } else if (prefix
== "osd set-full-ratio" ||
11238 prefix
== "osd set-backfillfull-ratio" ||
11239 prefix
== "osd set-nearfull-ratio") {
11241 if (!cmd_getval(cmdmap
, "ratio", n
)) {
11242 ss
<< "unable to parse 'ratio' value '"
11243 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
11247 if (prefix
== "osd set-full-ratio")
11248 pending_inc
.new_full_ratio
= n
;
11249 else if (prefix
== "osd set-backfillfull-ratio")
11250 pending_inc
.new_backfillfull_ratio
= n
;
11251 else if (prefix
== "osd set-nearfull-ratio")
11252 pending_inc
.new_nearfull_ratio
= n
;
11253 ss
<< prefix
<< " " << n
;
11255 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11256 get_last_committed() + 1));
11258 } else if (prefix
== "osd set-require-min-compat-client") {
11260 cmd_getval(cmdmap
, "version", v
);
11261 ceph_release_t vno
= ceph_release_from_name(v
);
11263 ss
<< "version " << v
<< " is not recognized";
11268 newmap
.deepish_copy_from(osdmap
);
11269 newmap
.apply_incremental(pending_inc
);
11270 newmap
.require_min_compat_client
= vno
;
11271 auto mvno
= newmap
.get_min_compat_client();
11273 ss
<< "osdmap current utilizes features that require " << mvno
11274 << "; cannot set require_min_compat_client below that to " << vno
;
11279 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11282 mon
.get_combined_feature_map(&m
);
11283 uint64_t features
= ceph_release_features(to_integer
<int>(vno
));
11287 CEPH_ENTITY_TYPE_CLIENT
,
11288 CEPH_ENTITY_TYPE_MDS
,
11289 CEPH_ENTITY_TYPE_MGR
}) {
11290 auto p
= m
.m
.find(type
);
11291 if (p
== m
.m
.end()) {
11294 for (auto& q
: p
->second
) {
11295 uint64_t missing
= ~q
.first
& features
;
11298 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11303 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11304 << "(s) look like " << ceph_release_name(
11305 ceph_release_from_features(q
.first
))
11306 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11312 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11317 ss
<< "set require_min_compat_client to " << vno
;
11318 pending_inc
.new_require_min_compat_client
= vno
;
11320 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11321 get_last_committed() + 1));
11323 } else if (prefix
== "osd pause") {
11324 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11326 } else if (prefix
== "osd unpause") {
11327 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11329 } else if (prefix
== "osd set") {
11331 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11334 cmd_getval(cmdmap
, "key", key
);
11335 if (key
== "pause")
11336 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11337 else if (key
== "noup")
11338 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11339 else if (key
== "nodown")
11340 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11341 else if (key
== "noout")
11342 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11343 else if (key
== "noin")
11344 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11345 else if (key
== "nobackfill")
11346 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11347 else if (key
== "norebalance")
11348 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11349 else if (key
== "norecover")
11350 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11351 else if (key
== "noscrub")
11352 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11353 else if (key
== "nodeep-scrub")
11354 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11355 else if (key
== "notieragent")
11356 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11357 else if (key
== "nosnaptrim")
11358 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11359 else if (key
== "pglog_hardlimit") {
11360 if (!osdmap
.get_num_up_osds() && !sure
) {
11361 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11362 << "--yes-i-really-mean-it if you really wish to continue.";
11366 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11367 // we are reusing a jewel feature bit that was retired in luminous.
11368 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11369 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11371 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11373 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11378 ss
<< "unrecognized flag '" << key
<< "'";
11382 } else if (prefix
== "osd unset") {
11384 cmd_getval(cmdmap
, "key", key
);
11385 if (key
== "pause")
11386 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11387 else if (key
== "noup")
11388 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11389 else if (key
== "nodown")
11390 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11391 else if (key
== "noout")
11392 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11393 else if (key
== "noin")
11394 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11395 else if (key
== "nobackfill")
11396 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11397 else if (key
== "norebalance")
11398 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11399 else if (key
== "norecover")
11400 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11401 else if (key
== "noscrub")
11402 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11403 else if (key
== "nodeep-scrub")
11404 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11405 else if (key
== "notieragent")
11406 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11407 else if (key
== "nosnaptrim")
11408 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11410 ss
<< "unrecognized flag '" << key
<< "'";
11414 } else if (prefix
== "osd require-osd-release") {
11416 cmd_getval(cmdmap
, "release", release
);
11418 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11419 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11421 ss
<< "unrecognized release " << release
;
11425 if (rel
== osdmap
.require_osd_release
) {
11430 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
11431 if (!osdmap
.get_num_up_osds() && !sure
) {
11432 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11433 << "--yes-i-really-mean-it if you really wish to continue.";
11437 if (rel
== ceph_release_t::mimic
) {
11438 if (!mon
.monmap
->get_required_features().contains_all(
11439 ceph::features::mon::FEATURE_MIMIC
)) {
11440 ss
<< "not all mons are mimic";
11444 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_MIMIC
))
11446 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11450 } else if (rel
== ceph_release_t::nautilus
) {
11451 if (!mon
.monmap
->get_required_features().contains_all(
11452 ceph::features::mon::FEATURE_NAUTILUS
)) {
11453 ss
<< "not all mons are nautilus";
11457 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_NAUTILUS
))
11459 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
11463 } else if (rel
== ceph_release_t::octopus
) {
11464 if (!mon
.monmap
->get_required_features().contains_all(
11465 ceph::features::mon::FEATURE_OCTOPUS
)) {
11466 ss
<< "not all mons are octopus";
11470 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_OCTOPUS
))
11472 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11476 } else if (rel
== ceph_release_t::pacific
) {
11477 if (!mon
.monmap
->get_required_features().contains_all(
11478 ceph::features::mon::FEATURE_PACIFIC
)) {
11479 ss
<< "not all mons are pacific";
11483 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_PACIFIC
))
11485 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11490 ss
<< "not supported for this release yet";
11494 if (rel
< osdmap
.require_osd_release
) {
11495 ss
<< "require_osd_release cannot be lowered once it has been set";
11499 pending_inc
.new_require_osd_release
= rel
;
11501 } else if (prefix
== "osd down" ||
11502 prefix
== "osd out" ||
11503 prefix
== "osd in" ||
11504 prefix
== "osd rm" ||
11505 prefix
== "osd stop") {
11509 bool verbose
= true;
11510 bool definitely_dead
= false;
11512 vector
<string
> idvec
;
11513 cmd_getval(cmdmap
, "ids", idvec
);
11514 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11515 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11516 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11521 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11522 if (prefix
== "osd in") {
11523 // touch out osds only
11524 osdmap
.get_out_existing_osds(osds
);
11526 osdmap
.get_all_osds(osds
);
11529 verbose
= false; // so the output is less noisy.
11531 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11533 ss
<< "invalid osd id" << osd
;
11536 } else if (!osdmap
.exists(osd
)) {
11537 ss
<< "osd." << osd
<< " does not exist. ";
11544 for (auto &osd
: osds
) {
11545 if (prefix
== "osd down") {
11546 if (osdmap
.is_down(osd
)) {
11548 ss
<< "osd." << osd
<< " is already down. ";
11550 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11551 ss
<< "marked down osd." << osd
<< ". ";
11554 if (definitely_dead
) {
11555 if (!pending_inc
.new_xinfo
.count(osd
)) {
11556 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11558 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11561 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11563 } else if (prefix
== "osd out") {
11564 if (osdmap
.is_out(osd
)) {
11566 ss
<< "osd." << osd
<< " is already out. ";
11568 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11569 if (osdmap
.osd_weight
[osd
]) {
11570 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11571 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11573 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11575 ss
<< "marked out osd." << osd
<< ". ";
11576 std::ostringstream msg
;
11577 msg
<< "Client " << op
->get_session()->entity_name
11578 << " marked osd." << osd
<< " out";
11579 if (osdmap
.is_up(osd
)) {
11580 msg
<< ", while it was still marked up";
11582 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11583 msg
<< ", after it was down for " << int(period
.sec())
11587 mon
.clog
->info() << msg
.str();
11590 } else if (prefix
== "osd in") {
11591 if (osdmap
.is_in(osd
)) {
11593 ss
<< "osd." << osd
<< " is already in. ";
11595 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11596 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11597 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11598 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11600 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11602 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11604 ss
<< "marked in osd." << osd
<< ". ";
11607 } else if (prefix
== "osd rm") {
11608 err
= prepare_command_osd_remove(osd
);
11610 if (err
== -EBUSY
) {
11613 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11615 ceph_assert(err
== 0);
11617 ss
<< ", osd." << osd
;
11619 ss
<< "removed osd." << osd
;
11623 } else if (prefix
== "osd stop") {
11624 if (osdmap
.is_stop(osd
)) {
11626 ss
<< "osd." << osd
<< " is already stopped. ";
11627 } else if (osdmap
.is_down(osd
)) {
11628 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11629 ss
<< "stop down osd." << osd
<< ". ";
11632 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11633 ss
<< "stop osd." << osd
<< ". ";
11641 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11642 get_last_committed() + 1));
11645 } else if (prefix
== "osd set-group" ||
11646 prefix
== "osd unset-group" ||
11647 prefix
== "osd add-noup" ||
11648 prefix
== "osd add-nodown" ||
11649 prefix
== "osd add-noin" ||
11650 prefix
== "osd add-noout" ||
11651 prefix
== "osd rm-noup" ||
11652 prefix
== "osd rm-nodown" ||
11653 prefix
== "osd rm-noin" ||
11654 prefix
== "osd rm-noout") {
11655 bool do_set
= prefix
== "osd set-group" ||
11656 prefix
.find("add") != string::npos
;
11658 unsigned flags
= 0;
11659 vector
<string
> who
;
11660 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11661 cmd_getval(cmdmap
, "flags", flag_str
);
11662 cmd_getval(cmdmap
, "who", who
);
11663 vector
<string
> raw_flags
;
11664 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11665 for (auto& f
: raw_flags
) {
11667 flags
|= CEPH_OSD_NOUP
;
11668 else if (f
== "nodown")
11669 flags
|= CEPH_OSD_NODOWN
;
11670 else if (f
== "noin")
11671 flags
|= CEPH_OSD_NOIN
;
11672 else if (f
== "noout")
11673 flags
|= CEPH_OSD_NOOUT
;
11675 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11676 << "{noup,nodown,noin,noout}";
11682 cmd_getval(cmdmap
, "ids", who
);
11683 if (prefix
.find("noup") != string::npos
)
11684 flags
= CEPH_OSD_NOUP
;
11685 else if (prefix
.find("nodown") != string::npos
)
11686 flags
= CEPH_OSD_NODOWN
;
11687 else if (prefix
.find("noin") != string::npos
)
11688 flags
= CEPH_OSD_NOIN
;
11689 else if (prefix
.find("noout") != string::npos
)
11690 flags
= CEPH_OSD_NOOUT
;
11692 ceph_assert(0 == "Unreachable!");
11695 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11700 ss
<< "must specify at least one or more targets to set/unset";
11705 set
<int> crush_nodes
;
11706 set
<int> device_classes
;
11707 for (auto& w
: who
) {
11708 if (w
== "any" || w
== "all" || w
== "*") {
11709 osdmap
.get_all_osds(osds
);
11712 std::stringstream ts
;
11713 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11715 } else if (osdmap
.crush
->name_exists(w
)) {
11716 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11717 } else if (osdmap
.crush
->class_exists(w
)) {
11718 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11720 ss
<< "unable to parse osd id or crush node or device class: "
11721 << "\"" << w
<< "\". ";
11724 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11725 // ss has reason for failure
11730 for (auto osd
: osds
) {
11731 if (!osdmap
.exists(osd
)) {
11732 ss
<< "osd." << osd
<< " does not exist. ";
11736 if (flags
& CEPH_OSD_NOUP
) {
11737 any
|= osdmap
.is_noup_by_osd(osd
) ?
11738 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
11739 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
11741 if (flags
& CEPH_OSD_NODOWN
) {
11742 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11743 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
11744 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
11746 if (flags
& CEPH_OSD_NOIN
) {
11747 any
|= osdmap
.is_noin_by_osd(osd
) ?
11748 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
11749 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
11751 if (flags
& CEPH_OSD_NOOUT
) {
11752 any
|= osdmap
.is_noout_by_osd(osd
) ?
11753 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
11754 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
11757 if (flags
& CEPH_OSD_NOUP
) {
11758 any
|= osdmap
.is_noup_by_osd(osd
) ?
11759 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
11760 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
11762 if (flags
& CEPH_OSD_NODOWN
) {
11763 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11764 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
11765 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
11767 if (flags
& CEPH_OSD_NOIN
) {
11768 any
|= osdmap
.is_noin_by_osd(osd
) ?
11769 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
11770 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
11772 if (flags
& CEPH_OSD_NOOUT
) {
11773 any
|= osdmap
.is_noout_by_osd(osd
) ?
11774 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
11775 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
11779 for (auto& id
: crush_nodes
) {
11780 auto old_flags
= osdmap
.get_crush_node_flags(id
);
11781 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
11782 pending_flags
|= old_flags
; // adopt existing flags first!
11784 pending_flags
|= flags
;
11786 pending_flags
&= ~flags
;
11790 for (auto& id
: device_classes
) {
11791 auto old_flags
= osdmap
.get_device_class_flags(id
);
11792 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
11793 pending_flags
|= old_flags
;
11795 pending_flags
|= flags
;
11797 pending_flags
&= ~flags
;
11803 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11804 get_last_committed() + 1));
11807 } else if (prefix
== "osd pg-temp") {
11809 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11810 ss
<< "unable to parse 'pgid' value '"
11811 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11816 if (!pgid
.parse(pgidstr
.c_str())) {
11817 ss
<< "invalid pgid '" << pgidstr
<< "'";
11821 if (!osdmap
.pg_exists(pgid
)) {
11822 ss
<< "pg " << pgid
<< " does not exist";
11826 if (pending_inc
.new_pg_temp
.count(pgid
)) {
11827 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
11828 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11832 vector
<int64_t> id_vec
;
11833 vector
<int32_t> new_pg_temp
;
11834 cmd_getval(cmdmap
, "id", id_vec
);
11835 if (id_vec
.empty()) {
11836 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
11837 ss
<< "done cleaning up pg_temp of " << pgid
;
11840 for (auto osd
: id_vec
) {
11841 if (!osdmap
.exists(osd
)) {
11842 ss
<< "osd." << osd
<< " does not exist";
11846 new_pg_temp
.push_back(osd
);
11849 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11850 if ((int)new_pg_temp
.size() < pool_min_size
) {
11851 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
11852 << pool_min_size
<< ")";
11857 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11858 if ((int)new_pg_temp
.size() > pool_size
) {
11859 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
11860 << pool_size
<< ")";
11865 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
11866 new_pg_temp
.begin(), new_pg_temp
.end());
11867 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
11869 } else if (prefix
== "osd primary-temp") {
11871 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11872 ss
<< "unable to parse 'pgid' value '"
11873 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11878 if (!pgid
.parse(pgidstr
.c_str())) {
11879 ss
<< "invalid pgid '" << pgidstr
<< "'";
11883 if (!osdmap
.pg_exists(pgid
)) {
11884 ss
<< "pg " << pgid
<< " does not exist";
11890 if (!cmd_getval(cmdmap
, "id", osd
)) {
11891 ss
<< "unable to parse 'id' value '"
11892 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11896 if (osd
!= -1 && !osdmap
.exists(osd
)) {
11897 ss
<< "osd." << osd
<< " does not exist";
11902 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
11903 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
11904 ss
<< "require_min_compat_client "
11905 << osdmap
.require_min_compat_client
11906 << " < firefly, which is required for primary-temp";
11911 pending_inc
.new_primary_temp
[pgid
] = osd
;
11912 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
11914 } else if (prefix
== "pg repeer") {
11917 cmd_getval(cmdmap
, "pgid", pgidstr
);
11918 if (!pgid
.parse(pgidstr
.c_str())) {
11919 ss
<< "invalid pgid '" << pgidstr
<< "'";
11923 if (!osdmap
.pg_exists(pgid
)) {
11924 ss
<< "pg '" << pgidstr
<< "' does not exist";
11928 vector
<int> acting
;
11930 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
11933 ss
<< "pg currently has no primary";
11936 if (acting
.size() > 1) {
11937 // map to just primary; it will map back to what it wants
11938 pending_inc
.new_pg_temp
[pgid
] = { primary
};
11940 // hmm, pick another arbitrary osd to induce a change. Note
11941 // that this won't work if there is only one suitable OSD in the cluster.
11944 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
11945 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
11948 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
11954 ss
<< "not enough up OSDs in the cluster to force repeer";
11959 } else if (prefix
== "osd pg-upmap" ||
11960 prefix
== "osd rm-pg-upmap" ||
11961 prefix
== "osd pg-upmap-items" ||
11962 prefix
== "osd rm-pg-upmap-items") {
11963 if (osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
11964 ss
<< "min_compat_client "
11965 << osdmap
.require_min_compat_client
11966 << " < luminous, which is required for pg-upmap. "
11967 << "Try 'ceph osd set-require-min-compat-client luminous' "
11968 << "before using the new interface";
11972 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
11973 if (err
== -EAGAIN
)
11978 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11979 ss
<< "unable to parse 'pgid' value '"
11980 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11985 if (!pgid
.parse(pgidstr
.c_str())) {
11986 ss
<< "invalid pgid '" << pgidstr
<< "'";
11990 if (!osdmap
.pg_exists(pgid
)) {
11991 ss
<< "pg " << pgid
<< " does not exist";
11995 if (pending_inc
.old_pools
.count(pgid
.pool())) {
11996 ss
<< "pool of " << pgid
<< " is pending removal";
11999 wait_for_finished_proposal(op
,
12000 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
12008 OP_RM_PG_UPMAP_ITEMS
,
12011 if (prefix
== "osd pg-upmap") {
12012 option
= OP_PG_UPMAP
;
12013 } else if (prefix
== "osd rm-pg-upmap") {
12014 option
= OP_RM_PG_UPMAP
;
12015 } else if (prefix
== "osd pg-upmap-items") {
12016 option
= OP_PG_UPMAP_ITEMS
;
12018 option
= OP_RM_PG_UPMAP_ITEMS
;
12021 // check pending upmap changes
12023 case OP_PG_UPMAP
: // fall through
12024 case OP_RM_PG_UPMAP
:
12025 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
12026 pending_inc
.old_pg_upmap
.count(pgid
)) {
12027 dout(10) << __func__
<< " waiting for pending update on "
12029 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12034 case OP_PG_UPMAP_ITEMS
: // fall through
12035 case OP_RM_PG_UPMAP_ITEMS
:
12036 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
12037 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
12038 dout(10) << __func__
<< " waiting for pending update on "
12040 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12046 ceph_abort_msg("invalid option");
12052 vector
<int64_t> id_vec
;
12053 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12054 ss
<< "unable to parse 'id' value(s) '"
12055 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12060 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
12061 if ((int)id_vec
.size() < pool_min_size
) {
12062 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
12063 << pool_min_size
<< ")";
12068 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12069 if ((int)id_vec
.size() > pool_size
) {
12070 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
12071 << pool_size
<< ")";
12076 vector
<int32_t> new_pg_upmap
;
12077 for (auto osd
: id_vec
) {
12078 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
12079 ss
<< "osd." << osd
<< " does not exist";
12083 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
12084 if (it
!= new_pg_upmap
.end()) {
12085 ss
<< "osd." << osd
<< " already exists, ";
12088 new_pg_upmap
.push_back(osd
);
12091 if (new_pg_upmap
.empty()) {
12092 ss
<< "no valid upmap items(pairs) is specified";
12097 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
12098 new_pg_upmap
.begin(), new_pg_upmap
.end());
12099 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
12103 case OP_RM_PG_UPMAP
:
12105 pending_inc
.old_pg_upmap
.insert(pgid
);
12106 ss
<< "clear " << pgid
<< " pg_upmap mapping";
12110 case OP_PG_UPMAP_ITEMS
:
12112 vector
<int64_t> id_vec
;
12113 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12114 ss
<< "unable to parse 'id' value(s) '"
12115 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12120 if (id_vec
.size() % 2) {
12121 ss
<< "you must specify pairs of osd ids to be remapped";
12126 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12127 if ((int)(id_vec
.size() / 2) > pool_size
) {
12128 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
12129 << pool_size
<< ")";
12134 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
12135 ostringstream items
;
12137 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
12141 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
12144 if (!osdmap
.exists(from
)) {
12145 ss
<< "osd." << from
<< " does not exist";
12149 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
12150 ss
<< "osd." << to
<< " does not exist";
12154 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
12155 auto it
= std::find(new_pg_upmap_items
.begin(),
12156 new_pg_upmap_items
.end(), entry
);
12157 if (it
!= new_pg_upmap_items
.end()) {
12158 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
12161 new_pg_upmap_items
.push_back(entry
);
12162 items
<< from
<< "->" << to
<< ",";
12164 string
out(items
.str());
12165 out
.resize(out
.size() - 1); // drop last ','
12168 if (new_pg_upmap_items
.empty()) {
12169 ss
<< "no valid upmap items(pairs) is specified";
12174 pending_inc
.new_pg_upmap_items
[pgid
] =
12175 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
12176 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
12177 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
12181 case OP_RM_PG_UPMAP_ITEMS
:
12183 pending_inc
.old_pg_upmap_items
.insert(pgid
);
12184 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
12189 ceph_abort_msg("invalid option");
12193 } else if (prefix
== "osd primary-affinity") {
12195 if (!cmd_getval(cmdmap
, "id", id
)) {
12196 ss
<< "invalid osd id value '"
12197 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12202 if (!cmd_getval(cmdmap
, "weight", w
)) {
12203 ss
<< "unable to parse 'weight' value '"
12204 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12208 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
12210 ss
<< "weight must be >= 0";
12214 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12215 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12216 ss
<< "require_min_compat_client "
12217 << osdmap
.require_min_compat_client
12218 << " < firefly, which is required for primary-affinity";
12222 if (osdmap
.exists(id
)) {
12223 pending_inc
.new_primary_affinity
[id
] = ww
;
12224 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << std::ios::hex
<< ww
<< std::ios::dec
<< ")";
12226 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12227 get_last_committed() + 1));
12230 ss
<< "osd." << id
<< " does not exist";
12234 } else if (prefix
== "osd reweight") {
12236 if (!cmd_getval(cmdmap
, "id", id
)) {
12237 ss
<< "unable to parse osd id value '"
12238 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12243 if (!cmd_getval(cmdmap
, "weight", w
)) {
12244 ss
<< "unable to parse weight value '"
12245 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12249 long ww
= (int)((double)CEPH_OSD_IN
*w
);
12251 ss
<< "weight must be >= 0";
12255 if (osdmap
.exists(id
)) {
12256 pending_inc
.new_weight
[id
] = ww
;
12257 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
12259 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12260 get_last_committed() + 1));
12263 ss
<< "osd." << id
<< " does not exist";
12267 } else if (prefix
== "osd reweightn") {
12268 map
<int32_t, uint32_t> weights
;
12269 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
12271 ss
<< "unable to parse 'weights' value '"
12272 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
12275 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
12276 wait_for_finished_proposal(
12278 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
12280 } else if (prefix
== "osd lost") {
12282 if (!cmd_getval(cmdmap
, "id", id
)) {
12283 ss
<< "unable to parse osd id value '"
12284 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12289 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12291 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
12292 "--yes-i-really-mean-it if you really do.";
12295 } else if (!osdmap
.exists(id
)) {
12296 ss
<< "osd." << id
<< " does not exist";
12299 } else if (!osdmap
.is_down(id
)) {
12300 ss
<< "osd." << id
<< " is not down";
12304 epoch_t e
= osdmap
.get_info(id
).down_at
;
12305 pending_inc
.new_lost
[id
] = e
;
12306 ss
<< "marked osd lost in epoch " << e
;
12308 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12309 get_last_committed() + 1));
12313 } else if (prefix
== "osd destroy-actual" ||
12314 prefix
== "osd purge-actual" ||
12315 prefix
== "osd purge-new") {
12316 /* Destroying an OSD means that we don't expect to further make use of
12317 * the OSDs data (which may even become unreadable after this operation),
12318 * and that we are okay with scrubbing all its cephx keys and config-key
12319 * data (which may include lockbox keys, thus rendering the osd's data
12322 * The OSD will not be removed. Instead, we will mark it as destroyed,
12323 * such that a subsequent call to `create` will not reuse the osd id.
12324 * This will play into being able to recreate the OSD, at the same
12325 * crush location, with minimal data movement.
12328 // make sure authmon is writeable.
12329 if (!mon
.authmon()->is_writeable()) {
12330 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12331 << "osd destroy" << dendl
;
12332 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12337 if (!cmd_getval(cmdmap
, "id", id
)) {
12338 auto p
= cmdmap
.find("id");
12339 if (p
== cmdmap
.end()) {
12340 ss
<< "no osd id specified";
12342 ss
<< "unable to parse osd id value '"
12343 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12349 bool is_destroy
= (prefix
== "osd destroy-actual");
12351 ceph_assert("osd purge-actual" == prefix
||
12352 "osd purge-new" == prefix
);
12356 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12358 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12359 << "This will mean real, permanent data loss, as well "
12360 << "as deletion of cephx and lockbox keys. "
12361 << "Pass --yes-i-really-mean-it if you really do.";
12364 } else if (!osdmap
.exists(id
)) {
12365 ss
<< "osd." << id
<< " does not exist";
12366 err
= 0; // idempotent
12368 } else if (osdmap
.is_up(id
)) {
12369 ss
<< "osd." << id
<< " is not `down`.";
12372 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12373 ss
<< "destroyed osd." << id
;
12378 if (prefix
== "osd purge-new" &&
12379 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12380 ss
<< "osd." << id
<< " is not new";
12385 bool goto_reply
= false;
12389 err
= prepare_command_osd_destroy(id
, ss
);
12390 // we checked above that it should exist.
12391 ceph_assert(err
!= -ENOENT
);
12393 err
= prepare_command_osd_purge(id
, ss
);
12394 if (err
== -ENOENT
) {
12396 ss
<< "osd." << id
<< " does not exist.";
12402 if (err
< 0 || goto_reply
) {
12407 ss
<< "destroyed osd." << id
;
12409 ss
<< "purged osd." << id
;
12413 wait_for_finished_proposal(op
,
12414 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12415 force_immediate_propose();
12418 } else if (prefix
== "osd new") {
12420 // make sure authmon is writeable.
12421 if (!mon
.authmon()->is_writeable()) {
12422 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12423 << "osd new" << dendl
;
12424 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12428 map
<string
,string
> param_map
;
12430 bufferlist bl
= m
->get_data();
12431 string param_json
= bl
.to_str();
12432 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12434 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12438 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12441 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12454 if (err
== EEXIST
) {
12455 // idempotent operation
12460 wait_for_finished_proposal(op
,
12461 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12462 get_last_committed() + 1));
12463 force_immediate_propose();
12466 } else if (prefix
== "osd create") {
12468 // optional id provided?
12469 int64_t id
= -1, cmd_id
= -1;
12470 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12472 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12476 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12481 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12482 if (!uuid
.parse(uuidstr
.c_str())) {
12483 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12487 // we only care about the id if we also have the uuid, to
12488 // ensure the operation's idempotency.
12492 int32_t new_id
= -1;
12493 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12495 if (err
== -EAGAIN
) {
12496 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12499 // a check has failed; reply to the user.
12502 } else if (err
== EEXIST
) {
12503 // this is an idempotent operation; we can go ahead and reply.
12505 f
->open_object_section("created_osd");
12506 f
->dump_int("osdid", new_id
);
12507 f
->close_section();
12517 string empty_device_class
;
12518 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12521 f
->open_object_section("created_osd");
12522 f
->dump_int("osdid", new_id
);
12523 f
->close_section();
12529 wait_for_finished_proposal(op
,
12530 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12531 get_last_committed() + 1));
12534 } else if (prefix
== "osd blocklist clear" ||
12535 prefix
== "osd blacklist clear") {
12536 pending_inc
.new_blocklist
.clear();
12537 std::list
<std::pair
<entity_addr_t
,utime_t
> > blocklist
;
12538 osdmap
.get_blocklist(&blocklist
);
12539 for (const auto &entry
: blocklist
) {
12540 pending_inc
.old_blocklist
.push_back(entry
.first
);
12542 ss
<< " removed all blocklist entries";
12544 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12545 get_last_committed() + 1));
12547 } else if (prefix
== "osd blocklist" ||
12548 prefix
== "osd blacklist") {
12550 cmd_getval(cmdmap
, "addr", addrstr
);
12551 entity_addr_t addr
;
12552 if (!addr
.parse(addrstr
.c_str(), 0)) {
12553 ss
<< "unable to parse address " << addrstr
;
12558 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12559 // always blocklist type ANY
12560 addr
.set_type(entity_addr_t::TYPE_ANY
);
12562 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12565 string blocklistop
;
12566 if (!cmd_getval(cmdmap
, "blocklistop", blocklistop
)) {
12567 cmd_getval(cmdmap
, "blacklistop", blocklistop
);
12569 if (blocklistop
== "add") {
12570 utime_t expires
= ceph_clock_now();
12572 // default one hour
12573 cmd_getval(cmdmap
, "expire", d
,
12574 g_conf()->mon_osd_blocklist_default_expire
);
12577 pending_inc
.new_blocklist
[addr
] = expires
;
12580 // cancel any pending un-blocklisting request too
12581 auto it
= std::find(pending_inc
.old_blocklist
.begin(),
12582 pending_inc
.old_blocklist
.end(), addr
);
12583 if (it
!= pending_inc
.old_blocklist
.end()) {
12584 pending_inc
.old_blocklist
.erase(it
);
12588 ss
<< "blocklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12590 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12591 get_last_committed() + 1));
12593 } else if (blocklistop
== "rm") {
12594 if (osdmap
.is_blocklisted(addr
) ||
12595 pending_inc
.new_blocklist
.count(addr
)) {
12596 if (osdmap
.is_blocklisted(addr
))
12597 pending_inc
.old_blocklist
.push_back(addr
);
12599 pending_inc
.new_blocklist
.erase(addr
);
12600 ss
<< "un-blocklisting " << addr
;
12602 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12603 get_last_committed() + 1));
12606 ss
<< addr
<< " isn't blocklisted";
12611 } else if (prefix
== "osd pool mksnap") {
12613 cmd_getval(cmdmap
, "pool", poolstr
);
12614 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12616 ss
<< "unrecognized pool '" << poolstr
<< "'";
12621 cmd_getval(cmdmap
, "snap", snapname
);
12622 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12623 if (p
->is_unmanaged_snaps_mode()) {
12624 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12627 } else if (p
->snap_exists(snapname
.c_str())) {
12628 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12631 } else if (p
->is_tier()) {
12632 ss
<< "pool " << poolstr
<< " is a cache tier";
12637 if (pending_inc
.new_pools
.count(pool
))
12638 pp
= &pending_inc
.new_pools
[pool
];
12640 pp
= &pending_inc
.new_pools
[pool
];
12643 if (pp
->snap_exists(snapname
.c_str())) {
12644 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12646 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
12647 pp
->set_snap_epoch(pending_inc
.epoch
);
12648 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
12651 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12652 get_last_committed() + 1));
12654 } else if (prefix
== "osd pool rmsnap") {
12656 cmd_getval(cmdmap
, "pool", poolstr
);
12657 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12659 ss
<< "unrecognized pool '" << poolstr
<< "'";
12664 cmd_getval(cmdmap
, "snap", snapname
);
12665 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12666 if (p
->is_unmanaged_snaps_mode()) {
12667 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12670 } else if (!p
->snap_exists(snapname
.c_str())) {
12671 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
12676 if (pending_inc
.new_pools
.count(pool
))
12677 pp
= &pending_inc
.new_pools
[pool
];
12679 pp
= &pending_inc
.new_pools
[pool
];
12682 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
12684 pp
->remove_snap(sn
);
12685 pp
->set_snap_epoch(pending_inc
.epoch
);
12686 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
12688 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
12691 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12692 get_last_committed() + 1));
12694 } else if (prefix
== "osd pool create") {
12695 int64_t pg_num
, pg_num_min
;
12697 cmd_getval(cmdmap
, "pg_num", pg_num
, int64_t(0));
12698 cmd_getval(cmdmap
, "pgp_num", pgp_num
, pg_num
);
12699 cmd_getval(cmdmap
, "pg_num_min", pg_num_min
, int64_t(0));
12701 string pool_type_str
;
12702 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
12703 if (pool_type_str
.empty())
12704 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
12707 cmd_getval(cmdmap
, "pool", poolstr
);
12708 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12709 if (pool_id
>= 0) {
12710 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12711 if (pool_type_str
!= p
->get_type_name()) {
12712 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
12715 ss
<< "pool '" << poolstr
<< "' already exists";
12722 if (pool_type_str
== "replicated") {
12723 pool_type
= pg_pool_t::TYPE_REPLICATED
;
12724 } else if (pool_type_str
== "erasure") {
12725 pool_type
= pg_pool_t::TYPE_ERASURE
;
12727 ss
<< "unknown pool type '" << pool_type_str
<< "'";
12732 bool implicit_rule_creation
= false;
12733 int64_t expected_num_objects
= 0;
12735 cmd_getval(cmdmap
, "rule", rule_name
);
12736 string erasure_code_profile
;
12737 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
12739 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
12740 if (erasure_code_profile
== "")
12741 erasure_code_profile
= "default";
12742 //handle the erasure code profile
12743 if (erasure_code_profile
== "default") {
12744 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
12745 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
12746 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
12750 map
<string
,string
> profile_map
;
12751 err
= osdmap
.get_erasure_code_profile_default(cct
,
12756 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
12757 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
12761 if (rule_name
== "") {
12762 implicit_rule_creation
= true;
12763 if (erasure_code_profile
== "default") {
12764 rule_name
= "erasure-code";
12766 dout(1) << "implicitly use rule named after the pool: "
12767 << poolstr
<< dendl
;
12768 rule_name
= poolstr
;
12771 cmd_getval(cmdmap
, "expected_num_objects",
12772 expected_num_objects
, int64_t(0));
12774 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12775 // and put expected_num_objects to rule field
12776 if (erasure_code_profile
!= "") { // cmd is from CLI
12777 if (rule_name
!= "") {
12779 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
12780 if (interr
.length()) {
12781 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
12786 rule_name
= erasure_code_profile
;
12787 } else { // cmd is well-formed
12788 cmd_getval(cmdmap
, "expected_num_objects",
12789 expected_num_objects
, int64_t(0));
12793 if (!implicit_rule_creation
&& rule_name
!= "") {
12795 err
= get_crush_rule(rule_name
, &rule
, &ss
);
12796 if (err
== -EAGAIN
) {
12797 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12804 if (expected_num_objects
< 0) {
12805 ss
<< "'expected_num_objects' must be non-negative";
12811 osdmap
.get_all_osds(osds
);
12812 bool has_filestore_osd
= std::any_of(osds
.begin(), osds
.end(), [this](int osd
) {
12814 if (!get_osd_objectstore_type(osd
, &type
)) {
12815 return type
== "filestore";
12821 if (has_filestore_osd
&&
12822 expected_num_objects
> 0 &&
12823 cct
->_conf
->filestore_merge_threshold
> 0) {
12824 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12829 if (has_filestore_osd
&&
12830 expected_num_objects
== 0 &&
12831 cct
->_conf
->filestore_merge_threshold
< 0) {
12832 int osds
= osdmap
.get_num_osds();
12834 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12835 if (!sure
&& osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
12836 ss
<< "For better initial performance on pools expected to store a "
12837 << "large number of objects, consider supplying the "
12838 << "expected_num_objects parameter when creating the pool."
12839 << " Pass --yes-i-really-mean-it to ignore it";
12845 int64_t fast_read_param
;
12846 cmd_getval(cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
12847 FastReadType fast_read
= FAST_READ_DEFAULT
;
12848 if (fast_read_param
== 0)
12849 fast_read
= FAST_READ_OFF
;
12850 else if (fast_read_param
> 0)
12851 fast_read
= FAST_READ_ON
;
12853 int64_t repl_size
= 0;
12854 cmd_getval(cmdmap
, "size", repl_size
);
12855 int64_t target_size_bytes
= 0;
12856 double target_size_ratio
= 0.0;
12857 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
12858 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
12860 string pg_autoscale_mode
;
12861 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
12863 err
= prepare_new_pool(poolstr
,
12864 -1, // default crush rule
12866 pg_num
, pgp_num
, pg_num_min
,
12867 repl_size
, target_size_bytes
, target_size_ratio
,
12868 erasure_code_profile
, pool_type
,
12869 (uint64_t)expected_num_objects
,
12876 ss
<< "pool '" << poolstr
<< "' already exists";
12879 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12888 ss
<< "pool '" << poolstr
<< "' created";
12891 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12892 get_last_committed() + 1));
12895 } else if (prefix
== "osd pool delete" ||
12896 prefix
== "osd pool rm") {
12897 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12898 string poolstr
, poolstr2
, sure
;
12899 cmd_getval(cmdmap
, "pool", poolstr
);
12900 cmd_getval(cmdmap
, "pool2", poolstr2
);
12901 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12903 ss
<< "pool '" << poolstr
<< "' does not exist";
12908 bool force_no_fake
= false;
12909 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
12910 bool force
= false;
12911 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
12912 if (poolstr2
!= poolstr
||
12913 (!force
&& !force_no_fake
)) {
12914 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12915 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12916 << "followed by --yes-i-really-really-mean-it.";
12920 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
12921 if (err
== -EAGAIN
) {
12922 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12928 } else if (prefix
== "osd pool rename") {
12929 string srcpoolstr
, destpoolstr
;
12930 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
12931 cmd_getval(cmdmap
, "destpool", destpoolstr
);
12932 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
12933 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
12935 if (pool_src
< 0) {
12936 if (pool_dst
>= 0) {
12937 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12938 // of operations, assume this rename succeeded, as it is not changing
12939 // the current state. Make sure we output something understandable
12940 // for whoever is issuing the command, if they are paying attention,
12941 // in case it was not intentional; or to avoid a "wtf?" and a bug
12942 // report in case it was intentional, while expecting a failure.
12943 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
12944 << destpoolstr
<< "' does -- assuming successful rename";
12947 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
12951 } else if (pool_dst
>= 0) {
12952 // source pool exists and so does the destination pool
12953 ss
<< "pool '" << destpoolstr
<< "' already exists";
12958 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
12960 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
12962 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
12963 << cpp_strerror(ret
);
12966 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
12967 get_last_committed() + 1));
12970 } else if (prefix
== "osd pool set") {
12971 err
= prepare_command_pool_set(cmdmap
, ss
);
12972 if (err
== -EAGAIN
)
12978 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12979 get_last_committed() + 1));
12981 } else if (prefix
== "osd tier add") {
12982 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12983 if (err
== -EAGAIN
)
12988 cmd_getval(cmdmap
, "pool", poolstr
);
12989 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12991 ss
<< "unrecognized pool '" << poolstr
<< "'";
12995 string tierpoolstr
;
12996 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12997 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12998 if (tierpool_id
< 0) {
12999 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13003 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13005 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13008 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13012 // make sure new tier is empty
13013 string force_nonempty
;
13014 cmd_getval(cmdmap
, "force_nonempty", force_nonempty
);
13015 const pool_stat_t
*pstats
= mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13016 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
13017 force_nonempty
!= "--force-nonempty") {
13018 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
13022 if (tp
->is_erasure()) {
13023 ss
<< "tier pool '" << tierpoolstr
13024 << "' is an ec pool, which cannot be a tier";
13028 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
13029 ((force_nonempty
!= "--force-nonempty") ||
13030 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
13031 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
13036 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13037 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13038 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13039 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13042 np
->tiers
.insert(tierpool_id
);
13043 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13044 ntp
->tier_of
= pool_id
;
13045 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
13046 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13047 get_last_committed() + 1));
13049 } else if (prefix
== "osd tier remove" ||
13050 prefix
== "osd tier rm") {
13052 cmd_getval(cmdmap
, "pool", poolstr
);
13053 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13055 ss
<< "unrecognized pool '" << poolstr
<< "'";
13059 string tierpoolstr
;
13060 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13061 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13062 if (tierpool_id
< 0) {
13063 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13067 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13069 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13072 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
13076 if (p
->tiers
.count(tierpool_id
) == 0) {
13077 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13081 if (tp
->tier_of
!= pool_id
) {
13082 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
13083 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
13084 // be scary about it; this is an inconsistency and bells must go off
13085 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13089 if (p
->read_tier
== tierpool_id
) {
13090 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
13095 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13096 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13097 if (np
->tiers
.count(tierpool_id
) == 0 ||
13098 ntp
->tier_of
!= pool_id
||
13099 np
->read_tier
== tierpool_id
) {
13100 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13103 np
->tiers
.erase(tierpool_id
);
13105 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13106 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13107 get_last_committed() + 1));
13109 } else if (prefix
== "osd tier set-overlay") {
13110 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13111 if (err
== -EAGAIN
)
13116 cmd_getval(cmdmap
, "pool", poolstr
);
13117 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13119 ss
<< "unrecognized pool '" << poolstr
<< "'";
13123 string overlaypoolstr
;
13124 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
13125 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
13126 if (overlaypool_id
< 0) {
13127 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
13131 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13133 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
13134 ceph_assert(overlay_p
);
13135 if (p
->tiers
.count(overlaypool_id
) == 0) {
13136 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
13140 if (p
->read_tier
== overlaypool_id
) {
13142 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13145 if (p
->has_read_tier()) {
13146 ss
<< "pool '" << poolstr
<< "' has overlay '"
13147 << osdmap
.get_pool_name(p
->read_tier
)
13148 << "'; please remove-overlay first";
13154 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13155 np
->read_tier
= overlaypool_id
;
13156 np
->write_tier
= overlaypool_id
;
13157 np
->set_last_force_op_resend(pending_inc
.epoch
);
13158 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
13159 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
13160 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13161 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
13162 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
13163 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13164 get_last_committed() + 1));
13166 } else if (prefix
== "osd tier remove-overlay" ||
13167 prefix
== "osd tier rm-overlay") {
13169 cmd_getval(cmdmap
, "pool", poolstr
);
13170 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13172 ss
<< "unrecognized pool '" << poolstr
<< "'";
13176 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13178 if (!p
->has_read_tier()) {
13180 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13184 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
13189 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13190 if (np
->has_read_tier()) {
13191 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
13192 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
13193 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13195 if (np
->has_write_tier()) {
13196 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
13197 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
13198 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13200 np
->clear_read_tier();
13201 np
->clear_write_tier();
13202 np
->set_last_force_op_resend(pending_inc
.epoch
);
13203 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13204 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13205 get_last_committed() + 1));
13207 } else if (prefix
== "osd tier cache-mode") {
13208 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13209 if (err
== -EAGAIN
)
13214 cmd_getval(cmdmap
, "pool", poolstr
);
13215 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13217 ss
<< "unrecognized pool '" << poolstr
<< "'";
13221 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13223 if (!p
->is_tier()) {
13224 ss
<< "pool '" << poolstr
<< "' is not a tier";
13229 cmd_getval(cmdmap
, "mode", modestr
);
13230 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13231 if (int(mode
) < 0) {
13232 ss
<< "'" << modestr
<< "' is not a valid cache mode";
13238 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13240 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
13241 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
13242 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
13246 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13247 mode
!= pg_pool_t::CACHEMODE_NONE
&&
13248 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13249 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
13251 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
13252 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13257 // pool already has this cache-mode set and there are no pending changes
13258 if (p
->cache_mode
== mode
&&
13259 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
13260 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
13261 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
13262 << " to " << pg_pool_t::get_cache_mode_name(mode
);
13267 /* Mode description:
13269 * none: No cache-mode defined
13270 * forward: Forward all reads and writes to base pool [removed]
13271 * writeback: Cache writes, promote reads from base pool
13272 * readonly: Forward writes to base pool
13273 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13274 * proxy: Proxy all reads and writes to base pool
13275 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13277 * Hence, these are the allowed transitions:
13280 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13281 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13282 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13283 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13284 * writeback -> readproxy || proxy
13288 // We check if the transition is valid against the current pool mode, as
13289 // it is the only committed state thus far. We will blantly squash
13290 // whatever mode is on the pending state.
13292 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
13293 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13294 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
13295 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
13296 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
13297 << "' pool; only '"
13298 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
13303 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
13304 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13305 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13306 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13308 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
13309 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13310 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13312 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13313 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13314 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13316 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13317 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13318 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13319 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13321 const pool_stat_t
* pstats
=
13322 mon
.mgrstatmon()->get_pool_stat(pool_id
);
13324 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13325 ss
<< "unable to set cache-mode '"
13326 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13327 << "': dirty objects found";
13333 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13334 np
->cache_mode
= mode
;
13335 // set this both when moving to and from cache_mode NONE. this is to
13336 // capture legacy pools that were set up before this flag existed.
13337 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13338 ss
<< "set cache-mode for pool '" << poolstr
13339 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13340 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13341 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13342 ceph_assert(base_pool
);
13343 if (base_pool
->read_tier
== pool_id
||
13344 base_pool
->write_tier
== pool_id
)
13345 ss
<<" (WARNING: pool is still configured as read or write tier)";
13347 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13348 get_last_committed() + 1));
13350 } else if (prefix
== "osd tier add-cache") {
13351 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13352 if (err
== -EAGAIN
)
13357 cmd_getval(cmdmap
, "pool", poolstr
);
13358 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13360 ss
<< "unrecognized pool '" << poolstr
<< "'";
13364 string tierpoolstr
;
13365 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13366 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13367 if (tierpool_id
< 0) {
13368 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13372 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13374 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13377 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13382 if (!cmd_getval(cmdmap
, "size", size
)) {
13383 ss
<< "unable to parse 'size' value '"
13384 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13388 // make sure new tier is empty
13389 const pool_stat_t
*pstats
=
13390 mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13391 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13392 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13396 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13397 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13398 if (int(mode
) < 0) {
13399 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13403 HitSet::Params hsp
;
13404 auto& cache_hit_set_type
=
13405 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13406 if (cache_hit_set_type
== "bloom") {
13407 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13408 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13409 hsp
= HitSet::Params(bsp
);
13410 } else if (cache_hit_set_type
== "explicit_hash") {
13411 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13412 } else if (cache_hit_set_type
== "explicit_object") {
13413 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13415 ss
<< "osd tier cache default hit set type '"
13416 << cache_hit_set_type
<< "' is not a known type";
13421 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13422 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13423 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13424 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13427 np
->tiers
.insert(tierpool_id
);
13428 np
->read_tier
= np
->write_tier
= tierpool_id
;
13429 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13430 np
->set_last_force_op_resend(pending_inc
.epoch
);
13431 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13432 ntp
->tier_of
= pool_id
;
13433 ntp
->cache_mode
= mode
;
13434 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13435 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13436 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13437 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13438 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13439 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13440 ntp
->hit_set_params
= hsp
;
13441 ntp
->target_max_bytes
= size
;
13442 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13443 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13444 get_last_committed() + 1));
13446 } else if (prefix
== "osd pool set-quota") {
13448 cmd_getval(cmdmap
, "pool", poolstr
);
13449 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13451 ss
<< "unrecognized pool '" << poolstr
<< "'";
13457 cmd_getval(cmdmap
, "field", field
);
13458 if (field
!= "max_objects" && field
!= "max_bytes") {
13459 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13464 // val could contain unit designations, so we treat as a string
13466 cmd_getval(cmdmap
, "val", val
);
13469 if (field
== "max_objects") {
13470 value
= strict_sistrtoll(val
.c_str(), &tss
);
13471 } else if (field
== "max_bytes") {
13472 value
= strict_iecstrtoll(val
.c_str(), &tss
);
13474 ceph_abort_msg("unrecognized option");
13476 if (!tss
.empty()) {
13477 ss
<< "error parsing value '" << val
<< "': " << tss
;
13482 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13483 if (field
== "max_objects") {
13484 pi
->quota_max_objects
= value
;
13485 } else if (field
== "max_bytes") {
13486 pi
->quota_max_bytes
= value
;
13488 ceph_abort_msg("unrecognized option");
13490 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13492 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13493 get_last_committed() + 1));
13495 } else if (prefix
== "osd pool application enable" ||
13496 prefix
== "osd pool application disable" ||
13497 prefix
== "osd pool application set" ||
13498 prefix
== "osd pool application rm") {
13499 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13500 if (err
== -EAGAIN
) {
13502 } else if (err
< 0) {
13507 } else if (prefix
== "osd force-create-pg") {
13510 cmd_getval(cmdmap
, "pgid", pgidstr
);
13511 if (!pgid
.parse(pgidstr
.c_str())) {
13512 ss
<< "invalid pgid '" << pgidstr
<< "'";
13516 if (!osdmap
.pg_exists(pgid
)) {
13517 ss
<< "pg " << pgid
<< " should not exist";
13522 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13524 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13525 << "that the cluster will give up ever trying to recover the lost data. Do this "
13526 << "only if you are certain that all copies of the PG are in fact lost and you are "
13527 << "willing to accept that the data is permanently destroyed. Pass "
13528 << "--yes-i-really-mean-it to proceed.";
13534 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13535 auto emplaced
= creating_pgs
.pgs
.emplace(
13537 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13538 ceph_clock_now()));
13539 creating_now
= emplaced
.second
;
13541 if (creating_now
) {
13542 ss
<< "pg " << pgidstr
<< " now creating, ok";
13543 // set the pool's CREATING flag so that (1) the osd won't ignore our
13544 // create message and (2) we won't propose any future pg_num changes
13545 // until after the PG has been instantiated.
13546 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13547 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13549 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13553 ss
<< "pg " << pgid
<< " already creating";
13557 } else if (prefix
== "osd force_healthy_stretch_mode") {
13559 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13561 ss
<< "This command will require peering across multiple CRUSH buckets "
13562 "(probably two data centers or availability zones?) and may result in PGs "
13563 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13567 try_end_recovery_stretch_mode(true);
13568 ss
<< "Triggering healthy stretch mode";
13571 } else if (prefix
== "osd force_recovery_stretch_mode") {
13573 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13575 ss
<< "This command will increase pool sizes to try and spread them "
13576 "across multiple CRUSH buckets (probably two data centers or "
13577 "availability zones?) and should have happened automatically"
13578 "Pass --yes-i-really-mean-it to proceed.";
13582 mon
.go_recovery_stretch_mode();
13583 ss
<< "Triggering recovery stretch mode";
13592 if (err
< 0 && rs
.length() == 0)
13593 rs
= cpp_strerror(err
);
13594 mon
.reply_command(op
, err
, rs
, rdata
, get_last_committed());
13599 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13600 get_last_committed() + 1));
13604 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// Capability gate for incoming MPoolOp messages: returns whether the op was
// rejected (replied to) here. Unmanaged-snap create/delete ops get the
// stricter is_unmanaged_snap_op_permitted() check; all other pool ops only
// require the 'osd' MON_CAP_W capability on the session.
// NOTE(review): this extract is missing intermediate lines (braces, returns,
// the surrounding switch header) — comments below describe only what is visible.
13608 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
// Record this op against the osdmon for event tracing.
13610 op
->mark_osdmon_event(__func__
);
// The request payload is an MPoolOp; the session identifies the caller's caps.
13612 auto m
= op
->get_req
<MPoolOp
>();
13613 MonSession
*session
= op
->get_session();
// Reject with -EPERM at the current osdmap epoch.
// NOTE(review): the guarding condition for this reply (presumably a null
// session check) is not present in this extract — confirm against the full file.
13615 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Unmanaged snapshot ops: look up the target pool's name (if the pool still
// exists) so the permission check can be made per-pool.
13620 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13621 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13623 const std::string
* pool_name
= nullptr;
13624 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
13625 if (pg_pool
!= nullptr) {
13626 pool_name
= &osdmap
.get_pool_name(m
->pool
);
// Consult the key server / entity caps / peer address to decide whether this
// entity may perform unmanaged-snap operations.
13629 if (!is_unmanaged_snap_op_permitted(cct
, mon
.key_server
,
13630 session
->entity_name
, session
->caps
,
13631 session
->get_peer_socket_addr(),
// Log the refused message and its caps at level 0, then reply -EPERM.
13633 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13634 << "privileges. message: " << *m
<< std::endl
13635 << "caps: " << session
->caps
<< dendl
;
13636 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// All other pool ops: plain write capability on the "osd" service is required.
13642 if (!session
->is_capable("osd", MON_CAP_W
)) {
13643 dout(0) << "got pool op from entity with insufficient privileges. "
13644 << "message: " << *m
<< std::endl
13645 << "caps: " << session
->caps
<< dendl
;
13646 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Read-only fast path for MPoolOp: answer no-op / invalid requests directly
// from the committed osdmap without proposing a map change. Returns true when
// the op was fully handled here (reply sent); otherwise the op falls through
// to prepare_pool_op() for a paxos proposal.
// NOTE(review): this extract drops many original lines (switch header,
// returns, braces) — comments describe only the visible statements.
13655 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
13657 op
->mark_osdmon_event(__func__
);
13658 auto m
= op
->get_req
<MPoolOp
>();
// Caps check first; enforce_pool_op_caps() replies itself on rejection.
13660 if (enforce_pool_op_caps(op
)) {
// Drop messages from a different cluster: fsid must match our monmap's.
13664 if (m
->fsid
!= mon
.monmap
->fsid
) {
13665 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
13666 << " != " << mon
.monmap
->fsid
<< " for " << *m
<< dendl
;
13667 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Pool creation has its own idempotency preprocessing.
13671 if (m
->op
== POOL_OP_CREATE
)
13672 return preprocess_pool_op_create(op
);
// Every remaining op targets an existing pool; resolve it now.
13674 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
13675 if (p
== nullptr) {
13676 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
// Deleting a pool that is already gone succeeds (idempotent, reply 0);
// anything else on a missing pool is -ENOENT.
13677 if (m
->op
== POOL_OP_DELETE
) {
13678 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13680 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13685 // check if the snap and snapname exist
13686 bool snap_exists
= false;
13687 if (p
->snap_exists(m
->name
.c_str()))
13688 snap_exists
= true;
// Pool (managed) snap creation is invalid on unmanaged-snaps pools and on
// cache tiers; creating an already-existing snap is an idempotent success.
13691 case POOL_OP_CREATE_SNAP
:
13692 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
13693 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13697 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Unmanaged snap creation conflicts with pool-snaps mode.
13701 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13702 if (p
->is_pool_snaps_mode()) {
13703 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Managed snap deletion: invalid on unmanaged-snaps pools; deleting a snap
// that does not exist is an idempotent success (reply 0).
13707 case POOL_OP_DELETE_SNAP
:
13708 if (p
->is_unmanaged_snaps_mode()) {
13709 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13712 if (!snap_exists
) {
13713 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Unmanaged snap deletion: invalid in pool-snaps mode; a snapid already
// removed/purged in the committed map is an idempotent success.
13717 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13718 if (p
->is_pool_snaps_mode()) {
13719 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13722 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
13723 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Pool delete by name: if a pool with this name still resolves, reply 0 here.
// NOTE(review): the condition reads ">= 0" (pool exists) — the actual
// success/forward semantics depend on lines missing from this extract; verify.
13727 case POOL_OP_DELETE
:
13728 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
13729 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Legacy auid-change op (body not visible in this extract).
13733 case POOL_OP_AUID_CHANGE
:
// Whether (pool, snap) is already removed in the *committed* osdmap state:
// the pool no longer exists, the snap sits in the removed_snaps_queue, or it
// has been recorded as purged. Each branch logs its reason at dout(10).
// NOTE(review): the return statements are missing from this extract; the
// boolean returned by each branch must be confirmed against the full file.
13743 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
// Pool gone entirely -> the snap is trivially "removed".
13745 if (!osdmap
.have_pg_pool(pool
)) {
13746 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13747 << " - pool dne" << dendl
;
// Snap queued for removal in the committed map.
13750 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
13751 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13752 << " - in osdmap removed_snaps_queue" << dendl
;
// Finally, check the purged-snaps records; on a hit, [begin, end) is the
// purged interval containing the snap.
13755 snapid_t begin
, end
;
13756 int r
= lookup_purged_snap(pool
, snap
, &begin
, &end
);
13758 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13759 << " - purged, [" << begin
<< "," << end
<< ")" << dendl
;
13765 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
13767 if (pending_inc
.old_pools
.count(pool
)) {
13768 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13769 << " - pool pending deletion" << dendl
;
13772 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
13773 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13774 << " - in pending new_removed_snaps" << dendl
;
13780 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
13782 op
->mark_osdmon_event(__func__
);
13783 auto m
= op
->get_req
<MPoolOp
>();
13784 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
13786 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13793 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
13795 op
->mark_osdmon_event(__func__
);
13796 auto m
= op
->get_req
<MPoolOp
>();
13797 dout(10) << "prepare_pool_op " << *m
<< dendl
;
13798 if (m
->op
== POOL_OP_CREATE
) {
13799 return prepare_pool_op_create(op
);
13800 } else if (m
->op
== POOL_OP_DELETE
) {
13801 return prepare_pool_op_delete(op
);
13805 bool changed
= false;
13807 if (!osdmap
.have_pg_pool(m
->pool
)) {
13808 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13812 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
13815 case POOL_OP_CREATE_SNAP
:
13816 if (pool
->is_tier()) {
13818 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13820 } // else, fall through
13821 case POOL_OP_DELETE_SNAP
:
13822 if (!pool
->is_unmanaged_snaps_mode()) {
13823 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
13824 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
13825 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
13833 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13836 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13837 // we won't allow removal of an unmanaged snapshot from a pool
13838 // not in unmanaged snaps mode.
13839 if (!pool
->is_unmanaged_snaps_mode()) {
13840 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
13844 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13845 // but we will allow creating an unmanaged snapshot on any pool
13846 // as long as it is not in 'pool' snaps mode.
13847 if (pool
->is_pool_snaps_mode()) {
13848 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13853 // projected pool info
13855 if (pending_inc
.new_pools
.count(m
->pool
))
13856 pp
= pending_inc
.new_pools
[m
->pool
];
13858 pp
= *osdmap
.get_pg_pool(m
->pool
);
13860 bufferlist reply_data
;
13862 // pool snaps vs unmanaged snaps are mutually exclusive
13864 case POOL_OP_CREATE_SNAP
:
13865 case POOL_OP_DELETE_SNAP
:
13866 if (pp
.is_unmanaged_snaps_mode()) {
13872 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13873 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13874 if (pp
.is_pool_snaps_mode()) {
13881 case POOL_OP_CREATE_SNAP
:
13882 if (!pp
.snap_exists(m
->name
.c_str())) {
13883 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
13884 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
13885 << " seq " << pp
.get_snap_epoch() << dendl
;
13890 case POOL_OP_DELETE_SNAP
:
13892 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
13895 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
13901 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13903 uint64_t snapid
= pp
.add_unmanaged_snap(
13904 osdmap
.require_osd_release
< ceph_release_t::octopus
);
13905 encode(snapid
, reply_data
);
13910 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13911 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
13912 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
13913 if (m
->snapid
> pp
.get_snap_seq()) {
13914 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13917 pp
.remove_unmanaged_snap(
13919 osdmap
.require_osd_release
< ceph_release_t::octopus
);
13920 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
13921 // also record the new seq as purged: this avoids a discontinuity
13922 // after all of the snaps have been purged, since the seq assigned
13923 // during removal lives in the same namespace as the actual snaps.
13924 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
13929 case POOL_OP_AUID_CHANGE
:
13930 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
13939 pp
.set_snap_epoch(pending_inc
.epoch
);
13940 pending_inc
.new_pools
[m
->pool
] = pp
;
13944 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
13948 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
13950 op
->mark_osdmon_event(__func__
);
13951 int err
= prepare_new_pool(op
);
13952 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
13956 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
13959 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
13961 // If the Pool is in use by CephFS, refuse to delete it
13962 FSMap
const &pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
13963 if (pending_fsmap
.pool_in_use(pool_id
)) {
13964 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
13968 if (pool
.tier_of
>= 0) {
13969 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
13970 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
13973 if (!pool
.tiers
.empty()) {
13974 *ss
<< "pool '" << poolstr
<< "' has tiers";
13975 for(auto tier
: pool
.tiers
) {
13976 *ss
<< " " << osdmap
.get_pool_name(tier
);
13981 if (!g_conf()->mon_allow_pool_delete
) {
13982 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13986 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
13987 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
13991 *ss
<< "pool '" << poolstr
<< "' removed";
13996 * Check if it is safe to add a tier to a base pool
13999 * True if the operation should proceed, false if we should abort here
14000 * (abort doesn't necessarily mean error, could be idempotency)
14002 bool OSDMonitor::_check_become_tier(
14003 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
14004 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14008 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
14009 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14011 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14012 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
14013 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
14018 if (base_pool
->tiers
.count(tier_pool_id
)) {
14019 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
14021 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
14022 << base_pool_name
<< "'";
14026 if (base_pool
->is_tier()) {
14027 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
14028 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
14029 << "multiple tiers are not yet supported.";
14034 if (tier_pool
->has_tiers()) {
14035 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
14036 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
14037 it
!= tier_pool
->tiers
.end(); ++it
)
14038 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
14039 *ss
<< " multiple tiers are not yet supported.";
14044 if (tier_pool
->is_tier()) {
14045 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
14046 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
14057 * Check if it is safe to remove a tier from this base pool
14060 * True if the operation should proceed, false if we should abort here
14061 * (abort doesn't necessarily mean error, could be idempotency)
14063 bool OSDMonitor::_check_remove_tier(
14064 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14065 const pg_pool_t
*tier_pool
,
14066 int *err
, ostream
*ss
) const
14068 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14070 // Apply CephFS-specific checks
14071 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14072 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
14073 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
14074 // If the underlying pool is erasure coded and does not allow EC
14075 // overwrites, we can't permit the removal of the replicated tier that
14076 // CephFS relies on to access it
14077 *ss
<< "pool '" << base_pool_name
<<
14078 "' does not allow EC overwrites and is in use by CephFS"
14084 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
14085 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
14086 "tier is still in use as a writeback cache. Change the cache "
14087 "mode and flush the cache before removing it";
14097 int OSDMonitor::_prepare_remove_pool(
14098 int64_t pool
, ostream
*ss
, bool no_fake
)
14100 dout(10) << __func__
<< " " << pool
<< dendl
;
14101 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
14102 int r
= _check_remove_pool(pool
, *p
, ss
);
14106 auto new_pool
= pending_inc
.new_pools
.find(pool
);
14107 if (new_pool
!= pending_inc
.new_pools
.end()) {
14108 // if there is a problem with the pending info, wait and retry
14110 const auto& p
= new_pool
->second
;
14111 int r
= _check_remove_pool(pool
, p
, ss
);
14116 if (pending_inc
.old_pools
.count(pool
)) {
14117 dout(10) << __func__
<< " " << pool
<< " already pending removal"
14122 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
14123 string old_name
= osdmap
.get_pool_name(pool
);
14124 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
14125 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
14126 << old_name
<< " -> " << new_name
<< dendl
;
14127 pending_inc
.new_pool_names
[pool
] = new_name
;
14132 pending_inc
.old_pools
.insert(pool
);
14134 // remove any pg_temp mappings for this pool
14135 for (auto p
= osdmap
.pg_temp
->begin();
14136 p
!= osdmap
.pg_temp
->end();
14138 if (p
->first
.pool() == pool
) {
14139 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
14140 << p
->first
<< dendl
;
14141 pending_inc
.new_pg_temp
[p
->first
].clear();
14144 // remove any primary_temp mappings for this pool
14145 for (auto p
= osdmap
.primary_temp
->begin();
14146 p
!= osdmap
.primary_temp
->end();
14148 if (p
->first
.pool() == pool
) {
14149 dout(10) << __func__
<< " " << pool
14150 << " removing obsolete primary_temp" << p
->first
<< dendl
;
14151 pending_inc
.new_primary_temp
[p
->first
] = -1;
14154 // remove any pg_upmap mappings for this pool
14155 for (auto& p
: osdmap
.pg_upmap
) {
14156 if (p
.first
.pool() == pool
) {
14157 dout(10) << __func__
<< " " << pool
14158 << " removing obsolete pg_upmap "
14159 << p
.first
<< dendl
;
14160 pending_inc
.old_pg_upmap
.insert(p
.first
);
14163 // remove any pending pg_upmap mappings for this pool
14165 auto it
= pending_inc
.new_pg_upmap
.begin();
14166 while (it
!= pending_inc
.new_pg_upmap
.end()) {
14167 if (it
->first
.pool() == pool
) {
14168 dout(10) << __func__
<< " " << pool
14169 << " removing pending pg_upmap "
14170 << it
->first
<< dendl
;
14171 it
= pending_inc
.new_pg_upmap
.erase(it
);
14177 // remove any pg_upmap_items mappings for this pool
14178 for (auto& p
: osdmap
.pg_upmap_items
) {
14179 if (p
.first
.pool() == pool
) {
14180 dout(10) << __func__
<< " " << pool
14181 << " removing obsolete pg_upmap_items " << p
.first
14183 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
14186 // remove any pending pg_upmap mappings for this pool
14188 auto it
= pending_inc
.new_pg_upmap_items
.begin();
14189 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
14190 if (it
->first
.pool() == pool
) {
14191 dout(10) << __func__
<< " " << pool
14192 << " removing pending pg_upmap_items "
14193 << it
->first
<< dendl
;
14194 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
14201 // remove any choose_args for this pool
14202 CrushWrapper newcrush
;
14203 _get_pending_crush(newcrush
);
14204 if (newcrush
.have_choose_args(pool
)) {
14205 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
14206 newcrush
.rm_choose_args(pool
);
14207 pending_inc
.crush
.clear();
14208 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
14213 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
14215 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
14216 if (pending_inc
.old_pools
.count(pool
)) {
14217 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
14220 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
14221 p
!= pending_inc
.new_pool_names
.end();
14223 if (p
->second
== newname
&& p
->first
!= pool
) {
14228 pending_inc
.new_pool_names
[pool
] = newname
;
14232 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
14234 op
->mark_osdmon_event(__func__
);
14235 auto m
= op
->get_req
<MPoolOp
>();
14237 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
14238 if (ret
== -EAGAIN
) {
14239 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
14243 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
14244 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
14245 pending_inc
.epoch
));
14249 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
14250 int ret
, epoch_t epoch
, bufferlist
*blp
)
14252 op
->mark_osdmon_event(__func__
);
14253 auto m
= op
->get_req
<MPoolOp
>();
14254 dout(20) << "_pool_op_reply " << ret
<< dendl
;
14255 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
14256 ret
, epoch
, get_last_committed(), blp
);
14257 mon
.send_reply(op
, reply
);
14260 void OSDMonitor::convert_pool_priorities(void)
14262 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
14263 int64_t max_prio
= 0;
14264 int64_t min_prio
= 0;
14265 for (const auto &i
: osdmap
.get_pools()) {
14266 const auto &pool
= i
.second
;
14268 if (pool
.opts
.is_set(key
)) {
14270 pool
.opts
.get(key
, &prio
);
14271 if (prio
> max_prio
)
14273 if (prio
< min_prio
)
14277 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
14278 dout(20) << __func__
<< " nothing to fix" << dendl
;
14281 // Current pool priorities exceeds new maximum
14282 for (const auto &i
: osdmap
.get_pools()) {
14283 const auto pool_id
= i
.first
;
14284 pg_pool_t pool
= i
.second
;
14287 pool
.opts
.get(key
, &prio
);
14290 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
14291 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14292 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
14293 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
14294 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14295 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
14300 pool
.opts
.unset(key
);
14302 pool
.opts
.set(key
, static_cast<int64_t>(n
));
14304 dout(10) << __func__
<< " pool " << pool_id
14305 << " recovery_priority adjusted "
14306 << prio
<< " to " << n
<< dendl
;
14307 pool
.last_change
= pending_inc
.epoch
;
14308 pending_inc
.new_pools
[pool_id
] = pool
;
14312 void OSDMonitor::try_enable_stretch_mode_pools(stringstream
& ss
, bool *okay
,
14314 set
<pg_pool_t
*>* pools
,
14315 const string
& new_crush_rule
)
14317 dout(20) << __func__
<< dendl
;
14319 int new_crush_rule_result
= osdmap
.crush
->get_rule_id(new_crush_rule
);
14320 if (new_crush_rule_result
< 0) {
14321 ss
<< "unrecognized crush rule " << new_crush_rule_result
;
14322 *errcode
= new_crush_rule_result
;
14325 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
14326 for (const auto& pooli
: osdmap
.pools
) {
14327 int64_t poolid
= pooli
.first
;
14328 const pg_pool_t
*p
= &pooli
.second
;
14329 if (!p
->is_replicated()) {
14330 ss
<< "stretched pools must be replicated; '" << osdmap
.pool_name
[poolid
] << "' is erasure-coded";
14331 *errcode
= -EINVAL
;
14334 uint8_t default_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
14335 if ((p
->get_size() != default_size
||
14336 (p
->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size
))) &&
14337 (p
->get_crush_rule() != new_rule
)) {
14338 ss
<< "we currently require stretch mode pools start out with the"
14339 " default size/min_size, which '" << osdmap
.pool_name
[poolid
] << "' does not";
14340 *errcode
= -EINVAL
;
14343 pg_pool_t
*pp
= pending_inc
.get_new_pool(poolid
, p
);
14344 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14345 // the attempt may fail and then we have these pool updates...but they won't do anything
14346 // if there is a failure, so if it's hard to change the interface, no need to bother
14353 void OSDMonitor::try_enable_stretch_mode(stringstream
& ss
, bool *okay
,
14354 int *errcode
, bool commit
,
14355 const string
& dividing_bucket
,
14356 uint32_t bucket_count
,
14357 const set
<pg_pool_t
*>& pools
,
14358 const string
& new_crush_rule
)
14360 dout(20) << __func__
<< dendl
;
14362 CrushWrapper crush
;
14363 _get_pending_crush(crush
);
14365 int retval
= crush
.get_validated_type_id(dividing_bucket
, &dividing_id
);
14366 if (retval
== -1) {
14367 ss
<< dividing_bucket
<< " is not a valid crush bucket type";
14368 *errcode
= -ENOENT
;
14369 ceph_assert(!commit
|| retval
!= -1);
14372 vector
<int> subtrees
;
14373 crush
.get_subtree_of_type(dividing_id
, &subtrees
);
14374 if (subtrees
.size() != 2) {
14375 ss
<< "there are " << subtrees
.size() << dividing_bucket
14376 << "'s in the cluster but stretch mode currently only works with 2!";
14377 *errcode
= -EINVAL
;
14378 ceph_assert(!commit
|| subtrees
.size() == 2);
14382 int new_crush_rule_result
= crush
.get_rule_id(new_crush_rule
);
14383 if (new_crush_rule_result
< 0) {
14384 ss
<< "unrecognized crush rule " << new_crush_rule
;
14385 *errcode
= new_crush_rule_result
;
14386 ceph_assert(!commit
|| (new_crush_rule_result
> 0));
14389 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
14391 int weight1
= crush
.get_item_weight(subtrees
[0]);
14392 int weight2
= crush
.get_item_weight(subtrees
[1]);
14393 if (weight1
!= weight2
) {
14394 // TODO: I'm really not sure this is a good idea?
14395 ss
<< "the 2 " << dividing_bucket
14396 << "instances in the cluster have differing weights "
14397 << weight1
<< " and " << weight2
14398 <<" but stretch mode currently requires they be the same!";
14399 *errcode
= -EINVAL
;
14400 ceph_assert(!commit
|| (weight1
== weight2
));
14403 if (bucket_count
!= 2) {
14404 ss
<< "currently we only support 2-site stretch clusters!";
14405 *errcode
= -EINVAL
;
14406 ceph_assert(!commit
|| bucket_count
== 2);
14409 // TODO: check CRUSH rules for pools so that we are appropriately divided
14411 for (auto pool
: pools
) {
14412 pool
->crush_rule
= new_rule
;
14413 pool
->peering_crush_bucket_count
= bucket_count
;
14414 pool
->peering_crush_bucket_target
= bucket_count
;
14415 pool
->peering_crush_bucket_barrier
= dividing_id
;
14416 pool
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14417 pool
->size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
14418 pool
->min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
14420 pending_inc
.change_stretch_mode
= true;
14421 pending_inc
.stretch_mode_enabled
= true;
14422 pending_inc
.new_stretch_bucket_count
= bucket_count
;
14423 pending_inc
.new_degraded_stretch_mode
= 0;
14424 pending_inc
.new_stretch_mode_bucket
= dividing_id
;
14430 bool OSDMonitor::check_for_dead_crush_zones(const map
<string
,set
<string
>>& dead_buckets
,
14431 set
<int> *really_down_buckets
,
14432 set
<string
> *really_down_mons
)
14434 dout(20) << __func__
<< " with dead mon zones " << dead_buckets
<< dendl
;
14435 ceph_assert(is_readable());
14436 if (dead_buckets
.empty()) return false;
14437 set
<int> down_cache
;
14438 bool really_down
= false;
14439 for (auto dbi
: dead_buckets
) {
14440 const string
& bucket_name
= dbi
.first
;
14441 ceph_assert(osdmap
.crush
->name_exists(bucket_name
));
14442 int bucket_id
= osdmap
.crush
->get_item_id(bucket_name
);
14443 dout(20) << "Checking " << bucket_name
<< " id " << bucket_id
14444 << " to see if OSDs are also down" << dendl
;
14445 bool subtree_down
= osdmap
.subtree_is_down(bucket_id
, &down_cache
);
14446 if (subtree_down
) {
14447 dout(20) << "subtree is down!" << dendl
;
14448 really_down
= true;
14449 really_down_buckets
->insert(bucket_id
);
14450 really_down_mons
->insert(dbi
.second
.begin(), dbi
.second
.end());
14453 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14454 << " and mons " << *really_down_mons
<< " are really down" << dendl
;
14455 return really_down
;
14458 void OSDMonitor::trigger_degraded_stretch_mode(const set
<int>& dead_buckets
,
14459 const set
<string
>& live_zones
)
14461 dout(20) << __func__
<< dendl
;
14462 stretch_recovery_triggered
.set_from_double(0); // reset this; we can't go clean now!
14463 // update the general OSDMap changes
14464 pending_inc
.change_stretch_mode
= true;
14465 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14466 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14467 int new_site_count
= osdmap
.stretch_bucket_count
- dead_buckets
.size();
14468 ceph_assert(new_site_count
== 1); // stretch count 2!
14469 pending_inc
.new_degraded_stretch_mode
= new_site_count
;
14470 pending_inc
.new_recovering_stretch_mode
= 0;
14471 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14473 // and then apply them to all the pg_pool_ts
14474 ceph_assert(live_zones
.size() == 1); // only support 2 zones now
14475 const string
& remaining_site_name
= *(live_zones
.begin());
14476 ceph_assert(osdmap
.crush
->name_exists(remaining_site_name
));
14477 int remaining_site
= osdmap
.crush
->get_item_id(remaining_site_name
);
14478 for (auto pgi
: osdmap
.pools
) {
14479 if (pgi
.second
.peering_crush_bucket_count
) {
14480 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14481 newp
.peering_crush_bucket_count
= new_site_count
;
14482 newp
.peering_crush_mandatory_member
= remaining_site
;
14483 newp
.min_size
= pgi
.second
.min_size
/ 2; // only support 2 zones now
14484 newp
.last_force_op_resend
= pending_inc
.epoch
;
14490 void OSDMonitor::trigger_recovery_stretch_mode()
14492 dout(20) << __func__
<< dendl
;
14493 stretch_recovery_triggered
.set_from_double(0); // reset this so we don't go full-active prematurely
14494 pending_inc
.change_stretch_mode
= true;
14495 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14496 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14497 pending_inc
.new_degraded_stretch_mode
= osdmap
.degraded_stretch_mode
;
14498 pending_inc
.new_recovering_stretch_mode
= 1;
14499 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14501 for (auto pgi
: osdmap
.pools
) {
14502 if (pgi
.second
.peering_crush_bucket_count
) {
14503 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14504 newp
.last_force_op_resend
= pending_inc
.epoch
;
14510 void OSDMonitor::notify_new_pg_digest()
14512 dout(20) << __func__
<< dendl
;
14513 if (!stretch_recovery_triggered
.is_zero()) {
14514 try_end_recovery_stretch_mode(false);
14518 struct CMonExitRecovery
: public Context
{
14521 CMonExitRecovery(OSDMonitor
*mon
, bool f
) : m(mon
), force(f
) {}
14522 void finish(int r
) {
14523 m
->try_end_recovery_stretch_mode(force
);
14527 void OSDMonitor::try_end_recovery_stretch_mode(bool force
)
14529 dout(20) << __func__
<< dendl
;
14530 if (!mon
.is_leader()) return;
14531 if (!mon
.is_degraded_stretch_mode()) return;
14532 if (!mon
.is_recovering_stretch_mode()) return;
14533 if (!is_readable()) {
14534 wait_for_readable_ctx(new CMonExitRecovery(this, force
));
14538 if (osdmap
.recovering_stretch_mode
&&
14539 ((!stretch_recovery_triggered
.is_zero() &&
14540 ceph_clock_now() - g_conf().get_val
<double>("mon_stretch_recovery_min_wait") >
14541 stretch_recovery_triggered
) ||
14543 if (!mon
.mgrstatmon()->is_readable()) {
14544 mon
.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force
));
14547 const PGMapDigest
& pgd
= mon
.mgrstatmon()->get_digest();
14548 double misplaced
, degraded
, inactive
, unknown
;
14549 pgd
.get_recovery_stats(&misplaced
, °raded
, &inactive
, &unknown
);
14550 if (force
|| (degraded
== 0.0 && inactive
== 0.0 && unknown
== 0.0)) {
14551 // we can exit degraded stretch mode!
14552 mon
.trigger_healthy_stretch_mode();
14557 void OSDMonitor::trigger_healthy_stretch_mode()
14559 ceph_assert(is_writeable());
14560 stretch_recovery_triggered
.set_from_double(0);
14561 pending_inc
.change_stretch_mode
= true;
14562 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14563 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14564 pending_inc
.new_degraded_stretch_mode
= 0; // turn off degraded mode...
14565 pending_inc
.new_recovering_stretch_mode
= 0; //...and recovering mode!
14566 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14567 for (auto pgi
: osdmap
.pools
) {
14568 if (pgi
.second
.peering_crush_bucket_count
) {
14569 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14570 newp
.peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
14571 newp
.peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14572 newp
.min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
14573 newp
.last_force_op_resend
= pending_inc
.epoch
;