1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Loic Dachary <loic@dachary.org>
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
70 #include "common/config.h"
71 #include "common/errno.h"
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
90 #include "json_spirit/json_spirit_reader.h"
92 #include <boost/algorithm/string/predicate.hpp>
94 #define dout_subsys ceph_subsys_mon
// MonitorDBStore key prefixes under which OSDMonitor keeps its auxiliary
// state: pending pg-create bookkeeping, per-OSD metadata, and snapshot
// (removed/purged snap) records.  See get_store_prefixes() below.
95 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
96 static const string OSD_METADATA_PREFIX("osd_metadata");
97 static const string OSD_SNAP_PREFIX("osd_snap");
101 OSD snapshot metadata
102 ---------------------
104 -- starting with mimic, removed in octopus --
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
113 -- starting with mimic --
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
118 - note that the {removed,purged}_snap put the last snap in they key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
124 -- starting with octopus --
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
130 using namespace TOPNSPC::common
;
// Base adapter exposing an osdmap LRU cache to the PriorityCache framework.
// Tracks requested/committed byte counts per priority level and a cache
// ratio set by the manager.  Subclasses (IncCache/FullCache below) report
// the bytes actually used by their underlying LRU.
// NOTE(review): this extraction dropped structural lines (the switch over
// `pri`, closing braces, some returns); remaining tokens preserved as-is.
133 struct OSDMemCache : public PriorityCache::PriCache {
135 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
136 int64_t committed_bytes = 0;
137 double cache_ratio = 0;
139 OSDMemCache(OSDMonitor *m) : osdmon(m) {};
// Bytes currently held by the concrete cache; implemented by subclasses.
141 virtual uint64_t _get_used_bytes() const = 0;
// How many more bytes this cache wants at the given priority.
143 virtual int64_t request_cache_bytes(
144 PriorityCache::Priority pri, uint64_t total_cache) const {
145 int64_t assigned = get_cache_bytes(pri);
148 // All cache items are currently set to have PRI1 priority
149 case PriorityCache::Priority::PRI1:
151 int64_t request = _get_used_bytes();
// Ask only for the shortfall beyond what is already assigned.
152 return (request > assigned) ? request - assigned : 0;
160 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
161 return cache_bytes[pri];
// Sum of assigned bytes across all priority levels.
164 virtual int64_t get_cache_bytes() const {
167 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
168 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
169 total += get_cache_bytes(pri);
174 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
175 cache_bytes[pri] = bytes;
177 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
178 cache_bytes[pri] += bytes;
// Record the chunk of total_cache actually granted to this cache.
180 virtual int64_t commit_cache_size(uint64_t total_cache) {
181 committed_bytes = PriorityCache::get_chunk(
182 get_cache_bytes(), total_cache);
183 return committed_bytes;
185 virtual int64_t get_committed_size() const {
186 return committed_bytes;
188 virtual double get_cache_ratio() const {
191 virtual void set_cache_ratio(double ratio) {
// Human-readable name for logging; implemented by subclasses.
194 virtual string get_cache_name() const = 0;
197 struct IncCache
: public OSDMemCache
{
198 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
200 virtual uint64_t _get_used_bytes() const {
201 return osdmon
->inc_osd_cache
.get_bytes();
204 virtual string
get_cache_name() const {
205 return "OSDMap Inc Cache";
208 uint64_t _get_num_osdmaps() const {
209 return osdmon
->inc_osd_cache
.get_size();
213 struct FullCache
: public OSDMemCache
{
214 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
216 virtual uint64_t _get_used_bytes() const {
217 return osdmon
->full_osd_cache
.get_bytes();
220 virtual string
get_cache_name() const {
221 return "OSDMap Full Cache";
224 uint64_t _get_num_osdmaps() const {
225 return osdmon
->full_osd_cache
.get_size();
// File-scope handles to the two cache adapters; created in the OSDMonitor
// constructor and registered with the PriorityCache manager (pcm).
229 std::shared_ptr<IncCache> inc_cache;
230 std::shared_ptr<FullCache> full_cache;
// Limits enforced on pool "application" metadata (see MAX_POOL_APPLICATION*
// uses elsewhere in this file): number of applications per pool, keys per
// application, and max key/value length.
232 const uint32_t MAX_POOL_APPLICATIONS = 4;
233 const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
234 const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
236 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
239 auto& match
= grant
.match
;
240 if (match
.is_match_all()) {
242 } else if (pool_name
!= nullptr &&
243 !match
.pool_namespace
.pool_name
.empty() &&
244 match
.pool_namespace
.pool_name
== *pool_name
) {
// Decide whether the named entity may perform an unmanaged-snapshot pool op.
// Permitted if (a) its mon caps allow the "osd pool op unmanaged-snap"
// command (restricted to the pool when pool_name is given), or (b) its OSD
// caps grant write access to all pools / the given pool (via is_osd_writable).
// NOTE(review): extraction dropped many lines here (returns, try block,
// closing braces); remaining tokens preserved as-is.
251 bool is_unmanaged_snap_op_permitted(CephContext* cct,
252 const KeyServer& key_server,
253 const EntityName& entity_name,
254 const MonCap& mon_caps,
255 const entity_addr_t& peer_socket_addr,
256 const std::string* pool_name)
258 typedef std::map<std::string, std::string> CommandArgs;
// First check the mon capability for the unmanaged-snap pool op.
260 if (mon_caps.is_capable(
261 cct, entity_name, "osd",
262 "osd pool op unmanaged-snap",
263 (pool_name == nullptr ?
264 CommandArgs{} /* pool DNE, require unrestricted cap */ :
265 CommandArgs{{"poolname", *pool_name}}),
// Fall back to the entity's OSD caps from the auth database.
271 AuthCapsInfo caps_info;
272 if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
274 dout(10) << "unable to locate OSD cap data for " << entity_name
275 << " in auth db" << dendl;
280 if (caps_info.caps.length() > 0) {
281 auto p = caps_info.caps.cbegin();
284 } catch (const buffer::error &err) {
285 derr << "corrupt OSD cap data for " << entity_name << " in auth db"
292 if (!osd_cap.parse(caps_str, nullptr)) {
293 dout(10) << "unable to parse OSD cap data for " << entity_name
294 << " in auth db" << dendl;
298 // if the entity has write permissions in one or all pools, permit
299 // usage of unmanaged-snapshots
300 if (osd_cap.allow_all()) {
// Profile grants expand to a list of concrete grants; check each.
304 for (auto& grant : osd_cap.grants) {
305 if (grant.profile.is_valid()) {
306 for (auto& profile_grant : grant.profile_grants) {
307 if (is_osd_writable(profile_grant, pool_name)) {
311 } else if (is_osd_writable(grant, pool_name)) {
319 } // anonymous namespace
321 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
323 if (epoch_by_pg
.size() <= ps
) {
324 epoch_by_pg
.resize(ps
+ 1, 0);
326 const auto old_lec
= epoch_by_pg
[ps
];
327 if (old_lec
>= last_epoch_clean
) {
331 epoch_by_pg
[ps
] = last_epoch_clean
;
332 if (last_epoch_clean
< floor
) {
333 floor
= last_epoch_clean
;
334 } else if (last_epoch_clean
> floor
) {
335 if (old_lec
== floor
) {
336 // probably should increase floor?
337 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
338 std::end(epoch_by_pg
));
342 if (ps
!= next_missing
) {
345 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
346 if (epoch_by_pg
[next_missing
] == 0) {
352 void LastEpochClean::remove_pool(uint64_t pool
)
354 report_by_pool
.erase(pool
);
357 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
359 auto& lec
= report_by_pool
[pg
.pool()];
360 return lec
.report(pg
.ps(), last_epoch_clean
);
363 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
365 auto floor
= latest
.get_epoch();
366 for (auto& pool
: latest
.get_pools()) {
367 auto reported
= report_by_pool
.find(pool
.first
);
368 if (reported
== report_by_pool
.end()) {
371 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
374 if (reported
->second
.floor
< floor
) {
375 floor
= reported
->second
.floor
;
382 class C_UpdateCreatingPGs
: public Context
{
387 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
388 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
389 void finish(int r
) override
{
391 utime_t end
= ceph_clock_now();
392 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
393 << (end
- start
) << " seconds" << dendl
;
394 osdmon
->update_creating_pgs();
395 osdmon
->check_pg_creates_subs();
401 #define dout_prefix _prefix(_dout, mon, osdmap)
402 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
403 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
404 << "(" << mon
->get_state_name()
405 << ").osd e" << osdmap
.get_epoch() << " ";
// Construct the OSD monitor service: size the inc/full osdmap LRUs from
// mon_osd_cache_size, create the PriorityCache adapters, register as a
// config observer, and compute initial cache sizes (falling back to the
// plain mon_osd_cache_size without priority-cache management on error).
// NOTE(review): the first parameter lines and some body lines were dropped
// by the extraction; remaining tokens preserved as-is.
408 OSDMonitor::OSDMonitor(
412 const string& service_name)
413 : PaxosService(mn, p, service_name),
415 inc_osd_cache(g_conf()->mon_osd_cache_size),
416 full_osd_cache(g_conf()->mon_osd_cache_size),
417 has_osdmap_manifest(false),
418 mapper(mn->cct, &mn->cpu_tp)
420 inc_cache = std::make_shared<IncCache>(this);
421 full_cache = std::make_shared<FullCache>(this);
422 cct->_conf.add_observer(this);
423 int r = _set_cache_sizes();
425 derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
426 << g_conf()->mon_osd_cache_size
427 << ") without priority cache management"
// Config keys this observer reacts to; see handle_conf_change() below.
// NOTE(review): extraction dropped at least one array entry (presumably
// "mon_memory_target") and the NULL terminator/return — confirm upstream.
432 const char **OSDMonitor::get_tracked_conf_keys() const
434 static const char* KEYS[] = {
436 "mon_memory_autotune",
437 "rocksdb_cache_size",
// Config observer callback: toggle cache autotuning when
// mon_memory_autotune changes, and resize the mon caches when
// mon_memory_target or rocksdb_cache_size change (logging on failure).
// NOTE(review): braces/error-branch lines dropped by extraction.
443 void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
444 const std::set<std::string> &changed)
446 dout(10) << __func__ << " " << changed << dendl;
448 if (changed.count("mon_memory_autotune")) {
449 _set_cache_autotuning();
451 if (changed.count("mon_memory_target") ||
452 changed.count("rocksdb_cache_size")) {
453 int r = _update_mon_cache_settings();
455 derr << __func__ << " mon_memory_target:"
456 << g_conf()->mon_memory_target
457 << " rocksdb_cache_size:"
458 << g_conf()->rocksdb_cache_size
459 << ". Unable to update cache size."
// Enable or disable priority-cache autotuning to match the
// mon_memory_autotune config: tears down the pcm-managed state when turned
// off, registers the caches with pcm when turned on (clearing the flag if
// registration fails).
// NOTE(review): teardown lines and closers dropped by extraction.
465 void OSDMonitor::_set_cache_autotuning()
467 if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
468 // Disable cache autotuning
469 std::lock_guard l(balancer_lock);
473 if (g_conf()->mon_memory_autotune && pcm == nullptr) {
474 int r = register_cache_with_pcm();
477 << " Error while registering osdmon caches with pcm."
478 << " Cache auto tuning not enabled."
480 mon_memory_autotune = false;
482 mon_memory_autotune = true;
// Re-derive the mon cache sizing from the current mon_memory_target and
// rocksdb_cache_size config values.  Validates inputs, recomputes the
// min/max/target envelope (same fragmentation-aware formula as
// register_cache_with_pcm), re-sets the cache ratios (rolling back the
// saved values on failure), then pushes the new levels into pcm and
// re-balances.
// NOTE(review): early returns, `max` declaration and closers dropped by
// the extraction; tokens preserved as-is.
487 int OSDMonitor::_update_mon_cache_settings()
489 if (g_conf()->mon_memory_target <= 0 ||
490 g_conf()->mon_memory_target < mon_memory_min ||
491 g_conf()->rocksdb_cache_size <= 0) {
495 if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
496 derr << __func__ << " not using pcm and rocksdb" << dendl;
// Remember old values so a failed ratio update can be rolled back.
500 uint64_t old_mon_memory_target = mon_memory_target;
501 uint64_t old_rocksdb_cache_size = rocksdb_cache_size;
503 // Set the new pcm memory cache sizes
504 mon_memory_target = g_conf()->mon_memory_target;
505 rocksdb_cache_size = g_conf()->rocksdb_cache_size;
507 uint64_t base = mon_memory_base;
508 double fragmentation = mon_memory_fragmentation;
509 uint64_t target = mon_memory_target;
510 uint64_t min = mon_memory_min;
// Leave headroom for expected fragmentation when computing the max.
513 uint64_t ltarget = (1.0 - fragmentation) * target;
514 if (ltarget > base + min) {
515 max = ltarget - base;
518 int r = _set_cache_ratios();
520 derr << __func__ << " Cache ratios for pcm could not be set."
521 << " Review the kv (rocksdb) and mon_memory_target sizes."
523 mon_memory_target = old_mon_memory_target;
524 rocksdb_cache_size = old_rocksdb_cache_size;
528 if (mon_memory_autotune && pcm != nullptr) {
529 std::lock_guard l(balancer_lock);
530 // set pcm cache levels
531 pcm->set_target_memory(target);
532 pcm->set_min_memory(min);
533 pcm->set_max_memory(max);
534 // tune memory based on new values
537 _set_new_cache_sizes();
538 dout(1) << __func__ << " Updated mon cache setting."
539 << " target: " << target
// Initialize the cache sizing knobs from config when autotuning is enabled:
// reads the osd_memory_* base/fragmentation values and the mon memory
// target/min, validates them, and seeds both osdmap LRUs with the minimum
// size before pcm takes over.
// NOTE(review): error return and closing lines dropped by extraction.
547 int OSDMonitor::_set_cache_sizes()
549 if (g_conf()->mon_memory_autotune) {
550 // set the new osdmon cache targets to be managed by pcm
551 mon_osd_cache_size = g_conf()->mon_osd_cache_size;
552 rocksdb_cache_size = g_conf()->rocksdb_cache_size;
553 mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
554 mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
555 mon_memory_target = g_conf()->mon_memory_target;
556 mon_memory_min = g_conf()->mon_osd_cache_size_min;
557 if (mon_memory_target <= 0 || mon_memory_min <= 0) {
558 derr << __func__ << " mon_memory_target:" << mon_memory_target
559 << " mon_memory_min:" << mon_memory_min
560 << ". Invalid size option(s) provided."
564 // Set the initial inc and full LRU cache sizes
565 inc_osd_cache.set_bytes(mon_memory_min);
566 full_osd_cache.set_bytes(mon_memory_min);
567 mon_memory_autotune = g_conf()->mon_memory_autotune;
572 bool OSDMonitor::_have_pending_crush()
574 return pending_inc
.crush
.length() > 0;
577 CrushWrapper
&OSDMonitor::_get_stable_crush()
579 return *osdmap
.crush
;
582 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
585 if (pending_inc
.crush
.length())
586 bl
= pending_inc
.crush
;
588 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
590 auto p
= bl
.cbegin();
// Build the very first osdmap for a brand-new cluster: start from any
// mkfs-provided osdmap (or a simple generated one), set default flags and
// full/backfillfull/nearfull ratios (accepting percent-style values > 1.0),
// pick the required OSD release (honoring the mon_debug_no_require_*
// overrides), set the minimum compat client, and encode the result into
// the pending incremental as a full map with its CRC.
// NOTE(review): many structural lines (braces, else branches, flag
// assignment head at original line ~615) dropped by extraction.
594 void OSDMonitor::create_initial()
596 dout(10) << "create_initial for " << mon->monmap->fsid << dendl;
// Use an osdmap stashed at mkfs time if one exists.
601 mon->store->get("mkfs", "osdmap", bl);
605 newmap.set_fsid(mon->monmap->fsid);
607 newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
610 newmap.created = newmap.modified = ceph_clock_now();
612 // new clusters should sort bitwise by default.
613 newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
616 CEPH_OSDMAP_RECOVERY_DELETES |
617 CEPH_OSDMAP_PURGED_SNAPDIRS |
618 CEPH_OSDMAP_PGLOG_HARDLIMIT;
// Ratios configured as percentages (value > 1.0) are normalized to [0,1].
619 newmap.full_ratio = g_conf()->mon_osd_full_ratio;
620 if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
621 newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
622 if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
623 newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
624 if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
626 // new cluster should require latest by default
627 if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
628 if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
629 derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
630 newmap.require_osd_release = ceph_release_t::mimic;
632 derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
633 newmap.require_osd_release = ceph_release_t::nautilus;
636 newmap.require_osd_release = ceph_release_t::octopus;
637 ceph_release_t r = ceph_release_from_name(
638 g_conf()->mon_osd_initial_require_min_compat_client);
640 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
642 newmap.require_min_compat_client = r;
645 // encode into pending incremental
646 uint64_t features = newmap.get_encoding_features();
647 newmap.encode(pending_inc.fullmap,
648 features | CEPH_FEATURE_RESERVED);
649 pending_inc.full_crc = newmap.get_crc();
650 dout(20) << " full crc " << pending_inc.full_crc << dendl;
653 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
655 s
.insert(service_name
);
656 s
.insert(OSD_PG_CREATING_PREFIX
);
657 s
.insert(OSD_METADATA_PREFIX
);
658 s
.insert(OSD_SNAP_PREFIX
);
// Bring the in-memory osdmap up to the last paxos-committed version:
// refresh the osdmap manifest, cancel any stale mapping job, locate/repair
// the latest stashed full map, load the creating_pgs state, then apply each
// committed incremental in order — writing back full maps (with CRC
// cross-check against the primary's canonical encoding), maintaining
// last_osd_report/osd_epochs/down_pending_out bookkeeping — and finally
// refresh subscriptions, msgr features, and (on peons) logger state.
// NOTE(review): a large number of structural lines (braces, else branches,
// early returns, several declarations) were dropped by the extraction;
// remaining tokens preserved as-is.
661 void OSDMonitor::update_from_paxos(bool *need_bootstrap)
663 // we really don't care if the version has been updated, because we may
664 // have trimmed without having increased the last committed; yet, we may
665 // need to update the in-memory manifest.
666 load_osdmap_manifest();
668 version_t version = get_last_committed();
669 if (version == osdmap.epoch)
671 ceph_assert(version > osdmap.epoch);
673 dout(15) << "update_from_paxos paxos e " << version
674 << ", my e " << osdmap.epoch << dendl;
// Abort an in-flight mapping job; it is based on a stale map.
677 if (!mapping_job->is_done()) {
678 dout(1) << __func__ << " mapping job "
679 << mapping_job.get() << " did not complete, "
680 << mapping_job->shards << " left, canceling" << dendl;
681 mapping_job->abort();
689 * We will possibly have a stashed latest that *we* wrote, and we will
690 * always be sure to have the oldest full map in the first..last range
691 * due to encode_trim_extra(), which includes the oldest full map in the trim
694 * encode_trim_extra() does not however write the full map's
695 * version to 'full_latest'. This is only done when we are building the
696 * full maps from the incremental versions. But don't panic! We make sure
697 * that the following conditions find whichever full map version is newer.
699 version_t latest_full = get_version_latest_full();
700 if (latest_full == 0 && get_first_committed() > 1)
701 latest_full = get_first_committed();
703 if (get_first_committed() > 1 &&
704 latest_full < get_first_committed()) {
705 // the monitor could be just sync'ed with its peer, and the latest_full key
706 // is not encoded in the paxos commits in encode_pending(), so we need to
707 // make sure we get it pointing to a proper version.
708 version_t lc = get_last_committed();
709 version_t fc = get_first_committed();
711 dout(10) << __func__ << " looking for valid full map in interval"
712 << " [" << fc << ", " << lc << "]" << dendl;
// Scan backwards for the newest stashed full map on disk.
715 for (version_t v = lc; v >= fc; v--) {
716 string full_key = "full_" + stringify(v);
717 if (mon->store->exists(get_service_name(), full_key)) {
718 dout(10) << __func__ << " found latest full map v " << v << dendl;
724 ceph_assert(latest_full > 0);
725 auto t(std::make_shared<MonitorDBStore::Transaction>());
726 put_version_latest_full(t, latest_full);
727 mon->store->apply_transaction(t);
728 dout(10) << __func__ << " updated the on-disk full map version to "
729 << latest_full << dendl;
// Fast-forward by decoding the newest full map when it is ahead of us.
732 if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
733 bufferlist latest_bl;
734 get_version_full(latest_full, latest_bl);
735 ceph_assert(latest_bl.length() != 0);
736 dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
738 osdmap.decode(latest_bl);
// Load persisted creating_pgs state, if present.
742 if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
743 auto p = bl.cbegin();
744 std::lock_guard<std::mutex> l(creating_pgs_lock);
745 creating_pgs.decode(p);
746 dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
747 << creating_pgs.last_scan_epoch
748 << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
750 dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
754 // walk through incrementals
755 MonitorDBStore::TransactionRef t;
757 while (version > osdmap.epoch) {
759 int err = get_version(osdmap.epoch+1, inc_bl);
760 ceph_assert(err == 0);
761 ceph_assert(inc_bl.length());
762 // set priority cache manager levels if the osdmap is
763 // being populated for the first time.
764 if (mon_memory_autotune && pcm == nullptr) {
765 int r = register_cache_with_pcm();
768 << " Error while registering osdmon caches with pcm."
769 << " Proceeding without cache auto tuning."
774 dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
776 OSDMap::Incremental inc(inc_bl);
777 err = osdmap.apply_incremental(inc);
778 ceph_assert(err == 0);
781 t.reset(new MonitorDBStore::Transaction);
783 // Write out the full map for all past epochs. Encode the full
784 // map with the same features as the incremental. If we don't
785 // know, use the quorum features. If we don't know those either,
786 // encode with all features.
787 uint64_t f = inc.encode_features;
789 f = mon->get_quorum_con_features();
793 osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
794 tx_size += full_bl.length();
796 bufferlist orig_full_bl;
797 get_version_full(osdmap.epoch, orig_full_bl);
798 if (orig_full_bl.length()) {
799 // the primary provided the full map
800 ceph_assert(inc.have_crc);
801 if (inc.full_crc != osdmap.crc) {
802 // This will happen if the mons were running mixed versions in
803 // the past or some other circumstance made the full encoded
804 // maps divergent. Reloading here will bring us back into
805 // sync with the primary for this and all future maps. OSDs
806 // will also be brought back into sync when they discover the
807 // crc mismatch and request a full map from a mon.
808 derr << __func__ << " full map CRC mismatch, resetting to canonical"
811 dout(20) << __func__ << " my (bad) full osdmap:\n";
812 JSONFormatter jf(true);
813 jf.dump_object("osdmap", osdmap);
815 *_dout << "\nhexdump:\n";
816 full_bl.hexdump(*_dout);
// Adopt the primary's canonical encoding.
820 osdmap.decode(orig_full_bl);
822 dout(20) << __func__ << " canonical full osdmap:\n";
823 JSONFormatter jf(true);
824 jf.dump_object("osdmap", osdmap);
826 *_dout << "\nhexdump:\n";
827 orig_full_bl.hexdump(*_dout);
831 ceph_assert(!inc.have_crc);
832 put_version_full(t, osdmap.epoch, full_bl);
834 put_version_latest_full(t, osdmap.epoch);
837 dout(1) << osdmap << dendl;
839 if (osdmap.epoch == 1) {
840 t->erase("mkfs", "osdmap");
// Flush the transaction periodically to keep it below the sync payload cap.
843 if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
844 mon->store->apply_transaction(t);
845 t = MonitorDBStore::TransactionRef();
848 for (const auto &osd_state : inc.new_state) {
849 if (osd_state.second & CEPH_OSD_UP) {
850 // could be marked up *or* down, but we're too lazy to check which
851 last_osd_report.erase(osd_state.first);
853 if (osd_state.second & CEPH_OSD_EXISTS) {
854 // could be created *or* destroyed, but we can safely drop it
855 osd_epochs.erase(osd_state.first);
861 mon->store->apply_transaction(t);
// Refresh the down->out pending map from the new osdmap state.
864 for (int o = 0; o < osdmap.get_max_osd(); o++) {
865 if (osdmap.is_out(o))
867 auto found = down_pending_out.find(o);
868 if (osdmap.is_down(o)) {
869 // populate down -> out map
870 if (found == down_pending_out.end()) {
871 dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
872 down_pending_out[o] = ceph_clock_now();
875 if (found != down_pending_out.end()) {
876 dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
877 down_pending_out.erase(found);
881 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
884 check_pg_creates_subs();
886 share_map_with_random_osd();
890 // make sure our feature bits reflect the latest map
891 update_msgr_features();
893 if (!mon->is_leader()) {
894 // will be called by on_active() on the leader, avoid doing so twice
// Register the mon's caches (rocksdb kv cache plus the inc/full osdmap
// caches) with a new PriorityCache::Manager: validates the configured
// sizes, computes the max from target minus base with a fragmentation
// allowance, sets the initial cache ratios, and inserts all three caches
// into the manager.
// NOTE(review): early returns, `max` declaration and closers dropped by
// the extraction.
899 int OSDMonitor::register_cache_with_pcm()
901 if (mon_memory_target <= 0 || mon_memory_min <= 0) {
902 derr << __func__ << " Invalid memory size specified for mon caches."
903 << " Caches will not be auto-tuned."
907 uint64_t base = mon_memory_base;
908 double fragmentation = mon_memory_fragmentation;
909 // For calculating total target memory, consider rocksdb cache size.
910 uint64_t target = mon_memory_target;
911 uint64_t min = mon_memory_min;
914 // Apply the same logic as in bluestore to set the max amount
915 // of memory to use for cache. Assume base memory for OSDMaps
916 // and then add in some overhead for fragmentation.
917 uint64_t ltarget = (1.0 - fragmentation) * target;
918 if (ltarget > base + min) {
919 max = ltarget - base;
922 rocksdb_binned_kv_cache = mon->store->get_priority_cache();
923 if (!rocksdb_binned_kv_cache) {
924 derr << __func__ << " not using rocksdb" << dendl;
928 int r = _set_cache_ratios();
930 derr << __func__ << " Cache ratios for pcm could not be set."
931 << " Review the kv (rocksdb) and mon_memory_target sizes."
936 pcm = std::make_shared<PriorityCache::Manager>(
937 cct, min, max, target, true);
938 pcm->insert("kv", rocksdb_binned_kv_cache, true);
939 pcm->insert("inc", inc_cache, true);
940 pcm->insert("full", full_cache, true);
941 dout(1) << __func__ << " pcm target: " << target
942 << " pcm max: " << max
943 << " pcm min: " << min
944 << " inc_osd_cache size: " << inc_osd_cache.get_size()
// Compute and apply the cache-ratio split between the rocksdb kv cache and
// the two osdmap caches: kv gets rocksdb_cache_size/mon_memory_target
// (must be < 1.0, otherwise the old ratio is restored and an error is
// returned); the remainder is split evenly between the inc and full caches.
// NOTE(review): error return and closing lines dropped by extraction.
949 int OSDMonitor::_set_cache_ratios()
951 double old_cache_kv_ratio = cache_kv_ratio;
953 // Set the cache ratios for kv(rocksdb), inc and full caches
954 cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
955 if (cache_kv_ratio >= 1.0) {
956 derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
957 << ") must be in range [0,<1.0]."
959 cache_kv_ratio = old_cache_kv_ratio;
962 rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
963 cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
964 inc_cache->set_cache_ratio(cache_inc_ratio);
965 full_cache->set_cache_ratio(cache_full_ratio);
967 dout(1) << __func__ << " kv ratio " << cache_kv_ratio
968 << " inc ratio " << cache_inc_ratio
969 << " full ratio " << cache_full_ratio
974 void OSDMonitor::start_mapping()
976 // initiate mapping job
978 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
980 mapping_job
->abort();
982 if (!osdmap
.get_pools().empty()) {
983 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
984 mapping_job
= mapping
.start_update(osdmap
, mapper
,
985 g_conf()->mon_osd_mapping_pgs_per_chunk
);
986 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
987 << " at " << fin
->start
<< dendl
;
988 mapping_job
->set_finish_event(fin
);
990 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
991 mapping_job
= nullptr;
995 void OSDMonitor::update_msgr_features()
998 types
.insert((int)entity_name_t::TYPE_OSD
);
999 types
.insert((int)entity_name_t::TYPE_CLIENT
);
1000 types
.insert((int)entity_name_t::TYPE_MDS
);
1001 types
.insert((int)entity_name_t::TYPE_MON
);
1002 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
1004 uint64_t features
= osdmap
.get_features(*q
, &mask
);
1005 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
1006 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1007 ceph::net::Policy p
= mon
->messenger
->get_policy(*q
);
1008 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1009 mon
->messenger
->set_policy(*q
, p
);
// Called when this paxos service becomes active: on the leader, log the
// osdmap to the cluster log and run the one-time pool priority conversion;
// then re-dispatch all queued failure reports.
// NOTE(review): several lines (update_logger call, dispatch of each op,
// closers) dropped by extraction.
1014 void OSDMonitor::on_active()
1018 if (mon->is_leader()) {
1019 mon->clog->debug() << "osdmap " << osdmap;
1020 if (!priority_convert) {
1021 // Only do this once at start-up
1022 convert_pool_priorities();
1023 priority_convert = true;
// Re-dispatch failure reports that were parked while inactive.
1026 list<MonOpRequestRef> ls;
1027 take_all_failures(ls);
1028 while (!ls.empty()) {
1029 MonOpRequestRef op = ls.front();
1030 op->mark_osdmon_event(__func__);
1038 void OSDMonitor::on_restart()
1040 last_osd_report
.clear();
1043 void OSDMonitor::on_shutdown()
1045 dout(10) << __func__
<< dendl
;
1047 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1049 mapping_job
->abort();
1052 // discard failure info, waiters
1053 list
<MonOpRequestRef
> ls
;
1054 take_all_failures(ls
);
1058 void OSDMonitor::update_logger()
1060 dout(10) << "update_logger" << dendl
;
1062 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1063 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1064 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1065 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
// Start a fresh pending incremental for epoch+1: reset the pending
// metadata/snap buffers, backstop any unset full/backfillfull/nearfull
// ratios from config (normalizing percent-style values), and — if the
// committed CRUSH map still uses legacy ruleset IDs — rewrite every pool's
// ruleset reference to a concrete rule id, renumber all rules, and stage
// the re-encoded CRUSH map in the pending incremental.
// NOTE(review): braces and a few lines dropped by extraction; tokens
// preserved as-is.
1068 void OSDMonitor::create_pending()
1070 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1071 pending_inc.fsid = mon->monmap->fsid;
1072 pending_metadata.clear();
1073 pending_metadata_rm.clear();
1074 pending_pseudo_purged_snaps.clear();
1076 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1078 // safety checks (this shouldn't really happen)
1080 if (osdmap.backfillfull_ratio <= 0) {
1081 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1082 if (pending_inc.new_backfillfull_ratio > 1.0)
1083 pending_inc.new_backfillfull_ratio /= 100;
1084 dout(1) << __func__ << " setting backfillfull_ratio = "
1085 << pending_inc.new_backfillfull_ratio << dendl;
1087 if (osdmap.full_ratio <= 0) {
1088 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1089 if (pending_inc.new_full_ratio > 1.0)
1090 pending_inc.new_full_ratio /= 100;
1091 dout(1) << __func__ << " setting full_ratio = "
1092 << pending_inc.new_full_ratio << dendl;
1094 if (osdmap.nearfull_ratio <= 0) {
1095 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1096 if (pending_inc.new_nearfull_ratio > 1.0)
1097 pending_inc.new_nearfull_ratio /= 100;
1098 dout(1) << __func__ << " setting nearfull_ratio = "
1099 << pending_inc.new_nearfull_ratio << dendl;
1103 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1105 if (osdmap.crush->has_legacy_rule_ids()) {
1106 CrushWrapper newcrush;
1107 _get_pending_crush(newcrush);
1109 // First, for all pools, work out which rule they really used
1110 // by resolving ruleset to rule.
1111 for (const auto &i : osdmap.get_pools()) {
1112 const auto pool_id = i.first;
1113 const auto &pool = i.second;
1114 int new_rule_id = newcrush.find_rule(pool.crush_rule,
1115 pool.type, pool.size);
1117 dout(1) << __func__ << " rewriting pool "
1118 << osdmap.get_pool_name(pool_id) << " crush ruleset "
1119 << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
1120 if (pending_inc.new_pools.count(pool_id) == 0) {
1121 pending_inc.new_pools[pool_id] = pool;
1123 pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
1126 // Now, go ahead and renumber all the rules so that their
1127 // rule_id field corresponds to their position in the array
1128 auto old_to_new = newcrush.renumber_rules();
1129 dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
1130 for (const auto &i : old_to_new) {
1131 dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
1133 pending_inc.crush.clear();
1134 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
1139 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1140 const OSDMap
& nextmap
)
1142 dout(10) << __func__
<< dendl
;
1143 creating_pgs_t pending_creatings
;
1145 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1146 pending_creatings
= creating_pgs
;
1148 // check for new or old pools
1149 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1150 unsigned queued
= 0;
1151 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1154 &pending_creatings
);
1155 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1158 &pending_creatings
);
1159 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1160 for (auto deleted_pool
: inc
.old_pools
) {
1161 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1162 dout(10) << __func__
<< " " << removed
1163 << " pg removed because containing pool deleted: "
1164 << deleted_pool
<< dendl
;
1165 last_epoch_clean
.remove_pool(deleted_pool
);
1167 // pgmon updates its creating_pgs in check_osd_map() which is called by
1168 // on_active() and check_osd_map() could be delayed if lease expires, so its
1169 // creating_pgs could be stale in comparison with the one of osdmon. let's
1170 // trim them here. otherwise, they will be added back after being erased.
1171 unsigned removed
= 0;
1172 for (auto& pg
: pending_created_pgs
) {
1173 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1174 pending_creatings
.created_pools
.insert(pg
.pool());
1175 removed
+= pending_creatings
.pgs
.erase(pg
);
1177 pending_created_pgs
.clear();
1178 dout(10) << __func__
<< " " << removed
1179 << " pgs removed because they're created" << dendl
;
1180 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1183 // filter out any pgs that shouldn't exist.
1185 auto i
= pending_creatings
.pgs
.begin();
1186 while (i
!= pending_creatings
.pgs
.end()) {
1187 if (!nextmap
.pg_exists(i
->first
)) {
1188 dout(10) << __func__
<< " removing pg " << i
->first
1189 << " which should not exist" << dendl
;
1190 i
= pending_creatings
.pgs
.erase(i
);
1198 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1199 const auto total
= pending_creatings
.pgs
.size();
1200 while (pending_creatings
.pgs
.size() < max
&&
1201 !pending_creatings
.queue
.empty()) {
1202 auto p
= pending_creatings
.queue
.begin();
1203 int64_t poolid
= p
->first
;
1204 dout(10) << __func__
<< " pool " << poolid
1205 << " created " << p
->second
.created
1206 << " modified " << p
->second
.modified
1207 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1209 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1210 p
->second
.end
- p
->second
.start
);
1211 ps_t first
= p
->second
.start
;
1212 ps_t end
= first
+ n
;
1213 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1214 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1215 // NOTE: use the *current* epoch as the PG creation epoch so that the
1216 // OSD does not have to generate a long set of PastIntervals.
1217 pending_creatings
.pgs
.emplace(
1219 creating_pgs_t::pg_create_info(inc
.epoch
,
1220 p
->second
.modified
));
1221 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1223 p
->second
.start
= end
;
1224 if (p
->second
.done()) {
1225 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1226 pending_creatings
.queue
.erase(p
);
1228 dout(10) << __func__
<< " pool " << poolid
1229 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1233 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1234 << " pools" << dendl
;
1236 if (mon
->monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1237 // walk creating pgs' history and past_intervals forward
1238 for (auto& i
: pending_creatings
.pgs
) {
1239 // this mirrors PG::start_peering_interval()
1240 pg_t pgid
= i
.first
;
1242 // this is a bit imprecise, but sufficient?
1243 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1244 const pg_pool_t
*pi
;
1245 bool operator()(const set
<pg_shard_t
> &have
) const {
1246 return have
.size() >= pi
->min_size
;
1248 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1249 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1251 vector
<int> up
, acting
;
1252 int up_primary
, acting_primary
;
1253 nextmap
.pg_to_up_acting_osds(
1254 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1255 if (i
.second
.history
.epoch_created
== 0) {
1256 // new pg entry, set it up
1258 i
.second
.acting
= acting
;
1259 i
.second
.up_primary
= up_primary
;
1260 i
.second
.acting_primary
= acting_primary
;
1261 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1262 i
.second
.create_stamp
);
1263 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1264 << " up " << i
.second
.up
1265 << " p " << i
.second
.up_primary
1266 << " acting " << i
.second
.acting
1267 << " p " << i
.second
.acting_primary
1268 << " history " << i
.second
.history
1269 << " past_intervals " << i
.second
.past_intervals
1272 std::stringstream debug
;
1273 if (PastIntervals::check_new_interval(
1274 i
.second
.acting_primary
, acting_primary
,
1275 i
.second
.acting
, acting
,
1276 i
.second
.up_primary
, up_primary
,
1278 i
.second
.history
.same_interval_since
,
1279 i
.second
.history
.last_epoch_clean
,
1284 &i
.second
.past_intervals
,
1286 epoch_t e
= inc
.epoch
;
1287 i
.second
.history
.same_interval_since
= e
;
1288 if (i
.second
.up
!= up
) {
1289 i
.second
.history
.same_up_since
= e
;
1291 if (i
.second
.acting_primary
!= acting_primary
) {
1292 i
.second
.history
.same_primary_since
= e
;
1295 osdmap
.get_pg_num(pgid
.pool()),
1296 nextmap
.get_pg_num(pgid
.pool()),
1298 i
.second
.history
.last_epoch_split
= e
;
1300 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1301 << " up " << i
.second
.up
<< " -> " << up
1302 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1303 << " acting " << i
.second
.acting
<< " -> " << acting
1304 << " p " << i
.second
.acting_primary
<< " -> "
1306 << " history " << i
.second
.history
1307 << " past_intervals " << i
.second
.past_intervals
1309 dout(20) << " debug: " << debug
.str() << dendl
;
1311 i
.second
.acting
= acting
;
1312 i
.second
.up_primary
= up_primary
;
1313 i
.second
.acting_primary
= acting_primary
;
1318 dout(10) << __func__
1319 << " " << (pending_creatings
.pgs
.size() - total
)
1320 << "/" << pending_creatings
.pgs
.size()
1321 << " pgs added from queued pools" << dendl
;
1322 return pending_creatings
;
1325 void OSDMonitor::maybe_prime_pg_temp()
1328 if (pending_inc
.crush
.length()) {
1329 dout(10) << __func__
<< " new crush map, all" << dendl
;
1333 if (!pending_inc
.new_up_client
.empty()) {
1334 dout(10) << __func__
<< " new up osds, all" << dendl
;
1338 // check for interesting OSDs
1340 for (auto p
= pending_inc
.new_state
.begin();
1341 !all
&& p
!= pending_inc
.new_state
.end();
1343 if ((p
->second
& CEPH_OSD_UP
) &&
1344 osdmap
.is_up(p
->first
)) {
1345 osds
.insert(p
->first
);
1348 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
1349 !all
&& p
!= pending_inc
.new_weight
.end();
1351 if (p
->second
< osdmap
.get_weight(p
->first
)) {
1353 osds
.insert(p
->first
);
1355 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1361 if (!all
&& osds
.empty())
1366 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1367 if (estimate
> mapping
.get_num_pgs() *
1368 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1369 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1370 << osds
.size() << " osds >= "
1371 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1372 << mapping
.get_num_pgs() << " pgs, all"
1376 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1377 << osds
.size() << " osds" << dendl
;
1382 next
.deepish_copy_from(osdmap
);
1383 next
.apply_incremental(pending_inc
);
1385 if (next
.get_pools().empty()) {
1386 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1388 PrimeTempJob
job(next
, this);
1389 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1390 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1391 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1393 dout(10) << __func__
<< " did not finish in "
1394 << g_conf()->mon_osd_prime_pg_temp_max_time
1395 << ", stopping" << dendl
;
1399 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1400 utime_t stop
= ceph_clock_now();
1401 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1402 const int chunk
= 1000;
1404 std::unordered_set
<pg_t
> did_pgs
;
1405 for (auto osd
: osds
) {
1406 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1407 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1408 for (auto pgid
: pgs
) {
1409 if (!did_pgs
.insert(pgid
).second
) {
1412 prime_pg_temp(next
, pgid
);
1415 if (ceph_clock_now() > stop
) {
1416 dout(10) << __func__
<< " consumed more than "
1417 << g_conf()->mon_osd_prime_pg_temp_max_time
1418 << " seconds, stopping"
1428 void OSDMonitor::prime_pg_temp(
1432 // TODO: remove this creating_pgs direct access?
1433 if (creating_pgs
.pgs
.count(pgid
)) {
1436 if (!osdmap
.pg_exists(pgid
)) {
1440 vector
<int> up
, acting
;
1441 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1443 vector
<int> next_up
, next_acting
;
1444 int next_up_primary
, next_acting_primary
;
1445 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1446 &next_acting
, &next_acting_primary
);
1447 if (acting
== next_acting
&&
1448 !(up
!= acting
&& next_up
== next_acting
))
1449 return; // no change since last epoch
1452 return; // if previously empty now we can be no worse off
1453 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1454 if (pool
&& acting
.size() < pool
->min_size
)
1455 return; // can be no worse off than before
1457 if (next_up
== next_acting
) {
1459 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1463 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1464 << " -> " << next_up
<< "/" << next_acting
1465 << ", priming " << acting
1468 std::lock_guard
l(prime_pg_temp_lock
);
1469 // do not touch a mapping if a change is pending
1470 pending_inc
.new_pg_temp
.emplace(
1472 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1477 * @note receiving a transaction in this function gives a fair amount of
1478 * freedom to the service implementation if it does need it. It shouldn't.
1480 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1482 dout(10) << "encode_pending e " << pending_inc
.epoch
1486 dout(1) << __func__
<< " osdmap full prune encoded e"
1487 << pending_inc
.epoch
<< dendl
;
1490 // finalize up pending_inc
1491 pending_inc
.modified
= ceph_clock_now();
1493 int r
= pending_inc
.propagate_snaps_to_tiers(cct
, osdmap
);
1494 ceph_assert(r
== 0);
1497 if (!mapping_job
->is_done()) {
1498 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1499 << mapping_job
.get() << " did not complete, "
1500 << mapping_job
->shards
<< " left" << dendl
;
1501 mapping_job
->abort();
1502 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1503 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1504 << mapping_job
.get() << " is prior epoch "
1505 << mapping
.get_epoch() << dendl
;
1507 if (g_conf()->mon_osd_prime_pg_temp
) {
1508 maybe_prime_pg_temp();
1511 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1512 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1515 mapping_job
.reset();
1517 // ensure we don't have blank new_state updates. these are interrpeted as
1518 // CEPH_OSD_UP (and almost certainly not what we want!).
1519 auto p
= pending_inc
.new_state
.begin();
1520 while (p
!= pending_inc
.new_state
.end()) {
1521 if (p
->second
== 0) {
1522 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1523 p
= pending_inc
.new_state
.erase(p
);
1525 if (p
->second
& CEPH_OSD_UP
) {
1526 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1531 if (!pending_inc
.new_up_client
.empty()) {
1532 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1534 for (auto& i
: pending_inc
.new_weight
) {
1535 if (i
.first
>= osdmap
.max_osd
) {
1537 // new osd is already marked in
1538 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1541 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1542 // existing osd marked in or out
1543 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1550 tmp
.deepish_copy_from(osdmap
);
1551 tmp
.apply_incremental(pending_inc
);
1553 // clean pg_temp mappings
1554 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1556 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1558 // check every upmapped pg for now
1559 // until we could reliably identify certain cases to ignore,
1560 // which is obviously the hard part TBD..
1561 vector
<pg_t
> pgs_to_check
;
1562 tmp
.get_upmap_pgs(&pgs_to_check
);
1563 if (pgs_to_check
.size() <
1564 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1565 // not enough pgs, do it inline
1566 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1568 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1569 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1574 // update creating pgs first so that we can remove the created pgid and
1575 // process the pool flag removal below in the same osdmap epoch.
1576 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1577 bufferlist creatings_bl
;
1578 uint64_t features
= CEPH_FEATURES_ALL
;
1579 if (mon
->monmap
->min_mon_release
< ceph_release_t::octopus
) {
1580 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1582 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1584 encode(pending_creatings
, creatings_bl
, features
);
1585 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1587 // remove any old (or incompat) POOL_CREATING flags
1588 for (auto& i
: tmp
.get_pools()) {
1589 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1590 // pre-nautilus OSDMaps shouldn't get this flag.
1591 if (pending_inc
.new_pools
.count(i
.first
)) {
1592 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1595 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1596 !pending_creatings
.still_creating_pool(i
.first
)) {
1597 dout(10) << __func__
<< " done creating pool " << i
.first
1598 << ", clearing CREATING flag" << dendl
;
1599 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1600 pending_inc
.new_pools
[i
.first
] = i
.second
;
1602 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1606 // collect which pools are currently affected by
1607 // the near/backfill/full osd(s),
1608 // and set per-pool near/backfill/full flag instead
1609 set
<int64_t> full_pool_ids
;
1610 set
<int64_t> backfillfull_pool_ids
;
1611 set
<int64_t> nearfull_pool_ids
;
1612 tmp
.get_full_pools(cct
,
1614 &backfillfull_pool_ids
,
1615 &nearfull_pool_ids
);
1616 if (full_pool_ids
.empty() ||
1617 backfillfull_pool_ids
.empty() ||
1618 nearfull_pool_ids
.empty()) {
1619 // normal case - no nearfull, backfillfull or full osds
1620 // try cancel any improper nearfull/backfillfull/full pool
1622 for (auto &pool
: tmp
.get_pools()) {
1623 auto p
= pool
.first
;
1624 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1625 nearfull_pool_ids
.empty()) {
1626 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1627 << "'s nearfull flag" << dendl
;
1628 if (pending_inc
.new_pools
.count(p
) == 0) {
1629 // load original pool info first!
1630 pending_inc
.new_pools
[p
] = pool
.second
;
1632 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1634 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1635 backfillfull_pool_ids
.empty()) {
1636 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1637 << "'s backfillfull flag" << dendl
;
1638 if (pending_inc
.new_pools
.count(p
) == 0) {
1639 pending_inc
.new_pools
[p
] = pool
.second
;
1641 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1643 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1644 full_pool_ids
.empty()) {
1645 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1646 // set by EQUOTA, skipping
1649 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1650 << "'s full flag" << dendl
;
1651 if (pending_inc
.new_pools
.count(p
) == 0) {
1652 pending_inc
.new_pools
[p
] = pool
.second
;
1654 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1658 if (!full_pool_ids
.empty()) {
1659 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1660 << " as full" << dendl
;
1661 for (auto &p
: full_pool_ids
) {
1662 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1665 if (pending_inc
.new_pools
.count(p
) == 0) {
1666 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1668 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1669 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1670 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1672 // cancel FLAG_FULL for pools which are no longer full too
1673 for (auto &pool
: tmp
.get_pools()) {
1674 auto p
= pool
.first
;
1675 if (full_pool_ids
.count(p
)) {
1676 // skip pools we have just marked as full above
1679 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1680 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1681 // don't touch if currently is not full
1682 // or is running out of quota (and hence considered as full)
1685 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1686 << "'s full flag" << dendl
;
1687 if (pending_inc
.new_pools
.count(p
) == 0) {
1688 pending_inc
.new_pools
[p
] = pool
.second
;
1690 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1693 if (!backfillfull_pool_ids
.empty()) {
1694 for (auto &p
: backfillfull_pool_ids
) {
1695 if (full_pool_ids
.count(p
)) {
1696 // skip pools we have already considered as full above
1699 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1700 // make sure FLAG_FULL is truly set, so we are safe not
1701 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1702 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1705 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1706 // don't bother if pool is already marked as backfillfull
1709 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1710 << "'s as backfillfull" << dendl
;
1711 if (pending_inc
.new_pools
.count(p
) == 0) {
1712 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1714 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1715 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1717 // cancel FLAG_BACKFILLFULL for pools
1718 // which are no longer backfillfull too
1719 for (auto &pool
: tmp
.get_pools()) {
1720 auto p
= pool
.first
;
1721 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1722 // skip pools we have just marked as backfillfull/full above
1725 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1726 // and don't touch if currently is not backfillfull
1729 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1730 << "'s backfillfull flag" << dendl
;
1731 if (pending_inc
.new_pools
.count(p
) == 0) {
1732 pending_inc
.new_pools
[p
] = pool
.second
;
1734 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1737 if (!nearfull_pool_ids
.empty()) {
1738 for (auto &p
: nearfull_pool_ids
) {
1739 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1742 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1743 // make sure FLAG_FULL is truly set, so we are safe not
1744 // to set a extra (redundant) FLAG_NEARFULL flag
1745 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1748 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1749 // don't bother if pool is already marked as nearfull
1752 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1753 << "'s as nearfull" << dendl
;
1754 if (pending_inc
.new_pools
.count(p
) == 0) {
1755 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1757 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1759 // cancel FLAG_NEARFULL for pools
1760 // which are no longer nearfull too
1761 for (auto &pool
: tmp
.get_pools()) {
1762 auto p
= pool
.first
;
1763 if (full_pool_ids
.count(p
) ||
1764 backfillfull_pool_ids
.count(p
) ||
1765 nearfull_pool_ids
.count(p
)) {
1766 // skip pools we have just marked as
1767 // nearfull/backfillfull/full above
1770 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1771 // and don't touch if currently is not nearfull
1774 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1775 << "'s nearfull flag" << dendl
;
1776 if (pending_inc
.new_pools
.count(p
) == 0) {
1777 pending_inc
.new_pools
[p
] = pool
.second
;
1779 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1783 // min_compat_client?
1784 if (!tmp
.require_min_compat_client
) {
1785 auto mv
= tmp
.get_min_compat_client();
1786 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1787 << "required " << mv
<< dendl
;
1788 mon
->clog
->info() << "setting require_min_compat_client to currently "
1789 << "required " << mv
;
1790 pending_inc
.new_require_min_compat_client
= mv
;
1793 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1794 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1795 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1796 // add creating flags?
1797 for (auto& i
: tmp
.get_pools()) {
1798 if (pending_creatings
.still_creating_pool(i
.first
)) {
1799 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1801 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1802 pending_inc
.new_pools
[i
.first
] = i
.second
;
1804 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1807 // adjust blacklist items to all be TYPE_ANY
1808 for (auto& i
: tmp
.blacklist
) {
1810 a
.set_type(entity_addr_t::TYPE_ANY
);
1811 pending_inc
.new_blacklist
[a
] = i
.second
;
1812 pending_inc
.old_blacklist
.push_back(i
.first
);
1816 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1817 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1818 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1820 // adjust obsoleted cache modes
1821 for (auto& [poolid
, pi
] : tmp
.pools
) {
1822 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1823 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1824 pending_inc
.new_pools
[poolid
] = pi
;
1826 dout(10) << __func__
<< " switching pool " << poolid
1827 << " cachemode from forward -> proxy" << dendl
;
1828 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1830 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1831 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1832 pending_inc
.new_pools
[poolid
] = pi
;
1834 dout(10) << __func__
<< " switching pool " << poolid
1835 << " cachemode from readforward -> readproxy" << dendl
;
1836 pending_inc
.new_pools
[poolid
].cache_mode
=
1837 pg_pool_t::CACHEMODE_READPROXY
;
1841 // clear removed_snaps for every pool
1842 for (auto& [poolid
, pi
] : tmp
.pools
) {
1843 if (pi
.removed_snaps
.empty()) {
1846 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1847 pending_inc
.new_pools
[poolid
] = pi
;
1849 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1851 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1854 // create a combined purged snap epoch key for all purged snaps
1855 // prior to this epoch, and store it in the current epoch (i.e.,
1856 // the last pre-octopus epoch, just prior to the one we're
1858 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
1859 it
->lower_bound("purged_snap_");
1860 map
<int64_t,snap_interval_set_t
> combined
;
1861 while (it
->valid()) {
1862 if (it
->key().find("purged_snap_") != 0) {
1865 string k
= it
->key();
1866 long long unsigned pool
;
1867 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1869 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1871 bufferlist v
= it
->value();
1872 auto p
= v
.cbegin();
1873 snapid_t begin
, end
;
1874 ceph::decode(begin
, p
);
1875 ceph::decode(end
, p
);
1876 combined
[pool
].insert(begin
, end
- begin
);
1880 if (!combined
.empty()) {
1881 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1883 ceph::encode(combined
, v
);
1884 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1885 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1886 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1889 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1893 // clean out the old removed_snap_ and removed_epoch keys
1894 // ('`' is ASCII '_' + 1)
1895 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1896 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1901 for (auto i
= pending_inc
.new_state
.begin();
1902 i
!= pending_inc
.new_state
.end();
1904 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1905 if (s
& CEPH_OSD_UP
)
1906 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1907 if (s
& CEPH_OSD_EXISTS
)
1908 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1910 for (auto i
= pending_inc
.new_up_client
.begin();
1911 i
!= pending_inc
.new_up_client
.end();
1913 //FIXME: insert cluster addresses too
1914 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1916 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1917 i
!= pending_inc
.new_weight
.end();
1919 if (i
->second
== CEPH_OSD_OUT
) {
1920 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1921 } else if (i
->second
== CEPH_OSD_IN
) {
1922 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1924 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1928 // features for osdmap and its incremental
1931 // encode full map and determine its crc
1934 tmp
.deepish_copy_from(osdmap
);
1935 tmp
.apply_incremental(pending_inc
);
1937 // determine appropriate features
1938 features
= tmp
.get_encoding_features();
1939 dout(10) << __func__
<< " encoding full map with "
1940 << tmp
.require_osd_release
1941 << " features " << features
<< dendl
;
1943 // the features should be a subset of the mon quorum's features!
1944 ceph_assert((features
& ~mon
->get_quorum_con_features()) == 0);
1947 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1948 pending_inc
.full_crc
= tmp
.get_crc();
1950 // include full map in the txn. note that old monitors will
1951 // overwrite this. new ones will now skip the local full map
1952 // encode and reload from this.
1953 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1957 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
1959 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1961 dout(20) << " full_crc " << tmp
.get_crc()
1962 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1964 /* put everything in the transaction */
1965 put_version(t
, pending_inc
.epoch
, bl
);
1966 put_last_committed(t
, pending_inc
.epoch
);
1969 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1970 p
!= pending_metadata
.end();
1972 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1973 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1974 p
!= pending_metadata_rm
.end();
1976 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1977 pending_metadata
.clear();
1978 pending_metadata_rm
.clear();
1981 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
1982 !pending_inc
.new_purged_snaps
.empty()) {
1983 // all snaps purged this epoch (across all pools)
1984 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
1986 encode(pending_inc
.new_purged_snaps
, v
);
1987 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1989 for (auto& i
: pending_inc
.new_purged_snaps
) {
1990 for (auto q
= i
.second
.begin();
1991 q
!= i
.second
.end();
1993 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
1998 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
1999 for (auto snap
: snaps
) {
2000 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2007 health_check_map_t next
;
2008 tmp
.check_health(cct
, &next
);
2009 encode_health(next
, t
);
2012 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2015 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2019 auto p
= bl
.cbegin();
2022 catch (buffer::error
& e
) {
2024 *err
<< "osd." << osd
<< " metadata is corrupt";
2030 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2032 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2033 if (osdmap
.is_up(osd
)) {
2034 map
<string
,string
> meta
;
2035 load_metadata(osd
, meta
, nullptr);
2036 auto p
= meta
.find(field
);
2037 if (p
== meta
.end()) {
2038 (*out
)["unknown"]++;
2040 (*out
)[p
->second
]++;
2046 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2048 map
<string
,int> by_val
;
2049 count_metadata(field
, &by_val
);
2050 f
->open_object_section(field
.c_str());
2051 for (auto& p
: by_val
) {
2052 f
->dump_int(p
.first
.c_str(), p
.second
);
2057 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2059 map
<string
, string
> metadata
;
2060 int r
= load_metadata(osd
, metadata
, nullptr);
2064 auto it
= metadata
.find("osd_objectstore");
2065 if (it
== metadata
.end())
2071 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2072 const pg_pool_t
&pool
,
2075 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2076 // since filestore osds could always join the pool later
2077 set
<int> checked_osds
;
2078 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2079 vector
<int> up
, acting
;
2080 pg_t
pgid(ps
, pool_id
);
2081 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2082 for (int osd
: up
) {
2083 if (checked_osds
.find(osd
) != checked_osds
.end())
2085 string objectstore_type
;
2086 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2087 // allow with missing metadata, e.g. due to an osd never booting yet
2088 if (r
< 0 || objectstore_type
== "bluestore") {
2089 checked_osds
.insert(osd
);
2092 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2099 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2101 map
<string
,string
> m
;
2102 if (int r
= load_metadata(osd
, m
, err
))
2104 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2105 f
->dump_string(p
->first
.c_str(), p
->second
);
2109 void OSDMonitor::print_nodes(Formatter
*f
)
2111 // group OSDs by their hosts
2112 map
<string
, list
<int> > osds
; // hostname => osd
2113 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2114 map
<string
, string
> m
;
2115 if (load_metadata(osd
, m
, NULL
)) {
2118 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2119 if (hostname
== m
.end()) {
2120 // not likely though
2123 osds
[hostname
->second
].push_back(osd
);
2126 dump_services(f
, osds
, "osd");
2129 void OSDMonitor::share_map_with_random_osd()
2131 if (osdmap
.get_num_up_osds() == 0) {
2132 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2136 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
2138 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2142 dout(10) << "committed, telling random " << s
->name
2143 << " all about it" << dendl
;
2145 // get feature of the peer
2146 // use quorum_con_features, if it's an anonymous connection.
2147 uint64_t features
= s
->con_features
? s
->con_features
:
2148 mon
->get_quorum_con_features();
2149 // whatev, they'll request more if they need it
2150 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2151 s
->con
->send_message(m
);
2152 // NOTE: do *not* record osd has up to this epoch (as we do
2153 // elsewhere) as they may still need to request older values.
2156 version_t
OSDMonitor::get_trim_to() const
2158 if (mon
->get_quorum().empty()) {
2159 dout(10) << __func__
<< ": quorum not formed" << dendl
;
2164 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2165 if (!creating_pgs
.pgs
.empty()) {
2170 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2172 << " blocking osdmap trim"
2173 " ('mon_debug_block_osdmap_trim' set to 'true')"
2179 epoch_t floor
= get_min_last_epoch_clean();
2180 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2181 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2182 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2183 floor
= g_conf()->mon_osd_force_trim_to
;
2184 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2186 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2187 if (floor
+ min
> get_last_committed()) {
2188 if (min
< get_last_committed())
2189 floor
= get_last_committed() - min
;
2193 if (floor
> get_first_committed())
2199 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2201 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2202 // also scan osd epochs
2203 // don't trim past the oldest reported osd epoch
2204 for (auto& osd_epoch
: osd_epochs
) {
2205 if (osd_epoch
.second
< floor
) {
2206 floor
= osd_epoch
.second
;
2212 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2215 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2217 get_version_full(first
, bl
);
2218 put_version_full(tx
, first
, bl
);
2220 if (has_osdmap_manifest
&&
2221 first
> osdmap_manifest
.get_first_pinned()) {
2222 _prune_update_trimmed(tx
, first
);
2227 /* full osdmap prune
2229 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2232 void OSDMonitor::load_osdmap_manifest()
2234 bool store_has_manifest
=
2235 mon
->store
->exists(get_service_name(), "osdmap_manifest");
2237 if (!store_has_manifest
) {
2238 if (!has_osdmap_manifest
) {
2242 dout(20) << __func__
2243 << " dropping osdmap manifest from memory." << dendl
;
2244 osdmap_manifest
= osdmap_manifest_t();
2245 has_osdmap_manifest
= false;
2249 dout(20) << __func__
2250 << " osdmap manifest detected in store; reload." << dendl
;
2252 bufferlist manifest_bl
;
2253 int r
= get_value("osdmap_manifest", manifest_bl
);
2255 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2256 ceph_abort_msg("error reading manifest");
2258 osdmap_manifest
.decode(manifest_bl
);
2259 has_osdmap_manifest
= true;
2261 dout(10) << __func__
<< " store osdmap manifest pinned ("
2262 << osdmap_manifest
.get_first_pinned()
2264 << osdmap_manifest
.get_last_pinned()
2269 bool OSDMonitor::should_prune() const
2271 version_t first
= get_first_committed();
2272 version_t last
= get_last_committed();
2273 version_t min_osdmap_epochs
=
2274 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2275 version_t prune_min
=
2276 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2277 version_t prune_interval
=
2278 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2279 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2280 version_t last_to_pin
= last
- min_osdmap_epochs
;
2282 // Make it or break it constraints.
2284 // If any of these conditions fails, we will not prune, regardless of
2285 // whether we have an on-disk manifest with an on-going pruning state.
2287 if ((last
- first
) <= min_osdmap_epochs
) {
2288 // between the first and last committed epochs, we don't have
2289 // enough epochs to trim, much less to prune.
2290 dout(10) << __func__
2291 << " currently holding only " << (last
- first
)
2292 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2293 << "); do not prune."
2297 } else if ((last_to_pin
- first
) < prune_min
) {
2298 // between the first committed epoch and the last epoch we would prune,
2299 // we simply don't have enough versions over the minimum to prune maps.
2300 dout(10) << __func__
2301 << " could only prune " << (last_to_pin
- first
)
2302 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2303 " is less than the required minimum (" << prune_min
<< ")"
2307 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2308 dout(10) << __func__
2309 << " we have pruned as far as we can; do not prune."
2313 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2314 dout(10) << __func__
2315 << " not enough epochs to form an interval (last pinned: "
2316 << last_pinned
<< ", last to pin: "
2317 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2322 dout(15) << __func__
2323 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2324 << " lc (" << first
<< ".." << last
<< ")"
2329 void OSDMonitor::_prune_update_trimmed(
2330 MonitorDBStore::TransactionRef tx
,
2333 dout(10) << __func__
2334 << " first " << first
2335 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2336 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2339 osdmap_manifest_t manifest
= osdmap_manifest
;
2341 if (!manifest
.is_pinned(first
)) {
2342 manifest
.pin(first
);
2345 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2346 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2347 manifest
.pinned
.erase(p
, p_end
);
2348 ceph_assert(manifest
.get_first_pinned() == first
);
2350 if (manifest
.get_last_pinned() == first
+1 ||
2351 manifest
.pinned
.size() == 1) {
2352 // we reached the end of the line, as pinned maps go; clean up our
2353 // manifest, and let `should_prune()` decide whether we should prune
2355 tx
->erase(get_service_name(), "osdmap_manifest");
2360 manifest
.encode(bl
);
2361 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2364 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2366 dout(1) << __func__
<< dendl
;
2368 version_t pin_first
;
2370 // verify constrainsts on stable in-memory state
2371 if (!has_osdmap_manifest
) {
2372 // we must have never pruned, OR if we pruned the state must no longer
2373 // be relevant (i.e., the state must have been removed alongside with
2374 // the trim that *must* have removed past the last pinned map in a
2376 ceph_assert(osdmap_manifest
.pinned
.empty());
2377 ceph_assert(!mon
->store
->exists(get_service_name(), "osdmap_manifest"));
2378 pin_first
= get_first_committed();
2381 // we must have pruned in the past AND its state is still relevant
2382 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2383 // and thus we still hold a manifest in the store).
2384 ceph_assert(!osdmap_manifest
.pinned
.empty());
2385 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2386 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2388 dout(10) << __func__
2389 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2390 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2393 pin_first
= osdmap_manifest
.get_last_pinned();
2396 manifest
.pin(pin_first
);
2399 bool OSDMonitor::_prune_sanitize_options() const
2401 uint64_t prune_interval
=
2402 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2403 uint64_t prune_min
=
2404 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2406 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2410 if (prune_interval
== 0) {
2412 << " prune is enabled BUT prune interval is zero; abort."
2415 } else if (prune_interval
== 1) {
2417 << " prune interval is equal to one, which essentially means"
2418 " no pruning; abort."
2422 if (prune_min
== 0) {
2424 << " prune is enabled BUT prune min is zero; abort."
2428 if (prune_interval
> prune_min
) {
2430 << " impossible to ascertain proper prune interval because"
2431 << " it is greater than the minimum prune epochs"
2432 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2437 if (txsize
< prune_interval
- 1) {
2439 << "'mon_osdmap_full_prune_txsize' (" << txsize
2440 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2441 << "); abort." << dendl
;
2447 bool OSDMonitor::is_prune_enabled() const {
2448 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2451 bool OSDMonitor::is_prune_supported() const {
2452 return mon
->get_required_mon_features().contains_any(
2453 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2458 * @returns true if has side-effects; false otherwise.
2460 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2462 bool enabled
= is_prune_enabled();
2464 dout(1) << __func__
<< " osdmap full prune "
2465 << ( enabled
? "enabled" : "disabled")
2468 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2472 // we are beyond the minimum prune versions, we need to remove maps because
2473 // otherwise the store will grow unbounded and we may end up having issues
2474 // with available disk space or store hangs.
2476 // we will not pin all versions. We will leave a buffer number of versions.
2477 // this allows us the monitor to trim maps without caring too much about
2478 // pinned maps, and then allow us to use another ceph-mon without these
2479 // capabilities, without having to repair the store.
2481 osdmap_manifest_t manifest
= osdmap_manifest
;
2483 version_t first
= get_first_committed();
2484 version_t last
= get_last_committed();
2486 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2487 version_t last_pinned
= manifest
.get_last_pinned();
2488 uint64_t prune_interval
=
2489 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2491 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2493 prune_init(manifest
);
2495 // we need to get rid of some osdmaps
2498 << " lc (" << first
<< " .. " << last
<< ")"
2499 << " last_pinned " << last_pinned
2500 << " interval " << prune_interval
2501 << " last_to_pin " << last_to_pin
2504 // We will be erasing maps as we go.
2506 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2508 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2509 // we stop pruning. We could prune the maps between `next_to_pin` and
2510 // `last_to_pin`, but by not doing it we end up with neater pruned
2511 // intervals, aligned with `prune_interval`. Besides, this should not be a
2512 // problem as long as `prune_interval` is set to a sane value, instead of
2513 // hundreds or thousands of maps.
2515 auto map_exists
= [this](version_t v
) {
2516 string k
= mon
->store
->combine_strings("full", v
);
2517 return mon
->store
->exists(get_service_name(), k
);
2520 // 'interval' represents the number of maps from the last pinned
2521 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2522 // version 11 next; all intermediate versions will be removed.
2524 // 'txsize' represents the maximum number of versions we'll be removing in
2525 // this iteration. If 'txsize' is large enough to perform multiple passes
2526 // pinning and removing maps, we will do so; if not, we'll do at least one
2527 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2528 // ensure that we never go *over* the maximum.
2530 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2531 uint64_t removal_interval
= prune_interval
- 1;
2533 if (txsize
< removal_interval
) {
2535 << " setting txsize to removal interval size ("
2536 << removal_interval
<< " versions"
2538 txsize
= removal_interval
;
2540 ceph_assert(removal_interval
> 0);
2542 uint64_t num_pruned
= 0;
2543 while (num_pruned
+ removal_interval
<= txsize
) {
2544 last_pinned
= manifest
.get_last_pinned();
2546 if (last_pinned
+ prune_interval
> last_to_pin
) {
2549 ceph_assert(last_pinned
< last_to_pin
);
2551 version_t next_pinned
= last_pinned
+ prune_interval
;
2552 ceph_assert(next_pinned
<= last_to_pin
);
2553 manifest
.pin(next_pinned
);
2555 dout(20) << __func__
2556 << " last_pinned " << last_pinned
2557 << " next_pinned " << next_pinned
2558 << " num_pruned " << num_pruned
2559 << " removal interval (" << (last_pinned
+1)
2560 << ".." << (next_pinned
-1) << ")"
2561 << " txsize " << txsize
<< dendl
;
2563 ceph_assert(map_exists(last_pinned
));
2564 ceph_assert(map_exists(next_pinned
));
2566 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2567 ceph_assert(!manifest
.is_pinned(v
));
2569 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2570 string full_key
= mon
->store
->combine_strings("full", v
);
2571 tx
->erase(get_service_name(), full_key
);
2576 ceph_assert(num_pruned
> 0);
2579 manifest
.encode(bl
);
2580 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2588 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2590 op
->mark_osdmon_event(__func__
);
2591 Message
*m
= op
->get_req();
2592 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2594 switch (m
->get_type()) {
2596 case MSG_MON_COMMAND
:
2598 return preprocess_command(op
);
2599 } catch (const bad_cmd_get
& e
) {
2601 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2604 case CEPH_MSG_MON_GET_OSDMAP
:
2605 return preprocess_get_osdmap(op
);
2608 case MSG_OSD_MARK_ME_DOWN
:
2609 return preprocess_mark_me_down(op
);
2610 case MSG_OSD_MARK_ME_DEAD
:
2611 return preprocess_mark_me_dead(op
);
2613 return preprocess_full(op
);
2614 case MSG_OSD_FAILURE
:
2615 return preprocess_failure(op
);
2617 return preprocess_boot(op
);
2619 return preprocess_alive(op
);
2620 case MSG_OSD_PG_CREATED
:
2621 return preprocess_pg_created(op
);
2622 case MSG_OSD_PG_READY_TO_MERGE
:
2623 return preprocess_pg_ready_to_merge(op
);
2624 case MSG_OSD_PGTEMP
:
2625 return preprocess_pgtemp(op
);
2626 case MSG_OSD_BEACON
:
2627 return preprocess_beacon(op
);
2629 case CEPH_MSG_POOLOP
:
2630 return preprocess_pool_op(op
);
2632 case MSG_REMOVE_SNAPS
:
2633 return preprocess_remove_snaps(op
);
2635 case MSG_MON_GET_PURGED_SNAPS
:
2636 return preprocess_get_purged_snaps(op
);
2644 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2646 op
->mark_osdmon_event(__func__
);
2647 Message
*m
= op
->get_req();
2648 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2650 switch (m
->get_type()) {
2652 case MSG_OSD_MARK_ME_DOWN
:
2653 return prepare_mark_me_down(op
);
2654 case MSG_OSD_MARK_ME_DEAD
:
2655 return prepare_mark_me_dead(op
);
2657 return prepare_full(op
);
2658 case MSG_OSD_FAILURE
:
2659 return prepare_failure(op
);
2661 return prepare_boot(op
);
2663 return prepare_alive(op
);
2664 case MSG_OSD_PG_CREATED
:
2665 return prepare_pg_created(op
);
2666 case MSG_OSD_PGTEMP
:
2667 return prepare_pgtemp(op
);
2668 case MSG_OSD_PG_READY_TO_MERGE
:
2669 return prepare_pg_ready_to_merge(op
);
2670 case MSG_OSD_BEACON
:
2671 return prepare_beacon(op
);
2673 case MSG_MON_COMMAND
:
2675 return prepare_command(op
);
2676 } catch (const bad_cmd_get
& e
) {
2678 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2682 case CEPH_MSG_POOLOP
:
2683 return prepare_pool_op(op
);
2685 case MSG_REMOVE_SNAPS
:
2686 return prepare_remove_snaps(op
);
2696 bool OSDMonitor::should_propose(double& delay
)
2698 dout(10) << "should_propose" << dendl
;
2700 // if full map, propose immediately! any subsequent changes will be clobbered.
2701 if (pending_inc
.fullmap
.length())
2704 // adjust osd weights?
2705 if (!osd_weight
.empty() &&
2706 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2707 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2708 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2714 return PaxosService::should_propose(delay
);
2719 // ---------------------------
2722 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2724 op
->mark_osdmon_event(__func__
);
2725 auto m
= op
->get_req
<MMonGetOSDMap
>();
2727 uint64_t features
= mon
->get_quorum_con_features();
2728 if (op
->get_session() && op
->get_session()->con_features
)
2729 features
= op
->get_session()->con_features
;
2731 dout(10) << __func__
<< " " << *m
<< dendl
;
2732 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
, features
);
2733 epoch_t first
= get_first_committed();
2734 epoch_t last
= osdmap
.get_epoch();
2735 int max
= g_conf()->osd_map_message_max
;
2736 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2737 for (epoch_t e
= std::max(first
, m
->get_full_first());
2738 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2740 bufferlist
& bl
= reply
->maps
[e
];
2741 int r
= get_version_full(e
, features
, bl
);
2742 ceph_assert(r
>= 0);
2743 max_bytes
-= bl
.length();
2745 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2746 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2748 bufferlist
& bl
= reply
->incremental_maps
[e
];
2749 int r
= get_version(e
, features
, bl
);
2750 ceph_assert(r
>= 0);
2751 max_bytes
-= bl
.length();
2753 reply
->oldest_map
= first
;
2754 reply
->newest_map
= last
;
2755 mon
->send_reply(op
, reply
);
2760 // ---------------------------
2765 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2766 // check permissions
2767 MonSession
*session
= op
->get_session();
2770 if (!session
->is_capable("osd", MON_CAP_X
)) {
2771 dout(0) << "got MOSDFailure from entity with insufficient caps "
2772 << session
->caps
<< dendl
;
2775 if (fsid
!= mon
->monmap
->fsid
) {
2776 dout(0) << "check_source: on fsid " << fsid
2777 << " != " << mon
->monmap
->fsid
<< dendl
;
2784 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2786 op
->mark_osdmon_event(__func__
);
2787 auto m
= op
->get_req
<MOSDFailure
>();
2788 // who is target_osd
2789 int badboy
= m
->get_target_osd();
2791 // check permissions
2792 if (check_source(op
, m
->fsid
))
2795 // first, verify the reporting host is valid
2796 if (m
->get_orig_source().is_osd()) {
2797 int from
= m
->get_orig_source().num();
2798 if (!osdmap
.exists(from
) ||
2799 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2800 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2801 dout(5) << "preprocess_failure from dead osd." << from
2802 << ", ignoring" << dendl
;
2803 send_incremental(op
, m
->get_epoch()+1);
2810 if (osdmap
.is_down(badboy
)) {
2811 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2812 << " " << m
->get_target_addrs()
2813 << ", from " << m
->get_orig_source() << dendl
;
2814 if (m
->get_epoch() < osdmap
.get_epoch())
2815 send_incremental(op
, m
->get_epoch()+1);
2818 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2819 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2820 << " " << m
->get_target_addrs()
2821 << " != map's " << osdmap
.get_addrs(badboy
)
2822 << ", from " << m
->get_orig_source() << dendl
;
2823 if (m
->get_epoch() < osdmap
.get_epoch())
2824 send_incremental(op
, m
->get_epoch()+1);
2828 // already reported?
2829 if (osdmap
.is_down(badboy
) ||
2830 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2831 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2832 << " " << m
->get_target_addrs()
2833 << ", from " << m
->get_orig_source() << dendl
;
2834 if (m
->get_epoch() < osdmap
.get_epoch())
2835 send_incremental(op
, m
->get_epoch()+1);
2839 if (!can_mark_down(badboy
)) {
2840 dout(5) << "preprocess_failure ignoring report of osd."
2841 << m
->get_target_osd() << " " << m
->get_target_addrs()
2842 << " from " << m
->get_orig_source() << dendl
;
2846 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2847 << " " << m
->get_target_addrs()
2848 << ", from " << m
->get_orig_source() << dendl
;
2856 class C_AckMarkedDown
: public C_MonOp
{
2862 : C_MonOp(op
), osdmon(osdmon
) {}
2864 void _finish(int r
) override
{
2866 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2867 osdmon
->mon
->send_reply(
2874 false)); // ACK itself does not request an ack
2875 } else if (r
== -EAGAIN
) {
2876 osdmon
->dispatch(op
);
2878 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
2881 ~C_AckMarkedDown() override
{
2885 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
2887 op
->mark_osdmon_event(__func__
);
2888 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2889 int from
= m
->target_osd
;
2891 // check permissions
2892 if (check_source(op
, m
->fsid
))
2895 // first, verify the reporting host is valid
2896 if (!m
->get_orig_source().is_osd())
2899 if (!osdmap
.exists(from
) ||
2900 osdmap
.is_down(from
) ||
2901 osdmap
.get_addrs(from
) != m
->target_addrs
) {
2902 dout(5) << "preprocess_mark_me_down from dead osd."
2903 << from
<< ", ignoring" << dendl
;
2904 send_incremental(op
, m
->get_epoch()+1);
2908 // no down might be set
2909 if (!can_mark_down(from
))
2912 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
2913 << " " << m
->target_addrs
<< dendl
;
2917 if (m
->request_ack
) {
2918 Context
*c(new C_AckMarkedDown(this, op
));
2924 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
2926 op
->mark_osdmon_event(__func__
);
2927 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2928 int target_osd
= m
->target_osd
;
2930 ceph_assert(osdmap
.is_up(target_osd
));
2931 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
2933 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
2934 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2936 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
2940 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
2942 op
->mark_osdmon_event(__func__
);
2943 auto m
= op
->get_req
<MOSDMarkMeDead
>();
2944 int from
= m
->target_osd
;
2946 // check permissions
2947 if (check_source(op
, m
->fsid
)) {
2952 // first, verify the reporting host is valid
2953 if (!m
->get_orig_source().is_osd()) {
2958 if (!osdmap
.exists(from
) ||
2959 !osdmap
.is_down(from
)) {
2960 dout(5) << __func__
<< " from nonexistent or up osd." << from
2961 << ", ignoring" << dendl
;
2962 send_incremental(op
, m
->get_epoch()+1);
2970 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
2972 op
->mark_osdmon_event(__func__
);
2973 auto m
= op
->get_req
<MOSDMarkMeDead
>();
2974 int target_osd
= m
->target_osd
;
2976 ceph_assert(osdmap
.is_down(target_osd
));
2978 mon
->clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
2980 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
2981 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
2983 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
2984 wait_for_finished_proposal(
2987 [op
, this] (int r
) {
2989 mon
->no_reply(op
); // ignore on success
2996 bool OSDMonitor::can_mark_down(int i
)
2998 if (osdmap
.is_nodown(i
)) {
2999 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3000 << "will not mark it down" << dendl
;
3004 int num_osds
= osdmap
.get_num_osds();
3005 if (num_osds
== 0) {
3006 dout(5) << __func__
<< " no osds" << dendl
;
3009 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3010 float up_ratio
= (float)up
/ (float)num_osds
;
3011 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3012 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3013 << g_conf()->mon_osd_min_up_ratio
3014 << ", will not mark osd." << i
<< " down" << dendl
;
3020 bool OSDMonitor::can_mark_up(int i
)
3022 if (osdmap
.is_noup(i
)) {
3023 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3024 << "will not mark it up" << dendl
;
3032 * @note the parameter @p i apparently only exists here so we can output the
3033 * osd's id on messages.
3035 bool OSDMonitor::can_mark_out(int i
)
3037 if (osdmap
.is_noout(i
)) {
3038 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3039 << "will not mark it out" << dendl
;
3043 int num_osds
= osdmap
.get_num_osds();
3044 if (num_osds
== 0) {
3045 dout(5) << __func__
<< " no osds" << dendl
;
3048 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3049 float in_ratio
= (float)in
/ (float)num_osds
;
3050 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3052 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3053 << g_conf()->mon_osd_min_in_ratio
3054 << ", will not mark osd." << i
<< " out" << dendl
;
3056 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3057 << g_conf()->mon_osd_min_in_ratio
3058 << ", will not mark osds out" << dendl
;
3065 bool OSDMonitor::can_mark_in(int i
)
3067 if (osdmap
.is_noin(i
)) {
3068 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3069 << "will not mark it in" << dendl
;
3076 bool OSDMonitor::check_failures(utime_t now
)
3078 bool found_failure
= false;
3079 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3080 p
!= failure_info
.end();
3082 if (can_mark_down(p
->first
)) {
3083 found_failure
|= check_failure(now
, p
->first
, p
->second
);
3086 return found_failure
;
3089 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3091 // already pending failure?
3092 if (pending_inc
.new_state
.count(target_osd
) &&
3093 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3094 dout(10) << " already pending failure" << dendl
;
3098 set
<string
> reporters_by_subtree
;
3099 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3100 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3101 utime_t max_failed_since
= fi
.get_failed_since();
3102 utime_t failed_for
= now
- max_failed_since
;
3104 utime_t grace
= orig_grace
;
3105 double my_grace
= 0, peer_grace
= 0;
3107 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3108 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3109 decay_k
= ::log(.5) / halflife
;
3111 // scale grace period based on historical probability of 'lagginess'
3112 // (false positive failures due to slowness).
3113 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3114 double decay
= exp((double)failed_for
* decay_k
);
3115 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3116 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3117 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3121 // consider the peers reporting a failure a proxy for a potential
3122 // 'subcluster' over the overall cluster that is similarly
3123 // laggy. this is clearly not true in all cases, but will sometimes
3124 // help us localize the grace correction to a subset of the system
3125 // (say, a rack with a bad switch) that is unhappy.
3126 ceph_assert(fi
.reporters
.size());
3127 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3128 // get the parent bucket whose type matches with "reporter_subtree_level".
3129 // fall back to OSD if the level doesn't exist.
3130 if (osdmap
.exists(p
->first
)) {
3131 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3132 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3133 iter
== reporter_loc
.end()) {
3134 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3136 reporters_by_subtree
.insert(iter
->second
);
3138 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3139 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
3140 utime_t elapsed
= now
- xi
.down_stamp
;
3141 double decay
= exp((double)elapsed
* decay_k
);
3142 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3146 fi
.cancel_report(p
->first
);;
3147 p
= fi
.reporters
.erase(p
);
3151 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3152 peer_grace
/= (double)fi
.reporters
.size();
3153 grace
+= peer_grace
;
3156 dout(10) << " osd." << target_osd
<< " has "
3157 << fi
.reporters
.size() << " reporters, "
3158 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3159 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
3162 if (failed_for
>= grace
&&
3163 reporters_by_subtree
.size() >= g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3164 dout(1) << " we have enough reporters to mark osd." << target_osd
3165 << " down" << dendl
;
3166 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3168 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
3169 << osdmap
.crush
->get_full_location_ordered_string(
3172 << (int)reporters_by_subtree
.size()
3173 << " reporters from different "
3174 << reporter_subtree_level
<< " after "
3175 << failed_for
<< " >= grace " << grace
<< ")";
3181 void OSDMonitor::force_failure(int target_osd
, int by
)
3183 // already pending failure?
3184 if (pending_inc
.new_state
.count(target_osd
) &&
3185 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3186 dout(10) << " already pending failure" << dendl
;
3190 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
3191 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3192 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3193 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3195 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3197 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
3198 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3199 << ") (connection refused reported by osd." << by
<< ")";
3203 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3205 op
->mark_osdmon_event(__func__
);
3206 auto m
= op
->get_req
<MOSDFailure
>();
3207 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3208 << " " << m
->get_target_addrs()
3209 << " from " << m
->get_orig_source()
3210 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3212 int target_osd
= m
->get_target_osd();
3213 int reporter
= m
->get_orig_source().num();
3214 ceph_assert(osdmap
.is_up(target_osd
));
3215 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3219 if (m
->if_osd_failed()) {
3220 // calculate failure time
3221 utime_t now
= ceph_clock_now();
3222 utime_t failed_since
=
3223 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3226 if (m
->is_immediate()) {
3227 mon
->clog
->debug() << "osd." << m
->get_target_osd()
3228 << " reported immediately failed by "
3229 << m
->get_orig_source();
3230 force_failure(target_osd
, reporter
);
3233 mon
->clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3234 << m
->get_orig_source();
3236 failure_info_t
& fi
= failure_info
[target_osd
];
3237 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
3239 mon
->no_reply(old_op
);
3242 return check_failure(now
, target_osd
, fi
);
3244 // remove the report
3245 mon
->clog
->debug() << "osd." << m
->get_target_osd()
3246 << " failure report canceled by "
3247 << m
->get_orig_source();
3248 if (failure_info
.count(target_osd
)) {
3249 failure_info_t
& fi
= failure_info
[target_osd
];
3250 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
3252 mon
->no_reply(report_op
);
3254 if (fi
.reporters
.empty()) {
3255 dout(10) << " removing last failure_info for osd." << target_osd
3257 failure_info
.erase(target_osd
);
3259 dout(10) << " failure_info for osd." << target_osd
<< " now "
3260 << fi
.reporters
.size() << " reporters" << dendl
;
3263 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
3270 void OSDMonitor::process_failures()
3272 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3273 while (p
!= failure_info
.end()) {
3274 if (osdmap
.is_up(p
->first
)) {
3277 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3278 list
<MonOpRequestRef
> ls
;
3279 p
->second
.take_report_messages(ls
);
3280 failure_info
.erase(p
++);
3282 while (!ls
.empty()) {
3283 MonOpRequestRef o
= ls
.front();
3285 o
->mark_event(__func__
);
3286 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3287 send_latest(o
, m
->get_epoch());
3296 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3298 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3300 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3301 p
!= failure_info
.end();
3303 p
->second
.take_report_messages(ls
);
3305 failure_info
.clear();
3311 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3313 op
->mark_osdmon_event(__func__
);
3314 auto m
= op
->get_req
<MOSDBoot
>();
3315 int from
= m
->get_orig_source_inst().name
.num();
3317 // check permissions, ignore if failed (no response expected)
3318 MonSession
*session
= op
->get_session();
3321 if (!session
->is_capable("osd", MON_CAP_X
)) {
3322 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3323 << session
->caps
<< dendl
;
3327 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
3328 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3329 << " != " << mon
->monmap
->fsid
<< dendl
;
3333 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3334 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3338 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3340 // force all osds to have gone through luminous prior to upgrade to nautilus
3342 vector
<string
> missing
;
3343 if (!HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
3344 missing
.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
3346 if (!HAVE_FEATURE(m
->osd_features
, SERVER_JEWEL
)) {
3347 missing
.push_back("CEPH_FEATURE_SERVER_JEWEL");
3349 if (!HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
3350 missing
.push_back("CEPH_FEATURE_SERVER_KRAKEN");
3352 if (!HAVE_FEATURE(m
->osd_features
, OSD_RECOVERY_DELETES
)) {
3353 missing
.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
3356 if (!missing
.empty()) {
3357 using std::experimental::make_ostream_joiner
;
3360 copy(begin(missing
), end(missing
), make_ostream_joiner(ss
, ";"));
3362 mon
->clog
->info() << "disallowing boot of OSD "
3363 << m
->get_orig_source_inst()
3364 << " because the osd lacks " << ss
.str();
3369 // make sure osd versions do not span more than 3 releases
3370 if (HAVE_FEATURE(m
->osd_features
, SERVER_OCTOPUS
) &&
3371 osdmap
.require_osd_release
< ceph_release_t::mimic
) {
3372 mon
->clog
->info() << "disallowing boot of octopus+ OSD "
3373 << m
->get_orig_source_inst()
3374 << " because require_osd_release < mimic";
3378 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
3379 // we are reusing a jewel feature bit that was retired in luminous.
3380 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
3381 osdmap
.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT
) &&
3382 !(m
->osd_features
& CEPH_FEATURE_OSD_PGLOG_HARDLIMIT
)) {
3383 mon
->clog
->info() << "disallowing boot of OSD "
3384 << m
->get_orig_source_inst()
3385 << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
3390 if (osdmap
.is_up(from
) &&
3391 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3392 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3394 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3395 << " " << m
->get_orig_source_addrs()
3396 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3401 if (osdmap
.exists(from
) &&
3402 !osdmap
.get_uuid(from
).is_zero() &&
3403 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3404 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3405 << " clashes with existing osd: different fsid"
3406 << " (ours: " << osdmap
.get_uuid(from
)
3407 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3411 if (osdmap
.exists(from
) &&
3412 osdmap
.get_info(from
).up_from
> m
->version
&&
3413 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3414 m
->get_orig_source_addrs())) {
3415 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3416 send_latest(op
, m
->sb
.current_epoch
+1);
3421 if (!can_mark_up(from
)) {
3422 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3423 send_latest(op
, m
->sb
.current_epoch
+1);
3427 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
3434 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3436 op
->mark_osdmon_event(__func__
);
3437 auto m
= op
->get_req
<MOSDBoot
>();
3438 dout(7) << __func__
<< " from " << m
->get_source()
3440 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3441 << " cluster_addrs " << m
->cluster_addrs
3442 << " hb_back_addrs " << m
->hb_back_addrs
3443 << " hb_front_addrs " << m
->hb_front_addrs
3446 ceph_assert(m
->get_orig_source().is_osd());
3447 int from
= m
->get_orig_source().num();
3449 // does this osd exist?
3450 if (from
>= osdmap
.get_max_osd()) {
3451 dout(1) << "boot from osd." << from
<< " >= max_osd "
3452 << osdmap
.get_max_osd() << dendl
;
3456 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3457 if (pending_inc
.new_state
.count(from
))
3458 oldstate
^= pending_inc
.new_state
[from
];
3460 // already up? mark down first?
3461 if (osdmap
.is_up(from
)) {
3462 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3463 << osdmap
.get_addrs(from
) << dendl
;
3464 // preprocess should have caught these; if not, assert.
3465 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3466 m
->get_orig_source_addrs()) ||
3467 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3468 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3470 if (pending_inc
.new_state
.count(from
) == 0 ||
3471 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3472 // mark previous guy down
3473 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3475 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3476 } else if (pending_inc
.new_up_client
.count(from
)) {
3477 // already prepared, just wait
3478 dout(7) << __func__
<< " already prepared, waiting on "
3479 << m
->get_orig_source_addr() << dendl
;
3480 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3483 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3484 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3485 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3486 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3488 down_pending_out
.erase(from
); // if any
3491 osd_weight
[from
] = m
->sb
.weight
;
3494 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3496 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3497 // preprocess should have caught this; if not, assert.
3498 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3499 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3503 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3504 const osd_info_t
& i
= osdmap
.get_info(from
);
3505 if (i
.up_from
> i
.lost_at
) {
3506 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3507 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3512 bufferlist osd_metadata
;
3513 encode(m
->metadata
, osd_metadata
);
3514 pending_metadata
[from
] = osd_metadata
;
3515 pending_metadata_rm
.erase(from
);
3517 // adjust last clean unmount epoch?
3518 const osd_info_t
& info
= osdmap
.get_info(from
);
3519 dout(10) << " old osd_info: " << info
<< dendl
;
3520 if (m
->sb
.mounted
> info
.last_clean_begin
||
3521 (m
->sb
.mounted
== info
.last_clean_begin
&&
3522 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3523 epoch_t begin
= m
->sb
.mounted
;
3524 epoch_t end
= m
->sb
.clean_thru
;
3526 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3527 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3528 << ") -> [" << begin
<< "-" << end
<< ")"
3530 pending_inc
.new_last_clean_interval
[from
] =
3531 pair
<epoch_t
,epoch_t
>(begin
, end
);
3534 if (pending_inc
.new_xinfo
.count(from
) == 0)
3535 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3536 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
3537 if (m
->boot_epoch
== 0) {
3538 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3539 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3540 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3542 if (xi
.down_stamp
.sec()) {
3543 int interval
= ceph_clock_now().sec() -
3544 xi
.down_stamp
.sec();
3545 if (g_conf()->mon_osd_laggy_max_interval
&&
3546 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3547 interval
= g_conf()->mon_osd_laggy_max_interval
;
3550 interval
* g_conf()->mon_osd_laggy_weight
+
3551 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3553 xi
.laggy_probability
=
3554 g_conf()->mon_osd_laggy_weight
+
3555 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3556 dout(10) << " laggy, now xi " << xi
<< dendl
;
3559 // set features shared by the osd
3560 if (m
->osd_features
)
3561 xi
.features
= m
->osd_features
;
3563 xi
.features
= m
->get_connection()->get_features();
3566 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3567 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3568 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3569 (g_conf()->mon_osd_auto_mark_in
)) {
3570 if (can_mark_in(from
)) {
3571 if (xi
.old_weight
> 0) {
3572 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3575 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3578 dout(7) << __func__
<< " NOIN set, will not mark in "
3579 << m
->get_orig_source_addr() << dendl
;
3584 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3589 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3591 op
->mark_osdmon_event(__func__
);
3592 auto m
= op
->get_req
<MOSDBoot
>();
3593 dout(7) << "_booted " << m
->get_orig_source_inst()
3594 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3597 mon
->clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3601 send_latest(op
, m
->sb
.current_epoch
+1);
3608 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3610 op
->mark_osdmon_event(__func__
);
3611 auto m
= op
->get_req
<MOSDFull
>();
3612 int from
= m
->get_orig_source().num();
3614 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3616 // check permissions, ignore if failed
3617 MonSession
*session
= op
->get_session();
3620 if (!session
->is_capable("osd", MON_CAP_X
)) {
3621 dout(0) << "MOSDFull from entity with insufficient privileges:"
3622 << session
->caps
<< dendl
;
3626 // ignore a full message from the osd instance that already went down
3627 if (!osdmap
.exists(from
)) {
3628 dout(7) << __func__
<< " ignoring full message from nonexistent "
3629 << m
->get_orig_source_inst() << dendl
;
3632 if ((!osdmap
.is_up(from
) &&
3633 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3634 m
->get_orig_source_addrs())) ||
3635 (osdmap
.is_up(from
) &&
3636 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3637 dout(7) << __func__
<< " ignoring full message from down "
3638 << m
->get_orig_source_inst() << dendl
;
3642 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3644 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3645 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3646 << " " << m
->get_orig_source_inst() << dendl
;
3647 _reply_map(op
, m
->version
);
3651 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3652 << " " << m
->get_orig_source_inst() << dendl
;
3659 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3661 op
->mark_osdmon_event(__func__
);
3662 auto m
= op
->get_req
<MOSDFull
>();
3663 const int from
= m
->get_orig_source().num();
3665 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3666 const unsigned want_state
= m
->state
& mask
; // safety first
3668 unsigned cur_state
= osdmap
.get_state(from
);
3669 auto p
= pending_inc
.new_state
.find(from
);
3670 if (p
!= pending_inc
.new_state
.end()) {
3671 cur_state
^= p
->second
;
3675 set
<string
> want_state_set
, cur_state_set
;
3676 OSDMap::calc_state_set(want_state
, want_state_set
);
3677 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3679 if (cur_state
!= want_state
) {
3680 if (p
!= pending_inc
.new_state
.end()) {
3683 pending_inc
.new_state
[from
] = 0;
3685 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3686 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3687 << " -> " << want_state_set
<< dendl
;
3689 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3690 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3693 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3700 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3702 op
->mark_osdmon_event(__func__
);
3703 auto m
= op
->get_req
<MOSDAlive
>();
3704 int from
= m
->get_orig_source().num();
3706 // check permissions, ignore if failed
3707 MonSession
*session
= op
->get_session();
3710 if (!session
->is_capable("osd", MON_CAP_X
)) {
3711 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3712 << session
->caps
<< dendl
;
3716 if (!osdmap
.is_up(from
) ||
3717 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3718 dout(7) << "preprocess_alive ignoring alive message from down "
3719 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3724 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3726 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3727 _reply_map(op
, m
->version
);
3731 dout(10) << "preprocess_alive want up_thru " << m
->want
3732 << " from " << m
->get_orig_source_inst() << dendl
;
3739 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3741 op
->mark_osdmon_event(__func__
);
3742 auto m
= op
->get_req
<MOSDAlive
>();
3743 int from
= m
->get_orig_source().num();
3745 if (0) { // we probably don't care much about these
3746 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
3749 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3750 << " from " << m
->get_orig_source_inst() << dendl
;
3752 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3753 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3757 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3759 op
->mark_osdmon_event(__func__
);
3760 dout(7) << "_reply_map " << e
3761 << " from " << op
->get_req()->get_orig_source_inst()
3767 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3769 op
->mark_osdmon_event(__func__
);
3770 auto m
= op
->get_req
<MOSDPGCreated
>();
3771 dout(10) << __func__
<< " " << *m
<< dendl
;
3772 auto session
= op
->get_session();
3775 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3778 if (!session
->is_capable("osd", MON_CAP_X
)) {
3779 derr
<< __func__
<< " received from entity "
3780 << "with insufficient privileges " << session
->caps
<< dendl
;
3783 // always forward the "created!" to the leader
3787 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3789 op
->mark_osdmon_event(__func__
);
3790 auto m
= op
->get_req
<MOSDPGCreated
>();
3791 dout(10) << __func__
<< " " << *m
<< dendl
;
3792 auto src
= m
->get_orig_source();
3793 auto from
= src
.num();
3794 if (!src
.is_osd() ||
3795 !mon
->osdmon()->osdmap
.is_up(from
) ||
3796 !mon
->osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3797 m
->get_orig_source_addrs())) {
3798 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
3801 pending_created_pgs
.push_back(m
->pgid
);
3805 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3807 op
->mark_osdmon_event(__func__
);
3808 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3809 dout(10) << __func__
<< " " << *m
<< dendl
;
3810 const pg_pool_t
*pi
;
3811 auto session
= op
->get_session();
3813 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3816 if (!session
->is_capable("osd", MON_CAP_X
)) {
3817 derr
<< __func__
<< " received from entity "
3818 << "with insufficient privileges " << session
->caps
<< dendl
;
3821 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3823 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3826 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3827 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
3830 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
3831 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
3834 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
3835 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
3845 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
3847 op
->mark_osdmon_event(__func__
);
3848 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3849 dout(10) << __func__
<< " " << *m
<< dendl
;
3851 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
3852 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
3854 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
3855 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
3856 p
.get_pg_num_pending() > m
->pgid
.ps()) {
3857 dout(10) << __func__
3858 << " race with concurrent pg_num[_pending] update, will retry"
3860 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3865 p
.dec_pg_num(m
->pgid
,
3869 m
->last_epoch_started
,
3870 m
->last_epoch_clean
);
3871 p
.last_change
= pending_inc
.epoch
;
3873 // back off the merge attempt!
3874 p
.set_pg_num_pending(p
.get_pg_num());
3877 // force pre-nautilus clients to resend their ops, since they
3878 // don't understand pg_num_pending changes form a new interval
3879 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
3881 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
3883 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
3886 prob
> (double)(rand() % 1000)/1000.0) {
3887 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
3888 auto n
= new MMonCommand(mon
->monmap
->get_fsid());
3889 n
->set_connection(m
->get_connection());
3890 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
3891 osdmap
.get_pool_name(m
->pgid
.pool()) +
3892 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
3893 stringify(m
->pgid
.ps() + 1) + "\"}" };
3894 MonOpRequestRef nop
= mon
->op_tracker
.create_request
<MonOpRequest
>(n
);
3895 nop
->set_type_service();
3896 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
3898 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3907 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
3909 auto m
= op
->get_req
<MOSDPGTemp
>();
3910 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
3911 mempool::osdmap::vector
<int> empty
;
3912 int from
= m
->get_orig_source().num();
3913 size_t ignore_cnt
= 0;
3916 MonSession
*session
= op
->get_session();
3919 if (!session
->is_capable("osd", MON_CAP_X
)) {
3920 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3921 << session
->caps
<< dendl
;
3925 if (!osdmap
.is_up(from
) ||
3926 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3927 dout(7) << "ignoring pgtemp message from down "
3928 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3937 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
3938 dout(20) << " " << p
->first
3939 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
3940 << " -> " << p
->second
<< dendl
;
3942 // does the pool exist?
3943 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
3945 * 1. If the osdmap does not have the pool, it means the pool has been
3946 * removed in-between the osd sending this message and us handling it.
3947 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
3948 * not exist in the pending either, as the osds would not send a
3949 * message about a pool they know nothing about (yet).
3950 * 3. However, if the pool does exist in the pending, then it must be a
3951 * new pool, and not relevant to this message (see 1).
3953 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3954 << ": pool has been removed" << dendl
;
3959 int acting_primary
= -1;
3960 osdmap
.pg_to_up_acting_osds(
3961 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
3962 if (acting_primary
!= from
) {
3963 /* If the source isn't the primary based on the current osdmap, we know
3964 * that the interval changed and that we can discard this message.
3965 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
3966 * which of two pg temp mappings on the same pg is more recent.
3968 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3969 << ": primary has changed" << dendl
;
3975 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
3976 osdmap
.primary_temp
->count(p
->first
)))
3979 // NOTE: we assume that this will clear pg_primary, so consider
3980 // an existing pg_primary field to imply a change
3981 if (p
->second
.size() &&
3982 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
3983 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
3984 osdmap
.primary_temp
->count(p
->first
)))
3988 // should we ignore all the pgs?
3989 if (ignore_cnt
== m
->pg_temp
.size())
3992 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
3993 _reply_map(op
, m
->map_epoch
);
4000 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4002 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4003 auto ut
= pending_inc
.new_up_thru
.find(from
);
4004 if (ut
!= pending_inc
.new_up_thru
.end()) {
4005 old_up_thru
= ut
->second
;
4007 if (up_thru
> old_up_thru
) {
4008 // set up_thru too, so the osd doesn't have to ask again
4009 pending_inc
.new_up_thru
[from
] = up_thru
;
4013 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4015 op
->mark_osdmon_event(__func__
);
4016 auto m
= op
->get_req
<MOSDPGTemp
>();
4017 int from
= m
->get_orig_source().num();
4018 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4019 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4020 uint64_t pool
= p
->first
.pool();
4021 if (pending_inc
.old_pools
.count(pool
)) {
4022 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4023 << ": pool pending removal" << dendl
;
4026 if (!osdmap
.have_pg_pool(pool
)) {
4027 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4028 << ": pool has been removed" << dendl
;
4031 pending_inc
.new_pg_temp
[p
->first
] =
4032 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4034 // unconditionally clear pg_primary (until this message can encode
4035 // a change for that, too.. at which point we need to also fix
4036 // preprocess_pg_temp)
4037 if (osdmap
.primary_temp
->count(p
->first
) ||
4038 pending_inc
.new_primary_temp
.count(p
->first
))
4039 pending_inc
.new_primary_temp
[p
->first
] = -1;
4042 // set up_thru too, so the osd doesn't have to ask again
4043 update_up_thru(from
, m
->map_epoch
);
4045 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
4052 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4054 op
->mark_osdmon_event(__func__
);
4055 auto m
= op
->get_req
<MRemoveSnaps
>();
4056 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4058 // check privilege, ignore if failed
4059 MonSession
*session
= op
->get_session();
4063 if (!session
->caps
.is_capable(
4065 session
->entity_name
,
4066 "osd", "osd pool rmsnap", {}, true, true, false,
4067 session
->get_peer_socket_addr())) {
4068 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4069 << session
->caps
<< dendl
;
4073 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4074 q
!= m
->snaps
.end();
4076 if (!osdmap
.have_pg_pool(q
->first
)) {
4077 dout(10) << " ignoring removed_snaps " << q
->second
4078 << " on non-existent pool " << q
->first
<< dendl
;
4081 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4082 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4083 p
!= q
->second
.end();
4085 if (*p
> pi
->get_snap_seq() ||
4086 !_is_removed_snap(q
->first
, *p
)) {
4092 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4093 auto reply
= make_message
<MRemoveSnaps
>();
4094 reply
->snaps
= m
->snaps
;
4095 mon
->send_reply(op
, reply
.detach());
4102 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4104 op
->mark_osdmon_event(__func__
);
4105 auto m
= op
->get_req
<MRemoveSnaps
>();
4106 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4108 for (auto& [pool
, snaps
] : m
->snaps
) {
4109 if (!osdmap
.have_pg_pool(pool
)) {
4110 dout(10) << " ignoring removed_snaps " << snaps
4111 << " on non-existent pool " << pool
<< dendl
;
4115 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4116 for (auto s
: snaps
) {
4117 if (!_is_removed_snap(pool
, s
) &&
4118 (!pending_inc
.new_pools
.count(pool
) ||
4119 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4120 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4121 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4122 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4123 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4124 newpi
->removed_snaps
.insert(s
);
4125 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4126 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4128 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4129 if (s
> newpi
->get_snap_seq()) {
4130 dout(10) << " pool " << pool
<< " snap_seq "
4131 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4132 newpi
->set_snap_seq(s
);
4134 newpi
->set_snap_epoch(pending_inc
.epoch
);
4135 dout(10) << " added pool " << pool
<< " snap " << s
4136 << " to removed_snaps queue" << dendl
;
4137 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4142 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4143 auto reply
= make_message
<MRemoveSnaps
>();
4144 reply
->snaps
= m
->snaps
;
4145 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
// Reply to MMonGetPurgedSnaps: iterate "purged_epoch_%lx" keys under
// OSD_SNAP_PREFIX in the mon store starting from m->start, decode each
// epoch's per-pool purged-snap interval sets into r, and answer with an
// MMonGetPurgedSnapsReply covering [m->start, last-parsed epoch].
// NOTE(review): this span is a line-mangled paste with several original
// lines missing (iterator positioning, loop braces, the decode of the
// value, the size-cutoff branch); it is left byte-identical rather than
// reconstructed — restore the elided lines from revision history.
4151 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4153 op
->mark_osdmon_event(__func__
);
4154 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4155 dout(7) << __func__
<< " " << *m
<< dendl
;
// accumulator: epoch -> (pool id -> purged snap interval set)
4157 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4159 string k
= make_purged_snap_epoch_key(m
->start
);
4160 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
// 'epoch' tracks the last epoch folded into the reply (defaults to m->last)
4162 unsigned long epoch
= m
->last
;
4163 while (it
->valid()) {
// stop at the first key outside the purged_epoch_ namespace
4164 if (it
->key().find("purged_epoch_") != 0) {
4167 string k
= it
->key();
// parse the hex epoch out of the key name
4168 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4170 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
// stop once we pass the requested upper bound
4171 } else if (epoch
> m
->last
) {
4174 bufferlist bl
= it
->value();
4175 auto p
= bl
.cbegin();
4179 } catch (buffer::error
& e
) {
4180 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
// rough size accounting feeding the reply-size cutoff below
// (presumably 'v' is the decoded per-epoch map — elided lines; verify)
4185 n
+= 4 + v
.size() * 16;
4188 // impose a semi-arbitrary limit to message size
4194 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4195 reply
->purged_snaps
.swap(r
);
4196 mon
->send_reply(op
, reply
.detach());
4202 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4204 op
->mark_osdmon_event(__func__
);
4206 auto session
= op
->get_session();
4209 dout(10) << __func__
<< " no monitor session!" << dendl
;
4212 if (!session
->is_capable("osd", MON_CAP_X
)) {
4213 derr
<< __func__
<< " received from entity "
4214 << "with insufficient privileges " << session
->caps
<< dendl
;
4217 // Always forward the beacon to the leader, even if they are the same as
4218 // the old one. The leader will mark as down osds that haven't sent
4219 // beacon for a few minutes.
4223 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4225 op
->mark_osdmon_event(__func__
);
4226 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4227 const auto src
= beacon
->get_orig_source();
4228 dout(10) << __func__
<< " " << *beacon
4229 << " from " << src
<< dendl
;
4230 int from
= src
.num();
4232 if (!src
.is_osd() ||
4233 !osdmap
.is_up(from
) ||
4234 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4235 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4236 // share some new maps with this guy in case it may not be
4237 // aware of its own deadness...
4238 send_latest(op
, beacon
->version
+1);
4240 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4244 last_osd_report
[from
] = ceph_clock_now();
4245 osd_epochs
[from
] = beacon
->version
;
4247 for (const auto& pg
: beacon
->pgs
) {
4248 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
4251 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4252 beacon
->last_purged_snaps_scrub
) {
4253 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4254 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4256 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4257 beacon
->last_purged_snaps_scrub
;
4267 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4269 op
->mark_osdmon_event(__func__
);
4270 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4271 << " start " << start
<< dendl
;
4275 send_incremental(op
, start
);
4279 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
4281 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
, features
);
4282 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4283 r
->oldest_map
= get_first_committed();
4284 r
->newest_map
= osdmap
.get_epoch();
4288 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4290 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4291 << std::hex
<< features
<< std::dec
<< dendl
;
4292 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
, features
);
4293 m
->oldest_map
= get_first_committed();
4294 m
->newest_map
= osdmap
.get_epoch();
4296 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4298 int err
= get_version(e
, features
, bl
);
4300 ceph_assert(bl
.length());
4301 // if (get_version(e, bl) > 0) {
4302 dout(20) << "build_incremental inc " << e
<< " "
4303 << bl
.length() << " bytes" << dendl
;
4304 m
->incremental_maps
[e
] = bl
;
4306 ceph_assert(err
== -ENOENT
);
4307 ceph_assert(!bl
.length());
4308 get_version_full(e
, features
, bl
);
4309 if (bl
.length() > 0) {
4310 //else if (get_version("full", e, bl) > 0) {
4311 dout(20) << "build_incremental full " << e
<< " "
4312 << bl
.length() << " bytes" << dendl
;
4315 ceph_abort(); // we should have all maps.
4322 void OSDMonitor::send_full(MonOpRequestRef op
)
4324 op
->mark_osdmon_event(__func__
);
4325 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
4326 mon
->send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4329 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4331 op
->mark_osdmon_event(__func__
);
4333 MonSession
*s
= op
->get_session();
4337 // oh, we can tell the other mon to do it
4338 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4340 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4341 r
->send_osdmap_first
= first
;
4342 s
->proxy_con
->send_message(r
);
4343 op
->mark_event("reply: send routed send_osdmap_first reply");
4346 send_incremental(first
, s
, false, op
);
4350 void OSDMonitor::send_incremental(epoch_t first
,
4351 MonSession
*session
,
4353 MonOpRequestRef req
)
4355 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4356 << " to " << session
->name
<< dendl
;
4358 // get feature of the peer
4359 // use quorum_con_features, if it's an anonymous connection.
4360 uint64_t features
= session
->con_features
? session
->con_features
:
4361 mon
->get_quorum_con_features();
4363 if (first
<= session
->osd_epoch
) {
4364 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4365 << session
->osd_epoch
<< dendl
;
4366 first
= session
->osd_epoch
+ 1;
4369 if (first
< get_first_committed()) {
4370 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4371 m
->oldest_map
= get_first_committed();
4372 m
->newest_map
= osdmap
.get_epoch();
4374 first
= get_first_committed();
4376 int err
= get_version_full(first
, features
, bl
);
4377 ceph_assert(err
== 0);
4378 ceph_assert(bl
.length());
4379 dout(20) << "send_incremental starting with base full "
4380 << first
<< " " << bl
.length() << " bytes" << dendl
;
4381 m
->maps
[first
] = bl
;
4384 mon
->send_reply(req
, m
);
4385 session
->osd_epoch
= first
;
4388 session
->con
->send_message(m
);
4389 session
->osd_epoch
= first
;
4394 while (first
<= osdmap
.get_epoch()) {
4395 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4396 osdmap
.get_epoch());
4397 MOSDMap
*m
= build_incremental(first
, last
, features
);
4400 // send some maps. it may not be all of them, but it will get them
4402 mon
->send_reply(req
, m
);
4404 session
->con
->send_message(m
);
4407 session
->osd_epoch
= last
;
4413 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4415 return get_version(ver
, mon
->get_quorum_con_features(), bl
);
4418 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4420 OSDMap::Incremental inc
;
4421 auto q
= bl
.cbegin();
4423 // always encode with subset of osdmap's canonical features
4424 uint64_t f
= features
& inc
.encode_features
;
4425 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4428 if (inc
.fullmap
.length()) {
4429 // embedded full map?
4431 m
.decode(inc
.fullmap
);
4432 inc
.fullmap
.clear();
4433 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4435 if (inc
.crush
.length()) {
4436 // embedded crush map
4438 auto p
= inc
.crush
.cbegin();
4441 c
.encode(inc
.crush
, f
);
4443 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4446 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4449 auto q
= bl
.cbegin();
4451 // always encode with subset of osdmap's canonical features
4452 uint64_t f
= features
& m
.get_encoding_features();
4453 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4456 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4459 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
4461 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4462 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4465 int ret
= PaxosService::get_version(ver
, bl
);
4469 // NOTE: this check is imprecise; the OSDMap encoding features may
4470 // be a subset of the latest mon quorum features, but worst case we
4471 // reencode once and then cache the (identical) result under both
4473 if (significant_features
!=
4474 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4475 reencode_incremental_map(bl
, features
);
4477 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4481 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4484 int err
= get_version(ver
, inc_bl
);
4485 ceph_assert(err
== 0);
4486 ceph_assert(inc_bl
.length());
4488 auto p
= inc_bl
.cbegin();
4490 dout(10) << __func__
<< " "
4491 << " epoch " << inc
.epoch
4492 << " inc_crc " << inc
.inc_crc
4493 << " full_crc " << inc
.full_crc
4494 << " encode_features " << inc
.encode_features
<< dendl
;
4498 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4500 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4502 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4503 if (closest_pinned
== 0) {
4506 if (closest_pinned
> ver
) {
4507 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4509 ceph_assert(closest_pinned
<= ver
);
4511 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4513 // get osdmap incremental maps and apply on top of this one.
4515 bool has_cached_osdmap
= false;
4516 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4517 if (full_osd_cache
.lookup({v
, mon
->get_quorum_con_features()},
4519 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4521 has_cached_osdmap
= true;
4526 if (!has_cached_osdmap
) {
4527 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4529 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4530 << " not available! error: " << cpp_strerror(err
) << dendl
;
4532 ceph_assert(err
== 0);
4535 ceph_assert(osdm_bl
.length());
4538 osdm
.decode(osdm_bl
);
4540 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4541 << " e" << osdm
.epoch
4542 << " crc " << osdm
.get_crc()
4543 << " -- applying incremental maps." << dendl
;
4545 uint64_t encode_features
= 0;
4546 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4547 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4549 OSDMap::Incremental inc
;
4550 int err
= get_inc(v
, inc
);
4551 ceph_assert(err
== 0);
4553 encode_features
= inc
.encode_features
;
4555 err
= osdm
.apply_incremental(inc
);
4556 ceph_assert(err
== 0);
4558 // this block performs paranoid checks on map retrieval
4559 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4560 inc
.full_crc
!= 0) {
4562 uint64_t f
= encode_features
;
4564 f
= (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
4567 // encode osdmap to force calculating crcs
4569 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4570 // decode osdmap to compare crcs with what's expected by incremental
4574 if (tosdm
.get_crc() != inc
.full_crc
) {
4576 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4577 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4578 ceph_abort_msg("osdmap crc mismatch");
4582 // note: we cannot add the recently computed map to the cache, as is,
4583 // because we have not encoded the map into a bl.
4586 if (!encode_features
) {
4587 dout(10) << __func__
4588 << " last incremental map didn't have features;"
4589 << " defaulting to quorum's or all" << dendl
;
4591 (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
4593 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
4598 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4600 return get_version_full(ver
, mon
->get_quorum_con_features(), bl
);
4603 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4606 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4607 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4610 int ret
= PaxosService::get_version_full(ver
, bl
);
4611 if (ret
== -ENOENT
) {
4613 ret
= get_full_from_pinned_map(ver
, bl
);
4618 // NOTE: this check is imprecise; the OSDMap encoding features may
4619 // be a subset of the latest mon quorum features, but worst case we
4620 // reencode once and then cache the (identical) result under both
4622 if (significant_features
!=
4623 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4624 reencode_full_map(bl
, features
);
4626 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4630 epoch_t
OSDMonitor::blacklist(const entity_addrvec_t
& av
, utime_t until
)
4632 dout(10) << "blacklist " << av
<< " until " << until
<< dendl
;
4633 for (auto a
: av
.v
) {
4634 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4635 a
.set_type(entity_addr_t::TYPE_ANY
);
4637 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4639 pending_inc
.new_blacklist
[a
] = until
;
4641 return pending_inc
.epoch
;
4644 epoch_t
OSDMonitor::blacklist(entity_addr_t a
, utime_t until
)
4646 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4647 a
.set_type(entity_addr_t::TYPE_ANY
);
4649 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4651 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
4652 pending_inc
.new_blacklist
[a
] = until
;
4653 return pending_inc
.epoch
;
4657 void OSDMonitor::check_osdmap_subs()
4659 dout(10) << __func__
<< dendl
;
4660 if (!osdmap
.get_epoch()) {
4663 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
4664 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
4667 auto p
= osdmap_subs
->second
->begin();
4671 check_osdmap_sub(sub
);
4675 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4677 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4678 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4679 if (sub
->next
<= osdmap
.get_epoch()) {
4681 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4683 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4685 mon
->session_map
.remove_sub(sub
);
4687 sub
->next
= osdmap
.get_epoch() + 1;
4691 void OSDMonitor::check_pg_creates_subs()
4693 if (!osdmap
.get_num_up_osds()) {
4696 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4697 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
4698 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4699 if (pg_creates_subs
== session_map
.subs
.end()) {
4702 for (auto sub
: *pg_creates_subs
->second
) {
4703 check_pg_creates_sub(sub
);
4708 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4710 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4711 ceph_assert(sub
->type
== "osd_pg_creates");
4712 // only send these if the OSD is up. we will check_subs() when they do
4713 // come up so they will get the creates then.
4714 if (sub
->session
->name
.is_osd() &&
4715 mon
->osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4716 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4717 sub
->session
->con
.get(),
4722 void OSDMonitor::do_application_enable(int64_t pool_id
,
4723 const std::string
&app_name
,
4724 const std::string
&app_key
,
4725 const std::string
&app_value
)
4727 ceph_assert(paxos
->is_plugged() && is_writeable());
4729 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4732 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4734 auto pp
= osdmap
.get_pg_pool(pool_id
);
4735 ceph_assert(pp
!= nullptr);
4738 if (pending_inc
.new_pools
.count(pool_id
)) {
4739 p
= pending_inc
.new_pools
[pool_id
];
4742 if (app_key
.empty()) {
4743 p
.application_metadata
.insert({app_name
, {}});
4745 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4747 p
.last_change
= pending_inc
.epoch
;
4748 pending_inc
.new_pools
[pool_id
] = p
;
4751 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4752 pool_opts_t::key_t opt
,
4753 pool_opts_t::value_t val
)
4755 auto p
= pending_inc
.new_pools
.try_emplace(
4756 pool_id
, *osdmap
.get_pg_pool(pool_id
));
4757 p
.first
->second
.opts
.set(opt
, val
);
4760 unsigned OSDMonitor::scan_for_creating_pgs(
4761 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4762 const mempool::osdmap::set
<int64_t>& removed_pools
,
4764 creating_pgs_t
* creating_pgs
) const
4766 unsigned queued
= 0;
4767 for (auto& p
: pools
) {
4768 int64_t poolid
= p
.first
;
4769 if (creating_pgs
->created_pools
.count(poolid
)) {
4770 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4773 const pg_pool_t
& pool
= p
.second
;
4774 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
4775 pool
.get_type(), pool
.get_size());
4776 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4779 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4780 const auto created
= pool
.get_last_change();
4781 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4782 dout(10) << __func__
<< " no change in pool " << poolid
4783 << " " << pool
<< dendl
;
4786 if (removed_pools
.count(poolid
)) {
4787 dout(10) << __func__
<< " pool is being removed: " << poolid
4788 << " " << pool
<< dendl
;
4791 dout(10) << __func__
<< " queueing pool create for " << poolid
4792 << " " << pool
<< dendl
;
4793 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
4800 void OSDMonitor::update_creating_pgs()
4802 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4803 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4804 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
4805 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4806 for (const auto& pg
: creating_pgs
.pgs
) {
4807 int acting_primary
= -1;
4808 auto pgid
= pg
.first
;
4809 if (!osdmap
.pg_exists(pgid
)) {
4810 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4814 auto mapped
= pg
.second
.create_epoch
;
4815 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4817 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4818 // check the previous creating_pgs, look for the target to whom the pg was
4819 // previously mapped
4820 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
4821 const auto last_acting_primary
= pgs_by_epoch
.first
;
4822 for (auto& pgs
: pgs_by_epoch
.second
) {
4823 if (pgs
.second
.count(spgid
)) {
4824 if (last_acting_primary
== acting_primary
) {
4827 dout(20) << __func__
<< " " << pgid
<< " "
4828 << " acting_primary:" << last_acting_primary
4829 << " -> " << acting_primary
<< dendl
;
4830 // note epoch if the target of the create message changed.
4831 mapped
= mapping
.get_epoch();
4836 mapped
= mapping
.get_epoch();
4840 dout(10) << __func__
<< " will instruct osd." << acting_primary
4841 << " to create " << pgid
<< "@" << mapped
<< dendl
;
4842 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
4844 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
4845 creating_pgs_epoch
= mapping
.get_epoch();
4848 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
4850 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
4851 << " " << creating_pgs_by_osd_epoch
<< dendl
;
4852 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4853 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
4854 dout(20) << __func__
4855 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
4856 // the subscribers will be updated when the mapping is completed anyway
4859 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
4860 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
4862 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
4864 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
4865 MOSDPGCreate2
*m
= nullptr;
4867 bool old
= osdmap
.require_osd_release
< ceph_release_t::nautilus
;
4870 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
4871 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
4872 auto epoch
= epoch_pgs
->first
;
4873 auto& pgs
= epoch_pgs
->second
;
4874 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4875 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
4877 for (auto& pg
: pgs
) {
4878 // Need the create time from the monitor using its clock to set
4879 // last_scrub_stamp upon pg creation.
4880 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
4881 ceph_assert(create
!= creating_pgs
.pgs
.end());
4884 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
4886 oldm
->mkpg
.emplace(pg
.pgid
,
4887 pg_create_t
{create
->second
.create_epoch
, pg
.pgid
, 0});
4888 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.create_stamp
);
4891 m
= new MOSDPGCreate2(creating_pgs_epoch
);
4893 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
4894 create
->second
.create_stamp
));
4895 if (create
->second
.history
.epoch_created
) {
4896 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
4897 << " " << create
->second
.past_intervals
<< dendl
;
4898 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
4899 create
->second
.past_intervals
));
4902 dout(20) << __func__
<< " will create " << pg
4903 << " at " << create
->second
.create_epoch
<< dendl
;
4907 con
->send_message(m
);
4909 con
->send_message(oldm
);
4911 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4912 << " has nothing to send" << dendl
;
4916 // sub is current through last + 1
4923 void OSDMonitor::tick()
4925 if (!is_active()) return;
4927 dout(10) << osdmap
<< dendl
;
4929 // always update osdmap manifest, regardless of being the leader.
4930 load_osdmap_manifest();
4932 if (!mon
->is_leader()) return;
4934 bool do_propose
= false;
4935 utime_t now
= ceph_clock_now();
4937 if (handle_osd_timeouts(now
, last_osd_report
)) {
4942 if (check_failures(now
)) {
4946 // Force a proposal if we need to prune; pruning is performed on
4947 // ``encode_pending()``, hence why we need to regularly trigger a proposal
4948 // even if there's nothing going on.
4949 if (is_prune_enabled() && should_prune()) {
4953 // mark down osds out?
4955 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
4956 * influence at all. The decision is made based on the ratio of "in" osds,
4957 * and the function returns false if this ratio is lower that the minimum
4958 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
4960 if (can_mark_out(-1)) {
4961 string down_out_subtree_limit
= g_conf().get_val
<string
>(
4962 "mon_osd_down_out_subtree_limit");
4963 set
<int> down_cache
; // quick cache of down subtrees
4965 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
4966 while (i
!= down_pending_out
.end()) {
4972 if (osdmap
.is_down(o
) &&
4975 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
4976 utime_t grace
= orig_grace
;
4977 double my_grace
= 0.0;
4979 if (g_conf()->mon_osd_adjust_down_out_interval
) {
4980 // scale grace period the same way we do the heartbeat grace.
4981 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
4982 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
4983 double decay_k
= ::log(.5) / halflife
;
4984 double decay
= exp((double)down
* decay_k
);
4985 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
4986 << " down for " << down
<< " decay " << decay
<< dendl
;
4987 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
4991 // is this an entire large subtree down?
4992 if (down_out_subtree_limit
.length()) {
4993 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
4995 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
4996 dout(10) << "tick entire containing " << down_out_subtree_limit
4997 << " subtree for osd." << o
4998 << " is down; resetting timer" << dendl
;
4999 // reset timer, too.
5000 down_pending_out
[o
] = now
;
5006 bool down_out
= !osdmap
.is_destroyed(o
) &&
5007 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5008 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5009 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5010 // this is not precise enough as we did not make a note when this osd
5011 // was marked as destroyed, but let's not bother with that
5012 // complexity for now.
5013 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5014 if (down_out
|| destroyed_out
) {
5015 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5016 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5017 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5019 // set the AUTOOUT bit.
5020 if (pending_inc
.new_state
.count(o
) == 0)
5021 pending_inc
.new_state
[o
] = 0;
5022 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5024 // remember previous weight
5025 if (pending_inc
.new_xinfo
.count(o
) == 0)
5026 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5027 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5031 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
5032 << int(down
.sec()) << " seconds)";
5037 down_pending_out
.erase(o
);
5040 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5043 // expire blacklisted items?
5044 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5045 p
!= osdmap
.blacklist
.end();
5047 if (p
->second
< now
) {
5048 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5049 pending_inc
.old_blacklist
.push_back(p
->first
);
5054 if (try_prune_purged_snaps()) {
5058 if (update_pools_status())
5062 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
5066 std::lock_guard
l(balancer_lock
);
5067 if (ceph_using_tcmalloc() && mon_memory_autotune
&& pcm
!= nullptr) {
5070 _set_new_cache_sizes();
5071 dout(10) << "tick balancer "
5072 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
5073 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
5074 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
5075 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
5077 dout(10) << "tick balancer "
5078 << " full cache_bytes: " << full_cache
->get_cache_bytes()
5079 << " full comtd_bytes: " << full_cache
->get_committed_size()
5080 << " full used_bytes: " << full_cache
->_get_used_bytes()
5081 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
5087 void OSDMonitor::_set_new_cache_sizes()
5089 uint64_t cache_size
= 0;
5090 int64_t inc_alloc
= 0;
5091 int64_t full_alloc
= 0;
5092 int64_t kv_alloc
= 0;
5094 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5095 cache_size
= pcm
->get_tuned_mem();
5096 inc_alloc
= inc_cache
->get_committed_size();
5097 full_alloc
= full_cache
->get_committed_size();
5098 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
5101 inc_osd_cache
.set_bytes(inc_alloc
);
5102 full_osd_cache
.set_bytes(full_alloc
);
5104 dout(1) << __func__
<< " cache_size:" << cache_size
5105 << " inc_alloc: " << inc_alloc
5106 << " full_alloc: " << full_alloc
5107 << " kv_alloc: " << kv_alloc
5111 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5112 std::map
<int,utime_t
> &last_osd_report
)
5114 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
5115 if (now
- mon
->get_leader_since() < timeo
) {
5116 // We haven't been the leader for long enough to consider OSD timeouts
5120 int max_osd
= osdmap
.get_max_osd();
5121 bool new_down
= false;
5123 for (int i
=0; i
< max_osd
; ++i
) {
5124 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5125 if (!osdmap
.exists(i
)) {
5126 last_osd_report
.erase(i
); // if any
5129 if (!osdmap
.is_up(i
))
5131 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
5132 if (t
== last_osd_report
.end()) {
5133 // it wasn't in the map; start the timer.
5134 last_osd_report
[i
] = now
;
5135 } else if (can_mark_down(i
)) {
5136 utime_t diff
= now
- t
->second
;
5138 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
5139 << diff
<< " seconds";
5140 derr
<< "no beacon from osd." << i
<< " since " << t
->second
5141 << ", " << diff
<< " seconds ago. marking down" << dendl
;
5142 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
5150 static void dump_cpu_list(Formatter
*f
, const char *name
,
5151 const string
& strlist
)
5154 size_t cpu_set_size
;
5155 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
5158 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5159 f
->open_array_section(name
);
5160 for (auto cpu
: cpus
) {
5161 f
->dump_int("cpu", cpu
);
5166 void OSDMonitor::dump_info(Formatter
*f
)
5168 f
->open_object_section("osdmap");
5172 f
->open_array_section("osd_metadata");
5173 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5174 if (osdmap
.exists(i
)) {
5175 f
->open_object_section("osd");
5176 f
->dump_unsigned("id", i
);
5177 dump_osd_metadata(i
, f
, NULL
);
5183 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5184 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5186 f
->open_object_section("crushmap");
5187 osdmap
.crush
->dump(f
);
5190 if (has_osdmap_manifest
) {
5191 f
->open_object_section("osdmap_manifest");
5192 osdmap_manifest
.dump(f
);
// Keys accepted by "osd pool get"/"osd pool set" command handling.
// NOTE(review): the first enumerator line (SIZE, MIN_SIZE) was missing from
// the garbled text and was restored — verify against upstream; later code
// relies on these enumerators, not on their numeric values.
enum osd_pool_get_choices {
  SIZE, MIN_SIZE,
  PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
  NODELETE, NOPGCHANGE, NOSIZECHANGE,
  WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
  HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
  USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
  CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
  CACHE_TARGET_FULL_RATIO,
  CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
  ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
  MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
  HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
  SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
  RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
  COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
  COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
  CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
  PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
  PG_AUTOSCALE_BIAS };
5219 std::set
<osd_pool_get_choices
>
5220 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5221 const std::set
<osd_pool_get_choices
>& second
)
5223 std::set
<osd_pool_get_choices
> result
;
5224 std::set_difference(first
.begin(), first
.end(),
5225 second
.begin(), second
.end(),
5226 std::inserter(result
, result
.end()));
5232 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5234 op
->mark_osdmon_event(__func__
);
5235 auto m
= op
->get_req
<MMonCommand
>();
5238 stringstream ss
, ds
;
5241 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5242 string rs
= ss
.str();
5243 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
5247 MonSession
*session
= op
->get_session();
5249 derr
<< __func__
<< " no session" << dendl
;
5250 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
5255 cmd_getval(cmdmap
, "prefix", prefix
);
5258 cmd_getval(cmdmap
, "format", format
, string("plain"));
5259 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5261 if (prefix
== "osd stat") {
5263 f
->open_object_section("osdmap");
5264 osdmap
.print_summary(f
.get(), ds
, "", true);
5268 osdmap
.print_summary(nullptr, ds
, "", true);
5272 else if (prefix
== "osd dump" ||
5273 prefix
== "osd tree" ||
5274 prefix
== "osd tree-from" ||
5275 prefix
== "osd ls" ||
5276 prefix
== "osd getmap" ||
5277 prefix
== "osd getcrushmap" ||
5278 prefix
== "osd ls-tree" ||
5279 prefix
== "osd info") {
5284 cmd_getval(cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
5287 bufferlist osdmap_bl
;
5288 int err
= get_version_full(epoch
, osdmap_bl
);
5289 if (err
== -ENOENT
) {
5291 ss
<< "there is no map for epoch " << epoch
;
5294 ceph_assert(err
== 0);
5295 ceph_assert(osdmap_bl
.length());
5298 if (epoch
== osdmap
.get_epoch()) {
5302 p
->decode(osdmap_bl
);
5305 auto sg
= make_scope_guard([&] {
5311 if (prefix
== "osd dump") {
5314 f
->open_object_section("osdmap");
5324 } else if (prefix
== "osd ls") {
5326 f
->open_array_section("osds");
5327 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5328 if (osdmap
.exists(i
)) {
5329 f
->dump_int("osd", i
);
5336 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5337 if (osdmap
.exists(i
)) {
5346 } else if (prefix
== "osd info") {
5348 bool do_single_osd
= true;
5349 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5350 do_single_osd
= false;
5353 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5354 ss
<< "osd." << osd_id
<< " does not exist";
5360 if (do_single_osd
) {
5361 osdmap
.dump_osd(osd_id
, f
.get());
5363 osdmap
.dump_osds(f
.get());
5367 if (do_single_osd
) {
5368 osdmap
.print_osd(osd_id
, ds
);
5370 osdmap
.print_osds(ds
);
5374 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5376 if (prefix
== "osd tree-from") {
5377 cmd_getval(cmdmap
, "bucket", bucket
);
5378 if (!osdmap
.crush
->name_exists(bucket
)) {
5379 ss
<< "bucket '" << bucket
<< "' does not exist";
5383 int id
= osdmap
.crush
->get_item_id(bucket
);
5385 ss
<< "\"" << bucket
<< "\" is not a bucket";
5391 vector
<string
> states
;
5392 cmd_getval(cmdmap
, "states", states
);
5393 unsigned filter
= 0;
5394 for (auto& s
: states
) {
5396 filter
|= OSDMap::DUMP_UP
;
5397 } else if (s
== "down") {
5398 filter
|= OSDMap::DUMP_DOWN
;
5399 } else if (s
== "in") {
5400 filter
|= OSDMap::DUMP_IN
;
5401 } else if (s
== "out") {
5402 filter
|= OSDMap::DUMP_OUT
;
5403 } else if (s
== "destroyed") {
5404 filter
|= OSDMap::DUMP_DESTROYED
;
5406 ss
<< "unrecognized state '" << s
<< "'";
5411 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5412 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5413 ss
<< "cannot specify both 'in' and 'out'";
5417 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5418 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5419 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5420 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5421 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5422 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5423 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5428 f
->open_object_section("tree");
5429 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5433 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5436 } else if (prefix
== "osd getmap") {
5437 rdata
.append(osdmap_bl
);
5438 ss
<< "got osdmap epoch " << p
->get_epoch();
5439 } else if (prefix
== "osd getcrushmap") {
5440 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
5441 ss
<< p
->get_crush_version();
5442 } else if (prefix
== "osd ls-tree") {
5444 cmd_getval(cmdmap
, "name", bucket_name
);
5446 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5448 ss
<< "\"" << bucket_name
<< "\" does not exist";
5451 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5456 f
->open_array_section("osds");
5457 for (auto &i
: osds
) {
5458 if (osdmap
.exists(i
)) {
5459 f
->dump_int("osd", i
);
5466 for (auto &i
: osds
) {
5467 if (osdmap
.exists(i
)) {
5478 } else if (prefix
== "osd getmaxosd") {
5480 f
->open_object_section("getmaxosd");
5481 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5482 f
->dump_int("max_osd", osdmap
.get_max_osd());
5486 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5489 } else if (prefix
== "osd utilization") {
5491 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5498 } else if (prefix
== "osd find") {
5500 if (!cmd_getval(cmdmap
, "id", osd
)) {
5501 ss
<< "unable to parse osd id value '"
5502 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5506 if (!osdmap
.exists(osd
)) {
5507 ss
<< "osd." << osd
<< " does not exist";
5512 cmd_getval(cmdmap
, "format", format
);
5513 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5514 f
->open_object_section("osd_location");
5515 f
->dump_int("osd", osd
);
5516 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5517 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5519 // try to identify host, pod/container name, etc.
5520 map
<string
,string
> m
;
5521 load_metadata(osd
, m
, nullptr);
5522 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5523 f
->dump_string("host", p
->second
);
5526 "pod_name", "pod_namespace", // set by rook
5527 "container_name" // set by cephadm, ceph-ansible
5529 if (auto p
= m
.find(k
); p
!= m
.end()) {
5530 f
->dump_string(k
, p
->second
);
5534 // crush is helpful too
5535 f
->open_object_section("crush_location");
5536 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5537 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5538 f
->dump_string(p
->first
.c_str(), p
->second
);
5542 } else if (prefix
== "osd metadata") {
5544 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5545 !cmd_getval(cmdmap
, "id", osd
)) {
5546 ss
<< "unable to parse osd id value '"
5547 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5551 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5552 ss
<< "osd." << osd
<< " does not exist";
5557 cmd_getval(cmdmap
, "format", format
);
5558 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5560 f
->open_object_section("osd_metadata");
5561 f
->dump_unsigned("id", osd
);
5562 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5568 f
->open_array_section("osd_metadata");
5569 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5570 if (osdmap
.exists(i
)) {
5571 f
->open_object_section("osd");
5572 f
->dump_unsigned("id", i
);
5573 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5574 if (r
== -EINVAL
|| r
== -ENOENT
) {
5575 // Drop error, continue to get other daemons' metadata
5576 dout(4) << "No metadata for osd." << i
<< dendl
;
5588 } else if (prefix
== "osd versions") {
5590 f
.reset(Formatter::create("json-pretty"));
5591 count_metadata("ceph_version", f
.get());
5594 } else if (prefix
== "osd count-metadata") {
5596 f
.reset(Formatter::create("json-pretty"));
5598 cmd_getval(cmdmap
, "property", field
);
5599 count_metadata(field
, f
.get());
5602 } else if (prefix
== "osd numa-status") {
5605 f
->open_array_section("osds");
5607 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5608 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5609 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5610 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5611 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5612 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5614 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5615 if (osdmap
.exists(i
)) {
5616 map
<string
,string
> m
;
5618 if (load_metadata(i
, m
, &err
) < 0) {
5622 auto p
= m
.find("hostname");
5627 f
->open_object_section("osd");
5628 f
->dump_int("osd", i
);
5629 f
->dump_string("host", host
);
5630 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5634 f
->dump_int(n
, atoi(p
->second
.c_str()));
5637 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5640 list
<string
> ls
= get_str_list(p
->second
, ",");
5641 f
->open_array_section(n
);
5642 for (auto node
: ls
) {
5643 f
->dump_int("node", atoi(node
.c_str()));
5648 for (auto n
: { "numa_node_cpus" }) {
5651 dump_cpu_list(f
.get(), n
, p
->second
);
5658 p
= m
.find("network_numa_nodes");
5664 p
= m
.find("objectstore_numa_nodes");
5670 p
= m
.find("numa_node");
5671 auto q
= m
.find("numa_node_cpus");
5672 if (p
!= m
.end() && q
!= m
.end()) {
5679 tbl
<< TextTable::endrow
;
5687 rdata
.append(stringify(tbl
));
5689 } else if (prefix
== "osd map") {
5690 string poolstr
, objstr
, namespacestr
;
5691 cmd_getval(cmdmap
, "pool", poolstr
);
5692 cmd_getval(cmdmap
, "object", objstr
);
5693 cmd_getval(cmdmap
, "nspace", namespacestr
);
5695 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5697 ss
<< "pool " << poolstr
<< " does not exist";
5701 object_locator_t
oloc(pool
, namespacestr
);
5702 object_t
oid(objstr
);
5703 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5704 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5705 vector
<int> up
, acting
;
5707 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5710 if (!namespacestr
.empty())
5711 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5713 fullobjname
= oid
.name
;
5715 f
->open_object_section("osd_map");
5716 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5717 f
->dump_string("pool", poolstr
);
5718 f
->dump_int("pool_id", pool
);
5719 f
->dump_stream("objname") << fullobjname
;
5720 f
->dump_stream("raw_pgid") << pgid
;
5721 f
->dump_stream("pgid") << mpgid
;
5722 f
->open_array_section("up");
5723 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5724 f
->dump_int("osd", *p
);
5726 f
->dump_int("up_primary", up_p
);
5727 f
->open_array_section("acting");
5728 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5729 f
->dump_int("osd", *p
);
5731 f
->dump_int("acting_primary", acting_p
);
5732 f
->close_section(); // osd_map
5735 ds
<< "osdmap e" << osdmap
.get_epoch()
5736 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5737 << " object '" << fullobjname
<< "' ->"
5738 << " pg " << pgid
<< " (" << mpgid
<< ")"
5739 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5740 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5744 } else if (prefix
== "pg map") {
5747 cmd_getval(cmdmap
, "pgid", pgidstr
);
5748 if (!pgid
.parse(pgidstr
.c_str())) {
5749 ss
<< "invalid pgid '" << pgidstr
<< "'";
5753 vector
<int> up
, acting
;
5754 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5755 ss
<< "pg '" << pgidstr
<< "' does not exist";
5759 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5760 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5762 f
->open_object_section("pg_map");
5763 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5764 f
->dump_stream("raw_pgid") << pgid
;
5765 f
->dump_stream("pgid") << mpgid
;
5766 f
->open_array_section("up");
5767 for (auto osd
: up
) {
5768 f
->dump_int("up_osd", osd
);
5771 f
->open_array_section("acting");
5772 for (auto osd
: acting
) {
5773 f
->dump_int("acting_osd", osd
);
5779 ds
<< "osdmap e" << osdmap
.get_epoch()
5780 << " pg " << pgid
<< " (" << mpgid
<< ")"
5781 << " -> up " << up
<< " acting " << acting
;
5786 } else if (prefix
== "osd lspools") {
5788 f
->open_array_section("pools");
5789 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
5790 p
!= osdmap
.pools
.end();
5793 f
->open_object_section("pool");
5794 f
->dump_int("poolnum", p
->first
);
5795 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
5798 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
5799 if (next(p
) != osdmap
.pools
.end()) {
5809 } else if (prefix
== "osd blacklist ls") {
5811 f
->open_array_section("blacklist");
5813 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5814 p
!= osdmap
.blacklist
.end();
5817 f
->open_object_section("entry");
5818 f
->dump_string("addr", p
->first
.get_legacy_str());
5819 f
->dump_stream("until") << p
->second
;
5824 ss
<< p
->first
<< " " << p
->second
;
5834 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
5836 } else if (prefix
== "osd pool ls") {
5838 cmd_getval(cmdmap
, "detail", detail
);
5839 if (!f
&& detail
== "detail") {
5841 osdmap
.print_pools(ss
);
5842 rdata
.append(ss
.str());
5845 f
->open_array_section("pools");
5846 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
5847 it
!= osdmap
.get_pools().end();
5850 if (detail
== "detail") {
5851 f
->open_object_section("pool");
5852 f
->dump_int("pool_id", it
->first
);
5853 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5854 it
->second
.dump(f
.get());
5857 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5860 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
5869 } else if (prefix
== "osd crush get-tunable") {
5871 cmd_getval(cmdmap
, "tunable", tunable
);
5874 f
->open_object_section("tunable");
5875 if (tunable
== "straw_calc_version") {
5877 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
5879 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
5888 rdata
.append(rss
.str());
5892 } else if (prefix
== "osd pool get") {
5894 cmd_getval(cmdmap
, "pool", poolstr
);
5895 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5897 ss
<< "unrecognized pool '" << poolstr
<< "'";
5902 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
5904 cmd_getval(cmdmap
, "var", var
);
5906 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
5907 const choices_map_t ALL_CHOICES
= {
5909 {"min_size", MIN_SIZE
},
5910 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
5911 {"crush_rule", CRUSH_RULE
}, {"hashpspool", HASHPSPOOL
},
5912 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
5913 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
5914 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
5915 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
5916 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
5917 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
5918 {"use_gmt_hitset", USE_GMT_HITSET
},
5919 {"target_max_objects", TARGET_MAX_OBJECTS
},
5920 {"target_max_bytes", TARGET_MAX_BYTES
},
5921 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
5922 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
5923 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
5924 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
5925 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
5926 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
5927 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
5928 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
5929 {"fast_read", FAST_READ
},
5930 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
5931 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
5932 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
5933 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
5934 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
5935 {"recovery_priority", RECOVERY_PRIORITY
},
5936 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
5937 {"scrub_priority", SCRUB_PRIORITY
},
5938 {"compression_mode", COMPRESSION_MODE
},
5939 {"compression_algorithm", COMPRESSION_ALGORITHM
},
5940 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
5941 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
5942 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
5943 {"csum_type", CSUM_TYPE
},
5944 {"csum_max_block", CSUM_MAX_BLOCK
},
5945 {"csum_min_block", CSUM_MIN_BLOCK
},
5946 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
5947 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
5948 {"pg_num_min", PG_NUM_MIN
},
5949 {"target_size_bytes", TARGET_SIZE_BYTES
},
5950 {"target_size_ratio", TARGET_SIZE_RATIO
},
5951 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
5954 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
5956 const choices_set_t ONLY_TIER_CHOICES
= {
5957 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5958 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
5959 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5960 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5961 MIN_READ_RECENCY_FOR_PROMOTE
,
5962 MIN_WRITE_RECENCY_FOR_PROMOTE
,
5963 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
5965 const choices_set_t ONLY_ERASURE_CHOICES
= {
5966 EC_OVERWRITES
, ERASURE_CODE_PROFILE
5969 choices_set_t selected_choices
;
5971 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
5972 it
!= ALL_CHOICES
.end(); ++it
) {
5973 selected_choices
.insert(it
->second
);
5977 selected_choices
= subtract_second_from_first(selected_choices
,
5981 if(!p
->is_erasure()) {
5982 selected_choices
= subtract_second_from_first(selected_choices
,
5983 ONLY_ERASURE_CHOICES
);
5985 } else /* var != "all" */ {
5986 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
5987 osd_pool_get_choices selected
= found
->second
;
5989 if (!p
->is_tier() &&
5990 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
5991 ss
<< "pool '" << poolstr
5992 << "' is not a tier pool: variable not applicable";
5997 if (!p
->is_erasure() &&
5998 ONLY_ERASURE_CHOICES
.find(selected
)
5999 != ONLY_ERASURE_CHOICES
.end()) {
6000 ss
<< "pool '" << poolstr
6001 << "' is not a erasure pool: variable not applicable";
6006 if (pool_opts_t::is_opt_name(var
) &&
6007 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6008 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6013 selected_choices
.insert(selected
);
6017 f
->open_object_section("pool");
6018 f
->dump_string("pool", poolstr
);
6019 f
->dump_int("pool_id", pool
);
6020 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6021 it
!= selected_choices
.end(); ++it
) {
6022 choices_map_t::const_iterator i
;
6023 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6024 if (i
->second
== *it
) {
6028 ceph_assert(i
!= ALL_CHOICES
.end());
6031 f
->dump_int("pg_num", p
->get_pg_num());
6034 f
->dump_int("pgp_num", p
->get_pgp_num());
6037 f
->dump_int("size", p
->get_size());
6040 f
->dump_int("min_size", p
->get_min_size());
6043 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6044 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6045 p
->get_crush_rule()));
6047 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6051 f
->dump_bool("allow_ec_overwrites",
6052 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6054 case PG_AUTOSCALE_MODE
:
6055 f
->dump_string("pg_autoscale_mode",
6056 pg_pool_t::get_pg_autoscale_mode_name(
6057 p
->pg_autoscale_mode
));
6063 case WRITE_FADVISE_DONTNEED
:
6066 f
->dump_bool(i
->first
.c_str(),
6067 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6069 case HIT_SET_PERIOD
:
6070 f
->dump_int("hit_set_period", p
->hit_set_period
);
6073 f
->dump_int("hit_set_count", p
->hit_set_count
);
6076 f
->dump_string("hit_set_type",
6077 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6081 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6082 BloomHitSet::Params
*bloomp
=
6083 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6084 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6085 } else if(var
!= "all") {
6087 ss
<< "hit set is not of type Bloom; " <<
6088 "invalid to get a false positive rate!";
6094 case USE_GMT_HITSET
:
6095 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6097 case TARGET_MAX_OBJECTS
:
6098 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6100 case TARGET_MAX_BYTES
:
6101 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6103 case CACHE_TARGET_DIRTY_RATIO
:
6104 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6105 p
->cache_target_dirty_ratio_micro
);
6106 f
->dump_float("cache_target_dirty_ratio",
6107 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6109 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6110 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6111 p
->cache_target_dirty_high_ratio_micro
);
6112 f
->dump_float("cache_target_dirty_high_ratio",
6113 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6115 case CACHE_TARGET_FULL_RATIO
:
6116 f
->dump_unsigned("cache_target_full_ratio_micro",
6117 p
->cache_target_full_ratio_micro
);
6118 f
->dump_float("cache_target_full_ratio",
6119 ((float)p
->cache_target_full_ratio_micro
/1000000));
6121 case CACHE_MIN_FLUSH_AGE
:
6122 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6124 case CACHE_MIN_EVICT_AGE
:
6125 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6127 case ERASURE_CODE_PROFILE
:
6128 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6130 case MIN_READ_RECENCY_FOR_PROMOTE
:
6131 f
->dump_int("min_read_recency_for_promote",
6132 p
->min_read_recency_for_promote
);
6134 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6135 f
->dump_int("min_write_recency_for_promote",
6136 p
->min_write_recency_for_promote
);
6139 f
->dump_int("fast_read", p
->fast_read
);
6141 case HIT_SET_GRADE_DECAY_RATE
:
6142 f
->dump_int("hit_set_grade_decay_rate",
6143 p
->hit_set_grade_decay_rate
);
6145 case HIT_SET_SEARCH_LAST_N
:
6146 f
->dump_int("hit_set_search_last_n",
6147 p
->hit_set_search_last_n
);
6149 case SCRUB_MIN_INTERVAL
:
6150 case SCRUB_MAX_INTERVAL
:
6151 case DEEP_SCRUB_INTERVAL
:
6152 case RECOVERY_PRIORITY
:
6153 case RECOVERY_OP_PRIORITY
:
6154 case SCRUB_PRIORITY
:
6155 case COMPRESSION_MODE
:
6156 case COMPRESSION_ALGORITHM
:
6157 case COMPRESSION_REQUIRED_RATIO
:
6158 case COMPRESSION_MAX_BLOB_SIZE
:
6159 case COMPRESSION_MIN_BLOB_SIZE
:
6161 case CSUM_MAX_BLOCK
:
6162 case CSUM_MIN_BLOCK
:
6163 case FINGERPRINT_ALGORITHM
:
6165 case TARGET_SIZE_BYTES
:
6166 case TARGET_SIZE_RATIO
:
6167 case PG_AUTOSCALE_BIAS
:
6168 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6169 if (p
->opts
.is_set(key
)) {
6170 if(*it
== CSUM_TYPE
) {
6172 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6173 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6175 p
->opts
.dump(i
->first
, f
.get());
6184 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6185 it
!= selected_choices
.end(); ++it
) {
6186 choices_map_t::const_iterator i
;
6189 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6192 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6195 ss
<< "size: " << p
->get_size() << "\n";
6198 ss
<< "min_size: " << p
->get_min_size() << "\n";
6201 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6202 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6203 p
->get_crush_rule()) << "\n";
6205 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6208 case PG_AUTOSCALE_MODE
:
6209 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6210 p
->pg_autoscale_mode
) <<"\n";
6212 case HIT_SET_PERIOD
:
6213 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6216 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6219 ss
<< "hit_set_type: " <<
6220 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6224 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6225 BloomHitSet::Params
*bloomp
=
6226 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6227 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6228 } else if(var
!= "all") {
6229 ss
<< "hit set is not of type Bloom; " <<
6230 "invalid to get a false positive rate!";
6236 case USE_GMT_HITSET
:
6237 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6239 case TARGET_MAX_OBJECTS
:
6240 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6242 case TARGET_MAX_BYTES
:
6243 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6245 case CACHE_TARGET_DIRTY_RATIO
:
6246 ss
<< "cache_target_dirty_ratio: "
6247 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6249 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6250 ss
<< "cache_target_dirty_high_ratio: "
6251 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6253 case CACHE_TARGET_FULL_RATIO
:
6254 ss
<< "cache_target_full_ratio: "
6255 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6257 case CACHE_MIN_FLUSH_AGE
:
6258 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6260 case CACHE_MIN_EVICT_AGE
:
6261 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6263 case ERASURE_CODE_PROFILE
:
6264 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6266 case MIN_READ_RECENCY_FOR_PROMOTE
:
6267 ss
<< "min_read_recency_for_promote: " <<
6268 p
->min_read_recency_for_promote
<< "\n";
6270 case HIT_SET_GRADE_DECAY_RATE
:
6271 ss
<< "hit_set_grade_decay_rate: " <<
6272 p
->hit_set_grade_decay_rate
<< "\n";
6274 case HIT_SET_SEARCH_LAST_N
:
6275 ss
<< "hit_set_search_last_n: " <<
6276 p
->hit_set_search_last_n
<< "\n";
6279 ss
<< "allow_ec_overwrites: " <<
6280 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6287 case WRITE_FADVISE_DONTNEED
:
6290 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6291 if (i
->second
== *it
)
6294 ceph_assert(i
!= ALL_CHOICES
.end());
6295 ss
<< i
->first
<< ": " <<
6296 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6297 "true" : "false") << "\n";
6299 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6300 ss
<< "min_write_recency_for_promote: " <<
6301 p
->min_write_recency_for_promote
<< "\n";
6304 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6306 case SCRUB_MIN_INTERVAL
:
6307 case SCRUB_MAX_INTERVAL
:
6308 case DEEP_SCRUB_INTERVAL
:
6309 case RECOVERY_PRIORITY
:
6310 case RECOVERY_OP_PRIORITY
:
6311 case SCRUB_PRIORITY
:
6312 case COMPRESSION_MODE
:
6313 case COMPRESSION_ALGORITHM
:
6314 case COMPRESSION_REQUIRED_RATIO
:
6315 case COMPRESSION_MAX_BLOB_SIZE
:
6316 case COMPRESSION_MIN_BLOB_SIZE
:
6318 case CSUM_MAX_BLOCK
:
6319 case CSUM_MIN_BLOCK
:
6320 case FINGERPRINT_ALGORITHM
:
6322 case TARGET_SIZE_BYTES
:
6323 case TARGET_SIZE_RATIO
:
6324 case PG_AUTOSCALE_BIAS
:
6325 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6326 if (i
->second
== *it
)
6329 ceph_assert(i
!= ALL_CHOICES
.end());
6331 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6332 if (p
->opts
.is_set(key
)) {
6333 if(key
== pool_opts_t::CSUM_TYPE
) {
6335 p
->opts
.get(key
, &val
);
6336 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6338 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6344 rdata
.append(ss
.str());
6349 } else if (prefix
== "osd pool get-quota") {
6351 cmd_getval(cmdmap
, "pool", pool_name
);
6353 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6355 ceph_assert(poolid
== -ENOENT
);
6356 ss
<< "unrecognized pool '" << pool_name
<< "'";
6360 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6361 const pool_stat_t
* pstat
= mon
->mgrstatmon()->get_pool_stat(poolid
);
6362 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6364 f
->open_object_section("pool_quotas");
6365 f
->dump_string("pool_name", pool_name
);
6366 f
->dump_unsigned("pool_id", poolid
);
6367 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6368 f
->dump_int("current_num_objects", sum
.num_objects
);
6369 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6370 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6375 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6376 << " max objects: ";
6377 if (p
->quota_max_objects
== 0)
6380 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6381 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6385 if (p
->quota_max_bytes
== 0)
6388 rs
<< byte_u_t(p
->quota_max_bytes
);
6389 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6391 rdata
.append(rs
.str());
6395 } else if (prefix
== "osd crush rule list" ||
6396 prefix
== "osd crush rule ls") {
6398 f
->open_array_section("rules");
6399 osdmap
.crush
->list_rules(f
.get());
6404 osdmap
.crush
->list_rules(&ss
);
6405 rdata
.append(ss
.str());
6407 } else if (prefix
== "osd crush rule ls-by-class") {
6409 cmd_getval(cmdmap
, "class", class_name
);
6410 if (class_name
.empty()) {
6411 ss
<< "no class specified";
6416 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6418 ss
<< "failed to get rules by class '" << class_name
<< "'";
6422 f
->open_array_section("rules");
6423 for (auto &rule
: rules
) {
6424 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6430 for (auto &rule
: rules
) {
6431 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6433 rdata
.append(rs
.str());
6435 } else if (prefix
== "osd crush rule dump") {
6437 cmd_getval(cmdmap
, "name", name
);
6439 cmd_getval(cmdmap
, "format", format
);
6440 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6442 f
->open_array_section("rules");
6443 osdmap
.crush
->dump_rules(f
.get());
6446 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6448 ss
<< "unknown crush rule '" << name
<< "'";
6452 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6457 rdata
.append(rs
.str());
6458 } else if (prefix
== "osd crush dump") {
6460 cmd_getval(cmdmap
, "format", format
);
6461 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6462 f
->open_object_section("crush_map");
6463 osdmap
.crush
->dump(f
.get());
6468 rdata
.append(rs
.str());
6469 } else if (prefix
== "osd crush show-tunables") {
6471 cmd_getval(cmdmap
, "format", format
);
6472 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6473 f
->open_object_section("crush_map_tunables");
6474 osdmap
.crush
->dump_tunables(f
.get());
6479 rdata
.append(rs
.str());
6480 } else if (prefix
== "osd crush tree") {
6482 cmd_getval(cmdmap
, "shadow", shadow
);
6483 bool show_shadow
= shadow
== "--show-shadow";
6484 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6486 f
->open_object_section("crush_tree");
6487 osdmap
.crush
->dump_tree(nullptr,
6489 osdmap
.get_pool_names(),
6495 osdmap
.crush
->dump_tree(&ss
,
6497 osdmap
.get_pool_names(),
6499 rdata
.append(ss
.str());
6501 } else if (prefix
== "osd crush ls") {
6503 if (!cmd_getval(cmdmap
, "node", name
)) {
6504 ss
<< "no node specified";
6508 if (!osdmap
.crush
->name_exists(name
)) {
6509 ss
<< "node '" << name
<< "' does not exist";
6513 int id
= osdmap
.crush
->get_item_id(name
);
6516 result
.push_back(id
);
6518 int num
= osdmap
.crush
->get_bucket_size(id
);
6519 for (int i
= 0; i
< num
; ++i
) {
6520 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6524 f
->open_array_section("items");
6525 for (auto i
: result
) {
6526 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6532 for (auto i
: result
) {
6533 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6535 rdata
.append(ss
.str());
6538 } else if (prefix
== "osd crush class ls") {
6539 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6540 f
->open_array_section("crush_classes");
6541 for (auto i
: osdmap
.crush
->class_name
)
6542 f
->dump_string("class", i
.second
);
6545 } else if (prefix
== "osd crush class ls-osd") {
6547 cmd_getval(cmdmap
, "class", name
);
6549 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6551 f
->open_array_section("osds");
6552 for (auto &osd
: osds
)
6553 f
->dump_int("osd", osd
);
6558 for (auto &osd
: osds
) {
6566 } else if (prefix
== "osd crush get-device-class") {
6567 vector
<string
> idvec
;
6568 cmd_getval(cmdmap
, "ids", idvec
);
6569 map
<int, string
> class_by_osd
;
6570 for (auto& id
: idvec
) {
6572 long osd
= parse_osd_id(id
.c_str(), &ts
);
6574 ss
<< "unable to parse osd id:'" << id
<< "'";
6578 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6580 class_by_osd
[osd
] = device_class
;
6582 class_by_osd
[osd
] = ""; // no class
6585 f
->open_array_section("osd_device_classes");
6586 for (auto& i
: class_by_osd
) {
6587 f
->open_object_section("osd_device_class");
6588 f
->dump_int("osd", i
.first
);
6589 f
->dump_string("device_class", i
.second
);
6595 if (class_by_osd
.size() == 1) {
6596 // for single input, make a clean output
6597 ds
<< class_by_osd
.begin()->second
;
6599 // note that we do not group osds by class here
6600 for (auto it
= class_by_osd
.begin();
6601 it
!= class_by_osd
.end();
6603 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6604 if (next(it
) != class_by_osd
.end())
6610 } else if (prefix
== "osd erasure-code-profile ls") {
6611 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6613 f
->open_array_section("erasure-code-profiles");
6614 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6616 f
->dump_string("profile", i
->first
.c_str());
6618 rdata
.append(i
->first
+ "\n");
6625 rdata
.append(rs
.str());
6627 } else if (prefix
== "osd crush weight-set ls") {
6628 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6630 f
->open_array_section("weight_sets");
6631 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6632 f
->dump_string("pool", "(compat)");
6634 for (auto& i
: osdmap
.crush
->choose_args
) {
6636 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6643 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6646 for (auto& i
: osdmap
.crush
->choose_args
) {
6648 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6651 rdata
.append(rs
.str());
6653 } else if (prefix
== "osd crush weight-set dump") {
6654 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6656 osdmap
.crush
->dump_choose_args(f
.get());
6658 } else if (prefix
== "osd erasure-code-profile get") {
6660 cmd_getval(cmdmap
, "name", name
);
6661 if (!osdmap
.has_erasure_code_profile(name
)) {
6662 ss
<< "unknown erasure code profile '" << name
<< "'";
6666 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6668 f
->open_object_section("profile");
6669 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6673 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6675 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6682 rdata
.append(rs
.str());
6684 } else if (prefix
== "osd pool application get") {
6685 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6688 cmd_getval(cmdmap
, "pool", pool_name
);
6690 cmd_getval(cmdmap
, "app", app
);
6692 cmd_getval(cmdmap
, "key", key
);
6694 if (pool_name
.empty()) {
6696 f
->open_object_section("pools");
6697 for (const auto &pool
: osdmap
.pools
) {
6698 std::string
name("<unknown>");
6699 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6700 if (pni
!= osdmap
.pool_name
.end())
6702 f
->open_object_section(name
.c_str());
6703 for (auto &app_pair
: pool
.second
.application_metadata
) {
6704 f
->open_object_section(app_pair
.first
.c_str());
6705 for (auto &kv_pair
: app_pair
.second
) {
6706 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6710 f
->close_section(); // name
6712 f
->close_section(); // pools
6715 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6717 ss
<< "unrecognized pool '" << pool_name
<< "'";
6721 auto p
= osdmap
.get_pg_pool(pool
);
6724 f
->open_object_section(pool_name
.c_str());
6725 for (auto &app_pair
: p
->application_metadata
) {
6726 f
->open_object_section(app_pair
.first
.c_str());
6727 for (auto &kv_pair
: app_pair
.second
) {
6728 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6730 f
->close_section(); // application
6732 f
->close_section(); // pool_name
6737 auto app_it
= p
->application_metadata
.find(app
);
6738 if (app_it
== p
->application_metadata
.end()) {
6739 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
6743 // filter by pool + app
6745 f
->open_object_section(app_it
->first
.c_str());
6746 for (auto &kv_pair
: app_it
->second
) {
6747 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6749 f
->close_section(); // application
6753 // filter by pool + app + key
6754 auto key_it
= app_it
->second
.find(key
);
6755 if (key_it
== app_it
->second
.end()) {
6756 ss
<< "application '" << app
<< "' on pool '" << pool_name
6757 << "' does not have key '" << key
<< "'";
6761 ss
<< key_it
->second
<< "\n";
6762 rdata
.append(ss
.str());
6765 } else if (prefix
== "osd get-require-min-compat-client") {
6766 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
6767 rdata
.append(ss
.str());
6770 } else if (prefix
== "osd pool application enable" ||
6771 prefix
== "osd pool application disable" ||
6772 prefix
== "osd pool application set" ||
6773 prefix
== "osd pool application rm") {
6774 bool changed
= false;
6775 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
6779 } else if (changed
) {
6780 // Valid mutation, proceed to prepare phase
6783 // Idempotent case, reply
6787 // try prepare update
6794 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
// Set flag bit(s) on a pool.  The change is staged in pending_inc and
// takes effect when the pending incremental map is committed.
// pool_id: id of the pool to modify
// flags:   bitmask of pg_pool_t FLAG_* values to set
// NOTE(review): brace-only lines were lost in this extract; code tokens
// below are reproduced as-is, reformatted.
void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
{
  // get_new_pool returns the pending (staged) copy of the pool, seeding
  // it from the current osdmap value if this pool is not yet staged
  pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
					     osdmap.get_pg_pool(pool_id));
  pool->set_flag(flags);
}
// Clear flag bit(s) on a pool.  Mirror image of set_pool_flags(): the
// change is staged in pending_inc and applies on commit.
// pool_id: id of the pool to modify
// flags:   bitmask of pg_pool_t FLAG_* values to clear
// NOTE(review): brace-only lines were lost in this extract; code tokens
// below are reproduced as-is, reformatted.
void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
{
  // stage a mutable copy of the pool in the pending incremental map
  pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
					     osdmap.get_pg_pool(pool_id));
  pool->unset_flag(flags);
}
// Build the mon-store key under which purged snaps for a given epoch are
// recorded: "purged_epoch_<epoch as 8-digit zero-padded hex>".
// NOTE(review): the declaration of the buffer 'k' and the trailing
// return statement fall on lines missing from this extract — confirm
// against the full file.
string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
{
  // fixed-width hex so keys sort lexicographically in epoch order
  snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
}
// Build the mon-store key for a purged-snap record of the given pool and
// snapid: "purged_snap_<pool>_<snap as 16-digit hex>".  Fixed-width hex
// keeps keys ordered by snapid within a pool, which lookup_purged_snap()
// relies on for its forward iteration.
// NOTE(review): the declaration of the buffer 'k' and the trailing
// return statement fall on lines missing from this extract — confirm
// against the full file.
string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
{
  snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
	   (unsigned long long)pool, (unsigned long long)snap);
}
// Build the key (returned) and value (*v) for a purged-snap record
// covering the snapid run [snap, snap+num), purged as of 'epoch'.
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  // NOTE(review): additional encode() calls into *v (e.g. of 'snap' or
  // 'epoch') appear to fall on lines missing from this extract — confirm
  // against the full file.
  encode(snap + num, *v);
  // key is indexed by the last snapid of the run (snap + num - 1)
  return make_purged_snap_key(pool, snap + num - 1);
}
6842 int OSDMonitor::lookup_purged_snap(
6843 int64_t pool
, snapid_t snap
,
6844 snapid_t
*begin
, snapid_t
*end
)
6846 string k
= make_purged_snap_key(pool
, snap
);
6847 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
6850 dout(20) << __func__
6851 << " pool " << pool
<< " snap " << snap
6852 << " - key '" << k
<< "' not found" << dendl
;
6855 if (it
->key().find("purged_snap_") != 0) {
6856 dout(20) << __func__
6857 << " pool " << pool
<< " snap " << snap
6858 << " - key '" << k
<< "' got '" << it
->key()
6859 << "', wrong prefix" << dendl
;
6862 string gotk
= it
->key();
6863 const char *format
= "purged_snap_%llu_";
6864 long long int keypool
;
6865 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
6867 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
6870 if (pool
!= keypool
) {
6871 dout(20) << __func__
6872 << " pool " << pool
<< " snap " << snap
6873 << " - key '" << k
<< "' got '" << gotk
6874 << "', wrong pool " << keypool
6878 bufferlist v
= it
->value();
6879 auto p
= v
.cbegin();
6882 if (snap
< *begin
|| snap
>= *end
) {
6883 dout(20) << __func__
6884 << " pool " << pool
<< " snap " << snap
6885 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
6892 void OSDMonitor::insert_purged_snap_update(
6894 snapid_t start
, snapid_t end
,
6896 MonitorDBStore::TransactionRef t
)
6898 snapid_t before_begin
, before_end
;
6899 snapid_t after_begin
, after_end
;
6900 int b
= lookup_purged_snap(pool
, start
- 1,
6901 &before_begin
, &before_end
);
6902 int a
= lookup_purged_snap(pool
, end
,
6903 &after_begin
, &after_end
);
6905 dout(10) << __func__
6906 << " [" << start
<< "," << end
<< ") - joins ["
6907 << before_begin
<< "," << before_end
<< ") and ["
6908 << after_begin
<< "," << after_end
<< ")" << dendl
;
6909 // erase only the begin record; we'll overwrite the end one.
6910 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
6912 string k
= make_purged_snap_key_value(pool
,
6913 before_begin
, after_end
- before_begin
,
6914 pending_inc
.epoch
, &v
);
6915 t
->put(OSD_SNAP_PREFIX
, k
, v
);
6917 dout(10) << __func__
6918 << " [" << start
<< "," << end
<< ") - join with earlier ["
6919 << before_begin
<< "," << before_end
<< ")" << dendl
;
6920 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
6922 string k
= make_purged_snap_key_value(pool
,
6923 before_begin
, end
- before_begin
,
6924 pending_inc
.epoch
, &v
);
6925 t
->put(OSD_SNAP_PREFIX
, k
, v
);
6927 dout(10) << __func__
6928 << " [" << start
<< "," << end
<< ") - join with later ["
6929 << after_begin
<< "," << after_end
<< ")" << dendl
;
6930 // overwrite after record
6932 string k
= make_purged_snap_key_value(pool
,
6933 start
, after_end
- start
,
6934 pending_inc
.epoch
, &v
);
6935 t
->put(OSD_SNAP_PREFIX
, k
, v
);
6937 dout(10) << __func__
6938 << " [" << start
<< "," << end
<< ") - new"
6941 string k
= make_purged_snap_key_value(pool
,
6943 pending_inc
.epoch
, &v
);
6944 t
->put(OSD_SNAP_PREFIX
, k
, v
);
6948 bool OSDMonitor::try_prune_purged_snaps()
6950 if (!mon
->mgrstatmon()->is_readable()) {
6953 if (!pending_inc
.new_purged_snaps
.empty()) {
6954 return false; // we already pruned for this epoch
6957 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
6958 "mon_max_snap_prune_per_epoch");
6962 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
6964 unsigned actually_pruned
= 0;
6965 auto& purged_snaps
= mon
->mgrstatmon()->get_digest().purged_snaps
;
6966 for (auto& p
: osdmap
.get_pools()) {
6967 auto q
= purged_snaps
.find(p
.first
);
6968 if (q
== purged_snaps
.end()) {
6971 auto& purged
= q
->second
;
6972 if (purged
.empty()) {
6973 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
6976 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
6977 snap_interval_set_t to_prune
;
6978 unsigned maybe_pruned
= actually_pruned
;
6979 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
6980 snapid_t begin
= i
.get_start();
6981 auto end
= i
.get_start() + i
.get_len();
6982 snapid_t pbegin
= 0, pend
= 0;
6983 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
6986 // be a bit aggressive about backing off here, because the mon may
6987 // do a lot of work going through this set, and if we know the
6988 // purged set from the OSDs is at least *partly* stale we may as
6989 // well wait for it to be fresh.
6990 dout(20) << __func__
<< " we've already purged " << pbegin
6991 << "~" << (pend
- pbegin
) << dendl
;
6994 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
6995 // the tail of [begin,end) is purged; shorten the range
6998 to_prune
.insert(begin
, end
- begin
);
6999 maybe_pruned
+= end
- begin
;
7000 if (maybe_pruned
>= max_prune
) {
7004 if (!to_prune
.empty()) {
7005 // PGs may still be reporting things as purged that we have already
7006 // pruned from removed_snaps_queue.
7007 snap_interval_set_t actual
;
7008 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7009 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7010 actual
.intersection_of(to_prune
, r
->second
);
7012 actually_pruned
+= actual
.size();
7013 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7014 << ", actual pruned " << actual
<< dendl
;
7015 if (!actual
.empty()) {
7016 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7019 if (actually_pruned
>= max_prune
) {
7023 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7024 return !!actually_pruned
;
7027 bool OSDMonitor::update_pools_status()
7029 if (!mon
->mgrstatmon()->is_readable())
7034 auto& pools
= osdmap
.get_pools();
7035 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7036 const pool_stat_t
*pstat
= mon
->mgrstatmon()->get_pool_stat(it
->first
);
7039 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7040 const pg_pool_t
&pool
= it
->second
;
7041 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
7044 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7045 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
7047 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7051 mon
->clog
->info() << "pool '" << pool_name
7052 << "' no longer out of quota; removing NO_QUOTA flag";
7053 // below we cancel FLAG_FULL too, we'll set it again in
7054 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7055 clear_pool_flags(it
->first
,
7056 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7062 if (pool
.quota_max_bytes
> 0 &&
7063 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7064 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
7065 << " (reached quota's max_bytes: "
7066 << byte_u_t(pool
.quota_max_bytes
) << ")";
7068 if (pool
.quota_max_objects
> 0 &&
7069 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7070 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
7071 << " (reached quota's max_objects: "
7072 << pool
.quota_max_objects
<< ")";
7074 // set both FLAG_FULL_QUOTA and FLAG_FULL
7075 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7076 // since FLAG_FULL should always take precedence
7077 set_pool_flags(it
->first
,
7078 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7079 clear_pool_flags(it
->first
,
7080 pg_pool_t::FLAG_NEARFULL
|
7081 pg_pool_t::FLAG_BACKFILLFULL
);
7088 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7090 op
->mark_osdmon_event(__func__
);
7091 auto m
= op
->get_req
<MPoolOp
>();
7092 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7093 MonSession
*session
= op
->get_session();
7096 string erasure_code_profile
;
7100 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7102 erasure_code_profile
,
7103 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {},
7107 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
7112 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7113 const string
& dstname
,
7118 // Avoid creating a pending crush if it does not already exists and
7119 // the rename would fail.
7121 if (!_have_pending_crush()) {
7122 ret
= _get_stable_crush().can_rename_bucket(srcname
,
7129 CrushWrapper newcrush
;
7130 _get_pending_crush(newcrush
);
7132 ret
= newcrush
.rename_bucket(srcname
,
7138 pending_inc
.crush
.clear();
7139 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7140 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7144 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7146 string replacement
= "";
7148 if (plugin
== "jerasure_generic" ||
7149 plugin
== "jerasure_sse3" ||
7150 plugin
== "jerasure_sse4" ||
7151 plugin
== "jerasure_neon") {
7152 replacement
= "jerasure";
7153 } else if (plugin
== "shec_generic" ||
7154 plugin
== "shec_sse3" ||
7155 plugin
== "shec_sse4" ||
7156 plugin
== "shec_neon") {
7157 replacement
= "shec";
7160 if (replacement
!= "") {
7161 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7162 << plugin
<< " that has been deprecated. Please use "
7163 << replacement
<< " instead." << dendl
;
7167 int OSDMonitor::normalize_profile(const string
& profilename
,
7168 ErasureCodeProfile
&profile
,
7172 ErasureCodeInterfaceRef erasure_code
;
7173 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7174 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7175 check_legacy_ec_plugin(plugin
->second
, profilename
);
7176 int err
= instance
.factory(plugin
->second
,
7177 g_conf().get_val
<std::string
>("erasure_code_dir"),
7178 profile
, &erasure_code
, ss
);
7183 err
= erasure_code
->init(profile
, ss
);
7188 auto it
= profile
.find("stripe_unit");
7189 if (it
!= profile
.end()) {
7191 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7192 if (!err_str
.empty()) {
7193 *ss
<< "could not parse stripe_unit '" << it
->second
7194 << "': " << err_str
<< std::endl
;
7197 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7198 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7199 if (chunk_size
!= stripe_unit
) {
7200 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7201 << "alignment. Would be padded to " << chunk_size
7205 if ((stripe_unit
% 4096) != 0 && !force
) {
7206 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7207 << "use --force to override this check" << std::endl
;
7214 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7215 const string
&profile
,
7219 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7220 if (ruleid
!= -ENOENT
) {
7221 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
7225 CrushWrapper newcrush
;
7226 _get_pending_crush(newcrush
);
7228 ruleid
= newcrush
.get_rule_id(name
);
7229 if (ruleid
!= -ENOENT
) {
7230 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
7233 ErasureCodeInterfaceRef erasure_code
;
7234 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7236 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7240 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7241 erasure_code
.reset();
7245 pending_inc
.crush
.clear();
7246 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7251 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7252 ErasureCodeInterfaceRef
*erasure_code
,
7255 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7257 ErasureCodeProfile profile
=
7258 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7259 ErasureCodeProfile::const_iterator plugin
=
7260 profile
.find("plugin");
7261 if (plugin
== profile
.end()) {
7262 *ss
<< "cannot determine the erasure code plugin"
7263 << " because there is no 'plugin' entry in the erasure_code_profile "
7264 << profile
<< std::endl
;
7267 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7268 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7269 return instance
.factory(plugin
->second
,
7270 g_conf().get_val
<std::string
>("erasure_code_dir"),
7271 profile
, erasure_code
, ss
);
7274 int OSDMonitor::check_cluster_features(uint64_t features
,
7277 stringstream unsupported_ss
;
7278 int unsupported_count
= 0;
7279 if ((mon
->get_quorum_con_features() & features
) != features
) {
7280 unsupported_ss
<< "the monitor cluster";
7281 ++unsupported_count
;
7284 set
<int32_t> up_osds
;
7285 osdmap
.get_up_osds(up_osds
);
7286 for (set
<int32_t>::iterator it
= up_osds
.begin();
7287 it
!= up_osds
.end(); ++it
) {
7288 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7289 if ((xi
.features
& features
) != features
) {
7290 if (unsupported_count
> 0)
7291 unsupported_ss
<< ", ";
7292 unsupported_ss
<< "osd." << *it
;
7293 unsupported_count
++;
7297 if (unsupported_count
> 0) {
7298 ss
<< "features " << features
<< " unsupported by: "
7299 << unsupported_ss
.str();
7303 // check pending osd state, too!
7304 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7305 pending_inc
.new_xinfo
.begin();
7306 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7307 const osd_xinfo_t
&xi
= p
->second
;
7308 if ((xi
.features
& features
) != features
) {
7309 dout(10) << __func__
<< " pending osd." << p
->first
7310 << " features are insufficient; retry" << dendl
;
7318 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
7321 OSDMap::Incremental new_pending
= pending_inc
;
7322 encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
7324 newmap
.deepish_copy_from(osdmap
);
7325 newmap
.apply_incremental(new_pending
);
7328 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7329 auto mv
= newmap
.get_min_compat_client();
7330 if (mv
> newmap
.require_min_compat_client
) {
7331 ss
<< "new crush map requires client version " << mv
7332 << " but require_min_compat_client is "
7333 << newmap
.require_min_compat_client
;
7340 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7341 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7342 stringstream features_ss
;
7343 int r
= check_cluster_features(features
, features_ss
);
7345 ss
<< "Could not change CRUSH: " << features_ss
.str();
7352 bool OSDMonitor::erasure_code_profile_in_use(
7353 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7354 const string
&profile
,
7358 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
7361 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7362 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7367 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
7372 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7373 map
<string
,string
> *erasure_code_profile_map
,
7376 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7379 erasure_code_profile_map
,
7383 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7384 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7385 map
<string
,string
> user_map
;
7386 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7387 i
!= erasure_code_profile
.end();
7389 size_t equal
= i
->find('=');
7390 if (equal
== string::npos
) {
7391 user_map
[*i
] = string();
7392 (*erasure_code_profile_map
)[*i
] = string();
7394 const string key
= i
->substr(0, equal
);
7396 const string value
= i
->substr(equal
);
7397 if (key
.find("ruleset-") == 0) {
7398 *ss
<< "property '" << key
<< "' is no longer supported; try "
7399 << "'crush-" << key
.substr(8) << "' instead";
7402 user_map
[key
] = value
;
7403 (*erasure_code_profile_map
)[key
] = value
;
7407 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7408 (*erasure_code_profile_map
) = user_map
;
7413 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7414 const string
&erasure_code_profile
,
7416 unsigned *size
, unsigned *min_size
,
7420 switch (pool_type
) {
7421 case pg_pool_t::TYPE_REPLICATED
:
7422 if (repl_size
== 0) {
7423 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7426 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7428 case pg_pool_t::TYPE_ERASURE
:
7430 ErasureCodeInterfaceRef erasure_code
;
7431 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7433 *size
= erasure_code
->get_chunk_count();
7435 erasure_code
->get_data_chunk_count() +
7436 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7437 assert(*min_size
<= *size
);
7438 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7443 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
7450 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7451 const string
&erasure_code_profile
,
7452 uint32_t *stripe_width
,
7456 switch (pool_type
) {
7457 case pg_pool_t::TYPE_REPLICATED
:
7460 case pg_pool_t::TYPE_ERASURE
:
7462 ErasureCodeProfile profile
=
7463 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7464 ErasureCodeInterfaceRef erasure_code
;
7465 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7468 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7469 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7470 auto it
= profile
.find("stripe_unit");
7471 if (it
!= profile
.end()) {
7473 stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7474 ceph_assert(err_str
.empty());
7476 *stripe_width
= data_chunks
*
7477 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7481 *ss
<< "prepare_pool_stripe_width: "
7482 << pool_type
<< " is not a known pool type";
7489 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7490 const string
&erasure_code_profile
,
7491 const string
&rule_name
,
7496 if (*crush_rule
< 0) {
7497 switch (pool_type
) {
7498 case pg_pool_t::TYPE_REPLICATED
:
7500 if (rule_name
== "") {
7502 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
7503 if (*crush_rule
< 0) {
7504 // Errors may happen e.g. if no valid rule is available
7505 *ss
<< "No suitable CRUSH rule exists, check "
7506 << "'osd pool default crush *' config options";
7510 return get_crush_rule(rule_name
, crush_rule
, ss
);
7514 case pg_pool_t::TYPE_ERASURE
:
7516 int err
= crush_rule_create_erasure(rule_name
,
7517 erasure_code_profile
,
7521 dout(20) << "prepare_pool_crush_rule: rule "
7522 << rule_name
<< " try again" << dendl
;
7525 // need to wait for the crush rule to be proposed before proceeding
7536 *ss
<< "prepare_pool_crush_rule: " << pool_type
7537 << " is not a known pool type";
7542 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
7543 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
7551 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7556 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7557 if (ret
!= -ENOENT
) {
7561 CrushWrapper newcrush
;
7562 _get_pending_crush(newcrush
);
7564 ret
= newcrush
.get_rule_id(rule_name
);
7565 if (ret
!= -ENOENT
) {
7566 // found it, wait for it to be proposed
7567 dout(20) << __func__
<< ": rule " << rule_name
7568 << " try again" << dendl
;
7571 // Cannot find it , return error
7572 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
7579 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
7581 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7582 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
7583 auto max_pgs
= max_pgs_per_osd
* num_osds
;
7584 uint64_t projected
= 0;
7586 projected
+= pg_num
* size
;
7588 for (const auto& i
: osdmap
.get_pools()) {
7589 if (i
.first
== pool
) {
7590 projected
+= pg_num
* size
;
7592 projected
+= i
.second
.get_pg_num_target() * i
.second
.get_size();
7595 if (projected
> max_pgs
) {
7597 *ss
<< "pool id " << pool
;
7599 *ss
<< " pg_num " << pg_num
<< " size " << size
7600 << " would mean " << projected
7601 << " total pgs, which exceeds max " << max_pgs
7602 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7603 << " * num_in_osds " << num_osds
<< ")";
7610 * @param name The name of the new pool
7611 * @param crush_rule The crush rule to use. If <0, will use the system default
7612 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7613 * @param pg_num The pg_num to use. If set to 0, will use the system default
7614 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7615 * @param repl_size Replication factor, or 0 for default
7616 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7617 * @param pool_type TYPE_ERASURE, or TYPE_REP
7618 * @param expected_num_objects expected number of objects on the pool
7619 * @param fast_read fast read type.
7620 * @param ss human readable error message, if any.
7622 * @return 0 on success, negative errno on failure.
7624 int OSDMonitor::prepare_new_pool(string
& name
,
7626 const string
&crush_rule_name
,
7627 unsigned pg_num
, unsigned pgp_num
,
7628 unsigned pg_num_min
,
7629 const uint64_t repl_size
,
7630 const uint64_t target_size_bytes
,
7631 const float target_size_ratio
,
7632 const string
&erasure_code_profile
,
7633 const unsigned pool_type
,
7634 const uint64_t expected_num_objects
,
7635 FastReadType fast_read
,
7636 const string
& pg_autoscale_mode
,
7639 if (name
.length() == 0)
7642 pg_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pg_num");
7644 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
7647 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7648 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7649 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7650 << " (you may adjust 'mon max pool pg num' for higher values)";
7653 if (pgp_num
> pg_num
) {
7654 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7655 << ", which in this case is " << pg_num
;
7658 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
7659 *ss
<< "'fast_read' can only apply to erasure coding pool";
7663 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
7664 crush_rule_name
, &crush_rule
, ss
);
7666 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
7669 if (g_conf()->mon_osd_crush_smoke_test
) {
7670 CrushWrapper newcrush
;
7671 _get_pending_crush(newcrush
);
7673 CrushTester
tester(newcrush
, err
);
7674 tester
.set_min_x(0);
7675 tester
.set_max_x(50);
7676 tester
.set_rule(crush_rule
);
7677 auto start
= ceph::coarse_mono_clock::now();
7678 r
= tester
.test_with_fork(g_conf()->mon_lease
);
7679 auto duration
= ceph::coarse_mono_clock::now() - start
;
7681 dout(10) << "tester.test_with_fork returns " << r
7682 << ": " << err
.str() << dendl
;
7683 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
7686 dout(10) << __func__
<< " crush smoke test duration: "
7687 << duration
<< dendl
;
7689 unsigned size
, min_size
;
7690 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
7691 &size
, &min_size
, ss
);
7693 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
7696 r
= check_pg_num(-1, pg_num
, size
, ss
);
7698 dout(10) << "check_pg_num returns " << r
<< dendl
;
7702 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
7706 uint32_t stripe_width
= 0;
7707 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
7709 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
7714 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7715 switch (fast_read
) {
7722 case FAST_READ_DEFAULT
:
7723 fread
= g_conf()->osd_pool_default_ec_fast_read
;
7726 *ss
<< "invalid fast_read setting: " << fast_read
;
7731 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
7732 p
!= pending_inc
.new_pool_names
.end();
7734 if (p
->second
== name
)
7738 if (-1 == pending_inc
.new_pool_max
)
7739 pending_inc
.new_pool_max
= osdmap
.pool_max
;
7740 int64_t pool
= ++pending_inc
.new_pool_max
;
7742 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
7743 pi
->create_time
= ceph_clock_now();
7744 pi
->type
= pool_type
;
7745 pi
->fast_read
= fread
;
7746 pi
->flags
= g_conf()->osd_pool_default_flags
;
7747 if (g_conf()->osd_pool_default_flag_hashpspool
)
7748 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
7749 if (g_conf()->osd_pool_default_flag_nodelete
)
7750 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
7751 if (g_conf()->osd_pool_default_flag_nopgchange
)
7752 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
7753 if (g_conf()->osd_pool_default_flag_nosizechange
)
7754 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
7755 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
7756 if (g_conf()->osd_pool_use_gmt_hitset
)
7757 pi
->use_gmt_hitset
= true;
7759 pi
->use_gmt_hitset
= false;
7762 pi
->min_size
= min_size
;
7763 pi
->crush_rule
= crush_rule
;
7764 pi
->expected_num_objects
= expected_num_objects
;
7765 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
7767 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7768 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
7769 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7770 pi
->pg_autoscale_mode
= m
;
7772 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
7774 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
7776 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
7778 pi
->set_pg_num_pending(pi
->get_pg_num());
7779 pi
->set_pg_num_target(pg_num
);
7780 pi
->set_pgp_num(pi
->get_pg_num());
7781 pi
->set_pgp_num_target(pgp_num
);
7782 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
7784 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
7786 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7787 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7788 pi
->pg_autoscale_mode
= m
;
7791 pi
->last_change
= pending_inc
.epoch
;
7794 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7795 pi
->erasure_code_profile
= erasure_code_profile
;
7797 pi
->erasure_code_profile
= "";
7799 pi
->stripe_width
= stripe_width
;
7801 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
7802 target_size_bytes
) {
7803 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7804 // larger than int32_t max.
7805 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
7807 if (target_size_ratio
> 0.0 &&
7808 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
7809 // only store for nautilus+, just to be consistent and tidy.
7810 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
7813 pi
->cache_target_dirty_ratio_micro
=
7814 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
7815 pi
->cache_target_dirty_high_ratio_micro
=
7816 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
7817 pi
->cache_target_full_ratio_micro
=
7818 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
7819 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
7820 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
7822 pending_inc
.new_pool_names
[pool
] = name
;
7826 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
7828 op
->mark_osdmon_event(__func__
);
7830 if (pending_inc
.new_flags
< 0)
7831 pending_inc
.new_flags
= osdmap
.get_flags();
7832 pending_inc
.new_flags
|= flag
;
7833 ss
<< OSDMap::get_flag_string(flag
) << " is set";
7834 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7835 get_last_committed() + 1));
7839 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
7841 op
->mark_osdmon_event(__func__
);
7843 if (pending_inc
.new_flags
< 0)
7844 pending_inc
.new_flags
= osdmap
.get_flags();
7845 pending_inc
.new_flags
&= ~flag
;
7846 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
7847 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7848 get_last_committed() + 1));
7852 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
7856 cmd_getval(cmdmap
, "pool", poolstr
);
7857 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
7859 ss
<< "unrecognized pool '" << poolstr
<< "'";
7863 cmd_getval(cmdmap
, "var", var
);
7865 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
7866 if (pending_inc
.new_pools
.count(pool
))
7867 p
= pending_inc
.new_pools
[pool
];
7869 // accept val as a json string in the normal case (current
7870 // generation monitor). parse out int or float values from the
7871 // string as needed. however, if it is not a string, try to pull
7872 // out an int, in case an older monitor with an older json schema is
7873 // forwarding a request.
7875 string interr
, floaterr
;
7878 int64_t uf
= 0; // micro-f
7879 cmd_getval(cmdmap
, "val", val
);
7882 "target_max_objects"
7884 auto iec_options
= {
7886 "target_size_bytes",
7887 "compression_max_blob_size",
7888 "compression_min_blob_size",
7892 if (count(begin(si_options
), end(si_options
), var
)) {
7893 n
= strict_si_cast
<int64_t>(val
.c_str(), &interr
);
7894 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
7895 n
= strict_iec_cast
<int64_t>(val
.c_str(), &interr
);
7897 // parse string as both int and float; different fields use different types.
7898 n
= strict_strtoll(val
.c_str(), 10, &interr
);
7899 f
= strict_strtod(val
.c_str(), &floaterr
);
7900 uf
= llrintl(f
* (double)1000000.0);
7904 (var
== "hit_set_type" || var
== "hit_set_period" ||
7905 var
== "hit_set_count" || var
== "hit_set_fpp" ||
7906 var
== "target_max_objects" || var
== "target_max_bytes" ||
7907 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
7908 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
7909 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
7910 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
7911 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
7915 if (var
== "size") {
7916 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
7917 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
7920 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
7921 ss
<< "can not change the size of an erasure-coded pool";
7924 if (interr
.length()) {
7925 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7928 if (n
<= 0 || n
> 10) {
7929 ss
<< "pool size must be between 1 and 10";
7932 if (!osdmap
.crush
->check_crush_rule(p
.get_crush_rule(), p
.type
, n
, ss
)) {
7935 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
7942 } else if (var
== "min_size") {
7943 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
7944 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
7947 if (interr
.length()) {
7948 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7952 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
7953 if (n
< 1 || n
> p
.size
) {
7954 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
7958 ErasureCodeInterfaceRef erasure_code
;
7961 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
7963 k
= erasure_code
->get_data_chunk_count();
7965 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
7969 if (n
< k
|| n
> p
.size
) {
7970 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
7975 } else if (var
== "pg_num_actual") {
7976 if (interr
.length()) {
7977 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7980 if (n
== (int)p
.get_pg_num()) {
7983 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7984 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7985 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7986 << " (you may adjust 'mon max pool pg num' for higher values)";
7989 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
7990 ss
<< "cannot adjust pg_num while initial PGs are being created";
7993 if (n
> (int)p
.get_pg_num()) {
7994 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
7995 // force pre-nautilus clients to resend their ops, since they
7996 // don't understand pg_num_pending changes form a new interval
7997 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8001 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8002 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8005 if (n
< (int)p
.get_pgp_num()) {
8006 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8009 if (n
< (int)p
.get_pg_num() - 1) {
8010 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8011 << ") - 1; only single pg decrease is currently supported";
8014 p
.set_pg_num_pending(n
);
8015 // force pre-nautilus clients to resend their ops, since they
8016 // don't understand pg_num_pending changes form a new interval
8017 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8019 // force pre-luminous clients to resend their ops, since they
8020 // don't understand that split PGs now form a new interval.
8021 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8022 } else if (var
== "pg_num") {
8023 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8024 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8027 if (interr
.length()) {
8028 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8031 if (n
== (int)p
.get_pg_num_target()) {
8034 if (n
<= 0 || static_cast<uint64_t>(n
) >
8035 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8036 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8037 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8038 << " (you may adjust 'mon max pool pg num' for higher values)";
8041 if (n
> (int)p
.get_pg_num_target()) {
8042 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
8047 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8048 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8049 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8053 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8054 ss
<< "nautilus OSDs are required to decrease pg_num";
8058 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8059 // pre-nautilus osdmap format; increase pg_num directly
8060 assert(n
> (int)p
.get_pg_num());
8061 // force pre-nautilus clients to resend their ops, since they
8062 // don't understand pg_num_target changes form a new interval
8063 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8064 // force pre-luminous clients to resend their ops, since they
8065 // don't understand that split PGs now form a new interval.
8066 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8069 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8070 // make pgp_num track pg_num if it already matches. if it is set
8071 // differently, leave it different and let the user control it
8073 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8074 p
.set_pgp_num_target(n
);
8076 p
.set_pg_num_target(n
);
8078 } else if (var
== "pgp_num_actual") {
8079 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8080 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8083 if (interr
.length()) {
8084 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8088 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8091 if (n
> (int)p
.get_pg_num()) {
8092 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8095 if (n
> (int)p
.get_pg_num_pending()) {
8096 ss
<< "specified pgp_num " << n
8097 << " > pg_num_pending " << p
.get_pg_num_pending();
8101 } else if (var
== "pgp_num") {
8102 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8103 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8106 if (interr
.length()) {
8107 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8111 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8114 if (n
> (int)p
.get_pg_num_target()) {
8115 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8118 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8119 // pre-nautilus osdmap format; increase pgp_num directly
8122 p
.set_pgp_num_target(n
);
8124 } else if (var
== "pg_autoscale_mode") {
8125 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8126 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8127 ss
<< "specified invalid mode " << val
;
8130 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8131 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8134 p
.pg_autoscale_mode
= m
;
8135 } else if (var
== "crush_rule") {
8136 int id
= osdmap
.crush
->get_rule_id(val
);
8137 if (id
== -ENOENT
) {
8138 ss
<< "crush rule " << val
<< " does not exist";
8142 ss
<< cpp_strerror(id
);
8145 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
8149 } else if (var
== "nodelete" || var
== "nopgchange" ||
8150 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8151 var
== "noscrub" || var
== "nodeep-scrub") {
8152 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8153 // make sure we only compare against 'n' if we didn't receive a string
8154 if (val
== "true" || (interr
.empty() && n
== 1)) {
8156 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8159 ss
<< "expecting value 'true', 'false', '0', or '1'";
8162 } else if (var
== "hashpspool") {
8163 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8165 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8168 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8169 " this triggers large data movement,"
8170 " pass --yes-i-really-mean-it if you really do.";
8173 // make sure we only compare against 'n' if we didn't receive a string
8174 if (val
== "true" || (interr
.empty() && n
== 1)) {
8176 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8179 ss
<< "expecting value 'true', 'false', '0', or '1'";
8182 } else if (var
== "hit_set_type") {
8184 p
.hit_set_params
= HitSet::Params();
8186 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8189 if (val
== "bloom") {
8190 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8191 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8192 p
.hit_set_params
= HitSet::Params(bsp
);
8193 } else if (val
== "explicit_hash")
8194 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8195 else if (val
== "explicit_object")
8196 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8198 ss
<< "unrecognized hit_set type '" << val
<< "'";
8202 } else if (var
== "hit_set_period") {
8203 if (interr
.length()) {
8204 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8207 ss
<< "hit_set_period should be non-negative";
8210 p
.hit_set_period
= n
;
8211 } else if (var
== "hit_set_count") {
8212 if (interr
.length()) {
8213 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8216 ss
<< "hit_set_count should be non-negative";
8219 p
.hit_set_count
= n
;
8220 } else if (var
== "hit_set_fpp") {
8221 if (floaterr
.length()) {
8222 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8224 } else if (f
< 0 || f
> 1.0) {
8225 ss
<< "hit_set_fpp should be in the range 0..1";
8228 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8229 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8232 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
8234 } else if (var
== "use_gmt_hitset") {
8235 if (val
== "true" || (interr
.empty() && n
== 1)) {
8236 p
.use_gmt_hitset
= true;
8238 ss
<< "expecting value 'true' or '1'";
8241 } else if (var
== "allow_ec_overwrites") {
8242 if (!p
.is_erasure()) {
8243 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8247 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8248 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8249 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8252 if (val
== "true" || (interr
.empty() && n
== 1)) {
8253 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8254 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8255 ss
<< "ec overwrites cannot be disabled once enabled";
8258 ss
<< "expecting value 'true', 'false', '0', or '1'";
8261 } else if (var
== "target_max_objects") {
8262 if (interr
.length()) {
8263 ss
<< "error parsing int '" << val
<< "': " << interr
;
8266 p
.target_max_objects
= n
;
8267 } else if (var
== "target_max_bytes") {
8268 if (interr
.length()) {
8269 ss
<< "error parsing int '" << val
<< "': " << interr
;
8272 p
.target_max_bytes
= n
;
8273 } else if (var
== "cache_target_dirty_ratio") {
8274 if (floaterr
.length()) {
8275 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8278 if (f
< 0 || f
> 1.0) {
8279 ss
<< "value must be in the range 0..1";
8282 p
.cache_target_dirty_ratio_micro
= uf
;
8283 } else if (var
== "cache_target_dirty_high_ratio") {
8284 if (floaterr
.length()) {
8285 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8288 if (f
< 0 || f
> 1.0) {
8289 ss
<< "value must be in the range 0..1";
8292 p
.cache_target_dirty_high_ratio_micro
= uf
;
8293 } else if (var
== "cache_target_full_ratio") {
8294 if (floaterr
.length()) {
8295 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8298 if (f
< 0 || f
> 1.0) {
8299 ss
<< "value must be in the range 0..1";
8302 p
.cache_target_full_ratio_micro
= uf
;
8303 } else if (var
== "cache_min_flush_age") {
8304 if (interr
.length()) {
8305 ss
<< "error parsing int '" << val
<< "': " << interr
;
8308 p
.cache_min_flush_age
= n
;
8309 } else if (var
== "cache_min_evict_age") {
8310 if (interr
.length()) {
8311 ss
<< "error parsing int '" << val
<< "': " << interr
;
8314 p
.cache_min_evict_age
= n
;
8315 } else if (var
== "min_read_recency_for_promote") {
8316 if (interr
.length()) {
8317 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8320 p
.min_read_recency_for_promote
= n
;
8321 } else if (var
== "hit_set_grade_decay_rate") {
8322 if (interr
.length()) {
8323 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8326 if (n
> 100 || n
< 0) {
8327 ss
<< "value out of range,valid range is 0 - 100";
8330 p
.hit_set_grade_decay_rate
= n
;
8331 } else if (var
== "hit_set_search_last_n") {
8332 if (interr
.length()) {
8333 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8336 if (n
> p
.hit_set_count
|| n
< 0) {
8337 ss
<< "value out of range,valid range is 0 - hit_set_count";
8340 p
.hit_set_search_last_n
= n
;
8341 } else if (var
== "min_write_recency_for_promote") {
8342 if (interr
.length()) {
8343 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8346 p
.min_write_recency_for_promote
= n
;
8347 } else if (var
== "fast_read") {
8348 if (p
.is_replicated()) {
8349 ss
<< "fast read is not supported in replication pool";
8352 if (val
== "true" || (interr
.empty() && n
== 1)) {
8354 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8355 p
.fast_read
= false;
8357 ss
<< "expecting value 'true', 'false', '0', or '1'";
8360 } else if (pool_opts_t::is_opt_name(var
)) {
8361 bool unset
= val
== "unset";
8362 if (var
== "compression_mode") {
8364 auto cmode
= Compressor::get_comp_mode_type(val
);
8366 ss
<< "unrecognized compression mode '" << val
<< "'";
8370 } else if (var
== "compression_algorithm") {
8372 auto alg
= Compressor::get_comp_alg_type(val
);
8374 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8378 } else if (var
== "compression_required_ratio") {
8379 if (floaterr
.length()) {
8380 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8383 if (f
< 0 || f
> 1) {
8384 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8387 } else if (var
== "csum_type") {
8388 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8390 ss
<< "unrecognized csum_type '" << val
<< "'";
8393 //preserve csum_type numeric value
8396 } else if (var
== "compression_max_blob_size" ||
8397 var
== "compression_min_blob_size" ||
8398 var
== "csum_max_block" ||
8399 var
== "csum_min_block") {
8400 if (interr
.length()) {
8401 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8404 } else if (var
== "fingerprint_algorithm") {
8406 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8408 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8412 } else if (var
== "target_size_bytes") {
8413 if (interr
.length()) {
8414 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8417 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8418 ss
<< "must set require_osd_release to nautilus or "
8419 << "later before setting target_size_bytes";
8422 } else if (var
== "pg_num_min") {
8423 if (interr
.length()) {
8424 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8427 if (n
> (int)p
.get_pg_num_target()) {
8428 ss
<< "specified pg_num_min " << n
8429 << " > pg_num " << p
.get_pg_num_target();
8432 } else if (var
== "recovery_priority") {
8433 if (interr
.length()) {
8434 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8437 if (!g_conf()->debug_allow_any_pool_priority
) {
8438 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8439 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8440 << " and " << OSD_POOL_PRIORITY_MAX
;
8444 } else if (var
== "pg_autoscale_bias") {
8445 if (f
< 0.0 || f
> 1000.0) {
8446 ss
<< "pg_autoscale_bias must be between 0 and 1000";
8451 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8452 switch (desc
.type
) {
8453 case pool_opts_t::STR
:
8455 p
.opts
.unset(desc
.key
);
8457 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
8460 case pool_opts_t::INT
:
8461 if (interr
.length()) {
8462 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8466 p
.opts
.unset(desc
.key
);
8468 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
8471 case pool_opts_t::DOUBLE
:
8472 if (floaterr
.length()) {
8473 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8477 p
.opts
.unset(desc
.key
);
8479 p
.opts
.set(desc
.key
, static_cast<double>(f
));
8483 ceph_assert(!"unknown type");
8486 ss
<< "unrecognized variable '" << var
<< "'";
8489 if (val
!= "unset") {
8490 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
8492 ss
<< "unset pool " << pool
<< " " << var
;
8494 p
.last_change
= pending_inc
.epoch
;
8495 pending_inc
.new_pools
[pool
] = p
;
8499 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
8500 const cmdmap_t
& cmdmap
,
8503 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
8506 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
8507 const cmdmap_t
& cmdmap
,
8511 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
/**
 * Common logic for preprocess and prepare phases of pool application
 * tag commands. In preprocess mode we're only detecting invalid
 * commands, and determining whether it was a modification or a no-op.
 * In prepare mode we're actually updating the pending state.
 */
int OSDMonitor::_command_pool_application(const string &prefix,
                                          const cmdmap_t& cmdmap,
                                          stringstream& ss,
                                          bool *modified,
                                          bool preparing)
{
  string pool_name;
  cmd_getval(cmdmap, "pool", pool_name);
  int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    return -ENOENT;
  }

  // work on a copy; prefer a pending (not-yet-committed) version if one exists
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (preparing) {
    if (pending_inc.new_pools.count(pool)) {
      p = pending_inc.new_pools[pool];
    }
  }

  string app;
  cmd_getval(cmdmap, "app", app);
  bool app_exists = (p.application_metadata.count(app) > 0);

  string key;
  cmd_getval(cmdmap, "key", key);
  if (key == "all") {
    ss << "key cannot be 'all'";
    return -EINVAL;
  }

  string value;
  cmd_getval(cmdmap, "value", value);
  if (value == "all") {
    ss << "value cannot be 'all'";
    return -EINVAL;
  }

  if (boost::algorithm::ends_with(prefix, "enable")) {
    if (app.empty()) {
      ss << "application name must be provided";
      return -EINVAL;
    }

    if (p.is_tier()) {
      ss << "application must be enabled on base tier";
      return -EINVAL;
    }

    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!app_exists && !p.application_metadata.empty() && !force) {
      ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
         << "application; pass --yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
      ss << "too many enabled applications on pool '" << pool_name << "'; "
         << "max " << MAX_POOL_APPLICATIONS;
      return -EINVAL;
    }

    if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "application name '" << app << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    if (!app_exists) {
      p.application_metadata[app] = {};
    }
    ss << "enabled application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "disable")) {
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "Are you SURE? Disabling an application within a pool might result "
         << "in loss of application functionality; pass "
         << "--yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return 0; // idempotent
    }

    p.application_metadata.erase(app);
    ss << "disable application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "set")) {
    if (p.is_tier()) {
      ss << "application metadata must be set on base tier";
      return -EINVAL;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    string key;
    cmd_getval(cmdmap, "key", key);

    if (key.empty()) {
      ss << "key must be provided";
      return -EINVAL;
    }

    auto &app_keys = p.application_metadata[app];
    if (app_keys.count(key) == 0 &&
        app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
      ss << "too many keys set for application '" << app << "' on pool '"
         << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
      return -EINVAL;
    }

    if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
      // BUGFIX: this message previously printed the application name
      // instead of the offending key.
      ss << "key '" << key << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    string value;
    cmd_getval(cmdmap, "value", value);
    if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "value '" << value << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    p.application_metadata[app][key] = value;
    ss << "set application '" << app << "' key '" << key << "' to '"
       << value << "' on pool '" << pool_name << "'";
  } else if (boost::algorithm::ends_with(prefix, "rm")) {
    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    string key;
    cmd_getval(cmdmap, "key", key);
    auto it = p.application_metadata[app].find(key);
    if (it == p.application_metadata[app].end()) {
      ss << "application '" << app << "' on pool '" << pool_name
         << "' does not have key '" << key << "'";
      return 0; // idempotent
    }

    p.application_metadata[app].erase(it);
    ss << "removed application '" << app << "' key '" << key << "' on pool '"
       << pool_name << "'";
  } else {
    ceph_abort();
  }

  if (preparing) {
    p.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool] = p;
  }

  // Because we fell through this far, we didn't hit no-op cases,
  // so pool was definitely modified
  if (modified != nullptr) {
    *modified = true;
  }

  return 0;
}
8699 int OSDMonitor::_prepare_command_osd_crush_remove(
8700 CrushWrapper
&newcrush
,
8709 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
8712 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
8717 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
8719 pending_inc
.crush
.clear();
8720 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8723 int OSDMonitor::prepare_command_osd_crush_remove(
8724 CrushWrapper
&newcrush
,
8730 int err
= _prepare_command_osd_crush_remove(
8731 newcrush
, id
, ancestor
,
8732 has_ancestor
, unlink_only
);
8737 ceph_assert(err
== 0);
8738 do_osd_crush_remove(newcrush
);
8743 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
8745 if (osdmap
.is_up(id
)) {
8749 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
8750 pending_inc
.new_uuid
[id
] = uuid_d();
8751 pending_metadata_rm
.insert(id
);
8752 pending_metadata
.erase(id
);
8757 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
8759 ceph_assert(existing_id
);
8762 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
8763 if (!osdmap
.exists(i
) &&
8764 pending_inc
.new_up_client
.count(i
) == 0 &&
8765 (pending_inc
.new_state
.count(i
) == 0 ||
8766 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
8772 if (pending_inc
.new_max_osd
< 0) {
8773 return osdmap
.get_max_osd();
8775 return pending_inc
.new_max_osd
;
8778 void OSDMonitor::do_osd_create(
8781 const string
& device_class
,
8784 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
8785 ceph_assert(new_id
);
8787 // We presume validation has been performed prior to calling this
8788 // function. We assert with prejudice.
8790 int32_t allocated_id
= -1; // declare here so we can jump
8791 int32_t existing_id
= -1;
8792 if (!uuid
.is_zero()) {
8793 existing_id
= osdmap
.identify_osd(uuid
);
8794 if (existing_id
>= 0) {
8795 ceph_assert(id
< 0 || id
== existing_id
);
8796 *new_id
= existing_id
;
8798 } else if (id
>= 0) {
8799 // uuid does not exist, and id has been provided, so just create
8806 // allocate a new id
8807 allocated_id
= _allocate_osd_id(&existing_id
);
8808 dout(10) << __func__
<< " allocated id " << allocated_id
8809 << " existing id " << existing_id
<< dendl
;
8810 if (existing_id
>= 0) {
8811 ceph_assert(existing_id
< osdmap
.get_max_osd());
8812 ceph_assert(allocated_id
< 0);
8813 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
8814 *new_id
= existing_id
;
8815 } else if (allocated_id
>= 0) {
8816 ceph_assert(existing_id
< 0);
8818 if (pending_inc
.new_max_osd
< 0) {
8819 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
8821 ++pending_inc
.new_max_osd
;
8823 *new_id
= pending_inc
.new_max_osd
- 1;
8824 ceph_assert(*new_id
== allocated_id
);
8826 ceph_abort_msg("unexpected condition");
8830 if (device_class
.size()) {
8831 CrushWrapper newcrush
;
8832 _get_pending_crush(newcrush
);
8833 if (newcrush
.get_max_devices() < *new_id
+ 1) {
8834 newcrush
.set_max_devices(*new_id
+ 1);
8836 string name
= string("osd.") + stringify(*new_id
);
8837 if (!newcrush
.item_exists(*new_id
)) {
8838 newcrush
.set_item_name(*new_id
, name
);
8841 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
8843 derr
<< __func__
<< " failed to set " << name
<< " device_class "
8844 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
8846 // non-fatal... this might be a replay and we want to be idempotent.
8848 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
8850 pending_inc
.crush
.clear();
8851 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8854 dout(20) << __func__
<< " no device_class" << dendl
;
8857 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
8858 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
8859 pending_inc
.new_max_osd
= *new_id
+ 1;
8862 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
8863 if (!uuid
.is_zero())
8864 pending_inc
.new_uuid
[*new_id
] = uuid
;
8867 int OSDMonitor::validate_osd_create(
8870 const bool check_osd_exists
,
8871 int32_t* existing_id
,
8875 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
8876 << " check_osd_exists " << check_osd_exists
<< dendl
;
8878 ceph_assert(existing_id
);
8880 if (id
< 0 && uuid
.is_zero()) {
8881 // we have nothing to validate
8884 } else if (uuid
.is_zero()) {
8885 // we have an id but we will ignore it - because that's what
8886 // `osd create` does.
8891 * This function will be used to validate whether we are able to
8892 * create a new osd when the `uuid` is specified.
8894 * It will be used by both `osd create` and `osd new`, as the checks
8895 * are basically the same when it pertains to osd id and uuid validation.
8896 * However, `osd create` presumes an `uuid` is optional, for legacy
8897 * reasons, while `osd new` requires the `uuid` to be provided. This
8898 * means that `osd create` will not be idempotent if an `uuid` is not
8899 * provided, but we will always guarantee the idempotency of `osd new`.
8902 ceph_assert(!uuid
.is_zero());
8903 if (pending_inc
.identify_osd(uuid
) >= 0) {
8904 // osd is about to exist
8908 int32_t i
= osdmap
.identify_osd(uuid
);
8910 // osd already exists
8911 if (id
>= 0 && i
!= id
) {
8912 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
8915 // return a positive errno to distinguish between a blocking error
8916 // and an error we consider to not be a problem (i.e., this would be
8917 // an idempotent operation).
8923 if (pending_inc
.new_state
.count(id
)) {
8924 // osd is about to exist
8927 // we may not care if an osd exists if we are recreating a previously
8929 if (check_osd_exists
&& osdmap
.exists(id
)) {
8930 ss
<< "id " << id
<< " already in use and does not match uuid "
8938 int OSDMonitor::prepare_command_osd_create(
8941 int32_t* existing_id
,
8944 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
8945 ceph_assert(existing_id
);
8946 if (osdmap
.is_destroyed(id
)) {
8947 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
8952 if (uuid
.is_zero()) {
8953 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
8956 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
8959 int OSDMonitor::prepare_command_osd_new(
8961 const cmdmap_t
& cmdmap
,
8962 const map
<string
,string
>& params
,
8970 ceph_assert(paxos
->is_plugged());
8972 dout(10) << __func__
<< " " << op
<< dendl
;
8974 /* validate command. abort now if something's wrong. */
8976 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
8978 * If `id` is not specified, we will identify any existing osd based
8979 * on `uuid`. Operation will be idempotent iff secrets match.
8981 * If `id` is specified, we will identify any existing osd based on
8982 * `uuid` and match against `id`. If they match, operation will be
8983 * idempotent iff secrets match.
8985 * `-i secrets.json` will be optional. If supplied, will be used
8986 * to check for idempotency when `id` and `uuid` match.
8988 * If `id` is not specified, and `uuid` does not exist, an id will
8989 * be found or allocated for the osd.
8991 * If `id` is specified, and the osd has been previously marked
8992 * as destroyed, then the `id` will be reused.
8994 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
8995 ss
<< "requires the OSD's UUID to be specified.";
8997 } else if (!uuid
.parse(uuidstr
.c_str())) {
8998 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9002 if (cmd_getval(cmdmap
, "id", id
) &&
9004 ss
<< "invalid OSD id; must be greater or equal than zero.";
9008 // are we running an `osd create`-like command, or recreating
9009 // a previously destroyed osd?
9011 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9013 // we will care about `id` to assess whether osd is `destroyed`, or
9014 // to create a new osd.
9015 // we will need an `id` by the time we reach auth.
9017 int32_t existing_id
= -1;
9018 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9021 bool may_be_idempotent
= false;
9022 if (err
== EEXIST
) {
9023 // this is idempotent from the osdmon's point-of-view
9024 may_be_idempotent
= true;
9025 ceph_assert(existing_id
>= 0);
9027 } else if (err
< 0) {
9031 if (!may_be_idempotent
) {
9032 // idempotency is out of the window. We are either creating a new
9033 // osd or recreating a destroyed osd.
9035 // We now need to figure out if we have an `id` (and if it's valid),
9036 // of find an `id` if we don't have one.
9038 // NOTE: we need to consider the case where the `id` is specified for
9039 // `osd create`, and we must honor it. So this means checking if
9040 // the `id` is destroyed, and if so assume the destroy; otherwise,
9041 // check if it `exists` - in which case we complain about not being
9042 // `destroyed`. In the end, if nothing fails, we must allow the
9043 // creation, so that we are compatible with `create`.
9044 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9045 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9046 ss
<< "OSD " << id
<< " has not yet been destroyed";
9048 } else if (id
< 0) {
9050 id
= _allocate_osd_id(&existing_id
);
9052 ceph_assert(existing_id
>= 0);
9055 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9056 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9057 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9059 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9062 ceph_assert(id
>= 0);
9063 ceph_assert(osdmap
.exists(id
));
9066 // we are now able to either create a brand new osd or reuse an existing
9067 // osd that has been previously destroyed.
9069 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9071 if (may_be_idempotent
&& params
.empty()) {
9072 // nothing to do, really.
9073 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9074 ceph_assert(id
>= 0);
9076 f
->open_object_section("created_osd");
9077 f
->dump_int("osdid", id
);
9085 string device_class
;
9086 auto p
= params
.find("crush_device_class");
9087 if (p
!= params
.end()) {
9088 device_class
= p
->second
;
9089 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9091 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9092 bool has_lockbox
= false;
9093 bool has_secrets
= params
.count("cephx_secret")
9094 || params
.count("cephx_lockbox_secret")
9095 || params
.count("dmcrypt_key");
9097 ConfigKeyService
*svc
= nullptr;
9098 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9101 if (params
.count("cephx_secret") == 0) {
9102 ss
<< "requires a cephx secret.";
9105 cephx_secret
= params
.at("cephx_secret");
9107 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9108 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9110 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9111 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9113 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9115 lockbox_secret
= params
.at("cephx_lockbox_secret");
9116 dmcrypt_key
= params
.at("dmcrypt_key");
9117 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9118 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9122 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9124 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
9132 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9133 // for this to be idempotent, `id` should already be >= 0; no need
9134 // to use validate_id.
9135 ceph_assert(id
>= 0);
9136 ss
<< "osd." << id
<< " exists but secrets do not match";
9141 svc
= (ConfigKeyService
*)mon
->config_key_service
;
9142 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9145 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9146 ceph_assert(id
>= 0);
9147 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9152 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9153 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9155 if (may_be_idempotent
) {
9156 // we have nothing to do for either the osdmon or the authmon,
9157 // and we have no lockbox - so the config key service will not be
9158 // touched. This is therefore an idempotent operation, and we can
9159 // just return right away.
9160 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9161 ceph_assert(id
>= 0);
9163 f
->open_object_section("created_osd");
9164 f
->dump_int("osdid", id
);
9171 ceph_assert(!may_be_idempotent
);
9175 ceph_assert(!cephx_secret
.empty());
9176 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9177 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9179 err
= mon
->authmon()->do_osd_new(cephx_entity
,
9182 ceph_assert(0 == err
);
9185 ceph_assert(nullptr != svc
);
9186 svc
->do_osd_new(uuid
, dmcrypt_key
);
9190 if (is_recreate_destroyed
) {
9191 ceph_assert(id
>= 0);
9192 ceph_assert(osdmap
.is_destroyed(id
));
9193 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
9194 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9195 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9196 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9198 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9199 // due to http://tracker.ceph.com/issues/20751 some clusters may
9200 // have UP set for non-existent OSDs; make sure it is cleared
9201 // for a newly created osd.
9202 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9204 pending_inc
.new_uuid
[id
] = uuid
;
9206 ceph_assert(id
>= 0);
9207 int32_t new_id
= -1;
9208 do_osd_create(id
, uuid
, device_class
, &new_id
);
9209 ceph_assert(new_id
>= 0);
9210 ceph_assert(id
== new_id
);
9214 f
->open_object_section("created_osd");
9215 f
->dump_int("osdid", id
);
9224 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9226 op
->mark_osdmon_event(__func__
);
9227 auto m
= op
->get_req
<MMonCommand
>();
9230 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9231 string rs
= ss
.str();
9232 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
9236 MonSession
*session
= op
->get_session();
9238 derr
<< __func__
<< " no session" << dendl
;
9239 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
9243 return prepare_command_impl(op
, cmdmap
);
9246 static int parse_reweights(CephContext
*cct
,
9247 const cmdmap_t
& cmdmap
,
9248 const OSDMap
& osdmap
,
9249 map
<int32_t, uint32_t>* weights
)
9252 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9255 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9256 json_spirit::mValue json_value
;
9257 if (!json_spirit::read(weights_str
, json_value
)) {
9260 if (json_value
.type() != json_spirit::obj_type
) {
9263 const auto obj
= json_value
.get_obj();
9265 for (auto& osd_weight
: obj
) {
9266 auto osd_id
= std::stoi(osd_weight
.first
);
9267 if (!osdmap
.exists(osd_id
)) {
9270 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9273 auto weight
= std::stoul(osd_weight
.second
.get_str());
9274 weights
->insert({osd_id
, weight
});
9276 } catch (const std::logic_error
& e
) {
9282 int OSDMonitor::prepare_command_osd_destroy(
9286 ceph_assert(paxos
->is_plugged());
9288 // we check if the osd exists for the benefit of `osd purge`, which may
9289 // have previously removed the osd. If the osd does not exist, return
9290 // -ENOENT to convey this, and let the caller deal with it.
9292 // we presume that all auth secrets and config keys were removed prior
9293 // to this command being called. if they exist by now, we also assume
9294 // they must have been created by some other command and do not pertain
9295 // to this non-existent osd.
9296 if (!osdmap
.exists(id
)) {
9297 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9301 uuid_d uuid
= osdmap
.get_uuid(id
);
9302 dout(10) << __func__
<< " destroying osd." << id
9303 << " uuid " << uuid
<< dendl
;
9305 // if it has been destroyed, we assume our work here is done.
9306 if (osdmap
.is_destroyed(id
)) {
9307 ss
<< "destroyed osd." << id
;
9311 EntityName cephx_entity
, lockbox_entity
;
9312 bool idempotent_auth
= false, idempotent_cks
= false;
9314 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
9319 if (err
== -ENOENT
) {
9320 idempotent_auth
= true;
9326 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
9327 err
= svc
->validate_osd_destroy(id
, uuid
);
9329 ceph_assert(err
== -ENOENT
);
9331 idempotent_cks
= true;
9334 if (!idempotent_auth
) {
9335 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9336 ceph_assert(0 == err
);
9339 if (!idempotent_cks
) {
9340 svc
->do_osd_destroy(id
, uuid
);
9343 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9344 pending_inc
.new_uuid
[id
] = uuid_d();
9346 // we can only propose_pending() once per service, otherwise we'll be
9347 // defying PaxosService and all laws of nature. Therefore, as we may
9348 // be used during 'osd purge', let's keep the caller responsible for
9350 ceph_assert(err
== 0);
9354 int OSDMonitor::prepare_command_osd_purge(
9358 ceph_assert(paxos
->is_plugged());
9359 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9361 ceph_assert(!osdmap
.is_up(id
));
9364 * This may look a bit weird, but this is what's going to happen:
9366 * 1. we make sure that removing from crush works
9367 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9368 * error, then we abort the whole operation, as no updates
9369 * have been made. However, we this function will have
9370 * side-effects, thus we need to make sure that all operations
9371 * performed henceforth will *always* succeed.
9372 * 3. we call `prepare_command_osd_remove()`. Although this
9373 * function can return an error, it currently only checks if the
9374 * osd is up - and we have made sure that it is not so, so there
9375 * is no conflict, and it is effectively an update.
9376 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9377 * the crush update we delayed from before.
9380 CrushWrapper newcrush
;
9381 _get_pending_crush(newcrush
);
9383 bool may_be_idempotent
= false;
9385 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9386 if (err
== -ENOENT
) {
9388 may_be_idempotent
= true;
9389 } else if (err
< 0) {
9390 ss
<< "error removing osd." << id
<< " from crush";
9394 // no point destroying the osd again if it has already been marked destroyed
9395 if (!osdmap
.is_destroyed(id
)) {
9396 err
= prepare_command_osd_destroy(id
, ss
);
9398 if (err
== -ENOENT
) {
9404 may_be_idempotent
= false;
9407 ceph_assert(0 == err
);
9409 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9410 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9411 << "we are idempotent." << dendl
;
9415 err
= prepare_command_osd_remove(id
);
9416 // we should not be busy, as we should have made sure this id is not up.
9417 ceph_assert(0 == err
);
9419 do_osd_crush_remove(newcrush
);
9423 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9424 const cmdmap_t
& cmdmap
)
9426 op
->mark_osdmon_event(__func__
);
9427 auto m
= op
->get_req
<MMonCommand
>();
9435 cmd_getval(cmdmap
, "format", format
, string("plain"));
9436 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
9439 cmd_getval(cmdmap
, "prefix", prefix
);
9443 bool osdid_present
= false;
9444 if (prefix
!= "osd pg-temp" &&
9445 prefix
!= "osd pg-upmap" &&
9446 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
9447 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
9449 if (osdid_present
) {
9451 oss
<< "osd." << osdid
;
9452 osd_name
= oss
.str();
9455 // Even if there's a pending state with changes that could affect
9456 // a command, considering that said state isn't yet committed, we
9457 // just don't care about those changes if the command currently being
9458 // handled acts as a no-op against the current committed state.
9459 // In a nutshell, we assume this command happens *before*.
9461 // Let me make this clearer:
9463 // - If we have only one client, and that client issues some
9464 // operation that would conflict with this operation but is
9465 // still on the pending state, then we would be sure that said
9466 // operation wouldn't have returned yet, so the client wouldn't
9467 // issue this operation (unless the client didn't wait for the
9468 // operation to finish, and that would be the client's own fault).
9470 // - If we have more than one client, each client will observe
9471 // whatever is the state at the moment of the commit. So, if we
9472 // have two clients, one issuing an unlink and another issuing a
9473 // link, and if the link happens while the unlink is still on the
9474 // pending state, from the link's point-of-view this is a no-op.
9475 // If different clients are issuing conflicting operations and
9476 // they care about that, then the clients should make sure they
9477 // enforce some kind of concurrency mechanism -- from our
9478 // perspective that's what Douglas Adams would call an SEP.
9480 // This should be used as a general guideline for most commands handled
9481 // in this function. Adapt as you see fit, but please bear in mind that
9482 // this is the expected behavior.
9485 if (prefix
== "osd setcrushmap" ||
9486 (prefix
== "osd crush set" && !osdid_present
)) {
9487 if (pending_inc
.crush
.length()) {
9488 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
9489 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9492 dout(10) << "prepare_command setting new crush map" << dendl
;
9493 bufferlist
data(m
->get_data());
9496 auto bl
= data
.cbegin();
9499 catch (const std::exception
&e
) {
9501 ss
<< "Failed to parse crushmap: " << e
.what();
9505 int64_t prior_version
= 0;
9506 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
9507 if (prior_version
== osdmap
.get_crush_version() - 1) {
9508 // see if we are a resend of the last update. this is imperfect
9509 // (multiple racing updaters may not both get reliable success)
9510 // but we expect crush updaters (via this interface) to be rare-ish.
9511 bufferlist current
, proposed
;
9512 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
9513 crush
.encode(proposed
, mon
->get_quorum_con_features());
9514 if (current
.contents_equal(proposed
)) {
9515 dout(10) << __func__
9516 << " proposed matches current and version equals previous"
9519 ss
<< osdmap
.get_crush_version();
9523 if (prior_version
!= osdmap
.get_crush_version()) {
9525 ss
<< "prior_version " << prior_version
<< " != crush version "
9526 << osdmap
.get_crush_version();
9531 if (crush
.has_legacy_rule_ids()) {
9533 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
9536 if (!validate_crush_against_features(&crush
, ss
)) {
9541 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
9546 if (g_conf()->mon_osd_crush_smoke_test
) {
9547 // sanity check: test some inputs to make sure this map isn't
9549 dout(10) << " testing map" << dendl
;
9551 CrushTester
tester(crush
, ess
);
9552 tester
.set_min_x(0);
9553 tester
.set_max_x(50);
9554 auto start
= ceph::coarse_mono_clock::now();
9555 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
9556 auto duration
= ceph::coarse_mono_clock::now() - start
;
9558 dout(10) << " tester.test_with_fork returns " << r
9559 << ": " << ess
.str() << dendl
;
9560 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
9564 dout(10) << __func__
<< " crush somke test duration: "
9565 << duration
<< ", result: " << ess
.str() << dendl
;
9568 pending_inc
.crush
= data
;
9569 ss
<< osdmap
.get_crush_version() + 1;
9572 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
9573 CrushWrapper newcrush
;
9574 _get_pending_crush(newcrush
);
9575 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
9577 if (newcrush
.bucket_exists(bid
) &&
9578 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
9579 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
9580 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
9583 if (!validate_crush_against_features(&newcrush
, ss
)) {
9587 pending_inc
.crush
.clear();
9588 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9589 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9590 get_last_committed() + 1));
9592 } else if (prefix
== "osd crush set-device-class") {
9593 string device_class
;
9594 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9595 err
= -EINVAL
; // no value!
9600 vector
<string
> idvec
;
9601 cmd_getval(cmdmap
, "ids", idvec
);
9602 CrushWrapper newcrush
;
9603 _get_pending_crush(newcrush
);
9605 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9609 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9610 osdmap
.get_all_osds(osds
);
9613 // try traditional single osd way
9614 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9616 // ss has reason for failure
9617 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9624 for (auto &osd
: osds
) {
9625 if (!osdmap
.exists(osd
)) {
9626 ss
<< "osd." << osd
<< " does not exist. ";
9631 oss
<< "osd." << osd
;
9632 string name
= oss
.str();
9634 if (newcrush
.get_max_devices() < osd
+ 1) {
9635 newcrush
.set_max_devices(osd
+ 1);
9638 if (newcrush
.item_exists(osd
)) {
9639 action
= "updating";
9641 action
= "creating";
9642 newcrush
.set_item_name(osd
, name
);
9645 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
9646 << "' device_class '" << device_class
<< "'"
9648 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
9652 if (err
== 0 && !_have_pending_crush()) {
9654 // for single osd only, wildcard makes too much noise
9655 ss
<< "set-device-class item id " << osd
<< " name '" << name
9656 << "' device_class '" << device_class
<< "': no change. ";
9659 updated
.insert(osd
);
9664 if (!updated
.empty()) {
9665 pending_inc
.crush
.clear();
9666 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9667 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
9669 wait_for_finished_proposal(op
,
9670 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9674 } else if (prefix
== "osd crush rm-device-class") {
9676 vector
<string
> idvec
;
9677 cmd_getval(cmdmap
, "ids", idvec
);
9678 CrushWrapper newcrush
;
9679 _get_pending_crush(newcrush
);
9682 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9687 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9688 osdmap
.get_all_osds(osds
);
9691 // try traditional single osd way
9692 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9694 // ss has reason for failure
9695 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9702 for (auto &osd
: osds
) {
9703 if (!osdmap
.exists(osd
)) {
9704 ss
<< "osd." << osd
<< " does not exist. ";
9708 auto class_name
= newcrush
.get_item_class(osd
);
9710 ss
<< "osd." << osd
<< " belongs to no class, ";
9713 // note that we do not verify if class_is_in_use here
9714 // in case the device is misclassified and user wants
9715 // to overridely reset...
9717 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
9719 // ss has reason for failure
9722 updated
.insert(osd
);
9726 if (!updated
.empty()) {
9727 pending_inc
.crush
.clear();
9728 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9729 ss
<< "done removing class of osd(s): " << updated
;
9731 wait_for_finished_proposal(op
,
9732 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9735 } else if (prefix
== "osd crush class create") {
9736 string device_class
;
9737 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9738 err
= -EINVAL
; // no value!
9741 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
9742 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9743 << "luminous' before using crush device classes";
9747 if (!_have_pending_crush() &&
9748 _get_stable_crush().class_exists(device_class
)) {
9749 ss
<< "class '" << device_class
<< "' already exists";
9752 CrushWrapper newcrush
;
9753 _get_pending_crush(newcrush
);
9754 if (newcrush
.class_exists(device_class
)) {
9755 ss
<< "class '" << device_class
<< "' already exists";
9758 int class_id
= newcrush
.get_or_create_class_id(device_class
);
9759 pending_inc
.crush
.clear();
9760 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9761 ss
<< "created class " << device_class
<< " with id " << class_id
9764 } else if (prefix
== "osd crush class rm") {
9765 string device_class
;
9766 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9767 err
= -EINVAL
; // no value!
9770 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
9771 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9772 << "luminous' before using crush device classes";
9777 if (!osdmap
.crush
->class_exists(device_class
)) {
9782 CrushWrapper newcrush
;
9783 _get_pending_crush(newcrush
);
9784 if (!newcrush
.class_exists(device_class
)) {
9785 err
= 0; // make command idempotent
9788 int class_id
= newcrush
.get_class_id(device_class
);
9790 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
9792 ss
<< "class '" << device_class
<< "' " << ts
.str();
9796 // check if class is used by any erasure-code-profiles
9797 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
9798 osdmap
.get_erasure_code_profiles();
9799 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
9800 #ifdef HAVE_STDLIB_MAP_SPLICING
9801 ec_profiles
.merge(old_ec_profiles
);
9803 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
9804 make_move_iterator(end(old_ec_profiles
)));
9806 list
<string
> referenced_by
;
9807 for (auto &i
: ec_profiles
) {
9808 for (auto &j
: i
.second
) {
9809 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
9810 referenced_by
.push_back(i
.first
);
9814 if (!referenced_by
.empty()) {
9816 ss
<< "class '" << device_class
9817 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
9822 newcrush
.get_devices_by_class(device_class
, &osds
);
9823 for (auto& p
: osds
) {
9824 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
9826 // ss has reason for failure
9832 // empty class, remove directly
9833 err
= newcrush
.remove_class_name(device_class
);
9835 ss
<< "class '" << device_class
<< "' cannot be removed '"
9836 << cpp_strerror(err
) << "'";
9841 pending_inc
.crush
.clear();
9842 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9843 ss
<< "removed class " << device_class
<< " with id " << class_id
9844 << " from crush map";
9846 } else if (prefix
== "osd crush class rename") {
9847 string srcname
, dstname
;
9848 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
9852 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
9857 CrushWrapper newcrush
;
9858 _get_pending_crush(newcrush
);
9859 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
9860 // suppose this is a replay and return success
9861 // so command is idempotent
9862 ss
<< "already renamed to '" << dstname
<< "'";
9867 err
= newcrush
.rename_class(srcname
, dstname
);
9869 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
9870 << cpp_strerror(err
);
9874 pending_inc
.crush
.clear();
9875 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9876 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
9878 } else if (prefix
== "osd crush add-bucket") {
9879 // os crush add-bucket <name> <type>
9880 string name
, typestr
;
9881 vector
<string
> argvec
;
9882 cmd_getval(cmdmap
, "name", name
);
9883 cmd_getval(cmdmap
, "type", typestr
);
9884 cmd_getval(cmdmap
, "args", argvec
);
9885 map
<string
,string
> loc
;
9886 if (!argvec
.empty()) {
9887 CrushWrapper::parse_loc_map(argvec
, &loc
);
9888 dout(0) << "will create and move bucket '" << name
9889 << "' to location " << loc
<< dendl
;
9892 if (!_have_pending_crush() &&
9893 _get_stable_crush().name_exists(name
)) {
9894 ss
<< "bucket '" << name
<< "' already exists";
9898 CrushWrapper newcrush
;
9899 _get_pending_crush(newcrush
);
9901 if (newcrush
.name_exists(name
)) {
9902 ss
<< "bucket '" << name
<< "' already exists";
9905 int type
= newcrush
.get_type_id(typestr
);
9907 ss
<< "type '" << typestr
<< "' does not exist";
9912 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
9917 err
= newcrush
.add_bucket(0, 0,
9918 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
9921 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
9924 err
= newcrush
.set_item_name(bucketno
, name
);
9926 ss
<< "error setting bucket name to '" << name
<< "'";
9931 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
9933 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
9935 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
9939 ss
<< "no need to move item id " << bucketno
<< " name '" << name
9940 << "' to location " << loc
<< " in crush map";
9944 pending_inc
.crush
.clear();
9945 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9947 ss
<< "added bucket " << name
<< " type " << typestr
9950 ss
<< "added bucket " << name
<< " type " << typestr
9951 << " to location " << loc
;
9954 } else if (prefix
== "osd crush rename-bucket") {
9955 string srcname
, dstname
;
9956 cmd_getval(cmdmap
, "srcname", srcname
);
9957 cmd_getval(cmdmap
, "dstname", dstname
);
9959 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
9960 if (err
== -EALREADY
) // equivalent to success for idempotency
9966 } else if (prefix
== "osd crush weight-set create" ||
9967 prefix
== "osd crush weight-set create-compat") {
9968 CrushWrapper newcrush
;
9969 _get_pending_crush(newcrush
);
9972 if (newcrush
.has_non_straw2_buckets()) {
9973 ss
<< "crush map contains one or more bucket(s) that are not straw2";
9977 if (prefix
== "osd crush weight-set create") {
9978 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
9979 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
9980 ss
<< "require_min_compat_client "
9981 << osdmap
.require_min_compat_client
9982 << " < luminous, which is required for per-pool weight-sets. "
9983 << "Try 'ceph osd set-require-min-compat-client luminous' "
9984 << "before using the new interface";
9988 string poolname
, mode
;
9989 cmd_getval(cmdmap
, "pool", poolname
);
9990 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
9992 ss
<< "pool '" << poolname
<< "' not found";
9996 cmd_getval(cmdmap
, "mode", mode
);
9997 if (mode
!= "flat" && mode
!= "positional") {
9998 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10002 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10004 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10007 if (!newcrush
.create_choose_args(pool
, positions
)) {
10008 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10009 ss
<< "compat weight-set already created";
10011 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10012 << "' already created";
10016 pending_inc
.crush
.clear();
10017 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10020 } else if (prefix
== "osd crush weight-set rm" ||
10021 prefix
== "osd crush weight-set rm-compat") {
10022 CrushWrapper newcrush
;
10023 _get_pending_crush(newcrush
);
10025 if (prefix
== "osd crush weight-set rm") {
10027 cmd_getval(cmdmap
, "pool", poolname
);
10028 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10030 ss
<< "pool '" << poolname
<< "' not found";
10035 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10037 newcrush
.rm_choose_args(pool
);
10038 pending_inc
.crush
.clear();
10039 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10042 } else if (prefix
== "osd crush weight-set reweight" ||
10043 prefix
== "osd crush weight-set reweight-compat") {
10044 string poolname
, item
;
10045 vector
<double> weight
;
10046 cmd_getval(cmdmap
, "pool", poolname
);
10047 cmd_getval(cmdmap
, "item", item
);
10048 cmd_getval(cmdmap
, "weight", weight
);
10049 CrushWrapper newcrush
;
10050 _get_pending_crush(newcrush
);
10052 if (prefix
== "osd crush weight-set reweight") {
10053 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10055 ss
<< "pool '" << poolname
<< "' not found";
10059 if (!newcrush
.have_choose_args(pool
)) {
10060 ss
<< "no weight-set for pool '" << poolname
<< "'";
10064 auto arg_map
= newcrush
.choose_args_get(pool
);
10065 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10066 if (weight
.size() != (size_t)positions
) {
10067 ss
<< "must specify exact " << positions
<< " weight values";
10072 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10073 if (!newcrush
.have_choose_args(pool
)) {
10074 ss
<< "no backward-compatible weight-set";
10079 if (!newcrush
.name_exists(item
)) {
10080 ss
<< "item '" << item
<< "' does not exist";
10084 err
= newcrush
.choose_args_adjust_item_weightf(
10086 newcrush
.choose_args_get(pool
),
10087 newcrush
.get_item_id(item
),
10094 pending_inc
.crush
.clear();
10095 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10097 } else if (osdid_present
&&
10098 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10099 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10100 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10101 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10103 if (!osdmap
.exists(osdid
)) {
10106 << " does not exist. Create it before updating the crush map";
10111 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10112 ss
<< "unable to parse weight value '"
10113 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10119 vector
<string
> argvec
;
10120 cmd_getval(cmdmap
, "args", argvec
);
10121 map
<string
,string
> loc
;
10122 CrushWrapper::parse_loc_map(argvec
, &loc
);
10124 if (prefix
== "osd crush set"
10125 && !_get_stable_crush().item_exists(osdid
)) {
10127 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10128 << "' weight " << weight
<< " at location " << loc
10129 << ": does not exist";
10133 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10134 << osd_name
<< "' weight " << weight
<< " at location "
10136 CrushWrapper newcrush
;
10137 _get_pending_crush(newcrush
);
10140 if (prefix
== "osd crush set" ||
10141 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10143 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10146 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10154 if (err
== 0 && !_have_pending_crush()) {
10155 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10156 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10160 pending_inc
.crush
.clear();
10161 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10162 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10163 << weight
<< " at location " << loc
<< " to crush map";
10165 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10166 get_last_committed() + 1));
10169 } else if (prefix
== "osd crush create-or-move") {
10171 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10172 if (!osdmap
.exists(osdid
)) {
10175 << " does not exist. create it before updating the crush map";
10180 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10181 ss
<< "unable to parse weight value '"
10182 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10188 vector
<string
> argvec
;
10189 cmd_getval(cmdmap
, "args", argvec
);
10190 map
<string
,string
> loc
;
10191 CrushWrapper::parse_loc_map(argvec
, &loc
);
10193 dout(0) << "create-or-move crush item name '" << osd_name
10194 << "' initial_weight " << weight
<< " at location " << loc
10197 CrushWrapper newcrush
;
10198 _get_pending_crush(newcrush
);
10200 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10201 g_conf()->osd_crush_update_weight_set
);
10203 ss
<< "create-or-move updated item name '" << osd_name
10204 << "' weight " << weight
10205 << " at location " << loc
<< " to crush map";
10209 pending_inc
.crush
.clear();
10210 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10211 ss
<< "create-or-move updating item name '" << osd_name
10212 << "' weight " << weight
10213 << " at location " << loc
<< " to crush map";
10215 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10216 get_last_committed() + 1));
10221 } else if (prefix
== "osd crush move") {
10223 // osd crush move <name> <loc1> [<loc2> ...]
10225 vector
<string
> argvec
;
10226 cmd_getval(cmdmap
, "name", name
);
10227 cmd_getval(cmdmap
, "args", argvec
);
10228 map
<string
,string
> loc
;
10229 CrushWrapper::parse_loc_map(argvec
, &loc
);
10231 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10232 CrushWrapper newcrush
;
10233 _get_pending_crush(newcrush
);
10235 if (!newcrush
.name_exists(name
)) {
10237 ss
<< "item " << name
<< " does not exist";
10240 int id
= newcrush
.get_item_id(name
);
10242 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10244 err
= newcrush
.create_or_move_item(
10245 cct
, id
, 0, name
, loc
,
10246 g_conf()->osd_crush_update_weight_set
);
10248 err
= newcrush
.move_bucket(cct
, id
, loc
);
10251 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10252 pending_inc
.crush
.clear();
10253 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10255 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10256 get_last_committed() + 1));
10260 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10264 } else if (prefix
== "osd crush swap-bucket") {
10265 string source
, dest
;
10266 cmd_getval(cmdmap
, "source", source
);
10267 cmd_getval(cmdmap
, "dest", dest
);
10269 bool force
= false;
10270 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10272 CrushWrapper newcrush
;
10273 _get_pending_crush(newcrush
);
10274 if (!newcrush
.name_exists(source
)) {
10275 ss
<< "source item " << source
<< " does not exist";
10279 if (!newcrush
.name_exists(dest
)) {
10280 ss
<< "dest item " << dest
<< " does not exist";
10284 int sid
= newcrush
.get_item_id(source
);
10285 int did
= newcrush
.get_item_id(dest
);
10287 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10288 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10292 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10294 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10295 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10296 << "; pass --yes-i-really-mean-it to proceed anyway";
10300 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10302 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10306 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10307 pending_inc
.crush
.clear();
10308 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10309 wait_for_finished_proposal(op
,
10310 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10311 get_last_committed() + 1));
10313 } else if (prefix
== "osd crush link") {
10314 // osd crush link <name> <loc1> [<loc2> ...]
10316 cmd_getval(cmdmap
, "name", name
);
10317 vector
<string
> argvec
;
10318 cmd_getval(cmdmap
, "args", argvec
);
10319 map
<string
,string
> loc
;
10320 CrushWrapper::parse_loc_map(argvec
, &loc
);
10322 // Need an explicit check for name_exists because get_item_id returns
10324 int id
= osdmap
.crush
->get_item_id(name
);
10325 if (!osdmap
.crush
->name_exists(name
)) {
10327 ss
<< "item " << name
<< " does not exist";
10330 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10332 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10333 ss
<< "no need to move item id " << id
<< " name '" << name
10334 << "' to location " << loc
<< " in crush map";
10339 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10340 CrushWrapper newcrush
;
10341 _get_pending_crush(newcrush
);
10343 if (!newcrush
.name_exists(name
)) {
10345 ss
<< "item " << name
<< " does not exist";
10348 int id
= newcrush
.get_item_id(name
);
10349 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10350 err
= newcrush
.link_bucket(cct
, id
, loc
);
10352 ss
<< "linked item id " << id
<< " name '" << name
10353 << "' to location " << loc
<< " in crush map";
10354 pending_inc
.crush
.clear();
10355 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10357 ss
<< "cannot link item id " << id
<< " name '" << name
10358 << "' to location " << loc
;
10362 ss
<< "no need to move item id " << id
<< " name '" << name
10363 << "' to location " << loc
<< " in crush map";
10367 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10368 get_last_committed() + 1));
10370 } else if (prefix
== "osd crush rm" ||
10371 prefix
== "osd crush remove" ||
10372 prefix
== "osd crush unlink") {
10374 // osd crush rm <id> [ancestor]
10375 CrushWrapper newcrush
;
10376 _get_pending_crush(newcrush
);
10379 cmd_getval(cmdmap
, "name", name
);
10381 if (!osdmap
.crush
->name_exists(name
)) {
10383 ss
<< "device '" << name
<< "' does not appear in the crush map";
10386 if (!newcrush
.name_exists(name
)) {
10388 ss
<< "device '" << name
<< "' does not appear in the crush map";
10390 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10391 get_last_committed() + 1));
10394 int id
= newcrush
.get_item_id(name
);
10397 bool unlink_only
= prefix
== "osd crush unlink";
10398 string ancestor_str
;
10399 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10400 if (!newcrush
.name_exists(ancestor_str
)) {
10402 ss
<< "ancestor item '" << ancestor_str
10403 << "' does not appear in the crush map";
10406 ancestor
= newcrush
.get_item_id(ancestor_str
);
10409 err
= prepare_command_osd_crush_remove(
10412 (ancestor
< 0), unlink_only
);
10414 if (err
== -ENOENT
) {
10415 ss
<< "item " << id
<< " does not appear in that position";
10421 pending_inc
.new_crush_node_flags
[id
] = 0;
10422 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10424 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10425 get_last_committed() + 1));
10430 } else if (prefix
== "osd crush reweight-all") {
10431 CrushWrapper newcrush
;
10432 _get_pending_crush(newcrush
);
10434 newcrush
.reweight(cct
);
10435 pending_inc
.crush
.clear();
10436 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10437 ss
<< "reweighted crush hierarchy";
10439 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10440 get_last_committed() + 1));
10442 } else if (prefix
== "osd crush reweight") {
10443 // osd crush reweight <name> <weight>
10444 CrushWrapper newcrush
;
10445 _get_pending_crush(newcrush
);
10448 cmd_getval(cmdmap
, "name", name
);
10449 if (!newcrush
.name_exists(name
)) {
10451 ss
<< "device '" << name
<< "' does not appear in the crush map";
10455 int id
= newcrush
.get_item_id(name
);
10457 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
10462 if (!cmd_getval(cmdmap
, "weight", w
)) {
10463 ss
<< "unable to parse weight value '"
10464 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10469 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
10470 g_conf()->osd_crush_update_weight_set
);
10473 pending_inc
.crush
.clear();
10474 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10475 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
10476 << " in crush map";
10478 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10479 get_last_committed() + 1));
10481 } else if (prefix
== "osd crush reweight-subtree") {
10482 // osd crush reweight <name> <weight>
10483 CrushWrapper newcrush
;
10484 _get_pending_crush(newcrush
);
10487 cmd_getval(cmdmap
, "name", name
);
10488 if (!newcrush
.name_exists(name
)) {
10490 ss
<< "device '" << name
<< "' does not appear in the crush map";
10494 int id
= newcrush
.get_item_id(name
);
10496 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
10501 if (!cmd_getval(cmdmap
, "weight", w
)) {
10502 ss
<< "unable to parse weight value '"
10503 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10508 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
10509 g_conf()->osd_crush_update_weight_set
);
10512 pending_inc
.crush
.clear();
10513 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10514 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
10515 << " in crush map";
10517 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10518 get_last_committed() + 1));
10520 } else if (prefix
== "osd crush tunables") {
10521 CrushWrapper newcrush
;
10522 _get_pending_crush(newcrush
);
10526 cmd_getval(cmdmap
, "profile", profile
);
10527 if (profile
== "legacy" || profile
== "argonaut") {
10528 newcrush
.set_tunables_legacy();
10529 } else if (profile
== "bobtail") {
10530 newcrush
.set_tunables_bobtail();
10531 } else if (profile
== "firefly") {
10532 newcrush
.set_tunables_firefly();
10533 } else if (profile
== "hammer") {
10534 newcrush
.set_tunables_hammer();
10535 } else if (profile
== "jewel") {
10536 newcrush
.set_tunables_jewel();
10537 } else if (profile
== "optimal") {
10538 newcrush
.set_tunables_optimal();
10539 } else if (profile
== "default") {
10540 newcrush
.set_tunables_default();
10542 ss
<< "unrecognized profile '" << profile
<< "'";
10547 if (!validate_crush_against_features(&newcrush
, ss
)) {
10552 pending_inc
.crush
.clear();
10553 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10554 ss
<< "adjusted tunables profile to " << profile
;
10556 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10557 get_last_committed() + 1));
10559 } else if (prefix
== "osd crush set-tunable") {
10560 CrushWrapper newcrush
;
10561 _get_pending_crush(newcrush
);
10565 cmd_getval(cmdmap
, "tunable", tunable
);
10567 int64_t value
= -1;
10568 if (!cmd_getval(cmdmap
, "value", value
)) {
10570 ss
<< "failed to parse integer value "
10571 << cmd_vartype_stringify(cmdmap
.at("value"));
10575 if (tunable
== "straw_calc_version") {
10576 if (value
!= 0 && value
!= 1) {
10577 ss
<< "value must be 0 or 1; got " << value
;
10581 newcrush
.set_straw_calc_version(value
);
10583 ss
<< "unrecognized tunable '" << tunable
<< "'";
10588 if (!validate_crush_against_features(&newcrush
, ss
)) {
10593 pending_inc
.crush
.clear();
10594 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10595 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
10597 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10598 get_last_committed() + 1));
10601 } else if (prefix
== "osd crush rule create-simple") {
10602 string name
, root
, type
, mode
;
10603 cmd_getval(cmdmap
, "name", name
);
10604 cmd_getval(cmdmap
, "root", root
);
10605 cmd_getval(cmdmap
, "type", type
);
10606 cmd_getval(cmdmap
, "mode", mode
);
10610 if (osdmap
.crush
->rule_exists(name
)) {
10611 // The name is uniquely associated to a ruleid and the rule it contains
10612 // From the user point of view, the rule is more meaningfull.
10613 ss
<< "rule " << name
<< " already exists";
10618 CrushWrapper newcrush
;
10619 _get_pending_crush(newcrush
);
10621 if (newcrush
.rule_exists(name
)) {
10622 // The name is uniquely associated to a ruleid and the rule it contains
10623 // From the user point of view, the rule is more meaningfull.
10624 ss
<< "rule " << name
<< " already exists";
10627 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
10628 pg_pool_t::TYPE_REPLICATED
, &ss
);
10634 pending_inc
.crush
.clear();
10635 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10638 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10639 get_last_committed() + 1));
10642 } else if (prefix
== "osd crush rule create-replicated") {
10643 string name
, root
, type
, device_class
;
10644 cmd_getval(cmdmap
, "name", name
);
10645 cmd_getval(cmdmap
, "root", root
);
10646 cmd_getval(cmdmap
, "type", type
);
10647 cmd_getval(cmdmap
, "class", device_class
);
10649 if (osdmap
.crush
->rule_exists(name
)) {
10650 // The name is uniquely associated to a ruleid and the rule it contains
10651 // From the user point of view, the rule is more meaningfull.
10652 ss
<< "rule " << name
<< " already exists";
10657 CrushWrapper newcrush
;
10658 _get_pending_crush(newcrush
);
10660 if (newcrush
.rule_exists(name
)) {
10661 // The name is uniquely associated to a ruleid and the rule it contains
10662 // From the user point of view, the rule is more meaningfull.
10663 ss
<< "rule " << name
<< " already exists";
10666 int ruleno
= newcrush
.add_simple_rule(
10667 name
, root
, type
, device_class
,
10668 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
10674 pending_inc
.crush
.clear();
10675 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10678 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10679 get_last_committed() + 1));
10682 } else if (prefix
== "osd erasure-code-profile rm") {
10684 cmd_getval(cmdmap
, "name", name
);
10686 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
10689 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
10694 if (osdmap
.has_erasure_code_profile(name
) ||
10695 pending_inc
.new_erasure_code_profiles
.count(name
)) {
10696 if (osdmap
.has_erasure_code_profile(name
)) {
10697 pending_inc
.old_erasure_code_profiles
.push_back(name
);
10699 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
10700 pending_inc
.new_erasure_code_profiles
.erase(name
);
10704 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10705 get_last_committed() + 1));
10708 ss
<< "erasure-code-profile " << name
<< " does not exist";
10713 } else if (prefix
== "osd erasure-code-profile set") {
10715 cmd_getval(cmdmap
, "name", name
);
10716 vector
<string
> profile
;
10717 cmd_getval(cmdmap
, "profile", profile
);
10719 bool force
= false;
10720 cmd_getval(cmdmap
, "force", force
);
10722 map
<string
,string
> profile_map
;
10723 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
10726 if (profile_map
.find("plugin") == profile_map
.end()) {
10727 ss
<< "erasure-code-profile " << profile_map
10728 << " must contain a plugin entry" << std::endl
;
10732 string plugin
= profile_map
["plugin"];
10734 if (pending_inc
.has_erasure_code_profile(name
)) {
10735 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
10738 err
= normalize_profile(name
, profile_map
, force
, &ss
);
10742 if (osdmap
.has_erasure_code_profile(name
)) {
10743 ErasureCodeProfile existing_profile_map
=
10744 osdmap
.get_erasure_code_profile(name
);
10745 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
10749 if (existing_profile_map
== profile_map
) {
10755 ss
<< "will not override erasure code profile " << name
10756 << " because the existing profile "
10757 << existing_profile_map
10758 << " is different from the proposed profile "
10764 dout(20) << "erasure code profile set " << name
<< "="
10765 << profile_map
<< dendl
;
10766 pending_inc
.set_erasure_code_profile(name
, profile_map
);
10770 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10771 get_last_committed() + 1));
10774 } else if (prefix
== "osd crush rule create-erasure") {
10775 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
10776 if (err
== -EAGAIN
)
10780 string name
, poolstr
;
10781 cmd_getval(cmdmap
, "name", name
);
10783 cmd_getval(cmdmap
, "profile", profile
);
10785 profile
= "default";
10786 if (profile
== "default") {
10787 if (!osdmap
.has_erasure_code_profile(profile
)) {
10788 if (pending_inc
.has_erasure_code_profile(profile
)) {
10789 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
10793 map
<string
,string
> profile_map
;
10794 err
= osdmap
.get_erasure_code_profile_default(cct
,
10799 err
= normalize_profile(name
, profile_map
, true, &ss
);
10802 dout(20) << "erasure code profile set " << profile
<< "="
10803 << profile_map
<< dendl
;
10804 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
10810 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
10813 case -EEXIST
: // return immediately
10814 ss
<< "rule " << name
<< " already exists";
10818 case -EALREADY
: // wait for pending to be proposed
10819 ss
<< "rule " << name
<< " already exists";
10822 default: // non recoverable error
10827 ss
<< "created rule " << name
<< " at " << rule
;
10831 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10832 get_last_committed() + 1));
10835 } else if (prefix
== "osd crush rule rm") {
10837 cmd_getval(cmdmap
, "name", name
);
10839 if (!osdmap
.crush
->rule_exists(name
)) {
10840 ss
<< "rule " << name
<< " does not exist";
10845 CrushWrapper newcrush
;
10846 _get_pending_crush(newcrush
);
10848 if (!newcrush
.rule_exists(name
)) {
10849 ss
<< "rule " << name
<< " does not exist";
10852 int ruleno
= newcrush
.get_rule_id(name
);
10853 ceph_assert(ruleno
>= 0);
10855 // make sure it is not in use.
10856 // FIXME: this is ok in some situations, but let's not bother with that
10858 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
10859 if (osdmap
.crush_rule_in_use(ruleset
)) {
10860 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
10865 err
= newcrush
.remove_rule(ruleno
);
10870 pending_inc
.crush
.clear();
10871 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10874 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10875 get_last_committed() + 1));
10878 } else if (prefix
== "osd crush rule rename") {
10881 cmd_getval(cmdmap
, "srcname", srcname
);
10882 cmd_getval(cmdmap
, "dstname", dstname
);
10883 if (srcname
.empty() || dstname
.empty()) {
10884 ss
<< "must specify both source rule name and destination rule name";
10888 if (srcname
== dstname
) {
10889 ss
<< "destination rule name is equal to source rule name";
10894 CrushWrapper newcrush
;
10895 _get_pending_crush(newcrush
);
10896 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
10897 // srcname does not exist and dstname already exists
10898 // suppose this is a replay and return success
10899 // (so this command is idempotent)
10900 ss
<< "already renamed to '" << dstname
<< "'";
10905 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
10907 // ss has reason for failure
10910 pending_inc
.crush
.clear();
10911 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10913 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10914 get_last_committed() + 1));
10917 } else if (prefix
== "osd setmaxosd") {
10919 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
10920 ss
<< "unable to parse 'newmax' value '"
10921 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
10926 if (newmax
> g_conf()->mon_max_osd
) {
10928 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
10929 << g_conf()->mon_max_osd
<< ")";
10933 // Don't allow shrinking OSD number as this will cause data loss
10934 // and may cause kernel crashes.
10935 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10936 if (newmax
< osdmap
.get_max_osd()) {
10937 // Check if the OSDs exist between current max and new value.
10938 // If there are any OSDs exist, then don't allow shrinking number
10940 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
10941 if (osdmap
.exists(i
)) {
10943 ss
<< "cannot shrink max_osd to " << newmax
10944 << " because osd." << i
<< " (and possibly others) still in use";
10950 pending_inc
.new_max_osd
= newmax
;
10951 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
10953 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10954 get_last_committed() + 1));
10957 } else if (prefix
== "osd set-full-ratio" ||
10958 prefix
== "osd set-backfillfull-ratio" ||
10959 prefix
== "osd set-nearfull-ratio") {
10961 if (!cmd_getval(cmdmap
, "ratio", n
)) {
10962 ss
<< "unable to parse 'ratio' value '"
10963 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
10967 if (prefix
== "osd set-full-ratio")
10968 pending_inc
.new_full_ratio
= n
;
10969 else if (prefix
== "osd set-backfillfull-ratio")
10970 pending_inc
.new_backfillfull_ratio
= n
;
10971 else if (prefix
== "osd set-nearfull-ratio")
10972 pending_inc
.new_nearfull_ratio
= n
;
10973 ss
<< prefix
<< " " << n
;
10975 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10976 get_last_committed() + 1));
10978 } else if (prefix
== "osd set-require-min-compat-client") {
10980 cmd_getval(cmdmap
, "version", v
);
10981 ceph_release_t vno
= ceph_release_from_name(v
);
10983 ss
<< "version " << v
<< " is not recognized";
10988 newmap
.deepish_copy_from(osdmap
);
10989 newmap
.apply_incremental(pending_inc
);
10990 newmap
.require_min_compat_client
= vno
;
10991 auto mvno
= newmap
.get_min_compat_client();
10993 ss
<< "osdmap current utilizes features that require " << mvno
10994 << "; cannot set require_min_compat_client below that to " << vno
;
10999 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11002 mon
->get_combined_feature_map(&m
);
11003 uint64_t features
= ceph_release_features(ceph::to_integer
<int>(vno
));
11007 CEPH_ENTITY_TYPE_CLIENT
,
11008 CEPH_ENTITY_TYPE_MDS
,
11009 CEPH_ENTITY_TYPE_MGR
}) {
11010 auto p
= m
.m
.find(type
);
11011 if (p
== m
.m
.end()) {
11014 for (auto& q
: p
->second
) {
11015 uint64_t missing
= ~q
.first
& features
;
11018 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11023 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11024 << "(s) look like " << ceph_release_name(
11025 ceph_release_from_features(q
.first
))
11026 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11032 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11037 ss
<< "set require_min_compat_client to " << vno
;
11038 pending_inc
.new_require_min_compat_client
= vno
;
11040 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11041 get_last_committed() + 1));
11043 } else if (prefix
== "osd pause") {
11044 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11046 } else if (prefix
== "osd unpause") {
11047 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11049 } else if (prefix
== "osd set") {
11051 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11054 cmd_getval(cmdmap
, "key", key
);
11055 if (key
== "pause")
11056 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11057 else if (key
== "noup")
11058 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11059 else if (key
== "nodown")
11060 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11061 else if (key
== "noout")
11062 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11063 else if (key
== "noin")
11064 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11065 else if (key
== "nobackfill")
11066 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11067 else if (key
== "norebalance")
11068 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11069 else if (key
== "norecover")
11070 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11071 else if (key
== "noscrub")
11072 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11073 else if (key
== "nodeep-scrub")
11074 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11075 else if (key
== "notieragent")
11076 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11077 else if (key
== "nosnaptrim")
11078 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11079 else if (key
== "pglog_hardlimit") {
11080 if (!osdmap
.get_num_up_osds() && !sure
) {
11081 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11082 << "--yes-i-really-mean-it if you really wish to continue.";
11086 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11087 // we are reusing a jewel feature bit that was retired in luminous.
11088 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11089 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11091 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11093 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11098 ss
<< "unrecognized flag '" << key
<< "'";
11102 } else if (prefix
== "osd unset") {
11104 cmd_getval(cmdmap
, "key", key
);
11105 if (key
== "pause")
11106 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11107 else if (key
== "noup")
11108 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11109 else if (key
== "nodown")
11110 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11111 else if (key
== "noout")
11112 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11113 else if (key
== "noin")
11114 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11115 else if (key
== "nobackfill")
11116 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11117 else if (key
== "norebalance")
11118 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11119 else if (key
== "norecover")
11120 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11121 else if (key
== "noscrub")
11122 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11123 else if (key
== "nodeep-scrub")
11124 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11125 else if (key
== "notieragent")
11126 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11127 else if (key
== "nosnaptrim")
11128 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11130 ss
<< "unrecognized flag '" << key
<< "'";
11134 } else if (prefix
== "osd require-osd-release") {
11136 cmd_getval(cmdmap
, "release", release
);
11138 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11139 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11141 ss
<< "unrecognized release " << release
;
11145 if (rel
== osdmap
.require_osd_release
) {
11150 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
11151 if (!osdmap
.get_num_up_osds() && !sure
) {
11152 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11153 << "--yes-i-really-mean-it if you really wish to continue.";
11157 if (rel
== ceph_release_t::mimic
) {
11158 if (!mon
->monmap
->get_required_features().contains_all(
11159 ceph::features::mon::FEATURE_MIMIC
)) {
11160 ss
<< "not all mons are mimic";
11164 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_MIMIC
))
11166 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11170 } else if (rel
== ceph_release_t::nautilus
) {
11171 if (!mon
->monmap
->get_required_features().contains_all(
11172 ceph::features::mon::FEATURE_NAUTILUS
)) {
11173 ss
<< "not all mons are nautilus";
11177 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_NAUTILUS
))
11179 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
11183 } else if (rel
== ceph_release_t::octopus
) {
11184 if (!mon
->monmap
->get_required_features().contains_all(
11185 ceph::features::mon::FEATURE_OCTOPUS
)) {
11186 ss
<< "not all mons are octopus";
11190 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_OCTOPUS
))
11192 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11197 ss
<< "not supported for this release yet";
11201 if (rel
< osdmap
.require_osd_release
) {
11202 ss
<< "require_osd_release cannot be lowered once it has been set";
11206 pending_inc
.new_require_osd_release
= rel
;
11208 } else if (prefix
== "osd down" ||
11209 prefix
== "osd out" ||
11210 prefix
== "osd in" ||
11211 prefix
== "osd rm" ||
11212 prefix
== "osd stop") {
11216 bool verbose
= true;
11217 bool definitely_dead
= false;
11219 vector
<string
> idvec
;
11220 cmd_getval(cmdmap
, "ids", idvec
);
11221 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11222 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11223 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11228 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11229 if (prefix
== "osd in") {
11230 // touch out osds only
11231 osdmap
.get_out_existing_osds(osds
);
11233 osdmap
.get_all_osds(osds
);
11236 verbose
= false; // so the output is less noisy.
11238 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11240 ss
<< "invalid osd id" << osd
;
11243 } else if (!osdmap
.exists(osd
)) {
11244 ss
<< "osd." << osd
<< " does not exist. ";
11251 for (auto &osd
: osds
) {
11252 if (prefix
== "osd down") {
11253 if (osdmap
.is_down(osd
)) {
11255 ss
<< "osd." << osd
<< " is already down. ";
11257 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11258 ss
<< "marked down osd." << osd
<< ". ";
11261 if (definitely_dead
) {
11262 if (!pending_inc
.new_xinfo
.count(osd
)) {
11263 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11265 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11268 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11270 } else if (prefix
== "osd out") {
11271 if (osdmap
.is_out(osd
)) {
11273 ss
<< "osd." << osd
<< " is already out. ";
11275 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11276 if (osdmap
.osd_weight
[osd
]) {
11277 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11278 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11280 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11282 ss
<< "marked out osd." << osd
<< ". ";
11283 std::ostringstream msg
;
11284 msg
<< "Client " << op
->get_session()->entity_name
11285 << " marked osd." << osd
<< " out";
11286 if (osdmap
.is_up(osd
)) {
11287 msg
<< ", while it was still marked up";
11289 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11290 msg
<< ", after it was down for " << int(period
.sec())
11294 mon
->clog
->info() << msg
.str();
11297 } else if (prefix
== "osd in") {
11298 if (osdmap
.is_in(osd
)) {
11300 ss
<< "osd." << osd
<< " is already in. ";
11302 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11303 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11304 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11305 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11307 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11309 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11311 ss
<< "marked in osd." << osd
<< ". ";
11314 } else if (prefix
== "osd rm") {
11315 err
= prepare_command_osd_remove(osd
);
11317 if (err
== -EBUSY
) {
11320 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11322 ceph_assert(err
== 0);
11324 ss
<< ", osd." << osd
;
11326 ss
<< "removed osd." << osd
;
11330 } else if (prefix
== "osd stop") {
11331 if (osdmap
.is_stop(osd
)) {
11333 ss
<< "osd." << osd
<< " is already stopped. ";
11334 } else if (osdmap
.is_down(osd
)) {
11335 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11336 ss
<< "stop down osd." << osd
<< ". ";
11339 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11340 ss
<< "stop osd." << osd
<< ". ";
11348 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11349 get_last_committed() + 1));
11352 } else if (prefix
== "osd set-group" ||
11353 prefix
== "osd unset-group" ||
11354 prefix
== "osd add-noup" ||
11355 prefix
== "osd add-nodown" ||
11356 prefix
== "osd add-noin" ||
11357 prefix
== "osd add-noout" ||
11358 prefix
== "osd rm-noup" ||
11359 prefix
== "osd rm-nodown" ||
11360 prefix
== "osd rm-noin" ||
11361 prefix
== "osd rm-noout") {
11362 bool do_set
= prefix
== "osd set-group" ||
11363 prefix
.find("add") != string::npos
;
11365 unsigned flags
= 0;
11366 vector
<string
> who
;
11367 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11368 cmd_getval(cmdmap
, "flags", flag_str
);
11369 cmd_getval(cmdmap
, "who", who
);
11370 vector
<string
> raw_flags
;
11371 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11372 for (auto& f
: raw_flags
) {
11374 flags
|= CEPH_OSD_NOUP
;
11375 else if (f
== "nodown")
11376 flags
|= CEPH_OSD_NODOWN
;
11377 else if (f
== "noin")
11378 flags
|= CEPH_OSD_NOIN
;
11379 else if (f
== "noout")
11380 flags
|= CEPH_OSD_NOOUT
;
11382 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11383 << "{noup,nodown,noin,noout}";
11389 cmd_getval(cmdmap
, "ids", who
);
11390 if (prefix
.find("noup") != string::npos
)
11391 flags
= CEPH_OSD_NOUP
;
11392 else if (prefix
.find("nodown") != string::npos
)
11393 flags
= CEPH_OSD_NODOWN
;
11394 else if (prefix
.find("noin") != string::npos
)
11395 flags
= CEPH_OSD_NOIN
;
11396 else if (prefix
.find("noout") != string::npos
)
11397 flags
= CEPH_OSD_NOOUT
;
11399 ceph_assert(0 == "Unreachable!");
11402 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11407 ss
<< "must specify at least one or more targets to set/unset";
11412 set
<int> crush_nodes
;
11413 set
<int> device_classes
;
11414 for (auto& w
: who
) {
11415 if (w
== "any" || w
== "all" || w
== "*") {
11416 osdmap
.get_all_osds(osds
);
11419 std::stringstream ts
;
11420 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11422 } else if (osdmap
.crush
->name_exists(w
)) {
11423 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11424 } else if (osdmap
.crush
->class_exists(w
)) {
11425 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11427 ss
<< "unable to parse osd id or crush node or device class: "
11428 << "\"" << w
<< "\". ";
11431 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11432 // ss has reason for failure
11437 for (auto osd
: osds
) {
11438 if (!osdmap
.exists(osd
)) {
11439 ss
<< "osd." << osd
<< " does not exist. ";
11443 if (flags
& CEPH_OSD_NOUP
) {
11444 any
|= osdmap
.is_noup_by_osd(osd
) ?
11445 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
11446 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
11448 if (flags
& CEPH_OSD_NODOWN
) {
11449 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11450 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
11451 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
11453 if (flags
& CEPH_OSD_NOIN
) {
11454 any
|= osdmap
.is_noin_by_osd(osd
) ?
11455 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
11456 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
11458 if (flags
& CEPH_OSD_NOOUT
) {
11459 any
|= osdmap
.is_noout_by_osd(osd
) ?
11460 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
11461 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
11464 if (flags
& CEPH_OSD_NOUP
) {
11465 any
|= osdmap
.is_noup_by_osd(osd
) ?
11466 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
11467 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
11469 if (flags
& CEPH_OSD_NODOWN
) {
11470 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11471 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
11472 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
11474 if (flags
& CEPH_OSD_NOIN
) {
11475 any
|= osdmap
.is_noin_by_osd(osd
) ?
11476 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
11477 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
11479 if (flags
& CEPH_OSD_NOOUT
) {
11480 any
|= osdmap
.is_noout_by_osd(osd
) ?
11481 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
11482 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
11486 for (auto& id
: crush_nodes
) {
11487 auto old_flags
= osdmap
.get_crush_node_flags(id
);
11488 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
11489 pending_flags
|= old_flags
; // adopt existing flags first!
11491 pending_flags
|= flags
;
11493 pending_flags
&= ~flags
;
11497 for (auto& id
: device_classes
) {
11498 auto old_flags
= osdmap
.get_device_class_flags(id
);
11499 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
11500 pending_flags
|= old_flags
;
11502 pending_flags
|= flags
;
11504 pending_flags
&= ~flags
;
11510 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11511 get_last_committed() + 1));
11514 } else if (prefix
== "osd pg-temp") {
11516 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11517 ss
<< "unable to parse 'pgid' value '"
11518 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11523 if (!pgid
.parse(pgidstr
.c_str())) {
11524 ss
<< "invalid pgid '" << pgidstr
<< "'";
11528 if (!osdmap
.pg_exists(pgid
)) {
11529 ss
<< "pg " << pgid
<< " does not exist";
11533 if (pending_inc
.new_pg_temp
.count(pgid
)) {
11534 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
11535 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11539 vector
<int64_t> id_vec
;
11540 vector
<int32_t> new_pg_temp
;
11541 cmd_getval(cmdmap
, "id", id_vec
);
11542 if (id_vec
.empty()) {
11543 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
11544 ss
<< "done cleaning up pg_temp of " << pgid
;
11547 for (auto osd
: id_vec
) {
11548 if (!osdmap
.exists(osd
)) {
11549 ss
<< "osd." << osd
<< " does not exist";
11553 new_pg_temp
.push_back(osd
);
11556 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11557 if ((int)new_pg_temp
.size() < pool_min_size
) {
11558 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
11559 << pool_min_size
<< ")";
11564 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11565 if ((int)new_pg_temp
.size() > pool_size
) {
11566 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
11567 << pool_size
<< ")";
11572 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
11573 new_pg_temp
.begin(), new_pg_temp
.end());
11574 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
11576 } else if (prefix
== "osd primary-temp") {
11578 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11579 ss
<< "unable to parse 'pgid' value '"
11580 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11585 if (!pgid
.parse(pgidstr
.c_str())) {
11586 ss
<< "invalid pgid '" << pgidstr
<< "'";
11590 if (!osdmap
.pg_exists(pgid
)) {
11591 ss
<< "pg " << pgid
<< " does not exist";
11597 if (!cmd_getval(cmdmap
, "id", osd
)) {
11598 ss
<< "unable to parse 'id' value '"
11599 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11603 if (osd
!= -1 && !osdmap
.exists(osd
)) {
11604 ss
<< "osd." << osd
<< " does not exist";
11609 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
11610 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
11611 ss
<< "require_min_compat_client "
11612 << osdmap
.require_min_compat_client
11613 << " < firefly, which is required for primary-temp";
11618 pending_inc
.new_primary_temp
[pgid
] = osd
;
11619 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
11621 } else if (prefix
== "pg repeer") {
11624 cmd_getval(cmdmap
, "pgid", pgidstr
);
11625 if (!pgid
.parse(pgidstr
.c_str())) {
11626 ss
<< "invalid pgid '" << pgidstr
<< "'";
11630 if (!osdmap
.pg_exists(pgid
)) {
11631 ss
<< "pg '" << pgidstr
<< "' does not exist";
11635 vector
<int> acting
;
11637 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
11640 ss
<< "pg currently has no primary";
11643 if (acting
.size() > 1) {
11644 // map to just primary; it will map back to what it wants
11645 pending_inc
.new_pg_temp
[pgid
] = { primary
};
11647 // hmm, pick another arbitrary osd to induce a change. Note
11648 // that this won't work if there is only one suitable OSD in the cluster.
11651 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
11652 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
11655 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
11661 ss
<< "not enough up OSDs in the cluster to force repeer";
11666 } else if (prefix
== "osd pg-upmap" ||
11667 prefix
== "osd rm-pg-upmap" ||
11668 prefix
== "osd pg-upmap-items" ||
11669 prefix
== "osd rm-pg-upmap-items") {
11670 if (osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
11671 ss
<< "min_compat_client "
11672 << osdmap
.require_min_compat_client
11673 << " < luminous, which is required for pg-upmap. "
11674 << "Try 'ceph osd set-require-min-compat-client luminous' "
11675 << "before using the new interface";
11679 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
11680 if (err
== -EAGAIN
)
11685 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11686 ss
<< "unable to parse 'pgid' value '"
11687 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11692 if (!pgid
.parse(pgidstr
.c_str())) {
11693 ss
<< "invalid pgid '" << pgidstr
<< "'";
11697 if (!osdmap
.pg_exists(pgid
)) {
11698 ss
<< "pg " << pgid
<< " does not exist";
11702 if (pending_inc
.old_pools
.count(pgid
.pool())) {
11703 ss
<< "pool of " << pgid
<< " is pending removal";
11706 wait_for_finished_proposal(op
,
11707 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
11715 OP_RM_PG_UPMAP_ITEMS
,
11718 if (prefix
== "osd pg-upmap") {
11719 option
= OP_PG_UPMAP
;
11720 } else if (prefix
== "osd rm-pg-upmap") {
11721 option
= OP_RM_PG_UPMAP
;
11722 } else if (prefix
== "osd pg-upmap-items") {
11723 option
= OP_PG_UPMAP_ITEMS
;
11725 option
= OP_RM_PG_UPMAP_ITEMS
;
11728 // check pending upmap changes
11730 case OP_PG_UPMAP
: // fall through
11731 case OP_RM_PG_UPMAP
:
11732 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
11733 pending_inc
.old_pg_upmap
.count(pgid
)) {
11734 dout(10) << __func__
<< " waiting for pending update on "
11736 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11741 case OP_PG_UPMAP_ITEMS
: // fall through
11742 case OP_RM_PG_UPMAP_ITEMS
:
11743 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
11744 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
11745 dout(10) << __func__
<< " waiting for pending update on "
11747 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11753 ceph_abort_msg("invalid option");
11759 vector
<int64_t> id_vec
;
11760 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
11761 ss
<< "unable to parse 'id' value(s) '"
11762 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11767 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11768 if ((int)id_vec
.size() < pool_min_size
) {
11769 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
11770 << pool_min_size
<< ")";
11775 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11776 if ((int)id_vec
.size() > pool_size
) {
11777 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
11778 << pool_size
<< ")";
11783 vector
<int32_t> new_pg_upmap
;
11784 for (auto osd
: id_vec
) {
11785 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
11786 ss
<< "osd." << osd
<< " does not exist";
11790 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
11791 if (it
!= new_pg_upmap
.end()) {
11792 ss
<< "osd." << osd
<< " already exists, ";
11795 new_pg_upmap
.push_back(osd
);
11798 if (new_pg_upmap
.empty()) {
11799 ss
<< "no valid upmap items(pairs) is specified";
11804 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
11805 new_pg_upmap
.begin(), new_pg_upmap
.end());
11806 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
11810 case OP_RM_PG_UPMAP
:
11812 pending_inc
.old_pg_upmap
.insert(pgid
);
11813 ss
<< "clear " << pgid
<< " pg_upmap mapping";
11817 case OP_PG_UPMAP_ITEMS
:
11819 vector
<int64_t> id_vec
;
11820 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
11821 ss
<< "unable to parse 'id' value(s) '"
11822 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11827 if (id_vec
.size() % 2) {
11828 ss
<< "you must specify pairs of osd ids to be remapped";
11833 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11834 if ((int)(id_vec
.size() / 2) > pool_size
) {
11835 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
11836 << pool_size
<< ")";
11841 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
11842 ostringstream items
;
11844 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
11848 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
11851 if (!osdmap
.exists(from
)) {
11852 ss
<< "osd." << from
<< " does not exist";
11856 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
11857 ss
<< "osd." << to
<< " does not exist";
11861 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
11862 auto it
= std::find(new_pg_upmap_items
.begin(),
11863 new_pg_upmap_items
.end(), entry
);
11864 if (it
!= new_pg_upmap_items
.end()) {
11865 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
11868 new_pg_upmap_items
.push_back(entry
);
11869 items
<< from
<< "->" << to
<< ",";
11871 string
out(items
.str());
11872 out
.resize(out
.size() - 1); // drop last ','
11875 if (new_pg_upmap_items
.empty()) {
11876 ss
<< "no valid upmap items(pairs) is specified";
11881 pending_inc
.new_pg_upmap_items
[pgid
] =
11882 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
11883 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
11884 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
11888 case OP_RM_PG_UPMAP_ITEMS
:
11890 pending_inc
.old_pg_upmap_items
.insert(pgid
);
11891 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
11896 ceph_abort_msg("invalid option");
11900 } else if (prefix
== "osd primary-affinity") {
11902 if (!cmd_getval(cmdmap
, "id", id
)) {
11903 ss
<< "invalid osd id value '"
11904 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11909 if (!cmd_getval(cmdmap
, "weight", w
)) {
11910 ss
<< "unable to parse 'weight' value '"
11911 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11915 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
11917 ss
<< "weight must be >= 0";
11921 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
11922 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
11923 ss
<< "require_min_compat_client "
11924 << osdmap
.require_min_compat_client
11925 << " < firefly, which is required for primary-affinity";
11929 if (osdmap
.exists(id
)) {
11930 pending_inc
.new_primary_affinity
[id
] = ww
;
11931 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
11933 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11934 get_last_committed() + 1));
11937 ss
<< "osd." << id
<< " does not exist";
11941 } else if (prefix
== "osd reweight") {
11943 if (!cmd_getval(cmdmap
, "id", id
)) {
11944 ss
<< "unable to parse osd id value '"
11945 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11950 if (!cmd_getval(cmdmap
, "weight", w
)) {
11951 ss
<< "unable to parse weight value '"
11952 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11956 long ww
= (int)((double)CEPH_OSD_IN
*w
);
11958 ss
<< "weight must be >= 0";
11962 if (osdmap
.exists(id
)) {
11963 pending_inc
.new_weight
[id
] = ww
;
11964 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
11966 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11967 get_last_committed() + 1));
11970 ss
<< "osd." << id
<< " does not exist";
11974 } else if (prefix
== "osd reweightn") {
11975 map
<int32_t, uint32_t> weights
;
11976 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
11978 ss
<< "unable to parse 'weights' value '"
11979 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
11982 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
11983 wait_for_finished_proposal(
11985 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
11987 } else if (prefix
== "osd lost") {
11989 if (!cmd_getval(cmdmap
, "id", id
)) {
11990 ss
<< "unable to parse osd id value '"
11991 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11996 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11998 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
11999 "--yes-i-really-mean-it if you really do.";
12002 } else if (!osdmap
.exists(id
)) {
12003 ss
<< "osd." << id
<< " does not exist";
12006 } else if (!osdmap
.is_down(id
)) {
12007 ss
<< "osd." << id
<< " is not down";
12011 epoch_t e
= osdmap
.get_info(id
).down_at
;
12012 pending_inc
.new_lost
[id
] = e
;
12013 ss
<< "marked osd lost in epoch " << e
;
12015 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12016 get_last_committed() + 1));
12020 } else if (prefix
== "osd destroy-actual" ||
12021 prefix
== "osd purge-actual" ||
12022 prefix
== "osd purge-new") {
12023 /* Destroying an OSD means that we don't expect to further make use of
12024 * the OSDs data (which may even become unreadable after this operation),
12025 * and that we are okay with scrubbing all its cephx keys and config-key
12026 * data (which may include lockbox keys, thus rendering the osd's data
12029 * The OSD will not be removed. Instead, we will mark it as destroyed,
12030 * such that a subsequent call to `create` will not reuse the osd id.
12031 * This will play into being able to recreate the OSD, at the same
12032 * crush location, with minimal data movement.
12035 // make sure authmon is writeable.
12036 if (!mon
->authmon()->is_writeable()) {
12037 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12038 << "osd destroy" << dendl
;
12039 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12044 if (!cmd_getval(cmdmap
, "id", id
)) {
12045 auto p
= cmdmap
.find("id");
12046 if (p
== cmdmap
.end()) {
12047 ss
<< "no osd id specified";
12049 ss
<< "unable to parse osd id value '"
12050 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12056 bool is_destroy
= (prefix
== "osd destroy-actual");
12058 ceph_assert("osd purge-actual" == prefix
||
12059 "osd purge-new" == prefix
);
12063 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12065 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12066 << "This will mean real, permanent data loss, as well "
12067 << "as deletion of cephx and lockbox keys. "
12068 << "Pass --yes-i-really-mean-it if you really do.";
12071 } else if (!osdmap
.exists(id
)) {
12072 ss
<< "osd." << id
<< " does not exist";
12073 err
= 0; // idempotent
12075 } else if (osdmap
.is_up(id
)) {
12076 ss
<< "osd." << id
<< " is not `down`.";
12079 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12080 ss
<< "destroyed osd." << id
;
12085 if (prefix
== "osd purge-new" &&
12086 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12087 ss
<< "osd." << id
<< " is not new";
12092 bool goto_reply
= false;
12096 err
= prepare_command_osd_destroy(id
, ss
);
12097 // we checked above that it should exist.
12098 ceph_assert(err
!= -ENOENT
);
12100 err
= prepare_command_osd_purge(id
, ss
);
12101 if (err
== -ENOENT
) {
12103 ss
<< "osd." << id
<< " does not exist.";
12109 if (err
< 0 || goto_reply
) {
12114 ss
<< "destroyed osd." << id
;
12116 ss
<< "purged osd." << id
;
12120 wait_for_finished_proposal(op
,
12121 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12122 force_immediate_propose();
12125 } else if (prefix
== "osd new") {
12127 // make sure authmon is writeable.
12128 if (!mon
->authmon()->is_writeable()) {
12129 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12130 << "osd new" << dendl
;
12131 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12135 map
<string
,string
> param_map
;
12137 bufferlist bl
= m
->get_data();
12138 string param_json
= bl
.to_str();
12139 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12141 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12145 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12148 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12161 if (err
== EEXIST
) {
12162 // idempotent operation
12167 wait_for_finished_proposal(op
,
12168 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12169 get_last_committed() + 1));
12170 force_immediate_propose();
12173 } else if (prefix
== "osd create") {
12175 // optional id provided?
12176 int64_t id
= -1, cmd_id
= -1;
12177 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12179 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12183 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12188 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12189 if (!uuid
.parse(uuidstr
.c_str())) {
12190 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12194 // we only care about the id if we also have the uuid, to
12195 // ensure the operation's idempotency.
12199 int32_t new_id
= -1;
12200 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12202 if (err
== -EAGAIN
) {
12203 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12206 // a check has failed; reply to the user.
12209 } else if (err
== EEXIST
) {
12210 // this is an idempotent operation; we can go ahead and reply.
12212 f
->open_object_section("created_osd");
12213 f
->dump_int("osdid", new_id
);
12214 f
->close_section();
12224 string empty_device_class
;
12225 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12228 f
->open_object_section("created_osd");
12229 f
->dump_int("osdid", new_id
);
12230 f
->close_section();
12236 wait_for_finished_proposal(op
,
12237 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12238 get_last_committed() + 1));
12241 } else if (prefix
== "osd blacklist clear") {
12242 pending_inc
.new_blacklist
.clear();
12243 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
12244 osdmap
.get_blacklist(&blacklist
);
12245 for (const auto &entry
: blacklist
) {
12246 pending_inc
.old_blacklist
.push_back(entry
.first
);
12248 ss
<< " removed all blacklist entries";
12250 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12251 get_last_committed() + 1));
12253 } else if (prefix
== "osd blacklist") {
12255 cmd_getval(cmdmap
, "addr", addrstr
);
12256 entity_addr_t addr
;
12257 if (!addr
.parse(addrstr
.c_str(), 0)) {
12258 ss
<< "unable to parse address " << addrstr
;
12263 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12264 // always blacklist type ANY
12265 addr
.set_type(entity_addr_t::TYPE_ANY
);
12267 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12270 string blacklistop
;
12271 cmd_getval(cmdmap
, "blacklistop", blacklistop
);
12272 if (blacklistop
== "add") {
12273 utime_t expires
= ceph_clock_now();
12275 // default one hour
12276 cmd_getval(cmdmap
, "expire", d
,
12277 g_conf()->mon_osd_blacklist_default_expire
);
12280 pending_inc
.new_blacklist
[addr
] = expires
;
12283 // cancel any pending un-blacklisting request too
12284 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
12285 pending_inc
.old_blacklist
.end(), addr
);
12286 if (it
!= pending_inc
.old_blacklist
.end()) {
12287 pending_inc
.old_blacklist
.erase(it
);
12291 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12293 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12294 get_last_committed() + 1));
12296 } else if (blacklistop
== "rm") {
12297 if (osdmap
.is_blacklisted(addr
) ||
12298 pending_inc
.new_blacklist
.count(addr
)) {
12299 if (osdmap
.is_blacklisted(addr
))
12300 pending_inc
.old_blacklist
.push_back(addr
);
12302 pending_inc
.new_blacklist
.erase(addr
);
12303 ss
<< "un-blacklisting " << addr
;
12305 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12306 get_last_committed() + 1));
12309 ss
<< addr
<< " isn't blacklisted";
12314 } else if (prefix
== "osd pool mksnap") {
12316 cmd_getval(cmdmap
, "pool", poolstr
);
12317 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12319 ss
<< "unrecognized pool '" << poolstr
<< "'";
12324 cmd_getval(cmdmap
, "snap", snapname
);
12325 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12326 if (p
->is_unmanaged_snaps_mode()) {
12327 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12330 } else if (p
->snap_exists(snapname
.c_str())) {
12331 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12334 } else if (p
->is_tier()) {
12335 ss
<< "pool " << poolstr
<< " is a cache tier";
12340 if (pending_inc
.new_pools
.count(pool
))
12341 pp
= &pending_inc
.new_pools
[pool
];
12343 pp
= &pending_inc
.new_pools
[pool
];
12346 if (pp
->snap_exists(snapname
.c_str())) {
12347 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12349 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
12350 pp
->set_snap_epoch(pending_inc
.epoch
);
12351 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
12354 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12355 get_last_committed() + 1));
12357 } else if (prefix
== "osd pool rmsnap") {
12359 cmd_getval(cmdmap
, "pool", poolstr
);
12360 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12362 ss
<< "unrecognized pool '" << poolstr
<< "'";
12367 cmd_getval(cmdmap
, "snap", snapname
);
12368 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12369 if (p
->is_unmanaged_snaps_mode()) {
12370 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12373 } else if (!p
->snap_exists(snapname
.c_str())) {
12374 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
12379 if (pending_inc
.new_pools
.count(pool
))
12380 pp
= &pending_inc
.new_pools
[pool
];
12382 pp
= &pending_inc
.new_pools
[pool
];
12385 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
12387 pp
->remove_snap(sn
);
12388 pp
->set_snap_epoch(pending_inc
.epoch
);
12389 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
12391 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
12394 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12395 get_last_committed() + 1));
12397 } else if (prefix
== "osd pool create") {
12398 int64_t pg_num
, pg_num_min
;
12400 cmd_getval(cmdmap
, "pg_num", pg_num
, int64_t(0));
12401 cmd_getval(cmdmap
, "pgp_num", pgp_num
, pg_num
);
12402 cmd_getval(cmdmap
, "pg_num_min", pg_num_min
, int64_t(0));
12404 string pool_type_str
;
12405 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
12406 if (pool_type_str
.empty())
12407 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
12410 cmd_getval(cmdmap
, "pool", poolstr
);
12411 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12412 if (pool_id
>= 0) {
12413 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12414 if (pool_type_str
!= p
->get_type_name()) {
12415 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
12418 ss
<< "pool '" << poolstr
<< "' already exists";
12425 if (pool_type_str
== "replicated") {
12426 pool_type
= pg_pool_t::TYPE_REPLICATED
;
12427 } else if (pool_type_str
== "erasure") {
12428 pool_type
= pg_pool_t::TYPE_ERASURE
;
12430 ss
<< "unknown pool type '" << pool_type_str
<< "'";
12435 bool implicit_rule_creation
= false;
12436 int64_t expected_num_objects
= 0;
12438 cmd_getval(cmdmap
, "rule", rule_name
);
12439 string erasure_code_profile
;
12440 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
12442 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
12443 if (erasure_code_profile
== "")
12444 erasure_code_profile
= "default";
12445 //handle the erasure code profile
12446 if (erasure_code_profile
== "default") {
12447 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
12448 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
12449 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
12453 map
<string
,string
> profile_map
;
12454 err
= osdmap
.get_erasure_code_profile_default(cct
,
12459 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
12460 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
12464 if (rule_name
== "") {
12465 implicit_rule_creation
= true;
12466 if (erasure_code_profile
== "default") {
12467 rule_name
= "erasure-code";
12469 dout(1) << "implicitly use rule named after the pool: "
12470 << poolstr
<< dendl
;
12471 rule_name
= poolstr
;
12474 cmd_getval(cmdmap
, "expected_num_objects",
12475 expected_num_objects
, int64_t(0));
12477 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12478 // and put expected_num_objects to rule field
12479 if (erasure_code_profile
!= "") { // cmd is from CLI
12480 if (rule_name
!= "") {
12482 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
12483 if (interr
.length()) {
12484 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
12489 rule_name
= erasure_code_profile
;
12490 } else { // cmd is well-formed
12491 cmd_getval(cmdmap
, "expected_num_objects",
12492 expected_num_objects
, int64_t(0));
12496 if (!implicit_rule_creation
&& rule_name
!= "") {
12498 err
= get_crush_rule(rule_name
, &rule
, &ss
);
12499 if (err
== -EAGAIN
) {
12500 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12507 if (expected_num_objects
< 0) {
12508 ss
<< "'expected_num_objects' must be non-negative";
12513 if (expected_num_objects
> 0 &&
12514 cct
->_conf
->osd_objectstore
== "filestore" &&
12515 cct
->_conf
->filestore_merge_threshold
> 0) {
12516 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12521 if (expected_num_objects
== 0 &&
12522 cct
->_conf
->osd_objectstore
== "filestore" &&
12523 cct
->_conf
->filestore_merge_threshold
< 0) {
12524 int osds
= osdmap
.get_num_osds();
12525 if (osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
12526 ss
<< "For better initial performance on pools expected to store a "
12527 << "large number of objects, consider supplying the "
12528 << "expected_num_objects parameter when creating the pool.\n";
12532 int64_t fast_read_param
;
12533 cmd_getval(cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
12534 FastReadType fast_read
= FAST_READ_DEFAULT
;
12535 if (fast_read_param
== 0)
12536 fast_read
= FAST_READ_OFF
;
12537 else if (fast_read_param
> 0)
12538 fast_read
= FAST_READ_ON
;
12540 int64_t repl_size
= 0;
12541 cmd_getval(cmdmap
, "size", repl_size
);
12542 int64_t target_size_bytes
= 0;
12543 double target_size_ratio
= 0.0;
12544 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
12545 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
12547 string pg_autoscale_mode
;
12548 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
12550 err
= prepare_new_pool(poolstr
,
12551 -1, // default crush rule
12553 pg_num
, pgp_num
, pg_num_min
,
12554 repl_size
, target_size_bytes
, target_size_ratio
,
12555 erasure_code_profile
, pool_type
,
12556 (uint64_t)expected_num_objects
,
12563 ss
<< "pool '" << poolstr
<< "' already exists";
12566 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12575 ss
<< "pool '" << poolstr
<< "' created";
12578 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12579 get_last_committed() + 1));
12582 } else if (prefix
== "osd pool delete" ||
12583 prefix
== "osd pool rm") {
12584 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12585 string poolstr
, poolstr2
, sure
;
12586 cmd_getval(cmdmap
, "pool", poolstr
);
12587 cmd_getval(cmdmap
, "pool2", poolstr2
);
12588 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12590 ss
<< "pool '" << poolstr
<< "' does not exist";
12595 bool force_no_fake
= false;
12596 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
12597 bool force
= false;
12598 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
12599 if (poolstr2
!= poolstr
||
12600 (!force
&& !force_no_fake
)) {
12601 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12602 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12603 << "followed by --yes-i-really-really-mean-it.";
12607 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
12608 if (err
== -EAGAIN
) {
12609 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12615 } else if (prefix
== "osd pool rename") {
12616 string srcpoolstr
, destpoolstr
;
12617 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
12618 cmd_getval(cmdmap
, "destpool", destpoolstr
);
12619 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
12620 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
12622 if (pool_src
< 0) {
12623 if (pool_dst
>= 0) {
12624 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12625 // of operations, assume this rename succeeded, as it is not changing
12626 // the current state. Make sure we output something understandable
12627 // for whoever is issuing the command, if they are paying attention,
12628 // in case it was not intentional; or to avoid a "wtf?" and a bug
12629 // report in case it was intentional, while expecting a failure.
12630 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
12631 << destpoolstr
<< "' does -- assuming successful rename";
12634 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
12638 } else if (pool_dst
>= 0) {
12639 // source pool exists and so does the destination pool
12640 ss
<< "pool '" << destpoolstr
<< "' already exists";
12645 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
12647 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
12649 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
12650 << cpp_strerror(ret
);
12653 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
12654 get_last_committed() + 1));
12657 } else if (prefix
== "osd pool set") {
12658 err
= prepare_command_pool_set(cmdmap
, ss
);
12659 if (err
== -EAGAIN
)
12665 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12666 get_last_committed() + 1));
12668 } else if (prefix
== "osd tier add") {
12669 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12670 if (err
== -EAGAIN
)
12675 cmd_getval(cmdmap
, "pool", poolstr
);
12676 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12678 ss
<< "unrecognized pool '" << poolstr
<< "'";
12682 string tierpoolstr
;
12683 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12684 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12685 if (tierpool_id
< 0) {
12686 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12690 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12692 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12695 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
12699 // make sure new tier is empty
12700 string force_nonempty
;
12701 cmd_getval(cmdmap
, "force_nonempty", force_nonempty
);
12702 const pool_stat_t
*pstats
= mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
12703 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
12704 force_nonempty
!= "--force-nonempty") {
12705 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
12709 if (tp
->is_erasure()) {
12710 ss
<< "tier pool '" << tierpoolstr
12711 << "' is an ec pool, which cannot be a tier";
12715 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
12716 ((force_nonempty
!= "--force-nonempty") ||
12717 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
12718 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
12723 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12724 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12725 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
12726 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12729 np
->tiers
.insert(tierpool_id
);
12730 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
12731 ntp
->tier_of
= pool_id
;
12732 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
12733 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12734 get_last_committed() + 1));
12736 } else if (prefix
== "osd tier remove" ||
12737 prefix
== "osd tier rm") {
12739 cmd_getval(cmdmap
, "pool", poolstr
);
12740 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12742 ss
<< "unrecognized pool '" << poolstr
<< "'";
12746 string tierpoolstr
;
12747 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12748 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12749 if (tierpool_id
< 0) {
12750 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12754 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12756 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12759 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
12763 if (p
->tiers
.count(tierpool_id
) == 0) {
12764 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
12768 if (tp
->tier_of
!= pool_id
) {
12769 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
12770 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
12771 // be scary about it; this is an inconsistency and bells must go off
12772 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12776 if (p
->read_tier
== tierpool_id
) {
12777 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
12782 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12783 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12784 if (np
->tiers
.count(tierpool_id
) == 0 ||
12785 ntp
->tier_of
!= pool_id
||
12786 np
->read_tier
== tierpool_id
) {
12787 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12790 np
->tiers
.erase(tierpool_id
);
12792 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
12793 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12794 get_last_committed() + 1));
12796 } else if (prefix
== "osd tier set-overlay") {
12797 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12798 if (err
== -EAGAIN
)
12803 cmd_getval(cmdmap
, "pool", poolstr
);
12804 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12806 ss
<< "unrecognized pool '" << poolstr
<< "'";
12810 string overlaypoolstr
;
12811 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
12812 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
12813 if (overlaypool_id
< 0) {
12814 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
12818 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12820 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
12821 ceph_assert(overlay_p
);
12822 if (p
->tiers
.count(overlaypool_id
) == 0) {
12823 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
12827 if (p
->read_tier
== overlaypool_id
) {
12829 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12832 if (p
->has_read_tier()) {
12833 ss
<< "pool '" << poolstr
<< "' has overlay '"
12834 << osdmap
.get_pool_name(p
->read_tier
)
12835 << "'; please remove-overlay first";
12841 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12842 np
->read_tier
= overlaypool_id
;
12843 np
->write_tier
= overlaypool_id
;
12844 np
->set_last_force_op_resend(pending_inc
.epoch
);
12845 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
12846 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
12847 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12848 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
12849 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
12850 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12851 get_last_committed() + 1));
12853 } else if (prefix
== "osd tier remove-overlay" ||
12854 prefix
== "osd tier rm-overlay") {
12856 cmd_getval(cmdmap
, "pool", poolstr
);
12857 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12859 ss
<< "unrecognized pool '" << poolstr
<< "'";
12863 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12865 if (!p
->has_read_tier()) {
12867 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12871 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
12876 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12877 if (np
->has_read_tier()) {
12878 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
12879 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
12880 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12882 if (np
->has_write_tier()) {
12883 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
12884 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
12885 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12887 np
->clear_read_tier();
12888 np
->clear_write_tier();
12889 np
->set_last_force_op_resend(pending_inc
.epoch
);
12890 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12891 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12892 get_last_committed() + 1));
12894 } else if (prefix
== "osd tier cache-mode") {
12895 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12896 if (err
== -EAGAIN
)
12901 cmd_getval(cmdmap
, "pool", poolstr
);
12902 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12904 ss
<< "unrecognized pool '" << poolstr
<< "'";
12908 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12910 if (!p
->is_tier()) {
12911 ss
<< "pool '" << poolstr
<< "' is not a tier";
12916 cmd_getval(cmdmap
, "mode", modestr
);
12917 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
12918 if (int(mode
) < 0) {
12919 ss
<< "'" << modestr
<< "' is not a valid cache mode";
12925 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12927 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
12928 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
12929 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
12933 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12934 mode
!= pg_pool_t::CACHEMODE_NONE
&&
12935 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12936 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
12938 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
12939 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12944 // pool already has this cache-mode set and there are no pending changes
12945 if (p
->cache_mode
== mode
&&
12946 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
12947 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
12948 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
12949 << " to " << pg_pool_t::get_cache_mode_name(mode
);
12954 /* Mode description:
12956 * none: No cache-mode defined
12957 * forward: Forward all reads and writes to base pool [removed]
12958 * writeback: Cache writes, promote reads from base pool
12959 * readonly: Forward writes to base pool
12960 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
12961 * proxy: Proxy all reads and writes to base pool
12962 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12964 * Hence, these are the allowed transitions:
12967 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12968 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
12969 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12970 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
12971 * writeback -> readproxy || proxy
12975 // We check if the transition is valid against the current pool mode, as
12976 // it is the only committed state thus far. We will blantly squash
12977 // whatever mode is on the pending state.
12979 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
12980 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12981 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
12982 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
12983 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
12984 << "' pool; only '"
12985 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
12987 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
12992 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
12993 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12994 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12995 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
12997 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
12998 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12999 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13001 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13002 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13003 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13005 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13006 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13007 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13008 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13010 const pool_stat_t
* pstats
=
13011 mon
->mgrstatmon()->get_pool_stat(pool_id
);
13013 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13014 ss
<< "unable to set cache-mode '"
13015 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13016 << "': dirty objects found";
13022 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13023 np
->cache_mode
= mode
;
13024 // set this both when moving to and from cache_mode NONE. this is to
13025 // capture legacy pools that were set up before this flag existed.
13026 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13027 ss
<< "set cache-mode for pool '" << poolstr
13028 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13029 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13030 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13031 ceph_assert(base_pool
);
13032 if (base_pool
->read_tier
== pool_id
||
13033 base_pool
->write_tier
== pool_id
)
13034 ss
<<" (WARNING: pool is still configured as read or write tier)";
13036 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13037 get_last_committed() + 1));
13039 } else if (prefix
== "osd tier add-cache") {
13040 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13041 if (err
== -EAGAIN
)
13046 cmd_getval(cmdmap
, "pool", poolstr
);
13047 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13049 ss
<< "unrecognized pool '" << poolstr
<< "'";
13053 string tierpoolstr
;
13054 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13055 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13056 if (tierpool_id
< 0) {
13057 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13061 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13063 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13066 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13071 if (!cmd_getval(cmdmap
, "size", size
)) {
13072 ss
<< "unable to parse 'size' value '"
13073 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13077 // make sure new tier is empty
13078 const pool_stat_t
*pstats
=
13079 mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
13080 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13081 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13085 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13086 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13087 if (int(mode
) < 0) {
13088 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13092 HitSet::Params hsp
;
13093 auto& cache_hit_set_type
=
13094 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13095 if (cache_hit_set_type
== "bloom") {
13096 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13097 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13098 hsp
= HitSet::Params(bsp
);
13099 } else if (cache_hit_set_type
== "explicit_hash") {
13100 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13101 } else if (cache_hit_set_type
== "explicit_object") {
13102 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13104 ss
<< "osd tier cache default hit set type '"
13105 << cache_hit_set_type
<< "' is not a known type";
13110 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13111 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13112 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13113 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13116 np
->tiers
.insert(tierpool_id
);
13117 np
->read_tier
= np
->write_tier
= tierpool_id
;
13118 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13119 np
->set_last_force_op_resend(pending_inc
.epoch
);
13120 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13121 ntp
->tier_of
= pool_id
;
13122 ntp
->cache_mode
= mode
;
13123 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13124 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13125 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13126 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13127 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13128 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13129 ntp
->hit_set_params
= hsp
;
13130 ntp
->target_max_bytes
= size
;
13131 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13132 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13133 get_last_committed() + 1));
13135 } else if (prefix
== "osd pool set-quota") {
13137 cmd_getval(cmdmap
, "pool", poolstr
);
13138 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13140 ss
<< "unrecognized pool '" << poolstr
<< "'";
13146 cmd_getval(cmdmap
, "field", field
);
13147 if (field
!= "max_objects" && field
!= "max_bytes") {
13148 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13153 // val could contain unit designations, so we treat as a string
13155 cmd_getval(cmdmap
, "val", val
);
13158 if (field
== "max_objects") {
13159 value
= strict_sistrtoll(val
.c_str(), &tss
);
13160 } else if (field
== "max_bytes") {
13161 value
= strict_iecstrtoll(val
.c_str(), &tss
);
13163 ceph_abort_msg("unrecognized option");
13165 if (!tss
.empty()) {
13166 ss
<< "error parsing value '" << val
<< "': " << tss
;
13171 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13172 if (field
== "max_objects") {
13173 pi
->quota_max_objects
= value
;
13174 } else if (field
== "max_bytes") {
13175 pi
->quota_max_bytes
= value
;
13177 ceph_abort_msg("unrecognized option");
13179 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13181 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13182 get_last_committed() + 1));
13184 } else if (prefix
== "osd pool application enable" ||
13185 prefix
== "osd pool application disable" ||
13186 prefix
== "osd pool application set" ||
13187 prefix
== "osd pool application rm") {
13188 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13189 if (err
== -EAGAIN
) {
13191 } else if (err
< 0) {
13196 } else if (prefix
== "osd force-create-pg") {
13199 cmd_getval(cmdmap
, "pgid", pgidstr
);
13200 if (!pgid
.parse(pgidstr
.c_str())) {
13201 ss
<< "invalid pgid '" << pgidstr
<< "'";
13205 if (!osdmap
.pg_exists(pgid
)) {
13206 ss
<< "pg " << pgid
<< " should not exist";
13211 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13213 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13214 << "that the cluster will give up ever trying to recover the lost data. Do this "
13215 << "only if you are certain that all copies of the PG are in fact lost and you are "
13216 << "willing to accept that the data is permanently destroyed. Pass "
13217 << "--yes-i-really-mean-it to proceed.";
13223 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13224 auto emplaced
= creating_pgs
.pgs
.emplace(
13226 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13227 ceph_clock_now()));
13228 creating_now
= emplaced
.second
;
13230 if (creating_now
) {
13231 ss
<< "pg " << pgidstr
<< " now creating, ok";
13232 // set the pool's CREATING flag so that (1) the osd won't ignore our
13233 // create message and (2) we won't propose any future pg_num changes
13234 // until after the PG has been instantiated.
13235 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13236 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13238 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13242 ss
<< "pg " << pgid
<< " already creating";
13252 if (err
< 0 && rs
.length() == 0)
13253 rs
= cpp_strerror(err
);
13254 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
13259 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13260 get_last_committed() + 1));
13264 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// Capability gate for incoming MPoolOp requests.
// NOTE(review): this chunk is a corrupted extraction — statements are split
// across lines, upstream line numbers (13268…) are fused into the text, and
// several original lines (the surrounding if/switch headers, returns, braces)
// are missing. Recover the pristine function from upstream before editing.
// What the surviving code shows: it marks the op for event tracing, pulls the
// MPoolOp message and its MonSession, and replies -EPERM via _pool_op_reply()
// on permission failure.
13268 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
13270 op
->mark_osdmon_event(__func__
);
13272 auto m
= op
->get_req
<MPoolOp
>();
13273 MonSession
*session
= op
->get_session();
// Reply -EPERM — presumably when no session exists (guard line missing here).
13275 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Unmanaged-snap ops get a dedicated permission check against the entity's
// caps and peer address (is_unmanaged_snap_op_permitted).
13280 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13281 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13283 const std::string
* pool_name
= nullptr;
13284 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
13285 if (pg_pool
!= nullptr) {
13286 pool_name
= &osdmap
.get_pool_name(m
->pool
);
13289 if (!is_unmanaged_snap_op_permitted(cct
, mon
->key_server
,
13290 session
->entity_name
, session
->caps
,
13291 session
->get_peer_socket_addr(),
13293 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13294 << "privileges. message: " << *m
<< std::endl
13295 << "caps: " << session
->caps
<< dendl
;
13296 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// All other pool ops require the generic "osd" write capability.
13302 if (!session
->is_capable("osd", MON_CAP_W
)) {
13303 dout(0) << "got pool op from entity with insufficient privileges. "
13304 << "message: " << *m
<< std::endl
13305 << "caps: " << session
->caps
<< dendl
;
13306 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Read-only fast path for MPoolOp: answers requests that need no map change.
// NOTE(review): corrupted extraction — the switch(m->op) header, return
// statements, and closing braces are missing (upstream line numbers jump,
// e.g. 13320→13324). Restore from upstream before editing.
// Visible behavior: reject on caps failure, drop on fsid mismatch (-EINVAL),
// delegate POOL_OP_CREATE, and short-circuit idempotent/invalid snap ops.
13315 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
13317 op
->mark_osdmon_event(__func__
);
13318 auto m
= op
->get_req
<MPoolOp
>();
// Capability check; presumably returns true (handled) when it already replied.
13320 if (enforce_pool_op_caps(op
)) {
// Wrong-cluster guard: the message's fsid must match our monmap's.
13324 if (m
->fsid
!= mon
->monmap
->fsid
) {
13325 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
13326 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
13327 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13331 if (m
->op
== POOL_OP_CREATE
)
13332 return preprocess_pool_op_create(op
);
13334 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
// Nonexistent pool: deleting it is a no-op success, anything else is -ENOENT.
13335 if (p
== nullptr) {
13336 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
13337 if (m
->op
== POOL_OP_DELETE
) {
13338 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13340 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13345 // check if the snap and snapname exist
13346 bool snap_exists
= false;
13347 if (p
->snap_exists(m
->name
.c_str()))
13348 snap_exists
= true;
// Pool snaps are invalid on unmanaged-snaps pools and on tiers; re-creating
// an existing snap is an idempotent success.
13351 case POOL_OP_CREATE_SNAP
:
13352 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
13353 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13357 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Unmanaged snaps are mutually exclusive with pool-snaps mode.
13361 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13362 if (p
->is_pool_snaps_mode()) {
13363 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13367 case POOL_OP_DELETE_SNAP
:
13368 if (p
->is_unmanaged_snaps_mode()) {
13369 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Deleting a snap that does not exist is an idempotent success.
13372 if (!snap_exists
) {
13373 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13377 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13378 if (p
->is_pool_snaps_mode()) {
13379 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Already-removed unmanaged snap: idempotent success.
13382 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
13383 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// POOL_OP_DELETE with no pool of that name left: idempotent success.
// NOTE(review): ">= 0" looks inverted at first glance, but the surrounding
// lines are missing — confirm against upstream before changing anything.
13387 case POOL_OP_DELETE
:
13388 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
13389 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13393 case POOL_OP_AUID_CHANGE
:
// Has this snap already been removed (committed state only)?
// NOTE(review): corrupted extraction — the return statements and the
// "if (r == 0)" guard around the purged-snap branch are missing (upstream
// numbering jumps 13407→13410, 13416→13418). Recover from upstream.
// Visible checks: pool existence, the osdmap removed_snaps_queue, and the
// purged-snaps records via lookup_purged_snap().
13403 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
13405 if (!osdmap
.have_pg_pool(pool
)) {
13406 dout(10) << __func__
 << " pool " << pool
 << " snap " << snap
13407 << " - pool dne" << dendl
;
13410 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
13411 dout(10) << __func__
 << " pool " << pool
 << " snap " << snap
13412 << " - in osdmap removed_snaps_queue" << dendl
;
// Fall back to the purged-snaps record; [begin,end) is the purged interval.
13415 snapid_t begin
, end
;
13416 int r
 = lookup_purged_snap(pool
, snap
, &begin
, &end
);
13418 dout(10) << __func__
 << " pool " << pool
 << " snap " << snap
13419 << " - purged, [" << begin
 << "," << end
 << ")" << dendl
;
// Is this snap's removal already staged in the pending incremental?
// NOTE(review): corrupted extraction — return statements and closing braces
// are missing. Visible checks: pool queued for deletion (pending_inc
// .old_pools) and snap staged in pending new_removed_snaps.
13425 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
13427 if (pending_inc
.old_pools
.count(pool
)) {
13428 dout(10) << __func__
 << " pool " << pool
 << " snap " << snap
13429 << " - pool pending deletion" << dendl
;
13432 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
13433 dout(10) << __func__
 << " pool " << pool
 << " snap " << snap
13434 << " - in pending new_removed_snaps" << dendl
;
// Fast path for pool creation: if a pool with the requested name already
// exists, answer success immediately (idempotent create) without proposing.
// NOTE(review): corrupted extraction — the "if (pool >= 0)" guard and the
// return statements are missing (numbering jumps 13444→13446).
13440 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
13442 op
->mark_osdmon_event(__func__
);
13443 auto m
= op
->get_req
<MPoolOp
>();
13444 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
13446 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Write path for MPoolOp: stages pool-snapshot changes into pending_inc and
// replies once the proposal commits.
// NOTE(review): corrupted extraction — the switch(m->op) headers, the `ret`
// assignments, `break`s, the declaration of `pp`, and many braces are missing
// (upstream numbering has large gaps, e.g. 13485→13493, 13590→13599).
// Recover the pristine function from upstream before editing.
13453 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
13455 op
->mark_osdmon_event(__func__
);
13456 auto m
= op
->get_req
<MPoolOp
>();
13457 dout(10) << "prepare_pool_op " << *m
 << dendl
;
// Create and delete have dedicated prepare paths.
13458 if (m
->op
== POOL_OP_CREATE
) {
13459 return prepare_pool_op_create(op
);
13460 } else if (m
->op
== POOL_OP_DELETE
) {
13461 return prepare_pool_op_delete(op
);
13465 bool changed
= false;
13467 if (!osdmap
.have_pg_pool(m
->pool
)) {
13468 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13472 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
// First validation pass against the *committed* pool state.
13475 case POOL_OP_CREATE_SNAP
:
13476 if (pool
->is_tier()) {
13478 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13480 } // else, fall through
13481 case POOL_OP_DELETE_SNAP
:
13482 if (!pool
->is_unmanaged_snaps_mode()) {
13483 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
// Idempotency: create of an existing snap / delete of a missing snap.
13484 if ((m
->op
== POOL_OP_CREATE_SNAP
 && snap_exists
)
13485 || (m
->op
== POOL_OP_DELETE_SNAP
 && !snap_exists
)) {
13493 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13496 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13497 // we won't allow removal of an unmanaged snapshot from a pool
13498 // not in unmanaged snaps mode.
13499 if (!pool
->is_unmanaged_snaps_mode()) {
13500 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
13504 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13505 // but we will allow creating an unmanaged snapshot on any pool
13506 // as long as it is not in 'pool' snaps mode.
13507 if (pool
->is_pool_snaps_mode()) {
13508 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13513 // projected pool info
// `pp` (declaration missing here) is a working copy: start from any pending
// version of the pool, else from the committed one.
13515 if (pending_inc
.new_pools
.count(m
->pool
))
13516 pp
= pending_inc
.new_pools
[m
->pool
];
13518 pp
= *osdmap
.get_pg_pool(m
->pool
);
13520 bufferlist reply_data
;
13522 // pool snaps vs unmanaged snaps are mutually exclusive
// Second validation pass, against the *projected* pool state.
13524 case POOL_OP_CREATE_SNAP
:
13525 case POOL_OP_DELETE_SNAP
:
13526 if (pp
.is_unmanaged_snaps_mode()) {
13532 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13533 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13534 if (pp
.is_pool_snaps_mode()) {
// Mutation pass: apply the requested op to the working copy `pp`.
13541 case POOL_OP_CREATE_SNAP
:
13542 if (!pp
.snap_exists(m
->name
.c_str())) {
13543 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
13544 dout(10) << "create snap in pool " << m
->pool
 << " " << m
->name
13545 << " seq " << pp
.get_snap_epoch() << dendl
;
13550 case POOL_OP_DELETE_SNAP
:
13552 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
13555 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
13561 case POOL_OP_CREATE_UNMANAGED_SNAP
:
// Pre-octopus clusters track removed snaps differently; the flag below
// selects the legacy behavior.
13563 uint64_t snapid
= pp
.add_unmanaged_snap(
13564 osdmap
.require_osd_release
 < ceph_release_t::octopus
);
13565 encode(snapid
, reply_data
);
13570 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13571 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
13572 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
// Refuse to remove a snapid that was never issued.
13573 if (m
->snapid
 > pp
.get_snap_seq()) {
13574 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13577 pp
.remove_unmanaged_snap(
13579 osdmap
.require_osd_release
 < ceph_release_t::octopus
);
13580 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
13581 // also record the new seq as purged: this avoids a discontinuity
13582 // after all of the snaps have been purged, since the seq assigned
13583 // during removal lives in the same namespace as the actual snaps.
13584 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
13589 case POOL_OP_AUID_CHANGE
:
// AUID support was removed; the op is rejected outright.
13590 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
// Commit: stamp the snap epoch and stage the modified pool, then reply via
// C_PoolOp once the proposal lands.
13599 pp
.set_snap_epoch(pending_inc
.epoch
);
13600 pending_inc
.new_pools
[m
->pool
] = pp
;
13604 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
// Stage creation of a new pool and reply (with prepare_new_pool's status)
// once the proposal commits.
// NOTE(review): corrupted extraction — the trailing `return true;` and the
// function braces are missing.
13608 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
13610 op
->mark_osdmon_event(__func__
);
13611 int err
= prepare_new_pool(op
);
13612 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
// Validate that a pool may be deleted; writes the human-readable verdict to
// *ss and (in the pristine source) returns an errno-style code.
// NOTE(review): corrupted extraction — the `ss` parameter declaration, the
// error `return`s (-EBUSY/-EPERM per upstream) and braces are missing.
// Visible refusal conditions: in use by CephFS, is a tier, has tiers,
// mon_allow_pool_delete unset, or FLAG_NODELETE set.
13616 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
13619 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
13621 // If the Pool is in use by CephFS, refuse to delete it
// Check the *pending* fsmap so an in-flight FS change is also respected.
13622 FSMap
 const &pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13623 if (pending_fsmap
.pool_in_use(pool_id
)) {
13624 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
// A cache tier must be detached from its base pool before deletion.
13628 if (pool
.tier_of
 >= 0) {
13629 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
13630 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
// A base pool with attached tiers cannot be deleted either; list them.
13633 if (!pool
.tiers
.empty()) {
13634 *ss
<< "pool '" << poolstr
<< "' has tiers";
13635 for(auto tier
: pool
.tiers
) {
13636 *ss
<< " " << osdmap
.get_pool_name(tier
);
// Global safety interlock against accidental pool deletion.
13641 if (!g_conf()->mon_allow_pool_delete
) {
13642 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
// Per-pool nodelete flag.
13646 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
13647 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
// All checks passed.
13651 *ss
<< "pool '" << poolstr
<< "' removed";
13656 * Check if it is safe to add a tier to a base pool
13659 * True if the operation should proceed, false if we should abort here
13660 * (abort doesn't necessarily mean error, could be idempotency)
13662 bool OSDMonitor::_check_become_tier(
13663 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
13664 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
13668 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
13669 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
13671 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13672 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
13673 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
13678 if (base_pool
->tiers
.count(tier_pool_id
)) {
13679 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
13681 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
13682 << base_pool_name
<< "'";
13686 if (base_pool
->is_tier()) {
13687 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
13688 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
13689 << "multiple tiers are not yet supported.";
13694 if (tier_pool
->has_tiers()) {
13695 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
13696 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
13697 it
!= tier_pool
->tiers
.end(); ++it
)
13698 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
13699 *ss
<< " multiple tiers are not yet supported.";
13704 if (tier_pool
->is_tier()) {
13705 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
13706 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
/**
 * Check if it is safe to remove a tier from this base pool
 *
 * @return
 * True if the operation should proceed, false if we should abort here
 * (abort doesn't necessarily mean error, could be idempotency)
 */
bool OSDMonitor::_check_remove_tier(
    const int64_t base_pool_id, const pg_pool_t *base_pool,
    const pg_pool_t *tier_pool,
    int *err, ostream *ss) const
{
  const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);

  // Apply CephFS-specific checks
  const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
  if (pending_fsmap.pool_in_use(base_pool_id)) {
    if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
      // If the underlying pool is erasure coded and does not allow EC
      // overwrites, we can't permit the removal of the replicated tier that
      // CephFS relies on to access it
      *ss << "pool '" << base_pool_name <<
          "' does not allow EC overwrites and is in use by CephFS"
          " via its tier";
      *err = -EBUSY;
      return false;
    }

    // NOTE(review): tier_pool may be null here, hence the explicit check
    // before dereferencing it below.
    if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
      *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
          "tier is still in use as a writeback cache. Change the cache "
          "mode and flush the cache before removing it";
      *err = -EBUSY;
      return false;
    }
  }

  // no CephFS dependency (or checks passed): removal may proceed
  *err = 0;
  return true;
}
// Stage removal of @pool in pending_inc, together with every piece of map
// state that references it (pg_temp, primary_temp, pg_upmap/pg_upmap_items
// both committed and pending, and crush choose_args).
// Returns 0 on success, -EAGAIN if the caller should retry after the
// pending map commits, or a negative errno if removal is refused by
// _check_remove_pool().  With mon_fake_pool_delete (and !no_fake) the pool
// is renamed to "<name>.<id>.DELETED" instead of being destroyed.
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  // re-run the safety checks against any pending (uncommitted) version of
  // the pool as well
  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // until we are confident.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // idempotent: removal already staged in this epoch
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    // safety net: hide the pool under a .DELETED name instead of
    // actually destroying the data
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete primary_temp" << p->first << dendl;
      // -1 in the incremental clears the committed mapping
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    // re-encode the whole pending crush map without this pool's args
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
13873 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
13875 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
13876 if (pending_inc
.old_pools
.count(pool
)) {
13877 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
13880 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
13881 p
!= pending_inc
.new_pool_names
.end();
13883 if (p
->second
== newname
&& p
->first
!= pool
) {
13888 pending_inc
.new_pool_names
[pool
] = newname
;
13892 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
13894 op
->mark_osdmon_event(__func__
);
13895 auto m
= op
->get_req
<MPoolOp
>();
13897 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
13898 if (ret
== -EAGAIN
) {
13899 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13903 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
13904 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
13905 pending_inc
.epoch
));
13909 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
13910 int ret
, epoch_t epoch
, bufferlist
*blp
)
13912 op
->mark_osdmon_event(__func__
);
13913 auto m
= op
->get_req
<MPoolOp
>();
13914 dout(20) << "_pool_op_reply " << ret
<< dendl
;
13915 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
13916 ret
, epoch
, get_last_committed(), blp
);
13917 mon
->send_reply(op
, reply
);
13920 void OSDMonitor::convert_pool_priorities(void)
13922 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
13923 int64_t max_prio
= 0;
13924 int64_t min_prio
= 0;
13925 for (const auto &i
: osdmap
.get_pools()) {
13926 const auto &pool
= i
.second
;
13928 if (pool
.opts
.is_set(key
)) {
13930 pool
.opts
.get(key
, &prio
);
13931 if (prio
> max_prio
)
13933 if (prio
< min_prio
)
13937 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
13938 dout(20) << __func__
<< " nothing to fix" << dendl
;
13941 // Current pool priorities exceeds new maximum
13942 for (const auto &i
: osdmap
.get_pools()) {
13943 const auto pool_id
= i
.first
;
13944 pg_pool_t pool
= i
.second
;
13947 pool
.opts
.get(key
, &prio
);
13950 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
13951 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
13952 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
13953 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
13954 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
13955 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
13960 pool
.opts
.unset(key
);
13962 pool
.opts
.set(key
, static_cast<int64_t>(n
));
13964 dout(10) << __func__
<< " pool " << pool_id
13965 << " recovery_priority adjusted "
13966 << prio
<< " to " << n
<< dendl
;
13967 pool
.last_change
= pending_inc
.epoch
;
13968 pending_inc
.new_pools
[pool_id
] = pool
;