1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Loic Dachary <loic@dachary.org>
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
70 #include "common/config.h"
71 #include "common/errno.h"
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
90 #include "json_spirit/json_spirit_reader.h"
92 #include <boost/algorithm/string/predicate.hpp>
94 #define dout_subsys ceph_subsys_mon
95 static const string
OSD_PG_CREATING_PREFIX("osd_pg_creating");
96 static const string
OSD_METADATA_PREFIX("osd_metadata");
97 static const string
OSD_SNAP_PREFIX("osd_snap");
101 OSD snapshot metadata
102 ---------------------
104 -- starting with mimic, removed in octopus --
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
113 -- starting with mimic --
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
118 - note that the {removed,purged}_snap put the last snap in the key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
124 -- starting with octopus --
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
130 using namespace TOPNSPC::common
;
133 struct OSDMemCache
: public PriorityCache::PriCache
{
135 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
136 int64_t committed_bytes
= 0;
137 double cache_ratio
= 0;
139 OSDMemCache(OSDMonitor
*m
) : osdmon(m
) {};
141 virtual uint64_t _get_used_bytes() const = 0;
143 virtual int64_t request_cache_bytes(
144 PriorityCache::Priority pri
, uint64_t total_cache
) const {
145 int64_t assigned
= get_cache_bytes(pri
);
148 // All cache items are currently set to have PRI1 priority
149 case PriorityCache::Priority::PRI1
:
151 int64_t request
= _get_used_bytes();
152 return (request
> assigned
) ? request
- assigned
: 0;
160 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
161 return cache_bytes
[pri
];
164 virtual int64_t get_cache_bytes() const {
167 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
168 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
169 total
+= get_cache_bytes(pri
);
174 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
175 cache_bytes
[pri
] = bytes
;
177 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
178 cache_bytes
[pri
] += bytes
;
180 virtual int64_t commit_cache_size(uint64_t total_cache
) {
181 committed_bytes
= PriorityCache::get_chunk(
182 get_cache_bytes(), total_cache
);
183 return committed_bytes
;
185 virtual int64_t get_committed_size() const {
186 return committed_bytes
;
188 virtual double get_cache_ratio() const {
191 virtual void set_cache_ratio(double ratio
) {
194 virtual string
get_cache_name() const = 0;
197 struct IncCache
: public OSDMemCache
{
198 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
200 virtual uint64_t _get_used_bytes() const {
201 return osdmon
->inc_osd_cache
.get_bytes();
204 virtual string
get_cache_name() const {
205 return "OSDMap Inc Cache";
208 uint64_t _get_num_osdmaps() const {
209 return osdmon
->inc_osd_cache
.get_size();
213 struct FullCache
: public OSDMemCache
{
214 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
216 virtual uint64_t _get_used_bytes() const {
217 return osdmon
->full_osd_cache
.get_bytes();
220 virtual string
get_cache_name() const {
221 return "OSDMap Full Cache";
224 uint64_t _get_num_osdmaps() const {
225 return osdmon
->full_osd_cache
.get_size();
229 std::shared_ptr
<IncCache
> inc_cache
;
230 std::shared_ptr
<FullCache
> full_cache
;
// Hard limits on per-pool application metadata: applications per pool,
// keys per application, and maximum key/value string length.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
236 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
239 auto& match
= grant
.match
;
240 if (match
.is_match_all()) {
242 } else if (pool_name
!= nullptr &&
243 !match
.pool_namespace
.pool_name
.empty() &&
244 match
.pool_namespace
.pool_name
== *pool_name
) {
251 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
252 const KeyServer
& key_server
,
253 const EntityName
& entity_name
,
254 const MonCap
& mon_caps
,
255 const entity_addr_t
& peer_socket_addr
,
256 const std::string
* pool_name
)
258 typedef std::map
<std::string
, std::string
> CommandArgs
;
260 if (mon_caps
.is_capable(
261 cct
, entity_name
, "osd",
262 "osd pool op unmanaged-snap",
263 (pool_name
== nullptr ?
264 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
265 CommandArgs
{{"poolname", *pool_name
}}),
271 AuthCapsInfo caps_info
;
272 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
274 dout(10) << "unable to locate OSD cap data for " << entity_name
275 << " in auth db" << dendl
;
280 if (caps_info
.caps
.length() > 0) {
281 auto p
= caps_info
.caps
.cbegin();
284 } catch (const buffer::error
&err
) {
285 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
292 if (!osd_cap
.parse(caps_str
, nullptr)) {
293 dout(10) << "unable to parse OSD cap data for " << entity_name
294 << " in auth db" << dendl
;
298 // if the entity has write permissions in one or all pools, permit
299 // usage of unmanaged-snapshots
300 if (osd_cap
.allow_all()) {
304 for (auto& grant
: osd_cap
.grants
) {
305 if (grant
.profile
.is_valid()) {
306 for (auto& profile_grant
: grant
.profile_grants
) {
307 if (is_osd_writable(profile_grant
, pool_name
)) {
311 } else if (is_osd_writable(grant
, pool_name
)) {
319 } // anonymous namespace
321 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
323 if (epoch_by_pg
.size() <= ps
) {
324 epoch_by_pg
.resize(ps
+ 1, 0);
326 const auto old_lec
= epoch_by_pg
[ps
];
327 if (old_lec
>= last_epoch_clean
) {
331 epoch_by_pg
[ps
] = last_epoch_clean
;
332 if (last_epoch_clean
< floor
) {
333 floor
= last_epoch_clean
;
334 } else if (last_epoch_clean
> floor
) {
335 if (old_lec
== floor
) {
336 // probably should increase floor?
337 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
338 std::end(epoch_by_pg
));
342 if (ps
!= next_missing
) {
345 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
346 if (epoch_by_pg
[next_missing
] == 0) {
352 void LastEpochClean::remove_pool(uint64_t pool
)
354 report_by_pool
.erase(pool
);
357 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
359 auto& lec
= report_by_pool
[pg
.pool()];
360 return lec
.report(pg
.ps(), last_epoch_clean
);
363 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
365 auto floor
= latest
.get_epoch();
366 for (auto& pool
: latest
.get_pools()) {
367 auto reported
= report_by_pool
.find(pool
.first
);
368 if (reported
== report_by_pool
.end()) {
371 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
374 if (reported
->second
.floor
< floor
) {
375 floor
= reported
->second
.floor
;
381 void LastEpochClean::dump(Formatter
*f
) const
383 f
->open_array_section("per_pool");
385 for (auto& it
: report_by_pool
) {
386 f
->open_object_section("pool");
387 f
->dump_unsigned("poolid", it
.first
);
388 f
->dump_unsigned("floor", it
.second
.floor
);
395 class C_UpdateCreatingPGs
: public Context
{
400 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
401 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
402 void finish(int r
) override
{
404 utime_t end
= ceph_clock_now();
405 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
406 << (end
- start
) << " seconds" << dendl
;
407 osdmon
->update_creating_pgs();
408 osdmon
->check_pg_creates_subs();
414 #define dout_prefix _prefix(_dout, mon, osdmap)
415 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
416 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
417 << "(" << mon
->get_state_name()
418 << ").osd e" << osdmap
.get_epoch() << " ";
421 OSDMonitor::OSDMonitor(
425 const string
& service_name
)
426 : PaxosService(mn
, p
, service_name
),
428 inc_osd_cache(g_conf()->mon_osd_cache_size
),
429 full_osd_cache(g_conf()->mon_osd_cache_size
),
430 has_osdmap_manifest(false),
431 mapper(mn
->cct
, &mn
->cpu_tp
)
433 inc_cache
= std::make_shared
<IncCache
>(this);
434 full_cache
= std::make_shared
<FullCache
>(this);
435 cct
->_conf
.add_observer(this);
436 int r
= _set_cache_sizes();
438 derr
<< __func__
<< " using default osd cache size - mon_osd_cache_size ("
439 << g_conf()->mon_osd_cache_size
440 << ") without priority cache management"
445 const char **OSDMonitor::get_tracked_conf_keys() const
447 static const char* KEYS
[] = {
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
456 void OSDMonitor::handle_conf_change(const ConfigProxy
& conf
,
457 const std::set
<std::string
> &changed
)
459 dout(10) << __func__
<< " " << changed
<< dendl
;
461 if (changed
.count("mon_memory_autotune")) {
462 _set_cache_autotuning();
464 if (changed
.count("mon_memory_target") ||
465 changed
.count("rocksdb_cache_size")) {
466 int r
= _update_mon_cache_settings();
468 derr
<< __func__
<< " mon_memory_target:"
469 << g_conf()->mon_memory_target
470 << " rocksdb_cache_size:"
471 << g_conf()->rocksdb_cache_size
472 << ". Unable to update cache size."
478 void OSDMonitor::_set_cache_autotuning()
480 if (!g_conf()->mon_memory_autotune
&& pcm
!= nullptr) {
481 // Disable cache autotuning
482 std::lock_guard
l(balancer_lock
);
486 if (g_conf()->mon_memory_autotune
&& pcm
== nullptr) {
487 int r
= register_cache_with_pcm();
490 << " Error while registering osdmon caches with pcm."
491 << " Cache auto tuning not enabled."
493 mon_memory_autotune
= false;
495 mon_memory_autotune
= true;
500 int OSDMonitor::_update_mon_cache_settings()
502 if (g_conf()->mon_memory_target
<= 0 ||
503 g_conf()->mon_memory_target
< mon_memory_min
||
504 g_conf()->rocksdb_cache_size
<= 0) {
508 if (pcm
== nullptr && rocksdb_binned_kv_cache
== nullptr) {
509 derr
<< __func__
<< " not using pcm and rocksdb" << dendl
;
513 uint64_t old_mon_memory_target
= mon_memory_target
;
514 uint64_t old_rocksdb_cache_size
= rocksdb_cache_size
;
516 // Set the new pcm memory cache sizes
517 mon_memory_target
= g_conf()->mon_memory_target
;
518 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
520 uint64_t base
= mon_memory_base
;
521 double fragmentation
= mon_memory_fragmentation
;
522 uint64_t target
= mon_memory_target
;
523 uint64_t min
= mon_memory_min
;
526 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
527 if (ltarget
> base
+ min
) {
528 max
= ltarget
- base
;
531 int r
= _set_cache_ratios();
533 derr
<< __func__
<< " Cache ratios for pcm could not be set."
534 << " Review the kv (rocksdb) and mon_memory_target sizes."
536 mon_memory_target
= old_mon_memory_target
;
537 rocksdb_cache_size
= old_rocksdb_cache_size
;
541 if (mon_memory_autotune
&& pcm
!= nullptr) {
542 std::lock_guard
l(balancer_lock
);
543 // set pcm cache levels
544 pcm
->set_target_memory(target
);
545 pcm
->set_min_memory(min
);
546 pcm
->set_max_memory(max
);
547 // tune memory based on new values
550 _set_new_cache_sizes();
551 dout(1) << __func__
<< " Updated mon cache setting."
552 << " target: " << target
560 int OSDMonitor::_set_cache_sizes()
562 if (g_conf()->mon_memory_autotune
) {
563 // set the new osdmon cache targets to be managed by pcm
564 mon_osd_cache_size
= g_conf()->mon_osd_cache_size
;
565 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
566 mon_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
567 mon_memory_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
568 mon_memory_target
= g_conf()->mon_memory_target
;
569 mon_memory_min
= g_conf()->mon_osd_cache_size_min
;
570 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
571 derr
<< __func__
<< " mon_memory_target:" << mon_memory_target
572 << " mon_memory_min:" << mon_memory_min
573 << ". Invalid size option(s) provided."
577 // Set the initial inc and full LRU cache sizes
578 inc_osd_cache
.set_bytes(mon_memory_min
);
579 full_osd_cache
.set_bytes(mon_memory_min
);
580 mon_memory_autotune
= g_conf()->mon_memory_autotune
;
585 bool OSDMonitor::_have_pending_crush()
587 return pending_inc
.crush
.length() > 0;
590 CrushWrapper
&OSDMonitor::_get_stable_crush()
592 return *osdmap
.crush
;
595 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
598 if (pending_inc
.crush
.length())
599 bl
= pending_inc
.crush
;
601 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
603 auto p
= bl
.cbegin();
607 void OSDMonitor::create_initial()
609 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
614 mon
->store
->get("mkfs", "osdmap", bl
);
618 newmap
.set_fsid(mon
->monmap
->fsid
);
620 newmap
.build_simple(cct
, 0, mon
->monmap
->fsid
, 0);
623 newmap
.created
= newmap
.modified
= ceph_clock_now();
625 // new clusters should sort bitwise by default.
626 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
629 CEPH_OSDMAP_RECOVERY_DELETES
|
630 CEPH_OSDMAP_PURGED_SNAPDIRS
|
631 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
632 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
633 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
634 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
635 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
636 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
637 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
639 // new cluster should require latest by default
640 if (g_conf().get_val
<bool>("mon_debug_no_require_octopus")) {
641 if (g_conf().get_val
<bool>("mon_debug_no_require_nautilus")) {
642 derr
<< __func__
<< " mon_debug_no_require_octopus and nautilus=true" << dendl
;
643 newmap
.require_osd_release
= ceph_release_t::mimic
;
645 derr
<< __func__
<< " mon_debug_no_require_octopus=true" << dendl
;
646 newmap
.require_osd_release
= ceph_release_t::nautilus
;
649 newmap
.require_osd_release
= ceph_release_t::octopus
;
650 ceph_release_t r
= ceph_release_from_name(
651 g_conf()->mon_osd_initial_require_min_compat_client
);
653 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
655 newmap
.require_min_compat_client
= r
;
658 // encode into pending incremental
659 uint64_t features
= newmap
.get_encoding_features();
660 newmap
.encode(pending_inc
.fullmap
,
661 features
| CEPH_FEATURE_RESERVED
);
662 pending_inc
.full_crc
= newmap
.get_crc();
663 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
666 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
668 s
.insert(service_name
);
669 s
.insert(OSD_PG_CREATING_PREFIX
);
670 s
.insert(OSD_METADATA_PREFIX
);
671 s
.insert(OSD_SNAP_PREFIX
);
674 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
676 // we really don't care if the version has been updated, because we may
677 // have trimmed without having increased the last committed; yet, we may
678 // need to update the in-memory manifest.
679 load_osdmap_manifest();
681 version_t version
= get_last_committed();
682 if (version
== osdmap
.epoch
)
684 ceph_assert(version
> osdmap
.epoch
);
686 dout(15) << "update_from_paxos paxos e " << version
687 << ", my e " << osdmap
.epoch
<< dendl
;
690 if (!mapping_job
->is_done()) {
691 dout(1) << __func__
<< " mapping job "
692 << mapping_job
.get() << " did not complete, "
693 << mapping_job
->shards
<< " left, canceling" << dendl
;
694 mapping_job
->abort();
702 * We will possibly have a stashed latest that *we* wrote, and we will
703 * always be sure to have the oldest full map in the first..last range
704 * due to encode_trim_extra(), which includes the oldest full map in the trim
707 * encode_trim_extra() does not however write the full map's
708 * version to 'full_latest'. This is only done when we are building the
709 * full maps from the incremental versions. But don't panic! We make sure
710 * that the following conditions find whichever full map version is newer.
712 version_t latest_full
= get_version_latest_full();
713 if (latest_full
== 0 && get_first_committed() > 1)
714 latest_full
= get_first_committed();
716 if (get_first_committed() > 1 &&
717 latest_full
< get_first_committed()) {
718 // the monitor could be just sync'ed with its peer, and the latest_full key
719 // is not encoded in the paxos commits in encode_pending(), so we need to
720 // make sure we get it pointing to a proper version.
721 version_t lc
= get_last_committed();
722 version_t fc
= get_first_committed();
724 dout(10) << __func__
<< " looking for valid full map in interval"
725 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
728 for (version_t v
= lc
; v
>= fc
; v
--) {
729 string full_key
= "full_" + stringify(v
);
730 if (mon
->store
->exists(get_service_name(), full_key
)) {
731 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
737 ceph_assert(latest_full
> 0);
738 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
739 put_version_latest_full(t
, latest_full
);
740 mon
->store
->apply_transaction(t
);
741 dout(10) << __func__
<< " updated the on-disk full map version to "
742 << latest_full
<< dendl
;
745 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
746 bufferlist latest_bl
;
747 get_version_full(latest_full
, latest_bl
);
748 ceph_assert(latest_bl
.length() != 0);
749 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
751 osdmap
.decode(latest_bl
);
755 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
756 auto p
= bl
.cbegin();
757 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
758 creating_pgs
.decode(p
);
759 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
760 << creating_pgs
.last_scan_epoch
761 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
763 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
767 // walk through incrementals
768 MonitorDBStore::TransactionRef t
;
770 while (version
> osdmap
.epoch
) {
772 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
773 ceph_assert(err
== 0);
774 ceph_assert(inc_bl
.length());
775 // set priority cache manager levels if the osdmap is
776 // being populated for the first time.
777 if (mon_memory_autotune
&& pcm
== nullptr) {
778 int r
= register_cache_with_pcm();
781 << " Error while registering osdmon caches with pcm."
782 << " Proceeding without cache auto tuning."
787 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
789 OSDMap::Incremental
inc(inc_bl
);
790 err
= osdmap
.apply_incremental(inc
);
791 ceph_assert(err
== 0);
794 t
.reset(new MonitorDBStore::Transaction
);
796 // Write out the full map for all past epochs. Encode the full
797 // map with the same features as the incremental. If we don't
798 // know, use the quorum features. If we don't know those either,
799 // encode with all features.
800 uint64_t f
= inc
.encode_features
;
802 f
= mon
->get_quorum_con_features();
806 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
807 tx_size
+= full_bl
.length();
809 bufferlist orig_full_bl
;
810 get_version_full(osdmap
.epoch
, orig_full_bl
);
811 if (orig_full_bl
.length()) {
812 // the primary provided the full map
813 ceph_assert(inc
.have_crc
);
814 if (inc
.full_crc
!= osdmap
.crc
) {
815 // This will happen if the mons were running mixed versions in
816 // the past or some other circumstance made the full encoded
817 // maps divergent. Reloading here will bring us back into
818 // sync with the primary for this and all future maps. OSDs
819 // will also be brought back into sync when they discover the
820 // crc mismatch and request a full map from a mon.
821 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
824 dout(20) << __func__
<< " my (bad) full osdmap:\n";
825 JSONFormatter
jf(true);
826 jf
.dump_object("osdmap", osdmap
);
828 *_dout
<< "\nhexdump:\n";
829 full_bl
.hexdump(*_dout
);
833 osdmap
.decode(orig_full_bl
);
835 dout(20) << __func__
<< " canonical full osdmap:\n";
836 JSONFormatter
jf(true);
837 jf
.dump_object("osdmap", osdmap
);
839 *_dout
<< "\nhexdump:\n";
840 orig_full_bl
.hexdump(*_dout
);
844 ceph_assert(!inc
.have_crc
);
845 put_version_full(t
, osdmap
.epoch
, full_bl
);
847 put_version_latest_full(t
, osdmap
.epoch
);
850 dout(1) << osdmap
<< dendl
;
852 if (osdmap
.epoch
== 1) {
853 t
->erase("mkfs", "osdmap");
856 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
857 mon
->store
->apply_transaction(t
);
858 t
= MonitorDBStore::TransactionRef();
861 for (const auto &osd_state
: inc
.new_state
) {
862 if (osd_state
.second
& CEPH_OSD_UP
) {
863 // could be marked up *or* down, but we're too lazy to check which
864 last_osd_report
.erase(osd_state
.first
);
866 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
867 // could be created *or* destroyed, but we can safely drop it
868 osd_epochs
.erase(osd_state
.first
);
874 mon
->store
->apply_transaction(t
);
877 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
878 if (osdmap
.is_out(o
))
880 auto found
= down_pending_out
.find(o
);
881 if (osdmap
.is_down(o
)) {
882 // populate down -> out map
883 if (found
== down_pending_out
.end()) {
884 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
885 down_pending_out
[o
] = ceph_clock_now();
888 if (found
!= down_pending_out
.end()) {
889 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
890 down_pending_out
.erase(found
);
894 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
897 check_pg_creates_subs();
899 share_map_with_random_osd();
903 // make sure our feature bits reflect the latest map
904 update_msgr_features();
906 if (!mon
->is_leader()) {
907 // will be called by on_active() on the leader, avoid doing so twice
912 int OSDMonitor::register_cache_with_pcm()
914 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
915 derr
<< __func__
<< " Invalid memory size specified for mon caches."
916 << " Caches will not be auto-tuned."
920 uint64_t base
= mon_memory_base
;
921 double fragmentation
= mon_memory_fragmentation
;
922 // For calculating total target memory, consider rocksdb cache size.
923 uint64_t target
= mon_memory_target
;
924 uint64_t min
= mon_memory_min
;
927 // Apply the same logic as in bluestore to set the max amount
928 // of memory to use for cache. Assume base memory for OSDMaps
929 // and then add in some overhead for fragmentation.
930 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
931 if (ltarget
> base
+ min
) {
932 max
= ltarget
- base
;
935 rocksdb_binned_kv_cache
= mon
->store
->get_priority_cache();
936 if (!rocksdb_binned_kv_cache
) {
937 derr
<< __func__
<< " not using rocksdb" << dendl
;
941 int r
= _set_cache_ratios();
943 derr
<< __func__
<< " Cache ratios for pcm could not be set."
944 << " Review the kv (rocksdb) and mon_memory_target sizes."
949 pcm
= std::make_shared
<PriorityCache::Manager
>(
950 cct
, min
, max
, target
, true);
951 pcm
->insert("kv", rocksdb_binned_kv_cache
, true);
952 pcm
->insert("inc", inc_cache
, true);
953 pcm
->insert("full", full_cache
, true);
954 dout(1) << __func__
<< " pcm target: " << target
955 << " pcm max: " << max
956 << " pcm min: " << min
957 << " inc_osd_cache size: " << inc_osd_cache
.get_size()
962 int OSDMonitor::_set_cache_ratios()
964 double old_cache_kv_ratio
= cache_kv_ratio
;
966 // Set the cache ratios for kv(rocksdb), inc and full caches
967 cache_kv_ratio
= (double)rocksdb_cache_size
/ (double)mon_memory_target
;
968 if (cache_kv_ratio
>= 1.0) {
969 derr
<< __func__
<< " Cache kv ratio (" << cache_kv_ratio
970 << ") must be in range [0,<1.0]."
972 cache_kv_ratio
= old_cache_kv_ratio
;
975 rocksdb_binned_kv_cache
->set_cache_ratio(cache_kv_ratio
);
976 cache_inc_ratio
= cache_full_ratio
= (1.0 - cache_kv_ratio
) / 2;
977 inc_cache
->set_cache_ratio(cache_inc_ratio
);
978 full_cache
->set_cache_ratio(cache_full_ratio
);
980 dout(1) << __func__
<< " kv ratio " << cache_kv_ratio
981 << " inc ratio " << cache_inc_ratio
982 << " full ratio " << cache_full_ratio
987 void OSDMonitor::start_mapping()
989 // initiate mapping job
991 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
993 mapping_job
->abort();
995 if (!osdmap
.get_pools().empty()) {
996 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
997 mapping_job
= mapping
.start_update(osdmap
, mapper
,
998 g_conf()->mon_osd_mapping_pgs_per_chunk
);
999 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
1000 << " at " << fin
->start
<< dendl
;
1001 mapping_job
->set_finish_event(fin
);
1003 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
1004 mapping_job
= nullptr;
1008 void OSDMonitor::update_msgr_features()
1011 types
.insert((int)entity_name_t::TYPE_OSD
);
1012 types
.insert((int)entity_name_t::TYPE_CLIENT
);
1013 types
.insert((int)entity_name_t::TYPE_MDS
);
1014 types
.insert((int)entity_name_t::TYPE_MON
);
1015 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
1017 uint64_t features
= osdmap
.get_features(*q
, &mask
);
1018 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
1019 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1020 ceph::net::Policy p
= mon
->messenger
->get_policy(*q
);
1021 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1022 mon
->messenger
->set_policy(*q
, p
);
1027 void OSDMonitor::on_active()
1031 if (mon
->is_leader()) {
1032 mon
->clog
->debug() << "osdmap " << osdmap
;
1033 if (!priority_convert
) {
1034 // Only do this once at start-up
1035 convert_pool_priorities();
1036 priority_convert
= true;
1039 list
<MonOpRequestRef
> ls
;
1040 take_all_failures(ls
);
1041 while (!ls
.empty()) {
1042 MonOpRequestRef op
= ls
.front();
1043 op
->mark_osdmon_event(__func__
);
1051 void OSDMonitor::on_restart()
1053 last_osd_report
.clear();
1056 void OSDMonitor::on_shutdown()
1058 dout(10) << __func__
<< dendl
;
1060 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1062 mapping_job
->abort();
1065 // discard failure info, waiters
1066 list
<MonOpRequestRef
> ls
;
1067 take_all_failures(ls
);
1071 void OSDMonitor::update_logger()
1073 dout(10) << "update_logger" << dendl
;
1075 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1076 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1077 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1078 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
1081 void OSDMonitor::create_pending()
1083 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
1084 pending_inc
.fsid
= mon
->monmap
->fsid
;
1085 pending_metadata
.clear();
1086 pending_metadata_rm
.clear();
1087 pending_pseudo_purged_snaps
.clear();
1089 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
1091 // safety checks (this shouldn't really happen)
1093 if (osdmap
.backfillfull_ratio
<= 0) {
1094 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
1095 if (pending_inc
.new_backfillfull_ratio
> 1.0)
1096 pending_inc
.new_backfillfull_ratio
/= 100;
1097 dout(1) << __func__
<< " setting backfillfull_ratio = "
1098 << pending_inc
.new_backfillfull_ratio
<< dendl
;
1100 if (osdmap
.full_ratio
<= 0) {
1101 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
1102 if (pending_inc
.new_full_ratio
> 1.0)
1103 pending_inc
.new_full_ratio
/= 100;
1104 dout(1) << __func__
<< " setting full_ratio = "
1105 << pending_inc
.new_full_ratio
<< dendl
;
1107 if (osdmap
.nearfull_ratio
<= 0) {
1108 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
1109 if (pending_inc
.new_nearfull_ratio
> 1.0)
1110 pending_inc
.new_nearfull_ratio
/= 100;
1111 dout(1) << __func__
<< " setting nearfull_ratio = "
1112 << pending_inc
.new_nearfull_ratio
<< dendl
;
1116 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1118 if (osdmap
.crush
->has_legacy_rule_ids()) {
1119 CrushWrapper newcrush
;
1120 _get_pending_crush(newcrush
);
1122 // First, for all pools, work out which rule they really used
1123 // by resolving ruleset to rule.
1124 for (const auto &i
: osdmap
.get_pools()) {
1125 const auto pool_id
= i
.first
;
1126 const auto &pool
= i
.second
;
1127 int new_rule_id
= newcrush
.find_rule(pool
.crush_rule
,
1128 pool
.type
, pool
.size
);
1130 dout(1) << __func__
<< " rewriting pool "
1131 << osdmap
.get_pool_name(pool_id
) << " crush ruleset "
1132 << pool
.crush_rule
<< " -> rule id " << new_rule_id
<< dendl
;
1133 if (pending_inc
.new_pools
.count(pool_id
) == 0) {
1134 pending_inc
.new_pools
[pool_id
] = pool
;
1136 pending_inc
.new_pools
[pool_id
].crush_rule
= new_rule_id
;
1139 // Now, go ahead and renumber all the rules so that their
1140 // rule_id field corresponds to their position in the array
1141 auto old_to_new
= newcrush
.renumber_rules();
1142 dout(1) << __func__
<< " Rewrote " << old_to_new
<< " crush IDs:" << dendl
;
1143 for (const auto &i
: old_to_new
) {
1144 dout(1) << __func__
<< " " << i
.first
<< " -> " << i
.second
<< dendl
;
1146 pending_inc
.crush
.clear();
1147 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
1152 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1153 const OSDMap
& nextmap
)
1155 dout(10) << __func__
<< dendl
;
1156 creating_pgs_t pending_creatings
;
1158 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1159 pending_creatings
= creating_pgs
;
1161 // check for new or old pools
1162 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1163 unsigned queued
= 0;
1164 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1167 &pending_creatings
);
1168 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1171 &pending_creatings
);
1172 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1173 for (auto deleted_pool
: inc
.old_pools
) {
1174 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1175 dout(10) << __func__
<< " " << removed
1176 << " pg removed because containing pool deleted: "
1177 << deleted_pool
<< dendl
;
1178 last_epoch_clean
.remove_pool(deleted_pool
);
1180 // pgmon updates its creating_pgs in check_osd_map() which is called by
1181 // on_active() and check_osd_map() could be delayed if lease expires, so its
1182 // creating_pgs could be stale in comparison with the one of osdmon. let's
1183 // trim them here. otherwise, they will be added back after being erased.
1184 unsigned removed
= 0;
1185 for (auto& pg
: pending_created_pgs
) {
1186 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1187 pending_creatings
.created_pools
.insert(pg
.pool());
1188 removed
+= pending_creatings
.pgs
.erase(pg
);
1190 pending_created_pgs
.clear();
1191 dout(10) << __func__
<< " " << removed
1192 << " pgs removed because they're created" << dendl
;
1193 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1196 // filter out any pgs that shouldn't exist.
1198 auto i
= pending_creatings
.pgs
.begin();
1199 while (i
!= pending_creatings
.pgs
.end()) {
1200 if (!nextmap
.pg_exists(i
->first
)) {
1201 dout(10) << __func__
<< " removing pg " << i
->first
1202 << " which should not exist" << dendl
;
1203 i
= pending_creatings
.pgs
.erase(i
);
1211 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1212 const auto total
= pending_creatings
.pgs
.size();
1213 while (pending_creatings
.pgs
.size() < max
&&
1214 !pending_creatings
.queue
.empty()) {
1215 auto p
= pending_creatings
.queue
.begin();
1216 int64_t poolid
= p
->first
;
1217 dout(10) << __func__
<< " pool " << poolid
1218 << " created " << p
->second
.created
1219 << " modified " << p
->second
.modified
1220 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1222 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1223 p
->second
.end
- p
->second
.start
);
1224 ps_t first
= p
->second
.start
;
1225 ps_t end
= first
+ n
;
1226 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1227 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1228 // NOTE: use the *current* epoch as the PG creation epoch so that the
1229 // OSD does not have to generate a long set of PastIntervals.
1230 pending_creatings
.pgs
.emplace(
1232 creating_pgs_t::pg_create_info(inc
.epoch
,
1233 p
->second
.modified
));
1234 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1236 p
->second
.start
= end
;
1237 if (p
->second
.done()) {
1238 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1239 pending_creatings
.queue
.erase(p
);
1241 dout(10) << __func__
<< " pool " << poolid
1242 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1246 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1247 << " pools" << dendl
;
1249 if (mon
->monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1250 // walk creating pgs' history and past_intervals forward
1251 for (auto& i
: pending_creatings
.pgs
) {
1252 // this mirrors PG::start_peering_interval()
1253 pg_t pgid
= i
.first
;
1255 // this is a bit imprecise, but sufficient?
1256 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1257 const pg_pool_t
*pi
;
1258 bool operator()(const set
<pg_shard_t
> &have
) const {
1259 return have
.size() >= pi
->min_size
;
1261 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1262 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1264 vector
<int> up
, acting
;
1265 int up_primary
, acting_primary
;
1266 nextmap
.pg_to_up_acting_osds(
1267 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1268 if (i
.second
.history
.epoch_created
== 0) {
1269 // new pg entry, set it up
1271 i
.second
.acting
= acting
;
1272 i
.second
.up_primary
= up_primary
;
1273 i
.second
.acting_primary
= acting_primary
;
1274 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1275 i
.second
.create_stamp
);
1276 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1277 << " up " << i
.second
.up
1278 << " p " << i
.second
.up_primary
1279 << " acting " << i
.second
.acting
1280 << " p " << i
.second
.acting_primary
1281 << " history " << i
.second
.history
1282 << " past_intervals " << i
.second
.past_intervals
1285 std::stringstream debug
;
1286 if (PastIntervals::check_new_interval(
1287 i
.second
.acting_primary
, acting_primary
,
1288 i
.second
.acting
, acting
,
1289 i
.second
.up_primary
, up_primary
,
1291 i
.second
.history
.same_interval_since
,
1292 i
.second
.history
.last_epoch_clean
,
1297 &i
.second
.past_intervals
,
1299 epoch_t e
= inc
.epoch
;
1300 i
.second
.history
.same_interval_since
= e
;
1301 if (i
.second
.up
!= up
) {
1302 i
.second
.history
.same_up_since
= e
;
1304 if (i
.second
.acting_primary
!= acting_primary
) {
1305 i
.second
.history
.same_primary_since
= e
;
1308 osdmap
.get_pg_num(pgid
.pool()),
1309 nextmap
.get_pg_num(pgid
.pool()),
1311 i
.second
.history
.last_epoch_split
= e
;
1313 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1314 << " up " << i
.second
.up
<< " -> " << up
1315 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1316 << " acting " << i
.second
.acting
<< " -> " << acting
1317 << " p " << i
.second
.acting_primary
<< " -> "
1319 << " history " << i
.second
.history
1320 << " past_intervals " << i
.second
.past_intervals
1322 dout(20) << " debug: " << debug
.str() << dendl
;
1324 i
.second
.acting
= acting
;
1325 i
.second
.up_primary
= up_primary
;
1326 i
.second
.acting_primary
= acting_primary
;
1331 dout(10) << __func__
1332 << " " << (pending_creatings
.pgs
.size() - total
)
1333 << "/" << pending_creatings
.pgs
.size()
1334 << " pgs added from queued pools" << dendl
;
1335 return pending_creatings
;
1338 void OSDMonitor::maybe_prime_pg_temp()
1341 if (pending_inc
.crush
.length()) {
1342 dout(10) << __func__
<< " new crush map, all" << dendl
;
1346 if (!pending_inc
.new_up_client
.empty()) {
1347 dout(10) << __func__
<< " new up osds, all" << dendl
;
1351 // check for interesting OSDs
1353 for (auto p
= pending_inc
.new_state
.begin();
1354 !all
&& p
!= pending_inc
.new_state
.end();
1356 if ((p
->second
& CEPH_OSD_UP
) &&
1357 osdmap
.is_up(p
->first
)) {
1358 osds
.insert(p
->first
);
1361 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
1362 !all
&& p
!= pending_inc
.new_weight
.end();
1364 if (p
->second
< osdmap
.get_weight(p
->first
)) {
1366 osds
.insert(p
->first
);
1368 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1374 if (!all
&& osds
.empty())
1379 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1380 if (estimate
> mapping
.get_num_pgs() *
1381 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1382 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1383 << osds
.size() << " osds >= "
1384 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1385 << mapping
.get_num_pgs() << " pgs, all"
1389 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1390 << osds
.size() << " osds" << dendl
;
1395 next
.deepish_copy_from(osdmap
);
1396 next
.apply_incremental(pending_inc
);
1398 if (next
.get_pools().empty()) {
1399 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1401 PrimeTempJob
job(next
, this);
1402 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1403 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1404 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1406 dout(10) << __func__
<< " did not finish in "
1407 << g_conf()->mon_osd_prime_pg_temp_max_time
1408 << ", stopping" << dendl
;
1412 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1413 utime_t stop
= ceph_clock_now();
1414 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1415 const int chunk
= 1000;
1417 std::unordered_set
<pg_t
> did_pgs
;
1418 for (auto osd
: osds
) {
1419 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1420 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1421 for (auto pgid
: pgs
) {
1422 if (!did_pgs
.insert(pgid
).second
) {
1425 prime_pg_temp(next
, pgid
);
1428 if (ceph_clock_now() > stop
) {
1429 dout(10) << __func__
<< " consumed more than "
1430 << g_conf()->mon_osd_prime_pg_temp_max_time
1431 << " seconds, stopping"
1441 void OSDMonitor::prime_pg_temp(
1445 // TODO: remove this creating_pgs direct access?
1446 if (creating_pgs
.pgs
.count(pgid
)) {
1449 if (!osdmap
.pg_exists(pgid
)) {
1453 vector
<int> up
, acting
;
1454 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1456 vector
<int> next_up
, next_acting
;
1457 int next_up_primary
, next_acting_primary
;
1458 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1459 &next_acting
, &next_acting_primary
);
1460 if (acting
== next_acting
&&
1461 !(up
!= acting
&& next_up
== next_acting
))
1462 return; // no change since last epoch
1465 return; // if previously empty now we can be no worse off
1466 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1467 if (pool
&& acting
.size() < pool
->min_size
)
1468 return; // can be no worse off than before
1470 if (next_up
== next_acting
) {
1472 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1476 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1477 << " -> " << next_up
<< "/" << next_acting
1478 << ", priming " << acting
1481 std::lock_guard
l(prime_pg_temp_lock
);
1482 // do not touch a mapping if a change is pending
1483 pending_inc
.new_pg_temp
.emplace(
1485 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1493 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1495 dout(10) << "encode_pending e " << pending_inc
.epoch
1499 dout(1) << __func__
<< " osdmap full prune encoded e"
1500 << pending_inc
.epoch
<< dendl
;
1503 // finalize up pending_inc
1504 pending_inc
.modified
= ceph_clock_now();
1506 int r
= pending_inc
.propagate_snaps_to_tiers(cct
, osdmap
);
1507 ceph_assert(r
== 0);
1510 if (!mapping_job
->is_done()) {
1511 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1512 << mapping_job
.get() << " did not complete, "
1513 << mapping_job
->shards
<< " left" << dendl
;
1514 mapping_job
->abort();
1515 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1516 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1517 << mapping_job
.get() << " is prior epoch "
1518 << mapping
.get_epoch() << dendl
;
1520 if (g_conf()->mon_osd_prime_pg_temp
) {
1521 maybe_prime_pg_temp();
1524 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1525 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1528 mapping_job
.reset();
1530 // ensure we don't have blank new_state updates. these are interrpeted as
1531 // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p
= pending_inc
.new_state
.begin();
1533 while (p
!= pending_inc
.new_state
.end()) {
1534 if (p
->second
== 0) {
1535 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1536 p
= pending_inc
.new_state
.erase(p
);
1538 if (p
->second
& CEPH_OSD_UP
) {
1539 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1544 if (!pending_inc
.new_up_client
.empty()) {
1545 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1547 for (auto& i
: pending_inc
.new_weight
) {
1548 if (i
.first
>= osdmap
.max_osd
) {
1550 // new osd is already marked in
1551 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1554 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1555 // existing osd marked in or out
1556 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1563 tmp
.deepish_copy_from(osdmap
);
1564 tmp
.apply_incremental(pending_inc
);
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector
<pg_t
> pgs_to_check
;
1575 tmp
.get_upmap_pgs(&pgs_to_check
);
1576 if (pgs_to_check
.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1578 // not enough pgs, do it inline
1579 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1581 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1582 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1590 bufferlist creatings_bl
;
1591 uint64_t features
= CEPH_FEATURES_ALL
;
1592 if (mon
->monmap
->min_mon_release
< ceph_release_t::octopus
) {
1593 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1595 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1597 encode(pending_creatings
, creatings_bl
, features
);
1598 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i
: tmp
.get_pools()) {
1602 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc
.new_pools
.count(i
.first
)) {
1605 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1608 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1609 !pending_creatings
.still_creating_pool(i
.first
)) {
1610 dout(10) << __func__
<< " done creating pool " << i
.first
1611 << ", clearing CREATING flag" << dendl
;
1612 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1613 pending_inc
.new_pools
[i
.first
] = i
.second
;
1615 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set
<int64_t> full_pool_ids
;
1623 set
<int64_t> backfillfull_pool_ids
;
1624 set
<int64_t> nearfull_pool_ids
;
1625 tmp
.get_full_pools(cct
,
1627 &backfillfull_pool_ids
,
1628 &nearfull_pool_ids
);
1629 if (full_pool_ids
.empty() ||
1630 backfillfull_pool_ids
.empty() ||
1631 nearfull_pool_ids
.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
1633 // try cancel any improper nearfull/backfillfull/full pool
1635 for (auto &pool
: tmp
.get_pools()) {
1636 auto p
= pool
.first
;
1637 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1638 nearfull_pool_ids
.empty()) {
1639 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1640 << "'s nearfull flag" << dendl
;
1641 if (pending_inc
.new_pools
.count(p
) == 0) {
1642 // load original pool info first!
1643 pending_inc
.new_pools
[p
] = pool
.second
;
1645 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1647 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1648 backfillfull_pool_ids
.empty()) {
1649 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1650 << "'s backfillfull flag" << dendl
;
1651 if (pending_inc
.new_pools
.count(p
) == 0) {
1652 pending_inc
.new_pools
[p
] = pool
.second
;
1654 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1656 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1657 full_pool_ids
.empty()) {
1658 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1659 // set by EQUOTA, skipping
1662 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1663 << "'s full flag" << dendl
;
1664 if (pending_inc
.new_pools
.count(p
) == 0) {
1665 pending_inc
.new_pools
[p
] = pool
.second
;
1667 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1671 if (!full_pool_ids
.empty()) {
1672 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl
;
1674 for (auto &p
: full_pool_ids
) {
1675 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1678 if (pending_inc
.new_pools
.count(p
) == 0) {
1679 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1681 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1682 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1683 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool
: tmp
.get_pools()) {
1687 auto p
= pool
.first
;
1688 if (full_pool_ids
.count(p
)) {
1689 // skip pools we have just marked as full above
1692 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1693 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1698 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1699 << "'s full flag" << dendl
;
1700 if (pending_inc
.new_pools
.count(p
) == 0) {
1701 pending_inc
.new_pools
[p
] = pool
.second
;
1703 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1706 if (!backfillfull_pool_ids
.empty()) {
1707 for (auto &p
: backfillfull_pool_ids
) {
1708 if (full_pool_ids
.count(p
)) {
1709 // skip pools we have already considered as full above
1712 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1713 // make sure FLAG_FULL is truly set, so we are safe not
1714 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1718 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1719 // don't bother if pool is already marked as backfillfull
1722 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1723 << "'s as backfillfull" << dendl
;
1724 if (pending_inc
.new_pools
.count(p
) == 0) {
1725 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1727 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1728 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool
: tmp
.get_pools()) {
1733 auto p
= pool
.first
;
1734 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1735 // skip pools we have just marked as backfillfull/full above
1738 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1739 // and don't touch if currently is not backfillfull
1742 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1743 << "'s backfillfull flag" << dendl
;
1744 if (pending_inc
.new_pools
.count(p
) == 0) {
1745 pending_inc
.new_pools
[p
] = pool
.second
;
1747 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1750 if (!nearfull_pool_ids
.empty()) {
1751 for (auto &p
: nearfull_pool_ids
) {
1752 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1755 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1756 // make sure FLAG_FULL is truly set, so we are safe not
1757 // to set a extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1761 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1762 // don't bother if pool is already marked as nearfull
1765 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1766 << "'s as nearfull" << dendl
;
1767 if (pending_inc
.new_pools
.count(p
) == 0) {
1768 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1770 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool
: tmp
.get_pools()) {
1775 auto p
= pool
.first
;
1776 if (full_pool_ids
.count(p
) ||
1777 backfillfull_pool_ids
.count(p
) ||
1778 nearfull_pool_ids
.count(p
)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1783 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1784 // and don't touch if currently is not nearfull
1787 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1788 << "'s nearfull flag" << dendl
;
1789 if (pending_inc
.new_pools
.count(p
) == 0) {
1790 pending_inc
.new_pools
[p
] = pool
.second
;
1792 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1796 // min_compat_client?
1797 if (!tmp
.require_min_compat_client
) {
1798 auto mv
= tmp
.get_min_compat_client();
1799 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1800 << "required " << mv
<< dendl
;
1801 mon
->clog
->info() << "setting require_min_compat_client to currently "
1802 << "required " << mv
;
1803 pending_inc
.new_require_min_compat_client
= mv
;
1806 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1807 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1808 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1809 // add creating flags?
1810 for (auto& i
: tmp
.get_pools()) {
1811 if (pending_creatings
.still_creating_pool(i
.first
)) {
1812 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1814 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1815 pending_inc
.new_pools
[i
.first
] = i
.second
;
1817 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i
: tmp
.blacklist
) {
1823 a
.set_type(entity_addr_t::TYPE_ANY
);
1824 pending_inc
.new_blacklist
[a
] = i
.second
;
1825 pending_inc
.old_blacklist
.push_back(i
.first
);
1829 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1830 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1831 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid
, pi
] : tmp
.pools
) {
1835 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1836 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1837 pending_inc
.new_pools
[poolid
] = pi
;
1839 dout(10) << __func__
<< " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl
;
1841 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1843 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1844 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1845 pending_inc
.new_pools
[poolid
] = pi
;
1847 dout(10) << __func__
<< " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl
;
1849 pending_inc
.new_pools
[poolid
].cache_mode
=
1850 pg_pool_t::CACHEMODE_READPROXY
;
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid
, pi
] : tmp
.pools
) {
1856 if (pi
.removed_snaps
.empty()) {
1859 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1860 pending_inc
.new_pools
[poolid
] = pi
;
1862 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1864 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1871 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
1872 it
->lower_bound("purged_snap_");
1873 map
<int64_t,snap_interval_set_t
> combined
;
1874 while (it
->valid()) {
1875 if (it
->key().find("purged_snap_") != 0) {
1878 string k
= it
->key();
1879 long long unsigned pool
;
1880 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1882 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1884 bufferlist v
= it
->value();
1885 auto p
= v
.cbegin();
1886 snapid_t begin
, end
;
1887 ceph::decode(begin
, p
);
1888 ceph::decode(end
, p
);
1889 combined
[pool
].insert(begin
, end
- begin
);
1893 if (!combined
.empty()) {
1894 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1896 ceph::encode(combined
, v
);
1897 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1898 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1902 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1909 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1914 for (auto i
= pending_inc
.new_state
.begin();
1915 i
!= pending_inc
.new_state
.end();
1917 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1918 if (s
& CEPH_OSD_UP
) {
1919 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1920 // Reset laggy parameters if failure interval exceeds a threshold.
1921 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(i
->first
);
1922 if ((xi
.laggy_probability
|| xi
.laggy_interval
) && xi
.down_stamp
.sec()) {
1923 int last_failure_interval
= pending_inc
.modified
.sec() - xi
.down_stamp
.sec();
1924 if (grace_interval_threshold_exceeded(last_failure_interval
)) {
1925 set_default_laggy_params(i
->first
);
1929 if (s
& CEPH_OSD_EXISTS
)
1930 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1932 for (auto i
= pending_inc
.new_up_client
.begin();
1933 i
!= pending_inc
.new_up_client
.end();
1935 //FIXME: insert cluster addresses too
1936 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1938 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1939 i
!= pending_inc
.new_weight
.end();
1941 if (i
->second
== CEPH_OSD_OUT
) {
1942 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1943 } else if (i
->second
== CEPH_OSD_IN
) {
1944 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1946 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1950 // features for osdmap and its incremental
1953 // encode full map and determine its crc
1956 tmp
.deepish_copy_from(osdmap
);
1957 tmp
.apply_incremental(pending_inc
);
1959 // determine appropriate features
1960 features
= tmp
.get_encoding_features();
1961 dout(10) << __func__
<< " encoding full map with "
1962 << tmp
.require_osd_release
1963 << " features " << features
<< dendl
;
1965 // the features should be a subset of the mon quorum's features!
1966 ceph_assert((features
& ~mon
->get_quorum_con_features()) == 0);
1969 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1970 pending_inc
.full_crc
= tmp
.get_crc();
1972 // include full map in the txn. note that old monitors will
1973 // overwrite this. new ones will now skip the local full map
1974 // encode and reload from this.
1975 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1979 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
1981 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1983 dout(20) << " full_crc " << tmp
.get_crc()
1984 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1986 /* put everything in the transaction */
1987 put_version(t
, pending_inc
.epoch
, bl
);
1988 put_last_committed(t
, pending_inc
.epoch
);
1991 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1992 p
!= pending_metadata
.end();
1994 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1995 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1996 p
!= pending_metadata_rm
.end();
1998 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1999 pending_metadata
.clear();
2000 pending_metadata_rm
.clear();
2003 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
2004 !pending_inc
.new_purged_snaps
.empty()) {
2005 // all snaps purged this epoch (across all pools)
2006 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
2008 encode(pending_inc
.new_purged_snaps
, v
);
2009 t
->put(OSD_SNAP_PREFIX
, k
, v
);
2011 for (auto& i
: pending_inc
.new_purged_snaps
) {
2012 for (auto q
= i
.second
.begin();
2013 q
!= i
.second
.end();
2015 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
2020 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
2021 for (auto snap
: snaps
) {
2022 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2029 health_check_map_t next
;
2030 tmp
.check_health(cct
, &next
);
2031 encode_health(next
, t
);
2034 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2037 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2041 auto p
= bl
.cbegin();
2044 catch (buffer::error
& e
) {
2046 *err
<< "osd." << osd
<< " metadata is corrupt";
2052 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2054 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2055 if (osdmap
.is_up(osd
)) {
2056 map
<string
,string
> meta
;
2057 load_metadata(osd
, meta
, nullptr);
2058 auto p
= meta
.find(field
);
2059 if (p
== meta
.end()) {
2060 (*out
)["unknown"]++;
2062 (*out
)[p
->second
]++;
2068 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2070 map
<string
,int> by_val
;
2071 count_metadata(field
, &by_val
);
2072 f
->open_object_section(field
.c_str());
2073 for (auto& p
: by_val
) {
2074 f
->dump_int(p
.first
.c_str(), p
.second
);
2079 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2081 map
<string
, string
> metadata
;
2082 int r
= load_metadata(osd
, metadata
, nullptr);
2086 auto it
= metadata
.find("osd_objectstore");
2087 if (it
== metadata
.end())
2093 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2094 const pg_pool_t
&pool
,
2097 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2098 // since filestore osds could always join the pool later
2099 set
<int> checked_osds
;
2100 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2101 vector
<int> up
, acting
;
2102 pg_t
pgid(ps
, pool_id
);
2103 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2104 for (int osd
: up
) {
2105 if (checked_osds
.find(osd
) != checked_osds
.end())
2107 string objectstore_type
;
2108 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2109 // allow with missing metadata, e.g. due to an osd never booting yet
2110 if (r
< 0 || objectstore_type
== "bluestore") {
2111 checked_osds
.insert(osd
);
2114 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2121 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2123 map
<string
,string
> m
;
2124 if (int r
= load_metadata(osd
, m
, err
))
2126 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2127 f
->dump_string(p
->first
.c_str(), p
->second
);
2131 void OSDMonitor::print_nodes(Formatter
*f
)
2133 // group OSDs by their hosts
2134 map
<string
, list
<int> > osds
; // hostname => osd
2135 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2136 map
<string
, string
> m
;
2137 if (load_metadata(osd
, m
, NULL
)) {
2140 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2141 if (hostname
== m
.end()) {
2142 // not likely though
2145 osds
[hostname
->second
].push_back(osd
);
2148 dump_services(f
, osds
, "osd");
2151 void OSDMonitor::share_map_with_random_osd()
2153 if (osdmap
.get_num_up_osds() == 0) {
2154 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2158 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
2160 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2164 dout(10) << "committed, telling random " << s
->name
2165 << " all about it" << dendl
;
2167 // get feature of the peer
2168 // use quorum_con_features, if it's an anonymous connection.
2169 uint64_t features
= s
->con_features
? s
->con_features
:
2170 mon
->get_quorum_con_features();
2171 // whatev, they'll request more if they need it
2172 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2173 s
->con
->send_message(m
);
2174 // NOTE: do *not* record osd has up to this epoch (as we do
2175 // elsewhere) as they may still need to request older values.
2178 version_t
OSDMonitor::get_trim_to() const
2180 if (mon
->get_quorum().empty()) {
2181 dout(10) << __func__
<< ": quorum not formed" << dendl
;
2186 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2187 if (!creating_pgs
.pgs
.empty()) {
2192 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2194 << " blocking osdmap trim"
2195 " ('mon_debug_block_osdmap_trim' set to 'true')"
2201 epoch_t floor
= get_min_last_epoch_clean();
2202 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2203 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2204 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2205 floor
= g_conf()->mon_osd_force_trim_to
;
2206 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2208 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2209 if (floor
+ min
> get_last_committed()) {
2210 if (min
< get_last_committed())
2211 floor
= get_last_committed() - min
;
2215 if (floor
> get_first_committed())
2221 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2223 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2224 // also scan osd epochs
2225 // don't trim past the oldest reported osd epoch
2226 for (auto& osd_epoch
: osd_epochs
) {
2227 if (osd_epoch
.second
< floor
&&
2228 osdmap
.is_in(osd_epoch
.first
)) {
2229 floor
= osd_epoch
.second
;
2235 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2238 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2240 get_version_full(first
, bl
);
2241 put_version_full(tx
, first
, bl
);
2243 if (has_osdmap_manifest
&&
2244 first
> osdmap_manifest
.get_first_pinned()) {
2245 _prune_update_trimmed(tx
, first
);
2250 /* full osdmap prune
2252 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2255 void OSDMonitor::load_osdmap_manifest()
2257 bool store_has_manifest
=
2258 mon
->store
->exists(get_service_name(), "osdmap_manifest");
2260 if (!store_has_manifest
) {
2261 if (!has_osdmap_manifest
) {
2265 dout(20) << __func__
2266 << " dropping osdmap manifest from memory." << dendl
;
2267 osdmap_manifest
= osdmap_manifest_t();
2268 has_osdmap_manifest
= false;
2272 dout(20) << __func__
2273 << " osdmap manifest detected in store; reload." << dendl
;
2275 bufferlist manifest_bl
;
2276 int r
= get_value("osdmap_manifest", manifest_bl
);
2278 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2279 ceph_abort_msg("error reading manifest");
2281 osdmap_manifest
.decode(manifest_bl
);
2282 has_osdmap_manifest
= true;
2284 dout(10) << __func__
<< " store osdmap manifest pinned ("
2285 << osdmap_manifest
.get_first_pinned()
2287 << osdmap_manifest
.get_last_pinned()
2292 bool OSDMonitor::should_prune() const
2294 version_t first
= get_first_committed();
2295 version_t last
= get_last_committed();
2296 version_t min_osdmap_epochs
=
2297 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2298 version_t prune_min
=
2299 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2300 version_t prune_interval
=
2301 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2302 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2303 version_t last_to_pin
= last
- min_osdmap_epochs
;
2305 // Make it or break it constraints.
2307 // If any of these conditions fails, we will not prune, regardless of
2308 // whether we have an on-disk manifest with an on-going pruning state.
2310 if ((last
- first
) <= min_osdmap_epochs
) {
2311 // between the first and last committed epochs, we don't have
2312 // enough epochs to trim, much less to prune.
2313 dout(10) << __func__
2314 << " currently holding only " << (last
- first
)
2315 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2316 << "); do not prune."
2320 } else if ((last_to_pin
- first
) < prune_min
) {
2321 // between the first committed epoch and the last epoch we would prune,
2322 // we simply don't have enough versions over the minimum to prune maps.
2323 dout(10) << __func__
2324 << " could only prune " << (last_to_pin
- first
)
2325 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2326 " is less than the required minimum (" << prune_min
<< ")"
2330 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2331 dout(10) << __func__
2332 << " we have pruned as far as we can; do not prune."
2336 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2337 dout(10) << __func__
2338 << " not enough epochs to form an interval (last pinned: "
2339 << last_pinned
<< ", last to pin: "
2340 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2345 dout(15) << __func__
2346 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2347 << " lc (" << first
<< ".." << last
<< ")"
2352 void OSDMonitor::_prune_update_trimmed(
2353 MonitorDBStore::TransactionRef tx
,
2356 dout(10) << __func__
2357 << " first " << first
2358 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2359 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2362 osdmap_manifest_t manifest
= osdmap_manifest
;
2364 if (!manifest
.is_pinned(first
)) {
2365 manifest
.pin(first
);
2368 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2369 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2370 manifest
.pinned
.erase(p
, p_end
);
2371 ceph_assert(manifest
.get_first_pinned() == first
);
2373 if (manifest
.get_last_pinned() == first
+1 ||
2374 manifest
.pinned
.size() == 1) {
2375 // we reached the end of the line, as pinned maps go; clean up our
2376 // manifest, and let `should_prune()` decide whether we should prune
2378 tx
->erase(get_service_name(), "osdmap_manifest");
2383 manifest
.encode(bl
);
2384 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2387 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2389 dout(1) << __func__
<< dendl
;
2391 version_t pin_first
;
2393 // verify constrainsts on stable in-memory state
2394 if (!has_osdmap_manifest
) {
2395 // we must have never pruned, OR if we pruned the state must no longer
2396 // be relevant (i.e., the state must have been removed alongside with
2397 // the trim that *must* have removed past the last pinned map in a
2399 ceph_assert(osdmap_manifest
.pinned
.empty());
2400 ceph_assert(!mon
->store
->exists(get_service_name(), "osdmap_manifest"));
2401 pin_first
= get_first_committed();
2404 // we must have pruned in the past AND its state is still relevant
2405 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2406 // and thus we still hold a manifest in the store).
2407 ceph_assert(!osdmap_manifest
.pinned
.empty());
2408 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2409 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2411 dout(10) << __func__
2412 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2413 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2416 pin_first
= osdmap_manifest
.get_last_pinned();
2419 manifest
.pin(pin_first
);
2422 bool OSDMonitor::_prune_sanitize_options() const
2424 uint64_t prune_interval
=
2425 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2426 uint64_t prune_min
=
2427 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2429 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2433 if (prune_interval
== 0) {
2435 << " prune is enabled BUT prune interval is zero; abort."
2438 } else if (prune_interval
== 1) {
2440 << " prune interval is equal to one, which essentially means"
2441 " no pruning; abort."
2445 if (prune_min
== 0) {
2447 << " prune is enabled BUT prune min is zero; abort."
2451 if (prune_interval
> prune_min
) {
2453 << " impossible to ascertain proper prune interval because"
2454 << " it is greater than the minimum prune epochs"
2455 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2460 if (txsize
< prune_interval
- 1) {
2462 << "'mon_osdmap_full_prune_txsize' (" << txsize
2463 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2464 << "); abort." << dendl
;
2470 bool OSDMonitor::is_prune_enabled() const {
2471 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2474 bool OSDMonitor::is_prune_supported() const {
2475 return mon
->get_required_mon_features().contains_any(
2476 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2481 * @returns true if has side-effects; false otherwise.
2483 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2485 bool enabled
= is_prune_enabled();
2487 dout(1) << __func__
<< " osdmap full prune "
2488 << ( enabled
? "enabled" : "disabled")
2491 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2495 // we are beyond the minimum prune versions, we need to remove maps because
2496 // otherwise the store will grow unbounded and we may end up having issues
2497 // with available disk space or store hangs.
2499 // we will not pin all versions. We will leave a buffer number of versions.
2500 // this allows us the monitor to trim maps without caring too much about
2501 // pinned maps, and then allow us to use another ceph-mon without these
2502 // capabilities, without having to repair the store.
2504 osdmap_manifest_t manifest
= osdmap_manifest
;
2506 version_t first
= get_first_committed();
2507 version_t last
= get_last_committed();
2509 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2510 version_t last_pinned
= manifest
.get_last_pinned();
2511 uint64_t prune_interval
=
2512 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2514 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2516 prune_init(manifest
);
2518 // we need to get rid of some osdmaps
2521 << " lc (" << first
<< " .. " << last
<< ")"
2522 << " last_pinned " << last_pinned
2523 << " interval " << prune_interval
2524 << " last_to_pin " << last_to_pin
2527 // We will be erasing maps as we go.
2529 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2531 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2532 // we stop pruning. We could prune the maps between `next_to_pin` and
2533 // `last_to_pin`, but by not doing it we end up with neater pruned
2534 // intervals, aligned with `prune_interval`. Besides, this should not be a
2535 // problem as long as `prune_interval` is set to a sane value, instead of
2536 // hundreds or thousands of maps.
2538 auto map_exists
= [this](version_t v
) {
2539 string k
= mon
->store
->combine_strings("full", v
);
2540 return mon
->store
->exists(get_service_name(), k
);
2543 // 'interval' represents the number of maps from the last pinned
2544 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2545 // version 11 next; all intermediate versions will be removed.
2547 // 'txsize' represents the maximum number of versions we'll be removing in
2548 // this iteration. If 'txsize' is large enough to perform multiple passes
2549 // pinning and removing maps, we will do so; if not, we'll do at least one
2550 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2551 // ensure that we never go *over* the maximum.
2553 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2554 uint64_t removal_interval
= prune_interval
- 1;
2556 if (txsize
< removal_interval
) {
2558 << " setting txsize to removal interval size ("
2559 << removal_interval
<< " versions"
2561 txsize
= removal_interval
;
2563 ceph_assert(removal_interval
> 0);
2565 uint64_t num_pruned
= 0;
2566 while (num_pruned
+ removal_interval
<= txsize
) {
2567 last_pinned
= manifest
.get_last_pinned();
2569 if (last_pinned
+ prune_interval
> last_to_pin
) {
2572 ceph_assert(last_pinned
< last_to_pin
);
2574 version_t next_pinned
= last_pinned
+ prune_interval
;
2575 ceph_assert(next_pinned
<= last_to_pin
);
2576 manifest
.pin(next_pinned
);
2578 dout(20) << __func__
2579 << " last_pinned " << last_pinned
2580 << " next_pinned " << next_pinned
2581 << " num_pruned " << num_pruned
2582 << " removal interval (" << (last_pinned
+1)
2583 << ".." << (next_pinned
-1) << ")"
2584 << " txsize " << txsize
<< dendl
;
2586 ceph_assert(map_exists(last_pinned
));
2587 ceph_assert(map_exists(next_pinned
));
2589 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2590 ceph_assert(!manifest
.is_pinned(v
));
2592 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2593 string full_key
= mon
->store
->combine_strings("full", v
);
2594 tx
->erase(get_service_name(), full_key
);
2599 ceph_assert(num_pruned
> 0);
2602 manifest
.encode(bl
);
2603 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2611 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2613 op
->mark_osdmon_event(__func__
);
2614 Message
*m
= op
->get_req();
2615 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2617 switch (m
->get_type()) {
2619 case MSG_MON_COMMAND
:
2621 return preprocess_command(op
);
2622 } catch (const bad_cmd_get
& e
) {
2624 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2627 case CEPH_MSG_MON_GET_OSDMAP
:
2628 return preprocess_get_osdmap(op
);
2631 case MSG_OSD_MARK_ME_DOWN
:
2632 return preprocess_mark_me_down(op
);
2633 case MSG_OSD_MARK_ME_DEAD
:
2634 return preprocess_mark_me_dead(op
);
2636 return preprocess_full(op
);
2637 case MSG_OSD_FAILURE
:
2638 return preprocess_failure(op
);
2640 return preprocess_boot(op
);
2642 return preprocess_alive(op
);
2643 case MSG_OSD_PG_CREATED
:
2644 return preprocess_pg_created(op
);
2645 case MSG_OSD_PG_READY_TO_MERGE
:
2646 return preprocess_pg_ready_to_merge(op
);
2647 case MSG_OSD_PGTEMP
:
2648 return preprocess_pgtemp(op
);
2649 case MSG_OSD_BEACON
:
2650 return preprocess_beacon(op
);
2652 case CEPH_MSG_POOLOP
:
2653 return preprocess_pool_op(op
);
2655 case MSG_REMOVE_SNAPS
:
2656 return preprocess_remove_snaps(op
);
2658 case MSG_MON_GET_PURGED_SNAPS
:
2659 return preprocess_get_purged_snaps(op
);
2667 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2669 op
->mark_osdmon_event(__func__
);
2670 Message
*m
= op
->get_req();
2671 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2673 switch (m
->get_type()) {
2675 case MSG_OSD_MARK_ME_DOWN
:
2676 return prepare_mark_me_down(op
);
2677 case MSG_OSD_MARK_ME_DEAD
:
2678 return prepare_mark_me_dead(op
);
2680 return prepare_full(op
);
2681 case MSG_OSD_FAILURE
:
2682 return prepare_failure(op
);
2684 return prepare_boot(op
);
2686 return prepare_alive(op
);
2687 case MSG_OSD_PG_CREATED
:
2688 return prepare_pg_created(op
);
2689 case MSG_OSD_PGTEMP
:
2690 return prepare_pgtemp(op
);
2691 case MSG_OSD_PG_READY_TO_MERGE
:
2692 return prepare_pg_ready_to_merge(op
);
2693 case MSG_OSD_BEACON
:
2694 return prepare_beacon(op
);
2696 case MSG_MON_COMMAND
:
2698 return prepare_command(op
);
2699 } catch (const bad_cmd_get
& e
) {
2701 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2705 case CEPH_MSG_POOLOP
:
2706 return prepare_pool_op(op
);
2708 case MSG_REMOVE_SNAPS
:
2709 return prepare_remove_snaps(op
);
2719 bool OSDMonitor::should_propose(double& delay
)
2721 dout(10) << "should_propose" << dendl
;
2723 // if full map, propose immediately! any subsequent changes will be clobbered.
2724 if (pending_inc
.fullmap
.length())
2727 // adjust osd weights?
2728 if (!osd_weight
.empty() &&
2729 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2730 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2731 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2737 return PaxosService::should_propose(delay
);
2742 // ---------------------------
2745 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2747 op
->mark_osdmon_event(__func__
);
2748 auto m
= op
->get_req
<MMonGetOSDMap
>();
2750 uint64_t features
= mon
->get_quorum_con_features();
2751 if (op
->get_session() && op
->get_session()->con_features
)
2752 features
= op
->get_session()->con_features
;
2754 dout(10) << __func__
<< " " << *m
<< dendl
;
2755 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
, features
);
2756 epoch_t first
= get_first_committed();
2757 epoch_t last
= osdmap
.get_epoch();
2758 int max
= g_conf()->osd_map_message_max
;
2759 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2760 for (epoch_t e
= std::max(first
, m
->get_full_first());
2761 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2763 bufferlist
& bl
= reply
->maps
[e
];
2764 int r
= get_version_full(e
, features
, bl
);
2765 ceph_assert(r
>= 0);
2766 max_bytes
-= bl
.length();
2768 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2769 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2771 bufferlist
& bl
= reply
->incremental_maps
[e
];
2772 int r
= get_version(e
, features
, bl
);
2773 ceph_assert(r
>= 0);
2774 max_bytes
-= bl
.length();
2776 reply
->oldest_map
= first
;
2777 reply
->newest_map
= last
;
2778 mon
->send_reply(op
, reply
);
2783 // ---------------------------
2788 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2789 // check permissions
2790 MonSession
*session
= op
->get_session();
2793 if (!session
->is_capable("osd", MON_CAP_X
)) {
2794 dout(0) << "got MOSDFailure from entity with insufficient caps "
2795 << session
->caps
<< dendl
;
2798 if (fsid
!= mon
->monmap
->fsid
) {
2799 dout(0) << "check_source: on fsid " << fsid
2800 << " != " << mon
->monmap
->fsid
<< dendl
;
2807 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2809 op
->mark_osdmon_event(__func__
);
2810 auto m
= op
->get_req
<MOSDFailure
>();
2811 // who is target_osd
2812 int badboy
= m
->get_target_osd();
2814 // check permissions
2815 if (check_source(op
, m
->fsid
))
2818 // first, verify the reporting host is valid
2819 if (m
->get_orig_source().is_osd()) {
2820 int from
= m
->get_orig_source().num();
2821 if (!osdmap
.exists(from
) ||
2822 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2823 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2824 dout(5) << "preprocess_failure from dead osd." << from
2825 << ", ignoring" << dendl
;
2826 send_incremental(op
, m
->get_epoch()+1);
2833 if (osdmap
.is_down(badboy
)) {
2834 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2835 << " " << m
->get_target_addrs()
2836 << ", from " << m
->get_orig_source() << dendl
;
2837 if (m
->get_epoch() < osdmap
.get_epoch())
2838 send_incremental(op
, m
->get_epoch()+1);
2841 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2842 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2843 << " " << m
->get_target_addrs()
2844 << " != map's " << osdmap
.get_addrs(badboy
)
2845 << ", from " << m
->get_orig_source() << dendl
;
2846 if (m
->get_epoch() < osdmap
.get_epoch())
2847 send_incremental(op
, m
->get_epoch()+1);
2851 // already reported?
2852 if (osdmap
.is_down(badboy
) ||
2853 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2854 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2855 << " " << m
->get_target_addrs()
2856 << ", from " << m
->get_orig_source() << dendl
;
2857 if (m
->get_epoch() < osdmap
.get_epoch())
2858 send_incremental(op
, m
->get_epoch()+1);
2862 if (!can_mark_down(badboy
)) {
2863 dout(5) << "preprocess_failure ignoring report of osd."
2864 << m
->get_target_osd() << " " << m
->get_target_addrs()
2865 << " from " << m
->get_orig_source() << dendl
;
2869 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2870 << " " << m
->get_target_addrs()
2871 << ", from " << m
->get_orig_source() << dendl
;
2879 class C_AckMarkedDown
: public C_MonOp
{
2885 : C_MonOp(op
), osdmon(osdmon
) {}
2887 void _finish(int r
) override
{
2889 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2890 osdmon
->mon
->send_reply(
2897 false)); // ACK itself does not request an ack
2898 } else if (r
== -EAGAIN
) {
2899 osdmon
->dispatch(op
);
2901 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
2904 ~C_AckMarkedDown() override
{
2908 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
2910 op
->mark_osdmon_event(__func__
);
2911 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2912 int from
= m
->target_osd
;
2914 // check permissions
2915 if (check_source(op
, m
->fsid
))
2918 // first, verify the reporting host is valid
2919 if (!m
->get_orig_source().is_osd())
2922 if (!osdmap
.exists(from
) ||
2923 osdmap
.is_down(from
) ||
2924 osdmap
.get_addrs(from
) != m
->target_addrs
) {
2925 dout(5) << "preprocess_mark_me_down from dead osd."
2926 << from
<< ", ignoring" << dendl
;
2927 send_incremental(op
, m
->get_epoch()+1);
2931 // no down might be set
2932 if (!can_mark_down(from
))
2935 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
2936 << " " << m
->target_addrs
<< dendl
;
2940 if (m
->request_ack
) {
2941 Context
*c(new C_AckMarkedDown(this, op
));
2947 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
2949 op
->mark_osdmon_event(__func__
);
2950 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2951 int target_osd
= m
->target_osd
;
2953 ceph_assert(osdmap
.is_up(target_osd
));
2954 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
2956 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
2957 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2959 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
2963 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
2965 op
->mark_osdmon_event(__func__
);
2966 auto m
= op
->get_req
<MOSDMarkMeDead
>();
2967 int from
= m
->target_osd
;
2969 // check permissions
2970 if (check_source(op
, m
->fsid
)) {
2975 // first, verify the reporting host is valid
2976 if (!m
->get_orig_source().is_osd()) {
2981 if (!osdmap
.exists(from
) ||
2982 !osdmap
.is_down(from
)) {
2983 dout(5) << __func__
<< " from nonexistent or up osd." << from
2984 << ", ignoring" << dendl
;
2985 send_incremental(op
, m
->get_epoch()+1);
2993 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
2995 op
->mark_osdmon_event(__func__
);
2996 auto m
= op
->get_req
<MOSDMarkMeDead
>();
2997 int target_osd
= m
->target_osd
;
2999 ceph_assert(osdmap
.is_down(target_osd
));
3001 mon
->clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
3003 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3004 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3006 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3007 wait_for_finished_proposal(
3010 [op
, this] (int r
) {
3012 mon
->no_reply(op
); // ignore on success
3019 bool OSDMonitor::can_mark_down(int i
)
3021 if (osdmap
.is_nodown(i
)) {
3022 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3023 << "will not mark it down" << dendl
;
3027 int num_osds
= osdmap
.get_num_osds();
3028 if (num_osds
== 0) {
3029 dout(5) << __func__
<< " no osds" << dendl
;
3032 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3033 float up_ratio
= (float)up
/ (float)num_osds
;
3034 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3035 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3036 << g_conf()->mon_osd_min_up_ratio
3037 << ", will not mark osd." << i
<< " down" << dendl
;
3043 bool OSDMonitor::can_mark_up(int i
)
3045 if (osdmap
.is_noup(i
)) {
3046 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3047 << "will not mark it up" << dendl
;
3055 * @note the parameter @p i apparently only exists here so we can output the
3056 * osd's id on messages.
3058 bool OSDMonitor::can_mark_out(int i
)
3060 if (osdmap
.is_noout(i
)) {
3061 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3062 << "will not mark it out" << dendl
;
3066 int num_osds
= osdmap
.get_num_osds();
3067 if (num_osds
== 0) {
3068 dout(5) << __func__
<< " no osds" << dendl
;
3071 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3072 float in_ratio
= (float)in
/ (float)num_osds
;
3073 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3075 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3076 << g_conf()->mon_osd_min_in_ratio
3077 << ", will not mark osd." << i
<< " out" << dendl
;
3079 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3080 << g_conf()->mon_osd_min_in_ratio
3081 << ", will not mark osds out" << dendl
;
3088 bool OSDMonitor::can_mark_in(int i
)
3090 if (osdmap
.is_noin(i
)) {
3091 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3092 << "will not mark it in" << dendl
;
3099 bool OSDMonitor::check_failures(utime_t now
)
3101 bool found_failure
= false;
3102 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3103 p
!= failure_info
.end();
3105 if (can_mark_down(p
->first
)) {
3106 found_failure
|= check_failure(now
, p
->first
, p
->second
);
3109 return found_failure
;
3112 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3114 // already pending failure?
3115 if (pending_inc
.new_state
.count(target_osd
) &&
3116 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3117 dout(10) << " already pending failure" << dendl
;
3121 set
<string
> reporters_by_subtree
;
3122 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3123 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3124 utime_t max_failed_since
= fi
.get_failed_since();
3125 utime_t failed_for
= now
- max_failed_since
;
3127 utime_t grace
= orig_grace
;
3128 double my_grace
= 0, peer_grace
= 0;
3130 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3131 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3132 decay_k
= ::log(.5) / halflife
;
3134 // scale grace period based on historical probability of 'lagginess'
3135 // (false positive failures due to slowness).
3136 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3137 double decay
= exp((double)failed_for
* decay_k
);
3138 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3139 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3140 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3144 // consider the peers reporting a failure a proxy for a potential
3145 // 'subcluster' over the overall cluster that is similarly
3146 // laggy. this is clearly not true in all cases, but will sometimes
3147 // help us localize the grace correction to a subset of the system
3148 // (say, a rack with a bad switch) that is unhappy.
3149 ceph_assert(fi
.reporters
.size());
3150 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3151 // get the parent bucket whose type matches with "reporter_subtree_level".
3152 // fall back to OSD if the level doesn't exist.
3153 if (osdmap
.exists(p
->first
)) {
3154 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3155 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3156 iter
== reporter_loc
.end()) {
3157 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3159 reporters_by_subtree
.insert(iter
->second
);
3161 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3162 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
3163 utime_t elapsed
= now
- xi
.down_stamp
;
3164 double decay
= exp((double)elapsed
* decay_k
);
3165 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3169 fi
.cancel_report(p
->first
);;
3170 p
= fi
.reporters
.erase(p
);
3174 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3175 peer_grace
/= (double)fi
.reporters
.size();
3176 grace
+= peer_grace
;
3179 dout(10) << " osd." << target_osd
<< " has "
3180 << fi
.reporters
.size() << " reporters, "
3181 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3182 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
3185 if (failed_for
>= grace
&&
3186 reporters_by_subtree
.size() >= g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3187 dout(1) << " we have enough reporters to mark osd." << target_osd
3188 << " down" << dendl
;
3189 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3191 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
3192 << osdmap
.crush
->get_full_location_ordered_string(
3195 << (int)reporters_by_subtree
.size()
3196 << " reporters from different "
3197 << reporter_subtree_level
<< " after "
3198 << failed_for
<< " >= grace " << grace
<< ")";
3204 void OSDMonitor::force_failure(int target_osd
, int by
)
3206 // already pending failure?
3207 if (pending_inc
.new_state
.count(target_osd
) &&
3208 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3209 dout(10) << " already pending failure" << dendl
;
3213 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
3214 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3215 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3216 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3218 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3220 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
3221 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3222 << ") (connection refused reported by osd." << by
<< ")";
3226 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3228 op
->mark_osdmon_event(__func__
);
3229 auto m
= op
->get_req
<MOSDFailure
>();
3230 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3231 << " " << m
->get_target_addrs()
3232 << " from " << m
->get_orig_source()
3233 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3235 int target_osd
= m
->get_target_osd();
3236 int reporter
= m
->get_orig_source().num();
3237 ceph_assert(osdmap
.is_up(target_osd
));
3238 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3242 if (m
->if_osd_failed()) {
3243 // calculate failure time
3244 utime_t now
= ceph_clock_now();
3245 utime_t failed_since
=
3246 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3249 if (m
->is_immediate()) {
3250 mon
->clog
->debug() << "osd." << m
->get_target_osd()
3251 << " reported immediately failed by "
3252 << m
->get_orig_source();
3253 force_failure(target_osd
, reporter
);
3256 mon
->clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3257 << m
->get_orig_source();
3259 failure_info_t
& fi
= failure_info
[target_osd
];
3260 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
3262 mon
->no_reply(old_op
);
3265 return check_failure(now
, target_osd
, fi
);
3267 // remove the report
3268 mon
->clog
->debug() << "osd." << m
->get_target_osd()
3269 << " failure report canceled by "
3270 << m
->get_orig_source();
3271 if (failure_info
.count(target_osd
)) {
3272 failure_info_t
& fi
= failure_info
[target_osd
];
3273 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
3275 mon
->no_reply(report_op
);
3277 if (fi
.reporters
.empty()) {
3278 dout(10) << " removing last failure_info for osd." << target_osd
3280 failure_info
.erase(target_osd
);
3282 dout(10) << " failure_info for osd." << target_osd
<< " now "
3283 << fi
.reporters
.size() << " reporters" << dendl
;
3286 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
3293 void OSDMonitor::process_failures()
3295 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3296 while (p
!= failure_info
.end()) {
3297 if (osdmap
.is_up(p
->first
)) {
3300 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3301 list
<MonOpRequestRef
> ls
;
3302 p
->second
.take_report_messages(ls
);
3303 failure_info
.erase(p
++);
3305 while (!ls
.empty()) {
3306 MonOpRequestRef o
= ls
.front();
3308 o
->mark_event(__func__
);
3309 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3310 send_latest(o
, m
->get_epoch());
3319 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3321 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3323 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3324 p
!= failure_info
.end();
3326 p
->second
.take_report_messages(ls
);
3328 failure_info
.clear();
3331 int OSDMonitor::get_grace_interval_threshold()
3333 int halflife
= g_conf()->mon_osd_laggy_halflife
;
3334 // Scale the halflife period (default: 1_hr) by
3335 // a factor (48) to calculate the threshold.
3336 int grace_threshold_factor
= 48;
3337 return halflife
* grace_threshold_factor
;
3340 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval
)
3342 int grace_interval_threshold_secs
= get_grace_interval_threshold();
3343 if (last_failed_interval
> grace_interval_threshold_secs
) {
3344 dout(1) << " last_failed_interval " << last_failed_interval
3345 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3352 void OSDMonitor::set_default_laggy_params(int target_osd
)
3354 if (pending_inc
.new_xinfo
.count(target_osd
) == 0) {
3355 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3357 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[target_osd
];
3358 xi
.down_stamp
= pending_inc
.modified
;
3359 xi
.laggy_probability
= 0.0;
3360 xi
.laggy_interval
= 0;
3361 dout(20) << __func__
<< " reset laggy, now xi " << xi
<< dendl
;
3367 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3369 op
->mark_osdmon_event(__func__
);
3370 auto m
= op
->get_req
<MOSDBoot
>();
3371 int from
= m
->get_orig_source_inst().name
.num();
3373 // check permissions, ignore if failed (no response expected)
3374 MonSession
*session
= op
->get_session();
3377 if (!session
->is_capable("osd", MON_CAP_X
)) {
3378 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3379 << session
->caps
<< dendl
;
3383 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
3384 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3385 << " != " << mon
->monmap
->fsid
<< dendl
;
3389 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3390 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3394 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3396 // force all osds to have gone through luminous prior to upgrade to nautilus
3398 vector
<string
> missing
;
3399 if (!HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
3400 missing
.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
3402 if (!HAVE_FEATURE(m
->osd_features
, SERVER_JEWEL
)) {
3403 missing
.push_back("CEPH_FEATURE_SERVER_JEWEL");
3405 if (!HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
3406 missing
.push_back("CEPH_FEATURE_SERVER_KRAKEN");
3408 if (!HAVE_FEATURE(m
->osd_features
, OSD_RECOVERY_DELETES
)) {
3409 missing
.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
3412 if (!missing
.empty()) {
3413 using std::experimental::make_ostream_joiner
;
3416 copy(begin(missing
), end(missing
), make_ostream_joiner(ss
, ";"));
3418 mon
->clog
->info() << "disallowing boot of OSD "
3419 << m
->get_orig_source_inst()
3420 << " because the osd lacks " << ss
.str();
3425 // make sure osd versions do not span more than 3 releases
3426 if (HAVE_FEATURE(m
->osd_features
, SERVER_OCTOPUS
) &&
3427 osdmap
.require_osd_release
< ceph_release_t::mimic
) {
3428 mon
->clog
->info() << "disallowing boot of octopus+ OSD "
3429 << m
->get_orig_source_inst()
3430 << " because require_osd_release < mimic";
3434 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
3435 // we are reusing a jewel feature bit that was retired in luminous.
3436 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
3437 osdmap
.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT
) &&
3438 !(m
->osd_features
& CEPH_FEATURE_OSD_PGLOG_HARDLIMIT
)) {
3439 mon
->clog
->info() << "disallowing boot of OSD "
3440 << m
->get_orig_source_inst()
3441 << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
3446 if (osdmap
.is_up(from
) &&
3447 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3448 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3450 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3451 << " " << m
->get_orig_source_addrs()
3452 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3457 if (osdmap
.exists(from
) &&
3458 !osdmap
.get_uuid(from
).is_zero() &&
3459 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3460 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3461 << " clashes with existing osd: different fsid"
3462 << " (ours: " << osdmap
.get_uuid(from
)
3463 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3467 if (osdmap
.exists(from
) &&
3468 osdmap
.get_info(from
).up_from
> m
->version
&&
3469 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3470 m
->get_orig_source_addrs())) {
3471 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3472 send_latest(op
, m
->sb
.current_epoch
+1);
3477 if (!can_mark_up(from
)) {
3478 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3479 send_latest(op
, m
->sb
.current_epoch
+1);
3483 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
3490 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3492 op
->mark_osdmon_event(__func__
);
3493 auto m
= op
->get_req
<MOSDBoot
>();
3494 dout(7) << __func__
<< " from " << m
->get_source()
3496 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3497 << " cluster_addrs " << m
->cluster_addrs
3498 << " hb_back_addrs " << m
->hb_back_addrs
3499 << " hb_front_addrs " << m
->hb_front_addrs
3502 ceph_assert(m
->get_orig_source().is_osd());
3503 int from
= m
->get_orig_source().num();
3505 // does this osd exist?
3506 if (from
>= osdmap
.get_max_osd()) {
3507 dout(1) << "boot from osd." << from
<< " >= max_osd "
3508 << osdmap
.get_max_osd() << dendl
;
3512 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3513 if (pending_inc
.new_state
.count(from
))
3514 oldstate
^= pending_inc
.new_state
[from
];
3516 // already up? mark down first?
3517 if (osdmap
.is_up(from
)) {
3518 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3519 << osdmap
.get_addrs(from
) << dendl
;
3520 // preprocess should have caught these; if not, assert.
3521 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3522 m
->get_orig_source_addrs()) ||
3523 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3524 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3526 if (pending_inc
.new_state
.count(from
) == 0 ||
3527 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3528 // mark previous guy down
3529 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3531 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3532 } else if (pending_inc
.new_up_client
.count(from
)) {
3533 // already prepared, just wait
3534 dout(7) << __func__
<< " already prepared, waiting on "
3535 << m
->get_orig_source_addr() << dendl
;
3536 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3539 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3540 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3541 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3542 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3544 down_pending_out
.erase(from
); // if any
3547 osd_weight
[from
] = m
->sb
.weight
;
3550 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3552 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3553 // preprocess should have caught this; if not, assert.
3554 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3555 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3559 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3560 const osd_info_t
& i
= osdmap
.get_info(from
);
3561 if (i
.up_from
> i
.lost_at
) {
3562 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3563 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3568 bufferlist osd_metadata
;
3569 encode(m
->metadata
, osd_metadata
);
3570 pending_metadata
[from
] = osd_metadata
;
3571 pending_metadata_rm
.erase(from
);
3573 // adjust last clean unmount epoch?
3574 const osd_info_t
& info
= osdmap
.get_info(from
);
3575 dout(10) << " old osd_info: " << info
<< dendl
;
3576 if (m
->sb
.mounted
> info
.last_clean_begin
||
3577 (m
->sb
.mounted
== info
.last_clean_begin
&&
3578 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3579 epoch_t begin
= m
->sb
.mounted
;
3580 epoch_t end
= m
->sb
.clean_thru
;
3582 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3583 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3584 << ") -> [" << begin
<< "-" << end
<< ")"
3586 pending_inc
.new_last_clean_interval
[from
] =
3587 pair
<epoch_t
,epoch_t
>(begin
, end
);
3590 if (pending_inc
.new_xinfo
.count(from
) == 0)
3591 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3592 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
3593 if (m
->boot_epoch
== 0) {
3594 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3595 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3596 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3598 if (xi
.down_stamp
.sec()) {
3599 int interval
= ceph_clock_now().sec() -
3600 xi
.down_stamp
.sec();
3601 if (g_conf()->mon_osd_laggy_max_interval
&&
3602 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3603 interval
= g_conf()->mon_osd_laggy_max_interval
;
3606 interval
* g_conf()->mon_osd_laggy_weight
+
3607 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3609 xi
.laggy_probability
=
3610 g_conf()->mon_osd_laggy_weight
+
3611 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3612 dout(10) << " laggy, now xi " << xi
<< dendl
;
3615 // set features shared by the osd
3616 if (m
->osd_features
)
3617 xi
.features
= m
->osd_features
;
3619 xi
.features
= m
->get_connection()->get_features();
3622 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3623 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3624 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3625 (g_conf()->mon_osd_auto_mark_in
)) {
3626 if (can_mark_in(from
)) {
3627 if (xi
.old_weight
> 0) {
3628 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3631 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3634 dout(7) << __func__
<< " NOIN set, will not mark in "
3635 << m
->get_orig_source_addr() << dendl
;
3640 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3645 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3647 op
->mark_osdmon_event(__func__
);
3648 auto m
= op
->get_req
<MOSDBoot
>();
3649 dout(7) << "_booted " << m
->get_orig_source_inst()
3650 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3653 mon
->clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3657 send_latest(op
, m
->sb
.current_epoch
+1);
3664 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3666 op
->mark_osdmon_event(__func__
);
3667 auto m
= op
->get_req
<MOSDFull
>();
3668 int from
= m
->get_orig_source().num();
3670 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3672 // check permissions, ignore if failed
3673 MonSession
*session
= op
->get_session();
3676 if (!session
->is_capable("osd", MON_CAP_X
)) {
3677 dout(0) << "MOSDFull from entity with insufficient privileges:"
3678 << session
->caps
<< dendl
;
3682 // ignore a full message from the osd instance that already went down
3683 if (!osdmap
.exists(from
)) {
3684 dout(7) << __func__
<< " ignoring full message from nonexistent "
3685 << m
->get_orig_source_inst() << dendl
;
3688 if ((!osdmap
.is_up(from
) &&
3689 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3690 m
->get_orig_source_addrs())) ||
3691 (osdmap
.is_up(from
) &&
3692 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3693 dout(7) << __func__
<< " ignoring full message from down "
3694 << m
->get_orig_source_inst() << dendl
;
3698 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3700 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3701 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3702 << " " << m
->get_orig_source_inst() << dendl
;
3703 _reply_map(op
, m
->version
);
3707 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3708 << " " << m
->get_orig_source_inst() << dendl
;
3715 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3717 op
->mark_osdmon_event(__func__
);
3718 auto m
= op
->get_req
<MOSDFull
>();
3719 const int from
= m
->get_orig_source().num();
3721 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3722 const unsigned want_state
= m
->state
& mask
; // safety first
3724 unsigned cur_state
= osdmap
.get_state(from
);
3725 auto p
= pending_inc
.new_state
.find(from
);
3726 if (p
!= pending_inc
.new_state
.end()) {
3727 cur_state
^= p
->second
;
3731 set
<string
> want_state_set
, cur_state_set
;
3732 OSDMap::calc_state_set(want_state
, want_state_set
);
3733 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3735 if (cur_state
!= want_state
) {
3736 if (p
!= pending_inc
.new_state
.end()) {
3739 pending_inc
.new_state
[from
] = 0;
3741 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3742 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3743 << " -> " << want_state_set
<< dendl
;
3745 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3746 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3749 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3756 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3758 op
->mark_osdmon_event(__func__
);
3759 auto m
= op
->get_req
<MOSDAlive
>();
3760 int from
= m
->get_orig_source().num();
3762 // check permissions, ignore if failed
3763 MonSession
*session
= op
->get_session();
3766 if (!session
->is_capable("osd", MON_CAP_X
)) {
3767 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3768 << session
->caps
<< dendl
;
3772 if (!osdmap
.is_up(from
) ||
3773 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3774 dout(7) << "preprocess_alive ignoring alive message from down "
3775 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3780 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3782 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3783 _reply_map(op
, m
->version
);
3787 dout(10) << "preprocess_alive want up_thru " << m
->want
3788 << " from " << m
->get_orig_source_inst() << dendl
;
3795 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3797 op
->mark_osdmon_event(__func__
);
3798 auto m
= op
->get_req
<MOSDAlive
>();
3799 int from
= m
->get_orig_source().num();
3801 if (0) { // we probably don't care much about these
3802 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
3805 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3806 << " from " << m
->get_orig_source_inst() << dendl
;
3808 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3809 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3813 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3815 op
->mark_osdmon_event(__func__
);
3816 dout(7) << "_reply_map " << e
3817 << " from " << op
->get_req()->get_orig_source_inst()
3823 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3825 op
->mark_osdmon_event(__func__
);
3826 auto m
= op
->get_req
<MOSDPGCreated
>();
3827 dout(10) << __func__
<< " " << *m
<< dendl
;
3828 auto session
= op
->get_session();
3831 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3834 if (!session
->is_capable("osd", MON_CAP_X
)) {
3835 derr
<< __func__
<< " received from entity "
3836 << "with insufficient privileges " << session
->caps
<< dendl
;
3839 // always forward the "created!" to the leader
3843 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3845 op
->mark_osdmon_event(__func__
);
3846 auto m
= op
->get_req
<MOSDPGCreated
>();
3847 dout(10) << __func__
<< " " << *m
<< dendl
;
3848 auto src
= m
->get_orig_source();
3849 auto from
= src
.num();
3850 if (!src
.is_osd() ||
3851 !mon
->osdmon()->osdmap
.is_up(from
) ||
3852 !mon
->osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3853 m
->get_orig_source_addrs())) {
3854 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
3857 pending_created_pgs
.push_back(m
->pgid
);
3861 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3863 op
->mark_osdmon_event(__func__
);
3864 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3865 dout(10) << __func__
<< " " << *m
<< dendl
;
3866 const pg_pool_t
*pi
;
3867 auto session
= op
->get_session();
3869 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3872 if (!session
->is_capable("osd", MON_CAP_X
)) {
3873 derr
<< __func__
<< " received from entity "
3874 << "with insufficient privileges " << session
->caps
<< dendl
;
3877 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3879 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3882 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3883 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
3886 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
3887 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
3890 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
3891 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
3901 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
3903 op
->mark_osdmon_event(__func__
);
3904 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3905 dout(10) << __func__
<< " " << *m
<< dendl
;
3907 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
3908 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
3910 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
3911 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
3912 p
.get_pg_num_pending() > m
->pgid
.ps()) {
3913 dout(10) << __func__
3914 << " race with concurrent pg_num[_pending] update, will retry"
3916 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3921 p
.dec_pg_num(m
->pgid
,
3925 m
->last_epoch_started
,
3926 m
->last_epoch_clean
);
3927 p
.last_change
= pending_inc
.epoch
;
3929 // back off the merge attempt!
3930 p
.set_pg_num_pending(p
.get_pg_num());
3933 // force pre-nautilus clients to resend their ops, since they
3934 // don't understand pg_num_pending changes form a new interval
3935 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
3937 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
3939 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
3942 prob
> (double)(rand() % 1000)/1000.0) {
3943 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
3944 auto n
= new MMonCommand(mon
->monmap
->get_fsid());
3945 n
->set_connection(m
->get_connection());
3946 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
3947 osdmap
.get_pool_name(m
->pgid
.pool()) +
3948 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
3949 stringify(m
->pgid
.ps() + 1) + "\"}" };
3950 MonOpRequestRef nop
= mon
->op_tracker
.create_request
<MonOpRequest
>(n
);
3951 nop
->set_type_service();
3952 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
3954 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3963 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
3965 auto m
= op
->get_req
<MOSDPGTemp
>();
3966 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
3967 mempool::osdmap::vector
<int> empty
;
3968 int from
= m
->get_orig_source().num();
3969 size_t ignore_cnt
= 0;
3972 MonSession
*session
= op
->get_session();
3975 if (!session
->is_capable("osd", MON_CAP_X
)) {
3976 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3977 << session
->caps
<< dendl
;
3981 if (!osdmap
.is_up(from
) ||
3982 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3983 dout(7) << "ignoring pgtemp message from down "
3984 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3993 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
3994 dout(20) << " " << p
->first
3995 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
3996 << " -> " << p
->second
<< dendl
;
3998 // does the pool exist?
3999 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
4001 * 1. If the osdmap does not have the pool, it means the pool has been
4002 * removed in-between the osd sending this message and us handling it.
4003 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4004 * not exist in the pending either, as the osds would not send a
4005 * message about a pool they know nothing about (yet).
4006 * 3. However, if the pool does exist in the pending, then it must be a
4007 * new pool, and not relevant to this message (see 1).
4009 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4010 << ": pool has been removed" << dendl
;
4015 int acting_primary
= -1;
4016 osdmap
.pg_to_up_acting_osds(
4017 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
4018 if (acting_primary
!= from
) {
4019 /* If the source isn't the primary based on the current osdmap, we know
4020 * that the interval changed and that we can discard this message.
4021 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4022 * which of two pg temp mappings on the same pg is more recent.
4024 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4025 << ": primary has changed" << dendl
;
4031 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
4032 osdmap
.primary_temp
->count(p
->first
)))
4035 // NOTE: we assume that this will clear pg_primary, so consider
4036 // an existing pg_primary field to imply a change
4037 if (p
->second
.size() &&
4038 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
4039 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
4040 osdmap
.primary_temp
->count(p
->first
)))
4044 // should we ignore all the pgs?
4045 if (ignore_cnt
== m
->pg_temp
.size())
4048 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
4049 _reply_map(op
, m
->map_epoch
);
4057 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4059 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4060 auto ut
= pending_inc
.new_up_thru
.find(from
);
4061 if (ut
!= pending_inc
.new_up_thru
.end()) {
4062 old_up_thru
= ut
->second
;
4064 if (up_thru
> old_up_thru
) {
4065 // set up_thru too, so the osd doesn't have to ask again
4066 pending_inc
.new_up_thru
[from
] = up_thru
;
4070 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4072 op
->mark_osdmon_event(__func__
);
4073 auto m
= op
->get_req
<MOSDPGTemp
>();
4074 int from
= m
->get_orig_source().num();
4075 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4076 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4077 uint64_t pool
= p
->first
.pool();
4078 if (pending_inc
.old_pools
.count(pool
)) {
4079 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4080 << ": pool pending removal" << dendl
;
4083 if (!osdmap
.have_pg_pool(pool
)) {
4084 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4085 << ": pool has been removed" << dendl
;
4088 pending_inc
.new_pg_temp
[p
->first
] =
4089 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4091 // unconditionally clear pg_primary (until this message can encode
4092 // a change for that, too.. at which point we need to also fix
4093 // preprocess_pg_temp)
4094 if (osdmap
.primary_temp
->count(p
->first
) ||
4095 pending_inc
.new_primary_temp
.count(p
->first
))
4096 pending_inc
.new_primary_temp
[p
->first
] = -1;
4099 // set up_thru too, so the osd doesn't have to ask again
4100 update_up_thru(from
, m
->map_epoch
);
4102 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
4109 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4111 op
->mark_osdmon_event(__func__
);
4112 auto m
= op
->get_req
<MRemoveSnaps
>();
4113 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4115 // check privilege, ignore if failed
4116 MonSession
*session
= op
->get_session();
4120 if (!session
->caps
.is_capable(
4122 session
->entity_name
,
4123 "osd", "osd pool rmsnap", {}, true, true, false,
4124 session
->get_peer_socket_addr())) {
4125 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4126 << session
->caps
<< dendl
;
4130 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4131 q
!= m
->snaps
.end();
4133 if (!osdmap
.have_pg_pool(q
->first
)) {
4134 dout(10) << " ignoring removed_snaps " << q
->second
4135 << " on non-existent pool " << q
->first
<< dendl
;
4138 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4139 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4140 p
!= q
->second
.end();
4142 if (*p
> pi
->get_snap_seq() ||
4143 !_is_removed_snap(q
->first
, *p
)) {
4149 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4150 auto reply
= make_message
<MRemoveSnaps
>();
4151 reply
->snaps
= m
->snaps
;
4152 mon
->send_reply(op
, reply
.detach());
4159 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4161 op
->mark_osdmon_event(__func__
);
4162 auto m
= op
->get_req
<MRemoveSnaps
>();
4163 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4165 for (auto& [pool
, snaps
] : m
->snaps
) {
4166 if (!osdmap
.have_pg_pool(pool
)) {
4167 dout(10) << " ignoring removed_snaps " << snaps
4168 << " on non-existent pool " << pool
<< dendl
;
4172 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4173 for (auto s
: snaps
) {
4174 if (!_is_removed_snap(pool
, s
) &&
4175 (!pending_inc
.new_pools
.count(pool
) ||
4176 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4177 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4178 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4179 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4180 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4181 newpi
->removed_snaps
.insert(s
);
4182 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4183 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4185 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4186 if (s
> newpi
->get_snap_seq()) {
4187 dout(10) << " pool " << pool
<< " snap_seq "
4188 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4189 newpi
->set_snap_seq(s
);
4191 newpi
->set_snap_epoch(pending_inc
.epoch
);
4192 dout(10) << " added pool " << pool
<< " snap " << s
4193 << " to removed_snaps queue" << dendl
;
4194 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4199 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4200 auto reply
= make_message
<MRemoveSnaps
>();
4201 reply
->snaps
= m
->snaps
;
4202 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
4208 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4210 op
->mark_osdmon_event(__func__
);
4211 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4212 dout(7) << __func__
<< " " << *m
<< dendl
;
4214 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4216 string k
= make_purged_snap_epoch_key(m
->start
);
4217 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
4219 unsigned long epoch
= m
->last
;
4220 while (it
->valid()) {
4221 if (it
->key().find("purged_epoch_") != 0) {
4224 string k
= it
->key();
4225 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4227 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
4228 } else if (epoch
> m
->last
) {
4231 bufferlist bl
= it
->value();
4232 auto p
= bl
.cbegin();
4236 } catch (buffer::error
& e
) {
4237 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
4242 n
+= 4 + v
.size() * 16;
4245 // impose a semi-arbitrary limit to message size
4251 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4252 reply
->purged_snaps
.swap(r
);
4253 mon
->send_reply(op
, reply
.detach());
4259 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4261 op
->mark_osdmon_event(__func__
);
4263 auto session
= op
->get_session();
4266 dout(10) << __func__
<< " no monitor session!" << dendl
;
4269 if (!session
->is_capable("osd", MON_CAP_X
)) {
4270 derr
<< __func__
<< " received from entity "
4271 << "with insufficient privileges " << session
->caps
<< dendl
;
4274 // Always forward the beacon to the leader, even if they are the same as
4275 // the old one. The leader will mark as down osds that haven't sent
4276 // beacon for a few minutes.
4280 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4282 op
->mark_osdmon_event(__func__
);
4283 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4284 const auto src
= beacon
->get_orig_source();
4285 dout(10) << __func__
<< " " << *beacon
4286 << " from " << src
<< dendl
;
4287 int from
= src
.num();
4289 if (!src
.is_osd() ||
4290 !osdmap
.is_up(from
) ||
4291 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4292 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4293 // share some new maps with this guy in case it may not be
4294 // aware of its own deadness...
4295 send_latest(op
, beacon
->version
+1);
4297 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4301 last_osd_report
[from
] = ceph_clock_now();
4302 osd_epochs
[from
] = beacon
->version
;
4304 for (const auto& pg
: beacon
->pgs
) {
4305 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
4308 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4309 beacon
->last_purged_snaps_scrub
) {
4310 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4311 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4313 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4314 beacon
->last_purged_snaps_scrub
;
4324 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4326 op
->mark_osdmon_event(__func__
);
4327 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4328 << " start " << start
<< dendl
;
4332 send_incremental(op
, start
);
4336 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
4338 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
, features
);
4339 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4340 r
->oldest_map
= get_first_committed();
4341 r
->newest_map
= osdmap
.get_epoch();
4345 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4347 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4348 << std::hex
<< features
<< std::dec
<< dendl
;
4349 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
, features
);
4350 m
->oldest_map
= get_first_committed();
4351 m
->newest_map
= osdmap
.get_epoch();
4353 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4355 int err
= get_version(e
, features
, bl
);
4357 ceph_assert(bl
.length());
4358 // if (get_version(e, bl) > 0) {
4359 dout(20) << "build_incremental inc " << e
<< " "
4360 << bl
.length() << " bytes" << dendl
;
4361 m
->incremental_maps
[e
] = bl
;
4363 ceph_assert(err
== -ENOENT
);
4364 ceph_assert(!bl
.length());
4365 get_version_full(e
, features
, bl
);
4366 if (bl
.length() > 0) {
4367 //else if (get_version("full", e, bl) > 0) {
4368 dout(20) << "build_incremental full " << e
<< " "
4369 << bl
.length() << " bytes" << dendl
;
4372 ceph_abort(); // we should have all maps.
4379 void OSDMonitor::send_full(MonOpRequestRef op
)
4381 op
->mark_osdmon_event(__func__
);
4382 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
4383 mon
->send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4386 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4388 op
->mark_osdmon_event(__func__
);
4390 MonSession
*s
= op
->get_session();
4394 // oh, we can tell the other mon to do it
4395 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4397 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4398 r
->send_osdmap_first
= first
;
4399 s
->proxy_con
->send_message(r
);
4400 op
->mark_event("reply: send routed send_osdmap_first reply");
4403 send_incremental(first
, s
, false, op
);
4407 void OSDMonitor::send_incremental(epoch_t first
,
4408 MonSession
*session
,
4410 MonOpRequestRef req
)
4412 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4413 << " to " << session
->name
<< dendl
;
4415 // get feature of the peer
4416 // use quorum_con_features, if it's an anonymous connection.
4417 uint64_t features
= session
->con_features
? session
->con_features
:
4418 mon
->get_quorum_con_features();
4420 if (first
<= session
->osd_epoch
) {
4421 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4422 << session
->osd_epoch
<< dendl
;
4423 first
= session
->osd_epoch
+ 1;
4426 if (first
< get_first_committed()) {
4427 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4428 m
->oldest_map
= get_first_committed();
4429 m
->newest_map
= osdmap
.get_epoch();
4431 first
= get_first_committed();
4433 int err
= get_version_full(first
, features
, bl
);
4434 ceph_assert(err
== 0);
4435 ceph_assert(bl
.length());
4436 dout(20) << "send_incremental starting with base full "
4437 << first
<< " " << bl
.length() << " bytes" << dendl
;
4438 m
->maps
[first
] = bl
;
4441 mon
->send_reply(req
, m
);
4442 session
->osd_epoch
= first
;
4445 session
->con
->send_message(m
);
4446 session
->osd_epoch
= first
;
4451 while (first
<= osdmap
.get_epoch()) {
4452 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4453 osdmap
.get_epoch());
4454 MOSDMap
*m
= build_incremental(first
, last
, features
);
4457 // send some maps. it may not be all of them, but it will get them
4459 mon
->send_reply(req
, m
);
4461 session
->con
->send_message(m
);
4464 session
->osd_epoch
= last
;
4470 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4472 return get_version(ver
, mon
->get_quorum_con_features(), bl
);
4475 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4477 OSDMap::Incremental inc
;
4478 auto q
= bl
.cbegin();
4480 // always encode with subset of osdmap's canonical features
4481 uint64_t f
= features
& inc
.encode_features
;
4482 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4485 if (inc
.fullmap
.length()) {
4486 // embedded full map?
4488 m
.decode(inc
.fullmap
);
4489 inc
.fullmap
.clear();
4490 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4492 if (inc
.crush
.length()) {
4493 // embedded crush map
4495 auto p
= inc
.crush
.cbegin();
4498 c
.encode(inc
.crush
, f
);
4500 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4503 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4506 auto q
= bl
.cbegin();
4508 // always encode with subset of osdmap's canonical features
4509 uint64_t f
= features
& m
.get_encoding_features();
4510 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4513 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4516 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
4518 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4519 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4522 int ret
= PaxosService::get_version(ver
, bl
);
4526 // NOTE: this check is imprecise; the OSDMap encoding features may
4527 // be a subset of the latest mon quorum features, but worst case we
4528 // reencode once and then cache the (identical) result under both
4530 if (significant_features
!=
4531 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4532 reencode_incremental_map(bl
, features
);
4534 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4538 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4541 int err
= get_version(ver
, inc_bl
);
4542 ceph_assert(err
== 0);
4543 ceph_assert(inc_bl
.length());
4545 auto p
= inc_bl
.cbegin();
4547 dout(10) << __func__
<< " "
4548 << " epoch " << inc
.epoch
4549 << " inc_crc " << inc
.inc_crc
4550 << " full_crc " << inc
.full_crc
4551 << " encode_features " << inc
.encode_features
<< dendl
;
4555 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4557 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4559 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4560 if (closest_pinned
== 0) {
4563 if (closest_pinned
> ver
) {
4564 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4566 ceph_assert(closest_pinned
<= ver
);
4568 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4570 // get osdmap incremental maps and apply on top of this one.
4572 bool has_cached_osdmap
= false;
4573 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4574 if (full_osd_cache
.lookup({v
, mon
->get_quorum_con_features()},
4576 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4578 has_cached_osdmap
= true;
4583 if (!has_cached_osdmap
) {
4584 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4586 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4587 << " not available! error: " << cpp_strerror(err
) << dendl
;
4589 ceph_assert(err
== 0);
4592 ceph_assert(osdm_bl
.length());
4595 osdm
.decode(osdm_bl
);
4597 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4598 << " e" << osdm
.epoch
4599 << " crc " << osdm
.get_crc()
4600 << " -- applying incremental maps." << dendl
;
4602 uint64_t encode_features
= 0;
4603 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4604 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4606 OSDMap::Incremental inc
;
4607 int err
= get_inc(v
, inc
);
4608 ceph_assert(err
== 0);
4610 encode_features
= inc
.encode_features
;
4612 err
= osdm
.apply_incremental(inc
);
4613 ceph_assert(err
== 0);
4615 // this block performs paranoid checks on map retrieval
4616 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4617 inc
.full_crc
!= 0) {
4619 uint64_t f
= encode_features
;
4621 f
= (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
4624 // encode osdmap to force calculating crcs
4626 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4627 // decode osdmap to compare crcs with what's expected by incremental
4631 if (tosdm
.get_crc() != inc
.full_crc
) {
4633 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4634 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4635 ceph_abort_msg("osdmap crc mismatch");
4639 // note: we cannot add the recently computed map to the cache, as is,
4640 // because we have not encoded the map into a bl.
4643 if (!encode_features
) {
4644 dout(10) << __func__
4645 << " last incremental map didn't have features;"
4646 << " defaulting to quorum's or all" << dendl
;
4648 (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
4650 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
4655 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4657 return get_version_full(ver
, mon
->get_quorum_con_features(), bl
);
4660 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4663 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4664 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4667 int ret
= PaxosService::get_version_full(ver
, bl
);
4668 if (ret
== -ENOENT
) {
4670 ret
= get_full_from_pinned_map(ver
, bl
);
4675 // NOTE: this check is imprecise; the OSDMap encoding features may
4676 // be a subset of the latest mon quorum features, but worst case we
4677 // reencode once and then cache the (identical) result under both
4679 if (significant_features
!=
4680 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4681 reencode_full_map(bl
, features
);
4683 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4687 epoch_t
OSDMonitor::blacklist(const entity_addrvec_t
& av
, utime_t until
)
4689 dout(10) << "blacklist " << av
<< " until " << until
<< dendl
;
4690 for (auto a
: av
.v
) {
4691 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4692 a
.set_type(entity_addr_t::TYPE_ANY
);
4694 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4696 pending_inc
.new_blacklist
[a
] = until
;
4698 return pending_inc
.epoch
;
4701 epoch_t
OSDMonitor::blacklist(entity_addr_t a
, utime_t until
)
4703 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4704 a
.set_type(entity_addr_t::TYPE_ANY
);
4706 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4708 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
4709 pending_inc
.new_blacklist
[a
] = until
;
4710 return pending_inc
.epoch
;
4714 void OSDMonitor::check_osdmap_subs()
4716 dout(10) << __func__
<< dendl
;
4717 if (!osdmap
.get_epoch()) {
4720 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
4721 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
4724 auto p
= osdmap_subs
->second
->begin();
4728 check_osdmap_sub(sub
);
4732 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4734 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4735 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4736 if (sub
->next
<= osdmap
.get_epoch()) {
4738 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4740 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4742 mon
->session_map
.remove_sub(sub
);
4744 sub
->next
= osdmap
.get_epoch() + 1;
4748 void OSDMonitor::check_pg_creates_subs()
4750 if (!osdmap
.get_num_up_osds()) {
4753 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4754 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
4755 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4756 if (pg_creates_subs
== session_map
.subs
.end()) {
4759 for (auto sub
: *pg_creates_subs
->second
) {
4760 check_pg_creates_sub(sub
);
4765 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4767 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4768 ceph_assert(sub
->type
== "osd_pg_creates");
4769 // only send these if the OSD is up. we will check_subs() when they do
4770 // come up so they will get the creates then.
4771 if (sub
->session
->name
.is_osd() &&
4772 mon
->osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4773 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4774 sub
->session
->con
.get(),
4779 void OSDMonitor::do_application_enable(int64_t pool_id
,
4780 const std::string
&app_name
,
4781 const std::string
&app_key
,
4782 const std::string
&app_value
,
4785 ceph_assert(paxos
->is_plugged() && is_writeable());
4787 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4790 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4792 auto pp
= osdmap
.get_pg_pool(pool_id
);
4793 ceph_assert(pp
!= nullptr);
4796 if (pending_inc
.new_pools
.count(pool_id
)) {
4797 p
= pending_inc
.new_pools
[pool_id
];
4800 if (app_key
.empty()) {
4801 p
.application_metadata
.insert({app_name
, {}});
4804 p
.application_metadata
[app_name
][app_key
] = app_value
;
4806 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4809 p
.last_change
= pending_inc
.epoch
;
4810 pending_inc
.new_pools
[pool_id
] = p
;
4813 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4814 pool_opts_t::key_t opt
,
4815 pool_opts_t::value_t val
)
4817 auto p
= pending_inc
.new_pools
.try_emplace(
4818 pool_id
, *osdmap
.get_pg_pool(pool_id
));
4819 p
.first
->second
.opts
.set(opt
, val
);
4822 unsigned OSDMonitor::scan_for_creating_pgs(
4823 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4824 const mempool::osdmap::set
<int64_t>& removed_pools
,
4826 creating_pgs_t
* creating_pgs
) const
4828 unsigned queued
= 0;
4829 for (auto& p
: pools
) {
4830 int64_t poolid
= p
.first
;
4831 if (creating_pgs
->created_pools
.count(poolid
)) {
4832 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4835 const pg_pool_t
& pool
= p
.second
;
4836 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
4837 pool
.get_type(), pool
.get_size());
4838 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4841 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4842 const auto created
= pool
.get_last_change();
4843 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4844 dout(10) << __func__
<< " no change in pool " << poolid
4845 << " " << pool
<< dendl
;
4848 if (removed_pools
.count(poolid
)) {
4849 dout(10) << __func__
<< " pool is being removed: " << poolid
4850 << " " << pool
<< dendl
;
4853 dout(10) << __func__
<< " queueing pool create for " << poolid
4854 << " " << pool
<< dendl
;
4855 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
4862 void OSDMonitor::update_creating_pgs()
4864 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4865 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4866 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
4867 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4868 for (const auto& pg
: creating_pgs
.pgs
) {
4869 int acting_primary
= -1;
4870 auto pgid
= pg
.first
;
4871 if (!osdmap
.pg_exists(pgid
)) {
4872 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4876 auto mapped
= pg
.second
.create_epoch
;
4877 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4879 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4880 // check the previous creating_pgs, look for the target to whom the pg was
4881 // previously mapped
4882 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
4883 const auto last_acting_primary
= pgs_by_epoch
.first
;
4884 for (auto& pgs
: pgs_by_epoch
.second
) {
4885 if (pgs
.second
.count(spgid
)) {
4886 if (last_acting_primary
== acting_primary
) {
4889 dout(20) << __func__
<< " " << pgid
<< " "
4890 << " acting_primary:" << last_acting_primary
4891 << " -> " << acting_primary
<< dendl
;
4892 // note epoch if the target of the create message changed.
4893 mapped
= mapping
.get_epoch();
4898 mapped
= mapping
.get_epoch();
4902 dout(10) << __func__
<< " will instruct osd." << acting_primary
4903 << " to create " << pgid
<< "@" << mapped
<< dendl
;
4904 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
4906 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
4907 creating_pgs_epoch
= mapping
.get_epoch();
4910 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
4912 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
4913 << " " << creating_pgs_by_osd_epoch
<< dendl
;
4914 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4915 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
4916 dout(20) << __func__
4917 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
4918 // the subscribers will be updated when the mapping is completed anyway
4921 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
4922 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
4924 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
4926 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
4927 MOSDPGCreate2
*m
= nullptr;
4929 bool old
= osdmap
.require_osd_release
< ceph_release_t::nautilus
;
4932 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
4933 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
4934 auto epoch
= epoch_pgs
->first
;
4935 auto& pgs
= epoch_pgs
->second
;
4936 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4937 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
4939 for (auto& pg
: pgs
) {
4940 // Need the create time from the monitor using its clock to set
4941 // last_scrub_stamp upon pg creation.
4942 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
4943 ceph_assert(create
!= creating_pgs
.pgs
.end());
4946 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
4948 oldm
->mkpg
.emplace(pg
.pgid
,
4949 pg_create_t
{create
->second
.create_epoch
, pg
.pgid
, 0});
4950 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.create_stamp
);
4953 m
= new MOSDPGCreate2(creating_pgs_epoch
);
4955 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
4956 create
->second
.create_stamp
));
4957 if (create
->second
.history
.epoch_created
) {
4958 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
4959 << " " << create
->second
.past_intervals
<< dendl
;
4960 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
4961 create
->second
.past_intervals
));
4964 dout(20) << __func__
<< " will create " << pg
4965 << " at " << create
->second
.create_epoch
<< dendl
;
4969 con
->send_message(m
);
4971 con
->send_message(oldm
);
4973 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4974 << " has nothing to send" << dendl
;
4978 // sub is current through last + 1
4985 void OSDMonitor::tick()
4987 if (!is_active()) return;
4989 dout(10) << osdmap
<< dendl
;
4991 // always update osdmap manifest, regardless of being the leader.
4992 load_osdmap_manifest();
4994 // always tune priority cache manager memory on leader and peons
4995 if (ceph_using_tcmalloc() && mon_memory_autotune
) {
4996 std::lock_guard
l(balancer_lock
);
4997 if (pcm
!= nullptr) {
5000 _set_new_cache_sizes();
5001 dout(10) << "tick balancer "
5002 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
5003 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
5004 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
5005 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
5007 dout(10) << "tick balancer "
5008 << " full cache_bytes: " << full_cache
->get_cache_bytes()
5009 << " full comtd_bytes: " << full_cache
->get_committed_size()
5010 << " full used_bytes: " << full_cache
->_get_used_bytes()
5011 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
5016 if (!mon
->is_leader()) return;
5018 bool do_propose
= false;
5019 utime_t now
= ceph_clock_now();
5021 if (handle_osd_timeouts(now
, last_osd_report
)) {
5026 if (check_failures(now
)) {
5030 // Force a proposal if we need to prune; pruning is performed on
5031 // ``encode_pending()``, hence why we need to regularly trigger a proposal
5032 // even if there's nothing going on.
5033 if (is_prune_enabled() && should_prune()) {
5037 // mark down osds out?
5039 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
5040 * influence at all. The decision is made based on the ratio of "in" osds,
5041 * and the function returns false if this ratio is lower that the minimum
5042 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
5044 if (can_mark_out(-1)) {
5045 string down_out_subtree_limit
= g_conf().get_val
<string
>(
5046 "mon_osd_down_out_subtree_limit");
5047 set
<int> down_cache
; // quick cache of down subtrees
5049 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
5050 while (i
!= down_pending_out
.end()) {
5056 if (osdmap
.is_down(o
) &&
5059 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
5060 utime_t grace
= orig_grace
;
5061 double my_grace
= 0.0;
5063 if (g_conf()->mon_osd_adjust_down_out_interval
) {
5064 // scale grace period the same way we do the heartbeat grace.
5065 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
5066 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
5067 double decay_k
= ::log(.5) / halflife
;
5068 double decay
= exp((double)down
* decay_k
);
5069 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
5070 << " down for " << down
<< " decay " << decay
<< dendl
;
5071 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
5075 // is this an entire large subtree down?
5076 if (down_out_subtree_limit
.length()) {
5077 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
5079 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
5080 dout(10) << "tick entire containing " << down_out_subtree_limit
5081 << " subtree for osd." << o
5082 << " is down; resetting timer" << dendl
;
5083 // reset timer, too.
5084 down_pending_out
[o
] = now
;
5090 bool down_out
= !osdmap
.is_destroyed(o
) &&
5091 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5092 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5093 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5094 // this is not precise enough as we did not make a note when this osd
5095 // was marked as destroyed, but let's not bother with that
5096 // complexity for now.
5097 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5098 if (down_out
|| destroyed_out
) {
5099 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5100 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5101 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5103 // set the AUTOOUT bit.
5104 if (pending_inc
.new_state
.count(o
) == 0)
5105 pending_inc
.new_state
[o
] = 0;
5106 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5108 // remember previous weight
5109 if (pending_inc
.new_xinfo
.count(o
) == 0)
5110 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5111 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5115 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
5116 << int(down
.sec()) << " seconds)";
5121 down_pending_out
.erase(o
);
5124 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5127 // expire blacklisted items?
5128 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5129 p
!= osdmap
.blacklist
.end();
5131 if (p
->second
< now
) {
5132 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5133 pending_inc
.old_blacklist
.push_back(p
->first
);
5138 if (try_prune_purged_snaps()) {
5142 if (update_pools_status())
5146 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
5150 void OSDMonitor::_set_new_cache_sizes()
5152 uint64_t cache_size
= 0;
5153 int64_t inc_alloc
= 0;
5154 int64_t full_alloc
= 0;
5155 int64_t kv_alloc
= 0;
5157 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5158 cache_size
= pcm
->get_tuned_mem();
5159 inc_alloc
= inc_cache
->get_committed_size();
5160 full_alloc
= full_cache
->get_committed_size();
5161 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
5164 inc_osd_cache
.set_bytes(inc_alloc
);
5165 full_osd_cache
.set_bytes(full_alloc
);
5167 dout(1) << __func__
<< " cache_size:" << cache_size
5168 << " inc_alloc: " << inc_alloc
5169 << " full_alloc: " << full_alloc
5170 << " kv_alloc: " << kv_alloc
5174 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5175 std::map
<int,utime_t
> &last_osd_report
)
5177 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
5178 if (now
- mon
->get_leader_since() < timeo
) {
5179 // We haven't been the leader for long enough to consider OSD timeouts
5183 int max_osd
= osdmap
.get_max_osd();
5184 bool new_down
= false;
5186 for (int i
=0; i
< max_osd
; ++i
) {
5187 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5188 if (!osdmap
.exists(i
)) {
5189 last_osd_report
.erase(i
); // if any
5192 if (!osdmap
.is_up(i
))
5194 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
5195 if (t
== last_osd_report
.end()) {
5196 // it wasn't in the map; start the timer.
5197 last_osd_report
[i
] = now
;
5198 } else if (can_mark_down(i
)) {
5199 utime_t diff
= now
- t
->second
;
5201 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
5202 << diff
<< " seconds";
5203 derr
<< "no beacon from osd." << i
<< " since " << t
->second
5204 << ", " << diff
<< " seconds ago. marking down" << dendl
;
5205 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
5213 static void dump_cpu_list(Formatter
*f
, const char *name
,
5214 const string
& strlist
)
5217 size_t cpu_set_size
;
5218 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
5221 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5222 f
->open_array_section(name
);
5223 for (auto cpu
: cpus
) {
5224 f
->dump_int("cpu", cpu
);
5229 void OSDMonitor::dump_info(Formatter
*f
)
5231 f
->open_object_section("osdmap");
5235 f
->open_array_section("osd_metadata");
5236 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5237 if (osdmap
.exists(i
)) {
5238 f
->open_object_section("osd");
5239 f
->dump_unsigned("id", i
);
5240 dump_osd_metadata(i
, f
, NULL
);
5246 f
->open_object_section("osdmap_clean_epochs");
5247 f
->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5249 f
->open_object_section("last_epoch_clean");
5250 last_epoch_clean
.dump(f
);
5253 f
->open_array_section("osd_epochs");
5254 for (auto& osd_epoch
: osd_epochs
) {
5255 f
->open_object_section("osd");
5256 f
->dump_unsigned("id", osd_epoch
.first
);
5257 f
->dump_unsigned("epoch", osd_epoch
.second
);
5260 f
->close_section(); // osd_epochs
5262 f
->close_section(); // osd_clean_epochs
5264 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5265 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5267 f
->open_object_section("crushmap");
5268 osdmap
.crush
->dump(f
);
5271 if (has_osdmap_manifest
) {
5272 f
->open_object_section("osdmap_manifest");
5273 osdmap_manifest
.dump(f
);
// Pool properties understood by the "osd pool get" command; used to
// validate and dispatch the requested key.
// NOTE(review): the mangled paste dropped the first enumerator line;
// `SIZE, MIN_SIZE,` is reconstructed from upstream — confirm, since the
// enum's order must match the command's dispatch table.
enum osd_pool_get_choices {
  SIZE, MIN_SIZE,
  PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
  NODELETE, NOPGCHANGE, NOSIZECHANGE,
  WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
  HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
  USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
  CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
  CACHE_TARGET_FULL_RATIO,
  CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
  ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
  MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
  HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
  SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
  RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
  COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
  COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
  CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
  PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
  PG_AUTOSCALE_BIAS };
5300 std::set
<osd_pool_get_choices
>
5301 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5302 const std::set
<osd_pool_get_choices
>& second
)
5304 std::set
<osd_pool_get_choices
> result
;
5305 std::set_difference(first
.begin(), first
.end(),
5306 second
.begin(), second
.end(),
5307 std::inserter(result
, result
.end()));
5313 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5315 op
->mark_osdmon_event(__func__
);
5316 auto m
= op
->get_req
<MMonCommand
>();
5319 stringstream ss
, ds
;
5322 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5323 string rs
= ss
.str();
5324 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
5328 MonSession
*session
= op
->get_session();
5330 derr
<< __func__
<< " no session" << dendl
;
5331 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
5336 cmd_getval(cmdmap
, "prefix", prefix
);
5339 cmd_getval(cmdmap
, "format", format
, string("plain"));
5340 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5342 if (prefix
== "osd stat") {
5344 f
->open_object_section("osdmap");
5345 osdmap
.print_summary(f
.get(), ds
, "", true);
5349 osdmap
.print_summary(nullptr, ds
, "", true);
5353 else if (prefix
== "osd dump" ||
5354 prefix
== "osd tree" ||
5355 prefix
== "osd tree-from" ||
5356 prefix
== "osd ls" ||
5357 prefix
== "osd getmap" ||
5358 prefix
== "osd getcrushmap" ||
5359 prefix
== "osd ls-tree" ||
5360 prefix
== "osd info") {
5365 cmd_getval(cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
5368 bufferlist osdmap_bl
;
5369 int err
= get_version_full(epoch
, osdmap_bl
);
5370 if (err
== -ENOENT
) {
5372 ss
<< "there is no map for epoch " << epoch
;
5375 ceph_assert(err
== 0);
5376 ceph_assert(osdmap_bl
.length());
5379 if (epoch
== osdmap
.get_epoch()) {
5383 p
->decode(osdmap_bl
);
5386 auto sg
= make_scope_guard([&] {
5392 if (prefix
== "osd dump") {
5395 f
->open_object_section("osdmap");
5405 } else if (prefix
== "osd ls") {
5407 f
->open_array_section("osds");
5408 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5409 if (osdmap
.exists(i
)) {
5410 f
->dump_int("osd", i
);
5417 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5418 if (osdmap
.exists(i
)) {
5427 } else if (prefix
== "osd info") {
5429 bool do_single_osd
= true;
5430 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5431 do_single_osd
= false;
5434 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5435 ss
<< "osd." << osd_id
<< " does not exist";
5441 if (do_single_osd
) {
5442 osdmap
.dump_osd(osd_id
, f
.get());
5444 osdmap
.dump_osds(f
.get());
5448 if (do_single_osd
) {
5449 osdmap
.print_osd(osd_id
, ds
);
5451 osdmap
.print_osds(ds
);
5455 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5457 if (prefix
== "osd tree-from") {
5458 cmd_getval(cmdmap
, "bucket", bucket
);
5459 if (!osdmap
.crush
->name_exists(bucket
)) {
5460 ss
<< "bucket '" << bucket
<< "' does not exist";
5464 int id
= osdmap
.crush
->get_item_id(bucket
);
5466 ss
<< "\"" << bucket
<< "\" is not a bucket";
5472 vector
<string
> states
;
5473 cmd_getval(cmdmap
, "states", states
);
5474 unsigned filter
= 0;
5475 for (auto& s
: states
) {
5477 filter
|= OSDMap::DUMP_UP
;
5478 } else if (s
== "down") {
5479 filter
|= OSDMap::DUMP_DOWN
;
5480 } else if (s
== "in") {
5481 filter
|= OSDMap::DUMP_IN
;
5482 } else if (s
== "out") {
5483 filter
|= OSDMap::DUMP_OUT
;
5484 } else if (s
== "destroyed") {
5485 filter
|= OSDMap::DUMP_DESTROYED
;
5487 ss
<< "unrecognized state '" << s
<< "'";
5492 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5493 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5494 ss
<< "cannot specify both 'in' and 'out'";
5498 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5499 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5500 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5501 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5502 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5503 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5504 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5509 f
->open_object_section("tree");
5510 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5514 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5517 } else if (prefix
== "osd getmap") {
5518 rdata
.append(osdmap_bl
);
5519 ss
<< "got osdmap epoch " << p
->get_epoch();
5520 } else if (prefix
== "osd getcrushmap") {
5521 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
5522 ss
<< p
->get_crush_version();
5523 } else if (prefix
== "osd ls-tree") {
5525 cmd_getval(cmdmap
, "name", bucket_name
);
5527 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5529 ss
<< "\"" << bucket_name
<< "\" does not exist";
5532 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5537 f
->open_array_section("osds");
5538 for (auto &i
: osds
) {
5539 if (osdmap
.exists(i
)) {
5540 f
->dump_int("osd", i
);
5547 for (auto &i
: osds
) {
5548 if (osdmap
.exists(i
)) {
5559 } else if (prefix
== "osd getmaxosd") {
5561 f
->open_object_section("getmaxosd");
5562 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5563 f
->dump_int("max_osd", osdmap
.get_max_osd());
5567 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5570 } else if (prefix
== "osd utilization") {
5572 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5579 } else if (prefix
== "osd find") {
5581 if (!cmd_getval(cmdmap
, "id", osd
)) {
5582 ss
<< "unable to parse osd id value '"
5583 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5587 if (!osdmap
.exists(osd
)) {
5588 ss
<< "osd." << osd
<< " does not exist";
5593 cmd_getval(cmdmap
, "format", format
);
5594 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5595 f
->open_object_section("osd_location");
5596 f
->dump_int("osd", osd
);
5597 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5598 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5600 // try to identify host, pod/container name, etc.
5601 map
<string
,string
> m
;
5602 load_metadata(osd
, m
, nullptr);
5603 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5604 f
->dump_string("host", p
->second
);
5607 "pod_name", "pod_namespace", // set by rook
5608 "container_name" // set by cephadm, ceph-ansible
5610 if (auto p
= m
.find(k
); p
!= m
.end()) {
5611 f
->dump_string(k
, p
->second
);
5615 // crush is helpful too
5616 f
->open_object_section("crush_location");
5617 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5618 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5619 f
->dump_string(p
->first
.c_str(), p
->second
);
5623 } else if (prefix
== "osd metadata") {
5625 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5626 !cmd_getval(cmdmap
, "id", osd
)) {
5627 ss
<< "unable to parse osd id value '"
5628 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5632 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5633 ss
<< "osd." << osd
<< " does not exist";
5638 cmd_getval(cmdmap
, "format", format
);
5639 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5641 f
->open_object_section("osd_metadata");
5642 f
->dump_unsigned("id", osd
);
5643 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5649 f
->open_array_section("osd_metadata");
5650 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5651 if (osdmap
.exists(i
)) {
5652 f
->open_object_section("osd");
5653 f
->dump_unsigned("id", i
);
5654 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5655 if (r
== -EINVAL
|| r
== -ENOENT
) {
5656 // Drop error, continue to get other daemons' metadata
5657 dout(4) << "No metadata for osd." << i
<< dendl
;
5669 } else if (prefix
== "osd versions") {
5671 f
.reset(Formatter::create("json-pretty"));
5672 count_metadata("ceph_version", f
.get());
5675 } else if (prefix
== "osd count-metadata") {
5677 f
.reset(Formatter::create("json-pretty"));
5679 cmd_getval(cmdmap
, "property", field
);
5680 count_metadata(field
, f
.get());
5683 } else if (prefix
== "osd numa-status") {
5686 f
->open_array_section("osds");
5688 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5689 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5690 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5691 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5692 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5693 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5695 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5696 if (osdmap
.exists(i
)) {
5697 map
<string
,string
> m
;
5699 if (load_metadata(i
, m
, &err
) < 0) {
5703 auto p
= m
.find("hostname");
5708 f
->open_object_section("osd");
5709 f
->dump_int("osd", i
);
5710 f
->dump_string("host", host
);
5711 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5715 f
->dump_int(n
, atoi(p
->second
.c_str()));
5718 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5721 list
<string
> ls
= get_str_list(p
->second
, ",");
5722 f
->open_array_section(n
);
5723 for (auto node
: ls
) {
5724 f
->dump_int("node", atoi(node
.c_str()));
5729 for (auto n
: { "numa_node_cpus" }) {
5732 dump_cpu_list(f
.get(), n
, p
->second
);
5739 p
= m
.find("network_numa_nodes");
5745 p
= m
.find("objectstore_numa_nodes");
5751 p
= m
.find("numa_node");
5752 auto q
= m
.find("numa_node_cpus");
5753 if (p
!= m
.end() && q
!= m
.end()) {
5760 tbl
<< TextTable::endrow
;
5768 rdata
.append(stringify(tbl
));
5770 } else if (prefix
== "osd map") {
5771 string poolstr
, objstr
, namespacestr
;
5772 cmd_getval(cmdmap
, "pool", poolstr
);
5773 cmd_getval(cmdmap
, "object", objstr
);
5774 cmd_getval(cmdmap
, "nspace", namespacestr
);
5776 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5778 ss
<< "pool " << poolstr
<< " does not exist";
5782 object_locator_t
oloc(pool
, namespacestr
);
5783 object_t
oid(objstr
);
5784 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5785 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5786 vector
<int> up
, acting
;
5788 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5791 if (!namespacestr
.empty())
5792 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5794 fullobjname
= oid
.name
;
5796 f
->open_object_section("osd_map");
5797 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5798 f
->dump_string("pool", poolstr
);
5799 f
->dump_int("pool_id", pool
);
5800 f
->dump_stream("objname") << fullobjname
;
5801 f
->dump_stream("raw_pgid") << pgid
;
5802 f
->dump_stream("pgid") << mpgid
;
5803 f
->open_array_section("up");
5804 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5805 f
->dump_int("osd", *p
);
5807 f
->dump_int("up_primary", up_p
);
5808 f
->open_array_section("acting");
5809 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5810 f
->dump_int("osd", *p
);
5812 f
->dump_int("acting_primary", acting_p
);
5813 f
->close_section(); // osd_map
5816 ds
<< "osdmap e" << osdmap
.get_epoch()
5817 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5818 << " object '" << fullobjname
<< "' ->"
5819 << " pg " << pgid
<< " (" << mpgid
<< ")"
5820 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5821 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5825 } else if (prefix
== "pg map") {
5828 cmd_getval(cmdmap
, "pgid", pgidstr
);
5829 if (!pgid
.parse(pgidstr
.c_str())) {
5830 ss
<< "invalid pgid '" << pgidstr
<< "'";
5834 vector
<int> up
, acting
;
5835 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5836 ss
<< "pg '" << pgidstr
<< "' does not exist";
5840 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5841 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5843 f
->open_object_section("pg_map");
5844 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5845 f
->dump_stream("raw_pgid") << pgid
;
5846 f
->dump_stream("pgid") << mpgid
;
5847 f
->open_array_section("up");
5848 for (auto osd
: up
) {
5849 f
->dump_int("up_osd", osd
);
5852 f
->open_array_section("acting");
5853 for (auto osd
: acting
) {
5854 f
->dump_int("acting_osd", osd
);
5860 ds
<< "osdmap e" << osdmap
.get_epoch()
5861 << " pg " << pgid
<< " (" << mpgid
<< ")"
5862 << " -> up " << up
<< " acting " << acting
;
5867 } else if (prefix
== "osd lspools") {
5869 f
->open_array_section("pools");
5870 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
5871 p
!= osdmap
.pools
.end();
5874 f
->open_object_section("pool");
5875 f
->dump_int("poolnum", p
->first
);
5876 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
5879 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
5880 if (next(p
) != osdmap
.pools
.end()) {
5890 } else if (prefix
== "osd blacklist ls") {
5892 f
->open_array_section("blacklist");
5894 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5895 p
!= osdmap
.blacklist
.end();
5898 f
->open_object_section("entry");
5899 f
->dump_string("addr", p
->first
.get_legacy_str());
5900 f
->dump_stream("until") << p
->second
;
5905 ss
<< p
->first
<< " " << p
->second
;
5915 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
5917 } else if (prefix
== "osd pool ls") {
5919 cmd_getval(cmdmap
, "detail", detail
);
5920 if (!f
&& detail
== "detail") {
5922 osdmap
.print_pools(ss
);
5923 rdata
.append(ss
.str());
5926 f
->open_array_section("pools");
5927 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
5928 it
!= osdmap
.get_pools().end();
5931 if (detail
== "detail") {
5932 f
->open_object_section("pool");
5933 f
->dump_int("pool_id", it
->first
);
5934 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5935 it
->second
.dump(f
.get());
5938 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5941 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
5950 } else if (prefix
== "osd crush get-tunable") {
5952 cmd_getval(cmdmap
, "tunable", tunable
);
5955 f
->open_object_section("tunable");
5956 if (tunable
== "straw_calc_version") {
5958 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
5960 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
5969 rdata
.append(rss
.str());
5973 } else if (prefix
== "osd pool get") {
5975 cmd_getval(cmdmap
, "pool", poolstr
);
5976 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5978 ss
<< "unrecognized pool '" << poolstr
<< "'";
5983 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
5985 cmd_getval(cmdmap
, "var", var
);
5987 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
5988 const choices_map_t ALL_CHOICES
= {
5990 {"min_size", MIN_SIZE
},
5991 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
5992 {"crush_rule", CRUSH_RULE
}, {"hashpspool", HASHPSPOOL
},
5993 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
5994 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
5995 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
5996 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
5997 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
5998 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
5999 {"use_gmt_hitset", USE_GMT_HITSET
},
6000 {"target_max_objects", TARGET_MAX_OBJECTS
},
6001 {"target_max_bytes", TARGET_MAX_BYTES
},
6002 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
6003 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
6004 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
6005 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
6006 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
6007 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
6008 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
6009 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
6010 {"fast_read", FAST_READ
},
6011 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
6012 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
6013 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
6014 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
6015 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
6016 {"recovery_priority", RECOVERY_PRIORITY
},
6017 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
6018 {"scrub_priority", SCRUB_PRIORITY
},
6019 {"compression_mode", COMPRESSION_MODE
},
6020 {"compression_algorithm", COMPRESSION_ALGORITHM
},
6021 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
6022 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
6023 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
6024 {"csum_type", CSUM_TYPE
},
6025 {"csum_max_block", CSUM_MAX_BLOCK
},
6026 {"csum_min_block", CSUM_MIN_BLOCK
},
6027 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
6028 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
6029 {"pg_num_min", PG_NUM_MIN
},
6030 {"target_size_bytes", TARGET_SIZE_BYTES
},
6031 {"target_size_ratio", TARGET_SIZE_RATIO
},
6032 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
6035 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
6037 const choices_set_t ONLY_TIER_CHOICES
= {
6038 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
6039 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
6040 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
6041 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
6042 MIN_READ_RECENCY_FOR_PROMOTE
,
6043 MIN_WRITE_RECENCY_FOR_PROMOTE
,
6044 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
6046 const choices_set_t ONLY_ERASURE_CHOICES
= {
6047 EC_OVERWRITES
, ERASURE_CODE_PROFILE
6050 choices_set_t selected_choices
;
6052 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
6053 it
!= ALL_CHOICES
.end(); ++it
) {
6054 selected_choices
.insert(it
->second
);
6058 selected_choices
= subtract_second_from_first(selected_choices
,
6062 if(!p
->is_erasure()) {
6063 selected_choices
= subtract_second_from_first(selected_choices
,
6064 ONLY_ERASURE_CHOICES
);
6066 } else /* var != "all" */ {
6067 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
6068 osd_pool_get_choices selected
= found
->second
;
6070 if (!p
->is_tier() &&
6071 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
6072 ss
<< "pool '" << poolstr
6073 << "' is not a tier pool: variable not applicable";
6078 if (!p
->is_erasure() &&
6079 ONLY_ERASURE_CHOICES
.find(selected
)
6080 != ONLY_ERASURE_CHOICES
.end()) {
6081 ss
<< "pool '" << poolstr
6082 << "' is not a erasure pool: variable not applicable";
6087 if (pool_opts_t::is_opt_name(var
) &&
6088 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6089 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6094 selected_choices
.insert(selected
);
6098 f
->open_object_section("pool");
6099 f
->dump_string("pool", poolstr
);
6100 f
->dump_int("pool_id", pool
);
6101 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6102 it
!= selected_choices
.end(); ++it
) {
6103 choices_map_t::const_iterator i
;
6104 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6105 if (i
->second
== *it
) {
6109 ceph_assert(i
!= ALL_CHOICES
.end());
6112 f
->dump_int("pg_num", p
->get_pg_num());
6115 f
->dump_int("pgp_num", p
->get_pgp_num());
6118 f
->dump_int("size", p
->get_size());
6121 f
->dump_int("min_size", p
->get_min_size());
6124 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6125 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6126 p
->get_crush_rule()));
6128 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6132 f
->dump_bool("allow_ec_overwrites",
6133 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6135 case PG_AUTOSCALE_MODE
:
6136 f
->dump_string("pg_autoscale_mode",
6137 pg_pool_t::get_pg_autoscale_mode_name(
6138 p
->pg_autoscale_mode
));
6144 case WRITE_FADVISE_DONTNEED
:
6147 f
->dump_bool(i
->first
.c_str(),
6148 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6150 case HIT_SET_PERIOD
:
6151 f
->dump_int("hit_set_period", p
->hit_set_period
);
6154 f
->dump_int("hit_set_count", p
->hit_set_count
);
6157 f
->dump_string("hit_set_type",
6158 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6162 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6163 BloomHitSet::Params
*bloomp
=
6164 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6165 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6166 } else if(var
!= "all") {
6168 ss
<< "hit set is not of type Bloom; " <<
6169 "invalid to get a false positive rate!";
6175 case USE_GMT_HITSET
:
6176 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6178 case TARGET_MAX_OBJECTS
:
6179 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6181 case TARGET_MAX_BYTES
:
6182 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6184 case CACHE_TARGET_DIRTY_RATIO
:
6185 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6186 p
->cache_target_dirty_ratio_micro
);
6187 f
->dump_float("cache_target_dirty_ratio",
6188 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6190 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6191 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6192 p
->cache_target_dirty_high_ratio_micro
);
6193 f
->dump_float("cache_target_dirty_high_ratio",
6194 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6196 case CACHE_TARGET_FULL_RATIO
:
6197 f
->dump_unsigned("cache_target_full_ratio_micro",
6198 p
->cache_target_full_ratio_micro
);
6199 f
->dump_float("cache_target_full_ratio",
6200 ((float)p
->cache_target_full_ratio_micro
/1000000));
6202 case CACHE_MIN_FLUSH_AGE
:
6203 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6205 case CACHE_MIN_EVICT_AGE
:
6206 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6208 case ERASURE_CODE_PROFILE
:
6209 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6211 case MIN_READ_RECENCY_FOR_PROMOTE
:
6212 f
->dump_int("min_read_recency_for_promote",
6213 p
->min_read_recency_for_promote
);
6215 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6216 f
->dump_int("min_write_recency_for_promote",
6217 p
->min_write_recency_for_promote
);
6220 f
->dump_int("fast_read", p
->fast_read
);
6222 case HIT_SET_GRADE_DECAY_RATE
:
6223 f
->dump_int("hit_set_grade_decay_rate",
6224 p
->hit_set_grade_decay_rate
);
6226 case HIT_SET_SEARCH_LAST_N
:
6227 f
->dump_int("hit_set_search_last_n",
6228 p
->hit_set_search_last_n
);
6230 case SCRUB_MIN_INTERVAL
:
6231 case SCRUB_MAX_INTERVAL
:
6232 case DEEP_SCRUB_INTERVAL
:
6233 case RECOVERY_PRIORITY
:
6234 case RECOVERY_OP_PRIORITY
:
6235 case SCRUB_PRIORITY
:
6236 case COMPRESSION_MODE
:
6237 case COMPRESSION_ALGORITHM
:
6238 case COMPRESSION_REQUIRED_RATIO
:
6239 case COMPRESSION_MAX_BLOB_SIZE
:
6240 case COMPRESSION_MIN_BLOB_SIZE
:
6242 case CSUM_MAX_BLOCK
:
6243 case CSUM_MIN_BLOCK
:
6244 case FINGERPRINT_ALGORITHM
:
6246 case TARGET_SIZE_BYTES
:
6247 case TARGET_SIZE_RATIO
:
6248 case PG_AUTOSCALE_BIAS
:
6249 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6250 if (p
->opts
.is_set(key
)) {
6251 if(*it
== CSUM_TYPE
) {
6253 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6254 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6256 p
->opts
.dump(i
->first
, f
.get());
6265 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6266 it
!= selected_choices
.end(); ++it
) {
6267 choices_map_t::const_iterator i
;
6270 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6273 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6276 ss
<< "size: " << p
->get_size() << "\n";
6279 ss
<< "min_size: " << p
->get_min_size() << "\n";
6282 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6283 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6284 p
->get_crush_rule()) << "\n";
6286 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6289 case PG_AUTOSCALE_MODE
:
6290 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6291 p
->pg_autoscale_mode
) <<"\n";
6293 case HIT_SET_PERIOD
:
6294 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6297 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6300 ss
<< "hit_set_type: " <<
6301 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6305 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6306 BloomHitSet::Params
*bloomp
=
6307 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6308 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6309 } else if(var
!= "all") {
6310 ss
<< "hit set is not of type Bloom; " <<
6311 "invalid to get a false positive rate!";
6317 case USE_GMT_HITSET
:
6318 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6320 case TARGET_MAX_OBJECTS
:
6321 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6323 case TARGET_MAX_BYTES
:
6324 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6326 case CACHE_TARGET_DIRTY_RATIO
:
6327 ss
<< "cache_target_dirty_ratio: "
6328 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6330 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6331 ss
<< "cache_target_dirty_high_ratio: "
6332 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6334 case CACHE_TARGET_FULL_RATIO
:
6335 ss
<< "cache_target_full_ratio: "
6336 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6338 case CACHE_MIN_FLUSH_AGE
:
6339 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6341 case CACHE_MIN_EVICT_AGE
:
6342 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6344 case ERASURE_CODE_PROFILE
:
6345 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6347 case MIN_READ_RECENCY_FOR_PROMOTE
:
6348 ss
<< "min_read_recency_for_promote: " <<
6349 p
->min_read_recency_for_promote
<< "\n";
6351 case HIT_SET_GRADE_DECAY_RATE
:
6352 ss
<< "hit_set_grade_decay_rate: " <<
6353 p
->hit_set_grade_decay_rate
<< "\n";
6355 case HIT_SET_SEARCH_LAST_N
:
6356 ss
<< "hit_set_search_last_n: " <<
6357 p
->hit_set_search_last_n
<< "\n";
6360 ss
<< "allow_ec_overwrites: " <<
6361 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6368 case WRITE_FADVISE_DONTNEED
:
6371 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6372 if (i
->second
== *it
)
6375 ceph_assert(i
!= ALL_CHOICES
.end());
6376 ss
<< i
->first
<< ": " <<
6377 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6378 "true" : "false") << "\n";
6380 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6381 ss
<< "min_write_recency_for_promote: " <<
6382 p
->min_write_recency_for_promote
<< "\n";
6385 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6387 case SCRUB_MIN_INTERVAL
:
6388 case SCRUB_MAX_INTERVAL
:
6389 case DEEP_SCRUB_INTERVAL
:
6390 case RECOVERY_PRIORITY
:
6391 case RECOVERY_OP_PRIORITY
:
6392 case SCRUB_PRIORITY
:
6393 case COMPRESSION_MODE
:
6394 case COMPRESSION_ALGORITHM
:
6395 case COMPRESSION_REQUIRED_RATIO
:
6396 case COMPRESSION_MAX_BLOB_SIZE
:
6397 case COMPRESSION_MIN_BLOB_SIZE
:
6399 case CSUM_MAX_BLOCK
:
6400 case CSUM_MIN_BLOCK
:
6401 case FINGERPRINT_ALGORITHM
:
6403 case TARGET_SIZE_BYTES
:
6404 case TARGET_SIZE_RATIO
:
6405 case PG_AUTOSCALE_BIAS
:
6406 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6407 if (i
->second
== *it
)
6410 ceph_assert(i
!= ALL_CHOICES
.end());
6412 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6413 if (p
->opts
.is_set(key
)) {
6414 if(key
== pool_opts_t::CSUM_TYPE
) {
6416 p
->opts
.get(key
, &val
);
6417 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6419 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6425 rdata
.append(ss
.str());
6430 } else if (prefix
== "osd pool get-quota") {
6432 cmd_getval(cmdmap
, "pool", pool_name
);
6434 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6436 ceph_assert(poolid
== -ENOENT
);
6437 ss
<< "unrecognized pool '" << pool_name
<< "'";
6441 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6442 const pool_stat_t
* pstat
= mon
->mgrstatmon()->get_pool_stat(poolid
);
6443 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6445 f
->open_object_section("pool_quotas");
6446 f
->dump_string("pool_name", pool_name
);
6447 f
->dump_unsigned("pool_id", poolid
);
6448 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6449 f
->dump_int("current_num_objects", sum
.num_objects
);
6450 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6451 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6456 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6457 << " max objects: ";
6458 if (p
->quota_max_objects
== 0)
6461 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6462 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6466 if (p
->quota_max_bytes
== 0)
6469 rs
<< byte_u_t(p
->quota_max_bytes
);
6470 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6472 rdata
.append(rs
.str());
6476 } else if (prefix
== "osd crush rule list" ||
6477 prefix
== "osd crush rule ls") {
6479 f
->open_array_section("rules");
6480 osdmap
.crush
->list_rules(f
.get());
6485 osdmap
.crush
->list_rules(&ss
);
6486 rdata
.append(ss
.str());
6488 } else if (prefix
== "osd crush rule ls-by-class") {
6490 cmd_getval(cmdmap
, "class", class_name
);
6491 if (class_name
.empty()) {
6492 ss
<< "no class specified";
6497 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6499 ss
<< "failed to get rules by class '" << class_name
<< "'";
6503 f
->open_array_section("rules");
6504 for (auto &rule
: rules
) {
6505 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6511 for (auto &rule
: rules
) {
6512 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6514 rdata
.append(rs
.str());
6516 } else if (prefix
== "osd crush rule dump") {
6518 cmd_getval(cmdmap
, "name", name
);
6520 cmd_getval(cmdmap
, "format", format
);
6521 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6523 f
->open_array_section("rules");
6524 osdmap
.crush
->dump_rules(f
.get());
6527 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6529 ss
<< "unknown crush rule '" << name
<< "'";
6533 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6538 rdata
.append(rs
.str());
6539 } else if (prefix
== "osd crush dump") {
6541 cmd_getval(cmdmap
, "format", format
);
6542 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6543 f
->open_object_section("crush_map");
6544 osdmap
.crush
->dump(f
.get());
6549 rdata
.append(rs
.str());
6550 } else if (prefix
== "osd crush show-tunables") {
6552 cmd_getval(cmdmap
, "format", format
);
6553 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6554 f
->open_object_section("crush_map_tunables");
6555 osdmap
.crush
->dump_tunables(f
.get());
6560 rdata
.append(rs
.str());
6561 } else if (prefix
== "osd crush tree") {
6563 cmd_getval(cmdmap
, "shadow", shadow
);
6564 bool show_shadow
= shadow
== "--show-shadow";
6565 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6567 f
->open_object_section("crush_tree");
6568 osdmap
.crush
->dump_tree(nullptr,
6570 osdmap
.get_pool_names(),
6576 osdmap
.crush
->dump_tree(&ss
,
6578 osdmap
.get_pool_names(),
6580 rdata
.append(ss
.str());
6582 } else if (prefix
== "osd crush ls") {
6584 if (!cmd_getval(cmdmap
, "node", name
)) {
6585 ss
<< "no node specified";
6589 if (!osdmap
.crush
->name_exists(name
)) {
6590 ss
<< "node '" << name
<< "' does not exist";
6594 int id
= osdmap
.crush
->get_item_id(name
);
6597 result
.push_back(id
);
6599 int num
= osdmap
.crush
->get_bucket_size(id
);
6600 for (int i
= 0; i
< num
; ++i
) {
6601 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6605 f
->open_array_section("items");
6606 for (auto i
: result
) {
6607 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6613 for (auto i
: result
) {
6614 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6616 rdata
.append(ss
.str());
6619 } else if (prefix
== "osd crush class ls") {
6620 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6621 f
->open_array_section("crush_classes");
6622 for (auto i
: osdmap
.crush
->class_name
)
6623 f
->dump_string("class", i
.second
);
6626 } else if (prefix
== "osd crush class ls-osd") {
6628 cmd_getval(cmdmap
, "class", name
);
6630 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6632 f
->open_array_section("osds");
6633 for (auto &osd
: osds
)
6634 f
->dump_int("osd", osd
);
6639 for (auto &osd
: osds
) {
6647 } else if (prefix
== "osd crush get-device-class") {
6648 vector
<string
> idvec
;
6649 cmd_getval(cmdmap
, "ids", idvec
);
6650 map
<int, string
> class_by_osd
;
6651 for (auto& id
: idvec
) {
6653 long osd
= parse_osd_id(id
.c_str(), &ts
);
6655 ss
<< "unable to parse osd id:'" << id
<< "'";
6659 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6661 class_by_osd
[osd
] = device_class
;
6663 class_by_osd
[osd
] = ""; // no class
6666 f
->open_array_section("osd_device_classes");
6667 for (auto& i
: class_by_osd
) {
6668 f
->open_object_section("osd_device_class");
6669 f
->dump_int("osd", i
.first
);
6670 f
->dump_string("device_class", i
.second
);
6676 if (class_by_osd
.size() == 1) {
6677 // for single input, make a clean output
6678 ds
<< class_by_osd
.begin()->second
;
6680 // note that we do not group osds by class here
6681 for (auto it
= class_by_osd
.begin();
6682 it
!= class_by_osd
.end();
6684 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6685 if (next(it
) != class_by_osd
.end())
6691 } else if (prefix
== "osd erasure-code-profile ls") {
6692 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6694 f
->open_array_section("erasure-code-profiles");
6695 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6697 f
->dump_string("profile", i
->first
.c_str());
6699 rdata
.append(i
->first
+ "\n");
6706 rdata
.append(rs
.str());
6708 } else if (prefix
== "osd crush weight-set ls") {
6709 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6711 f
->open_array_section("weight_sets");
6712 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6713 f
->dump_string("pool", "(compat)");
6715 for (auto& i
: osdmap
.crush
->choose_args
) {
6717 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6724 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6727 for (auto& i
: osdmap
.crush
->choose_args
) {
6729 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6732 rdata
.append(rs
.str());
6734 } else if (prefix
== "osd crush weight-set dump") {
6735 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6737 osdmap
.crush
->dump_choose_args(f
.get());
6739 } else if (prefix
== "osd erasure-code-profile get") {
6741 cmd_getval(cmdmap
, "name", name
);
6742 if (!osdmap
.has_erasure_code_profile(name
)) {
6743 ss
<< "unknown erasure code profile '" << name
<< "'";
6747 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6749 f
->open_object_section("profile");
6750 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6754 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6756 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6763 rdata
.append(rs
.str());
6765 } else if (prefix
== "osd pool application get") {
6766 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6769 cmd_getval(cmdmap
, "pool", pool_name
);
6771 cmd_getval(cmdmap
, "app", app
);
6773 cmd_getval(cmdmap
, "key", key
);
6775 if (pool_name
.empty()) {
6777 f
->open_object_section("pools");
6778 for (const auto &pool
: osdmap
.pools
) {
6779 std::string
name("<unknown>");
6780 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6781 if (pni
!= osdmap
.pool_name
.end())
6783 f
->open_object_section(name
.c_str());
6784 for (auto &app_pair
: pool
.second
.application_metadata
) {
6785 f
->open_object_section(app_pair
.first
.c_str());
6786 for (auto &kv_pair
: app_pair
.second
) {
6787 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6791 f
->close_section(); // name
6793 f
->close_section(); // pools
6796 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6798 ss
<< "unrecognized pool '" << pool_name
<< "'";
6802 auto p
= osdmap
.get_pg_pool(pool
);
6805 f
->open_object_section(pool_name
.c_str());
6806 for (auto &app_pair
: p
->application_metadata
) {
6807 f
->open_object_section(app_pair
.first
.c_str());
6808 for (auto &kv_pair
: app_pair
.second
) {
6809 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6811 f
->close_section(); // application
6813 f
->close_section(); // pool_name
6818 auto app_it
= p
->application_metadata
.find(app
);
6819 if (app_it
== p
->application_metadata
.end()) {
6820 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
6824 // filter by pool + app
6826 f
->open_object_section(app_it
->first
.c_str());
6827 for (auto &kv_pair
: app_it
->second
) {
6828 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6830 f
->close_section(); // application
6834 // filter by pool + app + key
6835 auto key_it
= app_it
->second
.find(key
);
6836 if (key_it
== app_it
->second
.end()) {
6837 ss
<< "application '" << app
<< "' on pool '" << pool_name
6838 << "' does not have key '" << key
<< "'";
6842 ss
<< key_it
->second
<< "\n";
6843 rdata
.append(ss
.str());
6846 } else if (prefix
== "osd get-require-min-compat-client") {
6847 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
6848 rdata
.append(ss
.str());
6851 } else if (prefix
== "osd pool application enable" ||
6852 prefix
== "osd pool application disable" ||
6853 prefix
== "osd pool application set" ||
6854 prefix
== "osd pool application rm") {
6855 bool changed
= false;
6856 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
6860 } else if (changed
) {
6861 // Valid mutation, proceed to prepare phase
6864 // Idempotent case, reply
6868 // try prepare update
6875 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
6879 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
6881 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6882 osdmap
.get_pg_pool(pool_id
));
6884 pool
->set_flag(flags
);
6887 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
6889 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6890 osdmap
.get_pg_pool(pool_id
));
6892 pool
->unset_flag(flags
);
6895 string
OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch
)
6898 snprintf(k
, sizeof(k
), "purged_epoch_%08lx", (unsigned long)epoch
);
6902 string
OSDMonitor::make_purged_snap_key(int64_t pool
, snapid_t snap
)
6905 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
6906 (unsigned long long)pool
, (unsigned long long)snap
);
6910 string
OSDMonitor::make_purged_snap_key_value(
6911 int64_t pool
, snapid_t snap
, snapid_t num
,
6912 epoch_t epoch
, bufferlist
*v
)
6914 // encode the *last* epoch in the key so that we can use forward
6915 // iteration only to search for an epoch in an interval.
6917 encode(snap
+ num
, *v
);
6919 return make_purged_snap_key(pool
, snap
+ num
- 1);
6923 int OSDMonitor::lookup_purged_snap(
6924 int64_t pool
, snapid_t snap
,
6925 snapid_t
*begin
, snapid_t
*end
)
6927 string k
= make_purged_snap_key(pool
, snap
);
6928 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
6931 dout(20) << __func__
6932 << " pool " << pool
<< " snap " << snap
6933 << " - key '" << k
<< "' not found" << dendl
;
6936 if (it
->key().find("purged_snap_") != 0) {
6937 dout(20) << __func__
6938 << " pool " << pool
<< " snap " << snap
6939 << " - key '" << k
<< "' got '" << it
->key()
6940 << "', wrong prefix" << dendl
;
6943 string gotk
= it
->key();
6944 const char *format
= "purged_snap_%llu_";
6945 long long int keypool
;
6946 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
6948 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
6951 if (pool
!= keypool
) {
6952 dout(20) << __func__
6953 << " pool " << pool
<< " snap " << snap
6954 << " - key '" << k
<< "' got '" << gotk
6955 << "', wrong pool " << keypool
6959 bufferlist v
= it
->value();
6960 auto p
= v
.cbegin();
6963 if (snap
< *begin
|| snap
>= *end
) {
6964 dout(20) << __func__
6965 << " pool " << pool
<< " snap " << snap
6966 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
6973 void OSDMonitor::insert_purged_snap_update(
6975 snapid_t start
, snapid_t end
,
6977 MonitorDBStore::TransactionRef t
)
6979 snapid_t before_begin
, before_end
;
6980 snapid_t after_begin
, after_end
;
6981 int b
= lookup_purged_snap(pool
, start
- 1,
6982 &before_begin
, &before_end
);
6983 int a
= lookup_purged_snap(pool
, end
,
6984 &after_begin
, &after_end
);
6986 dout(10) << __func__
6987 << " [" << start
<< "," << end
<< ") - joins ["
6988 << before_begin
<< "," << before_end
<< ") and ["
6989 << after_begin
<< "," << after_end
<< ")" << dendl
;
6990 // erase only the begin record; we'll overwrite the end one.
6991 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
6993 string k
= make_purged_snap_key_value(pool
,
6994 before_begin
, after_end
- before_begin
,
6995 pending_inc
.epoch
, &v
);
6996 t
->put(OSD_SNAP_PREFIX
, k
, v
);
6998 dout(10) << __func__
6999 << " [" << start
<< "," << end
<< ") - join with earlier ["
7000 << before_begin
<< "," << before_end
<< ")" << dendl
;
7001 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7003 string k
= make_purged_snap_key_value(pool
,
7004 before_begin
, end
- before_begin
,
7005 pending_inc
.epoch
, &v
);
7006 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7008 dout(10) << __func__
7009 << " [" << start
<< "," << end
<< ") - join with later ["
7010 << after_begin
<< "," << after_end
<< ")" << dendl
;
7011 // overwrite after record
7013 string k
= make_purged_snap_key_value(pool
,
7014 start
, after_end
- start
,
7015 pending_inc
.epoch
, &v
);
7016 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7018 dout(10) << __func__
7019 << " [" << start
<< "," << end
<< ") - new"
7022 string k
= make_purged_snap_key_value(pool
,
7024 pending_inc
.epoch
, &v
);
7025 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7029 bool OSDMonitor::try_prune_purged_snaps()
7031 if (!mon
->mgrstatmon()->is_readable()) {
7034 if (!pending_inc
.new_purged_snaps
.empty()) {
7035 return false; // we already pruned for this epoch
7038 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
7039 "mon_max_snap_prune_per_epoch");
7043 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
7045 unsigned actually_pruned
= 0;
7046 auto& purged_snaps
= mon
->mgrstatmon()->get_digest().purged_snaps
;
7047 for (auto& p
: osdmap
.get_pools()) {
7048 auto q
= purged_snaps
.find(p
.first
);
7049 if (q
== purged_snaps
.end()) {
7052 auto& purged
= q
->second
;
7053 if (purged
.empty()) {
7054 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
7057 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
7058 snap_interval_set_t to_prune
;
7059 unsigned maybe_pruned
= actually_pruned
;
7060 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
7061 snapid_t begin
= i
.get_start();
7062 auto end
= i
.get_start() + i
.get_len();
7063 snapid_t pbegin
= 0, pend
= 0;
7064 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
7067 // be a bit aggressive about backing off here, because the mon may
7068 // do a lot of work going through this set, and if we know the
7069 // purged set from the OSDs is at least *partly* stale we may as
7070 // well wait for it to be fresh.
7071 dout(20) << __func__
<< " we've already purged " << pbegin
7072 << "~" << (pend
- pbegin
) << dendl
;
7075 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
7076 // the tail of [begin,end) is purged; shorten the range
7079 to_prune
.insert(begin
, end
- begin
);
7080 maybe_pruned
+= end
- begin
;
7081 if (maybe_pruned
>= max_prune
) {
7085 if (!to_prune
.empty()) {
7086 // PGs may still be reporting things as purged that we have already
7087 // pruned from removed_snaps_queue.
7088 snap_interval_set_t actual
;
7089 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7090 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7091 actual
.intersection_of(to_prune
, r
->second
);
7093 actually_pruned
+= actual
.size();
7094 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7095 << ", actual pruned " << actual
<< dendl
;
7096 if (!actual
.empty()) {
7097 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7100 if (actually_pruned
>= max_prune
) {
7104 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7105 return !!actually_pruned
;
7108 bool OSDMonitor::update_pools_status()
7110 if (!mon
->mgrstatmon()->is_readable())
7115 auto& pools
= osdmap
.get_pools();
7116 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7117 const pool_stat_t
*pstat
= mon
->mgrstatmon()->get_pool_stat(it
->first
);
7120 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7121 const pg_pool_t
&pool
= it
->second
;
7122 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
7125 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7126 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
7128 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7132 mon
->clog
->info() << "pool '" << pool_name
7133 << "' no longer out of quota; removing NO_QUOTA flag";
7134 // below we cancel FLAG_FULL too, we'll set it again in
7135 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7136 clear_pool_flags(it
->first
,
7137 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7143 if (pool
.quota_max_bytes
> 0 &&
7144 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7145 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
7146 << " (reached quota's max_bytes: "
7147 << byte_u_t(pool
.quota_max_bytes
) << ")";
7149 if (pool
.quota_max_objects
> 0 &&
7150 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7151 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
7152 << " (reached quota's max_objects: "
7153 << pool
.quota_max_objects
<< ")";
7155 // set both FLAG_FULL_QUOTA and FLAG_FULL
7156 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7157 // since FLAG_FULL should always take precedence
7158 set_pool_flags(it
->first
,
7159 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7160 clear_pool_flags(it
->first
,
7161 pg_pool_t::FLAG_NEARFULL
|
7162 pg_pool_t::FLAG_BACKFILLFULL
);
7169 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7171 op
->mark_osdmon_event(__func__
);
7172 auto m
= op
->get_req
<MPoolOp
>();
7173 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7174 MonSession
*session
= op
->get_session();
7177 string erasure_code_profile
;
7181 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7183 erasure_code_profile
,
7184 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {},
7188 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
7193 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7194 const string
& dstname
,
7199 // Avoid creating a pending crush if it does not already exists and
7200 // the rename would fail.
7202 if (!_have_pending_crush()) {
7203 ret
= _get_stable_crush().can_rename_bucket(srcname
,
7210 CrushWrapper newcrush
;
7211 _get_pending_crush(newcrush
);
7213 ret
= newcrush
.rename_bucket(srcname
,
7219 pending_inc
.crush
.clear();
7220 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7221 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7225 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7227 string replacement
= "";
7229 if (plugin
== "jerasure_generic" ||
7230 plugin
== "jerasure_sse3" ||
7231 plugin
== "jerasure_sse4" ||
7232 plugin
== "jerasure_neon") {
7233 replacement
= "jerasure";
7234 } else if (plugin
== "shec_generic" ||
7235 plugin
== "shec_sse3" ||
7236 plugin
== "shec_sse4" ||
7237 plugin
== "shec_neon") {
7238 replacement
= "shec";
7241 if (replacement
!= "") {
7242 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7243 << plugin
<< " that has been deprecated. Please use "
7244 << replacement
<< " instead." << dendl
;
7248 int OSDMonitor::normalize_profile(const string
& profilename
,
7249 ErasureCodeProfile
&profile
,
7253 ErasureCodeInterfaceRef erasure_code
;
7254 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7255 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7256 check_legacy_ec_plugin(plugin
->second
, profilename
);
7257 int err
= instance
.factory(plugin
->second
,
7258 g_conf().get_val
<std::string
>("erasure_code_dir"),
7259 profile
, &erasure_code
, ss
);
7264 err
= erasure_code
->init(profile
, ss
);
7269 auto it
= profile
.find("stripe_unit");
7270 if (it
!= profile
.end()) {
7272 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7273 if (!err_str
.empty()) {
7274 *ss
<< "could not parse stripe_unit '" << it
->second
7275 << "': " << err_str
<< std::endl
;
7278 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7279 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7280 if (chunk_size
!= stripe_unit
) {
7281 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7282 << "alignment. Would be padded to " << chunk_size
7286 if ((stripe_unit
% 4096) != 0 && !force
) {
7287 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7288 << "use --force to override this check" << std::endl
;
7295 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7296 const string
&profile
,
7300 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7301 if (ruleid
!= -ENOENT
) {
7302 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
7306 CrushWrapper newcrush
;
7307 _get_pending_crush(newcrush
);
7309 ruleid
= newcrush
.get_rule_id(name
);
7310 if (ruleid
!= -ENOENT
) {
7311 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
7314 ErasureCodeInterfaceRef erasure_code
;
7315 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7317 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7321 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7322 erasure_code
.reset();
7326 pending_inc
.crush
.clear();
7327 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7332 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7333 ErasureCodeInterfaceRef
*erasure_code
,
7336 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7338 ErasureCodeProfile profile
=
7339 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7340 ErasureCodeProfile::const_iterator plugin
=
7341 profile
.find("plugin");
7342 if (plugin
== profile
.end()) {
7343 *ss
<< "cannot determine the erasure code plugin"
7344 << " because there is no 'plugin' entry in the erasure_code_profile "
7345 << profile
<< std::endl
;
7348 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7349 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7350 return instance
.factory(plugin
->second
,
7351 g_conf().get_val
<std::string
>("erasure_code_dir"),
7352 profile
, erasure_code
, ss
);
7355 int OSDMonitor::check_cluster_features(uint64_t features
,
7358 stringstream unsupported_ss
;
7359 int unsupported_count
= 0;
7360 if ((mon
->get_quorum_con_features() & features
) != features
) {
7361 unsupported_ss
<< "the monitor cluster";
7362 ++unsupported_count
;
7365 set
<int32_t> up_osds
;
7366 osdmap
.get_up_osds(up_osds
);
7367 for (set
<int32_t>::iterator it
= up_osds
.begin();
7368 it
!= up_osds
.end(); ++it
) {
7369 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7370 if ((xi
.features
& features
) != features
) {
7371 if (unsupported_count
> 0)
7372 unsupported_ss
<< ", ";
7373 unsupported_ss
<< "osd." << *it
;
7374 unsupported_count
++;
7378 if (unsupported_count
> 0) {
7379 ss
<< "features " << features
<< " unsupported by: "
7380 << unsupported_ss
.str();
7384 // check pending osd state, too!
7385 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7386 pending_inc
.new_xinfo
.begin();
7387 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7388 const osd_xinfo_t
&xi
= p
->second
;
7389 if ((xi
.features
& features
) != features
) {
7390 dout(10) << __func__
<< " pending osd." << p
->first
7391 << " features are insufficient; retry" << dendl
;
7399 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
7402 OSDMap::Incremental new_pending
= pending_inc
;
7403 encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
7405 newmap
.deepish_copy_from(osdmap
);
7406 newmap
.apply_incremental(new_pending
);
7409 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7410 auto mv
= newmap
.get_min_compat_client();
7411 if (mv
> newmap
.require_min_compat_client
) {
7412 ss
<< "new crush map requires client version " << mv
7413 << " but require_min_compat_client is "
7414 << newmap
.require_min_compat_client
;
7421 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7422 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7423 stringstream features_ss
;
7424 int r
= check_cluster_features(features
, features_ss
);
7426 ss
<< "Could not change CRUSH: " << features_ss
.str();
7433 bool OSDMonitor::erasure_code_profile_in_use(
7434 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7435 const string
&profile
,
7439 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
7442 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7443 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7448 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
7453 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7454 map
<string
,string
> *erasure_code_profile_map
,
7457 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7460 erasure_code_profile_map
,
7464 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7465 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7466 map
<string
,string
> user_map
;
7467 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7468 i
!= erasure_code_profile
.end();
7470 size_t equal
= i
->find('=');
7471 if (equal
== string::npos
) {
7472 user_map
[*i
] = string();
7473 (*erasure_code_profile_map
)[*i
] = string();
7475 const string key
= i
->substr(0, equal
);
7477 const string value
= i
->substr(equal
);
7478 if (key
.find("ruleset-") == 0) {
7479 *ss
<< "property '" << key
<< "' is no longer supported; try "
7480 << "'crush-" << key
.substr(8) << "' instead";
7483 user_map
[key
] = value
;
7484 (*erasure_code_profile_map
)[key
] = value
;
7488 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7489 (*erasure_code_profile_map
) = user_map
;
7494 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7495 const string
&erasure_code_profile
,
7497 unsigned *size
, unsigned *min_size
,
7501 switch (pool_type
) {
7502 case pg_pool_t::TYPE_REPLICATED
:
7503 if (repl_size
== 0) {
7504 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7507 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7509 case pg_pool_t::TYPE_ERASURE
:
7511 ErasureCodeInterfaceRef erasure_code
;
7512 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7514 *size
= erasure_code
->get_chunk_count();
7516 erasure_code
->get_data_chunk_count() +
7517 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7518 assert(*min_size
<= *size
);
7519 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7524 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
7531 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7532 const string
&erasure_code_profile
,
7533 uint32_t *stripe_width
,
7537 switch (pool_type
) {
7538 case pg_pool_t::TYPE_REPLICATED
:
7541 case pg_pool_t::TYPE_ERASURE
:
7543 ErasureCodeProfile profile
=
7544 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7545 ErasureCodeInterfaceRef erasure_code
;
7546 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7549 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7550 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7551 auto it
= profile
.find("stripe_unit");
7552 if (it
!= profile
.end()) {
7554 stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7555 ceph_assert(err_str
.empty());
7557 *stripe_width
= data_chunks
*
7558 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7562 *ss
<< "prepare_pool_stripe_width: "
7563 << pool_type
<< " is not a known pool type";
7570 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7571 const string
&erasure_code_profile
,
7572 const string
&rule_name
,
7577 if (*crush_rule
< 0) {
7578 switch (pool_type
) {
7579 case pg_pool_t::TYPE_REPLICATED
:
7581 if (rule_name
== "") {
7583 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
7584 if (*crush_rule
< 0) {
7585 // Errors may happen e.g. if no valid rule is available
7586 *ss
<< "No suitable CRUSH rule exists, check "
7587 << "'osd pool default crush *' config options";
7591 return get_crush_rule(rule_name
, crush_rule
, ss
);
7595 case pg_pool_t::TYPE_ERASURE
:
7597 int err
= crush_rule_create_erasure(rule_name
,
7598 erasure_code_profile
,
7602 dout(20) << "prepare_pool_crush_rule: rule "
7603 << rule_name
<< " try again" << dendl
;
7606 // need to wait for the crush rule to be proposed before proceeding
7617 *ss
<< "prepare_pool_crush_rule: " << pool_type
7618 << " is not a known pool type";
7623 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
7624 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
7632 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7637 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7638 if (ret
!= -ENOENT
) {
7642 CrushWrapper newcrush
;
7643 _get_pending_crush(newcrush
);
7645 ret
= newcrush
.get_rule_id(rule_name
);
7646 if (ret
!= -ENOENT
) {
7647 // found it, wait for it to be proposed
7648 dout(20) << __func__
<< ": rule " << rule_name
7649 << " try again" << dendl
;
7652 // Cannot find it , return error
7653 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
7660 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
7662 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7663 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
7664 auto max_pgs
= max_pgs_per_osd
* num_osds
;
7665 uint64_t projected
= 0;
7667 projected
+= pg_num
* size
;
7669 for (const auto& i
: osdmap
.get_pools()) {
7670 if (i
.first
== pool
) {
7671 projected
+= pg_num
* size
;
7673 projected
+= i
.second
.get_pg_num_target() * i
.second
.get_size();
7676 if (projected
> max_pgs
) {
7678 *ss
<< "pool id " << pool
;
7680 *ss
<< " pg_num " << pg_num
<< " size " << size
7681 << " would mean " << projected
7682 << " total pgs, which exceeds max " << max_pgs
7683 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7684 << " * num_in_osds " << num_osds
<< ")";
7691 * @param name The name of the new pool
7692 * @param crush_rule The crush rule to use. If <0, will use the system default
7693 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7694 * @param pg_num The pg_num to use. If set to 0, will use the system default
7695 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7696 * @param repl_size Replication factor, or 0 for default
7697 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7698 * @param pool_type TYPE_ERASURE, or TYPE_REP
7699 * @param expected_num_objects expected number of objects on the pool
7700 * @param fast_read fast read type.
7701 * @param ss human readable error message, if any.
7703 * @return 0 on success, negative errno on failure.
7705 int OSDMonitor::prepare_new_pool(string
& name
,
7707 const string
&crush_rule_name
,
7708 unsigned pg_num
, unsigned pgp_num
,
7709 unsigned pg_num_min
,
7710 const uint64_t repl_size
,
7711 const uint64_t target_size_bytes
,
7712 const float target_size_ratio
,
7713 const string
&erasure_code_profile
,
7714 const unsigned pool_type
,
7715 const uint64_t expected_num_objects
,
7716 FastReadType fast_read
,
7717 const string
& pg_autoscale_mode
,
7720 if (name
.length() == 0)
7723 pg_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pg_num");
7725 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
7728 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7729 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7730 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7731 << " (you may adjust 'mon max pool pg num' for higher values)";
7734 if (pgp_num
> pg_num
) {
7735 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7736 << ", which in this case is " << pg_num
;
7739 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
7740 *ss
<< "'fast_read' can only apply to erasure coding pool";
7744 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
7745 crush_rule_name
, &crush_rule
, ss
);
7747 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
7750 if (g_conf()->mon_osd_crush_smoke_test
) {
7751 CrushWrapper newcrush
;
7752 _get_pending_crush(newcrush
);
7754 CrushTester
tester(newcrush
, err
);
7755 tester
.set_min_x(0);
7756 tester
.set_max_x(50);
7757 tester
.set_rule(crush_rule
);
7758 auto start
= ceph::coarse_mono_clock::now();
7759 r
= tester
.test_with_fork(g_conf()->mon_lease
);
7760 auto duration
= ceph::coarse_mono_clock::now() - start
;
7762 dout(10) << "tester.test_with_fork returns " << r
7763 << ": " << err
.str() << dendl
;
7764 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
7767 dout(10) << __func__
<< " crush smoke test duration: "
7768 << duration
<< dendl
;
7770 unsigned size
, min_size
;
7771 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
7772 &size
, &min_size
, ss
);
7774 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
7777 r
= check_pg_num(-1, pg_num
, size
, ss
);
7779 dout(10) << "check_pg_num returns " << r
<< dendl
;
7783 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
7787 uint32_t stripe_width
= 0;
7788 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
7790 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
7795 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7796 switch (fast_read
) {
7803 case FAST_READ_DEFAULT
:
7804 fread
= g_conf()->osd_pool_default_ec_fast_read
;
7807 *ss
<< "invalid fast_read setting: " << fast_read
;
7812 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
7813 p
!= pending_inc
.new_pool_names
.end();
7815 if (p
->second
== name
)
7819 if (-1 == pending_inc
.new_pool_max
)
7820 pending_inc
.new_pool_max
= osdmap
.pool_max
;
7821 int64_t pool
= ++pending_inc
.new_pool_max
;
7823 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
7824 pi
->create_time
= ceph_clock_now();
7825 pi
->type
= pool_type
;
7826 pi
->fast_read
= fread
;
7827 pi
->flags
= g_conf()->osd_pool_default_flags
;
7828 if (g_conf()->osd_pool_default_flag_hashpspool
)
7829 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
7830 if (g_conf()->osd_pool_default_flag_nodelete
)
7831 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
7832 if (g_conf()->osd_pool_default_flag_nopgchange
)
7833 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
7834 if (g_conf()->osd_pool_default_flag_nosizechange
)
7835 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
7836 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
7837 if (g_conf()->osd_pool_use_gmt_hitset
)
7838 pi
->use_gmt_hitset
= true;
7840 pi
->use_gmt_hitset
= false;
7843 pi
->min_size
= min_size
;
7844 pi
->crush_rule
= crush_rule
;
7845 pi
->expected_num_objects
= expected_num_objects
;
7846 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
7848 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7849 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
7850 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7851 pi
->pg_autoscale_mode
= m
;
7853 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
7855 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
7857 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
7859 pi
->set_pg_num_pending(pi
->get_pg_num());
7860 pi
->set_pg_num_target(pg_num
);
7861 pi
->set_pgp_num(pi
->get_pg_num());
7862 pi
->set_pgp_num_target(pgp_num
);
7863 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
7865 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
7867 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7868 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7869 pi
->pg_autoscale_mode
= m
;
7872 pi
->last_change
= pending_inc
.epoch
;
7875 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7876 pi
->erasure_code_profile
= erasure_code_profile
;
7878 pi
->erasure_code_profile
= "";
7880 pi
->stripe_width
= stripe_width
;
7882 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
7883 target_size_bytes
) {
7884 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7885 // larger than int32_t max.
7886 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
7888 if (target_size_ratio
> 0.0 &&
7889 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
7890 // only store for nautilus+, just to be consistent and tidy.
7891 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
7894 pi
->cache_target_dirty_ratio_micro
=
7895 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
7896 pi
->cache_target_dirty_high_ratio_micro
=
7897 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
7898 pi
->cache_target_full_ratio_micro
=
7899 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
7900 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
7901 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
7903 pending_inc
.new_pool_names
[pool
] = name
;
7907 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
7909 op
->mark_osdmon_event(__func__
);
7911 if (pending_inc
.new_flags
< 0)
7912 pending_inc
.new_flags
= osdmap
.get_flags();
7913 pending_inc
.new_flags
|= flag
;
7914 ss
<< OSDMap::get_flag_string(flag
) << " is set";
7915 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7916 get_last_committed() + 1));
7920 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
7922 op
->mark_osdmon_event(__func__
);
7924 if (pending_inc
.new_flags
< 0)
7925 pending_inc
.new_flags
= osdmap
.get_flags();
7926 pending_inc
.new_flags
&= ~flag
;
7927 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
7928 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7929 get_last_committed() + 1));
7933 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
7937 cmd_getval(cmdmap
, "pool", poolstr
);
7938 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
7940 ss
<< "unrecognized pool '" << poolstr
<< "'";
7944 cmd_getval(cmdmap
, "var", var
);
7946 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
7947 if (pending_inc
.new_pools
.count(pool
))
7948 p
= pending_inc
.new_pools
[pool
];
7950 // accept val as a json string in the normal case (current
7951 // generation monitor). parse out int or float values from the
7952 // string as needed. however, if it is not a string, try to pull
7953 // out an int, in case an older monitor with an older json schema is
7954 // forwarding a request.
7956 string interr
, floaterr
;
7959 int64_t uf
= 0; // micro-f
7960 cmd_getval(cmdmap
, "val", val
);
7963 "target_max_objects"
7965 auto iec_options
= {
7967 "target_size_bytes",
7968 "compression_max_blob_size",
7969 "compression_min_blob_size",
7973 if (count(begin(si_options
), end(si_options
), var
)) {
7974 n
= strict_si_cast
<int64_t>(val
.c_str(), &interr
);
7975 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
7976 n
= strict_iec_cast
<int64_t>(val
.c_str(), &interr
);
7978 // parse string as both int and float; different fields use different types.
7979 n
= strict_strtoll(val
.c_str(), 10, &interr
);
7980 f
= strict_strtod(val
.c_str(), &floaterr
);
7981 uf
= llrintl(f
* (double)1000000.0);
7985 (var
== "hit_set_type" || var
== "hit_set_period" ||
7986 var
== "hit_set_count" || var
== "hit_set_fpp" ||
7987 var
== "target_max_objects" || var
== "target_max_bytes" ||
7988 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
7989 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
7990 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
7991 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
7992 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
7996 if (var
== "size") {
7997 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
7998 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
8001 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
8002 ss
<< "can not change the size of an erasure-coded pool";
8005 if (interr
.length()) {
8006 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8009 if (n
<= 0 || n
> 10) {
8010 ss
<< "pool size must be between 1 and 10";
8013 if (!osdmap
.crush
->check_crush_rule(p
.get_crush_rule(), p
.type
, n
, ss
)) {
8016 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
8021 p
.min_size
= g_conf().get_osd_pool_default_min_size(p
.size
);
8022 } else if (var
== "min_size") {
8023 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8024 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8027 if (interr
.length()) {
8028 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8032 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
8033 if (n
< 1 || n
> p
.size
) {
8034 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
8038 ErasureCodeInterfaceRef erasure_code
;
8041 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
8043 k
= erasure_code
->get_data_chunk_count();
8045 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
8049 if (n
< k
|| n
> p
.size
) {
8050 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
8055 } else if (var
== "pg_num_actual") {
8056 if (interr
.length()) {
8057 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8060 if (n
== (int)p
.get_pg_num()) {
8063 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8064 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8065 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8066 << " (you may adjust 'mon max pool pg num' for higher values)";
8069 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8070 ss
<< "cannot adjust pg_num while initial PGs are being created";
8073 if (n
> (int)p
.get_pg_num()) {
8074 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
8075 // force pre-nautilus clients to resend their ops, since they
8076 // don't understand pg_num_pending changes form a new interval
8077 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8081 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8082 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8085 if (n
< (int)p
.get_pgp_num()) {
8086 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8089 if (n
< (int)p
.get_pg_num() - 1) {
8090 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8091 << ") - 1; only single pg decrease is currently supported";
8094 p
.set_pg_num_pending(n
);
8095 // force pre-nautilus clients to resend their ops, since they
8096 // don't understand pg_num_pending changes form a new interval
8097 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8099 // force pre-luminous clients to resend their ops, since they
8100 // don't understand that split PGs now form a new interval.
8101 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8102 } else if (var
== "pg_num") {
8103 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8104 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8107 if (interr
.length()) {
8108 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8111 if (n
== (int)p
.get_pg_num_target()) {
8114 if (n
<= 0 || static_cast<uint64_t>(n
) >
8115 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8116 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8117 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8118 << " (you may adjust 'mon max pool pg num' for higher values)";
8121 if (n
> (int)p
.get_pg_num_target()) {
8122 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
8127 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8128 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8129 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8133 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8134 ss
<< "nautilus OSDs are required to decrease pg_num";
8138 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8139 // pre-nautilus osdmap format; increase pg_num directly
8140 assert(n
> (int)p
.get_pg_num());
8141 // force pre-nautilus clients to resend their ops, since they
8142 // don't understand pg_num_target changes form a new interval
8143 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8144 // force pre-luminous clients to resend their ops, since they
8145 // don't understand that split PGs now form a new interval.
8146 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8149 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8150 // make pgp_num track pg_num if it already matches. if it is set
8151 // differently, leave it different and let the user control it
8153 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8154 p
.set_pgp_num_target(n
);
8156 p
.set_pg_num_target(n
);
8158 } else if (var
== "pgp_num_actual") {
8159 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8160 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8163 if (interr
.length()) {
8164 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8168 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8171 if (n
> (int)p
.get_pg_num()) {
8172 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8175 if (n
> (int)p
.get_pg_num_pending()) {
8176 ss
<< "specified pgp_num " << n
8177 << " > pg_num_pending " << p
.get_pg_num_pending();
8181 } else if (var
== "pgp_num") {
8182 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8183 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8186 if (interr
.length()) {
8187 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8191 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8194 if (n
> (int)p
.get_pg_num_target()) {
8195 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8198 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8199 // pre-nautilus osdmap format; increase pgp_num directly
8202 p
.set_pgp_num_target(n
);
8204 } else if (var
== "pg_autoscale_mode") {
8205 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8206 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8207 ss
<< "specified invalid mode " << val
;
8210 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8211 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8214 p
.pg_autoscale_mode
= m
;
8215 } else if (var
== "crush_rule") {
8216 int id
= osdmap
.crush
->get_rule_id(val
);
8217 if (id
== -ENOENT
) {
8218 ss
<< "crush rule " << val
<< " does not exist";
8222 ss
<< cpp_strerror(id
);
8225 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
8229 } else if (var
== "nodelete" || var
== "nopgchange" ||
8230 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8231 var
== "noscrub" || var
== "nodeep-scrub") {
8232 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8233 // make sure we only compare against 'n' if we didn't receive a string
8234 if (val
== "true" || (interr
.empty() && n
== 1)) {
8236 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8239 ss
<< "expecting value 'true', 'false', '0', or '1'";
8242 } else if (var
== "hashpspool") {
8243 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8245 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8248 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8249 " this triggers large data movement,"
8250 " pass --yes-i-really-mean-it if you really do.";
8253 // make sure we only compare against 'n' if we didn't receive a string
8254 if (val
== "true" || (interr
.empty() && n
== 1)) {
8256 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8259 ss
<< "expecting value 'true', 'false', '0', or '1'";
8262 } else if (var
== "hit_set_type") {
8264 p
.hit_set_params
= HitSet::Params();
8266 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8269 if (val
== "bloom") {
8270 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8271 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8272 p
.hit_set_params
= HitSet::Params(bsp
);
8273 } else if (val
== "explicit_hash")
8274 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8275 else if (val
== "explicit_object")
8276 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8278 ss
<< "unrecognized hit_set type '" << val
<< "'";
8282 } else if (var
== "hit_set_period") {
8283 if (interr
.length()) {
8284 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8287 ss
<< "hit_set_period should be non-negative";
8290 p
.hit_set_period
= n
;
8291 } else if (var
== "hit_set_count") {
8292 if (interr
.length()) {
8293 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8296 ss
<< "hit_set_count should be non-negative";
8299 p
.hit_set_count
= n
;
8300 } else if (var
== "hit_set_fpp") {
8301 if (floaterr
.length()) {
8302 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8304 } else if (f
< 0 || f
> 1.0) {
8305 ss
<< "hit_set_fpp should be in the range 0..1";
8308 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8309 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8312 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
8314 } else if (var
== "use_gmt_hitset") {
8315 if (val
== "true" || (interr
.empty() && n
== 1)) {
8316 p
.use_gmt_hitset
= true;
8318 ss
<< "expecting value 'true' or '1'";
8321 } else if (var
== "allow_ec_overwrites") {
8322 if (!p
.is_erasure()) {
8323 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8327 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8328 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8329 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8332 if (val
== "true" || (interr
.empty() && n
== 1)) {
8333 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8334 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8335 ss
<< "ec overwrites cannot be disabled once enabled";
8338 ss
<< "expecting value 'true', 'false', '0', or '1'";
8341 } else if (var
== "target_max_objects") {
8342 if (interr
.length()) {
8343 ss
<< "error parsing int '" << val
<< "': " << interr
;
8346 p
.target_max_objects
= n
;
8347 } else if (var
== "target_max_bytes") {
8348 if (interr
.length()) {
8349 ss
<< "error parsing int '" << val
<< "': " << interr
;
8352 p
.target_max_bytes
= n
;
8353 } else if (var
== "cache_target_dirty_ratio") {
8354 if (floaterr
.length()) {
8355 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8358 if (f
< 0 || f
> 1.0) {
8359 ss
<< "value must be in the range 0..1";
8362 p
.cache_target_dirty_ratio_micro
= uf
;
8363 } else if (var
== "cache_target_dirty_high_ratio") {
8364 if (floaterr
.length()) {
8365 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8368 if (f
< 0 || f
> 1.0) {
8369 ss
<< "value must be in the range 0..1";
8372 p
.cache_target_dirty_high_ratio_micro
= uf
;
8373 } else if (var
== "cache_target_full_ratio") {
8374 if (floaterr
.length()) {
8375 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8378 if (f
< 0 || f
> 1.0) {
8379 ss
<< "value must be in the range 0..1";
8382 p
.cache_target_full_ratio_micro
= uf
;
8383 } else if (var
== "cache_min_flush_age") {
8384 if (interr
.length()) {
8385 ss
<< "error parsing int '" << val
<< "': " << interr
;
8388 p
.cache_min_flush_age
= n
;
8389 } else if (var
== "cache_min_evict_age") {
8390 if (interr
.length()) {
8391 ss
<< "error parsing int '" << val
<< "': " << interr
;
8394 p
.cache_min_evict_age
= n
;
8395 } else if (var
== "min_read_recency_for_promote") {
8396 if (interr
.length()) {
8397 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8400 p
.min_read_recency_for_promote
= n
;
8401 } else if (var
== "hit_set_grade_decay_rate") {
8402 if (interr
.length()) {
8403 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8406 if (n
> 100 || n
< 0) {
8407 ss
<< "value out of range,valid range is 0 - 100";
8410 p
.hit_set_grade_decay_rate
= n
;
8411 } else if (var
== "hit_set_search_last_n") {
8412 if (interr
.length()) {
8413 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8416 if (n
> p
.hit_set_count
|| n
< 0) {
8417 ss
<< "value out of range,valid range is 0 - hit_set_count";
8420 p
.hit_set_search_last_n
= n
;
8421 } else if (var
== "min_write_recency_for_promote") {
8422 if (interr
.length()) {
8423 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8426 p
.min_write_recency_for_promote
= n
;
8427 } else if (var
== "fast_read") {
8428 if (p
.is_replicated()) {
8429 ss
<< "fast read is not supported in replication pool";
8432 if (val
== "true" || (interr
.empty() && n
== 1)) {
8434 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8435 p
.fast_read
= false;
8437 ss
<< "expecting value 'true', 'false', '0', or '1'";
8440 } else if (pool_opts_t::is_opt_name(var
)) {
8441 bool unset
= val
== "unset";
8442 if (var
== "compression_mode") {
8444 auto cmode
= Compressor::get_comp_mode_type(val
);
8446 ss
<< "unrecognized compression mode '" << val
<< "'";
8450 } else if (var
== "compression_algorithm") {
8452 auto alg
= Compressor::get_comp_alg_type(val
);
8454 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8458 } else if (var
== "compression_required_ratio") {
8459 if (floaterr
.length()) {
8460 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8463 if (f
< 0 || f
> 1) {
8464 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8467 } else if (var
== "csum_type") {
8468 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8470 ss
<< "unrecognized csum_type '" << val
<< "'";
8473 //preserve csum_type numeric value
8476 } else if (var
== "compression_max_blob_size" ||
8477 var
== "compression_min_blob_size" ||
8478 var
== "csum_max_block" ||
8479 var
== "csum_min_block") {
8480 if (interr
.length()) {
8481 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8484 } else if (var
== "fingerprint_algorithm") {
8486 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8488 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8492 } else if (var
== "target_size_bytes") {
8493 if (interr
.length()) {
8494 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8497 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8498 ss
<< "must set require_osd_release to nautilus or "
8499 << "later before setting target_size_bytes";
8502 } else if (var
== "pg_num_min") {
8503 if (interr
.length()) {
8504 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8507 if (n
> (int)p
.get_pg_num_target()) {
8508 ss
<< "specified pg_num_min " << n
8509 << " > pg_num " << p
.get_pg_num_target();
8512 } else if (var
== "recovery_priority") {
8513 if (interr
.length()) {
8514 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8517 if (!g_conf()->debug_allow_any_pool_priority
) {
8518 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8519 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8520 << " and " << OSD_POOL_PRIORITY_MAX
;
8524 } else if (var
== "pg_autoscale_bias") {
8525 if (f
< 0.0 || f
> 1000.0) {
8526 ss
<< "pg_autoscale_bias must be between 0 and 1000";
8531 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8532 switch (desc
.type
) {
8533 case pool_opts_t::STR
:
8535 p
.opts
.unset(desc
.key
);
8537 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
8540 case pool_opts_t::INT
:
8541 if (interr
.length()) {
8542 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8546 p
.opts
.unset(desc
.key
);
8548 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
8551 case pool_opts_t::DOUBLE
:
8552 if (floaterr
.length()) {
8553 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8557 p
.opts
.unset(desc
.key
);
8559 p
.opts
.set(desc
.key
, static_cast<double>(f
));
8563 ceph_assert(!"unknown type");
8566 ss
<< "unrecognized variable '" << var
<< "'";
8569 if (val
!= "unset") {
8570 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
8572 ss
<< "unset pool " << pool
<< " " << var
;
8574 p
.last_change
= pending_inc
.epoch
;
8575 pending_inc
.new_pools
[pool
] = p
;
8579 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
8580 const cmdmap_t
& cmdmap
,
8583 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
8586 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
8587 const cmdmap_t
& cmdmap
,
8591 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
8596 * Common logic for preprocess and prepare phases of pool application
8597 * tag commands. In preprocess mode we're only detecting invalid
8598 * commands, and determining whether it was a modification or a no-op.
8599 * In prepare mode we're actually updating the pending state.
8601 int OSDMonitor::_command_pool_application(const string
&prefix
,
8602 const cmdmap_t
& cmdmap
,
8608 cmd_getval(cmdmap
, "pool", pool_name
);
8609 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
8611 ss
<< "unrecognized pool '" << pool_name
<< "'";
8615 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8617 if (pending_inc
.new_pools
.count(pool
)) {
8618 p
= pending_inc
.new_pools
[pool
];
8623 cmd_getval(cmdmap
, "app", app
);
8624 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
8627 cmd_getval(cmdmap
, "key", key
);
8629 ss
<< "key cannot be 'all'";
8634 cmd_getval(cmdmap
, "value", value
);
8635 if (value
== "all") {
8636 ss
<< "value cannot be 'all'";
8640 if (boost::algorithm::ends_with(prefix
, "enable")) {
8642 ss
<< "application name must be provided";
8647 ss
<< "application must be enabled on base tier";
8652 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8654 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
8655 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
8656 << "application; pass --yes-i-really-mean-it to proceed anyway";
8660 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
8661 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
8662 << "max " << MAX_POOL_APPLICATIONS
;
8666 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8667 ss
<< "application name '" << app
<< "' too long; max length "
8668 << MAX_POOL_APPLICATION_LENGTH
;
8673 p
.application_metadata
[app
] = {};
8675 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
8677 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
8679 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8682 ss
<< "Are you SURE? Disabling an application within a pool might result "
8683 << "in loss of application functionality; pass "
8684 << "--yes-i-really-mean-it to proceed anyway";
8689 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8691 return 0; // idempotent
8694 p
.application_metadata
.erase(app
);
8695 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
8697 } else if (boost::algorithm::ends_with(prefix
, "set")) {
8699 ss
<< "application metadata must be set on base tier";
8704 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8710 cmd_getval(cmdmap
, "key", key
);
8713 ss
<< "key must be provided";
8717 auto &app_keys
= p
.application_metadata
[app
];
8718 if (app_keys
.count(key
) == 0 &&
8719 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
8720 ss
<< "too many keys set for application '" << app
<< "' on pool '"
8721 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
8725 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8726 ss
<< "key '" << app
<< "' too long; max length "
8727 << MAX_POOL_APPLICATION_LENGTH
;
8732 cmd_getval(cmdmap
, "value", value
);
8733 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8734 ss
<< "value '" << value
<< "' too long; max length "
8735 << MAX_POOL_APPLICATION_LENGTH
;
8739 p
.application_metadata
[app
][key
] = value
;
8740 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
8741 << value
<< "' on pool '" << pool_name
<< "'";
8742 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
8744 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8750 cmd_getval(cmdmap
, "key", key
);
8751 auto it
= p
.application_metadata
[app
].find(key
);
8752 if (it
== p
.application_metadata
[app
].end()) {
8753 ss
<< "application '" << app
<< "' on pool '" << pool_name
8754 << "' does not have key '" << key
<< "'";
8755 return 0; // idempotent
8758 p
.application_metadata
[app
].erase(it
);
8759 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
8760 << pool_name
<< "'";
8766 p
.last_change
= pending_inc
.epoch
;
8767 pending_inc
.new_pools
[pool
] = p
;
8770 // Because we fell through this far, we didn't hit no-op cases,
8771 // so pool was definitely modified
8772 if (modified
!= nullptr) {
8779 int OSDMonitor::_prepare_command_osd_crush_remove(
8780 CrushWrapper
&newcrush
,
8789 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
8792 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
8797 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
8799 pending_inc
.crush
.clear();
8800 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8803 int OSDMonitor::prepare_command_osd_crush_remove(
8804 CrushWrapper
&newcrush
,
8810 int err
= _prepare_command_osd_crush_remove(
8811 newcrush
, id
, ancestor
,
8812 has_ancestor
, unlink_only
);
8817 ceph_assert(err
== 0);
8818 do_osd_crush_remove(newcrush
);
8823 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
8825 if (osdmap
.is_up(id
)) {
8829 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
8830 pending_inc
.new_uuid
[id
] = uuid_d();
8831 pending_metadata_rm
.insert(id
);
8832 pending_metadata
.erase(id
);
8837 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
8839 ceph_assert(existing_id
);
8842 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
8843 if (!osdmap
.exists(i
) &&
8844 pending_inc
.new_up_client
.count(i
) == 0 &&
8845 (pending_inc
.new_state
.count(i
) == 0 ||
8846 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
8852 if (pending_inc
.new_max_osd
< 0) {
8853 return osdmap
.get_max_osd();
8855 return pending_inc
.new_max_osd
;
8858 void OSDMonitor::do_osd_create(
8861 const string
& device_class
,
8864 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
8865 ceph_assert(new_id
);
8867 // We presume validation has been performed prior to calling this
8868 // function. We assert with prejudice.
8870 int32_t allocated_id
= -1; // declare here so we can jump
8871 int32_t existing_id
= -1;
8872 if (!uuid
.is_zero()) {
8873 existing_id
= osdmap
.identify_osd(uuid
);
8874 if (existing_id
>= 0) {
8875 ceph_assert(id
< 0 || id
== existing_id
);
8876 *new_id
= existing_id
;
8878 } else if (id
>= 0) {
8879 // uuid does not exist, and id has been provided, so just create
8886 // allocate a new id
8887 allocated_id
= _allocate_osd_id(&existing_id
);
8888 dout(10) << __func__
<< " allocated id " << allocated_id
8889 << " existing id " << existing_id
<< dendl
;
8890 if (existing_id
>= 0) {
8891 ceph_assert(existing_id
< osdmap
.get_max_osd());
8892 ceph_assert(allocated_id
< 0);
8893 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
8894 *new_id
= existing_id
;
8895 } else if (allocated_id
>= 0) {
8896 ceph_assert(existing_id
< 0);
8898 if (pending_inc
.new_max_osd
< 0) {
8899 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
8901 ++pending_inc
.new_max_osd
;
8903 *new_id
= pending_inc
.new_max_osd
- 1;
8904 ceph_assert(*new_id
== allocated_id
);
8906 ceph_abort_msg("unexpected condition");
8910 if (device_class
.size()) {
8911 CrushWrapper newcrush
;
8912 _get_pending_crush(newcrush
);
8913 if (newcrush
.get_max_devices() < *new_id
+ 1) {
8914 newcrush
.set_max_devices(*new_id
+ 1);
8916 string name
= string("osd.") + stringify(*new_id
);
8917 if (!newcrush
.item_exists(*new_id
)) {
8918 newcrush
.set_item_name(*new_id
, name
);
8921 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
8923 derr
<< __func__
<< " failed to set " << name
<< " device_class "
8924 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
8926 // non-fatal... this might be a replay and we want to be idempotent.
8928 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
8930 pending_inc
.crush
.clear();
8931 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8934 dout(20) << __func__
<< " no device_class" << dendl
;
8937 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
8938 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
8939 pending_inc
.new_max_osd
= *new_id
+ 1;
8942 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
8943 if (!uuid
.is_zero())
8944 pending_inc
.new_uuid
[*new_id
] = uuid
;
8947 int OSDMonitor::validate_osd_create(
8950 const bool check_osd_exists
,
8951 int32_t* existing_id
,
8955 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
8956 << " check_osd_exists " << check_osd_exists
<< dendl
;
8958 ceph_assert(existing_id
);
8960 if (id
< 0 && uuid
.is_zero()) {
8961 // we have nothing to validate
8964 } else if (uuid
.is_zero()) {
8965 // we have an id but we will ignore it - because that's what
8966 // `osd create` does.
8971 * This function will be used to validate whether we are able to
8972 * create a new osd when the `uuid` is specified.
8974 * It will be used by both `osd create` and `osd new`, as the checks
8975 * are basically the same when it pertains to osd id and uuid validation.
8976 * However, `osd create` presumes an `uuid` is optional, for legacy
8977 * reasons, while `osd new` requires the `uuid` to be provided. This
8978 * means that `osd create` will not be idempotent if an `uuid` is not
8979 * provided, but we will always guarantee the idempotency of `osd new`.
8982 ceph_assert(!uuid
.is_zero());
8983 if (pending_inc
.identify_osd(uuid
) >= 0) {
8984 // osd is about to exist
8988 int32_t i
= osdmap
.identify_osd(uuid
);
8990 // osd already exists
8991 if (id
>= 0 && i
!= id
) {
8992 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
8995 // return a positive errno to distinguish between a blocking error
8996 // and an error we consider to not be a problem (i.e., this would be
8997 // an idempotent operation).
9003 if (pending_inc
.new_state
.count(id
)) {
9004 // osd is about to exist
9007 // we may not care if an osd exists if we are recreating a previously
9009 if (check_osd_exists
&& osdmap
.exists(id
)) {
9010 ss
<< "id " << id
<< " already in use and does not match uuid "
9018 int OSDMonitor::prepare_command_osd_create(
9021 int32_t* existing_id
,
9024 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9025 ceph_assert(existing_id
);
9026 if (osdmap
.is_destroyed(id
)) {
9027 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
9032 if (uuid
.is_zero()) {
9033 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
9036 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
9039 int OSDMonitor::prepare_command_osd_new(
9041 const cmdmap_t
& cmdmap
,
9042 const map
<string
,string
>& params
,
9050 ceph_assert(paxos
->is_plugged());
9052 dout(10) << __func__
<< " " << op
<< dendl
;
9054 /* validate command. abort now if something's wrong. */
9056 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9058 * If `id` is not specified, we will identify any existing osd based
9059 * on `uuid`. Operation will be idempotent iff secrets match.
9061 * If `id` is specified, we will identify any existing osd based on
9062 * `uuid` and match against `id`. If they match, operation will be
9063 * idempotent iff secrets match.
9065 * `-i secrets.json` will be optional. If supplied, will be used
9066 * to check for idempotency when `id` and `uuid` match.
9068 * If `id` is not specified, and `uuid` does not exist, an id will
9069 * be found or allocated for the osd.
9071 * If `id` is specified, and the osd has been previously marked
9072 * as destroyed, then the `id` will be reused.
9074 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
9075 ss
<< "requires the OSD's UUID to be specified.";
9077 } else if (!uuid
.parse(uuidstr
.c_str())) {
9078 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9082 if (cmd_getval(cmdmap
, "id", id
) &&
9084 ss
<< "invalid OSD id; must be greater or equal than zero.";
9088 // are we running an `osd create`-like command, or recreating
9089 // a previously destroyed osd?
9091 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9093 // we will care about `id` to assess whether osd is `destroyed`, or
9094 // to create a new osd.
9095 // we will need an `id` by the time we reach auth.
9097 int32_t existing_id
= -1;
9098 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9101 bool may_be_idempotent
= false;
9102 if (err
== EEXIST
) {
9103 // this is idempotent from the osdmon's point-of-view
9104 may_be_idempotent
= true;
9105 ceph_assert(existing_id
>= 0);
9107 } else if (err
< 0) {
9111 if (!may_be_idempotent
) {
9112 // idempotency is out of the window. We are either creating a new
9113 // osd or recreating a destroyed osd.
9115 // We now need to figure out if we have an `id` (and if it's valid),
9116 // of find an `id` if we don't have one.
9118 // NOTE: we need to consider the case where the `id` is specified for
9119 // `osd create`, and we must honor it. So this means checking if
9120 // the `id` is destroyed, and if so assume the destroy; otherwise,
9121 // check if it `exists` - in which case we complain about not being
9122 // `destroyed`. In the end, if nothing fails, we must allow the
9123 // creation, so that we are compatible with `create`.
9124 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9125 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9126 ss
<< "OSD " << id
<< " has not yet been destroyed";
9128 } else if (id
< 0) {
9130 id
= _allocate_osd_id(&existing_id
);
9132 ceph_assert(existing_id
>= 0);
9135 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9136 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9137 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9139 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9142 ceph_assert(id
>= 0);
9143 ceph_assert(osdmap
.exists(id
));
9146 // we are now able to either create a brand new osd or reuse an existing
9147 // osd that has been previously destroyed.
9149 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9151 if (may_be_idempotent
&& params
.empty()) {
9152 // nothing to do, really.
9153 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9154 ceph_assert(id
>= 0);
9156 f
->open_object_section("created_osd");
9157 f
->dump_int("osdid", id
);
9165 string device_class
;
9166 auto p
= params
.find("crush_device_class");
9167 if (p
!= params
.end()) {
9168 device_class
= p
->second
;
9169 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9171 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9172 bool has_lockbox
= false;
9173 bool has_secrets
= params
.count("cephx_secret")
9174 || params
.count("cephx_lockbox_secret")
9175 || params
.count("dmcrypt_key");
9177 ConfigKeyService
*svc
= nullptr;
9178 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9181 if (params
.count("cephx_secret") == 0) {
9182 ss
<< "requires a cephx secret.";
9185 cephx_secret
= params
.at("cephx_secret");
9187 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9188 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9190 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9191 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9193 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9195 lockbox_secret
= params
.at("cephx_lockbox_secret");
9196 dmcrypt_key
= params
.at("dmcrypt_key");
9197 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9198 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9202 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9204 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
9212 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9213 // for this to be idempotent, `id` should already be >= 0; no need
9214 // to use validate_id.
9215 ceph_assert(id
>= 0);
9216 ss
<< "osd." << id
<< " exists but secrets do not match";
9221 svc
= (ConfigKeyService
*)mon
->config_key_service
;
9222 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9225 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9226 ceph_assert(id
>= 0);
9227 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9232 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9233 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9235 if (may_be_idempotent
) {
9236 // we have nothing to do for either the osdmon or the authmon,
9237 // and we have no lockbox - so the config key service will not be
9238 // touched. This is therefore an idempotent operation, and we can
9239 // just return right away.
9240 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9241 ceph_assert(id
>= 0);
9243 f
->open_object_section("created_osd");
9244 f
->dump_int("osdid", id
);
9251 ceph_assert(!may_be_idempotent
);
9255 ceph_assert(!cephx_secret
.empty());
9256 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9257 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9259 err
= mon
->authmon()->do_osd_new(cephx_entity
,
9262 ceph_assert(0 == err
);
9265 ceph_assert(nullptr != svc
);
9266 svc
->do_osd_new(uuid
, dmcrypt_key
);
9270 if (is_recreate_destroyed
) {
9271 ceph_assert(id
>= 0);
9272 ceph_assert(osdmap
.is_destroyed(id
));
9273 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
9274 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9275 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9276 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9278 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9279 // due to http://tracker.ceph.com/issues/20751 some clusters may
9280 // have UP set for non-existent OSDs; make sure it is cleared
9281 // for a newly created osd.
9282 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9284 pending_inc
.new_uuid
[id
] = uuid
;
9286 ceph_assert(id
>= 0);
9287 int32_t new_id
= -1;
9288 do_osd_create(id
, uuid
, device_class
, &new_id
);
9289 ceph_assert(new_id
>= 0);
9290 ceph_assert(id
== new_id
);
9294 f
->open_object_section("created_osd");
9295 f
->dump_int("osdid", id
);
9304 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9306 op
->mark_osdmon_event(__func__
);
9307 auto m
= op
->get_req
<MMonCommand
>();
9310 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9311 string rs
= ss
.str();
9312 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
9316 MonSession
*session
= op
->get_session();
9318 derr
<< __func__
<< " no session" << dendl
;
9319 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
9323 return prepare_command_impl(op
, cmdmap
);
9326 static int parse_reweights(CephContext
*cct
,
9327 const cmdmap_t
& cmdmap
,
9328 const OSDMap
& osdmap
,
9329 map
<int32_t, uint32_t>* weights
)
9332 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9335 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9336 json_spirit::mValue json_value
;
9337 if (!json_spirit::read(weights_str
, json_value
)) {
9340 if (json_value
.type() != json_spirit::obj_type
) {
9343 const auto obj
= json_value
.get_obj();
9345 for (auto& osd_weight
: obj
) {
9346 auto osd_id
= std::stoi(osd_weight
.first
);
9347 if (!osdmap
.exists(osd_id
)) {
9350 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9353 auto weight
= std::stoul(osd_weight
.second
.get_str());
9354 weights
->insert({osd_id
, weight
});
9356 } catch (const std::logic_error
& e
) {
9362 int OSDMonitor::prepare_command_osd_destroy(
9366 ceph_assert(paxos
->is_plugged());
9368 // we check if the osd exists for the benefit of `osd purge`, which may
9369 // have previously removed the osd. If the osd does not exist, return
9370 // -ENOENT to convey this, and let the caller deal with it.
9372 // we presume that all auth secrets and config keys were removed prior
9373 // to this command being called. if they exist by now, we also assume
9374 // they must have been created by some other command and do not pertain
9375 // to this non-existent osd.
9376 if (!osdmap
.exists(id
)) {
9377 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9381 uuid_d uuid
= osdmap
.get_uuid(id
);
9382 dout(10) << __func__
<< " destroying osd." << id
9383 << " uuid " << uuid
<< dendl
;
9385 // if it has been destroyed, we assume our work here is done.
9386 if (osdmap
.is_destroyed(id
)) {
9387 ss
<< "destroyed osd." << id
;
9391 EntityName cephx_entity
, lockbox_entity
;
9392 bool idempotent_auth
= false, idempotent_cks
= false;
9394 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
9399 if (err
== -ENOENT
) {
9400 idempotent_auth
= true;
9406 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
9407 err
= svc
->validate_osd_destroy(id
, uuid
);
9409 ceph_assert(err
== -ENOENT
);
9411 idempotent_cks
= true;
9414 if (!idempotent_auth
) {
9415 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9416 ceph_assert(0 == err
);
9419 if (!idempotent_cks
) {
9420 svc
->do_osd_destroy(id
, uuid
);
9423 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9424 pending_inc
.new_uuid
[id
] = uuid_d();
9426 // we can only propose_pending() once per service, otherwise we'll be
9427 // defying PaxosService and all laws of nature. Therefore, as we may
9428 // be used during 'osd purge', let's keep the caller responsible for
9430 ceph_assert(err
== 0);
9434 int OSDMonitor::prepare_command_osd_purge(
9438 ceph_assert(paxos
->is_plugged());
9439 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9441 ceph_assert(!osdmap
.is_up(id
));
9444 * This may look a bit weird, but this is what's going to happen:
9446 * 1. we make sure that removing from crush works
9447 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9448 * error, then we abort the whole operation, as no updates
9449 * have been made. However, we this function will have
9450 * side-effects, thus we need to make sure that all operations
9451 * performed henceforth will *always* succeed.
9452 * 3. we call `prepare_command_osd_remove()`. Although this
9453 * function can return an error, it currently only checks if the
9454 * osd is up - and we have made sure that it is not so, so there
9455 * is no conflict, and it is effectively an update.
9456 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9457 * the crush update we delayed from before.
9460 CrushWrapper newcrush
;
9461 _get_pending_crush(newcrush
);
9463 bool may_be_idempotent
= false;
9465 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9466 if (err
== -ENOENT
) {
9468 may_be_idempotent
= true;
9469 } else if (err
< 0) {
9470 ss
<< "error removing osd." << id
<< " from crush";
9474 // no point destroying the osd again if it has already been marked destroyed
9475 if (!osdmap
.is_destroyed(id
)) {
9476 err
= prepare_command_osd_destroy(id
, ss
);
9478 if (err
== -ENOENT
) {
9484 may_be_idempotent
= false;
9487 ceph_assert(0 == err
);
9489 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9490 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9491 << "we are idempotent." << dendl
;
9495 err
= prepare_command_osd_remove(id
);
9496 // we should not be busy, as we should have made sure this id is not up.
9497 ceph_assert(0 == err
);
9499 do_osd_crush_remove(newcrush
);
9503 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9504 const cmdmap_t
& cmdmap
)
9506 op
->mark_osdmon_event(__func__
);
9507 auto m
= op
->get_req
<MMonCommand
>();
9515 cmd_getval(cmdmap
, "format", format
, string("plain"));
9516 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
9519 cmd_getval(cmdmap
, "prefix", prefix
);
9523 bool osdid_present
= false;
9524 if (prefix
!= "osd pg-temp" &&
9525 prefix
!= "osd pg-upmap" &&
9526 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
9527 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
9529 if (osdid_present
) {
9531 oss
<< "osd." << osdid
;
9532 osd_name
= oss
.str();
9535 // Even if there's a pending state with changes that could affect
9536 // a command, considering that said state isn't yet committed, we
9537 // just don't care about those changes if the command currently being
9538 // handled acts as a no-op against the current committed state.
9539 // In a nutshell, we assume this command happens *before*.
9541 // Let me make this clearer:
9543 // - If we have only one client, and that client issues some
9544 // operation that would conflict with this operation but is
9545 // still on the pending state, then we would be sure that said
9546 // operation wouldn't have returned yet, so the client wouldn't
9547 // issue this operation (unless the client didn't wait for the
9548 // operation to finish, and that would be the client's own fault).
9550 // - If we have more than one client, each client will observe
9551 // whatever is the state at the moment of the commit. So, if we
9552 // have two clients, one issuing an unlink and another issuing a
9553 // link, and if the link happens while the unlink is still on the
9554 // pending state, from the link's point-of-view this is a no-op.
9555 // If different clients are issuing conflicting operations and
9556 // they care about that, then the clients should make sure they
9557 // enforce some kind of concurrency mechanism -- from our
9558 // perspective that's what Douglas Adams would call an SEP.
9560 // This should be used as a general guideline for most commands handled
9561 // in this function. Adapt as you see fit, but please bear in mind that
9562 // this is the expected behavior.
9565 if (prefix
== "osd setcrushmap" ||
9566 (prefix
== "osd crush set" && !osdid_present
)) {
9567 if (pending_inc
.crush
.length()) {
9568 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
9569 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9572 dout(10) << "prepare_command setting new crush map" << dendl
;
9573 bufferlist
data(m
->get_data());
9576 auto bl
= data
.cbegin();
9579 catch (const std::exception
&e
) {
9581 ss
<< "Failed to parse crushmap: " << e
.what();
9585 int64_t prior_version
= 0;
9586 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
9587 if (prior_version
== osdmap
.get_crush_version() - 1) {
9588 // see if we are a resend of the last update. this is imperfect
9589 // (multiple racing updaters may not both get reliable success)
9590 // but we expect crush updaters (via this interface) to be rare-ish.
9591 bufferlist current
, proposed
;
9592 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
9593 crush
.encode(proposed
, mon
->get_quorum_con_features());
9594 if (current
.contents_equal(proposed
)) {
9595 dout(10) << __func__
9596 << " proposed matches current and version equals previous"
9599 ss
<< osdmap
.get_crush_version();
9603 if (prior_version
!= osdmap
.get_crush_version()) {
9605 ss
<< "prior_version " << prior_version
<< " != crush version "
9606 << osdmap
.get_crush_version();
9611 if (crush
.has_legacy_rule_ids()) {
9613 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
9616 if (!validate_crush_against_features(&crush
, ss
)) {
9621 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
9626 if (g_conf()->mon_osd_crush_smoke_test
) {
9627 // sanity check: test some inputs to make sure this map isn't
9629 dout(10) << " testing map" << dendl
;
9631 CrushTester
tester(crush
, ess
);
9632 tester
.set_min_x(0);
9633 tester
.set_max_x(50);
9634 auto start
= ceph::coarse_mono_clock::now();
9635 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
9636 auto duration
= ceph::coarse_mono_clock::now() - start
;
9638 dout(10) << " tester.test_with_fork returns " << r
9639 << ": " << ess
.str() << dendl
;
9640 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
9644 dout(10) << __func__
<< " crush somke test duration: "
9645 << duration
<< ", result: " << ess
.str() << dendl
;
9648 pending_inc
.crush
= data
;
9649 ss
<< osdmap
.get_crush_version() + 1;
9652 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
9653 CrushWrapper newcrush
;
9654 _get_pending_crush(newcrush
);
9655 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
9657 if (newcrush
.bucket_exists(bid
) &&
9658 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
9659 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
9660 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
9663 if (!validate_crush_against_features(&newcrush
, ss
)) {
9667 pending_inc
.crush
.clear();
9668 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9669 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9670 get_last_committed() + 1));
9672 } else if (prefix
== "osd crush set-device-class") {
9673 string device_class
;
9674 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9675 err
= -EINVAL
; // no value!
9680 vector
<string
> idvec
;
9681 cmd_getval(cmdmap
, "ids", idvec
);
9682 CrushWrapper newcrush
;
9683 _get_pending_crush(newcrush
);
9685 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9689 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9690 osdmap
.get_all_osds(osds
);
9693 // try traditional single osd way
9694 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9696 // ss has reason for failure
9697 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9704 for (auto &osd
: osds
) {
9705 if (!osdmap
.exists(osd
)) {
9706 ss
<< "osd." << osd
<< " does not exist. ";
9711 oss
<< "osd." << osd
;
9712 string name
= oss
.str();
9714 if (newcrush
.get_max_devices() < osd
+ 1) {
9715 newcrush
.set_max_devices(osd
+ 1);
9718 if (newcrush
.item_exists(osd
)) {
9719 action
= "updating";
9721 action
= "creating";
9722 newcrush
.set_item_name(osd
, name
);
9725 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
9726 << "' device_class '" << device_class
<< "'"
9728 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
9732 if (err
== 0 && !_have_pending_crush()) {
9734 // for single osd only, wildcard makes too much noise
9735 ss
<< "set-device-class item id " << osd
<< " name '" << name
9736 << "' device_class '" << device_class
<< "': no change. ";
9739 updated
.insert(osd
);
9744 if (!updated
.empty()) {
9745 pending_inc
.crush
.clear();
9746 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9747 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
9749 wait_for_finished_proposal(op
,
9750 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9754 } else if (prefix
== "osd crush rm-device-class") {
9756 vector
<string
> idvec
;
9757 cmd_getval(cmdmap
, "ids", idvec
);
9758 CrushWrapper newcrush
;
9759 _get_pending_crush(newcrush
);
9762 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9767 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9768 osdmap
.get_all_osds(osds
);
9771 // try traditional single osd way
9772 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9774 // ss has reason for failure
9775 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9782 for (auto &osd
: osds
) {
9783 if (!osdmap
.exists(osd
)) {
9784 ss
<< "osd." << osd
<< " does not exist. ";
9788 auto class_name
= newcrush
.get_item_class(osd
);
9790 ss
<< "osd." << osd
<< " belongs to no class, ";
9793 // note that we do not verify if class_is_in_use here
9794 // in case the device is misclassified and user wants
9795 // to overridely reset...
9797 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
9799 // ss has reason for failure
9802 updated
.insert(osd
);
9806 if (!updated
.empty()) {
9807 pending_inc
.crush
.clear();
9808 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9809 ss
<< "done removing class of osd(s): " << updated
;
9811 wait_for_finished_proposal(op
,
9812 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9815 } else if (prefix
== "osd crush class create") {
9816 string device_class
;
9817 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9818 err
= -EINVAL
; // no value!
9821 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
9822 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9823 << "luminous' before using crush device classes";
9827 if (!_have_pending_crush() &&
9828 _get_stable_crush().class_exists(device_class
)) {
9829 ss
<< "class '" << device_class
<< "' already exists";
9832 CrushWrapper newcrush
;
9833 _get_pending_crush(newcrush
);
9834 if (newcrush
.class_exists(device_class
)) {
9835 ss
<< "class '" << device_class
<< "' already exists";
9838 int class_id
= newcrush
.get_or_create_class_id(device_class
);
9839 pending_inc
.crush
.clear();
9840 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9841 ss
<< "created class " << device_class
<< " with id " << class_id
9844 } else if (prefix
== "osd crush class rm") {
9845 string device_class
;
9846 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9847 err
= -EINVAL
; // no value!
9850 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
9851 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9852 << "luminous' before using crush device classes";
9857 if (!osdmap
.crush
->class_exists(device_class
)) {
9862 CrushWrapper newcrush
;
9863 _get_pending_crush(newcrush
);
9864 if (!newcrush
.class_exists(device_class
)) {
9865 err
= 0; // make command idempotent
9868 int class_id
= newcrush
.get_class_id(device_class
);
9870 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
9872 ss
<< "class '" << device_class
<< "' " << ts
.str();
9876 // check if class is used by any erasure-code-profiles
9877 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
9878 osdmap
.get_erasure_code_profiles();
9879 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
9880 #ifdef HAVE_STDLIB_MAP_SPLICING
9881 ec_profiles
.merge(old_ec_profiles
);
9883 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
9884 make_move_iterator(end(old_ec_profiles
)));
9886 list
<string
> referenced_by
;
9887 for (auto &i
: ec_profiles
) {
9888 for (auto &j
: i
.second
) {
9889 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
9890 referenced_by
.push_back(i
.first
);
9894 if (!referenced_by
.empty()) {
9896 ss
<< "class '" << device_class
9897 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
9902 newcrush
.get_devices_by_class(device_class
, &osds
);
9903 for (auto& p
: osds
) {
9904 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
9906 // ss has reason for failure
9912 // empty class, remove directly
9913 err
= newcrush
.remove_class_name(device_class
);
9915 ss
<< "class '" << device_class
<< "' cannot be removed '"
9916 << cpp_strerror(err
) << "'";
9921 pending_inc
.crush
.clear();
9922 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9923 ss
<< "removed class " << device_class
<< " with id " << class_id
9924 << " from crush map";
9926 } else if (prefix
== "osd crush class rename") {
9927 string srcname
, dstname
;
9928 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
9932 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
9937 CrushWrapper newcrush
;
9938 _get_pending_crush(newcrush
);
9939 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
9940 // suppose this is a replay and return success
9941 // so command is idempotent
9942 ss
<< "already renamed to '" << dstname
<< "'";
9947 err
= newcrush
.rename_class(srcname
, dstname
);
9949 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
9950 << cpp_strerror(err
);
9954 pending_inc
.crush
.clear();
9955 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9956 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
9958 } else if (prefix
== "osd crush add-bucket") {
9959 // os crush add-bucket <name> <type>
9960 string name
, typestr
;
9961 vector
<string
> argvec
;
9962 cmd_getval(cmdmap
, "name", name
);
9963 cmd_getval(cmdmap
, "type", typestr
);
9964 cmd_getval(cmdmap
, "args", argvec
);
9965 map
<string
,string
> loc
;
9966 if (!argvec
.empty()) {
9967 CrushWrapper::parse_loc_map(argvec
, &loc
);
9968 dout(0) << "will create and move bucket '" << name
9969 << "' to location " << loc
<< dendl
;
9972 if (!_have_pending_crush() &&
9973 _get_stable_crush().name_exists(name
)) {
9974 ss
<< "bucket '" << name
<< "' already exists";
9978 CrushWrapper newcrush
;
9979 _get_pending_crush(newcrush
);
9981 if (newcrush
.name_exists(name
)) {
9982 ss
<< "bucket '" << name
<< "' already exists";
9985 int type
= newcrush
.get_type_id(typestr
);
9987 ss
<< "type '" << typestr
<< "' does not exist";
9992 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
9997 err
= newcrush
.add_bucket(0, 0,
9998 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
10001 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
10004 err
= newcrush
.set_item_name(bucketno
, name
);
10006 ss
<< "error setting bucket name to '" << name
<< "'";
10010 if (!loc
.empty()) {
10011 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
10013 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
10015 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
10019 ss
<< "no need to move item id " << bucketno
<< " name '" << name
10020 << "' to location " << loc
<< " in crush map";
10024 pending_inc
.crush
.clear();
10025 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10027 ss
<< "added bucket " << name
<< " type " << typestr
10028 << " to crush map";
10030 ss
<< "added bucket " << name
<< " type " << typestr
10031 << " to location " << loc
;
10034 } else if (prefix
== "osd crush rename-bucket") {
10035 string srcname
, dstname
;
10036 cmd_getval(cmdmap
, "srcname", srcname
);
10037 cmd_getval(cmdmap
, "dstname", dstname
);
10039 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
10040 if (err
== -EALREADY
) // equivalent to success for idempotency
10046 } else if (prefix
== "osd crush weight-set create" ||
10047 prefix
== "osd crush weight-set create-compat") {
10048 CrushWrapper newcrush
;
10049 _get_pending_crush(newcrush
);
10052 if (newcrush
.has_non_straw2_buckets()) {
10053 ss
<< "crush map contains one or more bucket(s) that are not straw2";
10057 if (prefix
== "osd crush weight-set create") {
10058 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
10059 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
10060 ss
<< "require_min_compat_client "
10061 << osdmap
.require_min_compat_client
10062 << " < luminous, which is required for per-pool weight-sets. "
10063 << "Try 'ceph osd set-require-min-compat-client luminous' "
10064 << "before using the new interface";
10068 string poolname
, mode
;
10069 cmd_getval(cmdmap
, "pool", poolname
);
10070 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10072 ss
<< "pool '" << poolname
<< "' not found";
10076 cmd_getval(cmdmap
, "mode", mode
);
10077 if (mode
!= "flat" && mode
!= "positional") {
10078 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10082 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10084 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10087 if (!newcrush
.create_choose_args(pool
, positions
)) {
10088 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10089 ss
<< "compat weight-set already created";
10091 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10092 << "' already created";
10096 pending_inc
.crush
.clear();
10097 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10100 } else if (prefix
== "osd crush weight-set rm" ||
10101 prefix
== "osd crush weight-set rm-compat") {
10102 CrushWrapper newcrush
;
10103 _get_pending_crush(newcrush
);
10105 if (prefix
== "osd crush weight-set rm") {
10107 cmd_getval(cmdmap
, "pool", poolname
);
10108 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10110 ss
<< "pool '" << poolname
<< "' not found";
10115 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10117 newcrush
.rm_choose_args(pool
);
10118 pending_inc
.crush
.clear();
10119 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10122 } else if (prefix
== "osd crush weight-set reweight" ||
10123 prefix
== "osd crush weight-set reweight-compat") {
10124 string poolname
, item
;
10125 vector
<double> weight
;
10126 cmd_getval(cmdmap
, "pool", poolname
);
10127 cmd_getval(cmdmap
, "item", item
);
10128 cmd_getval(cmdmap
, "weight", weight
);
10129 CrushWrapper newcrush
;
10130 _get_pending_crush(newcrush
);
10132 if (prefix
== "osd crush weight-set reweight") {
10133 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10135 ss
<< "pool '" << poolname
<< "' not found";
10139 if (!newcrush
.have_choose_args(pool
)) {
10140 ss
<< "no weight-set for pool '" << poolname
<< "'";
10144 auto arg_map
= newcrush
.choose_args_get(pool
);
10145 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10146 if (weight
.size() != (size_t)positions
) {
10147 ss
<< "must specify exact " << positions
<< " weight values";
10152 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10153 if (!newcrush
.have_choose_args(pool
)) {
10154 ss
<< "no backward-compatible weight-set";
10159 if (!newcrush
.name_exists(item
)) {
10160 ss
<< "item '" << item
<< "' does not exist";
10164 err
= newcrush
.choose_args_adjust_item_weightf(
10166 newcrush
.choose_args_get(pool
),
10167 newcrush
.get_item_id(item
),
10174 pending_inc
.crush
.clear();
10175 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10177 } else if (osdid_present
&&
10178 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10179 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10180 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10181 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10183 if (!osdmap
.exists(osdid
)) {
10186 << " does not exist. Create it before updating the crush map";
10191 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10192 ss
<< "unable to parse weight value '"
10193 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10199 vector
<string
> argvec
;
10200 cmd_getval(cmdmap
, "args", argvec
);
10201 map
<string
,string
> loc
;
10202 CrushWrapper::parse_loc_map(argvec
, &loc
);
10204 if (prefix
== "osd crush set"
10205 && !_get_stable_crush().item_exists(osdid
)) {
10207 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10208 << "' weight " << weight
<< " at location " << loc
10209 << ": does not exist";
10213 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10214 << osd_name
<< "' weight " << weight
<< " at location "
10216 CrushWrapper newcrush
;
10217 _get_pending_crush(newcrush
);
10220 if (prefix
== "osd crush set" ||
10221 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10223 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10226 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10234 if (err
== 0 && !_have_pending_crush()) {
10235 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10236 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10240 pending_inc
.crush
.clear();
10241 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10242 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10243 << weight
<< " at location " << loc
<< " to crush map";
10245 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10246 get_last_committed() + 1));
10249 } else if (prefix
== "osd crush create-or-move") {
10251 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10252 if (!osdmap
.exists(osdid
)) {
10255 << " does not exist. create it before updating the crush map";
10260 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10261 ss
<< "unable to parse weight value '"
10262 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10268 vector
<string
> argvec
;
10269 cmd_getval(cmdmap
, "args", argvec
);
10270 map
<string
,string
> loc
;
10271 CrushWrapper::parse_loc_map(argvec
, &loc
);
10273 dout(0) << "create-or-move crush item name '" << osd_name
10274 << "' initial_weight " << weight
<< " at location " << loc
10277 CrushWrapper newcrush
;
10278 _get_pending_crush(newcrush
);
10280 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10281 g_conf()->osd_crush_update_weight_set
);
10283 ss
<< "create-or-move updated item name '" << osd_name
10284 << "' weight " << weight
10285 << " at location " << loc
<< " to crush map";
10289 pending_inc
.crush
.clear();
10290 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10291 ss
<< "create-or-move updating item name '" << osd_name
10292 << "' weight " << weight
10293 << " at location " << loc
<< " to crush map";
10295 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10296 get_last_committed() + 1));
10301 } else if (prefix
== "osd crush move") {
10303 // osd crush move <name> <loc1> [<loc2> ...]
10305 vector
<string
> argvec
;
10306 cmd_getval(cmdmap
, "name", name
);
10307 cmd_getval(cmdmap
, "args", argvec
);
10308 map
<string
,string
> loc
;
10309 CrushWrapper::parse_loc_map(argvec
, &loc
);
10311 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10312 CrushWrapper newcrush
;
10313 _get_pending_crush(newcrush
);
10315 if (!newcrush
.name_exists(name
)) {
10317 ss
<< "item " << name
<< " does not exist";
10320 int id
= newcrush
.get_item_id(name
);
10322 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10324 err
= newcrush
.create_or_move_item(
10325 cct
, id
, 0, name
, loc
,
10326 g_conf()->osd_crush_update_weight_set
);
10328 err
= newcrush
.move_bucket(cct
, id
, loc
);
10331 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10332 pending_inc
.crush
.clear();
10333 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10335 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10336 get_last_committed() + 1));
10340 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10344 } else if (prefix
== "osd crush swap-bucket") {
10345 string source
, dest
;
10346 cmd_getval(cmdmap
, "source", source
);
10347 cmd_getval(cmdmap
, "dest", dest
);
10349 bool force
= false;
10350 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10352 CrushWrapper newcrush
;
10353 _get_pending_crush(newcrush
);
10354 if (!newcrush
.name_exists(source
)) {
10355 ss
<< "source item " << source
<< " does not exist";
10359 if (!newcrush
.name_exists(dest
)) {
10360 ss
<< "dest item " << dest
<< " does not exist";
10364 int sid
= newcrush
.get_item_id(source
);
10365 int did
= newcrush
.get_item_id(dest
);
10367 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10368 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10372 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10374 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10375 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10376 << "; pass --yes-i-really-mean-it to proceed anyway";
10380 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10382 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10386 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10387 pending_inc
.crush
.clear();
10388 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10389 wait_for_finished_proposal(op
,
10390 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10391 get_last_committed() + 1));
10393 } else if (prefix
== "osd crush link") {
10394 // osd crush link <name> <loc1> [<loc2> ...]
10396 cmd_getval(cmdmap
, "name", name
);
10397 vector
<string
> argvec
;
10398 cmd_getval(cmdmap
, "args", argvec
);
10399 map
<string
,string
> loc
;
10400 CrushWrapper::parse_loc_map(argvec
, &loc
);
10402 // Need an explicit check for name_exists because get_item_id returns
10404 int id
= osdmap
.crush
->get_item_id(name
);
10405 if (!osdmap
.crush
->name_exists(name
)) {
10407 ss
<< "item " << name
<< " does not exist";
10410 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10412 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10413 ss
<< "no need to move item id " << id
<< " name '" << name
10414 << "' to location " << loc
<< " in crush map";
10419 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10420 CrushWrapper newcrush
;
10421 _get_pending_crush(newcrush
);
10423 if (!newcrush
.name_exists(name
)) {
10425 ss
<< "item " << name
<< " does not exist";
10428 int id
= newcrush
.get_item_id(name
);
10429 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10430 err
= newcrush
.link_bucket(cct
, id
, loc
);
10432 ss
<< "linked item id " << id
<< " name '" << name
10433 << "' to location " << loc
<< " in crush map";
10434 pending_inc
.crush
.clear();
10435 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10437 ss
<< "cannot link item id " << id
<< " name '" << name
10438 << "' to location " << loc
;
10442 ss
<< "no need to move item id " << id
<< " name '" << name
10443 << "' to location " << loc
<< " in crush map";
10447 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10448 get_last_committed() + 1));
10450 } else if (prefix
== "osd crush rm" ||
10451 prefix
== "osd crush remove" ||
10452 prefix
== "osd crush unlink") {
10454 // osd crush rm <id> [ancestor]
10455 CrushWrapper newcrush
;
10456 _get_pending_crush(newcrush
);
10459 cmd_getval(cmdmap
, "name", name
);
10461 if (!osdmap
.crush
->name_exists(name
)) {
10463 ss
<< "device '" << name
<< "' does not appear in the crush map";
10466 if (!newcrush
.name_exists(name
)) {
10468 ss
<< "device '" << name
<< "' does not appear in the crush map";
10470 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10471 get_last_committed() + 1));
10474 int id
= newcrush
.get_item_id(name
);
10477 bool unlink_only
= prefix
== "osd crush unlink";
10478 string ancestor_str
;
10479 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10480 if (!newcrush
.name_exists(ancestor_str
)) {
10482 ss
<< "ancestor item '" << ancestor_str
10483 << "' does not appear in the crush map";
10486 ancestor
= newcrush
.get_item_id(ancestor_str
);
10489 err
= prepare_command_osd_crush_remove(
10492 (ancestor
< 0), unlink_only
);
10494 if (err
== -ENOENT
) {
10495 ss
<< "item " << id
<< " does not appear in that position";
10501 pending_inc
.new_crush_node_flags
[id
] = 0;
10502 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10504 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10505 get_last_committed() + 1));
10510 } else if (prefix
== "osd crush reweight-all") {
10511 CrushWrapper newcrush
;
10512 _get_pending_crush(newcrush
);
10514 newcrush
.reweight(cct
);
10515 pending_inc
.crush
.clear();
10516 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10517 ss
<< "reweighted crush hierarchy";
10519 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10520 get_last_committed() + 1));
10522 } else if (prefix
== "osd crush reweight") {
10523 // osd crush reweight <name> <weight>
10524 CrushWrapper newcrush
;
10525 _get_pending_crush(newcrush
);
10528 cmd_getval(cmdmap
, "name", name
);
10529 if (!newcrush
.name_exists(name
)) {
10531 ss
<< "device '" << name
<< "' does not appear in the crush map";
10535 int id
= newcrush
.get_item_id(name
);
10537 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
10542 if (!cmd_getval(cmdmap
, "weight", w
)) {
10543 ss
<< "unable to parse weight value '"
10544 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10549 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
10550 g_conf()->osd_crush_update_weight_set
);
10553 pending_inc
.crush
.clear();
10554 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10555 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
10556 << " in crush map";
10558 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10559 get_last_committed() + 1));
10561 } else if (prefix
== "osd crush reweight-subtree") {
10562 // osd crush reweight <name> <weight>
10563 CrushWrapper newcrush
;
10564 _get_pending_crush(newcrush
);
10567 cmd_getval(cmdmap
, "name", name
);
10568 if (!newcrush
.name_exists(name
)) {
10570 ss
<< "device '" << name
<< "' does not appear in the crush map";
10574 int id
= newcrush
.get_item_id(name
);
10576 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
10581 if (!cmd_getval(cmdmap
, "weight", w
)) {
10582 ss
<< "unable to parse weight value '"
10583 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10588 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
10589 g_conf()->osd_crush_update_weight_set
);
10592 pending_inc
.crush
.clear();
10593 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10594 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
10595 << " in crush map";
10597 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10598 get_last_committed() + 1));
10600 } else if (prefix
== "osd crush tunables") {
10601 CrushWrapper newcrush
;
10602 _get_pending_crush(newcrush
);
10606 cmd_getval(cmdmap
, "profile", profile
);
10607 if (profile
== "legacy" || profile
== "argonaut") {
10608 newcrush
.set_tunables_legacy();
10609 } else if (profile
== "bobtail") {
10610 newcrush
.set_tunables_bobtail();
10611 } else if (profile
== "firefly") {
10612 newcrush
.set_tunables_firefly();
10613 } else if (profile
== "hammer") {
10614 newcrush
.set_tunables_hammer();
10615 } else if (profile
== "jewel") {
10616 newcrush
.set_tunables_jewel();
10617 } else if (profile
== "optimal") {
10618 newcrush
.set_tunables_optimal();
10619 } else if (profile
== "default") {
10620 newcrush
.set_tunables_default();
10622 ss
<< "unrecognized profile '" << profile
<< "'";
10627 if (!validate_crush_against_features(&newcrush
, ss
)) {
10632 pending_inc
.crush
.clear();
10633 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10634 ss
<< "adjusted tunables profile to " << profile
;
10636 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10637 get_last_committed() + 1));
10639 } else if (prefix
== "osd crush set-tunable") {
10640 CrushWrapper newcrush
;
10641 _get_pending_crush(newcrush
);
10645 cmd_getval(cmdmap
, "tunable", tunable
);
10647 int64_t value
= -1;
10648 if (!cmd_getval(cmdmap
, "value", value
)) {
10650 ss
<< "failed to parse integer value "
10651 << cmd_vartype_stringify(cmdmap
.at("value"));
10655 if (tunable
== "straw_calc_version") {
10656 if (value
!= 0 && value
!= 1) {
10657 ss
<< "value must be 0 or 1; got " << value
;
10661 newcrush
.set_straw_calc_version(value
);
10663 ss
<< "unrecognized tunable '" << tunable
<< "'";
10668 if (!validate_crush_against_features(&newcrush
, ss
)) {
10673 pending_inc
.crush
.clear();
10674 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10675 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
10677 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10678 get_last_committed() + 1));
10681 } else if (prefix
== "osd crush rule create-simple") {
10682 string name
, root
, type
, mode
;
10683 cmd_getval(cmdmap
, "name", name
);
10684 cmd_getval(cmdmap
, "root", root
);
10685 cmd_getval(cmdmap
, "type", type
);
10686 cmd_getval(cmdmap
, "mode", mode
);
10690 if (osdmap
.crush
->rule_exists(name
)) {
10691 // The name is uniquely associated to a ruleid and the rule it contains
10692 // From the user point of view, the rule is more meaningfull.
10693 ss
<< "rule " << name
<< " already exists";
10698 CrushWrapper newcrush
;
10699 _get_pending_crush(newcrush
);
10701 if (newcrush
.rule_exists(name
)) {
10702 // The name is uniquely associated to a ruleid and the rule it contains
10703 // From the user point of view, the rule is more meaningfull.
10704 ss
<< "rule " << name
<< " already exists";
10707 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
10708 pg_pool_t::TYPE_REPLICATED
, &ss
);
10714 pending_inc
.crush
.clear();
10715 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10718 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10719 get_last_committed() + 1));
10722 } else if (prefix
== "osd crush rule create-replicated") {
10723 string name
, root
, type
, device_class
;
10724 cmd_getval(cmdmap
, "name", name
);
10725 cmd_getval(cmdmap
, "root", root
);
10726 cmd_getval(cmdmap
, "type", type
);
10727 cmd_getval(cmdmap
, "class", device_class
);
10729 if (osdmap
.crush
->rule_exists(name
)) {
10730 // The name is uniquely associated to a ruleid and the rule it contains
10731 // From the user point of view, the rule is more meaningfull.
10732 ss
<< "rule " << name
<< " already exists";
10737 CrushWrapper newcrush
;
10738 _get_pending_crush(newcrush
);
10740 if (newcrush
.rule_exists(name
)) {
10741 // The name is uniquely associated to a ruleid and the rule it contains
10742 // From the user point of view, the rule is more meaningfull.
10743 ss
<< "rule " << name
<< " already exists";
10746 int ruleno
= newcrush
.add_simple_rule(
10747 name
, root
, type
, device_class
,
10748 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
10754 pending_inc
.crush
.clear();
10755 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10758 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10759 get_last_committed() + 1));
10762 } else if (prefix
== "osd erasure-code-profile rm") {
10764 cmd_getval(cmdmap
, "name", name
);
10766 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
10769 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
10774 if (osdmap
.has_erasure_code_profile(name
) ||
10775 pending_inc
.new_erasure_code_profiles
.count(name
)) {
10776 if (osdmap
.has_erasure_code_profile(name
)) {
10777 pending_inc
.old_erasure_code_profiles
.push_back(name
);
10779 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
10780 pending_inc
.new_erasure_code_profiles
.erase(name
);
10784 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10785 get_last_committed() + 1));
10788 ss
<< "erasure-code-profile " << name
<< " does not exist";
10793 } else if (prefix
== "osd erasure-code-profile set") {
10795 cmd_getval(cmdmap
, "name", name
);
10796 vector
<string
> profile
;
10797 cmd_getval(cmdmap
, "profile", profile
);
10799 bool force
= false;
10800 cmd_getval(cmdmap
, "force", force
);
10802 map
<string
,string
> profile_map
;
10803 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
10806 if (profile_map
.find("plugin") == profile_map
.end()) {
10807 ss
<< "erasure-code-profile " << profile_map
10808 << " must contain a plugin entry" << std::endl
;
10812 string plugin
= profile_map
["plugin"];
10814 if (pending_inc
.has_erasure_code_profile(name
)) {
10815 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
10818 err
= normalize_profile(name
, profile_map
, force
, &ss
);
10822 if (osdmap
.has_erasure_code_profile(name
)) {
10823 ErasureCodeProfile existing_profile_map
=
10824 osdmap
.get_erasure_code_profile(name
);
10825 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
10829 if (existing_profile_map
== profile_map
) {
10835 ss
<< "will not override erasure code profile " << name
10836 << " because the existing profile "
10837 << existing_profile_map
10838 << " is different from the proposed profile "
10844 dout(20) << "erasure code profile set " << name
<< "="
10845 << profile_map
<< dendl
;
10846 pending_inc
.set_erasure_code_profile(name
, profile_map
);
10850 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10851 get_last_committed() + 1));
10854 } else if (prefix
== "osd crush rule create-erasure") {
10855 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
10856 if (err
== -EAGAIN
)
10860 string name
, poolstr
;
10861 cmd_getval(cmdmap
, "name", name
);
10863 cmd_getval(cmdmap
, "profile", profile
);
10865 profile
= "default";
10866 if (profile
== "default") {
10867 if (!osdmap
.has_erasure_code_profile(profile
)) {
10868 if (pending_inc
.has_erasure_code_profile(profile
)) {
10869 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
10873 map
<string
,string
> profile_map
;
10874 err
= osdmap
.get_erasure_code_profile_default(cct
,
10879 err
= normalize_profile(name
, profile_map
, true, &ss
);
10882 dout(20) << "erasure code profile set " << profile
<< "="
10883 << profile_map
<< dendl
;
10884 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
10890 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
10893 case -EEXIST
: // return immediately
10894 ss
<< "rule " << name
<< " already exists";
10898 case -EALREADY
: // wait for pending to be proposed
10899 ss
<< "rule " << name
<< " already exists";
10902 default: // non recoverable error
10907 ss
<< "created rule " << name
<< " at " << rule
;
10911 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10912 get_last_committed() + 1));
10915 } else if (prefix
== "osd crush rule rm") {
10917 cmd_getval(cmdmap
, "name", name
);
10919 if (!osdmap
.crush
->rule_exists(name
)) {
10920 ss
<< "rule " << name
<< " does not exist";
10925 CrushWrapper newcrush
;
10926 _get_pending_crush(newcrush
);
10928 if (!newcrush
.rule_exists(name
)) {
10929 ss
<< "rule " << name
<< " does not exist";
10932 int ruleno
= newcrush
.get_rule_id(name
);
10933 ceph_assert(ruleno
>= 0);
10935 // make sure it is not in use.
10936 // FIXME: this is ok in some situations, but let's not bother with that
10938 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
10939 if (osdmap
.crush_rule_in_use(ruleset
)) {
10940 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
10945 err
= newcrush
.remove_rule(ruleno
);
10950 pending_inc
.crush
.clear();
10951 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10954 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10955 get_last_committed() + 1));
10958 } else if (prefix
== "osd crush rule rename") {
10961 cmd_getval(cmdmap
, "srcname", srcname
);
10962 cmd_getval(cmdmap
, "dstname", dstname
);
10963 if (srcname
.empty() || dstname
.empty()) {
10964 ss
<< "must specify both source rule name and destination rule name";
10968 if (srcname
== dstname
) {
10969 ss
<< "destination rule name is equal to source rule name";
10974 CrushWrapper newcrush
;
10975 _get_pending_crush(newcrush
);
10976 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
10977 // srcname does not exist and dstname already exists
10978 // suppose this is a replay and return success
10979 // (so this command is idempotent)
10980 ss
<< "already renamed to '" << dstname
<< "'";
10985 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
10987 // ss has reason for failure
10990 pending_inc
.crush
.clear();
10991 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10993 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10994 get_last_committed() + 1));
10997 } else if (prefix
== "osd setmaxosd") {
10999 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
11000 ss
<< "unable to parse 'newmax' value '"
11001 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
11006 if (newmax
> g_conf()->mon_max_osd
) {
11008 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
11009 << g_conf()->mon_max_osd
<< ")";
11013 // Don't allow shrinking OSD number as this will cause data loss
11014 // and may cause kernel crashes.
11015 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11016 if (newmax
< osdmap
.get_max_osd()) {
11017 // Check if the OSDs exist between current max and new value.
11018 // If there are any OSDs exist, then don't allow shrinking number
11020 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
11021 if (osdmap
.exists(i
)) {
11023 ss
<< "cannot shrink max_osd to " << newmax
11024 << " because osd." << i
<< " (and possibly others) still in use";
11030 pending_inc
.new_max_osd
= newmax
;
11031 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
11033 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11034 get_last_committed() + 1));
11037 } else if (prefix
== "osd set-full-ratio" ||
11038 prefix
== "osd set-backfillfull-ratio" ||
11039 prefix
== "osd set-nearfull-ratio") {
11041 if (!cmd_getval(cmdmap
, "ratio", n
)) {
11042 ss
<< "unable to parse 'ratio' value '"
11043 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
11047 if (prefix
== "osd set-full-ratio")
11048 pending_inc
.new_full_ratio
= n
;
11049 else if (prefix
== "osd set-backfillfull-ratio")
11050 pending_inc
.new_backfillfull_ratio
= n
;
11051 else if (prefix
== "osd set-nearfull-ratio")
11052 pending_inc
.new_nearfull_ratio
= n
;
11053 ss
<< prefix
<< " " << n
;
11055 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11056 get_last_committed() + 1));
11058 } else if (prefix
== "osd set-require-min-compat-client") {
11060 cmd_getval(cmdmap
, "version", v
);
11061 ceph_release_t vno
= ceph_release_from_name(v
);
11063 ss
<< "version " << v
<< " is not recognized";
11068 newmap
.deepish_copy_from(osdmap
);
11069 newmap
.apply_incremental(pending_inc
);
11070 newmap
.require_min_compat_client
= vno
;
11071 auto mvno
= newmap
.get_min_compat_client();
11073 ss
<< "osdmap current utilizes features that require " << mvno
11074 << "; cannot set require_min_compat_client below that to " << vno
;
11079 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11082 mon
->get_combined_feature_map(&m
);
11083 uint64_t features
= ceph_release_features(ceph::to_integer
<int>(vno
));
11087 CEPH_ENTITY_TYPE_CLIENT
,
11088 CEPH_ENTITY_TYPE_MDS
,
11089 CEPH_ENTITY_TYPE_MGR
}) {
11090 auto p
= m
.m
.find(type
);
11091 if (p
== m
.m
.end()) {
11094 for (auto& q
: p
->second
) {
11095 uint64_t missing
= ~q
.first
& features
;
11098 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11103 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11104 << "(s) look like " << ceph_release_name(
11105 ceph_release_from_features(q
.first
))
11106 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11112 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11117 ss
<< "set require_min_compat_client to " << vno
;
11118 pending_inc
.new_require_min_compat_client
= vno
;
11120 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11121 get_last_committed() + 1));
11123 } else if (prefix
== "osd pause") {
11124 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11126 } else if (prefix
== "osd unpause") {
11127 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11129 } else if (prefix
== "osd set") {
11131 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11134 cmd_getval(cmdmap
, "key", key
);
11135 if (key
== "pause")
11136 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11137 else if (key
== "noup")
11138 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11139 else if (key
== "nodown")
11140 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11141 else if (key
== "noout")
11142 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11143 else if (key
== "noin")
11144 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11145 else if (key
== "nobackfill")
11146 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11147 else if (key
== "norebalance")
11148 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11149 else if (key
== "norecover")
11150 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11151 else if (key
== "noscrub")
11152 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11153 else if (key
== "nodeep-scrub")
11154 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11155 else if (key
== "notieragent")
11156 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11157 else if (key
== "nosnaptrim")
11158 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11159 else if (key
== "pglog_hardlimit") {
11160 if (!osdmap
.get_num_up_osds() && !sure
) {
11161 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11162 << "--yes-i-really-mean-it if you really wish to continue.";
11166 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11167 // we are reusing a jewel feature bit that was retired in luminous.
11168 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11169 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11171 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11173 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11178 ss
<< "unrecognized flag '" << key
<< "'";
11182 } else if (prefix
== "osd unset") {
11184 cmd_getval(cmdmap
, "key", key
);
11185 if (key
== "pause")
11186 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11187 else if (key
== "noup")
11188 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11189 else if (key
== "nodown")
11190 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11191 else if (key
== "noout")
11192 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11193 else if (key
== "noin")
11194 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11195 else if (key
== "nobackfill")
11196 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11197 else if (key
== "norebalance")
11198 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11199 else if (key
== "norecover")
11200 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11201 else if (key
== "noscrub")
11202 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11203 else if (key
== "nodeep-scrub")
11204 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11205 else if (key
== "notieragent")
11206 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11207 else if (key
== "nosnaptrim")
11208 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11210 ss
<< "unrecognized flag '" << key
<< "'";
11214 } else if (prefix
== "osd require-osd-release") {
11216 cmd_getval(cmdmap
, "release", release
);
11218 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11219 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11221 ss
<< "unrecognized release " << release
;
11225 if (rel
== osdmap
.require_osd_release
) {
11230 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
11231 if (!osdmap
.get_num_up_osds() && !sure
) {
11232 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11233 << "--yes-i-really-mean-it if you really wish to continue.";
11237 if (rel
== ceph_release_t::mimic
) {
11238 if (!mon
->monmap
->get_required_features().contains_all(
11239 ceph::features::mon::FEATURE_MIMIC
)) {
11240 ss
<< "not all mons are mimic";
11244 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_MIMIC
))
11246 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11250 } else if (rel
== ceph_release_t::nautilus
) {
11251 if (!mon
->monmap
->get_required_features().contains_all(
11252 ceph::features::mon::FEATURE_NAUTILUS
)) {
11253 ss
<< "not all mons are nautilus";
11257 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_NAUTILUS
))
11259 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
11263 } else if (rel
== ceph_release_t::octopus
) {
11264 if (!mon
->monmap
->get_required_features().contains_all(
11265 ceph::features::mon::FEATURE_OCTOPUS
)) {
11266 ss
<< "not all mons are octopus";
11270 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_OCTOPUS
))
11272 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11277 ss
<< "not supported for this release yet";
11281 if (rel
< osdmap
.require_osd_release
) {
11282 ss
<< "require_osd_release cannot be lowered once it has been set";
11286 pending_inc
.new_require_osd_release
= rel
;
11288 } else if (prefix
== "osd down" ||
11289 prefix
== "osd out" ||
11290 prefix
== "osd in" ||
11291 prefix
== "osd rm" ||
11292 prefix
== "osd stop") {
11296 bool verbose
= true;
11297 bool definitely_dead
= false;
11299 vector
<string
> idvec
;
11300 cmd_getval(cmdmap
, "ids", idvec
);
11301 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11302 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11303 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11308 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11309 if (prefix
== "osd in") {
11310 // touch out osds only
11311 osdmap
.get_out_existing_osds(osds
);
11313 osdmap
.get_all_osds(osds
);
11316 verbose
= false; // so the output is less noisy.
11318 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11320 ss
<< "invalid osd id" << osd
;
11323 } else if (!osdmap
.exists(osd
)) {
11324 ss
<< "osd." << osd
<< " does not exist. ";
11331 for (auto &osd
: osds
) {
11332 if (prefix
== "osd down") {
11333 if (osdmap
.is_down(osd
)) {
11335 ss
<< "osd." << osd
<< " is already down. ";
11337 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11338 ss
<< "marked down osd." << osd
<< ". ";
11341 if (definitely_dead
) {
11342 if (!pending_inc
.new_xinfo
.count(osd
)) {
11343 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11345 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11348 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11350 } else if (prefix
== "osd out") {
11351 if (osdmap
.is_out(osd
)) {
11353 ss
<< "osd." << osd
<< " is already out. ";
11355 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11356 if (osdmap
.osd_weight
[osd
]) {
11357 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11358 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11360 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11362 ss
<< "marked out osd." << osd
<< ". ";
11363 std::ostringstream msg
;
11364 msg
<< "Client " << op
->get_session()->entity_name
11365 << " marked osd." << osd
<< " out";
11366 if (osdmap
.is_up(osd
)) {
11367 msg
<< ", while it was still marked up";
11369 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11370 msg
<< ", after it was down for " << int(period
.sec())
11374 mon
->clog
->info() << msg
.str();
11377 } else if (prefix
== "osd in") {
11378 if (osdmap
.is_in(osd
)) {
11380 ss
<< "osd." << osd
<< " is already in. ";
11382 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11383 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11384 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11385 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11387 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11389 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11391 ss
<< "marked in osd." << osd
<< ". ";
11394 } else if (prefix
== "osd rm") {
11395 err
= prepare_command_osd_remove(osd
);
11397 if (err
== -EBUSY
) {
11400 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11402 ceph_assert(err
== 0);
11404 ss
<< ", osd." << osd
;
11406 ss
<< "removed osd." << osd
;
11410 } else if (prefix
== "osd stop") {
11411 if (osdmap
.is_stop(osd
)) {
11413 ss
<< "osd." << osd
<< " is already stopped. ";
11414 } else if (osdmap
.is_down(osd
)) {
11415 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11416 ss
<< "stop down osd." << osd
<< ". ";
11419 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11420 ss
<< "stop osd." << osd
<< ". ";
11428 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11429 get_last_committed() + 1));
11432 } else if (prefix
== "osd set-group" ||
11433 prefix
== "osd unset-group" ||
11434 prefix
== "osd add-noup" ||
11435 prefix
== "osd add-nodown" ||
11436 prefix
== "osd add-noin" ||
11437 prefix
== "osd add-noout" ||
11438 prefix
== "osd rm-noup" ||
11439 prefix
== "osd rm-nodown" ||
11440 prefix
== "osd rm-noin" ||
11441 prefix
== "osd rm-noout") {
11442 bool do_set
= prefix
== "osd set-group" ||
11443 prefix
.find("add") != string::npos
;
11445 unsigned flags
= 0;
11446 vector
<string
> who
;
11447 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11448 cmd_getval(cmdmap
, "flags", flag_str
);
11449 cmd_getval(cmdmap
, "who", who
);
11450 vector
<string
> raw_flags
;
11451 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11452 for (auto& f
: raw_flags
) {
11454 flags
|= CEPH_OSD_NOUP
;
11455 else if (f
== "nodown")
11456 flags
|= CEPH_OSD_NODOWN
;
11457 else if (f
== "noin")
11458 flags
|= CEPH_OSD_NOIN
;
11459 else if (f
== "noout")
11460 flags
|= CEPH_OSD_NOOUT
;
11462 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11463 << "{noup,nodown,noin,noout}";
11469 cmd_getval(cmdmap
, "ids", who
);
11470 if (prefix
.find("noup") != string::npos
)
11471 flags
= CEPH_OSD_NOUP
;
11472 else if (prefix
.find("nodown") != string::npos
)
11473 flags
= CEPH_OSD_NODOWN
;
11474 else if (prefix
.find("noin") != string::npos
)
11475 flags
= CEPH_OSD_NOIN
;
11476 else if (prefix
.find("noout") != string::npos
)
11477 flags
= CEPH_OSD_NOOUT
;
11479 ceph_assert(0 == "Unreachable!");
11482 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11487 ss
<< "must specify at least one or more targets to set/unset";
11492 set
<int> crush_nodes
;
11493 set
<int> device_classes
;
11494 for (auto& w
: who
) {
11495 if (w
== "any" || w
== "all" || w
== "*") {
11496 osdmap
.get_all_osds(osds
);
11499 std::stringstream ts
;
11500 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11502 } else if (osdmap
.crush
->name_exists(w
)) {
11503 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11504 } else if (osdmap
.crush
->class_exists(w
)) {
11505 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11507 ss
<< "unable to parse osd id or crush node or device class: "
11508 << "\"" << w
<< "\". ";
11511 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11512 // ss has reason for failure
11517 for (auto osd
: osds
) {
11518 if (!osdmap
.exists(osd
)) {
11519 ss
<< "osd." << osd
<< " does not exist. ";
11523 if (flags
& CEPH_OSD_NOUP
) {
11524 any
|= osdmap
.is_noup_by_osd(osd
) ?
11525 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
11526 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
11528 if (flags
& CEPH_OSD_NODOWN
) {
11529 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11530 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
11531 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
11533 if (flags
& CEPH_OSD_NOIN
) {
11534 any
|= osdmap
.is_noin_by_osd(osd
) ?
11535 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
11536 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
11538 if (flags
& CEPH_OSD_NOOUT
) {
11539 any
|= osdmap
.is_noout_by_osd(osd
) ?
11540 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
11541 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
11544 if (flags
& CEPH_OSD_NOUP
) {
11545 any
|= osdmap
.is_noup_by_osd(osd
) ?
11546 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
11547 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
11549 if (flags
& CEPH_OSD_NODOWN
) {
11550 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11551 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
11552 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
11554 if (flags
& CEPH_OSD_NOIN
) {
11555 any
|= osdmap
.is_noin_by_osd(osd
) ?
11556 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
11557 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
11559 if (flags
& CEPH_OSD_NOOUT
) {
11560 any
|= osdmap
.is_noout_by_osd(osd
) ?
11561 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
11562 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
11566 for (auto& id
: crush_nodes
) {
11567 auto old_flags
= osdmap
.get_crush_node_flags(id
);
11568 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
11569 pending_flags
|= old_flags
; // adopt existing flags first!
11571 pending_flags
|= flags
;
11573 pending_flags
&= ~flags
;
11577 for (auto& id
: device_classes
) {
11578 auto old_flags
= osdmap
.get_device_class_flags(id
);
11579 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
11580 pending_flags
|= old_flags
;
11582 pending_flags
|= flags
;
11584 pending_flags
&= ~flags
;
11590 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11591 get_last_committed() + 1));
11594 } else if (prefix
== "osd pg-temp") {
11596 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11597 ss
<< "unable to parse 'pgid' value '"
11598 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11603 if (!pgid
.parse(pgidstr
.c_str())) {
11604 ss
<< "invalid pgid '" << pgidstr
<< "'";
11608 if (!osdmap
.pg_exists(pgid
)) {
11609 ss
<< "pg " << pgid
<< " does not exist";
11613 if (pending_inc
.new_pg_temp
.count(pgid
)) {
11614 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
11615 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11619 vector
<int64_t> id_vec
;
11620 vector
<int32_t> new_pg_temp
;
11621 cmd_getval(cmdmap
, "id", id_vec
);
11622 if (id_vec
.empty()) {
11623 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
11624 ss
<< "done cleaning up pg_temp of " << pgid
;
11627 for (auto osd
: id_vec
) {
11628 if (!osdmap
.exists(osd
)) {
11629 ss
<< "osd." << osd
<< " does not exist";
11633 new_pg_temp
.push_back(osd
);
11636 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11637 if ((int)new_pg_temp
.size() < pool_min_size
) {
11638 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
11639 << pool_min_size
<< ")";
11644 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11645 if ((int)new_pg_temp
.size() > pool_size
) {
11646 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
11647 << pool_size
<< ")";
11652 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
11653 new_pg_temp
.begin(), new_pg_temp
.end());
11654 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
11656 } else if (prefix
== "osd primary-temp") {
11658 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11659 ss
<< "unable to parse 'pgid' value '"
11660 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11665 if (!pgid
.parse(pgidstr
.c_str())) {
11666 ss
<< "invalid pgid '" << pgidstr
<< "'";
11670 if (!osdmap
.pg_exists(pgid
)) {
11671 ss
<< "pg " << pgid
<< " does not exist";
11677 if (!cmd_getval(cmdmap
, "id", osd
)) {
11678 ss
<< "unable to parse 'id' value '"
11679 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11683 if (osd
!= -1 && !osdmap
.exists(osd
)) {
11684 ss
<< "osd." << osd
<< " does not exist";
11689 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
11690 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
11691 ss
<< "require_min_compat_client "
11692 << osdmap
.require_min_compat_client
11693 << " < firefly, which is required for primary-temp";
11698 pending_inc
.new_primary_temp
[pgid
] = osd
;
11699 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
11701 } else if (prefix
== "pg repeer") {
11704 cmd_getval(cmdmap
, "pgid", pgidstr
);
11705 if (!pgid
.parse(pgidstr
.c_str())) {
11706 ss
<< "invalid pgid '" << pgidstr
<< "'";
11710 if (!osdmap
.pg_exists(pgid
)) {
11711 ss
<< "pg '" << pgidstr
<< "' does not exist";
11715 vector
<int> acting
;
11717 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
11720 ss
<< "pg currently has no primary";
11723 if (acting
.size() > 1) {
11724 // map to just primary; it will map back to what it wants
11725 pending_inc
.new_pg_temp
[pgid
] = { primary
};
11727 // hmm, pick another arbitrary osd to induce a change. Note
11728 // that this won't work if there is only one suitable OSD in the cluster.
11731 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
11732 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
11735 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
11741 ss
<< "not enough up OSDs in the cluster to force repeer";
11746 } else if (prefix
== "osd pg-upmap" ||
11747 prefix
== "osd rm-pg-upmap" ||
11748 prefix
== "osd pg-upmap-items" ||
11749 prefix
== "osd rm-pg-upmap-items") {
11750 if (osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
11751 ss
<< "min_compat_client "
11752 << osdmap
.require_min_compat_client
11753 << " < luminous, which is required for pg-upmap. "
11754 << "Try 'ceph osd set-require-min-compat-client luminous' "
11755 << "before using the new interface";
11759 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
11760 if (err
== -EAGAIN
)
11765 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11766 ss
<< "unable to parse 'pgid' value '"
11767 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11772 if (!pgid
.parse(pgidstr
.c_str())) {
11773 ss
<< "invalid pgid '" << pgidstr
<< "'";
11777 if (!osdmap
.pg_exists(pgid
)) {
11778 ss
<< "pg " << pgid
<< " does not exist";
11782 if (pending_inc
.old_pools
.count(pgid
.pool())) {
11783 ss
<< "pool of " << pgid
<< " is pending removal";
11786 wait_for_finished_proposal(op
,
11787 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
11795 OP_RM_PG_UPMAP_ITEMS
,
11798 if (prefix
== "osd pg-upmap") {
11799 option
= OP_PG_UPMAP
;
11800 } else if (prefix
== "osd rm-pg-upmap") {
11801 option
= OP_RM_PG_UPMAP
;
11802 } else if (prefix
== "osd pg-upmap-items") {
11803 option
= OP_PG_UPMAP_ITEMS
;
11805 option
= OP_RM_PG_UPMAP_ITEMS
;
11808 // check pending upmap changes
11810 case OP_PG_UPMAP
: // fall through
11811 case OP_RM_PG_UPMAP
:
11812 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
11813 pending_inc
.old_pg_upmap
.count(pgid
)) {
11814 dout(10) << __func__
<< " waiting for pending update on "
11816 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11821 case OP_PG_UPMAP_ITEMS
: // fall through
11822 case OP_RM_PG_UPMAP_ITEMS
:
11823 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
11824 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
11825 dout(10) << __func__
<< " waiting for pending update on "
11827 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11833 ceph_abort_msg("invalid option");
11839 vector
<int64_t> id_vec
;
11840 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
11841 ss
<< "unable to parse 'id' value(s) '"
11842 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11847 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11848 if ((int)id_vec
.size() < pool_min_size
) {
11849 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
11850 << pool_min_size
<< ")";
11855 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11856 if ((int)id_vec
.size() > pool_size
) {
11857 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
11858 << pool_size
<< ")";
11863 vector
<int32_t> new_pg_upmap
;
11864 for (auto osd
: id_vec
) {
11865 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
11866 ss
<< "osd." << osd
<< " does not exist";
11870 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
11871 if (it
!= new_pg_upmap
.end()) {
11872 ss
<< "osd." << osd
<< " already exists, ";
11875 new_pg_upmap
.push_back(osd
);
11878 if (new_pg_upmap
.empty()) {
11879 ss
<< "no valid upmap items(pairs) is specified";
11884 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
11885 new_pg_upmap
.begin(), new_pg_upmap
.end());
11886 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
11890 case OP_RM_PG_UPMAP
:
11892 pending_inc
.old_pg_upmap
.insert(pgid
);
11893 ss
<< "clear " << pgid
<< " pg_upmap mapping";
11897 case OP_PG_UPMAP_ITEMS
:
11899 vector
<int64_t> id_vec
;
11900 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
11901 ss
<< "unable to parse 'id' value(s) '"
11902 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11907 if (id_vec
.size() % 2) {
11908 ss
<< "you must specify pairs of osd ids to be remapped";
11913 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11914 if ((int)(id_vec
.size() / 2) > pool_size
) {
11915 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
11916 << pool_size
<< ")";
11921 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
11922 ostringstream items
;
11924 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
11928 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
11931 if (!osdmap
.exists(from
)) {
11932 ss
<< "osd." << from
<< " does not exist";
11936 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
11937 ss
<< "osd." << to
<< " does not exist";
11941 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
11942 auto it
= std::find(new_pg_upmap_items
.begin(),
11943 new_pg_upmap_items
.end(), entry
);
11944 if (it
!= new_pg_upmap_items
.end()) {
11945 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
11948 new_pg_upmap_items
.push_back(entry
);
11949 items
<< from
<< "->" << to
<< ",";
11951 string
out(items
.str());
11952 out
.resize(out
.size() - 1); // drop last ','
11955 if (new_pg_upmap_items
.empty()) {
11956 ss
<< "no valid upmap items(pairs) is specified";
11961 pending_inc
.new_pg_upmap_items
[pgid
] =
11962 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
11963 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
11964 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
11968 case OP_RM_PG_UPMAP_ITEMS
:
11970 pending_inc
.old_pg_upmap_items
.insert(pgid
);
11971 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
11976 ceph_abort_msg("invalid option");
11980 } else if (prefix
== "osd primary-affinity") {
11982 if (!cmd_getval(cmdmap
, "id", id
)) {
11983 ss
<< "invalid osd id value '"
11984 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11989 if (!cmd_getval(cmdmap
, "weight", w
)) {
11990 ss
<< "unable to parse 'weight' value '"
11991 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11995 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
11997 ss
<< "weight must be >= 0";
12001 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12002 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12003 ss
<< "require_min_compat_client "
12004 << osdmap
.require_min_compat_client
12005 << " < firefly, which is required for primary-affinity";
12009 if (osdmap
.exists(id
)) {
12010 pending_inc
.new_primary_affinity
[id
] = ww
;
12011 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
12013 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12014 get_last_committed() + 1));
12017 ss
<< "osd." << id
<< " does not exist";
12021 } else if (prefix
== "osd reweight") {
12023 if (!cmd_getval(cmdmap
, "id", id
)) {
12024 ss
<< "unable to parse osd id value '"
12025 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12030 if (!cmd_getval(cmdmap
, "weight", w
)) {
12031 ss
<< "unable to parse weight value '"
12032 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12036 long ww
= (int)((double)CEPH_OSD_IN
*w
);
12038 ss
<< "weight must be >= 0";
12042 if (osdmap
.exists(id
)) {
12043 pending_inc
.new_weight
[id
] = ww
;
12044 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
12046 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12047 get_last_committed() + 1));
12050 ss
<< "osd." << id
<< " does not exist";
12054 } else if (prefix
== "osd reweightn") {
12055 map
<int32_t, uint32_t> weights
;
12056 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
12058 ss
<< "unable to parse 'weights' value '"
12059 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
12062 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
12063 wait_for_finished_proposal(
12065 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
12067 } else if (prefix
== "osd lost") {
12069 if (!cmd_getval(cmdmap
, "id", id
)) {
12070 ss
<< "unable to parse osd id value '"
12071 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12076 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12078 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
12079 "--yes-i-really-mean-it if you really do.";
12082 } else if (!osdmap
.exists(id
)) {
12083 ss
<< "osd." << id
<< " does not exist";
12086 } else if (!osdmap
.is_down(id
)) {
12087 ss
<< "osd." << id
<< " is not down";
12091 epoch_t e
= osdmap
.get_info(id
).down_at
;
12092 pending_inc
.new_lost
[id
] = e
;
12093 ss
<< "marked osd lost in epoch " << e
;
12095 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12096 get_last_committed() + 1));
12100 } else if (prefix
== "osd destroy-actual" ||
12101 prefix
== "osd purge-actual" ||
12102 prefix
== "osd purge-new") {
12103 /* Destroying an OSD means that we don't expect to further make use of
12104 * the OSDs data (which may even become unreadable after this operation),
12105 * and that we are okay with scrubbing all its cephx keys and config-key
12106 * data (which may include lockbox keys, thus rendering the osd's data
12109 * The OSD will not be removed. Instead, we will mark it as destroyed,
12110 * such that a subsequent call to `create` will not reuse the osd id.
12111 * This will play into being able to recreate the OSD, at the same
12112 * crush location, with minimal data movement.
12115 // make sure authmon is writeable.
12116 if (!mon
->authmon()->is_writeable()) {
12117 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12118 << "osd destroy" << dendl
;
12119 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12124 if (!cmd_getval(cmdmap
, "id", id
)) {
12125 auto p
= cmdmap
.find("id");
12126 if (p
== cmdmap
.end()) {
12127 ss
<< "no osd id specified";
12129 ss
<< "unable to parse osd id value '"
12130 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12136 bool is_destroy
= (prefix
== "osd destroy-actual");
12138 ceph_assert("osd purge-actual" == prefix
||
12139 "osd purge-new" == prefix
);
12143 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12145 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12146 << "This will mean real, permanent data loss, as well "
12147 << "as deletion of cephx and lockbox keys. "
12148 << "Pass --yes-i-really-mean-it if you really do.";
12151 } else if (!osdmap
.exists(id
)) {
12152 ss
<< "osd." << id
<< " does not exist";
12153 err
= 0; // idempotent
12155 } else if (osdmap
.is_up(id
)) {
12156 ss
<< "osd." << id
<< " is not `down`.";
12159 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12160 ss
<< "destroyed osd." << id
;
12165 if (prefix
== "osd purge-new" &&
12166 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12167 ss
<< "osd." << id
<< " is not new";
12172 bool goto_reply
= false;
12176 err
= prepare_command_osd_destroy(id
, ss
);
12177 // we checked above that it should exist.
12178 ceph_assert(err
!= -ENOENT
);
12180 err
= prepare_command_osd_purge(id
, ss
);
12181 if (err
== -ENOENT
) {
12183 ss
<< "osd." << id
<< " does not exist.";
12189 if (err
< 0 || goto_reply
) {
12194 ss
<< "destroyed osd." << id
;
12196 ss
<< "purged osd." << id
;
12200 wait_for_finished_proposal(op
,
12201 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12202 force_immediate_propose();
12205 } else if (prefix
== "osd new") {
12207 // make sure authmon is writeable.
12208 if (!mon
->authmon()->is_writeable()) {
12209 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12210 << "osd new" << dendl
;
12211 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12215 map
<string
,string
> param_map
;
12217 bufferlist bl
= m
->get_data();
12218 string param_json
= bl
.to_str();
12219 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12221 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12225 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12228 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12241 if (err
== EEXIST
) {
12242 // idempotent operation
12247 wait_for_finished_proposal(op
,
12248 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12249 get_last_committed() + 1));
12250 force_immediate_propose();
12253 } else if (prefix
== "osd create") {
12255 // optional id provided?
12256 int64_t id
= -1, cmd_id
= -1;
12257 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12259 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12263 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12268 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12269 if (!uuid
.parse(uuidstr
.c_str())) {
12270 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12274 // we only care about the id if we also have the uuid, to
12275 // ensure the operation's idempotency.
12279 int32_t new_id
= -1;
12280 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12282 if (err
== -EAGAIN
) {
12283 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12286 // a check has failed; reply to the user.
12289 } else if (err
== EEXIST
) {
12290 // this is an idempotent operation; we can go ahead and reply.
12292 f
->open_object_section("created_osd");
12293 f
->dump_int("osdid", new_id
);
12294 f
->close_section();
12304 string empty_device_class
;
12305 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12308 f
->open_object_section("created_osd");
12309 f
->dump_int("osdid", new_id
);
12310 f
->close_section();
12316 wait_for_finished_proposal(op
,
12317 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12318 get_last_committed() + 1));
12321 } else if (prefix
== "osd blacklist clear") {
12322 pending_inc
.new_blacklist
.clear();
12323 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
12324 osdmap
.get_blacklist(&blacklist
);
12325 for (const auto &entry
: blacklist
) {
12326 pending_inc
.old_blacklist
.push_back(entry
.first
);
12328 ss
<< " removed all blacklist entries";
12330 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12331 get_last_committed() + 1));
12333 } else if (prefix
== "osd blacklist") {
12335 cmd_getval(cmdmap
, "addr", addrstr
);
12336 entity_addr_t addr
;
12337 if (!addr
.parse(addrstr
.c_str(), 0)) {
12338 ss
<< "unable to parse address " << addrstr
;
12343 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12344 // always blacklist type ANY
12345 addr
.set_type(entity_addr_t::TYPE_ANY
);
12347 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12350 string blacklistop
;
12351 cmd_getval(cmdmap
, "blacklistop", blacklistop
);
12352 if (blacklistop
== "add") {
12353 utime_t expires
= ceph_clock_now();
12355 // default one hour
12356 cmd_getval(cmdmap
, "expire", d
,
12357 g_conf()->mon_osd_blacklist_default_expire
);
12360 pending_inc
.new_blacklist
[addr
] = expires
;
12363 // cancel any pending un-blacklisting request too
12364 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
12365 pending_inc
.old_blacklist
.end(), addr
);
12366 if (it
!= pending_inc
.old_blacklist
.end()) {
12367 pending_inc
.old_blacklist
.erase(it
);
12371 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12373 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12374 get_last_committed() + 1));
12376 } else if (blacklistop
== "rm") {
12377 if (osdmap
.is_blacklisted(addr
) ||
12378 pending_inc
.new_blacklist
.count(addr
)) {
12379 if (osdmap
.is_blacklisted(addr
))
12380 pending_inc
.old_blacklist
.push_back(addr
);
12382 pending_inc
.new_blacklist
.erase(addr
);
12383 ss
<< "un-blacklisting " << addr
;
12385 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12386 get_last_committed() + 1));
12389 ss
<< addr
<< " isn't blacklisted";
12394 } else if (prefix
== "osd pool mksnap") {
12396 cmd_getval(cmdmap
, "pool", poolstr
);
12397 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12399 ss
<< "unrecognized pool '" << poolstr
<< "'";
12404 cmd_getval(cmdmap
, "snap", snapname
);
12405 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12406 if (p
->is_unmanaged_snaps_mode()) {
12407 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12410 } else if (p
->snap_exists(snapname
.c_str())) {
12411 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12414 } else if (p
->is_tier()) {
12415 ss
<< "pool " << poolstr
<< " is a cache tier";
12420 if (pending_inc
.new_pools
.count(pool
))
12421 pp
= &pending_inc
.new_pools
[pool
];
12423 pp
= &pending_inc
.new_pools
[pool
];
12426 if (pp
->snap_exists(snapname
.c_str())) {
12427 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12429 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
12430 pp
->set_snap_epoch(pending_inc
.epoch
);
12431 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
12434 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12435 get_last_committed() + 1));
12437 } else if (prefix
== "osd pool rmsnap") {
12439 cmd_getval(cmdmap
, "pool", poolstr
);
12440 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12442 ss
<< "unrecognized pool '" << poolstr
<< "'";
12447 cmd_getval(cmdmap
, "snap", snapname
);
12448 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12449 if (p
->is_unmanaged_snaps_mode()) {
12450 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12453 } else if (!p
->snap_exists(snapname
.c_str())) {
12454 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
12459 if (pending_inc
.new_pools
.count(pool
))
12460 pp
= &pending_inc
.new_pools
[pool
];
12462 pp
= &pending_inc
.new_pools
[pool
];
12465 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
12467 pp
->remove_snap(sn
);
12468 pp
->set_snap_epoch(pending_inc
.epoch
);
12469 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
12471 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
12474 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12475 get_last_committed() + 1));
12477 } else if (prefix
== "osd pool create") {
12478 int64_t pg_num
, pg_num_min
;
12480 cmd_getval(cmdmap
, "pg_num", pg_num
, int64_t(0));
12481 cmd_getval(cmdmap
, "pgp_num", pgp_num
, pg_num
);
12482 cmd_getval(cmdmap
, "pg_num_min", pg_num_min
, int64_t(0));
12484 string pool_type_str
;
12485 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
12486 if (pool_type_str
.empty())
12487 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
12490 cmd_getval(cmdmap
, "pool", poolstr
);
12491 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12492 if (pool_id
>= 0) {
12493 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12494 if (pool_type_str
!= p
->get_type_name()) {
12495 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
12498 ss
<< "pool '" << poolstr
<< "' already exists";
12505 if (pool_type_str
== "replicated") {
12506 pool_type
= pg_pool_t::TYPE_REPLICATED
;
12507 } else if (pool_type_str
== "erasure") {
12508 pool_type
= pg_pool_t::TYPE_ERASURE
;
12510 ss
<< "unknown pool type '" << pool_type_str
<< "'";
12515 bool implicit_rule_creation
= false;
12516 int64_t expected_num_objects
= 0;
12518 cmd_getval(cmdmap
, "rule", rule_name
);
12519 string erasure_code_profile
;
12520 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
12522 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
12523 if (erasure_code_profile
== "")
12524 erasure_code_profile
= "default";
12525 //handle the erasure code profile
12526 if (erasure_code_profile
== "default") {
12527 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
12528 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
12529 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
12533 map
<string
,string
> profile_map
;
12534 err
= osdmap
.get_erasure_code_profile_default(cct
,
12539 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
12540 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
12544 if (rule_name
== "") {
12545 implicit_rule_creation
= true;
12546 if (erasure_code_profile
== "default") {
12547 rule_name
= "erasure-code";
12549 dout(1) << "implicitly use rule named after the pool: "
12550 << poolstr
<< dendl
;
12551 rule_name
= poolstr
;
12554 cmd_getval(cmdmap
, "expected_num_objects",
12555 expected_num_objects
, int64_t(0));
12557 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12558 // and put expected_num_objects to rule field
12559 if (erasure_code_profile
!= "") { // cmd is from CLI
12560 if (rule_name
!= "") {
12562 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
12563 if (interr
.length()) {
12564 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
12569 rule_name
= erasure_code_profile
;
12570 } else { // cmd is well-formed
12571 cmd_getval(cmdmap
, "expected_num_objects",
12572 expected_num_objects
, int64_t(0));
12576 if (!implicit_rule_creation
&& rule_name
!= "") {
12578 err
= get_crush_rule(rule_name
, &rule
, &ss
);
12579 if (err
== -EAGAIN
) {
12580 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12587 if (expected_num_objects
< 0) {
12588 ss
<< "'expected_num_objects' must be non-negative";
12594 osdmap
.get_all_osds(osds
);
12595 bool has_filestore_osd
= std::any_of(osds
.begin(), osds
.end(), [this](int osd
) {
12597 if (!get_osd_objectstore_type(osd
, &type
)) {
12598 return type
== "filestore";
12604 if (has_filestore_osd
&&
12605 expected_num_objects
> 0 &&
12606 cct
->_conf
->filestore_merge_threshold
> 0) {
12607 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12612 if (has_filestore_osd
&&
12613 expected_num_objects
== 0 &&
12614 cct
->_conf
->filestore_merge_threshold
< 0) {
12615 int osds
= osdmap
.get_num_osds();
12617 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12618 if (!sure
&& osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
12619 ss
<< "For better initial performance on pools expected to store a "
12620 << "large number of objects, consider supplying the "
12621 << "expected_num_objects parameter when creating the pool."
12622 << " Pass --yes-i-really-mean-it to ignore it";
12628 int64_t fast_read_param
;
12629 cmd_getval(cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
12630 FastReadType fast_read
= FAST_READ_DEFAULT
;
12631 if (fast_read_param
== 0)
12632 fast_read
= FAST_READ_OFF
;
12633 else if (fast_read_param
> 0)
12634 fast_read
= FAST_READ_ON
;
12636 int64_t repl_size
= 0;
12637 cmd_getval(cmdmap
, "size", repl_size
);
12638 int64_t target_size_bytes
= 0;
12639 double target_size_ratio
= 0.0;
12640 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
12641 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
12643 string pg_autoscale_mode
;
12644 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
12646 err
= prepare_new_pool(poolstr
,
12647 -1, // default crush rule
12649 pg_num
, pgp_num
, pg_num_min
,
12650 repl_size
, target_size_bytes
, target_size_ratio
,
12651 erasure_code_profile
, pool_type
,
12652 (uint64_t)expected_num_objects
,
12659 ss
<< "pool '" << poolstr
<< "' already exists";
12662 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12671 ss
<< "pool '" << poolstr
<< "' created";
12674 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12675 get_last_committed() + 1));
12678 } else if (prefix
== "osd pool delete" ||
12679 prefix
== "osd pool rm") {
12680 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12681 string poolstr
, poolstr2
, sure
;
12682 cmd_getval(cmdmap
, "pool", poolstr
);
12683 cmd_getval(cmdmap
, "pool2", poolstr2
);
12684 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12686 ss
<< "pool '" << poolstr
<< "' does not exist";
12691 bool force_no_fake
= false;
12692 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
12693 bool force
= false;
12694 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
12695 if (poolstr2
!= poolstr
||
12696 (!force
&& !force_no_fake
)) {
12697 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12698 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12699 << "followed by --yes-i-really-really-mean-it.";
12703 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
12704 if (err
== -EAGAIN
) {
12705 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12711 } else if (prefix
== "osd pool rename") {
12712 string srcpoolstr
, destpoolstr
;
12713 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
12714 cmd_getval(cmdmap
, "destpool", destpoolstr
);
12715 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
12716 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
12718 if (pool_src
< 0) {
12719 if (pool_dst
>= 0) {
12720 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12721 // of operations, assume this rename succeeded, as it is not changing
12722 // the current state. Make sure we output something understandable
12723 // for whoever is issuing the command, if they are paying attention,
12724 // in case it was not intentional; or to avoid a "wtf?" and a bug
12725 // report in case it was intentional, while expecting a failure.
12726 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
12727 << destpoolstr
<< "' does -- assuming successful rename";
12730 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
12734 } else if (pool_dst
>= 0) {
12735 // source pool exists and so does the destination pool
12736 ss
<< "pool '" << destpoolstr
<< "' already exists";
12741 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
12743 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
12745 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
12746 << cpp_strerror(ret
);
12749 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
12750 get_last_committed() + 1));
12753 } else if (prefix
== "osd pool set") {
12754 err
= prepare_command_pool_set(cmdmap
, ss
);
12755 if (err
== -EAGAIN
)
12761 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12762 get_last_committed() + 1));
12764 } else if (prefix
== "osd tier add") {
12765 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12766 if (err
== -EAGAIN
)
12771 cmd_getval(cmdmap
, "pool", poolstr
);
12772 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12774 ss
<< "unrecognized pool '" << poolstr
<< "'";
12778 string tierpoolstr
;
12779 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12780 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12781 if (tierpool_id
< 0) {
12782 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12786 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12788 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12791 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
12795 // make sure new tier is empty
12796 string force_nonempty
;
12797 cmd_getval(cmdmap
, "force_nonempty", force_nonempty
);
12798 const pool_stat_t
*pstats
= mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
12799 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
12800 force_nonempty
!= "--force-nonempty") {
12801 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
12805 if (tp
->is_erasure()) {
12806 ss
<< "tier pool '" << tierpoolstr
12807 << "' is an ec pool, which cannot be a tier";
12811 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
12812 ((force_nonempty
!= "--force-nonempty") ||
12813 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
12814 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
12819 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12820 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12821 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
12822 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12825 np
->tiers
.insert(tierpool_id
);
12826 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
12827 ntp
->tier_of
= pool_id
;
12828 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
12829 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12830 get_last_committed() + 1));
12832 } else if (prefix
== "osd tier remove" ||
12833 prefix
== "osd tier rm") {
12835 cmd_getval(cmdmap
, "pool", poolstr
);
12836 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12838 ss
<< "unrecognized pool '" << poolstr
<< "'";
12842 string tierpoolstr
;
12843 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12844 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12845 if (tierpool_id
< 0) {
12846 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12850 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12852 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12855 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
12859 if (p
->tiers
.count(tierpool_id
) == 0) {
12860 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
12864 if (tp
->tier_of
!= pool_id
) {
12865 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
12866 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
12867 // be scary about it; this is an inconsistency and bells must go off
12868 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12872 if (p
->read_tier
== tierpool_id
) {
12873 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
12878 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12879 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12880 if (np
->tiers
.count(tierpool_id
) == 0 ||
12881 ntp
->tier_of
!= pool_id
||
12882 np
->read_tier
== tierpool_id
) {
12883 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12886 np
->tiers
.erase(tierpool_id
);
12888 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
12889 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12890 get_last_committed() + 1));
12892 } else if (prefix
== "osd tier set-overlay") {
12893 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12894 if (err
== -EAGAIN
)
12899 cmd_getval(cmdmap
, "pool", poolstr
);
12900 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12902 ss
<< "unrecognized pool '" << poolstr
<< "'";
12906 string overlaypoolstr
;
12907 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
12908 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
12909 if (overlaypool_id
< 0) {
12910 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
12914 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12916 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
12917 ceph_assert(overlay_p
);
12918 if (p
->tiers
.count(overlaypool_id
) == 0) {
12919 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
12923 if (p
->read_tier
== overlaypool_id
) {
12925 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12928 if (p
->has_read_tier()) {
12929 ss
<< "pool '" << poolstr
<< "' has overlay '"
12930 << osdmap
.get_pool_name(p
->read_tier
)
12931 << "'; please remove-overlay first";
12937 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12938 np
->read_tier
= overlaypool_id
;
12939 np
->write_tier
= overlaypool_id
;
12940 np
->set_last_force_op_resend(pending_inc
.epoch
);
12941 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
12942 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
12943 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12944 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
12945 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
12946 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12947 get_last_committed() + 1));
12949 } else if (prefix
== "osd tier remove-overlay" ||
12950 prefix
== "osd tier rm-overlay") {
12952 cmd_getval(cmdmap
, "pool", poolstr
);
12953 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12955 ss
<< "unrecognized pool '" << poolstr
<< "'";
12959 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12961 if (!p
->has_read_tier()) {
12963 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12967 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
12972 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12973 if (np
->has_read_tier()) {
12974 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
12975 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
12976 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12978 if (np
->has_write_tier()) {
12979 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
12980 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
12981 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12983 np
->clear_read_tier();
12984 np
->clear_write_tier();
12985 np
->set_last_force_op_resend(pending_inc
.epoch
);
12986 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12987 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12988 get_last_committed() + 1));
12990 } else if (prefix
== "osd tier cache-mode") {
12991 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12992 if (err
== -EAGAIN
)
12997 cmd_getval(cmdmap
, "pool", poolstr
);
12998 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13000 ss
<< "unrecognized pool '" << poolstr
<< "'";
13004 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13006 if (!p
->is_tier()) {
13007 ss
<< "pool '" << poolstr
<< "' is not a tier";
13012 cmd_getval(cmdmap
, "mode", modestr
);
13013 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13014 if (int(mode
) < 0) {
13015 ss
<< "'" << modestr
<< "' is not a valid cache mode";
13021 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13023 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
13024 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
13025 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
13029 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13030 mode
!= pg_pool_t::CACHEMODE_NONE
&&
13031 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13032 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
13034 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
13035 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13040 // pool already has this cache-mode set and there are no pending changes
13041 if (p
->cache_mode
== mode
&&
13042 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
13043 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
13044 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
13045 << " to " << pg_pool_t::get_cache_mode_name(mode
);
13050 /* Mode description:
13052 * none: No cache-mode defined
13053 * forward: Forward all reads and writes to base pool [removed]
13054 * writeback: Cache writes, promote reads from base pool
13055 * readonly: Forward writes to base pool
13056 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13057 * proxy: Proxy all reads and writes to base pool
13058 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13060 * Hence, these are the allowed transitions:
13063 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13064 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13065 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13066 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13067 * writeback -> readproxy || proxy
13071 // We check if the transition is valid against the current pool mode, as
13072 // it is the only committed state thus far. We will blantly squash
13073 // whatever mode is on the pending state.
13075 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
13076 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13077 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
13078 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
13079 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
13080 << "' pool; only '"
13081 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
13086 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
13087 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13088 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13089 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13091 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
13092 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13093 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13095 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13096 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13097 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13099 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13100 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13101 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13102 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13104 const pool_stat_t
* pstats
=
13105 mon
->mgrstatmon()->get_pool_stat(pool_id
);
13107 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13108 ss
<< "unable to set cache-mode '"
13109 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13110 << "': dirty objects found";
13116 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13117 np
->cache_mode
= mode
;
13118 // set this both when moving to and from cache_mode NONE. this is to
13119 // capture legacy pools that were set up before this flag existed.
13120 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13121 ss
<< "set cache-mode for pool '" << poolstr
13122 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13123 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13124 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13125 ceph_assert(base_pool
);
13126 if (base_pool
->read_tier
== pool_id
||
13127 base_pool
->write_tier
== pool_id
)
13128 ss
<<" (WARNING: pool is still configured as read or write tier)";
13130 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13131 get_last_committed() + 1));
13133 } else if (prefix
== "osd tier add-cache") {
13134 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13135 if (err
== -EAGAIN
)
13140 cmd_getval(cmdmap
, "pool", poolstr
);
13141 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13143 ss
<< "unrecognized pool '" << poolstr
<< "'";
13147 string tierpoolstr
;
13148 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13149 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13150 if (tierpool_id
< 0) {
13151 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13155 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13157 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13160 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13165 if (!cmd_getval(cmdmap
, "size", size
)) {
13166 ss
<< "unable to parse 'size' value '"
13167 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13171 // make sure new tier is empty
13172 const pool_stat_t
*pstats
=
13173 mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
13174 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13175 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13179 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13180 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13181 if (int(mode
) < 0) {
13182 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13186 HitSet::Params hsp
;
13187 auto& cache_hit_set_type
=
13188 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13189 if (cache_hit_set_type
== "bloom") {
13190 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13191 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13192 hsp
= HitSet::Params(bsp
);
13193 } else if (cache_hit_set_type
== "explicit_hash") {
13194 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13195 } else if (cache_hit_set_type
== "explicit_object") {
13196 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13198 ss
<< "osd tier cache default hit set type '"
13199 << cache_hit_set_type
<< "' is not a known type";
13204 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13205 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13206 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13207 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13210 np
->tiers
.insert(tierpool_id
);
13211 np
->read_tier
= np
->write_tier
= tierpool_id
;
13212 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13213 np
->set_last_force_op_resend(pending_inc
.epoch
);
13214 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13215 ntp
->tier_of
= pool_id
;
13216 ntp
->cache_mode
= mode
;
13217 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13218 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13219 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13220 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13221 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13222 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13223 ntp
->hit_set_params
= hsp
;
13224 ntp
->target_max_bytes
= size
;
13225 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13226 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13227 get_last_committed() + 1));
13229 } else if (prefix
== "osd pool set-quota") {
13231 cmd_getval(cmdmap
, "pool", poolstr
);
13232 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13234 ss
<< "unrecognized pool '" << poolstr
<< "'";
13240 cmd_getval(cmdmap
, "field", field
);
13241 if (field
!= "max_objects" && field
!= "max_bytes") {
13242 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13247 // val could contain unit designations, so we treat as a string
13249 cmd_getval(cmdmap
, "val", val
);
13252 if (field
== "max_objects") {
13253 value
= strict_sistrtoll(val
.c_str(), &tss
);
13254 } else if (field
== "max_bytes") {
13255 value
= strict_iecstrtoll(val
.c_str(), &tss
);
13257 ceph_abort_msg("unrecognized option");
13259 if (!tss
.empty()) {
13260 ss
<< "error parsing value '" << val
<< "': " << tss
;
13265 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13266 if (field
== "max_objects") {
13267 pi
->quota_max_objects
= value
;
13268 } else if (field
== "max_bytes") {
13269 pi
->quota_max_bytes
= value
;
13271 ceph_abort_msg("unrecognized option");
13273 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13275 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13276 get_last_committed() + 1));
13278 } else if (prefix
== "osd pool application enable" ||
13279 prefix
== "osd pool application disable" ||
13280 prefix
== "osd pool application set" ||
13281 prefix
== "osd pool application rm") {
13282 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13283 if (err
== -EAGAIN
) {
13285 } else if (err
< 0) {
13290 } else if (prefix
== "osd force-create-pg") {
13293 cmd_getval(cmdmap
, "pgid", pgidstr
);
13294 if (!pgid
.parse(pgidstr
.c_str())) {
13295 ss
<< "invalid pgid '" << pgidstr
<< "'";
13299 if (!osdmap
.pg_exists(pgid
)) {
13300 ss
<< "pg " << pgid
<< " should not exist";
13305 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13307 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13308 << "that the cluster will give up ever trying to recover the lost data. Do this "
13309 << "only if you are certain that all copies of the PG are in fact lost and you are "
13310 << "willing to accept that the data is permanently destroyed. Pass "
13311 << "--yes-i-really-mean-it to proceed.";
13317 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13318 auto emplaced
= creating_pgs
.pgs
.emplace(
13320 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13321 ceph_clock_now()));
13322 creating_now
= emplaced
.second
;
13324 if (creating_now
) {
13325 ss
<< "pg " << pgidstr
<< " now creating, ok";
13326 // set the pool's CREATING flag so that (1) the osd won't ignore our
13327 // create message and (2) we won't propose any future pg_num changes
13328 // until after the PG has been instantiated.
13329 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13330 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13332 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13336 ss
<< "pg " << pgid
<< " already creating";
13346 if (err
< 0 && rs
.length() == 0)
13347 rs
= cpp_strerror(err
);
13348 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
13353 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13354 get_last_committed() + 1));
13358 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// OSDMonitor::enforce_pool_op_caps
//
// Capability gate for an incoming MPoolOp: unmanaged-snapshot create and
// delete ops are vetted with is_unmanaged_snap_op_permitted() against the
// session's entity name, caps and peer socket address; every other pool
// op only requires the "osd" MON_CAP_W capability.  A failed check is
// answered with -EPERM via _pool_op_reply().
//
// NOTE(review): this chunk is extraction-garbled -- each statement is
// split across physical lines, the file's original line numbers are
// fused into the text, and several lines (braces, returns, some guards)
// are missing, so the control flow shown here is incomplete.
13362 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
// record this event on the op for monitor op tracking
13364 op
->mark_osdmon_event(__func__
);
// the pool-op request and the session it arrived on
13366 auto m
= op
->get_req
<MPoolOp
>();
13367 MonSession
*session
= op
->get_session();
// NOTE(review): the condition guarding this EPERM reply (presumably a
// null/invalid session check) was lost in extraction -- confirm upstream.
13369 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// unmanaged snapshot ops get the stricter, snap-specific permission check
13374 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13375 case POOL_OP_DELETE_UNMANAGED_SNAP
:
// resolve the pool name (when the pool still exists) for the check below
13377 const std::string
* pool_name
= nullptr;
13378 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
13379 if (pg_pool
!= nullptr) {
13380 pool_name
= &osdmap
.get_pool_name(m
->pool
);
13383 if (!is_unmanaged_snap_op_permitted(cct
, mon
->key_server
,
13384 session
->entity_name
, session
->caps
,
13385 session
->get_peer_socket_addr(),
// insufficient privilege: log loudly, then reject with -EPERM
13387 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13388 << "privileges. message: " << *m
<< std::endl
13389 << "caps: " << session
->caps
<< dendl
;
13390 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// every other pool op only needs write caps on the "osd" service
13396 if (!session
->is_capable("osd", MON_CAP_W
)) {
13397 dout(0) << "got pool op from entity with insufficient privileges. "
13398 << "message: " << *m
<< std::endl
13399 << "caps: " << session
->caps
<< dendl
;
13400 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// OSDMonitor::preprocess_pool_op
//
// Read-only (no proposal) pass over an incoming MPoolOp.  Rejects cap
// failures and messages from a different cluster (fsid mismatch), and
// answers requests already satisfied by the committed osdmap -- e.g.
// deleting a pool or snap that is already gone, or creating a snap that
// already exists -- so they never reach the prepare/proposal path.
//
// NOTE(review): extraction-garbled -- the switch header, braces and
// return statements were lost; the case labels below belong to a switch
// on m->op.
13409 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
13411 op
->mark_osdmon_event(__func__
);
13412 auto m
= op
->get_req
<MPoolOp
>();
// caps check; enforce_pool_op_caps() already replied on failure
13414 if (enforce_pool_op_caps(op
)) {
// drop messages that belong to a different cluster
13418 if (m
->fsid
!= mon
->monmap
->fsid
) {
13419 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
13420 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
13421 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// pool creation has a dedicated preprocess path
13425 if (m
->op
== POOL_OP_CREATE
)
13426 return preprocess_pool_op_create(op
);
// all remaining ops target an existing pool
13428 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
13429 if (p
== nullptr) {
13430 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
// DELETE of a nonexistent pool is idempotent (reply 0);
// anything else on a missing pool is ENOENT
13431 if (m
->op
== POOL_OP_DELETE
) {
13432 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13434 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13439 // check if the snap and snapname exist
13440 bool snap_exists
= false;
13441 if (p
->snap_exists(m
->name
.c_str()))
13442 snap_exists
= true;
// --- per-op early answers (cases of a switch on m->op; header lost) ---
13445 case POOL_OP_CREATE_SNAP
:
// pool snaps are invalid on unmanaged-snaps pools and on cache tiers
13446 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
13447 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13451 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13455 case POOL_OP_CREATE_UNMANAGED_SNAP
:
// unmanaged snaps are mutually exclusive with pool-snaps mode
13456 if (p
->is_pool_snaps_mode()) {
13457 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13461 case POOL_OP_DELETE_SNAP
:
13462 if (p
->is_unmanaged_snaps_mode()) {
13463 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// deleting a snap that is already gone is a no-op success
13466 if (!snap_exists
) {
13467 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13471 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13472 if (p
->is_pool_snaps_mode()) {
13473 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// already removed (or purged): answer success without proposing
13476 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
13477 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13481 case POOL_OP_DELETE
:
// NOTE(review): the control flow around this name-lookup reply was
// partially lost in extraction; confirm intended semantics upstream.
13482 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
13483 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13487 case POOL_OP_AUID_CHANGE
:
// OSDMonitor::_is_removed_snap
//
// Predicate: has (pool, snap) already been removed in the committed map?
// A snap counts as removed when the pool itself no longer exists, when
// the snap sits in the osdmap's removed_snaps_queue, or when
// lookup_purged_snap() finds it inside a purged [begin, end) interval.
//
// NOTE(review): extraction-garbled -- the return statements and closing
// braces between the checks were dropped.
13497 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
// pool no longer exists => the snap is certainly gone
13499 if (!osdmap
.have_pg_pool(pool
)) {
13500 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13501 << " - pool dne" << dendl
;
// queued for removal in the committed map
13504 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
13505 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13506 << " - in osdmap removed_snaps_queue" << dendl
;
// finally consult the purged-snaps records
13509 snapid_t begin
, end
;
13510 int r
= lookup_purged_snap(pool
, snap
, &begin
, &end
);
13512 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13513 << " - purged, [" << begin
<< "," << end
<< ")" << dendl
;
// OSDMonitor::_is_pending_removed_snap
//
// Predicate: is removal of (pool, snap) already queued in the pending
// incremental?  True when the whole pool is pending deletion
// (pending_inc.old_pools) or the snap is in pending new_removed_snaps.
//
// NOTE(review): extraction-garbled -- returns/closing braces dropped.
13519 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
// whole pool pending deletion
13521 if (pending_inc
.old_pools
.count(pool
)) {
13522 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13523 << " - pool pending deletion" << dendl
;
// snap removal already queued in this incremental
13526 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
13527 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13528 << " - in pending new_removed_snaps" << dendl
;
// OSDMonitor::preprocess_pool_op_create
//
// Read-only check for pool creation: look the requested pool name up in
// the committed osdmap and, when it already exists, answer success
// immediately (create is idempotent) instead of starting a proposal.
//
// NOTE(review): extraction-garbled -- the guard and return lines around
// the reply were dropped.
13534 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
13536 op
->mark_osdmon_event(__func__
);
13537 auto m
= op
->get_req
<MPoolOp
>();
// does a pool with the requested name already exist?
13538 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
13540 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// OSDMonitor::prepare_pool_op
//
// Mutating (proposal) path for a pool op.  CREATE and DELETE are
// delegated to prepare_pool_op_create()/prepare_pool_op_delete(); the
// remaining snapshot ops are first validated against the committed pool,
// then applied to a projected copy 'pp' of the pool (the pending version
// if one exists, else the committed pg_pool_t), which is finally stored
// into pending_inc.new_pools and acknowledged -- with any reply_data --
// once the proposal commits, via a C_PoolOp completion.
//
// NOTE(review): extraction-garbled -- both switch headers, several
// braces/returns and some guards are missing; the section comments below
// are best-effort readings of the visible fragments.
13547 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
13549 op
->mark_osdmon_event(__func__
);
13550 auto m
= op
->get_req
<MPoolOp
>();
13551 dout(10) << "prepare_pool_op " << *m
<< dendl
;
// pool create/delete have dedicated prepare paths
13552 if (m
->op
== POOL_OP_CREATE
) {
13553 return prepare_pool_op_create(op
);
13554 } else if (m
->op
== POOL_OP_DELETE
) {
13555 return prepare_pool_op_delete(op
);
13559 bool changed
= false;
// the target pool must exist in the committed map
13561 if (!osdmap
.have_pg_pool(m
->pool
)) {
13562 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13566 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
// --- validation against the committed pool (switch on m->op; the
// switch header itself was lost in extraction) ---
13569 case POOL_OP_CREATE_SNAP
:
// pool snaps are not allowed on cache tiers
13570 if (pool
->is_tier()) {
13572 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13574 } // else, fall through
13575 case POOL_OP_DELETE_SNAP
:
// in pool-snaps mode, creating an existing snap or deleting a missing
// one is a no-op answered from here
13576 if (!pool
->is_unmanaged_snaps_mode()) {
13577 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
13578 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
13579 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
13587 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13590 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13591 // we won't allow removal of an unmanaged snapshot from a pool
13592 // not in unmanaged snaps mode.
13593 if (!pool
->is_unmanaged_snaps_mode()) {
13594 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
13598 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13599 // but we will allow creating an unmanaged snapshot on any pool
13600 // as long as it is not in 'pool' snaps mode.
13601 if (pool
->is_pool_snaps_mode()) {
13602 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// work on a projected copy of the pool: pending state if present,
// else the committed pg_pool_t
13607 // projected pool info
13609 if (pending_inc
.new_pools
.count(m
->pool
))
13610 pp
= pending_inc
.new_pools
[m
->pool
];
13612 pp
= *osdmap
.get_pg_pool(m
->pool
);
13614 bufferlist reply_data
;
13616 // pool snaps vs unmanaged snaps are mutually exclusive
13618 case POOL_OP_CREATE_SNAP
:
13619 case POOL_OP_DELETE_SNAP
:
13620 if (pp
.is_unmanaged_snaps_mode()) {
13626 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13627 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13628 if (pp
.is_pool_snaps_mode()) {
// --- apply the op to the projected pool 'pp' ---
13635 case POOL_OP_CREATE_SNAP
:
13636 if (!pp
.snap_exists(m
->name
.c_str())) {
13637 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
13638 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
13639 << " seq " << pp
.get_snap_epoch() << dendl
;
13644 case POOL_OP_DELETE_SNAP
:
// queue the removed snap id into the pending incremental
13646 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
13649 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
13655 case POOL_OP_CREATE_UNMANAGED_SNAP
:
// allocate a new snap id; the flag selects the pre-octopus legacy
// encoding of removed-snap state
13657 uint64_t snapid
= pp
.add_unmanaged_snap(
13658 osdmap
.require_osd_release
< ceph_release_t::octopus
);
// the new snap id is returned to the client in reply_data
13659 encode(snapid
, reply_data
);
13664 case POOL_OP_DELETE_UNMANAGED_SNAP
:
// only act when the snap is not already removed or pending removal
13665 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
13666 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
// a snap id beyond the pool's snap_seq was never allocated
13667 if (m
->snapid
> pp
.get_snap_seq()) {
13668 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13671 pp
.remove_unmanaged_snap(
13673 osdmap
.require_osd_release
< ceph_release_t::octopus
);
13674 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
13675 // also record the new seq as purged: this avoids a discontinuity
13676 // after all of the snaps have been purged, since the seq assigned
13677 // during removal lives in the same namespace as the actual snaps.
13678 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
13683 case POOL_OP_AUID_CHANGE
:
// auid changes are no longer supported
13684 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
// stamp the snap epoch and queue the projected pool into pending_inc
13693 pp
.set_snap_epoch(pending_inc
.epoch
);
13694 pending_inc
.new_pools
[m
->pool
] = pp
;
// reply (carrying reply_data) once the proposal is committed
13698 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
// OSDMonitor::prepare_pool_op_create
//
// Proposal path for pool creation: build the new pool into pending_inc
// via prepare_new_pool(), then reply with its result once the proposal
// commits, through a C_PoolOp completion.
13702 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
13704 op
->mark_osdmon_event(__func__
);
13705 int err
= prepare_new_pool(op
);
13706 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
// OSDMonitor::_check_remove_pool
//
// Validate that a pool may be deleted, writing the human-readable verdict
// into *ss.  Refuses when the pool is in use by CephFS (per the MDS
// monitor's pending FSMap), is a cache tier of another pool, still has
// tiers of its own, when the mon_allow_pool_delete config option is
// unset, or when the pool carries the NODELETE flag.
//
// NOTE(review): extraction-garbled -- the error-code return statements
// between the checks were dropped.
13710 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
13713 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
13715 // If the Pool is in use by CephFS, refuse to delete it
13716 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13717 if (pending_fsmap
.pool_in_use(pool_id
)) {
13718 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
// a tier pool must be detached from its base before deletion
13722 if (pool
.tier_of
>= 0) {
13723 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
13724 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
// and a base pool must shed its own tiers first
13727 if (!pool
.tiers
.empty()) {
13728 *ss
<< "pool '" << poolstr
<< "' has tiers";
13729 for(auto tier
: pool
.tiers
) {
13730 *ss
<< " " << osdmap
.get_pool_name(tier
);
// global safety knob: pool deletion must be explicitly enabled
13735 if (!g_conf()->mon_allow_pool_delete
) {
13736 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
// per-pool safety flag
13740 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
13741 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
// all checks passed
13745 *ss
<< "pool '" << poolstr
<< "' removed";
13750 * Check if it is safe to add a tier to a base pool
13753 * True if the operation should proceed, false if we should abort here
13754 * (abort doesn't necessarily mean error, could be idempotency)
13756 bool OSDMonitor::_check_become_tier(
13757 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
13758 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
13762 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
13763 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
13765 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13766 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
13767 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
13772 if (base_pool
->tiers
.count(tier_pool_id
)) {
13773 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
13775 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
13776 << base_pool_name
<< "'";
13780 if (base_pool
->is_tier()) {
13781 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
13782 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
13783 << "multiple tiers are not yet supported.";
13788 if (tier_pool
->has_tiers()) {
13789 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
13790 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
13791 it
!= tier_pool
->tiers
.end(); ++it
)
13792 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
13793 *ss
<< " multiple tiers are not yet supported.";
13798 if (tier_pool
->is_tier()) {
13799 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
13800 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
13811 * Check if it is safe to remove a tier from this base pool
13814 * True if the operation should proceed, false if we should abort here
13815 * (abort doesn't necessarily mean error, could be idempotency)
13817 bool OSDMonitor::_check_remove_tier(
13818 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
13819 const pg_pool_t
*tier_pool
,
13820 int *err
, ostream
*ss
) const
13822 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
13824 // Apply CephFS-specific checks
13825 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13826 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
13827 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
13828 // If the underlying pool is erasure coded and does not allow EC
13829 // overwrites, we can't permit the removal of the replicated tier that
13830 // CephFS relies on to access it
13831 *ss
<< "pool '" << base_pool_name
<<
13832 "' does not allow EC overwrites and is in use by CephFS"
13838 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
13839 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
13840 "tier is still in use as a writeback cache. Change the cache "
13841 "mode and flush the cache before removing it";
13851 int OSDMonitor::_prepare_remove_pool(
13852 int64_t pool
, ostream
*ss
, bool no_fake
)
13854 dout(10) << __func__
<< " " << pool
<< dendl
;
13855 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
13856 int r
= _check_remove_pool(pool
, *p
, ss
);
13860 auto new_pool
= pending_inc
.new_pools
.find(pool
);
13861 if (new_pool
!= pending_inc
.new_pools
.end()) {
13862 // if there is a problem with the pending info, wait and retry
13864 const auto& p
= new_pool
->second
;
13865 int r
= _check_remove_pool(pool
, p
, ss
);
13870 if (pending_inc
.old_pools
.count(pool
)) {
13871 dout(10) << __func__
<< " " << pool
<< " already pending removal"
13876 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
13877 string old_name
= osdmap
.get_pool_name(pool
);
13878 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
13879 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
13880 << old_name
<< " -> " << new_name
<< dendl
;
13881 pending_inc
.new_pool_names
[pool
] = new_name
;
13886 pending_inc
.old_pools
.insert(pool
);
13888 // remove any pg_temp mappings for this pool
13889 for (auto p
= osdmap
.pg_temp
->begin();
13890 p
!= osdmap
.pg_temp
->end();
13892 if (p
->first
.pool() == pool
) {
13893 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
13894 << p
->first
<< dendl
;
13895 pending_inc
.new_pg_temp
[p
->first
].clear();
13898 // remove any primary_temp mappings for this pool
13899 for (auto p
= osdmap
.primary_temp
->begin();
13900 p
!= osdmap
.primary_temp
->end();
13902 if (p
->first
.pool() == pool
) {
13903 dout(10) << __func__
<< " " << pool
13904 << " removing obsolete primary_temp" << p
->first
<< dendl
;
13905 pending_inc
.new_primary_temp
[p
->first
] = -1;
13908 // remove any pg_upmap mappings for this pool
13909 for (auto& p
: osdmap
.pg_upmap
) {
13910 if (p
.first
.pool() == pool
) {
13911 dout(10) << __func__
<< " " << pool
13912 << " removing obsolete pg_upmap "
13913 << p
.first
<< dendl
;
13914 pending_inc
.old_pg_upmap
.insert(p
.first
);
13917 // remove any pending pg_upmap mappings for this pool
13919 auto it
= pending_inc
.new_pg_upmap
.begin();
13920 while (it
!= pending_inc
.new_pg_upmap
.end()) {
13921 if (it
->first
.pool() == pool
) {
13922 dout(10) << __func__
<< " " << pool
13923 << " removing pending pg_upmap "
13924 << it
->first
<< dendl
;
13925 it
= pending_inc
.new_pg_upmap
.erase(it
);
13931 // remove any pg_upmap_items mappings for this pool
13932 for (auto& p
: osdmap
.pg_upmap_items
) {
13933 if (p
.first
.pool() == pool
) {
13934 dout(10) << __func__
<< " " << pool
13935 << " removing obsolete pg_upmap_items " << p
.first
13937 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
13940 // remove any pending pg_upmap mappings for this pool
13942 auto it
= pending_inc
.new_pg_upmap_items
.begin();
13943 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
13944 if (it
->first
.pool() == pool
) {
13945 dout(10) << __func__
<< " " << pool
13946 << " removing pending pg_upmap_items "
13947 << it
->first
<< dendl
;
13948 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
13955 // remove any choose_args for this pool
13956 CrushWrapper newcrush
;
13957 _get_pending_crush(newcrush
);
13958 if (newcrush
.have_choose_args(pool
)) {
13959 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
13960 newcrush
.rm_choose_args(pool
);
13961 pending_inc
.crush
.clear();
13962 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
13967 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
13969 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
13970 if (pending_inc
.old_pools
.count(pool
)) {
13971 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
13974 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
13975 p
!= pending_inc
.new_pool_names
.end();
13977 if (p
->second
== newname
&& p
->first
!= pool
) {
13982 pending_inc
.new_pool_names
[pool
] = newname
;
13986 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
13988 op
->mark_osdmon_event(__func__
);
13989 auto m
= op
->get_req
<MPoolOp
>();
13991 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
13992 if (ret
== -EAGAIN
) {
13993 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13997 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
13998 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
13999 pending_inc
.epoch
));
14003 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
14004 int ret
, epoch_t epoch
, bufferlist
*blp
)
14006 op
->mark_osdmon_event(__func__
);
14007 auto m
= op
->get_req
<MPoolOp
>();
14008 dout(20) << "_pool_op_reply " << ret
<< dendl
;
14009 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
14010 ret
, epoch
, get_last_committed(), blp
);
14011 mon
->send_reply(op
, reply
);
14014 void OSDMonitor::convert_pool_priorities(void)
14016 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
14017 int64_t max_prio
= 0;
14018 int64_t min_prio
= 0;
14019 for (const auto &i
: osdmap
.get_pools()) {
14020 const auto &pool
= i
.second
;
14022 if (pool
.opts
.is_set(key
)) {
14024 pool
.opts
.get(key
, &prio
);
14025 if (prio
> max_prio
)
14027 if (prio
< min_prio
)
14031 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
14032 dout(20) << __func__
<< " nothing to fix" << dendl
;
14035 // Current pool priorities exceeds new maximum
14036 for (const auto &i
: osdmap
.get_pools()) {
14037 const auto pool_id
= i
.first
;
14038 pg_pool_t pool
= i
.second
;
14041 pool
.opts
.get(key
, &prio
);
14044 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
14045 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14046 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
14047 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
14048 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14049 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
14054 pool
.opts
.unset(key
);
14056 pool
.opts
.set(key
, static_cast<int64_t>(n
));
14058 dout(10) << __func__
<< " pool " << pool_id
14059 << " recovery_priority adjusted "
14060 << prio
<< " to " << n
<< dendl
;
14061 pool
.last_change
= pending_inc
.epoch
;
14062 pending_inc
.new_pools
[pool_id
] = pool
;