1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
70 #include "common/config.h"
71 #include "common/errno.h"
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
90 #include "json_spirit/json_spirit_reader.h"
92 #include <boost/algorithm/string/predicate.hpp>
94 #define dout_subsys ceph_subsys_mon
// MonitorDBStore key prefixes owned by the OSD monitor; they are also
// registered in get_store_prefixes() below.
static const std::string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const std::string OSD_METADATA_PREFIX("osd_metadata");
static const std::string OSD_SNAP_PREFIX("osd_snap");
/*

  OSD snapshot metadata
  ---------------------

  -- starting with mimic, removed in octopus --

  "removed_epoch_%llu_%08lx" % (pool, epoch)
   -> interval_set<snapid_t>

  "removed_snap_%llu_%016llx" % (pool, last_snap)
   -> { first_snap, end_snap, epoch }   (last_snap = end_snap - 1)


  -- starting with mimic --

  "purged_snap_%llu_%016llx" % (pool, last_snap)
   -> { first_snap, end_snap, epoch }   (last_snap = end_snap - 1)

  - note that the {removed,purged}_snap put the last snap in they key so
    that we can use forward iteration only to search for an epoch in an
    interval. e.g., to test if epoch N is removed/purged, we'll find a key
    >= N that either does or doesn't contain the given snap.


  -- starting with octopus --

  "purged_epoch_%08lx" % epoch
  -> map<int64_t,interval_set<snapid_t>>

*/
// Bring common helper names (TOPNSPC expands to the top-level ceph
// namespace) into scope for the rest of this translation unit.
using namespace TOPNSPC::common
;
133 struct OSDMemCache
: public PriorityCache::PriCache
{
135 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
136 int64_t committed_bytes
= 0;
137 double cache_ratio
= 0;
139 OSDMemCache(OSDMonitor
*m
) : osdmon(m
) {};
141 virtual uint64_t _get_used_bytes() const = 0;
143 virtual int64_t request_cache_bytes(
144 PriorityCache::Priority pri
, uint64_t total_cache
) const {
145 int64_t assigned
= get_cache_bytes(pri
);
148 // All cache items are currently set to have PRI1 priority
149 case PriorityCache::Priority::PRI1
:
151 int64_t request
= _get_used_bytes();
152 return (request
> assigned
) ? request
- assigned
: 0;
160 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
161 return cache_bytes
[pri
];
164 virtual int64_t get_cache_bytes() const {
167 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
168 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
169 total
+= get_cache_bytes(pri
);
174 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
175 cache_bytes
[pri
] = bytes
;
177 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
178 cache_bytes
[pri
] += bytes
;
180 virtual int64_t commit_cache_size(uint64_t total_cache
) {
181 committed_bytes
= PriorityCache::get_chunk(
182 get_cache_bytes(), total_cache
);
183 return committed_bytes
;
185 virtual int64_t get_committed_size() const {
186 return committed_bytes
;
188 virtual double get_cache_ratio() const {
191 virtual void set_cache_ratio(double ratio
) {
194 virtual string
get_cache_name() const = 0;
197 struct IncCache
: public OSDMemCache
{
198 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
200 virtual uint64_t _get_used_bytes() const {
201 return osdmon
->inc_osd_cache
.get_bytes();
204 virtual string
get_cache_name() const {
205 return "OSDMap Inc Cache";
208 uint64_t _get_num_osdmaps() const {
209 return osdmon
->inc_osd_cache
.get_size();
213 struct FullCache
: public OSDMemCache
{
214 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
216 virtual uint64_t _get_used_bytes() const {
217 return osdmon
->full_osd_cache
.get_bytes();
220 virtual string
get_cache_name() const {
221 return "OSDMap Full Cache";
224 uint64_t _get_num_osdmaps() const {
225 return osdmon
->full_osd_cache
.get_size();
229 std::shared_ptr
<IncCache
> inc_cache
;
230 std::shared_ptr
<FullCache
> full_cache
;
// Limits on per-pool "application" metadata (counts and string lengths).
// NOTE(review): the enforcing code is outside this chunk — verify there.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
236 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
239 auto& match
= grant
.match
;
240 if (match
.is_match_all()) {
242 } else if (pool_name
!= nullptr &&
243 !match
.pool_namespace
.pool_name
.empty() &&
244 match
.pool_namespace
.pool_name
== *pool_name
) {
251 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
252 const KeyServer
& key_server
,
253 const EntityName
& entity_name
,
254 const MonCap
& mon_caps
,
255 const entity_addr_t
& peer_socket_addr
,
256 const std::string
* pool_name
)
258 typedef std::map
<std::string
, std::string
> CommandArgs
;
260 if (mon_caps
.is_capable(
261 cct
, entity_name
, "osd",
262 "osd pool op unmanaged-snap",
263 (pool_name
== nullptr ?
264 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
265 CommandArgs
{{"poolname", *pool_name
}}),
271 AuthCapsInfo caps_info
;
272 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
274 dout(10) << "unable to locate OSD cap data for " << entity_name
275 << " in auth db" << dendl
;
280 if (caps_info
.caps
.length() > 0) {
281 auto p
= caps_info
.caps
.cbegin();
284 } catch (const buffer::error
&err
) {
285 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
292 if (!osd_cap
.parse(caps_str
, nullptr)) {
293 dout(10) << "unable to parse OSD cap data for " << entity_name
294 << " in auth db" << dendl
;
298 // if the entity has write permissions in one or all pools, permit
299 // usage of unmanaged-snapshots
300 if (osd_cap
.allow_all()) {
304 for (auto& grant
: osd_cap
.grants
) {
305 if (grant
.profile
.is_valid()) {
306 for (auto& profile_grant
: grant
.profile_grants
) {
307 if (is_osd_writable(profile_grant
, pool_name
)) {
311 } else if (is_osd_writable(grant
, pool_name
)) {
319 } // anonymous namespace
321 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
323 if (epoch_by_pg
.size() <= ps
) {
324 epoch_by_pg
.resize(ps
+ 1, 0);
326 const auto old_lec
= epoch_by_pg
[ps
];
327 if (old_lec
>= last_epoch_clean
) {
331 epoch_by_pg
[ps
] = last_epoch_clean
;
332 if (last_epoch_clean
< floor
) {
333 floor
= last_epoch_clean
;
334 } else if (last_epoch_clean
> floor
) {
335 if (old_lec
== floor
) {
336 // probably should increase floor?
337 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
338 std::end(epoch_by_pg
));
342 if (ps
!= next_missing
) {
345 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
346 if (epoch_by_pg
[next_missing
] == 0) {
352 void LastEpochClean::remove_pool(uint64_t pool
)
354 report_by_pool
.erase(pool
);
357 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
359 auto& lec
= report_by_pool
[pg
.pool()];
360 return lec
.report(pg
.ps(), last_epoch_clean
);
363 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
365 auto floor
= latest
.get_epoch();
366 for (auto& pool
: latest
.get_pools()) {
367 auto reported
= report_by_pool
.find(pool
.first
);
368 if (reported
== report_by_pool
.end()) {
371 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
374 if (reported
->second
.floor
< floor
) {
375 floor
= reported
->second
.floor
;
381 void LastEpochClean::dump(Formatter
*f
) const
383 f
->open_array_section("per_pool");
385 for (auto& it
: report_by_pool
) {
386 f
->open_object_section("pool");
387 f
->dump_unsigned("poolid", it
.first
);
388 f
->dump_unsigned("floor", it
.second
.floor
);
395 class C_UpdateCreatingPGs
: public Context
{
400 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
401 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
402 void finish(int r
) override
{
404 utime_t end
= ceph_clock_now();
405 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
406 << (end
- start
) << " seconds" << dendl
;
407 osdmon
->update_creating_pgs();
408 osdmon
->check_pg_creates_subs();
414 #define dout_prefix _prefix(_dout, mon, osdmap)
415 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
416 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
417 << "(" << mon
->get_state_name()
418 << ").osd e" << osdmap
.get_epoch() << " ";
421 OSDMonitor::OSDMonitor(
425 const string
& service_name
)
426 : PaxosService(mn
, p
, service_name
),
428 inc_osd_cache(g_conf()->mon_osd_cache_size
),
429 full_osd_cache(g_conf()->mon_osd_cache_size
),
430 has_osdmap_manifest(false),
431 mapper(mn
->cct
, &mn
->cpu_tp
)
433 inc_cache
= std::make_shared
<IncCache
>(this);
434 full_cache
= std::make_shared
<FullCache
>(this);
435 cct
->_conf
.add_observer(this);
436 int r
= _set_cache_sizes();
438 derr
<< __func__
<< " using default osd cache size - mon_osd_cache_size ("
439 << g_conf()->mon_osd_cache_size
440 << ") without priority cache management"
445 const char **OSDMonitor::get_tracked_conf_keys() const
447 static const char* KEYS
[] = {
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
456 void OSDMonitor::handle_conf_change(const ConfigProxy
& conf
,
457 const std::set
<std::string
> &changed
)
459 dout(10) << __func__
<< " " << changed
<< dendl
;
461 if (changed
.count("mon_memory_autotune")) {
462 _set_cache_autotuning();
464 if (changed
.count("mon_memory_target") ||
465 changed
.count("rocksdb_cache_size")) {
466 int r
= _update_mon_cache_settings();
468 derr
<< __func__
<< " mon_memory_target:"
469 << g_conf()->mon_memory_target
470 << " rocksdb_cache_size:"
471 << g_conf()->rocksdb_cache_size
472 << ". Unable to update cache size."
478 void OSDMonitor::_set_cache_autotuning()
480 if (!g_conf()->mon_memory_autotune
&& pcm
!= nullptr) {
481 // Disable cache autotuning
482 std::lock_guard
l(balancer_lock
);
486 if (g_conf()->mon_memory_autotune
&& pcm
== nullptr) {
487 int r
= register_cache_with_pcm();
490 << " Error while registering osdmon caches with pcm."
491 << " Cache auto tuning not enabled."
493 mon_memory_autotune
= false;
495 mon_memory_autotune
= true;
500 int OSDMonitor::_update_mon_cache_settings()
502 if (g_conf()->mon_memory_target
<= 0 ||
503 g_conf()->mon_memory_target
< mon_memory_min
||
504 g_conf()->rocksdb_cache_size
<= 0) {
508 if (pcm
== nullptr && rocksdb_binned_kv_cache
== nullptr) {
509 derr
<< __func__
<< " not using pcm and rocksdb" << dendl
;
513 uint64_t old_mon_memory_target
= mon_memory_target
;
514 uint64_t old_rocksdb_cache_size
= rocksdb_cache_size
;
516 // Set the new pcm memory cache sizes
517 mon_memory_target
= g_conf()->mon_memory_target
;
518 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
520 uint64_t base
= mon_memory_base
;
521 double fragmentation
= mon_memory_fragmentation
;
522 uint64_t target
= mon_memory_target
;
523 uint64_t min
= mon_memory_min
;
526 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
527 if (ltarget
> base
+ min
) {
528 max
= ltarget
- base
;
531 int r
= _set_cache_ratios();
533 derr
<< __func__
<< " Cache ratios for pcm could not be set."
534 << " Review the kv (rocksdb) and mon_memory_target sizes."
536 mon_memory_target
= old_mon_memory_target
;
537 rocksdb_cache_size
= old_rocksdb_cache_size
;
541 if (mon_memory_autotune
&& pcm
!= nullptr) {
542 std::lock_guard
l(balancer_lock
);
543 // set pcm cache levels
544 pcm
->set_target_memory(target
);
545 pcm
->set_min_memory(min
);
546 pcm
->set_max_memory(max
);
547 // tune memory based on new values
550 _set_new_cache_sizes();
551 dout(1) << __func__
<< " Updated mon cache setting."
552 << " target: " << target
560 int OSDMonitor::_set_cache_sizes()
562 if (g_conf()->mon_memory_autotune
) {
563 // set the new osdmon cache targets to be managed by pcm
564 mon_osd_cache_size
= g_conf()->mon_osd_cache_size
;
565 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
566 mon_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
567 mon_memory_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
568 mon_memory_target
= g_conf()->mon_memory_target
;
569 mon_memory_min
= g_conf()->mon_osd_cache_size_min
;
570 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
571 derr
<< __func__
<< " mon_memory_target:" << mon_memory_target
572 << " mon_memory_min:" << mon_memory_min
573 << ". Invalid size option(s) provided."
577 // Set the initial inc and full LRU cache sizes
578 inc_osd_cache
.set_bytes(mon_memory_min
);
579 full_osd_cache
.set_bytes(mon_memory_min
);
580 mon_memory_autotune
= g_conf()->mon_memory_autotune
;
585 bool OSDMonitor::_have_pending_crush()
587 return pending_inc
.crush
.length() > 0;
590 CrushWrapper
&OSDMonitor::_get_stable_crush()
592 return *osdmap
.crush
;
595 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
598 if (pending_inc
.crush
.length())
599 bl
= pending_inc
.crush
;
601 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
603 auto p
= bl
.cbegin();
607 void OSDMonitor::create_initial()
609 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
614 mon
->store
->get("mkfs", "osdmap", bl
);
618 newmap
.set_fsid(mon
->monmap
->fsid
);
620 newmap
.build_simple(cct
, 0, mon
->monmap
->fsid
, 0);
623 newmap
.created
= newmap
.modified
= ceph_clock_now();
625 // new clusters should sort bitwise by default.
626 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
629 CEPH_OSDMAP_RECOVERY_DELETES
|
630 CEPH_OSDMAP_PURGED_SNAPDIRS
|
631 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
632 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
633 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
634 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
635 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
636 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
637 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
639 // new cluster should require latest by default
640 if (g_conf().get_val
<bool>("mon_debug_no_require_octopus")) {
641 if (g_conf().get_val
<bool>("mon_debug_no_require_nautilus")) {
642 derr
<< __func__
<< " mon_debug_no_require_octopus and nautilus=true" << dendl
;
643 newmap
.require_osd_release
= ceph_release_t::mimic
;
645 derr
<< __func__
<< " mon_debug_no_require_octopus=true" << dendl
;
646 newmap
.require_osd_release
= ceph_release_t::nautilus
;
649 newmap
.require_osd_release
= ceph_release_t::octopus
;
650 ceph_release_t r
= ceph_release_from_name(
651 g_conf()->mon_osd_initial_require_min_compat_client
);
653 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
655 newmap
.require_min_compat_client
= r
;
658 // encode into pending incremental
659 uint64_t features
= newmap
.get_encoding_features();
660 newmap
.encode(pending_inc
.fullmap
,
661 features
| CEPH_FEATURE_RESERVED
);
662 pending_inc
.full_crc
= newmap
.get_crc();
663 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
666 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
668 s
.insert(service_name
);
669 s
.insert(OSD_PG_CREATING_PREFIX
);
670 s
.insert(OSD_METADATA_PREFIX
);
671 s
.insert(OSD_SNAP_PREFIX
);
674 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
676 // we really don't care if the version has been updated, because we may
677 // have trimmed without having increased the last committed; yet, we may
678 // need to update the in-memory manifest.
679 load_osdmap_manifest();
681 version_t version
= get_last_committed();
682 if (version
== osdmap
.epoch
)
684 ceph_assert(version
> osdmap
.epoch
);
686 dout(15) << "update_from_paxos paxos e " << version
687 << ", my e " << osdmap
.epoch
<< dendl
;
690 if (!mapping_job
->is_done()) {
691 dout(1) << __func__
<< " mapping job "
692 << mapping_job
.get() << " did not complete, "
693 << mapping_job
->shards
<< " left, canceling" << dendl
;
694 mapping_job
->abort();
702 * We will possibly have a stashed latest that *we* wrote, and we will
703 * always be sure to have the oldest full map in the first..last range
704 * due to encode_trim_extra(), which includes the oldest full map in the trim
707 * encode_trim_extra() does not however write the full map's
708 * version to 'full_latest'. This is only done when we are building the
709 * full maps from the incremental versions. But don't panic! We make sure
710 * that the following conditions find whichever full map version is newer.
712 version_t latest_full
= get_version_latest_full();
713 if (latest_full
== 0 && get_first_committed() > 1)
714 latest_full
= get_first_committed();
716 if (get_first_committed() > 1 &&
717 latest_full
< get_first_committed()) {
718 // the monitor could be just sync'ed with its peer, and the latest_full key
719 // is not encoded in the paxos commits in encode_pending(), so we need to
720 // make sure we get it pointing to a proper version.
721 version_t lc
= get_last_committed();
722 version_t fc
= get_first_committed();
724 dout(10) << __func__
<< " looking for valid full map in interval"
725 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
728 for (version_t v
= lc
; v
>= fc
; v
--) {
729 string full_key
= "full_" + stringify(v
);
730 if (mon
->store
->exists(get_service_name(), full_key
)) {
731 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
737 ceph_assert(latest_full
> 0);
738 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
739 put_version_latest_full(t
, latest_full
);
740 mon
->store
->apply_transaction(t
);
741 dout(10) << __func__
<< " updated the on-disk full map version to "
742 << latest_full
<< dendl
;
745 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
746 bufferlist latest_bl
;
747 get_version_full(latest_full
, latest_bl
);
748 ceph_assert(latest_bl
.length() != 0);
749 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
751 osdmap
.decode(latest_bl
);
755 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
756 auto p
= bl
.cbegin();
757 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
758 creating_pgs
.decode(p
);
759 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
760 << creating_pgs
.last_scan_epoch
761 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
763 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
767 // walk through incrementals
768 MonitorDBStore::TransactionRef t
;
770 while (version
> osdmap
.epoch
) {
772 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
773 ceph_assert(err
== 0);
774 ceph_assert(inc_bl
.length());
775 // set priority cache manager levels if the osdmap is
776 // being populated for the first time.
777 if (mon_memory_autotune
&& pcm
== nullptr) {
778 int r
= register_cache_with_pcm();
781 << " Error while registering osdmon caches with pcm."
782 << " Proceeding without cache auto tuning."
787 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
789 OSDMap::Incremental
inc(inc_bl
);
790 err
= osdmap
.apply_incremental(inc
);
791 ceph_assert(err
== 0);
794 t
.reset(new MonitorDBStore::Transaction
);
796 // Write out the full map for all past epochs. Encode the full
797 // map with the same features as the incremental. If we don't
798 // know, use the quorum features. If we don't know those either,
799 // encode with all features.
800 uint64_t f
= inc
.encode_features
;
802 f
= mon
->get_quorum_con_features();
806 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
807 tx_size
+= full_bl
.length();
809 bufferlist orig_full_bl
;
810 get_version_full(osdmap
.epoch
, orig_full_bl
);
811 if (orig_full_bl
.length()) {
812 // the primary provided the full map
813 ceph_assert(inc
.have_crc
);
814 if (inc
.full_crc
!= osdmap
.crc
) {
815 // This will happen if the mons were running mixed versions in
816 // the past or some other circumstance made the full encoded
817 // maps divergent. Reloading here will bring us back into
818 // sync with the primary for this and all future maps. OSDs
819 // will also be brought back into sync when they discover the
820 // crc mismatch and request a full map from a mon.
821 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
824 dout(20) << __func__
<< " my (bad) full osdmap:\n";
825 JSONFormatter
jf(true);
826 jf
.dump_object("osdmap", osdmap
);
828 *_dout
<< "\nhexdump:\n";
829 full_bl
.hexdump(*_dout
);
833 osdmap
.decode(orig_full_bl
);
835 dout(20) << __func__
<< " canonical full osdmap:\n";
836 JSONFormatter
jf(true);
837 jf
.dump_object("osdmap", osdmap
);
839 *_dout
<< "\nhexdump:\n";
840 orig_full_bl
.hexdump(*_dout
);
844 ceph_assert(!inc
.have_crc
);
845 put_version_full(t
, osdmap
.epoch
, full_bl
);
847 put_version_latest_full(t
, osdmap
.epoch
);
850 dout(1) << osdmap
<< dendl
;
852 if (osdmap
.epoch
== 1) {
853 t
->erase("mkfs", "osdmap");
856 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
857 mon
->store
->apply_transaction(t
);
858 t
= MonitorDBStore::TransactionRef();
861 for (const auto &osd_state
: inc
.new_state
) {
862 if (osd_state
.second
& CEPH_OSD_UP
) {
863 // could be marked up *or* down, but we're too lazy to check which
864 last_osd_report
.erase(osd_state
.first
);
866 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
867 // could be created *or* destroyed, but we can safely drop it
868 osd_epochs
.erase(osd_state
.first
);
874 mon
->store
->apply_transaction(t
);
877 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
878 if (osdmap
.is_out(o
))
880 auto found
= down_pending_out
.find(o
);
881 if (osdmap
.is_down(o
)) {
882 // populate down -> out map
883 if (found
== down_pending_out
.end()) {
884 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
885 down_pending_out
[o
] = ceph_clock_now();
888 if (found
!= down_pending_out
.end()) {
889 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
890 down_pending_out
.erase(found
);
894 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
897 check_pg_creates_subs();
899 share_map_with_random_osd();
903 // make sure our feature bits reflect the latest map
904 update_msgr_features();
906 if (!mon
->is_leader()) {
907 // will be called by on_active() on the leader, avoid doing so twice
912 int OSDMonitor::register_cache_with_pcm()
914 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
915 derr
<< __func__
<< " Invalid memory size specified for mon caches."
916 << " Caches will not be auto-tuned."
920 uint64_t base
= mon_memory_base
;
921 double fragmentation
= mon_memory_fragmentation
;
922 // For calculating total target memory, consider rocksdb cache size.
923 uint64_t target
= mon_memory_target
;
924 uint64_t min
= mon_memory_min
;
927 // Apply the same logic as in bluestore to set the max amount
928 // of memory to use for cache. Assume base memory for OSDMaps
929 // and then add in some overhead for fragmentation.
930 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
931 if (ltarget
> base
+ min
) {
932 max
= ltarget
- base
;
935 rocksdb_binned_kv_cache
= mon
->store
->get_priority_cache();
936 if (!rocksdb_binned_kv_cache
) {
937 derr
<< __func__
<< " not using rocksdb" << dendl
;
941 int r
= _set_cache_ratios();
943 derr
<< __func__
<< " Cache ratios for pcm could not be set."
944 << " Review the kv (rocksdb) and mon_memory_target sizes."
949 pcm
= std::make_shared
<PriorityCache::Manager
>(
950 cct
, min
, max
, target
, true);
951 pcm
->insert("kv", rocksdb_binned_kv_cache
, true);
952 pcm
->insert("inc", inc_cache
, true);
953 pcm
->insert("full", full_cache
, true);
954 dout(1) << __func__
<< " pcm target: " << target
955 << " pcm max: " << max
956 << " pcm min: " << min
957 << " inc_osd_cache size: " << inc_osd_cache
.get_size()
962 int OSDMonitor::_set_cache_ratios()
964 double old_cache_kv_ratio
= cache_kv_ratio
;
966 // Set the cache ratios for kv(rocksdb), inc and full caches
967 cache_kv_ratio
= (double)rocksdb_cache_size
/ (double)mon_memory_target
;
968 if (cache_kv_ratio
>= 1.0) {
969 derr
<< __func__
<< " Cache kv ratio (" << cache_kv_ratio
970 << ") must be in range [0,<1.0]."
972 cache_kv_ratio
= old_cache_kv_ratio
;
975 rocksdb_binned_kv_cache
->set_cache_ratio(cache_kv_ratio
);
976 cache_inc_ratio
= cache_full_ratio
= (1.0 - cache_kv_ratio
) / 2;
977 inc_cache
->set_cache_ratio(cache_inc_ratio
);
978 full_cache
->set_cache_ratio(cache_full_ratio
);
980 dout(1) << __func__
<< " kv ratio " << cache_kv_ratio
981 << " inc ratio " << cache_inc_ratio
982 << " full ratio " << cache_full_ratio
987 void OSDMonitor::start_mapping()
989 // initiate mapping job
991 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
993 mapping_job
->abort();
995 if (!osdmap
.get_pools().empty()) {
996 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
997 mapping_job
= mapping
.start_update(osdmap
, mapper
,
998 g_conf()->mon_osd_mapping_pgs_per_chunk
);
999 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
1000 << " at " << fin
->start
<< dendl
;
1001 mapping_job
->set_finish_event(fin
);
1003 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
1004 mapping_job
= nullptr;
1008 void OSDMonitor::update_msgr_features()
1011 types
.insert((int)entity_name_t::TYPE_OSD
);
1012 types
.insert((int)entity_name_t::TYPE_CLIENT
);
1013 types
.insert((int)entity_name_t::TYPE_MDS
);
1014 types
.insert((int)entity_name_t::TYPE_MON
);
1015 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
1017 uint64_t features
= osdmap
.get_features(*q
, &mask
);
1018 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
1019 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1020 ceph::net::Policy p
= mon
->messenger
->get_policy(*q
);
1021 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1022 mon
->messenger
->set_policy(*q
, p
);
1027 void OSDMonitor::on_active()
1031 if (mon
->is_leader()) {
1032 mon
->clog
->debug() << "osdmap " << osdmap
;
1033 if (!priority_convert
) {
1034 // Only do this once at start-up
1035 convert_pool_priorities();
1036 priority_convert
= true;
1039 list
<MonOpRequestRef
> ls
;
1040 take_all_failures(ls
);
1041 while (!ls
.empty()) {
1042 MonOpRequestRef op
= ls
.front();
1043 op
->mark_osdmon_event(__func__
);
1051 void OSDMonitor::on_restart()
1053 last_osd_report
.clear();
1056 void OSDMonitor::on_shutdown()
1058 dout(10) << __func__
<< dendl
;
1060 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1062 mapping_job
->abort();
1065 // discard failure info, waiters
1066 list
<MonOpRequestRef
> ls
;
1067 take_all_failures(ls
);
1071 void OSDMonitor::update_logger()
1073 dout(10) << "update_logger" << dendl
;
1075 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1076 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1077 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1078 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
1081 void OSDMonitor::create_pending()
1083 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
1084 pending_inc
.fsid
= mon
->monmap
->fsid
;
1085 pending_metadata
.clear();
1086 pending_metadata_rm
.clear();
1087 pending_pseudo_purged_snaps
.clear();
1089 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
1091 // safety checks (this shouldn't really happen)
1093 if (osdmap
.backfillfull_ratio
<= 0) {
1094 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
1095 if (pending_inc
.new_backfillfull_ratio
> 1.0)
1096 pending_inc
.new_backfillfull_ratio
/= 100;
1097 dout(1) << __func__
<< " setting backfillfull_ratio = "
1098 << pending_inc
.new_backfillfull_ratio
<< dendl
;
1100 if (osdmap
.full_ratio
<= 0) {
1101 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
1102 if (pending_inc
.new_full_ratio
> 1.0)
1103 pending_inc
.new_full_ratio
/= 100;
1104 dout(1) << __func__
<< " setting full_ratio = "
1105 << pending_inc
.new_full_ratio
<< dendl
;
1107 if (osdmap
.nearfull_ratio
<= 0) {
1108 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
1109 if (pending_inc
.new_nearfull_ratio
> 1.0)
1110 pending_inc
.new_nearfull_ratio
/= 100;
1111 dout(1) << __func__
<< " setting nearfull_ratio = "
1112 << pending_inc
.new_nearfull_ratio
<< dendl
;
1116 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1118 if (osdmap
.crush
->has_legacy_rule_ids()) {
1119 CrushWrapper newcrush
;
1120 _get_pending_crush(newcrush
);
1122 // First, for all pools, work out which rule they really used
1123 // by resolving ruleset to rule.
1124 for (const auto &i
: osdmap
.get_pools()) {
1125 const auto pool_id
= i
.first
;
1126 const auto &pool
= i
.second
;
1127 int new_rule_id
= newcrush
.find_rule(pool
.crush_rule
,
1128 pool
.type
, pool
.size
);
1130 dout(1) << __func__
<< " rewriting pool "
1131 << osdmap
.get_pool_name(pool_id
) << " crush ruleset "
1132 << pool
.crush_rule
<< " -> rule id " << new_rule_id
<< dendl
;
1133 if (pending_inc
.new_pools
.count(pool_id
) == 0) {
1134 pending_inc
.new_pools
[pool_id
] = pool
;
1136 pending_inc
.new_pools
[pool_id
].crush_rule
= new_rule_id
;
1139 // Now, go ahead and renumber all the rules so that their
1140 // rule_id field corresponds to their position in the array
1141 auto old_to_new
= newcrush
.renumber_rules();
1142 dout(1) << __func__
<< " Rewrote " << old_to_new
<< " crush IDs:" << dendl
;
1143 for (const auto &i
: old_to_new
) {
1144 dout(1) << __func__
<< " " << i
.first
<< " -> " << i
.second
<< dendl
;
1146 pending_inc
.crush
.clear();
1147 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
1152 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1153 const OSDMap
& nextmap
)
1155 dout(10) << __func__
<< dendl
;
1156 creating_pgs_t pending_creatings
;
1158 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1159 pending_creatings
= creating_pgs
;
1161 // check for new or old pools
1162 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1163 unsigned queued
= 0;
1164 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1167 &pending_creatings
);
1168 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1171 &pending_creatings
);
1172 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1173 for (auto deleted_pool
: inc
.old_pools
) {
1174 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1175 dout(10) << __func__
<< " " << removed
1176 << " pg removed because containing pool deleted: "
1177 << deleted_pool
<< dendl
;
1178 last_epoch_clean
.remove_pool(deleted_pool
);
1180 // pgmon updates its creating_pgs in check_osd_map() which is called by
1181 // on_active() and check_osd_map() could be delayed if lease expires, so its
1182 // creating_pgs could be stale in comparison with the one of osdmon. let's
1183 // trim them here. otherwise, they will be added back after being erased.
1184 unsigned removed
= 0;
1185 for (auto& pg
: pending_created_pgs
) {
1186 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1187 pending_creatings
.created_pools
.insert(pg
.pool());
1188 removed
+= pending_creatings
.pgs
.erase(pg
);
1190 pending_created_pgs
.clear();
1191 dout(10) << __func__
<< " " << removed
1192 << " pgs removed because they're created" << dendl
;
1193 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1196 // filter out any pgs that shouldn't exist.
1198 auto i
= pending_creatings
.pgs
.begin();
1199 while (i
!= pending_creatings
.pgs
.end()) {
1200 if (!nextmap
.pg_exists(i
->first
)) {
1201 dout(10) << __func__
<< " removing pg " << i
->first
1202 << " which should not exist" << dendl
;
1203 i
= pending_creatings
.pgs
.erase(i
);
1211 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1212 const auto total
= pending_creatings
.pgs
.size();
1213 while (pending_creatings
.pgs
.size() < max
&&
1214 !pending_creatings
.queue
.empty()) {
1215 auto p
= pending_creatings
.queue
.begin();
1216 int64_t poolid
= p
->first
;
1217 dout(10) << __func__
<< " pool " << poolid
1218 << " created " << p
->second
.created
1219 << " modified " << p
->second
.modified
1220 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1222 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1223 p
->second
.end
- p
->second
.start
);
1224 ps_t first
= p
->second
.start
;
1225 ps_t end
= first
+ n
;
1226 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1227 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1228 // NOTE: use the *current* epoch as the PG creation epoch so that the
1229 // OSD does not have to generate a long set of PastIntervals.
1230 pending_creatings
.pgs
.emplace(
1232 creating_pgs_t::pg_create_info(inc
.epoch
,
1233 p
->second
.modified
));
1234 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1236 p
->second
.start
= end
;
1237 if (p
->second
.done()) {
1238 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1239 pending_creatings
.queue
.erase(p
);
1241 dout(10) << __func__
<< " pool " << poolid
1242 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1246 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1247 << " pools" << dendl
;
1249 if (mon
->monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1250 // walk creating pgs' history and past_intervals forward
1251 for (auto& i
: pending_creatings
.pgs
) {
1252 // this mirrors PG::start_peering_interval()
1253 pg_t pgid
= i
.first
;
1255 // this is a bit imprecise, but sufficient?
1256 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1257 const pg_pool_t
*pi
;
1258 bool operator()(const set
<pg_shard_t
> &have
) const {
1259 return have
.size() >= pi
->min_size
;
1261 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1262 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1264 vector
<int> up
, acting
;
1265 int up_primary
, acting_primary
;
1266 nextmap
.pg_to_up_acting_osds(
1267 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1268 if (i
.second
.history
.epoch_created
== 0) {
1269 // new pg entry, set it up
1271 i
.second
.acting
= acting
;
1272 i
.second
.up_primary
= up_primary
;
1273 i
.second
.acting_primary
= acting_primary
;
1274 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1275 i
.second
.create_stamp
);
1276 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1277 << " up " << i
.second
.up
1278 << " p " << i
.second
.up_primary
1279 << " acting " << i
.second
.acting
1280 << " p " << i
.second
.acting_primary
1281 << " history " << i
.second
.history
1282 << " past_intervals " << i
.second
.past_intervals
1285 std::stringstream debug
;
1286 if (PastIntervals::check_new_interval(
1287 i
.second
.acting_primary
, acting_primary
,
1288 i
.second
.acting
, acting
,
1289 i
.second
.up_primary
, up_primary
,
1291 i
.second
.history
.same_interval_since
,
1292 i
.second
.history
.last_epoch_clean
,
1297 &i
.second
.past_intervals
,
1299 epoch_t e
= inc
.epoch
;
1300 i
.second
.history
.same_interval_since
= e
;
1301 if (i
.second
.up
!= up
) {
1302 i
.second
.history
.same_up_since
= e
;
1304 if (i
.second
.acting_primary
!= acting_primary
) {
1305 i
.second
.history
.same_primary_since
= e
;
1308 osdmap
.get_pg_num(pgid
.pool()),
1309 nextmap
.get_pg_num(pgid
.pool()),
1311 i
.second
.history
.last_epoch_split
= e
;
1313 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1314 << " up " << i
.second
.up
<< " -> " << up
1315 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1316 << " acting " << i
.second
.acting
<< " -> " << acting
1317 << " p " << i
.second
.acting_primary
<< " -> "
1319 << " history " << i
.second
.history
1320 << " past_intervals " << i
.second
.past_intervals
1322 dout(20) << " debug: " << debug
.str() << dendl
;
1324 i
.second
.acting
= acting
;
1325 i
.second
.up_primary
= up_primary
;
1326 i
.second
.acting_primary
= acting_primary
;
1331 dout(10) << __func__
1332 << " " << (pending_creatings
.pgs
.size() - total
)
1333 << "/" << pending_creatings
.pgs
.size()
1334 << " pgs added from queued pools" << dendl
;
1335 return pending_creatings
;
1338 void OSDMonitor::maybe_prime_pg_temp()
1341 if (pending_inc
.crush
.length()) {
1342 dout(10) << __func__
<< " new crush map, all" << dendl
;
1346 if (!pending_inc
.new_up_client
.empty()) {
1347 dout(10) << __func__
<< " new up osds, all" << dendl
;
1351 // check for interesting OSDs
1353 for (auto p
= pending_inc
.new_state
.begin();
1354 !all
&& p
!= pending_inc
.new_state
.end();
1356 if ((p
->second
& CEPH_OSD_UP
) &&
1357 osdmap
.is_up(p
->first
)) {
1358 osds
.insert(p
->first
);
1361 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
1362 !all
&& p
!= pending_inc
.new_weight
.end();
1364 if (p
->second
< osdmap
.get_weight(p
->first
)) {
1366 osds
.insert(p
->first
);
1368 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1374 if (!all
&& osds
.empty())
1379 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1380 if (estimate
> mapping
.get_num_pgs() *
1381 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1382 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1383 << osds
.size() << " osds >= "
1384 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1385 << mapping
.get_num_pgs() << " pgs, all"
1389 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1390 << osds
.size() << " osds" << dendl
;
1395 next
.deepish_copy_from(osdmap
);
1396 next
.apply_incremental(pending_inc
);
1398 if (next
.get_pools().empty()) {
1399 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1401 PrimeTempJob
job(next
, this);
1402 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1403 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1404 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1406 dout(10) << __func__
<< " did not finish in "
1407 << g_conf()->mon_osd_prime_pg_temp_max_time
1408 << ", stopping" << dendl
;
1412 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1413 utime_t stop
= ceph_clock_now();
1414 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1415 const int chunk
= 1000;
1417 std::unordered_set
<pg_t
> did_pgs
;
1418 for (auto osd
: osds
) {
1419 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1420 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1421 for (auto pgid
: pgs
) {
1422 if (!did_pgs
.insert(pgid
).second
) {
1425 prime_pg_temp(next
, pgid
);
1428 if (ceph_clock_now() > stop
) {
1429 dout(10) << __func__
<< " consumed more than "
1430 << g_conf()->mon_osd_prime_pg_temp_max_time
1431 << " seconds, stopping"
1441 void OSDMonitor::prime_pg_temp(
1445 // TODO: remove this creating_pgs direct access?
1446 if (creating_pgs
.pgs
.count(pgid
)) {
1449 if (!osdmap
.pg_exists(pgid
)) {
1453 vector
<int> up
, acting
;
1454 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1456 vector
<int> next_up
, next_acting
;
1457 int next_up_primary
, next_acting_primary
;
1458 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1459 &next_acting
, &next_acting_primary
);
1460 if (acting
== next_acting
&&
1461 !(up
!= acting
&& next_up
== next_acting
))
1462 return; // no change since last epoch
1465 return; // if previously empty now we can be no worse off
1466 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1467 if (pool
&& acting
.size() < pool
->min_size
)
1468 return; // can be no worse off than before
1470 if (next_up
== next_acting
) {
1472 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1476 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1477 << " -> " << next_up
<< "/" << next_acting
1478 << ", priming " << acting
1481 std::lock_guard
l(prime_pg_temp_lock
);
1482 // do not touch a mapping if a change is pending
1483 pending_inc
.new_pg_temp
.emplace(
1485 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1493 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1495 dout(10) << "encode_pending e " << pending_inc
.epoch
1499 dout(1) << __func__
<< " osdmap full prune encoded e"
1500 << pending_inc
.epoch
<< dendl
;
1503 // finalize up pending_inc
1504 pending_inc
.modified
= ceph_clock_now();
1506 int r
= pending_inc
.propagate_snaps_to_tiers(cct
, osdmap
);
1507 ceph_assert(r
== 0);
1510 if (!mapping_job
->is_done()) {
1511 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1512 << mapping_job
.get() << " did not complete, "
1513 << mapping_job
->shards
<< " left" << dendl
;
1514 mapping_job
->abort();
1515 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1516 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1517 << mapping_job
.get() << " is prior epoch "
1518 << mapping
.get_epoch() << dendl
;
1520 if (g_conf()->mon_osd_prime_pg_temp
) {
1521 maybe_prime_pg_temp();
1524 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1525 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1528 mapping_job
.reset();
1530 // ensure we don't have blank new_state updates. these are interrpeted as
1531 // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p
= pending_inc
.new_state
.begin();
1533 while (p
!= pending_inc
.new_state
.end()) {
1534 if (p
->second
== 0) {
1535 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1536 p
= pending_inc
.new_state
.erase(p
);
1538 if (p
->second
& CEPH_OSD_UP
) {
1539 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1544 if (!pending_inc
.new_up_client
.empty()) {
1545 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1547 for (auto& i
: pending_inc
.new_weight
) {
1548 if (i
.first
>= osdmap
.max_osd
) {
1550 // new osd is already marked in
1551 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1554 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1555 // existing osd marked in or out
1556 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1563 tmp
.deepish_copy_from(osdmap
);
1564 tmp
.apply_incremental(pending_inc
);
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector
<pg_t
> pgs_to_check
;
1575 tmp
.get_upmap_pgs(&pgs_to_check
);
1576 if (pgs_to_check
.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1578 // not enough pgs, do it inline
1579 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1581 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1582 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1590 bufferlist creatings_bl
;
1591 uint64_t features
= CEPH_FEATURES_ALL
;
1592 if (mon
->monmap
->min_mon_release
< ceph_release_t::octopus
) {
1593 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1595 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1597 encode(pending_creatings
, creatings_bl
, features
);
1598 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i
: tmp
.get_pools()) {
1602 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc
.new_pools
.count(i
.first
)) {
1605 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1608 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1609 !pending_creatings
.still_creating_pool(i
.first
)) {
1610 dout(10) << __func__
<< " done creating pool " << i
.first
1611 << ", clearing CREATING flag" << dendl
;
1612 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1613 pending_inc
.new_pools
[i
.first
] = i
.second
;
1615 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set
<int64_t> full_pool_ids
;
1623 set
<int64_t> backfillfull_pool_ids
;
1624 set
<int64_t> nearfull_pool_ids
;
1625 tmp
.get_full_pools(cct
,
1627 &backfillfull_pool_ids
,
1628 &nearfull_pool_ids
);
1629 if (full_pool_ids
.empty() ||
1630 backfillfull_pool_ids
.empty() ||
1631 nearfull_pool_ids
.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
1633 // try cancel any improper nearfull/backfillfull/full pool
1635 for (auto &pool
: tmp
.get_pools()) {
1636 auto p
= pool
.first
;
1637 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1638 nearfull_pool_ids
.empty()) {
1639 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1640 << "'s nearfull flag" << dendl
;
1641 if (pending_inc
.new_pools
.count(p
) == 0) {
1642 // load original pool info first!
1643 pending_inc
.new_pools
[p
] = pool
.second
;
1645 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1647 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1648 backfillfull_pool_ids
.empty()) {
1649 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1650 << "'s backfillfull flag" << dendl
;
1651 if (pending_inc
.new_pools
.count(p
) == 0) {
1652 pending_inc
.new_pools
[p
] = pool
.second
;
1654 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1656 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1657 full_pool_ids
.empty()) {
1658 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1659 // set by EQUOTA, skipping
1662 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1663 << "'s full flag" << dendl
;
1664 if (pending_inc
.new_pools
.count(p
) == 0) {
1665 pending_inc
.new_pools
[p
] = pool
.second
;
1667 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1671 if (!full_pool_ids
.empty()) {
1672 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl
;
1674 for (auto &p
: full_pool_ids
) {
1675 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1678 if (pending_inc
.new_pools
.count(p
) == 0) {
1679 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1681 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1682 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1683 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool
: tmp
.get_pools()) {
1687 auto p
= pool
.first
;
1688 if (full_pool_ids
.count(p
)) {
1689 // skip pools we have just marked as full above
1692 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1693 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1698 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1699 << "'s full flag" << dendl
;
1700 if (pending_inc
.new_pools
.count(p
) == 0) {
1701 pending_inc
.new_pools
[p
] = pool
.second
;
1703 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1706 if (!backfillfull_pool_ids
.empty()) {
1707 for (auto &p
: backfillfull_pool_ids
) {
1708 if (full_pool_ids
.count(p
)) {
1709 // skip pools we have already considered as full above
1712 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1713 // make sure FLAG_FULL is truly set, so we are safe not
1714 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1718 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1719 // don't bother if pool is already marked as backfillfull
1722 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1723 << "'s as backfillfull" << dendl
;
1724 if (pending_inc
.new_pools
.count(p
) == 0) {
1725 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1727 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1728 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool
: tmp
.get_pools()) {
1733 auto p
= pool
.first
;
1734 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1735 // skip pools we have just marked as backfillfull/full above
1738 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1739 // and don't touch if currently is not backfillfull
1742 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1743 << "'s backfillfull flag" << dendl
;
1744 if (pending_inc
.new_pools
.count(p
) == 0) {
1745 pending_inc
.new_pools
[p
] = pool
.second
;
1747 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1750 if (!nearfull_pool_ids
.empty()) {
1751 for (auto &p
: nearfull_pool_ids
) {
1752 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1755 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1756 // make sure FLAG_FULL is truly set, so we are safe not
1757 // to set a extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1761 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1762 // don't bother if pool is already marked as nearfull
1765 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1766 << "'s as nearfull" << dendl
;
1767 if (pending_inc
.new_pools
.count(p
) == 0) {
1768 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1770 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool
: tmp
.get_pools()) {
1775 auto p
= pool
.first
;
1776 if (full_pool_ids
.count(p
) ||
1777 backfillfull_pool_ids
.count(p
) ||
1778 nearfull_pool_ids
.count(p
)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1783 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1784 // and don't touch if currently is not nearfull
1787 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1788 << "'s nearfull flag" << dendl
;
1789 if (pending_inc
.new_pools
.count(p
) == 0) {
1790 pending_inc
.new_pools
[p
] = pool
.second
;
1792 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1796 // min_compat_client?
1797 if (!tmp
.require_min_compat_client
) {
1798 auto mv
= tmp
.get_min_compat_client();
1799 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1800 << "required " << mv
<< dendl
;
1801 mon
->clog
->info() << "setting require_min_compat_client to currently "
1802 << "required " << mv
;
1803 pending_inc
.new_require_min_compat_client
= mv
;
1806 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1807 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1808 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1809 // add creating flags?
1810 for (auto& i
: tmp
.get_pools()) {
1811 if (pending_creatings
.still_creating_pool(i
.first
)) {
1812 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1814 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1815 pending_inc
.new_pools
[i
.first
] = i
.second
;
1817 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i
: tmp
.blacklist
) {
1823 a
.set_type(entity_addr_t::TYPE_ANY
);
1824 pending_inc
.new_blacklist
[a
] = i
.second
;
1825 pending_inc
.old_blacklist
.push_back(i
.first
);
1829 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1830 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1831 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid
, pi
] : tmp
.pools
) {
1835 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1836 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1837 pending_inc
.new_pools
[poolid
] = pi
;
1839 dout(10) << __func__
<< " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl
;
1841 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1843 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1844 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1845 pending_inc
.new_pools
[poolid
] = pi
;
1847 dout(10) << __func__
<< " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl
;
1849 pending_inc
.new_pools
[poolid
].cache_mode
=
1850 pg_pool_t::CACHEMODE_READPROXY
;
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid
, pi
] : tmp
.pools
) {
1856 if (pi
.removed_snaps
.empty()) {
1859 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1860 pending_inc
.new_pools
[poolid
] = pi
;
1862 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1864 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1871 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
1872 it
->lower_bound("purged_snap_");
1873 map
<int64_t,snap_interval_set_t
> combined
;
1874 while (it
->valid()) {
1875 if (it
->key().find("purged_snap_") != 0) {
1878 string k
= it
->key();
1879 long long unsigned pool
;
1880 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1882 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1884 bufferlist v
= it
->value();
1885 auto p
= v
.cbegin();
1886 snapid_t begin
, end
;
1887 ceph::decode(begin
, p
);
1888 ceph::decode(end
, p
);
1889 combined
[pool
].insert(begin
, end
- begin
);
1893 if (!combined
.empty()) {
1894 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1896 ceph::encode(combined
, v
);
1897 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1898 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1902 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1909 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1914 for (auto i
= pending_inc
.new_state
.begin();
1915 i
!= pending_inc
.new_state
.end();
1917 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1918 if (s
& CEPH_OSD_UP
)
1919 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1920 if (s
& CEPH_OSD_EXISTS
)
1921 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1923 for (auto i
= pending_inc
.new_up_client
.begin();
1924 i
!= pending_inc
.new_up_client
.end();
1926 //FIXME: insert cluster addresses too
1927 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1929 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1930 i
!= pending_inc
.new_weight
.end();
1932 if (i
->second
== CEPH_OSD_OUT
) {
1933 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1934 } else if (i
->second
== CEPH_OSD_IN
) {
1935 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1937 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1941 // features for osdmap and its incremental
1944 // encode full map and determine its crc
1947 tmp
.deepish_copy_from(osdmap
);
1948 tmp
.apply_incremental(pending_inc
);
1950 // determine appropriate features
1951 features
= tmp
.get_encoding_features();
1952 dout(10) << __func__
<< " encoding full map with "
1953 << tmp
.require_osd_release
1954 << " features " << features
<< dendl
;
1956 // the features should be a subset of the mon quorum's features!
1957 ceph_assert((features
& ~mon
->get_quorum_con_features()) == 0);
1960 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1961 pending_inc
.full_crc
= tmp
.get_crc();
1963 // include full map in the txn. note that old monitors will
1964 // overwrite this. new ones will now skip the local full map
1965 // encode and reload from this.
1966 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1970 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
1972 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1974 dout(20) << " full_crc " << tmp
.get_crc()
1975 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1977 /* put everything in the transaction */
1978 put_version(t
, pending_inc
.epoch
, bl
);
1979 put_last_committed(t
, pending_inc
.epoch
);
1982 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1983 p
!= pending_metadata
.end();
1985 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1986 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1987 p
!= pending_metadata_rm
.end();
1989 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1990 pending_metadata
.clear();
1991 pending_metadata_rm
.clear();
1994 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
1995 !pending_inc
.new_purged_snaps
.empty()) {
1996 // all snaps purged this epoch (across all pools)
1997 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
1999 encode(pending_inc
.new_purged_snaps
, v
);
2000 t
->put(OSD_SNAP_PREFIX
, k
, v
);
2002 for (auto& i
: pending_inc
.new_purged_snaps
) {
2003 for (auto q
= i
.second
.begin();
2004 q
!= i
.second
.end();
2006 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
2011 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
2012 for (auto snap
: snaps
) {
2013 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2020 health_check_map_t next
;
2021 tmp
.check_health(cct
, &next
);
2022 encode_health(next
, t
);
2025 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2028 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2032 auto p
= bl
.cbegin();
2035 catch (buffer::error
& e
) {
2037 *err
<< "osd." << osd
<< " metadata is corrupt";
2043 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2045 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2046 if (osdmap
.is_up(osd
)) {
2047 map
<string
,string
> meta
;
2048 load_metadata(osd
, meta
, nullptr);
2049 auto p
= meta
.find(field
);
2050 if (p
== meta
.end()) {
2051 (*out
)["unknown"]++;
2053 (*out
)[p
->second
]++;
2059 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2061 map
<string
,int> by_val
;
2062 count_metadata(field
, &by_val
);
2063 f
->open_object_section(field
.c_str());
2064 for (auto& p
: by_val
) {
2065 f
->dump_int(p
.first
.c_str(), p
.second
);
2070 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2072 map
<string
, string
> metadata
;
2073 int r
= load_metadata(osd
, metadata
, nullptr);
2077 auto it
= metadata
.find("osd_objectstore");
2078 if (it
== metadata
.end())
2084 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2085 const pg_pool_t
&pool
,
2088 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2089 // since filestore osds could always join the pool later
2090 set
<int> checked_osds
;
2091 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2092 vector
<int> up
, acting
;
2093 pg_t
pgid(ps
, pool_id
);
2094 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2095 for (int osd
: up
) {
2096 if (checked_osds
.find(osd
) != checked_osds
.end())
2098 string objectstore_type
;
2099 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2100 // allow with missing metadata, e.g. due to an osd never booting yet
2101 if (r
< 0 || objectstore_type
== "bluestore") {
2102 checked_osds
.insert(osd
);
2105 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2112 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2114 map
<string
,string
> m
;
2115 if (int r
= load_metadata(osd
, m
, err
))
2117 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2118 f
->dump_string(p
->first
.c_str(), p
->second
);
2122 void OSDMonitor::print_nodes(Formatter
*f
)
2124 // group OSDs by their hosts
2125 map
<string
, list
<int> > osds
; // hostname => osd
2126 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2127 map
<string
, string
> m
;
2128 if (load_metadata(osd
, m
, NULL
)) {
2131 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2132 if (hostname
== m
.end()) {
2133 // not likely though
2136 osds
[hostname
->second
].push_back(osd
);
2139 dump_services(f
, osds
, "osd");
2142 void OSDMonitor::share_map_with_random_osd()
2144 if (osdmap
.get_num_up_osds() == 0) {
2145 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2149 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
2151 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2155 dout(10) << "committed, telling random " << s
->name
2156 << " all about it" << dendl
;
2158 // get feature of the peer
2159 // use quorum_con_features, if it's an anonymous connection.
2160 uint64_t features
= s
->con_features
? s
->con_features
:
2161 mon
->get_quorum_con_features();
2162 // whatev, they'll request more if they need it
2163 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2164 s
->con
->send_message(m
);
2165 // NOTE: do *not* record osd has up to this epoch (as we do
2166 // elsewhere) as they may still need to request older values.
2169 version_t
OSDMonitor::get_trim_to() const
2171 if (mon
->get_quorum().empty()) {
2172 dout(10) << __func__
<< ": quorum not formed" << dendl
;
2177 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2178 if (!creating_pgs
.pgs
.empty()) {
2183 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2185 << " blocking osdmap trim"
2186 " ('mon_debug_block_osdmap_trim' set to 'true')"
2192 epoch_t floor
= get_min_last_epoch_clean();
2193 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2194 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2195 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2196 floor
= g_conf()->mon_osd_force_trim_to
;
2197 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2199 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2200 if (floor
+ min
> get_last_committed()) {
2201 if (min
< get_last_committed())
2202 floor
= get_last_committed() - min
;
2206 if (floor
> get_first_committed())
2212 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2214 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2215 // also scan osd epochs
2216 // don't trim past the oldest reported osd epoch
2217 for (auto& osd_epoch
: osd_epochs
) {
2218 if (osd_epoch
.second
< floor
&&
2219 osdmap
.is_out(osd_epoch
.first
)) {
2220 floor
= osd_epoch
.second
;
2226 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2229 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2231 get_version_full(first
, bl
);
2232 put_version_full(tx
, first
, bl
);
2234 if (has_osdmap_manifest
&&
2235 first
> osdmap_manifest
.get_first_pinned()) {
2236 _prune_update_trimmed(tx
, first
);
2241 /* full osdmap prune
2243 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2246 void OSDMonitor::load_osdmap_manifest()
2248 bool store_has_manifest
=
2249 mon
->store
->exists(get_service_name(), "osdmap_manifest");
2251 if (!store_has_manifest
) {
2252 if (!has_osdmap_manifest
) {
2256 dout(20) << __func__
2257 << " dropping osdmap manifest from memory." << dendl
;
2258 osdmap_manifest
= osdmap_manifest_t();
2259 has_osdmap_manifest
= false;
2263 dout(20) << __func__
2264 << " osdmap manifest detected in store; reload." << dendl
;
2266 bufferlist manifest_bl
;
2267 int r
= get_value("osdmap_manifest", manifest_bl
);
2269 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2270 ceph_abort_msg("error reading manifest");
2272 osdmap_manifest
.decode(manifest_bl
);
2273 has_osdmap_manifest
= true;
2275 dout(10) << __func__
<< " store osdmap manifest pinned ("
2276 << osdmap_manifest
.get_first_pinned()
2278 << osdmap_manifest
.get_last_pinned()
2283 bool OSDMonitor::should_prune() const
2285 version_t first
= get_first_committed();
2286 version_t last
= get_last_committed();
2287 version_t min_osdmap_epochs
=
2288 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2289 version_t prune_min
=
2290 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2291 version_t prune_interval
=
2292 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2293 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2294 version_t last_to_pin
= last
- min_osdmap_epochs
;
2296 // Make it or break it constraints.
2298 // If any of these conditions fails, we will not prune, regardless of
2299 // whether we have an on-disk manifest with an on-going pruning state.
2301 if ((last
- first
) <= min_osdmap_epochs
) {
2302 // between the first and last committed epochs, we don't have
2303 // enough epochs to trim, much less to prune.
2304 dout(10) << __func__
2305 << " currently holding only " << (last
- first
)
2306 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2307 << "); do not prune."
2311 } else if ((last_to_pin
- first
) < prune_min
) {
2312 // between the first committed epoch and the last epoch we would prune,
2313 // we simply don't have enough versions over the minimum to prune maps.
2314 dout(10) << __func__
2315 << " could only prune " << (last_to_pin
- first
)
2316 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2317 " is less than the required minimum (" << prune_min
<< ")"
2321 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2322 dout(10) << __func__
2323 << " we have pruned as far as we can; do not prune."
2327 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2328 dout(10) << __func__
2329 << " not enough epochs to form an interval (last pinned: "
2330 << last_pinned
<< ", last to pin: "
2331 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2336 dout(15) << __func__
2337 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2338 << " lc (" << first
<< ".." << last
<< ")"
2343 void OSDMonitor::_prune_update_trimmed(
2344 MonitorDBStore::TransactionRef tx
,
2347 dout(10) << __func__
2348 << " first " << first
2349 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2350 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2353 osdmap_manifest_t manifest
= osdmap_manifest
;
2355 if (!manifest
.is_pinned(first
)) {
2356 manifest
.pin(first
);
2359 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2360 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2361 manifest
.pinned
.erase(p
, p_end
);
2362 ceph_assert(manifest
.get_first_pinned() == first
);
2364 if (manifest
.get_last_pinned() == first
+1 ||
2365 manifest
.pinned
.size() == 1) {
2366 // we reached the end of the line, as pinned maps go; clean up our
2367 // manifest, and let `should_prune()` decide whether we should prune
2369 tx
->erase(get_service_name(), "osdmap_manifest");
2374 manifest
.encode(bl
);
2375 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2378 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2380 dout(1) << __func__
<< dendl
;
2382 version_t pin_first
;
2384 // verify constrainsts on stable in-memory state
2385 if (!has_osdmap_manifest
) {
2386 // we must have never pruned, OR if we pruned the state must no longer
2387 // be relevant (i.e., the state must have been removed alongside with
2388 // the trim that *must* have removed past the last pinned map in a
2390 ceph_assert(osdmap_manifest
.pinned
.empty());
2391 ceph_assert(!mon
->store
->exists(get_service_name(), "osdmap_manifest"));
2392 pin_first
= get_first_committed();
2395 // we must have pruned in the past AND its state is still relevant
2396 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2397 // and thus we still hold a manifest in the store).
2398 ceph_assert(!osdmap_manifest
.pinned
.empty());
2399 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2400 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2402 dout(10) << __func__
2403 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2404 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2407 pin_first
= osdmap_manifest
.get_last_pinned();
2410 manifest
.pin(pin_first
);
2413 bool OSDMonitor::_prune_sanitize_options() const
2415 uint64_t prune_interval
=
2416 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2417 uint64_t prune_min
=
2418 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2420 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2424 if (prune_interval
== 0) {
2426 << " prune is enabled BUT prune interval is zero; abort."
2429 } else if (prune_interval
== 1) {
2431 << " prune interval is equal to one, which essentially means"
2432 " no pruning; abort."
2436 if (prune_min
== 0) {
2438 << " prune is enabled BUT prune min is zero; abort."
2442 if (prune_interval
> prune_min
) {
2444 << " impossible to ascertain proper prune interval because"
2445 << " it is greater than the minimum prune epochs"
2446 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2451 if (txsize
< prune_interval
- 1) {
2453 << "'mon_osdmap_full_prune_txsize' (" << txsize
2454 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2455 << "); abort." << dendl
;
2461 bool OSDMonitor::is_prune_enabled() const {
2462 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2465 bool OSDMonitor::is_prune_supported() const {
2466 return mon
->get_required_mon_features().contains_any(
2467 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2472 * @returns true if has side-effects; false otherwise.
2474 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2476 bool enabled
= is_prune_enabled();
2478 dout(1) << __func__
<< " osdmap full prune "
2479 << ( enabled
? "enabled" : "disabled")
2482 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2486 // we are beyond the minimum prune versions, we need to remove maps because
2487 // otherwise the store will grow unbounded and we may end up having issues
2488 // with available disk space or store hangs.
2490 // we will not pin all versions. We will leave a buffer number of versions.
2491 // this allows us the monitor to trim maps without caring too much about
2492 // pinned maps, and then allow us to use another ceph-mon without these
2493 // capabilities, without having to repair the store.
2495 osdmap_manifest_t manifest
= osdmap_manifest
;
2497 version_t first
= get_first_committed();
2498 version_t last
= get_last_committed();
2500 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2501 version_t last_pinned
= manifest
.get_last_pinned();
2502 uint64_t prune_interval
=
2503 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2505 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2507 prune_init(manifest
);
2509 // we need to get rid of some osdmaps
2512 << " lc (" << first
<< " .. " << last
<< ")"
2513 << " last_pinned " << last_pinned
2514 << " interval " << prune_interval
2515 << " last_to_pin " << last_to_pin
2518 // We will be erasing maps as we go.
2520 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2522 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2523 // we stop pruning. We could prune the maps between `next_to_pin` and
2524 // `last_to_pin`, but by not doing it we end up with neater pruned
2525 // intervals, aligned with `prune_interval`. Besides, this should not be a
2526 // problem as long as `prune_interval` is set to a sane value, instead of
2527 // hundreds or thousands of maps.
2529 auto map_exists
= [this](version_t v
) {
2530 string k
= mon
->store
->combine_strings("full", v
);
2531 return mon
->store
->exists(get_service_name(), k
);
2534 // 'interval' represents the number of maps from the last pinned
2535 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2536 // version 11 next; all intermediate versions will be removed.
2538 // 'txsize' represents the maximum number of versions we'll be removing in
2539 // this iteration. If 'txsize' is large enough to perform multiple passes
2540 // pinning and removing maps, we will do so; if not, we'll do at least one
2541 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2542 // ensure that we never go *over* the maximum.
2544 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2545 uint64_t removal_interval
= prune_interval
- 1;
2547 if (txsize
< removal_interval
) {
2549 << " setting txsize to removal interval size ("
2550 << removal_interval
<< " versions"
2552 txsize
= removal_interval
;
2554 ceph_assert(removal_interval
> 0);
2556 uint64_t num_pruned
= 0;
2557 while (num_pruned
+ removal_interval
<= txsize
) {
2558 last_pinned
= manifest
.get_last_pinned();
2560 if (last_pinned
+ prune_interval
> last_to_pin
) {
2563 ceph_assert(last_pinned
< last_to_pin
);
2565 version_t next_pinned
= last_pinned
+ prune_interval
;
2566 ceph_assert(next_pinned
<= last_to_pin
);
2567 manifest
.pin(next_pinned
);
2569 dout(20) << __func__
2570 << " last_pinned " << last_pinned
2571 << " next_pinned " << next_pinned
2572 << " num_pruned " << num_pruned
2573 << " removal interval (" << (last_pinned
+1)
2574 << ".." << (next_pinned
-1) << ")"
2575 << " txsize " << txsize
<< dendl
;
2577 ceph_assert(map_exists(last_pinned
));
2578 ceph_assert(map_exists(next_pinned
));
2580 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2581 ceph_assert(!manifest
.is_pinned(v
));
2583 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2584 string full_key
= mon
->store
->combine_strings("full", v
);
2585 tx
->erase(get_service_name(), full_key
);
2590 ceph_assert(num_pruned
> 0);
2593 manifest
.encode(bl
);
2594 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2602 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2604 op
->mark_osdmon_event(__func__
);
2605 Message
*m
= op
->get_req();
2606 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2608 switch (m
->get_type()) {
2610 case MSG_MON_COMMAND
:
2612 return preprocess_command(op
);
2613 } catch (const bad_cmd_get
& e
) {
2615 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2618 case CEPH_MSG_MON_GET_OSDMAP
:
2619 return preprocess_get_osdmap(op
);
2622 case MSG_OSD_MARK_ME_DOWN
:
2623 return preprocess_mark_me_down(op
);
2624 case MSG_OSD_MARK_ME_DEAD
:
2625 return preprocess_mark_me_dead(op
);
2627 return preprocess_full(op
);
2628 case MSG_OSD_FAILURE
:
2629 return preprocess_failure(op
);
2631 return preprocess_boot(op
);
2633 return preprocess_alive(op
);
2634 case MSG_OSD_PG_CREATED
:
2635 return preprocess_pg_created(op
);
2636 case MSG_OSD_PG_READY_TO_MERGE
:
2637 return preprocess_pg_ready_to_merge(op
);
2638 case MSG_OSD_PGTEMP
:
2639 return preprocess_pgtemp(op
);
2640 case MSG_OSD_BEACON
:
2641 return preprocess_beacon(op
);
2643 case CEPH_MSG_POOLOP
:
2644 return preprocess_pool_op(op
);
2646 case MSG_REMOVE_SNAPS
:
2647 return preprocess_remove_snaps(op
);
2649 case MSG_MON_GET_PURGED_SNAPS
:
2650 return preprocess_get_purged_snaps(op
);
2658 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2660 op
->mark_osdmon_event(__func__
);
2661 Message
*m
= op
->get_req();
2662 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2664 switch (m
->get_type()) {
2666 case MSG_OSD_MARK_ME_DOWN
:
2667 return prepare_mark_me_down(op
);
2668 case MSG_OSD_MARK_ME_DEAD
:
2669 return prepare_mark_me_dead(op
);
2671 return prepare_full(op
);
2672 case MSG_OSD_FAILURE
:
2673 return prepare_failure(op
);
2675 return prepare_boot(op
);
2677 return prepare_alive(op
);
2678 case MSG_OSD_PG_CREATED
:
2679 return prepare_pg_created(op
);
2680 case MSG_OSD_PGTEMP
:
2681 return prepare_pgtemp(op
);
2682 case MSG_OSD_PG_READY_TO_MERGE
:
2683 return prepare_pg_ready_to_merge(op
);
2684 case MSG_OSD_BEACON
:
2685 return prepare_beacon(op
);
2687 case MSG_MON_COMMAND
:
2689 return prepare_command(op
);
2690 } catch (const bad_cmd_get
& e
) {
2692 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2696 case CEPH_MSG_POOLOP
:
2697 return prepare_pool_op(op
);
2699 case MSG_REMOVE_SNAPS
:
2700 return prepare_remove_snaps(op
);
2710 bool OSDMonitor::should_propose(double& delay
)
2712 dout(10) << "should_propose" << dendl
;
2714 // if full map, propose immediately! any subsequent changes will be clobbered.
2715 if (pending_inc
.fullmap
.length())
2718 // adjust osd weights?
2719 if (!osd_weight
.empty() &&
2720 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2721 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2722 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2728 return PaxosService::should_propose(delay
);
2733 // ---------------------------
2736 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2738 op
->mark_osdmon_event(__func__
);
2739 auto m
= op
->get_req
<MMonGetOSDMap
>();
2741 uint64_t features
= mon
->get_quorum_con_features();
2742 if (op
->get_session() && op
->get_session()->con_features
)
2743 features
= op
->get_session()->con_features
;
2745 dout(10) << __func__
<< " " << *m
<< dendl
;
2746 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
, features
);
2747 epoch_t first
= get_first_committed();
2748 epoch_t last
= osdmap
.get_epoch();
2749 int max
= g_conf()->osd_map_message_max
;
2750 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2751 for (epoch_t e
= std::max(first
, m
->get_full_first());
2752 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2754 bufferlist
& bl
= reply
->maps
[e
];
2755 int r
= get_version_full(e
, features
, bl
);
2756 ceph_assert(r
>= 0);
2757 max_bytes
-= bl
.length();
2759 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2760 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2762 bufferlist
& bl
= reply
->incremental_maps
[e
];
2763 int r
= get_version(e
, features
, bl
);
2764 ceph_assert(r
>= 0);
2765 max_bytes
-= bl
.length();
2767 reply
->oldest_map
= first
;
2768 reply
->newest_map
= last
;
2769 mon
->send_reply(op
, reply
);
2774 // ---------------------------
2779 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2780 // check permissions
2781 MonSession
*session
= op
->get_session();
2784 if (!session
->is_capable("osd", MON_CAP_X
)) {
2785 dout(0) << "got MOSDFailure from entity with insufficient caps "
2786 << session
->caps
<< dendl
;
2789 if (fsid
!= mon
->monmap
->fsid
) {
2790 dout(0) << "check_source: on fsid " << fsid
2791 << " != " << mon
->monmap
->fsid
<< dendl
;
2798 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2800 op
->mark_osdmon_event(__func__
);
2801 auto m
= op
->get_req
<MOSDFailure
>();
2802 // who is target_osd
2803 int badboy
= m
->get_target_osd();
2805 // check permissions
2806 if (check_source(op
, m
->fsid
))
2809 // first, verify the reporting host is valid
2810 if (m
->get_orig_source().is_osd()) {
2811 int from
= m
->get_orig_source().num();
2812 if (!osdmap
.exists(from
) ||
2813 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2814 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2815 dout(5) << "preprocess_failure from dead osd." << from
2816 << ", ignoring" << dendl
;
2817 send_incremental(op
, m
->get_epoch()+1);
2824 if (osdmap
.is_down(badboy
)) {
2825 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2826 << " " << m
->get_target_addrs()
2827 << ", from " << m
->get_orig_source() << dendl
;
2828 if (m
->get_epoch() < osdmap
.get_epoch())
2829 send_incremental(op
, m
->get_epoch()+1);
2832 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2833 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2834 << " " << m
->get_target_addrs()
2835 << " != map's " << osdmap
.get_addrs(badboy
)
2836 << ", from " << m
->get_orig_source() << dendl
;
2837 if (m
->get_epoch() < osdmap
.get_epoch())
2838 send_incremental(op
, m
->get_epoch()+1);
2842 // already reported?
2843 if (osdmap
.is_down(badboy
) ||
2844 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2845 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2846 << " " << m
->get_target_addrs()
2847 << ", from " << m
->get_orig_source() << dendl
;
2848 if (m
->get_epoch() < osdmap
.get_epoch())
2849 send_incremental(op
, m
->get_epoch()+1);
2853 if (!can_mark_down(badboy
)) {
2854 dout(5) << "preprocess_failure ignoring report of osd."
2855 << m
->get_target_osd() << " " << m
->get_target_addrs()
2856 << " from " << m
->get_orig_source() << dendl
;
2860 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2861 << " " << m
->get_target_addrs()
2862 << ", from " << m
->get_orig_source() << dendl
;
2870 class C_AckMarkedDown
: public C_MonOp
{
2876 : C_MonOp(op
), osdmon(osdmon
) {}
2878 void _finish(int r
) override
{
2880 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2881 osdmon
->mon
->send_reply(
2888 false)); // ACK itself does not request an ack
2889 } else if (r
== -EAGAIN
) {
2890 osdmon
->dispatch(op
);
2892 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
2895 ~C_AckMarkedDown() override
{
2899 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
2901 op
->mark_osdmon_event(__func__
);
2902 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2903 int from
= m
->target_osd
;
2905 // check permissions
2906 if (check_source(op
, m
->fsid
))
2909 // first, verify the reporting host is valid
2910 if (!m
->get_orig_source().is_osd())
2913 if (!osdmap
.exists(from
) ||
2914 osdmap
.is_down(from
) ||
2915 osdmap
.get_addrs(from
) != m
->target_addrs
) {
2916 dout(5) << "preprocess_mark_me_down from dead osd."
2917 << from
<< ", ignoring" << dendl
;
2918 send_incremental(op
, m
->get_epoch()+1);
2922 // no down might be set
2923 if (!can_mark_down(from
))
2926 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
2927 << " " << m
->target_addrs
<< dendl
;
2931 if (m
->request_ack
) {
2932 Context
*c(new C_AckMarkedDown(this, op
));
2938 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
2940 op
->mark_osdmon_event(__func__
);
2941 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2942 int target_osd
= m
->target_osd
;
2944 ceph_assert(osdmap
.is_up(target_osd
));
2945 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
2947 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
2948 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2950 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
2954 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
2956 op
->mark_osdmon_event(__func__
);
2957 auto m
= op
->get_req
<MOSDMarkMeDead
>();
2958 int from
= m
->target_osd
;
2960 // check permissions
2961 if (check_source(op
, m
->fsid
)) {
2966 // first, verify the reporting host is valid
2967 if (!m
->get_orig_source().is_osd()) {
2972 if (!osdmap
.exists(from
) ||
2973 !osdmap
.is_down(from
)) {
2974 dout(5) << __func__
<< " from nonexistent or up osd." << from
2975 << ", ignoring" << dendl
;
2976 send_incremental(op
, m
->get_epoch()+1);
2984 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
2986 op
->mark_osdmon_event(__func__
);
2987 auto m
= op
->get_req
<MOSDMarkMeDead
>();
2988 int target_osd
= m
->target_osd
;
2990 ceph_assert(osdmap
.is_down(target_osd
));
2992 mon
->clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
2994 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
2995 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
2997 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
2998 wait_for_finished_proposal(
3001 [op
, this] (int r
) {
3003 mon
->no_reply(op
); // ignore on success
3010 bool OSDMonitor::can_mark_down(int i
)
3012 if (osdmap
.is_nodown(i
)) {
3013 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3014 << "will not mark it down" << dendl
;
3018 int num_osds
= osdmap
.get_num_osds();
3019 if (num_osds
== 0) {
3020 dout(5) << __func__
<< " no osds" << dendl
;
3023 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3024 float up_ratio
= (float)up
/ (float)num_osds
;
3025 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3026 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3027 << g_conf()->mon_osd_min_up_ratio
3028 << ", will not mark osd." << i
<< " down" << dendl
;
3034 bool OSDMonitor::can_mark_up(int i
)
3036 if (osdmap
.is_noup(i
)) {
3037 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3038 << "will not mark it up" << dendl
;
3046 * @note the parameter @p i apparently only exists here so we can output the
3047 * osd's id on messages.
3049 bool OSDMonitor::can_mark_out(int i
)
3051 if (osdmap
.is_noout(i
)) {
3052 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3053 << "will not mark it out" << dendl
;
3057 int num_osds
= osdmap
.get_num_osds();
3058 if (num_osds
== 0) {
3059 dout(5) << __func__
<< " no osds" << dendl
;
3062 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3063 float in_ratio
= (float)in
/ (float)num_osds
;
3064 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3066 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3067 << g_conf()->mon_osd_min_in_ratio
3068 << ", will not mark osd." << i
<< " out" << dendl
;
3070 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3071 << g_conf()->mon_osd_min_in_ratio
3072 << ", will not mark osds out" << dendl
;
3079 bool OSDMonitor::can_mark_in(int i
)
3081 if (osdmap
.is_noin(i
)) {
3082 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3083 << "will not mark it in" << dendl
;
3090 bool OSDMonitor::check_failures(utime_t now
)
3092 bool found_failure
= false;
3093 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3094 p
!= failure_info
.end();
3096 if (can_mark_down(p
->first
)) {
3097 found_failure
|= check_failure(now
, p
->first
, p
->second
);
3100 return found_failure
;
3103 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3105 // already pending failure?
3106 if (pending_inc
.new_state
.count(target_osd
) &&
3107 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3108 dout(10) << " already pending failure" << dendl
;
3112 set
<string
> reporters_by_subtree
;
3113 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3114 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3115 utime_t max_failed_since
= fi
.get_failed_since();
3116 utime_t failed_for
= now
- max_failed_since
;
3118 utime_t grace
= orig_grace
;
3119 double my_grace
= 0, peer_grace
= 0;
3121 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3122 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3123 decay_k
= ::log(.5) / halflife
;
3125 // scale grace period based on historical probability of 'lagginess'
3126 // (false positive failures due to slowness).
3127 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3128 double decay
= exp((double)failed_for
* decay_k
);
3129 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3130 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3131 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3135 // consider the peers reporting a failure a proxy for a potential
3136 // 'subcluster' over the overall cluster that is similarly
3137 // laggy. this is clearly not true in all cases, but will sometimes
3138 // help us localize the grace correction to a subset of the system
3139 // (say, a rack with a bad switch) that is unhappy.
3140 ceph_assert(fi
.reporters
.size());
3141 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3142 // get the parent bucket whose type matches with "reporter_subtree_level".
3143 // fall back to OSD if the level doesn't exist.
3144 if (osdmap
.exists(p
->first
)) {
3145 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3146 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3147 iter
== reporter_loc
.end()) {
3148 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3150 reporters_by_subtree
.insert(iter
->second
);
3152 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3153 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
3154 utime_t elapsed
= now
- xi
.down_stamp
;
3155 double decay
= exp((double)elapsed
* decay_k
);
3156 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3160 fi
.cancel_report(p
->first
);;
3161 p
= fi
.reporters
.erase(p
);
3165 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3166 peer_grace
/= (double)fi
.reporters
.size();
3167 grace
+= peer_grace
;
3170 dout(10) << " osd." << target_osd
<< " has "
3171 << fi
.reporters
.size() << " reporters, "
3172 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3173 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
3176 if (failed_for
>= grace
&&
3177 reporters_by_subtree
.size() >= g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3178 dout(1) << " we have enough reporters to mark osd." << target_osd
3179 << " down" << dendl
;
3180 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3182 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
3183 << osdmap
.crush
->get_full_location_ordered_string(
3186 << (int)reporters_by_subtree
.size()
3187 << " reporters from different "
3188 << reporter_subtree_level
<< " after "
3189 << failed_for
<< " >= grace " << grace
<< ")";
3195 void OSDMonitor::force_failure(int target_osd
, int by
)
3197 // already pending failure?
3198 if (pending_inc
.new_state
.count(target_osd
) &&
3199 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3200 dout(10) << " already pending failure" << dendl
;
3204 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
3205 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3206 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3207 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3209 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3211 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
3212 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3213 << ") (connection refused reported by osd." << by
<< ")";
3217 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3219 op
->mark_osdmon_event(__func__
);
3220 auto m
= op
->get_req
<MOSDFailure
>();
3221 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3222 << " " << m
->get_target_addrs()
3223 << " from " << m
->get_orig_source()
3224 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3226 int target_osd
= m
->get_target_osd();
3227 int reporter
= m
->get_orig_source().num();
3228 ceph_assert(osdmap
.is_up(target_osd
));
3229 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3233 if (m
->if_osd_failed()) {
3234 // calculate failure time
3235 utime_t now
= ceph_clock_now();
3236 utime_t failed_since
=
3237 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3240 if (m
->is_immediate()) {
3241 mon
->clog
->debug() << "osd." << m
->get_target_osd()
3242 << " reported immediately failed by "
3243 << m
->get_orig_source();
3244 force_failure(target_osd
, reporter
);
3247 mon
->clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3248 << m
->get_orig_source();
3250 failure_info_t
& fi
= failure_info
[target_osd
];
3251 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
3253 mon
->no_reply(old_op
);
3256 return check_failure(now
, target_osd
, fi
);
3258 // remove the report
3259 mon
->clog
->debug() << "osd." << m
->get_target_osd()
3260 << " failure report canceled by "
3261 << m
->get_orig_source();
3262 if (failure_info
.count(target_osd
)) {
3263 failure_info_t
& fi
= failure_info
[target_osd
];
3264 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
3266 mon
->no_reply(report_op
);
3268 if (fi
.reporters
.empty()) {
3269 dout(10) << " removing last failure_info for osd." << target_osd
3271 failure_info
.erase(target_osd
);
3273 dout(10) << " failure_info for osd." << target_osd
<< " now "
3274 << fi
.reporters
.size() << " reporters" << dendl
;
3277 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
3284 void OSDMonitor::process_failures()
3286 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3287 while (p
!= failure_info
.end()) {
3288 if (osdmap
.is_up(p
->first
)) {
3291 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3292 list
<MonOpRequestRef
> ls
;
3293 p
->second
.take_report_messages(ls
);
3294 failure_info
.erase(p
++);
3296 while (!ls
.empty()) {
3297 MonOpRequestRef o
= ls
.front();
3299 o
->mark_event(__func__
);
3300 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3301 send_latest(o
, m
->get_epoch());
3310 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3312 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3314 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3315 p
!= failure_info
.end();
3317 p
->second
.take_report_messages(ls
);
3319 failure_info
.clear();
3325 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3327 op
->mark_osdmon_event(__func__
);
3328 auto m
= op
->get_req
<MOSDBoot
>();
3329 int from
= m
->get_orig_source_inst().name
.num();
3331 // check permissions, ignore if failed (no response expected)
3332 MonSession
*session
= op
->get_session();
3335 if (!session
->is_capable("osd", MON_CAP_X
)) {
3336 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3337 << session
->caps
<< dendl
;
3341 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
3342 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3343 << " != " << mon
->monmap
->fsid
<< dendl
;
3347 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3348 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3352 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3354 // force all osds to have gone through luminous prior to upgrade to nautilus
3356 vector
<string
> missing
;
3357 if (!HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
3358 missing
.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
3360 if (!HAVE_FEATURE(m
->osd_features
, SERVER_JEWEL
)) {
3361 missing
.push_back("CEPH_FEATURE_SERVER_JEWEL");
3363 if (!HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
3364 missing
.push_back("CEPH_FEATURE_SERVER_KRAKEN");
3366 if (!HAVE_FEATURE(m
->osd_features
, OSD_RECOVERY_DELETES
)) {
3367 missing
.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
3370 if (!missing
.empty()) {
3371 using std::experimental::make_ostream_joiner
;
3374 copy(begin(missing
), end(missing
), make_ostream_joiner(ss
, ";"));
3376 mon
->clog
->info() << "disallowing boot of OSD "
3377 << m
->get_orig_source_inst()
3378 << " because the osd lacks " << ss
.str();
3383 // make sure osd versions do not span more than 3 releases
3384 if (HAVE_FEATURE(m
->osd_features
, SERVER_OCTOPUS
) &&
3385 osdmap
.require_osd_release
< ceph_release_t::mimic
) {
3386 mon
->clog
->info() << "disallowing boot of octopus+ OSD "
3387 << m
->get_orig_source_inst()
3388 << " because require_osd_release < mimic";
3392 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
3393 // we are reusing a jewel feature bit that was retired in luminous.
3394 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
3395 osdmap
.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT
) &&
3396 !(m
->osd_features
& CEPH_FEATURE_OSD_PGLOG_HARDLIMIT
)) {
3397 mon
->clog
->info() << "disallowing boot of OSD "
3398 << m
->get_orig_source_inst()
3399 << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
3404 if (osdmap
.is_up(from
) &&
3405 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3406 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3408 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3409 << " " << m
->get_orig_source_addrs()
3410 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3415 if (osdmap
.exists(from
) &&
3416 !osdmap
.get_uuid(from
).is_zero() &&
3417 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3418 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3419 << " clashes with existing osd: different fsid"
3420 << " (ours: " << osdmap
.get_uuid(from
)
3421 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3425 if (osdmap
.exists(from
) &&
3426 osdmap
.get_info(from
).up_from
> m
->version
&&
3427 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3428 m
->get_orig_source_addrs())) {
3429 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3430 send_latest(op
, m
->sb
.current_epoch
+1);
3435 if (!can_mark_up(from
)) {
3436 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3437 send_latest(op
, m
->sb
.current_epoch
+1);
3441 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
3448 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3450 op
->mark_osdmon_event(__func__
);
3451 auto m
= op
->get_req
<MOSDBoot
>();
3452 dout(7) << __func__
<< " from " << m
->get_source()
3454 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3455 << " cluster_addrs " << m
->cluster_addrs
3456 << " hb_back_addrs " << m
->hb_back_addrs
3457 << " hb_front_addrs " << m
->hb_front_addrs
3460 ceph_assert(m
->get_orig_source().is_osd());
3461 int from
= m
->get_orig_source().num();
3463 // does this osd exist?
3464 if (from
>= osdmap
.get_max_osd()) {
3465 dout(1) << "boot from osd." << from
<< " >= max_osd "
3466 << osdmap
.get_max_osd() << dendl
;
3470 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3471 if (pending_inc
.new_state
.count(from
))
3472 oldstate
^= pending_inc
.new_state
[from
];
3474 // already up? mark down first?
3475 if (osdmap
.is_up(from
)) {
3476 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3477 << osdmap
.get_addrs(from
) << dendl
;
3478 // preprocess should have caught these; if not, assert.
3479 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3480 m
->get_orig_source_addrs()) ||
3481 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3482 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3484 if (pending_inc
.new_state
.count(from
) == 0 ||
3485 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3486 // mark previous guy down
3487 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3489 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3490 } else if (pending_inc
.new_up_client
.count(from
)) {
3491 // already prepared, just wait
3492 dout(7) << __func__
<< " already prepared, waiting on "
3493 << m
->get_orig_source_addr() << dendl
;
3494 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3497 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3498 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3499 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3500 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3502 down_pending_out
.erase(from
); // if any
3505 osd_weight
[from
] = m
->sb
.weight
;
3508 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3510 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3511 // preprocess should have caught this; if not, assert.
3512 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3513 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3517 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3518 const osd_info_t
& i
= osdmap
.get_info(from
);
3519 if (i
.up_from
> i
.lost_at
) {
3520 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3521 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3526 bufferlist osd_metadata
;
3527 encode(m
->metadata
, osd_metadata
);
3528 pending_metadata
[from
] = osd_metadata
;
3529 pending_metadata_rm
.erase(from
);
3531 // adjust last clean unmount epoch?
3532 const osd_info_t
& info
= osdmap
.get_info(from
);
3533 dout(10) << " old osd_info: " << info
<< dendl
;
3534 if (m
->sb
.mounted
> info
.last_clean_begin
||
3535 (m
->sb
.mounted
== info
.last_clean_begin
&&
3536 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3537 epoch_t begin
= m
->sb
.mounted
;
3538 epoch_t end
= m
->sb
.clean_thru
;
3540 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3541 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3542 << ") -> [" << begin
<< "-" << end
<< ")"
3544 pending_inc
.new_last_clean_interval
[from
] =
3545 pair
<epoch_t
,epoch_t
>(begin
, end
);
3548 if (pending_inc
.new_xinfo
.count(from
) == 0)
3549 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3550 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
3551 if (m
->boot_epoch
== 0) {
3552 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3553 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3554 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3556 if (xi
.down_stamp
.sec()) {
3557 int interval
= ceph_clock_now().sec() -
3558 xi
.down_stamp
.sec();
3559 if (g_conf()->mon_osd_laggy_max_interval
&&
3560 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3561 interval
= g_conf()->mon_osd_laggy_max_interval
;
3564 interval
* g_conf()->mon_osd_laggy_weight
+
3565 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3567 xi
.laggy_probability
=
3568 g_conf()->mon_osd_laggy_weight
+
3569 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3570 dout(10) << " laggy, now xi " << xi
<< dendl
;
3573 // set features shared by the osd
3574 if (m
->osd_features
)
3575 xi
.features
= m
->osd_features
;
3577 xi
.features
= m
->get_connection()->get_features();
3580 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3581 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3582 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3583 (g_conf()->mon_osd_auto_mark_in
)) {
3584 if (can_mark_in(from
)) {
3585 if (xi
.old_weight
> 0) {
3586 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3589 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3592 dout(7) << __func__
<< " NOIN set, will not mark in "
3593 << m
->get_orig_source_addr() << dendl
;
3598 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3603 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3605 op
->mark_osdmon_event(__func__
);
3606 auto m
= op
->get_req
<MOSDBoot
>();
3607 dout(7) << "_booted " << m
->get_orig_source_inst()
3608 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3611 mon
->clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3615 send_latest(op
, m
->sb
.current_epoch
+1);
3622 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3624 op
->mark_osdmon_event(__func__
);
3625 auto m
= op
->get_req
<MOSDFull
>();
3626 int from
= m
->get_orig_source().num();
3628 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3630 // check permissions, ignore if failed
3631 MonSession
*session
= op
->get_session();
3634 if (!session
->is_capable("osd", MON_CAP_X
)) {
3635 dout(0) << "MOSDFull from entity with insufficient privileges:"
3636 << session
->caps
<< dendl
;
3640 // ignore a full message from the osd instance that already went down
3641 if (!osdmap
.exists(from
)) {
3642 dout(7) << __func__
<< " ignoring full message from nonexistent "
3643 << m
->get_orig_source_inst() << dendl
;
3646 if ((!osdmap
.is_up(from
) &&
3647 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3648 m
->get_orig_source_addrs())) ||
3649 (osdmap
.is_up(from
) &&
3650 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3651 dout(7) << __func__
<< " ignoring full message from down "
3652 << m
->get_orig_source_inst() << dendl
;
3656 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3658 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3659 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3660 << " " << m
->get_orig_source_inst() << dendl
;
3661 _reply_map(op
, m
->version
);
3665 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3666 << " " << m
->get_orig_source_inst() << dendl
;
3673 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3675 op
->mark_osdmon_event(__func__
);
3676 auto m
= op
->get_req
<MOSDFull
>();
3677 const int from
= m
->get_orig_source().num();
3679 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3680 const unsigned want_state
= m
->state
& mask
; // safety first
3682 unsigned cur_state
= osdmap
.get_state(from
);
3683 auto p
= pending_inc
.new_state
.find(from
);
3684 if (p
!= pending_inc
.new_state
.end()) {
3685 cur_state
^= p
->second
;
3689 set
<string
> want_state_set
, cur_state_set
;
3690 OSDMap::calc_state_set(want_state
, want_state_set
);
3691 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3693 if (cur_state
!= want_state
) {
3694 if (p
!= pending_inc
.new_state
.end()) {
3697 pending_inc
.new_state
[from
] = 0;
3699 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3700 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3701 << " -> " << want_state_set
<< dendl
;
3703 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3704 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3707 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3714 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3716 op
->mark_osdmon_event(__func__
);
3717 auto m
= op
->get_req
<MOSDAlive
>();
3718 int from
= m
->get_orig_source().num();
3720 // check permissions, ignore if failed
3721 MonSession
*session
= op
->get_session();
3724 if (!session
->is_capable("osd", MON_CAP_X
)) {
3725 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3726 << session
->caps
<< dendl
;
3730 if (!osdmap
.is_up(from
) ||
3731 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3732 dout(7) << "preprocess_alive ignoring alive message from down "
3733 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3738 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3740 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3741 _reply_map(op
, m
->version
);
3745 dout(10) << "preprocess_alive want up_thru " << m
->want
3746 << " from " << m
->get_orig_source_inst() << dendl
;
3753 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3755 op
->mark_osdmon_event(__func__
);
3756 auto m
= op
->get_req
<MOSDAlive
>();
3757 int from
= m
->get_orig_source().num();
3759 if (0) { // we probably don't care much about these
3760 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
3763 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3764 << " from " << m
->get_orig_source_inst() << dendl
;
3766 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3767 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3771 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3773 op
->mark_osdmon_event(__func__
);
3774 dout(7) << "_reply_map " << e
3775 << " from " << op
->get_req()->get_orig_source_inst()
3781 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3783 op
->mark_osdmon_event(__func__
);
3784 auto m
= op
->get_req
<MOSDPGCreated
>();
3785 dout(10) << __func__
<< " " << *m
<< dendl
;
3786 auto session
= op
->get_session();
3789 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3792 if (!session
->is_capable("osd", MON_CAP_X
)) {
3793 derr
<< __func__
<< " received from entity "
3794 << "with insufficient privileges " << session
->caps
<< dendl
;
3797 // always forward the "created!" to the leader
3801 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3803 op
->mark_osdmon_event(__func__
);
3804 auto m
= op
->get_req
<MOSDPGCreated
>();
3805 dout(10) << __func__
<< " " << *m
<< dendl
;
3806 auto src
= m
->get_orig_source();
3807 auto from
= src
.num();
3808 if (!src
.is_osd() ||
3809 !mon
->osdmon()->osdmap
.is_up(from
) ||
3810 !mon
->osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3811 m
->get_orig_source_addrs())) {
3812 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
3815 pending_created_pgs
.push_back(m
->pgid
);
3819 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3821 op
->mark_osdmon_event(__func__
);
3822 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3823 dout(10) << __func__
<< " " << *m
<< dendl
;
3824 const pg_pool_t
*pi
;
3825 auto session
= op
->get_session();
3827 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3830 if (!session
->is_capable("osd", MON_CAP_X
)) {
3831 derr
<< __func__
<< " received from entity "
3832 << "with insufficient privileges " << session
->caps
<< dendl
;
3835 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3837 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3840 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3841 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
3844 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
3845 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
3848 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
3849 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
3859 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
3861 op
->mark_osdmon_event(__func__
);
3862 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3863 dout(10) << __func__
<< " " << *m
<< dendl
;
3865 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
3866 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
3868 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
3869 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
3870 p
.get_pg_num_pending() > m
->pgid
.ps()) {
3871 dout(10) << __func__
3872 << " race with concurrent pg_num[_pending] update, will retry"
3874 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3879 p
.dec_pg_num(m
->pgid
,
3883 m
->last_epoch_started
,
3884 m
->last_epoch_clean
);
3885 p
.last_change
= pending_inc
.epoch
;
3887 // back off the merge attempt!
3888 p
.set_pg_num_pending(p
.get_pg_num());
3891 // force pre-nautilus clients to resend their ops, since they
3892 // don't understand pg_num_pending changes form a new interval
3893 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
3895 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
3897 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
3900 prob
> (double)(rand() % 1000)/1000.0) {
3901 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
3902 auto n
= new MMonCommand(mon
->monmap
->get_fsid());
3903 n
->set_connection(m
->get_connection());
3904 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
3905 osdmap
.get_pool_name(m
->pgid
.pool()) +
3906 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
3907 stringify(m
->pgid
.ps() + 1) + "\"}" };
3908 MonOpRequestRef nop
= mon
->op_tracker
.create_request
<MonOpRequest
>(n
);
3909 nop
->set_type_service();
3910 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
3912 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3921 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
3923 auto m
= op
->get_req
<MOSDPGTemp
>();
3924 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
3925 mempool::osdmap::vector
<int> empty
;
3926 int from
= m
->get_orig_source().num();
3927 size_t ignore_cnt
= 0;
3930 MonSession
*session
= op
->get_session();
3933 if (!session
->is_capable("osd", MON_CAP_X
)) {
3934 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3935 << session
->caps
<< dendl
;
3939 if (!osdmap
.is_up(from
) ||
3940 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3941 dout(7) << "ignoring pgtemp message from down "
3942 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3951 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
3952 dout(20) << " " << p
->first
3953 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
3954 << " -> " << p
->second
<< dendl
;
3956 // does the pool exist?
3957 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
3959 * 1. If the osdmap does not have the pool, it means the pool has been
3960 * removed in-between the osd sending this message and us handling it.
3961 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
3962 * not exist in the pending either, as the osds would not send a
3963 * message about a pool they know nothing about (yet).
3964 * 3. However, if the pool does exist in the pending, then it must be a
3965 * new pool, and not relevant to this message (see 1).
3967 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3968 << ": pool has been removed" << dendl
;
3973 int acting_primary
= -1;
3974 osdmap
.pg_to_up_acting_osds(
3975 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
3976 if (acting_primary
!= from
) {
3977 /* If the source isn't the primary based on the current osdmap, we know
3978 * that the interval changed and that we can discard this message.
3979 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
3980 * which of two pg temp mappings on the same pg is more recent.
3982 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
3983 << ": primary has changed" << dendl
;
3989 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
3990 osdmap
.primary_temp
->count(p
->first
)))
3993 // NOTE: we assume that this will clear pg_primary, so consider
3994 // an existing pg_primary field to imply a change
3995 if (p
->second
.size() &&
3996 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
3997 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
3998 osdmap
.primary_temp
->count(p
->first
)))
4002 // should we ignore all the pgs?
4003 if (ignore_cnt
== m
->pg_temp
.size())
4006 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
4007 _reply_map(op
, m
->map_epoch
);
4014 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4016 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4017 auto ut
= pending_inc
.new_up_thru
.find(from
);
4018 if (ut
!= pending_inc
.new_up_thru
.end()) {
4019 old_up_thru
= ut
->second
;
4021 if (up_thru
> old_up_thru
) {
4022 // set up_thru too, so the osd doesn't have to ask again
4023 pending_inc
.new_up_thru
[from
] = up_thru
;
4027 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4029 op
->mark_osdmon_event(__func__
);
4030 auto m
= op
->get_req
<MOSDPGTemp
>();
4031 int from
= m
->get_orig_source().num();
4032 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4033 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4034 uint64_t pool
= p
->first
.pool();
4035 if (pending_inc
.old_pools
.count(pool
)) {
4036 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4037 << ": pool pending removal" << dendl
;
4040 if (!osdmap
.have_pg_pool(pool
)) {
4041 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4042 << ": pool has been removed" << dendl
;
4045 pending_inc
.new_pg_temp
[p
->first
] =
4046 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4048 // unconditionally clear pg_primary (until this message can encode
4049 // a change for that, too.. at which point we need to also fix
4050 // preprocess_pg_temp)
4051 if (osdmap
.primary_temp
->count(p
->first
) ||
4052 pending_inc
.new_primary_temp
.count(p
->first
))
4053 pending_inc
.new_primary_temp
[p
->first
] = -1;
4056 // set up_thru too, so the osd doesn't have to ask again
4057 update_up_thru(from
, m
->map_epoch
);
4059 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
4066 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4068 op
->mark_osdmon_event(__func__
);
4069 auto m
= op
->get_req
<MRemoveSnaps
>();
4070 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4072 // check privilege, ignore if failed
4073 MonSession
*session
= op
->get_session();
4077 if (!session
->caps
.is_capable(
4079 session
->entity_name
,
4080 "osd", "osd pool rmsnap", {}, true, true, false,
4081 session
->get_peer_socket_addr())) {
4082 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4083 << session
->caps
<< dendl
;
4087 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4088 q
!= m
->snaps
.end();
4090 if (!osdmap
.have_pg_pool(q
->first
)) {
4091 dout(10) << " ignoring removed_snaps " << q
->second
4092 << " on non-existent pool " << q
->first
<< dendl
;
4095 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4096 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4097 p
!= q
->second
.end();
4099 if (*p
> pi
->get_snap_seq() ||
4100 !_is_removed_snap(q
->first
, *p
)) {
4106 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4107 auto reply
= make_message
<MRemoveSnaps
>();
4108 reply
->snaps
= m
->snaps
;
4109 mon
->send_reply(op
, reply
.detach());
4116 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4118 op
->mark_osdmon_event(__func__
);
4119 auto m
= op
->get_req
<MRemoveSnaps
>();
4120 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4122 for (auto& [pool
, snaps
] : m
->snaps
) {
4123 if (!osdmap
.have_pg_pool(pool
)) {
4124 dout(10) << " ignoring removed_snaps " << snaps
4125 << " on non-existent pool " << pool
<< dendl
;
4129 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4130 for (auto s
: snaps
) {
4131 if (!_is_removed_snap(pool
, s
) &&
4132 (!pending_inc
.new_pools
.count(pool
) ||
4133 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4134 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4135 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4136 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4137 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4138 newpi
->removed_snaps
.insert(s
);
4139 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4140 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4142 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4143 if (s
> newpi
->get_snap_seq()) {
4144 dout(10) << " pool " << pool
<< " snap_seq "
4145 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4146 newpi
->set_snap_seq(s
);
4148 newpi
->set_snap_epoch(pending_inc
.epoch
);
4149 dout(10) << " added pool " << pool
<< " snap " << s
4150 << " to removed_snaps queue" << dendl
;
4151 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4156 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4157 auto reply
= make_message
<MRemoveSnaps
>();
4158 reply
->snaps
= m
->snaps
;
4159 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
4165 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4167 op
->mark_osdmon_event(__func__
);
4168 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4169 dout(7) << __func__
<< " " << *m
<< dendl
;
4171 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4173 string k
= make_purged_snap_epoch_key(m
->start
);
4174 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
4176 unsigned long epoch
= m
->last
;
4177 while (it
->valid()) {
4178 if (it
->key().find("purged_epoch_") != 0) {
4181 string k
= it
->key();
4182 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4184 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
4185 } else if (epoch
> m
->last
) {
4188 bufferlist bl
= it
->value();
4189 auto p
= bl
.cbegin();
4193 } catch (buffer::error
& e
) {
4194 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
4199 n
+= 4 + v
.size() * 16;
4202 // impose a semi-arbitrary limit to message size
4208 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4209 reply
->purged_snaps
.swap(r
);
4210 mon
->send_reply(op
, reply
.detach());
4216 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4218 op
->mark_osdmon_event(__func__
);
4220 auto session
= op
->get_session();
4223 dout(10) << __func__
<< " no monitor session!" << dendl
;
4226 if (!session
->is_capable("osd", MON_CAP_X
)) {
4227 derr
<< __func__
<< " received from entity "
4228 << "with insufficient privileges " << session
->caps
<< dendl
;
4231 // Always forward the beacon to the leader, even if they are the same as
4232 // the old one. The leader will mark as down osds that haven't sent
4233 // beacon for a few minutes.
4237 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4239 op
->mark_osdmon_event(__func__
);
4240 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4241 const auto src
= beacon
->get_orig_source();
4242 dout(10) << __func__
<< " " << *beacon
4243 << " from " << src
<< dendl
;
4244 int from
= src
.num();
4246 if (!src
.is_osd() ||
4247 !osdmap
.is_up(from
) ||
4248 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4249 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4250 // share some new maps with this guy in case it may not be
4251 // aware of its own deadness...
4252 send_latest(op
, beacon
->version
+1);
4254 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4258 last_osd_report
[from
] = ceph_clock_now();
4259 osd_epochs
[from
] = beacon
->version
;
4261 for (const auto& pg
: beacon
->pgs
) {
4262 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
4265 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4266 beacon
->last_purged_snaps_scrub
) {
4267 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4268 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4270 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4271 beacon
->last_purged_snaps_scrub
;
4281 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4283 op
->mark_osdmon_event(__func__
);
4284 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4285 << " start " << start
<< dendl
;
4289 send_incremental(op
, start
);
4293 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
4295 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
, features
);
4296 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4297 r
->oldest_map
= get_first_committed();
4298 r
->newest_map
= osdmap
.get_epoch();
4302 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4304 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4305 << std::hex
<< features
<< std::dec
<< dendl
;
4306 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
, features
);
4307 m
->oldest_map
= get_first_committed();
4308 m
->newest_map
= osdmap
.get_epoch();
4310 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4312 int err
= get_version(e
, features
, bl
);
4314 ceph_assert(bl
.length());
4315 // if (get_version(e, bl) > 0) {
4316 dout(20) << "build_incremental inc " << e
<< " "
4317 << bl
.length() << " bytes" << dendl
;
4318 m
->incremental_maps
[e
] = bl
;
4320 ceph_assert(err
== -ENOENT
);
4321 ceph_assert(!bl
.length());
4322 get_version_full(e
, features
, bl
);
4323 if (bl
.length() > 0) {
4324 //else if (get_version("full", e, bl) > 0) {
4325 dout(20) << "build_incremental full " << e
<< " "
4326 << bl
.length() << " bytes" << dendl
;
4329 ceph_abort(); // we should have all maps.
4336 void OSDMonitor::send_full(MonOpRequestRef op
)
4338 op
->mark_osdmon_event(__func__
);
4339 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
4340 mon
->send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4343 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4345 op
->mark_osdmon_event(__func__
);
4347 MonSession
*s
= op
->get_session();
4351 // oh, we can tell the other mon to do it
4352 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4354 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4355 r
->send_osdmap_first
= first
;
4356 s
->proxy_con
->send_message(r
);
4357 op
->mark_event("reply: send routed send_osdmap_first reply");
4360 send_incremental(first
, s
, false, op
);
4364 void OSDMonitor::send_incremental(epoch_t first
,
4365 MonSession
*session
,
4367 MonOpRequestRef req
)
4369 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4370 << " to " << session
->name
<< dendl
;
4372 // get feature of the peer
4373 // use quorum_con_features, if it's an anonymous connection.
4374 uint64_t features
= session
->con_features
? session
->con_features
:
4375 mon
->get_quorum_con_features();
4377 if (first
<= session
->osd_epoch
) {
4378 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4379 << session
->osd_epoch
<< dendl
;
4380 first
= session
->osd_epoch
+ 1;
4383 if (first
< get_first_committed()) {
4384 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4385 m
->oldest_map
= get_first_committed();
4386 m
->newest_map
= osdmap
.get_epoch();
4388 first
= get_first_committed();
4390 int err
= get_version_full(first
, features
, bl
);
4391 ceph_assert(err
== 0);
4392 ceph_assert(bl
.length());
4393 dout(20) << "send_incremental starting with base full "
4394 << first
<< " " << bl
.length() << " bytes" << dendl
;
4395 m
->maps
[first
] = bl
;
4398 mon
->send_reply(req
, m
);
4399 session
->osd_epoch
= first
;
4402 session
->con
->send_message(m
);
4403 session
->osd_epoch
= first
;
4408 while (first
<= osdmap
.get_epoch()) {
4409 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4410 osdmap
.get_epoch());
4411 MOSDMap
*m
= build_incremental(first
, last
, features
);
4414 // send some maps. it may not be all of them, but it will get them
4416 mon
->send_reply(req
, m
);
4418 session
->con
->send_message(m
);
4421 session
->osd_epoch
= last
;
// Convenience wrapper: fetch the incremental map at `ver` encoded with the
// current quorum connection features.
4427 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4429 return get_version(ver
, mon
->get_quorum_con_features(), bl
);
// Re-encode an incremental osdmap `bl` in place for a peer whose feature
// set differs from the canonical on-disk encoding. Any embedded full map
// or crush map is decoded and re-encoded with the reduced feature set too.
// NOTE(review): mangled extraction — declarations of the temporary OSDMap
// `m` and CrushWrapper `c` are among the dropped lines; code left
// byte-identical.
4432 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4434 OSDMap::Incremental inc
;
4435 auto q
= bl
.cbegin();
4437 // always encode with subset of osdmap's canonical features
4438 uint64_t f
= features
& inc
.encode_features
;
4439 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4442 if (inc
.fullmap
.length()) {
4443 // embedded full map?
4445 m
.decode(inc
.fullmap
);
4446 inc
.fullmap
.clear();
// CEPH_FEATURE_RESERVED is OR'd in so the encoding is marked as
// feature-aware (same pattern as the final inc.encode below).
4447 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4449 if (inc
.crush
.length()) {
4450 // embedded crush map
4452 auto p
= inc
.crush
.cbegin();
4455 c
.encode(inc
.crush
, f
);
// Finally re-encode the incremental itself into the caller's buffer.
4457 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
// Re-encode a full osdmap `bl` in place with a feature subset suitable for
// the peer. Mirrors reencode_incremental_map() for full maps.
// NOTE(review): mangled extraction — the declaration/decode of the local
// OSDMap `m` is among the dropped lines; code left byte-identical.
4460 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4463 auto q
= bl
.cbegin();
4465 // always encode with subset of osdmap's canonical features
4466 uint64_t f
= features
& m
.get_encoding_features();
4467 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4470 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
// Fetch the incremental map at `ver` encoded for `features`, consulting the
// LRU cache keyed by (version, significant-feature-bits) before going to
// the store; re-encodes and caches when the peer's significant features
// differ from the quorum's.
// NOTE(review): mangled extraction — early-return and error-path lines are
// missing; code left byte-identical.
4473 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
// Only the feature bits that affect the encoding matter for the cache key.
4475 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4476 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4479 int ret
= PaxosService::get_version(ver
, bl
);
4483 // NOTE: this check is imprecise; the OSDMap encoding features may
4484 // be a subset of the latest mon quorum features, but worst case we
4485 // reencode once and then cache the (identical) result under both
4487 if (significant_features
!=
4488 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4489 reencode_incremental_map(bl
, features
);
4491 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
// Load and decode the OSDMap::Incremental for epoch `ver` into `inc`.
// NOTE(review): mangled extraction — declaration of `inc_bl`, the
// `inc.decode(p)` call, and the return statement are among the dropped
// lines; code left byte-identical.
4495 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4498 int err
= get_version(ver
, inc_bl
);
// The map must exist; a missing or empty incremental is a fatal bug here.
4499 ceph_assert(err
== 0);
4500 ceph_assert(inc_bl
.length());
4502 auto p
= inc_bl
.cbegin();
4504 dout(10) << __func__
<< " "
4505 << " epoch " << inc
.epoch
4506 << " inc_crc " << inc
.inc_crc
4507 << " full_crc " << inc
.full_crc
4508 << " encode_features " << inc
.encode_features
<< dendl
;
4512 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4514 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4516 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4517 if (closest_pinned
== 0) {
4520 if (closest_pinned
> ver
) {
4521 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4523 ceph_assert(closest_pinned
<= ver
);
4525 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4527 // get osdmap incremental maps and apply on top of this one.
4529 bool has_cached_osdmap
= false;
4530 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4531 if (full_osd_cache
.lookup({v
, mon
->get_quorum_con_features()},
4533 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4535 has_cached_osdmap
= true;
4540 if (!has_cached_osdmap
) {
4541 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4543 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4544 << " not available! error: " << cpp_strerror(err
) << dendl
;
4546 ceph_assert(err
== 0);
4549 ceph_assert(osdm_bl
.length());
4552 osdm
.decode(osdm_bl
);
4554 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4555 << " e" << osdm
.epoch
4556 << " crc " << osdm
.get_crc()
4557 << " -- applying incremental maps." << dendl
;
4559 uint64_t encode_features
= 0;
4560 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4561 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4563 OSDMap::Incremental inc
;
4564 int err
= get_inc(v
, inc
);
4565 ceph_assert(err
== 0);
4567 encode_features
= inc
.encode_features
;
4569 err
= osdm
.apply_incremental(inc
);
4570 ceph_assert(err
== 0);
4572 // this block performs paranoid checks on map retrieval
4573 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4574 inc
.full_crc
!= 0) {
4576 uint64_t f
= encode_features
;
4578 f
= (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
4581 // encode osdmap to force calculating crcs
4583 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4584 // decode osdmap to compare crcs with what's expected by incremental
4588 if (tosdm
.get_crc() != inc
.full_crc
) {
4590 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4591 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4592 ceph_abort_msg("osdmap crc mismatch");
4596 // note: we cannot add the recently computed map to the cache, as is,
4597 // because we have not encoded the map into a bl.
4600 if (!encode_features
) {
4601 dout(10) << __func__
4602 << " last incremental map didn't have features;"
4603 << " defaulting to quorum's or all" << dendl
;
4605 (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
4607 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
// Convenience wrapper: fetch the full map at `ver` encoded with the current
// quorum connection features.
4612 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4614 return get_version_full(ver
, mon
->get_quorum_con_features(), bl
);
// Fetch the full map at `ver` for `features`, with the same cache +
// reencode strategy as get_version(). If the full map was trimmed from the
// store (-ENOENT), try rebuilding it from the pinned-map manifest.
// NOTE(review): mangled extraction — the `bufferlist& bl` parameter line,
// early returns and error handling are missing; code left byte-identical.
4617 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4620 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4621 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4624 int ret
= PaxosService::get_version_full(ver
, bl
);
// Trimmed from the store: reconstruct from the closest pinned full map
// plus incrementals (see get_full_from_pinned_map).
4625 if (ret
== -ENOENT
) {
4627 ret
= get_full_from_pinned_map(ver
, bl
);
4632 // NOTE: this check is imprecise; the OSDMap encoding features may
4633 // be a subset of the latest mon quorum features, but worst case we
4634 // reencode once and then cache the (identical) result under both
4636 if (significant_features
!=
4637 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4638 reencode_full_map(bl
, features
);
4640 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
// Queue every address in `av` for blacklisting until `until` in the pending
// incremental; returns the epoch at which the entries take effect.
// NOTE(review): mangled extraction — the else between the TYPE_ANY and
// TYPE_LEGACY branches is missing; code left byte-identical.
4644 epoch_t
OSDMonitor::blacklist(const entity_addrvec_t
& av
, utime_t until
)
4646 dout(10) << "blacklist " << av
<< " until " << until
<< dendl
;
// `a` is a copy, so mutating its type does not touch the caller's vector.
4647 for (auto a
: av
.v
) {
// Nautilus+ clusters match on any addr type; older ones need legacy type.
4648 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4649 a
.set_type(entity_addr_t::TYPE_ANY
);
4651 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4653 pending_inc
.new_blacklist
[a
] = until
;
4655 return pending_inc
.epoch
;
// Single-address variant of blacklist(); same type normalization and
// pending_inc bookkeeping as the addrvec overload.
// NOTE(review): mangled extraction — the else between the two set_type
// branches is missing; code left byte-identical.
4658 epoch_t
OSDMonitor::blacklist(entity_addr_t a
, utime_t until
)
4660 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4661 a
.set_type(entity_addr_t::TYPE_ANY
);
4663 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4665 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
4666 pending_inc
.new_blacklist
[a
] = until
;
4667 return pending_inc
.epoch
;
// Walk all sessions subscribed to "osdmap" and feed each one to
// check_osdmap_sub(). No-op before the first osdmap epoch exists.
// NOTE(review): mangled extraction — the loop advancing `p` and fetching
// `sub` is partially missing; code left byte-identical.
4671 void OSDMonitor::check_osdmap_subs()
4673 dout(10) << __func__
<< dendl
;
4674 if (!osdmap
.get_epoch()) {
4677 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
4678 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
4681 auto p
= osdmap_subs
->second
->begin();
4685 check_osdmap_sub(sub
);
// Satisfy one "osdmap" subscription: if the subscriber is behind, send
// incremental maps (or a full map, depending on the missing branch), then
// either remove a onetime sub or advance its next-wanted epoch.
// NOTE(review): mangled extraction — the if/else separating the
// incremental vs full-map paths and onetime handling is partially missing;
// code left byte-identical.
4689 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4691 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4692 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
// Only act when the subscriber has not yet seen the current epoch.
4693 if (sub
->next
<= osdmap
.get_epoch()) {
4695 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4697 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
// onetime subscriptions are discarded after being satisfied ...
4699 mon
->session_map
.remove_sub(sub
);
// ... ongoing ones just wait for the next epoch.
4701 sub
->next
= osdmap
.get_epoch() + 1;
// Notify all "osd_pg_creates" subscribers of pending PG creations. Skipped
// when no OSDs are up; all up OSDs are asserted to support stateful subs.
4705 void OSDMonitor::check_pg_creates_subs()
4707 if (!osdmap
.get_num_up_osds()) {
4710 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
// with_session_map runs the lambda under the session-map lock.
4711 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
4712 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4713 if (pg_creates_subs
== session_map
.subs
.end()) {
4716 for (auto sub
: *pg_creates_subs
->second
) {
4717 check_pg_creates_sub(sub
);
// Satisfy one "osd_pg_creates" subscription: only OSD sessions that are
// currently up receive creates; send_pg_creates() returns the epoch the
// sub is now current through (stored in sub->next).
// NOTE(review): mangled extraction — the final argument of the
// send_pg_creates call (sub->next) is among the dropped lines; code left
// byte-identical.
4722 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4724 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4725 ceph_assert(sub
->type
== "osd_pg_creates");
4726 // only send these if the OSD is up. we will check_subs() when they do
4727 // come up so they will get the creates then.
4728 if (sub
->session
->name
.is_osd() &&
4729 mon
->osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4730 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4731 sub
->session
->con
.get(),
// Enable/annotate an application on pool `pool_id` by updating its
// application_metadata in the pending incremental. Requires paxos to be
// plugged and the service writeable, and a luminous+ cluster (application
// metadata did not exist before luminous).
// NOTE(review): mangled extraction — the declaration of the working copy
// `p` (from *pp) and the else structure around the app_key cases are
// partially missing; code left byte-identical.
4736 void OSDMonitor::do_application_enable(int64_t pool_id
,
4737 const std::string
&app_name
,
4738 const std::string
&app_key
,
4739 const std::string
&app_value
,
4742 ceph_assert(paxos
->is_plugged() && is_writeable());
4744 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4747 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4749 auto pp
= osdmap
.get_pg_pool(pool_id
);
4750 ceph_assert(pp
!= nullptr);
// Prefer an already-pending copy of the pool so earlier queued changes
// in this proposal are not lost.
4753 if (pending_inc
.new_pools
.count(pool_id
)) {
4754 p
= pending_inc
.new_pools
[pool_id
];
// Empty key: just register the application with no key/value pairs.
4757 if (app_key
.empty()) {
4758 p
.application_metadata
.insert({app_name
, {}});
// App already present: set/overwrite the single key.
4761 p
.application_metadata
[app_name
][app_key
] = app_value
;
// App absent: insert it together with the initial key/value pair.
4763 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4766 p
.last_change
= pending_inc
.epoch
;
4767 pending_inc
.new_pools
[pool_id
] = p
;
// Set a single pool option `opt` to `val` in the pending incremental.
// try_emplace copies the current pool into new_pools only if it is not
// already pending, so earlier queued changes are preserved.
4770 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4771 pool_opts_t::key_t opt
,
4772 pool_opts_t::value_t val
)
4774 auto p
= pending_inc
.new_pools
.try_emplace(
4775 pool_id
, *osdmap
.get_pg_pool(pool_id
));
// p.first is the map iterator regardless of whether insertion happened.
4776 p
.first
->second
.opts
.set(opt
, val
);
4779 unsigned OSDMonitor::scan_for_creating_pgs(
4780 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4781 const mempool::osdmap::set
<int64_t>& removed_pools
,
4783 creating_pgs_t
* creating_pgs
) const
4785 unsigned queued
= 0;
4786 for (auto& p
: pools
) {
4787 int64_t poolid
= p
.first
;
4788 if (creating_pgs
->created_pools
.count(poolid
)) {
4789 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4792 const pg_pool_t
& pool
= p
.second
;
4793 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
4794 pool
.get_type(), pool
.get_size());
4795 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4798 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4799 const auto created
= pool
.get_last_change();
4800 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4801 dout(10) << __func__
<< " no change in pool " << poolid
4802 << " " << pool
<< dendl
;
4805 if (removed_pools
.count(poolid
)) {
4806 dout(10) << __func__
<< " pool is being removed: " << poolid
4807 << " " << pool
<< dendl
;
4810 dout(10) << __func__
<< " queueing pool create for " << poolid
4811 << " " << pool
<< dendl
;
4812 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
4819 void OSDMonitor::update_creating_pgs()
4821 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4822 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4823 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
4824 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4825 for (const auto& pg
: creating_pgs
.pgs
) {
4826 int acting_primary
= -1;
4827 auto pgid
= pg
.first
;
4828 if (!osdmap
.pg_exists(pgid
)) {
4829 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4833 auto mapped
= pg
.second
.create_epoch
;
4834 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4836 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4837 // check the previous creating_pgs, look for the target to whom the pg was
4838 // previously mapped
4839 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
4840 const auto last_acting_primary
= pgs_by_epoch
.first
;
4841 for (auto& pgs
: pgs_by_epoch
.second
) {
4842 if (pgs
.second
.count(spgid
)) {
4843 if (last_acting_primary
== acting_primary
) {
4846 dout(20) << __func__
<< " " << pgid
<< " "
4847 << " acting_primary:" << last_acting_primary
4848 << " -> " << acting_primary
<< dendl
;
4849 // note epoch if the target of the create message changed.
4850 mapped
= mapping
.get_epoch();
4855 mapped
= mapping
.get_epoch();
4859 dout(10) << __func__
<< " will instruct osd." << acting_primary
4860 << " to create " << pgid
<< "@" << mapped
<< dendl
;
4861 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
4863 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
4864 creating_pgs_epoch
= mapping
.get_epoch();
4867 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
4869 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
4870 << " " << creating_pgs_by_osd_epoch
<< dendl
;
4871 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4872 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
4873 dout(20) << __func__
4874 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
4875 // the subscribers will be updated when the mapping is completed anyway
4878 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
4879 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
4881 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
4883 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
4884 MOSDPGCreate2
*m
= nullptr;
4886 bool old
= osdmap
.require_osd_release
< ceph_release_t::nautilus
;
4889 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
4890 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
4891 auto epoch
= epoch_pgs
->first
;
4892 auto& pgs
= epoch_pgs
->second
;
4893 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4894 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
4896 for (auto& pg
: pgs
) {
4897 // Need the create time from the monitor using its clock to set
4898 // last_scrub_stamp upon pg creation.
4899 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
4900 ceph_assert(create
!= creating_pgs
.pgs
.end());
4903 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
4905 oldm
->mkpg
.emplace(pg
.pgid
,
4906 pg_create_t
{create
->second
.create_epoch
, pg
.pgid
, 0});
4907 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.create_stamp
);
4910 m
= new MOSDPGCreate2(creating_pgs_epoch
);
4912 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
4913 create
->second
.create_stamp
));
4914 if (create
->second
.history
.epoch_created
) {
4915 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
4916 << " " << create
->second
.past_intervals
<< dendl
;
4917 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
4918 create
->second
.past_intervals
));
4921 dout(20) << __func__
<< " will create " << pg
4922 << " at " << create
->second
.create_epoch
<< dendl
;
4926 con
->send_message(m
);
4928 con
->send_message(oldm
);
4930 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4931 << " has nothing to send" << dendl
;
4935 // sub is current through last + 1
4942 void OSDMonitor::tick()
4944 if (!is_active()) return;
4946 dout(10) << osdmap
<< dendl
;
4948 // always update osdmap manifest, regardless of being the leader.
4949 load_osdmap_manifest();
4951 // always tune priority cache manager memory on leader and peons
4952 if (ceph_using_tcmalloc() && mon_memory_autotune
) {
4953 std::lock_guard
l(balancer_lock
);
4954 if (pcm
!= nullptr) {
4957 _set_new_cache_sizes();
4958 dout(10) << "tick balancer "
4959 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
4960 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
4961 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
4962 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
4964 dout(10) << "tick balancer "
4965 << " full cache_bytes: " << full_cache
->get_cache_bytes()
4966 << " full comtd_bytes: " << full_cache
->get_committed_size()
4967 << " full used_bytes: " << full_cache
->_get_used_bytes()
4968 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
4973 if (!mon
->is_leader()) return;
4975 bool do_propose
= false;
4976 utime_t now
= ceph_clock_now();
4978 if (handle_osd_timeouts(now
, last_osd_report
)) {
4983 if (check_failures(now
)) {
4987 // Force a proposal if we need to prune; pruning is performed on
4988 // ``encode_pending()``, hence why we need to regularly trigger a proposal
4989 // even if there's nothing going on.
4990 if (is_prune_enabled() && should_prune()) {
4994 // mark down osds out?
4996 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
4997 * influence at all. The decision is made based on the ratio of "in" osds,
4998 * and the function returns false if this ratio is lower that the minimum
4999 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
5001 if (can_mark_out(-1)) {
5002 string down_out_subtree_limit
= g_conf().get_val
<string
>(
5003 "mon_osd_down_out_subtree_limit");
5004 set
<int> down_cache
; // quick cache of down subtrees
5006 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
5007 while (i
!= down_pending_out
.end()) {
5013 if (osdmap
.is_down(o
) &&
5016 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
5017 utime_t grace
= orig_grace
;
5018 double my_grace
= 0.0;
5020 if (g_conf()->mon_osd_adjust_down_out_interval
) {
5021 // scale grace period the same way we do the heartbeat grace.
5022 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
5023 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
5024 double decay_k
= ::log(.5) / halflife
;
5025 double decay
= exp((double)down
* decay_k
);
5026 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
5027 << " down for " << down
<< " decay " << decay
<< dendl
;
5028 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
5032 // is this an entire large subtree down?
5033 if (down_out_subtree_limit
.length()) {
5034 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
5036 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
5037 dout(10) << "tick entire containing " << down_out_subtree_limit
5038 << " subtree for osd." << o
5039 << " is down; resetting timer" << dendl
;
5040 // reset timer, too.
5041 down_pending_out
[o
] = now
;
5047 bool down_out
= !osdmap
.is_destroyed(o
) &&
5048 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5049 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5050 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5051 // this is not precise enough as we did not make a note when this osd
5052 // was marked as destroyed, but let's not bother with that
5053 // complexity for now.
5054 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5055 if (down_out
|| destroyed_out
) {
5056 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5057 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5058 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5060 // set the AUTOOUT bit.
5061 if (pending_inc
.new_state
.count(o
) == 0)
5062 pending_inc
.new_state
[o
] = 0;
5063 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5065 // remember previous weight
5066 if (pending_inc
.new_xinfo
.count(o
) == 0)
5067 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5068 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5072 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
5073 << int(down
.sec()) << " seconds)";
5078 down_pending_out
.erase(o
);
5081 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5084 // expire blacklisted items?
5085 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5086 p
!= osdmap
.blacklist
.end();
5088 if (p
->second
< now
) {
5089 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5090 pending_inc
.old_blacklist
.push_back(p
->first
);
5095 if (try_prune_purged_snaps()) {
5099 if (update_pools_status())
5103 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
// Pull the latest tuned allocations from the priority cache manager and
// apply them to the incremental/full osdmap LRU caches. All sizes default
// to 0 when pcm or the rocksdb cache shim is absent.
// NOTE(review): mangled extraction — the closing of the dout statement and
// function end are among the dropped lines; code left byte-identical.
5107 void OSDMonitor::_set_new_cache_sizes()
5109 uint64_t cache_size
= 0;
5110 int64_t inc_alloc
= 0;
5111 int64_t full_alloc
= 0;
5112 int64_t kv_alloc
= 0;
5114 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5115 cache_size
= pcm
->get_tuned_mem();
5116 inc_alloc
= inc_cache
->get_committed_size();
5117 full_alloc
= full_cache
->get_committed_size();
5118 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
// Resize the osdmap caches to the committed allocations.
5121 inc_osd_cache
.set_bytes(inc_alloc
);
5122 full_osd_cache
.set_bytes(full_alloc
);
5124 dout(1) << __func__
<< " cache_size:" << cache_size
5125 << " inc_alloc: " << inc_alloc
5126 << " full_alloc: " << full_alloc
5127 << " kv_alloc: " << kv_alloc
// Scan all OSDs and mark down any up OSD that has not reported (beacon)
// within mon_osd_report_timeout. `last_osd_report` tracks the last report
// time per OSD and is pruned for nonexistent OSDs. Returns whether any OSD
// was newly marked down (per `new_down`; the return statement is among the
// dropped lines).
// NOTE(review): mangled extraction — the timeout comparison on `diff`,
// `new_down = true`, continues and returns are partially missing; code
// left byte-identical.
5131 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5132 std::map
<int,utime_t
> &last_osd_report
)
5134 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
// Avoid false positives right after a leader change: OSDs report to the
// leader, so the clock only starts once we have led long enough.
5135 if (now
- mon
->get_leader_since() < timeo
) {
5136 // We haven't been the leader for long enough to consider OSD timeouts
5140 int max_osd
= osdmap
.get_max_osd();
5141 bool new_down
= false;
5143 for (int i
=0; i
< max_osd
; ++i
) {
5144 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5145 if (!osdmap
.exists(i
)) {
5146 last_osd_report
.erase(i
); // if any
// Down OSDs are not expected to beacon; skip them.
5149 if (!osdmap
.is_up(i
))
5151 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
5152 if (t
== last_osd_report
.end()) {
5153 // it wasn't in the map; start the timer.
5154 last_osd_report
[i
] = now
;
5155 } else if (can_mark_down(i
)) {
5156 utime_t diff
= now
- t
->second
;
5158 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
5159 << diff
<< " seconds";
5160 derr
<< "no beacon from osd." << i
<< " since " << t
->second
5161 << ", " << diff
<< " seconds ago. marking down" << dendl
;
// Toggling the CEPH_OSD_UP state bit in the pending inc marks it down.
5162 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
// Parse a CPU-set list string (e.g. "0-3,8") and dump the resulting CPU
// ids as a formatter array named `name`. Silently returns on parse failure.
// NOTE(review): mangled extraction — the cpu_set_t declaration and the
// close_section() call are among the dropped lines; code left
// byte-identical.
5170 static void dump_cpu_list(Formatter
*f
, const char *name
,
5171 const string
& strlist
)
5174 size_t cpu_set_size
;
5175 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
5178 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5179 f
->open_array_section(name
);
5180 for (auto cpu
: cpus
) {
5181 f
->dump_int("cpu", cpu
);
5186 void OSDMonitor::dump_info(Formatter
*f
)
5188 f
->open_object_section("osdmap");
5192 f
->open_array_section("osd_metadata");
5193 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5194 if (osdmap
.exists(i
)) {
5195 f
->open_object_section("osd");
5196 f
->dump_unsigned("id", i
);
5197 dump_osd_metadata(i
, f
, NULL
);
5203 f
->open_object_section("osdmap_clean_epochs");
5204 f
->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5206 f
->open_object_section("last_epoch_clean");
5207 last_epoch_clean
.dump(f
);
5210 f
->open_array_section("osd_epochs");
5211 for (auto& osd_epoch
: osd_epochs
) {
5212 f
->open_object_section("osd");
5213 f
->dump_unsigned("id", osd_epoch
.first
);
5214 f
->dump_unsigned("epoch", osd_epoch
.second
);
5217 f
->close_section(); // osd_epochs
5219 f
->close_section(); // osd_clean_epochs
5221 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5222 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5224 f
->open_object_section("crushmap");
5225 osdmap
.crush
->dump(f
);
5228 if (has_osdmap_manifest
) {
5229 f
->open_object_section("osdmap_manifest");
5230 osdmap_manifest
.dump(f
);
5236 enum osd_pool_get_choices
{
5238 PG_NUM
, PGP_NUM
, CRUSH_RULE
, HASHPSPOOL
, EC_OVERWRITES
,
5239 NODELETE
, NOPGCHANGE
, NOSIZECHANGE
,
5240 WRITE_FADVISE_DONTNEED
, NOSCRUB
, NODEEP_SCRUB
,
5241 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5242 USE_GMT_HITSET
, TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
,
5243 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5244 CACHE_TARGET_FULL_RATIO
,
5245 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5246 ERASURE_CODE_PROFILE
, MIN_READ_RECENCY_FOR_PROMOTE
,
5247 MIN_WRITE_RECENCY_FOR_PROMOTE
, FAST_READ
,
5248 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
,
5249 SCRUB_MIN_INTERVAL
, SCRUB_MAX_INTERVAL
, DEEP_SCRUB_INTERVAL
,
5250 RECOVERY_PRIORITY
, RECOVERY_OP_PRIORITY
, SCRUB_PRIORITY
,
5251 COMPRESSION_MODE
, COMPRESSION_ALGORITHM
, COMPRESSION_REQUIRED_RATIO
,
5252 COMPRESSION_MAX_BLOB_SIZE
, COMPRESSION_MIN_BLOB_SIZE
,
5253 CSUM_TYPE
, CSUM_MAX_BLOCK
, CSUM_MIN_BLOCK
, FINGERPRINT_ALGORITHM
,
5254 PG_AUTOSCALE_MODE
, PG_NUM_MIN
, TARGET_SIZE_BYTES
, TARGET_SIZE_RATIO
,
5255 PG_AUTOSCALE_BIAS
};
// Set difference helper for the pool-get choice enum: returns the elements
// of `first` that are not in `second` (both sets are ordered, as
// std::set_difference requires).
// NOTE(review): mangled extraction — the `return result;` line is among
// the dropped lines; code left byte-identical.
5257 std::set
<osd_pool_get_choices
>
5258 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5259 const std::set
<osd_pool_get_choices
>& second
)
5261 std::set
<osd_pool_get_choices
> result
;
5262 std::set_difference(first
.begin(), first
.end(),
5263 second
.begin(), second
.end(),
5264 std::inserter(result
, result
.end()));
5270 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5272 op
->mark_osdmon_event(__func__
);
5273 auto m
= op
->get_req
<MMonCommand
>();
5276 stringstream ss
, ds
;
5279 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5280 string rs
= ss
.str();
5281 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
5285 MonSession
*session
= op
->get_session();
5287 derr
<< __func__
<< " no session" << dendl
;
5288 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
5293 cmd_getval(cmdmap
, "prefix", prefix
);
5296 cmd_getval(cmdmap
, "format", format
, string("plain"));
5297 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5299 if (prefix
== "osd stat") {
5301 f
->open_object_section("osdmap");
5302 osdmap
.print_summary(f
.get(), ds
, "", true);
5306 osdmap
.print_summary(nullptr, ds
, "", true);
5310 else if (prefix
== "osd dump" ||
5311 prefix
== "osd tree" ||
5312 prefix
== "osd tree-from" ||
5313 prefix
== "osd ls" ||
5314 prefix
== "osd getmap" ||
5315 prefix
== "osd getcrushmap" ||
5316 prefix
== "osd ls-tree" ||
5317 prefix
== "osd info") {
5322 cmd_getval(cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
5325 bufferlist osdmap_bl
;
5326 int err
= get_version_full(epoch
, osdmap_bl
);
5327 if (err
== -ENOENT
) {
5329 ss
<< "there is no map for epoch " << epoch
;
5332 ceph_assert(err
== 0);
5333 ceph_assert(osdmap_bl
.length());
5336 if (epoch
== osdmap
.get_epoch()) {
5340 p
->decode(osdmap_bl
);
5343 auto sg
= make_scope_guard([&] {
5349 if (prefix
== "osd dump") {
5352 f
->open_object_section("osdmap");
5362 } else if (prefix
== "osd ls") {
5364 f
->open_array_section("osds");
5365 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5366 if (osdmap
.exists(i
)) {
5367 f
->dump_int("osd", i
);
5374 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5375 if (osdmap
.exists(i
)) {
5384 } else if (prefix
== "osd info") {
5386 bool do_single_osd
= true;
5387 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5388 do_single_osd
= false;
5391 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5392 ss
<< "osd." << osd_id
<< " does not exist";
5398 if (do_single_osd
) {
5399 osdmap
.dump_osd(osd_id
, f
.get());
5401 osdmap
.dump_osds(f
.get());
5405 if (do_single_osd
) {
5406 osdmap
.print_osd(osd_id
, ds
);
5408 osdmap
.print_osds(ds
);
5412 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5414 if (prefix
== "osd tree-from") {
5415 cmd_getval(cmdmap
, "bucket", bucket
);
5416 if (!osdmap
.crush
->name_exists(bucket
)) {
5417 ss
<< "bucket '" << bucket
<< "' does not exist";
5421 int id
= osdmap
.crush
->get_item_id(bucket
);
5423 ss
<< "\"" << bucket
<< "\" is not a bucket";
5429 vector
<string
> states
;
5430 cmd_getval(cmdmap
, "states", states
);
5431 unsigned filter
= 0;
5432 for (auto& s
: states
) {
5434 filter
|= OSDMap::DUMP_UP
;
5435 } else if (s
== "down") {
5436 filter
|= OSDMap::DUMP_DOWN
;
5437 } else if (s
== "in") {
5438 filter
|= OSDMap::DUMP_IN
;
5439 } else if (s
== "out") {
5440 filter
|= OSDMap::DUMP_OUT
;
5441 } else if (s
== "destroyed") {
5442 filter
|= OSDMap::DUMP_DESTROYED
;
5444 ss
<< "unrecognized state '" << s
<< "'";
5449 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5450 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5451 ss
<< "cannot specify both 'in' and 'out'";
5455 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5456 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5457 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5458 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5459 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5460 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5461 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5466 f
->open_object_section("tree");
5467 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5471 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5474 } else if (prefix
== "osd getmap") {
5475 rdata
.append(osdmap_bl
);
5476 ss
<< "got osdmap epoch " << p
->get_epoch();
5477 } else if (prefix
== "osd getcrushmap") {
5478 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
5479 ss
<< p
->get_crush_version();
5480 } else if (prefix
== "osd ls-tree") {
5482 cmd_getval(cmdmap
, "name", bucket_name
);
5484 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5486 ss
<< "\"" << bucket_name
<< "\" does not exist";
5489 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5494 f
->open_array_section("osds");
5495 for (auto &i
: osds
) {
5496 if (osdmap
.exists(i
)) {
5497 f
->dump_int("osd", i
);
5504 for (auto &i
: osds
) {
5505 if (osdmap
.exists(i
)) {
5516 } else if (prefix
== "osd getmaxosd") {
5518 f
->open_object_section("getmaxosd");
5519 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5520 f
->dump_int("max_osd", osdmap
.get_max_osd());
5524 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5527 } else if (prefix
== "osd utilization") {
5529 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5536 } else if (prefix
== "osd find") {
5538 if (!cmd_getval(cmdmap
, "id", osd
)) {
5539 ss
<< "unable to parse osd id value '"
5540 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5544 if (!osdmap
.exists(osd
)) {
5545 ss
<< "osd." << osd
<< " does not exist";
5550 cmd_getval(cmdmap
, "format", format
);
5551 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5552 f
->open_object_section("osd_location");
5553 f
->dump_int("osd", osd
);
5554 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5555 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5557 // try to identify host, pod/container name, etc.
5558 map
<string
,string
> m
;
5559 load_metadata(osd
, m
, nullptr);
5560 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5561 f
->dump_string("host", p
->second
);
5564 "pod_name", "pod_namespace", // set by rook
5565 "container_name" // set by cephadm, ceph-ansible
5567 if (auto p
= m
.find(k
); p
!= m
.end()) {
5568 f
->dump_string(k
, p
->second
);
5572 // crush is helpful too
5573 f
->open_object_section("crush_location");
5574 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5575 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5576 f
->dump_string(p
->first
.c_str(), p
->second
);
5580 } else if (prefix
== "osd metadata") {
5582 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5583 !cmd_getval(cmdmap
, "id", osd
)) {
5584 ss
<< "unable to parse osd id value '"
5585 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5589 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5590 ss
<< "osd." << osd
<< " does not exist";
5595 cmd_getval(cmdmap
, "format", format
);
5596 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5598 f
->open_object_section("osd_metadata");
5599 f
->dump_unsigned("id", osd
);
5600 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5606 f
->open_array_section("osd_metadata");
5607 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5608 if (osdmap
.exists(i
)) {
5609 f
->open_object_section("osd");
5610 f
->dump_unsigned("id", i
);
5611 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5612 if (r
== -EINVAL
|| r
== -ENOENT
) {
5613 // Drop error, continue to get other daemons' metadata
5614 dout(4) << "No metadata for osd." << i
<< dendl
;
5626 } else if (prefix
== "osd versions") {
5628 f
.reset(Formatter::create("json-pretty"));
5629 count_metadata("ceph_version", f
.get());
5632 } else if (prefix
== "osd count-metadata") {
5634 f
.reset(Formatter::create("json-pretty"));
5636 cmd_getval(cmdmap
, "property", field
);
5637 count_metadata(field
, f
.get());
5640 } else if (prefix
== "osd numa-status") {
5643 f
->open_array_section("osds");
5645 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5646 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5647 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5648 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5649 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5650 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5652 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5653 if (osdmap
.exists(i
)) {
5654 map
<string
,string
> m
;
5656 if (load_metadata(i
, m
, &err
) < 0) {
5660 auto p
= m
.find("hostname");
5665 f
->open_object_section("osd");
5666 f
->dump_int("osd", i
);
5667 f
->dump_string("host", host
);
5668 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5672 f
->dump_int(n
, atoi(p
->second
.c_str()));
5675 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5678 list
<string
> ls
= get_str_list(p
->second
, ",");
5679 f
->open_array_section(n
);
5680 for (auto node
: ls
) {
5681 f
->dump_int("node", atoi(node
.c_str()));
5686 for (auto n
: { "numa_node_cpus" }) {
5689 dump_cpu_list(f
.get(), n
, p
->second
);
5696 p
= m
.find("network_numa_nodes");
5702 p
= m
.find("objectstore_numa_nodes");
5708 p
= m
.find("numa_node");
5709 auto q
= m
.find("numa_node_cpus");
5710 if (p
!= m
.end() && q
!= m
.end()) {
5717 tbl
<< TextTable::endrow
;
5725 rdata
.append(stringify(tbl
));
5727 } else if (prefix
== "osd map") {
5728 string poolstr
, objstr
, namespacestr
;
5729 cmd_getval(cmdmap
, "pool", poolstr
);
5730 cmd_getval(cmdmap
, "object", objstr
);
5731 cmd_getval(cmdmap
, "nspace", namespacestr
);
5733 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5735 ss
<< "pool " << poolstr
<< " does not exist";
5739 object_locator_t
oloc(pool
, namespacestr
);
5740 object_t
oid(objstr
);
5741 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5742 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5743 vector
<int> up
, acting
;
5745 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5748 if (!namespacestr
.empty())
5749 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5751 fullobjname
= oid
.name
;
5753 f
->open_object_section("osd_map");
5754 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5755 f
->dump_string("pool", poolstr
);
5756 f
->dump_int("pool_id", pool
);
5757 f
->dump_stream("objname") << fullobjname
;
5758 f
->dump_stream("raw_pgid") << pgid
;
5759 f
->dump_stream("pgid") << mpgid
;
5760 f
->open_array_section("up");
5761 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5762 f
->dump_int("osd", *p
);
5764 f
->dump_int("up_primary", up_p
);
5765 f
->open_array_section("acting");
5766 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5767 f
->dump_int("osd", *p
);
5769 f
->dump_int("acting_primary", acting_p
);
5770 f
->close_section(); // osd_map
5773 ds
<< "osdmap e" << osdmap
.get_epoch()
5774 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5775 << " object '" << fullobjname
<< "' ->"
5776 << " pg " << pgid
<< " (" << mpgid
<< ")"
5777 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5778 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5782 } else if (prefix
== "pg map") {
5785 cmd_getval(cmdmap
, "pgid", pgidstr
);
5786 if (!pgid
.parse(pgidstr
.c_str())) {
5787 ss
<< "invalid pgid '" << pgidstr
<< "'";
5791 vector
<int> up
, acting
;
5792 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5793 ss
<< "pg '" << pgidstr
<< "' does not exist";
5797 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5798 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5800 f
->open_object_section("pg_map");
5801 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5802 f
->dump_stream("raw_pgid") << pgid
;
5803 f
->dump_stream("pgid") << mpgid
;
5804 f
->open_array_section("up");
5805 for (auto osd
: up
) {
5806 f
->dump_int("up_osd", osd
);
5809 f
->open_array_section("acting");
5810 for (auto osd
: acting
) {
5811 f
->dump_int("acting_osd", osd
);
5817 ds
<< "osdmap e" << osdmap
.get_epoch()
5818 << " pg " << pgid
<< " (" << mpgid
<< ")"
5819 << " -> up " << up
<< " acting " << acting
;
5824 } else if (prefix
== "osd lspools") {
5826 f
->open_array_section("pools");
5827 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
5828 p
!= osdmap
.pools
.end();
5831 f
->open_object_section("pool");
5832 f
->dump_int("poolnum", p
->first
);
5833 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
5836 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
5837 if (next(p
) != osdmap
.pools
.end()) {
5847 } else if (prefix
== "osd blacklist ls") {
5849 f
->open_array_section("blacklist");
5851 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5852 p
!= osdmap
.blacklist
.end();
5855 f
->open_object_section("entry");
5856 f
->dump_string("addr", p
->first
.get_legacy_str());
5857 f
->dump_stream("until") << p
->second
;
5862 ss
<< p
->first
<< " " << p
->second
;
5872 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
5874 } else if (prefix
== "osd pool ls") {
5876 cmd_getval(cmdmap
, "detail", detail
);
5877 if (!f
&& detail
== "detail") {
5879 osdmap
.print_pools(ss
);
5880 rdata
.append(ss
.str());
5883 f
->open_array_section("pools");
5884 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
5885 it
!= osdmap
.get_pools().end();
5888 if (detail
== "detail") {
5889 f
->open_object_section("pool");
5890 f
->dump_int("pool_id", it
->first
);
5891 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5892 it
->second
.dump(f
.get());
5895 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5898 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
5907 } else if (prefix
== "osd crush get-tunable") {
5909 cmd_getval(cmdmap
, "tunable", tunable
);
5912 f
->open_object_section("tunable");
5913 if (tunable
== "straw_calc_version") {
5915 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
5917 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
5926 rdata
.append(rss
.str());
5930 } else if (prefix
== "osd pool get") {
5932 cmd_getval(cmdmap
, "pool", poolstr
);
5933 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5935 ss
<< "unrecognized pool '" << poolstr
<< "'";
5940 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
5942 cmd_getval(cmdmap
, "var", var
);
5944 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
5945 const choices_map_t ALL_CHOICES
= {
5947 {"min_size", MIN_SIZE
},
5948 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
5949 {"crush_rule", CRUSH_RULE
}, {"hashpspool", HASHPSPOOL
},
5950 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
5951 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
5952 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
5953 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
5954 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
5955 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
5956 {"use_gmt_hitset", USE_GMT_HITSET
},
5957 {"target_max_objects", TARGET_MAX_OBJECTS
},
5958 {"target_max_bytes", TARGET_MAX_BYTES
},
5959 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
5960 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
5961 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
5962 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
5963 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
5964 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
5965 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
5966 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
5967 {"fast_read", FAST_READ
},
5968 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
5969 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
5970 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
5971 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
5972 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
5973 {"recovery_priority", RECOVERY_PRIORITY
},
5974 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
5975 {"scrub_priority", SCRUB_PRIORITY
},
5976 {"compression_mode", COMPRESSION_MODE
},
5977 {"compression_algorithm", COMPRESSION_ALGORITHM
},
5978 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
5979 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
5980 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
5981 {"csum_type", CSUM_TYPE
},
5982 {"csum_max_block", CSUM_MAX_BLOCK
},
5983 {"csum_min_block", CSUM_MIN_BLOCK
},
5984 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
5985 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
5986 {"pg_num_min", PG_NUM_MIN
},
5987 {"target_size_bytes", TARGET_SIZE_BYTES
},
5988 {"target_size_ratio", TARGET_SIZE_RATIO
},
5989 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
5992 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
5994 const choices_set_t ONLY_TIER_CHOICES
= {
5995 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5996 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
5997 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5998 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5999 MIN_READ_RECENCY_FOR_PROMOTE
,
6000 MIN_WRITE_RECENCY_FOR_PROMOTE
,
6001 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
6003 const choices_set_t ONLY_ERASURE_CHOICES
= {
6004 EC_OVERWRITES
, ERASURE_CODE_PROFILE
6007 choices_set_t selected_choices
;
6009 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
6010 it
!= ALL_CHOICES
.end(); ++it
) {
6011 selected_choices
.insert(it
->second
);
6015 selected_choices
= subtract_second_from_first(selected_choices
,
6019 if(!p
->is_erasure()) {
6020 selected_choices
= subtract_second_from_first(selected_choices
,
6021 ONLY_ERASURE_CHOICES
);
6023 } else /* var != "all" */ {
6024 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
6025 osd_pool_get_choices selected
= found
->second
;
6027 if (!p
->is_tier() &&
6028 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
6029 ss
<< "pool '" << poolstr
6030 << "' is not a tier pool: variable not applicable";
6035 if (!p
->is_erasure() &&
6036 ONLY_ERASURE_CHOICES
.find(selected
)
6037 != ONLY_ERASURE_CHOICES
.end()) {
6038 ss
<< "pool '" << poolstr
6039 << "' is not a erasure pool: variable not applicable";
6044 if (pool_opts_t::is_opt_name(var
) &&
6045 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6046 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6051 selected_choices
.insert(selected
);
6055 f
->open_object_section("pool");
6056 f
->dump_string("pool", poolstr
);
6057 f
->dump_int("pool_id", pool
);
6058 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6059 it
!= selected_choices
.end(); ++it
) {
6060 choices_map_t::const_iterator i
;
6061 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6062 if (i
->second
== *it
) {
6066 ceph_assert(i
!= ALL_CHOICES
.end());
6069 f
->dump_int("pg_num", p
->get_pg_num());
6072 f
->dump_int("pgp_num", p
->get_pgp_num());
6075 f
->dump_int("size", p
->get_size());
6078 f
->dump_int("min_size", p
->get_min_size());
6081 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6082 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6083 p
->get_crush_rule()));
6085 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6089 f
->dump_bool("allow_ec_overwrites",
6090 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6092 case PG_AUTOSCALE_MODE
:
6093 f
->dump_string("pg_autoscale_mode",
6094 pg_pool_t::get_pg_autoscale_mode_name(
6095 p
->pg_autoscale_mode
));
6101 case WRITE_FADVISE_DONTNEED
:
6104 f
->dump_bool(i
->first
.c_str(),
6105 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6107 case HIT_SET_PERIOD
:
6108 f
->dump_int("hit_set_period", p
->hit_set_period
);
6111 f
->dump_int("hit_set_count", p
->hit_set_count
);
6114 f
->dump_string("hit_set_type",
6115 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6119 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6120 BloomHitSet::Params
*bloomp
=
6121 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6122 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6123 } else if(var
!= "all") {
6125 ss
<< "hit set is not of type Bloom; " <<
6126 "invalid to get a false positive rate!";
6132 case USE_GMT_HITSET
:
6133 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6135 case TARGET_MAX_OBJECTS
:
6136 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6138 case TARGET_MAX_BYTES
:
6139 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6141 case CACHE_TARGET_DIRTY_RATIO
:
6142 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6143 p
->cache_target_dirty_ratio_micro
);
6144 f
->dump_float("cache_target_dirty_ratio",
6145 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6147 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6148 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6149 p
->cache_target_dirty_high_ratio_micro
);
6150 f
->dump_float("cache_target_dirty_high_ratio",
6151 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6153 case CACHE_TARGET_FULL_RATIO
:
6154 f
->dump_unsigned("cache_target_full_ratio_micro",
6155 p
->cache_target_full_ratio_micro
);
6156 f
->dump_float("cache_target_full_ratio",
6157 ((float)p
->cache_target_full_ratio_micro
/1000000));
6159 case CACHE_MIN_FLUSH_AGE
:
6160 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6162 case CACHE_MIN_EVICT_AGE
:
6163 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6165 case ERASURE_CODE_PROFILE
:
6166 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6168 case MIN_READ_RECENCY_FOR_PROMOTE
:
6169 f
->dump_int("min_read_recency_for_promote",
6170 p
->min_read_recency_for_promote
);
6172 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6173 f
->dump_int("min_write_recency_for_promote",
6174 p
->min_write_recency_for_promote
);
6177 f
->dump_int("fast_read", p
->fast_read
);
6179 case HIT_SET_GRADE_DECAY_RATE
:
6180 f
->dump_int("hit_set_grade_decay_rate",
6181 p
->hit_set_grade_decay_rate
);
6183 case HIT_SET_SEARCH_LAST_N
:
6184 f
->dump_int("hit_set_search_last_n",
6185 p
->hit_set_search_last_n
);
6187 case SCRUB_MIN_INTERVAL
:
6188 case SCRUB_MAX_INTERVAL
:
6189 case DEEP_SCRUB_INTERVAL
:
6190 case RECOVERY_PRIORITY
:
6191 case RECOVERY_OP_PRIORITY
:
6192 case SCRUB_PRIORITY
:
6193 case COMPRESSION_MODE
:
6194 case COMPRESSION_ALGORITHM
:
6195 case COMPRESSION_REQUIRED_RATIO
:
6196 case COMPRESSION_MAX_BLOB_SIZE
:
6197 case COMPRESSION_MIN_BLOB_SIZE
:
6199 case CSUM_MAX_BLOCK
:
6200 case CSUM_MIN_BLOCK
:
6201 case FINGERPRINT_ALGORITHM
:
6203 case TARGET_SIZE_BYTES
:
6204 case TARGET_SIZE_RATIO
:
6205 case PG_AUTOSCALE_BIAS
:
6206 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6207 if (p
->opts
.is_set(key
)) {
6208 if(*it
== CSUM_TYPE
) {
6210 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6211 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6213 p
->opts
.dump(i
->first
, f
.get());
6222 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6223 it
!= selected_choices
.end(); ++it
) {
6224 choices_map_t::const_iterator i
;
6227 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6230 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6233 ss
<< "size: " << p
->get_size() << "\n";
6236 ss
<< "min_size: " << p
->get_min_size() << "\n";
6239 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6240 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6241 p
->get_crush_rule()) << "\n";
6243 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6246 case PG_AUTOSCALE_MODE
:
6247 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6248 p
->pg_autoscale_mode
) <<"\n";
6250 case HIT_SET_PERIOD
:
6251 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6254 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6257 ss
<< "hit_set_type: " <<
6258 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6262 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6263 BloomHitSet::Params
*bloomp
=
6264 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6265 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6266 } else if(var
!= "all") {
6267 ss
<< "hit set is not of type Bloom; " <<
6268 "invalid to get a false positive rate!";
6274 case USE_GMT_HITSET
:
6275 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6277 case TARGET_MAX_OBJECTS
:
6278 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6280 case TARGET_MAX_BYTES
:
6281 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6283 case CACHE_TARGET_DIRTY_RATIO
:
6284 ss
<< "cache_target_dirty_ratio: "
6285 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6287 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6288 ss
<< "cache_target_dirty_high_ratio: "
6289 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6291 case CACHE_TARGET_FULL_RATIO
:
6292 ss
<< "cache_target_full_ratio: "
6293 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6295 case CACHE_MIN_FLUSH_AGE
:
6296 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6298 case CACHE_MIN_EVICT_AGE
:
6299 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6301 case ERASURE_CODE_PROFILE
:
6302 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6304 case MIN_READ_RECENCY_FOR_PROMOTE
:
6305 ss
<< "min_read_recency_for_promote: " <<
6306 p
->min_read_recency_for_promote
<< "\n";
6308 case HIT_SET_GRADE_DECAY_RATE
:
6309 ss
<< "hit_set_grade_decay_rate: " <<
6310 p
->hit_set_grade_decay_rate
<< "\n";
6312 case HIT_SET_SEARCH_LAST_N
:
6313 ss
<< "hit_set_search_last_n: " <<
6314 p
->hit_set_search_last_n
<< "\n";
6317 ss
<< "allow_ec_overwrites: " <<
6318 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6325 case WRITE_FADVISE_DONTNEED
:
6328 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6329 if (i
->second
== *it
)
6332 ceph_assert(i
!= ALL_CHOICES
.end());
6333 ss
<< i
->first
<< ": " <<
6334 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6335 "true" : "false") << "\n";
6337 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6338 ss
<< "min_write_recency_for_promote: " <<
6339 p
->min_write_recency_for_promote
<< "\n";
6342 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6344 case SCRUB_MIN_INTERVAL
:
6345 case SCRUB_MAX_INTERVAL
:
6346 case DEEP_SCRUB_INTERVAL
:
6347 case RECOVERY_PRIORITY
:
6348 case RECOVERY_OP_PRIORITY
:
6349 case SCRUB_PRIORITY
:
6350 case COMPRESSION_MODE
:
6351 case COMPRESSION_ALGORITHM
:
6352 case COMPRESSION_REQUIRED_RATIO
:
6353 case COMPRESSION_MAX_BLOB_SIZE
:
6354 case COMPRESSION_MIN_BLOB_SIZE
:
6356 case CSUM_MAX_BLOCK
:
6357 case CSUM_MIN_BLOCK
:
6358 case FINGERPRINT_ALGORITHM
:
6360 case TARGET_SIZE_BYTES
:
6361 case TARGET_SIZE_RATIO
:
6362 case PG_AUTOSCALE_BIAS
:
6363 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6364 if (i
->second
== *it
)
6367 ceph_assert(i
!= ALL_CHOICES
.end());
6369 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6370 if (p
->opts
.is_set(key
)) {
6371 if(key
== pool_opts_t::CSUM_TYPE
) {
6373 p
->opts
.get(key
, &val
);
6374 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6376 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6382 rdata
.append(ss
.str());
6387 } else if (prefix
== "osd pool get-quota") {
6389 cmd_getval(cmdmap
, "pool", pool_name
);
6391 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6393 ceph_assert(poolid
== -ENOENT
);
6394 ss
<< "unrecognized pool '" << pool_name
<< "'";
6398 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6399 const pool_stat_t
* pstat
= mon
->mgrstatmon()->get_pool_stat(poolid
);
6400 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6402 f
->open_object_section("pool_quotas");
6403 f
->dump_string("pool_name", pool_name
);
6404 f
->dump_unsigned("pool_id", poolid
);
6405 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6406 f
->dump_int("current_num_objects", sum
.num_objects
);
6407 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6408 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6413 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6414 << " max objects: ";
6415 if (p
->quota_max_objects
== 0)
6418 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6419 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6423 if (p
->quota_max_bytes
== 0)
6426 rs
<< byte_u_t(p
->quota_max_bytes
);
6427 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6429 rdata
.append(rs
.str());
6433 } else if (prefix
== "osd crush rule list" ||
6434 prefix
== "osd crush rule ls") {
6436 f
->open_array_section("rules");
6437 osdmap
.crush
->list_rules(f
.get());
6442 osdmap
.crush
->list_rules(&ss
);
6443 rdata
.append(ss
.str());
6445 } else if (prefix
== "osd crush rule ls-by-class") {
6447 cmd_getval(cmdmap
, "class", class_name
);
6448 if (class_name
.empty()) {
6449 ss
<< "no class specified";
6454 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6456 ss
<< "failed to get rules by class '" << class_name
<< "'";
6460 f
->open_array_section("rules");
6461 for (auto &rule
: rules
) {
6462 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6468 for (auto &rule
: rules
) {
6469 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6471 rdata
.append(rs
.str());
6473 } else if (prefix
== "osd crush rule dump") {
6475 cmd_getval(cmdmap
, "name", name
);
6477 cmd_getval(cmdmap
, "format", format
);
6478 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6480 f
->open_array_section("rules");
6481 osdmap
.crush
->dump_rules(f
.get());
6484 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6486 ss
<< "unknown crush rule '" << name
<< "'";
6490 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6495 rdata
.append(rs
.str());
6496 } else if (prefix
== "osd crush dump") {
6498 cmd_getval(cmdmap
, "format", format
);
6499 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6500 f
->open_object_section("crush_map");
6501 osdmap
.crush
->dump(f
.get());
6506 rdata
.append(rs
.str());
6507 } else if (prefix
== "osd crush show-tunables") {
6509 cmd_getval(cmdmap
, "format", format
);
6510 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6511 f
->open_object_section("crush_map_tunables");
6512 osdmap
.crush
->dump_tunables(f
.get());
6517 rdata
.append(rs
.str());
6518 } else if (prefix
== "osd crush tree") {
6520 cmd_getval(cmdmap
, "shadow", shadow
);
6521 bool show_shadow
= shadow
== "--show-shadow";
6522 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6524 f
->open_object_section("crush_tree");
6525 osdmap
.crush
->dump_tree(nullptr,
6527 osdmap
.get_pool_names(),
6533 osdmap
.crush
->dump_tree(&ss
,
6535 osdmap
.get_pool_names(),
6537 rdata
.append(ss
.str());
6539 } else if (prefix
== "osd crush ls") {
6541 if (!cmd_getval(cmdmap
, "node", name
)) {
6542 ss
<< "no node specified";
6546 if (!osdmap
.crush
->name_exists(name
)) {
6547 ss
<< "node '" << name
<< "' does not exist";
6551 int id
= osdmap
.crush
->get_item_id(name
);
6554 result
.push_back(id
);
6556 int num
= osdmap
.crush
->get_bucket_size(id
);
6557 for (int i
= 0; i
< num
; ++i
) {
6558 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6562 f
->open_array_section("items");
6563 for (auto i
: result
) {
6564 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6570 for (auto i
: result
) {
6571 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6573 rdata
.append(ss
.str());
6576 } else if (prefix
== "osd crush class ls") {
6577 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6578 f
->open_array_section("crush_classes");
6579 for (auto i
: osdmap
.crush
->class_name
)
6580 f
->dump_string("class", i
.second
);
6583 } else if (prefix
== "osd crush class ls-osd") {
6585 cmd_getval(cmdmap
, "class", name
);
6587 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6589 f
->open_array_section("osds");
6590 for (auto &osd
: osds
)
6591 f
->dump_int("osd", osd
);
6596 for (auto &osd
: osds
) {
6604 } else if (prefix
== "osd crush get-device-class") {
6605 vector
<string
> idvec
;
6606 cmd_getval(cmdmap
, "ids", idvec
);
6607 map
<int, string
> class_by_osd
;
6608 for (auto& id
: idvec
) {
6610 long osd
= parse_osd_id(id
.c_str(), &ts
);
6612 ss
<< "unable to parse osd id:'" << id
<< "'";
6616 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6618 class_by_osd
[osd
] = device_class
;
6620 class_by_osd
[osd
] = ""; // no class
6623 f
->open_array_section("osd_device_classes");
6624 for (auto& i
: class_by_osd
) {
6625 f
->open_object_section("osd_device_class");
6626 f
->dump_int("osd", i
.first
);
6627 f
->dump_string("device_class", i
.second
);
6633 if (class_by_osd
.size() == 1) {
6634 // for single input, make a clean output
6635 ds
<< class_by_osd
.begin()->second
;
6637 // note that we do not group osds by class here
6638 for (auto it
= class_by_osd
.begin();
6639 it
!= class_by_osd
.end();
6641 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6642 if (next(it
) != class_by_osd
.end())
6648 } else if (prefix
== "osd erasure-code-profile ls") {
6649 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6651 f
->open_array_section("erasure-code-profiles");
6652 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6654 f
->dump_string("profile", i
->first
.c_str());
6656 rdata
.append(i
->first
+ "\n");
6663 rdata
.append(rs
.str());
6665 } else if (prefix
== "osd crush weight-set ls") {
6666 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6668 f
->open_array_section("weight_sets");
6669 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6670 f
->dump_string("pool", "(compat)");
6672 for (auto& i
: osdmap
.crush
->choose_args
) {
6674 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6681 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6684 for (auto& i
: osdmap
.crush
->choose_args
) {
6686 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6689 rdata
.append(rs
.str());
6691 } else if (prefix
== "osd crush weight-set dump") {
6692 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6694 osdmap
.crush
->dump_choose_args(f
.get());
6696 } else if (prefix
== "osd erasure-code-profile get") {
6698 cmd_getval(cmdmap
, "name", name
);
6699 if (!osdmap
.has_erasure_code_profile(name
)) {
6700 ss
<< "unknown erasure code profile '" << name
<< "'";
6704 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6706 f
->open_object_section("profile");
6707 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6711 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6713 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6720 rdata
.append(rs
.str());
6722 } else if (prefix
== "osd pool application get") {
6723 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6726 cmd_getval(cmdmap
, "pool", pool_name
);
6728 cmd_getval(cmdmap
, "app", app
);
6730 cmd_getval(cmdmap
, "key", key
);
6732 if (pool_name
.empty()) {
6734 f
->open_object_section("pools");
6735 for (const auto &pool
: osdmap
.pools
) {
6736 std::string
name("<unknown>");
6737 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6738 if (pni
!= osdmap
.pool_name
.end())
6740 f
->open_object_section(name
.c_str());
6741 for (auto &app_pair
: pool
.second
.application_metadata
) {
6742 f
->open_object_section(app_pair
.first
.c_str());
6743 for (auto &kv_pair
: app_pair
.second
) {
6744 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6748 f
->close_section(); // name
6750 f
->close_section(); // pools
6753 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6755 ss
<< "unrecognized pool '" << pool_name
<< "'";
6759 auto p
= osdmap
.get_pg_pool(pool
);
6762 f
->open_object_section(pool_name
.c_str());
6763 for (auto &app_pair
: p
->application_metadata
) {
6764 f
->open_object_section(app_pair
.first
.c_str());
6765 for (auto &kv_pair
: app_pair
.second
) {
6766 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6768 f
->close_section(); // application
6770 f
->close_section(); // pool_name
6775 auto app_it
= p
->application_metadata
.find(app
);
6776 if (app_it
== p
->application_metadata
.end()) {
6777 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
6781 // filter by pool + app
6783 f
->open_object_section(app_it
->first
.c_str());
6784 for (auto &kv_pair
: app_it
->second
) {
6785 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6787 f
->close_section(); // application
6791 // filter by pool + app + key
6792 auto key_it
= app_it
->second
.find(key
);
6793 if (key_it
== app_it
->second
.end()) {
6794 ss
<< "application '" << app
<< "' on pool '" << pool_name
6795 << "' does not have key '" << key
<< "'";
6799 ss
<< key_it
->second
<< "\n";
6800 rdata
.append(ss
.str());
6803 } else if (prefix
== "osd get-require-min-compat-client") {
6804 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
6805 rdata
.append(ss
.str());
6808 } else if (prefix
== "osd pool application enable" ||
6809 prefix
== "osd pool application disable" ||
6810 prefix
== "osd pool application set" ||
6811 prefix
== "osd pool application rm") {
6812 bool changed
= false;
6813 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
6817 } else if (changed
) {
6818 // Valid mutation, proceed to prepare phase
6821 // Idempotent case, reply
6825 // try prepare update
6832 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
6836 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
6838 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6839 osdmap
.get_pg_pool(pool_id
));
6841 pool
->set_flag(flags
);
6844 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
6846 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6847 osdmap
.get_pg_pool(pool_id
));
6849 pool
->unset_flag(flags
);
// Build the OSD_SNAP_PREFIX store key recording snaps purged in a given
// epoch; the epoch is rendered as zero-padded hex so keys sort numerically.
// NOTE(review): this extraction appears to have dropped lines here — the
// declaration of buffer 'k' (used by sizeof below) and the return statement
// are not visible; confirm against the full source.
6852 string
OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch
)
6855 snprintf(k
, sizeof(k
), "purged_epoch_%08lx", (unsigned long)epoch
);
// Build the OSD_SNAP_PREFIX store key for a purged-snap record of a given
// pool/snap; the snap id is zero-padded hex so keys sort by snapid within
// a pool, which lookup_purged_snap() relies on for forward iteration.
// NOTE(review): buffer declaration for 'k' and the return statement are not
// visible in this extraction — confirm against the full source.
6859 string
OSDMonitor::make_purged_snap_key(int64_t pool
, snapid_t snap
)
6862 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
6863 (unsigned long long)pool
, (unsigned long long)snap
);
// Produce both the key and the encoded value for a purged-snap interval
// [snap, snap+num): encodes the interval end into *v and returns the key
// for the interval's last snap (so forward iteration finds it).
// NOTE(review): the extraction skips from source line 6872 to 6874 — an
// encode of 'epoch' into *v is presumably missing here; confirm against
// the full source.
6867 string
OSDMonitor::make_purged_snap_key_value(
6868 int64_t pool
, snapid_t snap
, snapid_t num
,
6869 epoch_t epoch
, bufferlist
*v
)
6871 // encode the *last* epoch in the key so that we can use forward
6872 // iteration only to search for an epoch in an interval.
6874 encode(snap
+ num
, *v
);
6876 return make_purged_snap_key(pool
, snap
+ num
- 1);
// Look up whether (pool, snap) falls inside an already-recorded purged
// interval in the mon store, returning the interval bounds via *begin/*end.
// Walks forward from the key for 'snap' and validates the found key's
// prefix and pool before decoding its value.
// NOTE(review): this extraction has dropped several lines (the iterator
// seek/validity check, the decode of *begin/*end from 'p', and all return
// statements) — confirm control flow against the full source.
6880 int OSDMonitor::lookup_purged_snap(
6881 int64_t pool
, snapid_t snap
,
6882 snapid_t
*begin
, snapid_t
*end
)
6884 string k
= make_purged_snap_key(pool
, snap
);
6885 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
6888 dout(20) << __func__
6889 << " pool " << pool
<< " snap " << snap
6890 << " - key '" << k
<< "' not found" << dendl
;
6893 if (it
->key().find("purged_snap_") != 0) {
6894 dout(20) << __func__
6895 << " pool " << pool
<< " snap " << snap
6896 << " - key '" << k
<< "' got '" << it
->key()
6897 << "', wrong prefix" << dendl
;
// Parse the pool id back out of the key we landed on and make sure the
// iterator did not run into a neighboring pool's records.
6900 string gotk
= it
->key();
6901 const char *format
= "purged_snap_%llu_";
6902 long long int keypool
;
6903 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
6905 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
6908 if (pool
!= keypool
) {
6909 dout(20) << __func__
6910 << " pool " << pool
<< " snap " << snap
6911 << " - key '" << k
<< "' got '" << gotk
6912 << "', wrong pool " << keypool
// Decode the stored interval and check that 'snap' actually lies inside
// [*begin, *end); a record can exist for the pool without covering it.
6916 bufferlist v
= it
->value();
6917 auto p
= v
.cbegin();
6920 if (snap
< *begin
|| snap
>= *end
) {
6921 dout(20) << __func__
6922 << " pool " << pool
<< " snap " << snap
6923 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
// Record a newly purged snap interval [start, end) for a pool in the mon
// store transaction, coalescing it with adjacent already-recorded intervals:
// it looks up the intervals touching start-1 and end, then writes one merged
// record (and erases the now-subsumed earlier record where needed).
// NOTE(review): the extraction dropped this function's parameter line for
// 'pool', the bufferlist 'v' declarations, and the if/else branch headers
// that select between the four join cases — confirm against the full source.
6930 void OSDMonitor::insert_purged_snap_update(
6932 snapid_t start
, snapid_t end
,
6934 MonitorDBStore::TransactionRef t
)
6936 snapid_t before_begin
, before_end
;
6937 snapid_t after_begin
, after_end
;
6938 int b
= lookup_purged_snap(pool
, start
- 1,
6939 &before_begin
, &before_end
);
6940 int a
= lookup_purged_snap(pool
, end
,
6941 &after_begin
, &after_end
);
// Case: new interval bridges an earlier and a later recorded interval —
// merge all three into [before_begin, after_end).
6943 dout(10) << __func__
6944 << " [" << start
<< "," << end
<< ") - joins ["
6945 << before_begin
<< "," << before_end
<< ") and ["
6946 << after_begin
<< "," << after_end
<< ")" << dendl
;
6947 // erase only the begin record; we'll overwrite the end one.
6948 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
6950 string k
= make_purged_snap_key_value(pool
,
6951 before_begin
, after_end
- before_begin
,
6952 pending_inc
.epoch
, &v
);
6953 t
->put(OSD_SNAP_PREFIX
, k
, v
);
// Case: new interval extends an earlier recorded interval forward.
6955 dout(10) << __func__
6956 << " [" << start
<< "," << end
<< ") - join with earlier ["
6957 << before_begin
<< "," << before_end
<< ")" << dendl
;
6958 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
6960 string k
= make_purged_snap_key_value(pool
,
6961 before_begin
, end
- before_begin
,
6962 pending_inc
.epoch
, &v
);
6963 t
->put(OSD_SNAP_PREFIX
, k
, v
);
// Case: new interval extends a later recorded interval backward.
6965 dout(10) << __func__
6966 << " [" << start
<< "," << end
<< ") - join with later ["
6967 << after_begin
<< "," << after_end
<< ")" << dendl
;
6968 // overwrite after record
6970 string k
= make_purged_snap_key_value(pool
,
6971 start
, after_end
- start
,
6972 pending_inc
.epoch
, &v
);
6973 t
->put(OSD_SNAP_PREFIX
, k
, v
);
// Case: no adjacency — write a fresh stand-alone record.
6975 dout(10) << __func__
6976 << " [" << start
<< "," << end
<< ") - new"
6979 string k
= make_purged_snap_key_value(pool
,
6981 pending_inc
.epoch
, &v
);
6982 t
->put(OSD_SNAP_PREFIX
, k
, v
);
// Compare the mgr-reported purged snaps against our own purged records and
// move genuinely new purges (capped at mon_max_snap_prune_per_epoch) into
// pending_inc.new_purged_snaps. Returns whether anything was pruned.
// NOTE(review): the extraction dropped several early-return/continue/break
// lines between the visible statements — confirm control flow against the
// full source before relying on this text.
6986 bool OSDMonitor::try_prune_purged_snaps()
// Bail out if the mgr stat digest is not readable yet.
6988 if (!mon
->mgrstatmon()->is_readable()) {
6991 if (!pending_inc
.new_purged_snaps
.empty()) {
6992 return false; // we already pruned for this epoch
6995 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
6996 "mon_max_snap_prune_per_epoch");
7000 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
7002 unsigned actually_pruned
= 0;
7003 auto& purged_snaps
= mon
->mgrstatmon()->get_digest().purged_snaps
;
// Walk every pool and diff its reported purged set against our records.
7004 for (auto& p
: osdmap
.get_pools()) {
7005 auto q
= purged_snaps
.find(p
.first
);
7006 if (q
== purged_snaps
.end()) {
7009 auto& purged
= q
->second
;
7010 if (purged
.empty()) {
7011 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
7014 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
7015 snap_interval_set_t to_prune
;
7016 unsigned maybe_pruned
= actually_pruned
;
7017 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
7018 snapid_t begin
= i
.get_start();
7019 auto end
= i
.get_start() + i
.get_len();
7020 snapid_t pbegin
= 0, pend
= 0;
7021 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
7024 // be a bit aggressive about backing off here, because the mon may
7025 // do a lot of work going through this set, and if we know the
7026 // purged set from the OSDs is at least *partly* stale we may as
7027 // well wait for it to be fresh.
7028 dout(20) << __func__
<< " we've already purged " << pbegin
7029 << "~" << (pend
- pbegin
) << dendl
;
7032 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
7033 // the tail of [begin,end) is purged; shorten the range
7036 to_prune
.insert(begin
, end
- begin
);
7037 maybe_pruned
+= end
- begin
;
7038 if (maybe_pruned
>= max_prune
) {
7042 if (!to_prune
.empty()) {
7043 // PGs may still be reporting things as purged that we have already
7044 // pruned from removed_snaps_queue.
7045 snap_interval_set_t actual
;
7046 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7047 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7048 actual
.intersection_of(to_prune
, r
->second
);
7050 actually_pruned
+= actual
.size();
7051 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7052 << ", actual pruned " << actual
<< dendl
;
7053 if (!actual
.empty()) {
7054 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7057 if (actually_pruned
>= max_prune
) {
7061 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7062 return !!actually_pruned
;
// Re-evaluate per-pool quota state from mgr stats: clear FULL_QUOTA/FULL
// on pools that dropped back under quota, and set FULL_QUOTA|FULL (while
// clearing NEARFULL/BACKFILLFULL) on pools that exceeded max_bytes or
// max_objects. Returns whether pending_inc was changed (per the visible
// flag mutations; the return statements themselves are not visible here).
// NOTE(review): the extraction dropped the 'ret' bookkeeping, null-pstat
// guard, and branch headers around the quota transitions — confirm against
// the full source.
7065 bool OSDMonitor::update_pools_status()
7067 if (!mon
->mgrstatmon()->is_readable())
7072 auto& pools
= osdmap
.get_pools();
7073 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7074 const pool_stat_t
*pstat
= mon
->mgrstatmon()->get_pool_stat(it
->first
);
7077 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7078 const pg_pool_t
&pool
= it
->second
;
7079 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
// A pool is over quota when either byte or object quota is configured
// (> 0) and the corresponding usage sum has reached it.
7082 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7083 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
7085 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7089 mon
->clog
->info() << "pool '" << pool_name
7090 << "' no longer out of quota; removing NO_QUOTA flag";
7091 // below we cancel FLAG_FULL too, we'll set it again in
7092 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7093 clear_pool_flags(it
->first
,
7094 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7100 if (pool
.quota_max_bytes
> 0 &&
7101 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7102 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
7103 << " (reached quota's max_bytes: "
7104 << byte_u_t(pool
.quota_max_bytes
) << ")";
7106 if (pool
.quota_max_objects
> 0 &&
7107 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7108 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
7109 << " (reached quota's max_objects: "
7110 << pool
.quota_max_objects
<< ")";
7112 // set both FLAG_FULL_QUOTA and FLAG_FULL
7113 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7114 // since FLAG_FULL should always take precedence
7115 set_pool_flags(it
->first
,
7116 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7117 clear_pool_flags(it
->first
,
7118 pg_pool_t::FLAG_NEARFULL
|
7119 pg_pool_t::FLAG_BACKFILLFULL
);
// Client-message entry point for pool creation: unpacks the MPoolOp and
// delegates to the full prepare_new_pool() overload with replicated-pool
// defaults (TYPE_REPLICATED, fast_read off).
// NOTE(review): the extraction dropped the session permission check, the
// ss/ret declarations, and the final return — confirm against full source.
7126 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7128 op
->mark_osdmon_event(__func__
);
7129 auto m
= op
->get_req
<MPoolOp
>();
7130 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7131 MonSession
*session
= op
->get_session();
7134 string erasure_code_profile
;
7138 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7140 erasure_code_profile
,
7141 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {},
7145 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
// Rename a CRUSH bucket from srcname to dstname in the pending crush map,
// re-encoding the result into pending_inc and reporting the rename via *ss.
// NOTE(review): the extraction dropped the error-return checks after
// can_rename_bucket()/rename_bucket() and the final return — confirm
// against the full source.
7150 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7151 const string
& dstname
,
7156 // Avoid creating a pending crush if it does not already exists and
7157 // the rename would fail.
7159 if (!_have_pending_crush()) {
7160 ret
= _get_stable_crush().can_rename_bucket(srcname
,
7167 CrushWrapper newcrush
;
7168 _get_pending_crush(newcrush
);
7170 ret
= newcrush
.rename_bucket(srcname
,
// Commit the modified crush map into the pending incremental.
7176 pending_inc
.crush
.clear();
7177 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7178 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7182 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7184 string replacement
= "";
7186 if (plugin
== "jerasure_generic" ||
7187 plugin
== "jerasure_sse3" ||
7188 plugin
== "jerasure_sse4" ||
7189 plugin
== "jerasure_neon") {
7190 replacement
= "jerasure";
7191 } else if (plugin
== "shec_generic" ||
7192 plugin
== "shec_sse3" ||
7193 plugin
== "shec_sse4" ||
7194 plugin
== "shec_neon") {
7195 replacement
= "shec";
7198 if (replacement
!= "") {
7199 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7200 << plugin
<< " that has been deprecated. Please use "
7201 << replacement
<< " instead." << dendl
;
// Validate and normalize an erasure code profile: instantiate the plugin,
// let it initialize/fill in the profile, and sanity-check any explicit
// stripe_unit against the plugin's chunk alignment. 'force' (per the check
// below) bypasses the 4096-byte stripe_unit alignment advice.
// NOTE(review): error returns after factory()/init() and the parse failure
// branches are not visible in this extraction — confirm control flow
// against the full source.
7205 int OSDMonitor::normalize_profile(const string
& profilename
,
7206 ErasureCodeProfile
&profile
,
7210 ErasureCodeInterfaceRef erasure_code
;
7211 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7212 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7213 check_legacy_ec_plugin(plugin
->second
, profilename
);
7214 int err
= instance
.factory(plugin
->second
,
7215 g_conf().get_val
<std::string
>("erasure_code_dir"),
7216 profile
, &erasure_code
, ss
);
7221 err
= erasure_code
->init(profile
, ss
);
// If the profile carries an explicit stripe_unit, parse it and verify the
// plugin would not pad it to a different chunk size.
7226 auto it
= profile
.find("stripe_unit");
7227 if (it
!= profile
.end()) {
7229 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7230 if (!err_str
.empty()) {
7231 *ss
<< "could not parse stripe_unit '" << it
->second
7232 << "': " << err_str
<< std::endl
;
7235 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7236 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7237 if (chunk_size
!= stripe_unit
) {
7238 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7239 << "alignment. Would be padded to " << chunk_size
7243 if ((stripe_unit
% 4096) != 0 && !force
) {
7244 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7245 << "use --force to override this check" << std::endl
;
// Find or create a CRUSH rule for an erasure-coded pool: reuse an existing
// rule of that name from the committed or pending crush map, otherwise ask
// the erasure code plugin to create one in the pending crush map.
// NOTE(review): the *rule out-parameter line, the error returns after
// get_erasure_code()/create_rule(), and the final return are not visible
// in this extraction — confirm against the full source.
7252 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7253 const string
&profile
,
7257 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7258 if (ruleid
!= -ENOENT
) {
7259 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
7263 CrushWrapper newcrush
;
7264 _get_pending_crush(newcrush
);
7266 ruleid
= newcrush
.get_rule_id(name
);
7267 if (ruleid
!= -ENOENT
) {
7268 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
// Not present anywhere yet: load the plugin for this profile and have it
// create the rule in the pending crush map.
7271 ErasureCodeInterfaceRef erasure_code
;
7272 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7274 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7278 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7279 erasure_code
.reset();
7283 pending_inc
.crush
.clear();
7284 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
// Instantiate the erasure code implementation named by an osdmap profile:
// refuses while the profile is still pending, requires a 'plugin' entry,
// and delegates construction to the plugin registry factory.
// NOTE(review): the return values for the pending-profile and missing-plugin
// branches are not visible in this extraction — confirm against full source.
7289 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7290 ErasureCodeInterfaceRef
*erasure_code
,
7293 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7295 ErasureCodeProfile profile
=
7296 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7297 ErasureCodeProfile::const_iterator plugin
=
7298 profile
.find("plugin");
7299 if (plugin
== profile
.end()) {
7300 *ss
<< "cannot determine the erasure code plugin"
7301 << " because there is no 'plugin' entry in the erasure_code_profile "
7302 << profile
<< std::endl
;
7305 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7306 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7307 return instance
.factory(plugin
->second
,
7308 g_conf().get_val
<std::string
>("erasure_code_dir"),
7309 profile
, erasure_code
, ss
);
// Verify that the given feature bits are supported by the whole cluster:
// the monitor quorum, every up OSD's xinfo, and any OSD xinfo pending in
// this epoch's incremental. On failure, names the unsupporting parties
// in 'ss'.
// NOTE(review): the return statements for the failure/pending/success paths
// are not visible in this extraction — confirm against the full source.
7312 int OSDMonitor::check_cluster_features(uint64_t features
,
7315 stringstream unsupported_ss
;
7316 int unsupported_count
= 0;
7317 if ((mon
->get_quorum_con_features() & features
) != features
) {
7318 unsupported_ss
<< "the monitor cluster";
7319 ++unsupported_count
;
// Check every up OSD's advertised feature bits.
7322 set
<int32_t> up_osds
;
7323 osdmap
.get_up_osds(up_osds
);
7324 for (set
<int32_t>::iterator it
= up_osds
.begin();
7325 it
!= up_osds
.end(); ++it
) {
7326 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7327 if ((xi
.features
& features
) != features
) {
7328 if (unsupported_count
> 0)
7329 unsupported_ss
<< ", ";
7330 unsupported_ss
<< "osd." << *it
;
7331 unsupported_count
++;
7335 if (unsupported_count
> 0) {
7336 ss
<< "features " << features
<< " unsupported by: "
7337 << unsupported_ss
.str();
7341 // check pending osd state, too!
7342 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7343 pending_inc
.new_xinfo
.begin();
7344 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7345 const osd_xinfo_t
&xi
= p
->second
;
7346 if ((xi
.features
& features
) != features
) {
7347 dout(10) << __func__
<< " pending osd." << p
->first
7348 << " features are insufficient; retry" << dendl
;
// Check whether a proposed crush map is safe to commit: build a trial
// OSDMap (current map + pending incremental + new crush), reject it if it
// would raise the minimum client version above require_min_compat_client,
// then verify the resulting mon/osd feature requirements against the
// cluster via check_cluster_features().
// NOTE(review): the 'newmap' declaration, 'features' assignment header, and
// the return statements are not visible in this extraction — confirm
// against the full source.
7356 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
7359 OSDMap::Incremental new_pending
= pending_inc
;
7360 encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
7362 newmap
.deepish_copy_from(osdmap
);
7363 newmap
.apply_incremental(new_pending
);
// Reject a crush map that would strand clients older than the configured
// require_min_compat_client.
7366 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7367 auto mv
= newmap
.get_min_compat_client();
7368 if (mv
> newmap
.require_min_compat_client
) {
7369 ss
<< "new crush map requires client version " << mv
7370 << " but require_min_compat_client is "
7371 << newmap
.require_min_compat_client
;
7378 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7379 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7380 stringstream features_ss
;
7381 int r
= check_cluster_features(features
, features_ss
);
7383 ss
<< "Could not change CRUSH: " << features_ss
.str();
// Report whether any erasure-coded pool in 'pools' references the given
// profile, appending the names of matching pools (and a summary line) to
// *ss so the caller can refuse to delete a profile still in use.
// NOTE(review): the loop's end-condition/increment line, the 'found' flag
// bookkeeping, and the return are not visible in this extraction — confirm
// against the full source.
7390 bool OSDMonitor::erasure_code_profile_in_use(
7391 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7392 const string
&profile
,
7396 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
7399 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7400 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7405 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
// Parse "key=value" tokens from the command line into a profile map,
// layered over the configured default profile. Bare tokens become empty
// values; legacy "ruleset-*" keys are rejected with a hint to use
// "crush-*". If the user names a different plugin than the default, the
// user-supplied map replaces the merged one entirely.
// NOTE(review): source line 7433 is missing between the substr calls —
// presumably 'equal' is advanced past the '=' before taking the value
// substring; confirm against the full source.
7410 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7411 map
<string
,string
> *erasure_code_profile_map
,
7414 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7417 erasure_code_profile_map
,
7421 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7422 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7423 map
<string
,string
> user_map
;
7424 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7425 i
!= erasure_code_profile
.end();
7427 size_t equal
= i
->find('=');
7428 if (equal
== string::npos
) {
7429 user_map
[*i
] = string();
7430 (*erasure_code_profile_map
)[*i
] = string();
7432 const string key
= i
->substr(0, equal
);
7434 const string value
= i
->substr(equal
);
7435 if (key
.find("ruleset-") == 0) {
7436 *ss
<< "property '" << key
<< "' is no longer supported; try "
7437 << "'crush-" << key
.substr(8) << "' instead";
7440 user_map
[key
] = value
;
7441 (*erasure_code_profile_map
)[key
] = value
;
7445 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7446 (*erasure_code_profile_map
) = user_map
;
// Compute *size and *min_size for a new pool. Replicated pools use the
// requested repl_size (or osd_pool_default_size when 0) plus the derived
// default min_size; erasure pools derive both from the profile's plugin
// (size = total chunks, min_size bounded by the data chunk count).
// NOTE(review): the *size assignment for the replicated case, the *min_size
// assignment header for the erasure case, and the returns/breaks are not
// visible in this extraction — confirm against the full source.
7451 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7452 const string
&erasure_code_profile
,
7454 unsigned *size
, unsigned *min_size
,
7458 switch (pool_type
) {
7459 case pg_pool_t::TYPE_REPLICATED
:
7460 if (repl_size
== 0) {
7461 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7464 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7466 case pg_pool_t::TYPE_ERASURE
:
7468 ErasureCodeInterfaceRef erasure_code
;
7469 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7471 *size
= erasure_code
->get_chunk_count();
7473 erasure_code
->get_data_chunk_count() +
7474 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7475 assert(*min_size
<= *size
);
7476 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7481 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
// Compute *stripe_width for a new pool. Replicated pools take the default;
// erasure pools multiply the data chunk count by the plugin's chunk size
// for the configured (or profile-overridden) stripe_unit.
// NOTE(review): the replicated-case assignment, the post-get_erasure_code
// error check, and the returns/breaks are not visible in this extraction —
// confirm against the full source.
7488 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7489 const string
&erasure_code_profile
,
7490 uint32_t *stripe_width
,
7494 switch (pool_type
) {
7495 case pg_pool_t::TYPE_REPLICATED
:
7498 case pg_pool_t::TYPE_ERASURE
:
7500 ErasureCodeProfile profile
=
7501 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7502 ErasureCodeInterfaceRef erasure_code
;
7503 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7506 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7507 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
// A profile-level stripe_unit overrides the global config value; it was
// already validated by normalize_profile(), hence the assert on err_str.
7508 auto it
= profile
.find("stripe_unit");
7509 if (it
!= profile
.end()) {
7511 stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7512 ceph_assert(err_str
.empty());
7514 *stripe_width
= data_chunks
*
7515 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7519 *ss
<< "prepare_pool_stripe_width: "
7520 << pool_type
<< " is not a known pool type";
// Resolve *crush_rule for a new pool when the caller did not pin one
// (<0): replicated pools fall back to the configured default rule or a
// named rule lookup; erasure pools create/find a rule from the profile
// (possibly deferring until the new rule is proposed). Finally verifies
// the chosen rule exists in the committed crush map.
// NOTE(review): the returns inside the erasure branch and after the
// existence check are not visible in this extraction — confirm against
// the full source.
7527 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7528 const string
&erasure_code_profile
,
7529 const string
&rule_name
,
7534 if (*crush_rule
< 0) {
7535 switch (pool_type
) {
7536 case pg_pool_t::TYPE_REPLICATED
:
7538 if (rule_name
== "") {
7540 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
7541 if (*crush_rule
< 0) {
7542 // Errors may happen e.g. if no valid rule is available
7543 *ss
<< "No suitable CRUSH rule exists, check "
7544 << "'osd pool default crush *' config options";
7548 return get_crush_rule(rule_name
, crush_rule
, ss
);
7552 case pg_pool_t::TYPE_ERASURE
:
7554 int err
= crush_rule_create_erasure(rule_name
,
7555 erasure_code_profile
,
7559 dout(20) << "prepare_pool_crush_rule: rule "
7560 << rule_name
<< " try again" << dendl
;
7563 // need to wait for the crush rule to be proposed before proceeding
7574 *ss
<< "prepare_pool_crush_rule: " << pool_type
7575 << " is not a known pool type";
7580 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
7581 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
// Resolve a crush rule by name: prefer the committed map; if the rule only
// exists in the pending crush map, signal the caller to retry after the
// proposal; otherwise report that the rule does not exist.
// NOTE(review): the *crush_rule out-parameter assignments and the return
// values for each branch are not visible in this extraction — confirm
// against the full source.
7589 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7594 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7595 if (ret
!= -ENOENT
) {
7599 CrushWrapper newcrush
;
7600 _get_pending_crush(newcrush
);
7602 ret
= newcrush
.get_rule_id(rule_name
);
7603 if (ret
!= -ENOENT
) {
7604 // found it, wait for it to be proposed
7605 dout(20) << __func__
<< ": rule " << rule_name
7606 << " try again" << dendl
;
7609 // Cannot find it , return error
7610 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
// Enforce the cluster-wide PG budget (mon_max_pg_per_osd * in-OSDs, with a
// floor of 3 OSDs): project total PGs assuming 'pool' is (re)sized to
// pg_num * size, keep other pools at their current target * size, and fail
// with an explanatory message when the projection exceeds the cap.
// NOTE(review): the pool<0 branch header, the 'else'/continue between the
// two 'projected +=' paths, and the return statements are not visible in
// this extraction — confirm against the full source.
7617 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
7619 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7620 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
7621 auto max_pgs
= max_pgs_per_osd
* num_osds
;
7622 uint64_t projected
= 0;
7624 projected
+= pg_num
* size
;
7626 for (const auto& i
: osdmap
.get_pools()) {
7627 if (i
.first
== pool
) {
7628 projected
+= pg_num
* size
;
7630 projected
+= i
.second
.get_pg_num_target() * i
.second
.get_size();
7633 if (projected
> max_pgs
) {
7635 *ss
<< "pool id " << pool
;
7637 *ss
<< " pg_num " << pg_num
<< " size " << size
7638 << " would mean " << projected
7639 << " total pgs, which exceeds max " << max_pgs
7640 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7641 << " * num_in_osds " << num_osds
<< ")";
7648 * @param name The name of the new pool
7649 * @param crush_rule The crush rule to use. If <0, will use the system default
7650 * @param crush_rule_name The crush rule to use, if crush_ruleset <0
7651 * @param pg_num The pg_num to use. If set to 0, will use the system default
7652 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7653 * @param repl_size Replication factor, or 0 for default
7654 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7655 * @param pool_type TYPE_ERASURE, or TYPE_REP
7656 * @param expected_num_objects expected number of objects on the pool
7657 * @param fast_read fast read type.
7658 * @param ss human readable error message, if any.
7660 * @return 0 on success, negative errno on failure.
// Full pool-creation worker: validates the request (name, pg/pgp limits,
// fast_read applicability), resolves/smoke-tests the CRUSH rule, computes
// size/min_size and stripe width, checks the PG budget, then builds the new
// pg_pool_t in pending_inc with all configured defaults and options.
// NOTE(review): this extraction has dropped many interleaved lines (error
// returns after each prepare_* call, duplicate-name handling, the pg_num
// assignment from 'max', brace/else lines, and the final return) — confirm
// every branch against the full source before editing.
7662 int OSDMonitor::prepare_new_pool(string
& name
,
7664 const string
&crush_rule_name
,
7665 unsigned pg_num
, unsigned pgp_num
,
7666 unsigned pg_num_min
,
7667 const uint64_t repl_size
,
7668 const uint64_t target_size_bytes
,
7669 const float target_size_ratio
,
7670 const string
&erasure_code_profile
,
7671 const unsigned pool_type
,
7672 const uint64_t expected_num_objects
,
7673 FastReadType fast_read
,
7674 const string
& pg_autoscale_mode
,
// Basic request validation: non-empty name, config-default pg/pgp counts,
// pg_num cap, pgp_num <= pg_num, and fast_read only for EC pools.
7677 if (name
.length() == 0)
7680 pg_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pg_num");
7682 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
7685 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7686 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7687 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7688 << " (you may adjust 'mon max pool pg num' for higher values)";
7691 if (pgp_num
> pg_num
) {
7692 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7693 << ", which in this case is " << pg_num
;
7696 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
7697 *ss
<< "'fast_read' can only apply to erasure coding pool";
7701 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
7702 crush_rule_name
, &crush_rule
, ss
);
7704 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
// Optionally smoke-test the chosen rule in a forked CrushTester before
// accepting it.
7707 if (g_conf()->mon_osd_crush_smoke_test
) {
7708 CrushWrapper newcrush
;
7709 _get_pending_crush(newcrush
);
7711 CrushTester
tester(newcrush
, err
);
7712 tester
.set_min_x(0);
7713 tester
.set_max_x(50);
7714 tester
.set_rule(crush_rule
);
7715 auto start
= ceph::coarse_mono_clock::now();
7716 r
= tester
.test_with_fork(g_conf()->mon_lease
);
7717 auto duration
= ceph::coarse_mono_clock::now() - start
;
7719 dout(10) << "tester.test_with_fork returns " << r
7720 << ": " << err
.str() << dendl
;
7721 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
7724 dout(10) << __func__
<< " crush smoke test duration: "
7725 << duration
<< dendl
;
7727 unsigned size
, min_size
;
7728 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
7729 &size
, &min_size
, ss
);
7731 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
7734 r
= check_pg_num(-1, pg_num
, size
, ss
);
7736 dout(10) << "check_pg_num returns " << r
<< dendl
;
7740 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
7744 uint32_t stripe_width
= 0;
7745 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
7747 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
// Resolve the effective fast_read setting for EC pools.
7752 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7753 switch (fast_read
) {
7760 case FAST_READ_DEFAULT
:
7761 fread
= g_conf()->osd_pool_default_ec_fast_read
;
7764 *ss
<< "invalid fast_read setting: " << fast_read
;
// Refuse to create a pool whose name collides with one already pending.
7769 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
7770 p
!= pending_inc
.new_pool_names
.end();
7772 if (p
->second
== name
)
7776 if (-1 == pending_inc
.new_pool_max
)
7777 pending_inc
.new_pool_max
= osdmap
.pool_max
;
7778 int64_t pool
= ++pending_inc
.new_pool_max
;
// Build the new pg_pool_t in the pending incremental and populate its
// flags and defaults from configuration.
7780 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
7781 pi
->create_time
= ceph_clock_now();
7782 pi
->type
= pool_type
;
7783 pi
->fast_read
= fread
;
7784 pi
->flags
= g_conf()->osd_pool_default_flags
;
7785 if (g_conf()->osd_pool_default_flag_hashpspool
)
7786 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
7787 if (g_conf()->osd_pool_default_flag_nodelete
)
7788 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
7789 if (g_conf()->osd_pool_default_flag_nopgchange
)
7790 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
7791 if (g_conf()->osd_pool_default_flag_nosizechange
)
7792 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
7793 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
7794 if (g_conf()->osd_pool_use_gmt_hitset
)
7795 pi
->use_gmt_hitset
= true;
7797 pi
->use_gmt_hitset
= false;
7800 pi
->min_size
= min_size
;
7801 pi
->crush_rule
= crush_rule
;
7802 pi
->expected_num_objects
= expected_num_objects
;
7803 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
// PG autoscaler: start from the configured default mode, defaulting OFF
// when the config value is unrecognized.
7805 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7806 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
7807 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7808 pi
->pg_autoscale_mode
= m
;
7810 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
7812 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
7814 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
7816 pi
->set_pg_num_pending(pi
->get_pg_num());
7817 pi
->set_pg_num_target(pg_num
);
7818 pi
->set_pgp_num(pi
->get_pg_num());
7819 pi
->set_pgp_num_target(pgp_num
);
7820 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
7822 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
// An explicit per-pool autoscale mode from the command overrides the
// configured default.
7824 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7825 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7826 pi
->pg_autoscale_mode
= m
;
7829 pi
->last_change
= pending_inc
.epoch
;
7832 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7833 pi
->erasure_code_profile
= erasure_code_profile
;
7835 pi
->erasure_code_profile
= "";
7837 pi
->stripe_width
= stripe_width
;
7839 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
7840 target_size_bytes
) {
7841 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7842 // larger than int32_t max.
7843 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
7845 if (target_size_ratio
> 0.0 &&
7846 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
7847 // only store for nautilus+, just to be consistent and tidy.
7848 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
// Cache-tiering defaults, scaled from config ratios to micro units.
7851 pi
->cache_target_dirty_ratio_micro
=
7852 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
7853 pi
->cache_target_dirty_high_ratio_micro
=
7854 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
7855 pi
->cache_target_full_ratio_micro
=
7856 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
7857 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
7858 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
7860 pending_inc
.new_pool_names
[pool
] = name
;
// Set a cluster-wide OSDMap flag in the pending incremental (seeding
// new_flags from the committed map on first change this epoch) and queue a
// success reply for when the proposal commits.
// NOTE(review): the declaration of 'ss' and the return statement are not
// visible in this extraction — confirm against the full source.
7864 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
7866 op
->mark_osdmon_event(__func__
);
7868 if (pending_inc
.new_flags
< 0)
7869 pending_inc
.new_flags
= osdmap
.get_flags();
7870 pending_inc
.new_flags
|= flag
;
7871 ss
<< OSDMap::get_flag_string(flag
) << " is set";
7872 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7873 get_last_committed() + 1));
// Clear a cluster-wide OSDMap flag in the pending incremental (seeding
// new_flags from the committed map on first change this epoch) and queue a
// success reply for when the proposal commits.
// NOTE(review): the declaration of 'ss' and the return statement are not
// visible in this extraction — confirm against the full source.
7877 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
7879 op
->mark_osdmon_event(__func__
);
7881 if (pending_inc
.new_flags
< 0)
7882 pending_inc
.new_flags
= osdmap
.get_flags();
7883 pending_inc
.new_flags
&= ~flag
;
7884 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
7885 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7886 get_last_committed() + 1));
7890 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
7894 cmd_getval(cmdmap
, "pool", poolstr
);
7895 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
7897 ss
<< "unrecognized pool '" << poolstr
<< "'";
7901 cmd_getval(cmdmap
, "var", var
);
7903 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
7904 if (pending_inc
.new_pools
.count(pool
))
7905 p
= pending_inc
.new_pools
[pool
];
7907 // accept val as a json string in the normal case (current
7908 // generation monitor). parse out int or float values from the
7909 // string as needed. however, if it is not a string, try to pull
7910 // out an int, in case an older monitor with an older json schema is
7911 // forwarding a request.
7913 string interr
, floaterr
;
7916 int64_t uf
= 0; // micro-f
7917 cmd_getval(cmdmap
, "val", val
);
7920 "target_max_objects"
7922 auto iec_options
= {
7924 "target_size_bytes",
7925 "compression_max_blob_size",
7926 "compression_min_blob_size",
7930 if (count(begin(si_options
), end(si_options
), var
)) {
7931 n
= strict_si_cast
<int64_t>(val
.c_str(), &interr
);
7932 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
7933 n
= strict_iec_cast
<int64_t>(val
.c_str(), &interr
);
7935 // parse string as both int and float; different fields use different types.
7936 n
= strict_strtoll(val
.c_str(), 10, &interr
);
7937 f
= strict_strtod(val
.c_str(), &floaterr
);
7938 uf
= llrintl(f
* (double)1000000.0);
7942 (var
== "hit_set_type" || var
== "hit_set_period" ||
7943 var
== "hit_set_count" || var
== "hit_set_fpp" ||
7944 var
== "target_max_objects" || var
== "target_max_bytes" ||
7945 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
7946 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
7947 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
7948 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
7949 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
7953 if (var
== "size") {
7954 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
7955 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
7958 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
7959 ss
<< "can not change the size of an erasure-coded pool";
7962 if (interr
.length()) {
7963 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7966 if (n
<= 0 || n
> 10) {
7967 ss
<< "pool size must be between 1 and 10";
7970 if (!osdmap
.crush
->check_crush_rule(p
.get_crush_rule(), p
.type
, n
, ss
)) {
7973 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
7978 p
.min_size
= g_conf().get_osd_pool_default_min_size(p
.size
);
7979 } else if (var
== "min_size") {
7980 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
7981 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
7984 if (interr
.length()) {
7985 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
7989 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
7990 if (n
< 1 || n
> p
.size
) {
7991 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
7995 ErasureCodeInterfaceRef erasure_code
;
7998 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
8000 k
= erasure_code
->get_data_chunk_count();
8002 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
8006 if (n
< k
|| n
> p
.size
) {
8007 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
8012 } else if (var
== "pg_num_actual") {
8013 if (interr
.length()) {
8014 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8017 if (n
== (int)p
.get_pg_num()) {
8020 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8021 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8022 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8023 << " (you may adjust 'mon max pool pg num' for higher values)";
8026 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8027 ss
<< "cannot adjust pg_num while initial PGs are being created";
8030 if (n
> (int)p
.get_pg_num()) {
8031 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
8032 // force pre-nautilus clients to resend their ops, since they
8033 // don't understand pg_num_pending changes form a new interval
8034 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8038 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8039 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8042 if (n
< (int)p
.get_pgp_num()) {
8043 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8046 if (n
< (int)p
.get_pg_num() - 1) {
8047 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8048 << ") - 1; only single pg decrease is currently supported";
8051 p
.set_pg_num_pending(n
);
8052 // force pre-nautilus clients to resend their ops, since they
8053 // don't understand pg_num_pending changes form a new interval
8054 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8056 // force pre-luminous clients to resend their ops, since they
8057 // don't understand that split PGs now form a new interval.
8058 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8059 } else if (var
== "pg_num") {
8060 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8061 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8064 if (interr
.length()) {
8065 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8068 if (n
== (int)p
.get_pg_num_target()) {
8071 if (n
<= 0 || static_cast<uint64_t>(n
) >
8072 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8073 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8074 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8075 << " (you may adjust 'mon max pool pg num' for higher values)";
8078 if (n
> (int)p
.get_pg_num_target()) {
8079 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
8084 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8085 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8086 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8090 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8091 ss
<< "nautilus OSDs are required to decrease pg_num";
8095 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8096 // pre-nautilus osdmap format; increase pg_num directly
8097 assert(n
> (int)p
.get_pg_num());
8098 // force pre-nautilus clients to resend their ops, since they
8099 // don't understand pg_num_target changes form a new interval
8100 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8101 // force pre-luminous clients to resend their ops, since they
8102 // don't understand that split PGs now form a new interval.
8103 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8106 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8107 // make pgp_num track pg_num if it already matches. if it is set
8108 // differently, leave it different and let the user control it
8110 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8111 p
.set_pgp_num_target(n
);
8113 p
.set_pg_num_target(n
);
8115 } else if (var
== "pgp_num_actual") {
8116 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8117 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8120 if (interr
.length()) {
8121 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8125 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8128 if (n
> (int)p
.get_pg_num()) {
8129 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8132 if (n
> (int)p
.get_pg_num_pending()) {
8133 ss
<< "specified pgp_num " << n
8134 << " > pg_num_pending " << p
.get_pg_num_pending();
8138 } else if (var
== "pgp_num") {
8139 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8140 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8143 if (interr
.length()) {
8144 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8148 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8151 if (n
> (int)p
.get_pg_num_target()) {
8152 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8155 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8156 // pre-nautilus osdmap format; increase pgp_num directly
8159 p
.set_pgp_num_target(n
);
8161 } else if (var
== "pg_autoscale_mode") {
8162 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8163 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8164 ss
<< "specified invalid mode " << val
;
8167 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8168 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8171 p
.pg_autoscale_mode
= m
;
8172 } else if (var
== "crush_rule") {
8173 int id
= osdmap
.crush
->get_rule_id(val
);
8174 if (id
== -ENOENT
) {
8175 ss
<< "crush rule " << val
<< " does not exist";
8179 ss
<< cpp_strerror(id
);
8182 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
8186 } else if (var
== "nodelete" || var
== "nopgchange" ||
8187 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8188 var
== "noscrub" || var
== "nodeep-scrub") {
8189 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8190 // make sure we only compare against 'n' if we didn't receive a string
8191 if (val
== "true" || (interr
.empty() && n
== 1)) {
8193 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8196 ss
<< "expecting value 'true', 'false', '0', or '1'";
8199 } else if (var
== "hashpspool") {
8200 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8202 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8205 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8206 " this triggers large data movement,"
8207 " pass --yes-i-really-mean-it if you really do.";
8210 // make sure we only compare against 'n' if we didn't receive a string
8211 if (val
== "true" || (interr
.empty() && n
== 1)) {
8213 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8216 ss
<< "expecting value 'true', 'false', '0', or '1'";
8219 } else if (var
== "hit_set_type") {
8221 p
.hit_set_params
= HitSet::Params();
8223 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8226 if (val
== "bloom") {
8227 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8228 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8229 p
.hit_set_params
= HitSet::Params(bsp
);
8230 } else if (val
== "explicit_hash")
8231 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8232 else if (val
== "explicit_object")
8233 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8235 ss
<< "unrecognized hit_set type '" << val
<< "'";
8239 } else if (var
== "hit_set_period") {
8240 if (interr
.length()) {
8241 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8244 ss
<< "hit_set_period should be non-negative";
8247 p
.hit_set_period
= n
;
8248 } else if (var
== "hit_set_count") {
8249 if (interr
.length()) {
8250 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8253 ss
<< "hit_set_count should be non-negative";
8256 p
.hit_set_count
= n
;
8257 } else if (var
== "hit_set_fpp") {
8258 if (floaterr
.length()) {
8259 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8261 } else if (f
< 0 || f
> 1.0) {
8262 ss
<< "hit_set_fpp should be in the range 0..1";
8265 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8266 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8269 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
8271 } else if (var
== "use_gmt_hitset") {
8272 if (val
== "true" || (interr
.empty() && n
== 1)) {
8273 p
.use_gmt_hitset
= true;
8275 ss
<< "expecting value 'true' or '1'";
8278 } else if (var
== "allow_ec_overwrites") {
8279 if (!p
.is_erasure()) {
8280 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8284 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8285 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8286 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8289 if (val
== "true" || (interr
.empty() && n
== 1)) {
8290 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8291 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8292 ss
<< "ec overwrites cannot be disabled once enabled";
8295 ss
<< "expecting value 'true', 'false', '0', or '1'";
8298 } else if (var
== "target_max_objects") {
8299 if (interr
.length()) {
8300 ss
<< "error parsing int '" << val
<< "': " << interr
;
8303 p
.target_max_objects
= n
;
8304 } else if (var
== "target_max_bytes") {
8305 if (interr
.length()) {
8306 ss
<< "error parsing int '" << val
<< "': " << interr
;
8309 p
.target_max_bytes
= n
;
8310 } else if (var
== "cache_target_dirty_ratio") {
8311 if (floaterr
.length()) {
8312 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8315 if (f
< 0 || f
> 1.0) {
8316 ss
<< "value must be in the range 0..1";
8319 p
.cache_target_dirty_ratio_micro
= uf
;
8320 } else if (var
== "cache_target_dirty_high_ratio") {
8321 if (floaterr
.length()) {
8322 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8325 if (f
< 0 || f
> 1.0) {
8326 ss
<< "value must be in the range 0..1";
8329 p
.cache_target_dirty_high_ratio_micro
= uf
;
8330 } else if (var
== "cache_target_full_ratio") {
8331 if (floaterr
.length()) {
8332 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8335 if (f
< 0 || f
> 1.0) {
8336 ss
<< "value must be in the range 0..1";
8339 p
.cache_target_full_ratio_micro
= uf
;
8340 } else if (var
== "cache_min_flush_age") {
8341 if (interr
.length()) {
8342 ss
<< "error parsing int '" << val
<< "': " << interr
;
8345 p
.cache_min_flush_age
= n
;
8346 } else if (var
== "cache_min_evict_age") {
8347 if (interr
.length()) {
8348 ss
<< "error parsing int '" << val
<< "': " << interr
;
8351 p
.cache_min_evict_age
= n
;
8352 } else if (var
== "min_read_recency_for_promote") {
8353 if (interr
.length()) {
8354 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8357 p
.min_read_recency_for_promote
= n
;
8358 } else if (var
== "hit_set_grade_decay_rate") {
8359 if (interr
.length()) {
8360 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8363 if (n
> 100 || n
< 0) {
8364 ss
<< "value out of range,valid range is 0 - 100";
8367 p
.hit_set_grade_decay_rate
= n
;
8368 } else if (var
== "hit_set_search_last_n") {
8369 if (interr
.length()) {
8370 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8373 if (n
> p
.hit_set_count
|| n
< 0) {
8374 ss
<< "value out of range,valid range is 0 - hit_set_count";
8377 p
.hit_set_search_last_n
= n
;
8378 } else if (var
== "min_write_recency_for_promote") {
8379 if (interr
.length()) {
8380 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8383 p
.min_write_recency_for_promote
= n
;
8384 } else if (var
== "fast_read") {
8385 if (p
.is_replicated()) {
8386 ss
<< "fast read is not supported in replication pool";
8389 if (val
== "true" || (interr
.empty() && n
== 1)) {
8391 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8392 p
.fast_read
= false;
8394 ss
<< "expecting value 'true', 'false', '0', or '1'";
8397 } else if (pool_opts_t::is_opt_name(var
)) {
8398 bool unset
= val
== "unset";
8399 if (var
== "compression_mode") {
8401 auto cmode
= Compressor::get_comp_mode_type(val
);
8403 ss
<< "unrecognized compression mode '" << val
<< "'";
8407 } else if (var
== "compression_algorithm") {
8409 auto alg
= Compressor::get_comp_alg_type(val
);
8411 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8415 } else if (var
== "compression_required_ratio") {
8416 if (floaterr
.length()) {
8417 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8420 if (f
< 0 || f
> 1) {
8421 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8424 } else if (var
== "csum_type") {
8425 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8427 ss
<< "unrecognized csum_type '" << val
<< "'";
8430 //preserve csum_type numeric value
8433 } else if (var
== "compression_max_blob_size" ||
8434 var
== "compression_min_blob_size" ||
8435 var
== "csum_max_block" ||
8436 var
== "csum_min_block") {
8437 if (interr
.length()) {
8438 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8441 } else if (var
== "fingerprint_algorithm") {
8443 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8445 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8449 } else if (var
== "target_size_bytes") {
8450 if (interr
.length()) {
8451 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8454 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8455 ss
<< "must set require_osd_release to nautilus or "
8456 << "later before setting target_size_bytes";
8459 } else if (var
== "pg_num_min") {
8460 if (interr
.length()) {
8461 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8464 if (n
> (int)p
.get_pg_num_target()) {
8465 ss
<< "specified pg_num_min " << n
8466 << " > pg_num " << p
.get_pg_num_target();
8469 } else if (var
== "recovery_priority") {
8470 if (interr
.length()) {
8471 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8474 if (!g_conf()->debug_allow_any_pool_priority
) {
8475 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8476 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8477 << " and " << OSD_POOL_PRIORITY_MAX
;
8481 } else if (var
== "pg_autoscale_bias") {
8482 if (f
< 0.0 || f
> 1000.0) {
8483 ss
<< "pg_autoscale_bias must be between 0 and 1000";
8488 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8489 switch (desc
.type
) {
8490 case pool_opts_t::STR
:
8492 p
.opts
.unset(desc
.key
);
8494 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
8497 case pool_opts_t::INT
:
8498 if (interr
.length()) {
8499 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8503 p
.opts
.unset(desc
.key
);
8505 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
8508 case pool_opts_t::DOUBLE
:
8509 if (floaterr
.length()) {
8510 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8514 p
.opts
.unset(desc
.key
);
8516 p
.opts
.set(desc
.key
, static_cast<double>(f
));
8520 ceph_assert(!"unknown type");
8523 ss
<< "unrecognized variable '" << var
<< "'";
8526 if (val
!= "unset") {
8527 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
8529 ss
<< "unset pool " << pool
<< " " << var
;
8531 p
.last_change
= pending_inc
.epoch
;
8532 pending_inc
.new_pools
[pool
] = p
;
8536 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
8537 const cmdmap_t
& cmdmap
,
8540 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
8543 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
8544 const cmdmap_t
& cmdmap
,
8548 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
8553 * Common logic for preprocess and prepare phases of pool application
8554 * tag commands. In preprocess mode we're only detecting invalid
8555 * commands, and determining whether it was a modification or a no-op.
8556 * In prepare mode we're actually updating the pending state.
8558 int OSDMonitor::_command_pool_application(const string
&prefix
,
8559 const cmdmap_t
& cmdmap
,
8565 cmd_getval(cmdmap
, "pool", pool_name
);
8566 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
8568 ss
<< "unrecognized pool '" << pool_name
<< "'";
8572 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8574 if (pending_inc
.new_pools
.count(pool
)) {
8575 p
= pending_inc
.new_pools
[pool
];
8580 cmd_getval(cmdmap
, "app", app
);
8581 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
8584 cmd_getval(cmdmap
, "key", key
);
8586 ss
<< "key cannot be 'all'";
8591 cmd_getval(cmdmap
, "value", value
);
8592 if (value
== "all") {
8593 ss
<< "value cannot be 'all'";
8597 if (boost::algorithm::ends_with(prefix
, "enable")) {
8599 ss
<< "application name must be provided";
8604 ss
<< "application must be enabled on base tier";
8609 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8611 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
8612 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
8613 << "application; pass --yes-i-really-mean-it to proceed anyway";
8617 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
8618 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
8619 << "max " << MAX_POOL_APPLICATIONS
;
8623 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8624 ss
<< "application name '" << app
<< "' too long; max length "
8625 << MAX_POOL_APPLICATION_LENGTH
;
8630 p
.application_metadata
[app
] = {};
8632 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
8634 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
8636 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8639 ss
<< "Are you SURE? Disabling an application within a pool might result "
8640 << "in loss of application functionality; pass "
8641 << "--yes-i-really-mean-it to proceed anyway";
8646 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8648 return 0; // idempotent
8651 p
.application_metadata
.erase(app
);
8652 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
8654 } else if (boost::algorithm::ends_with(prefix
, "set")) {
8656 ss
<< "application metadata must be set on base tier";
8661 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8667 cmd_getval(cmdmap
, "key", key
);
8670 ss
<< "key must be provided";
8674 auto &app_keys
= p
.application_metadata
[app
];
8675 if (app_keys
.count(key
) == 0 &&
8676 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
8677 ss
<< "too many keys set for application '" << app
<< "' on pool '"
8678 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
8682 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8683 ss
<< "key '" << app
<< "' too long; max length "
8684 << MAX_POOL_APPLICATION_LENGTH
;
8689 cmd_getval(cmdmap
, "value", value
);
8690 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8691 ss
<< "value '" << value
<< "' too long; max length "
8692 << MAX_POOL_APPLICATION_LENGTH
;
8696 p
.application_metadata
[app
][key
] = value
;
8697 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
8698 << value
<< "' on pool '" << pool_name
<< "'";
8699 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
8701 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8707 cmd_getval(cmdmap
, "key", key
);
8708 auto it
= p
.application_metadata
[app
].find(key
);
8709 if (it
== p
.application_metadata
[app
].end()) {
8710 ss
<< "application '" << app
<< "' on pool '" << pool_name
8711 << "' does not have key '" << key
<< "'";
8712 return 0; // idempotent
8715 p
.application_metadata
[app
].erase(it
);
8716 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
8717 << pool_name
<< "'";
8723 p
.last_change
= pending_inc
.epoch
;
8724 pending_inc
.new_pools
[pool
] = p
;
8727 // Because we fell through this far, we didn't hit no-op cases,
8728 // so pool was definitely modified
8729 if (modified
!= nullptr) {
8736 int OSDMonitor::_prepare_command_osd_crush_remove(
8737 CrushWrapper
&newcrush
,
8746 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
8749 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
8754 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
8756 pending_inc
.crush
.clear();
8757 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8760 int OSDMonitor::prepare_command_osd_crush_remove(
8761 CrushWrapper
&newcrush
,
8767 int err
= _prepare_command_osd_crush_remove(
8768 newcrush
, id
, ancestor
,
8769 has_ancestor
, unlink_only
);
8774 ceph_assert(err
== 0);
8775 do_osd_crush_remove(newcrush
);
8780 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
8782 if (osdmap
.is_up(id
)) {
8786 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
8787 pending_inc
.new_uuid
[id
] = uuid_d();
8788 pending_metadata_rm
.insert(id
);
8789 pending_metadata
.erase(id
);
8794 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
8796 ceph_assert(existing_id
);
8799 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
8800 if (!osdmap
.exists(i
) &&
8801 pending_inc
.new_up_client
.count(i
) == 0 &&
8802 (pending_inc
.new_state
.count(i
) == 0 ||
8803 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
8809 if (pending_inc
.new_max_osd
< 0) {
8810 return osdmap
.get_max_osd();
8812 return pending_inc
.new_max_osd
;
8815 void OSDMonitor::do_osd_create(
8818 const string
& device_class
,
8821 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
8822 ceph_assert(new_id
);
8824 // We presume validation has been performed prior to calling this
8825 // function. We assert with prejudice.
8827 int32_t allocated_id
= -1; // declare here so we can jump
8828 int32_t existing_id
= -1;
8829 if (!uuid
.is_zero()) {
8830 existing_id
= osdmap
.identify_osd(uuid
);
8831 if (existing_id
>= 0) {
8832 ceph_assert(id
< 0 || id
== existing_id
);
8833 *new_id
= existing_id
;
8835 } else if (id
>= 0) {
8836 // uuid does not exist, and id has been provided, so just create
8843 // allocate a new id
8844 allocated_id
= _allocate_osd_id(&existing_id
);
8845 dout(10) << __func__
<< " allocated id " << allocated_id
8846 << " existing id " << existing_id
<< dendl
;
8847 if (existing_id
>= 0) {
8848 ceph_assert(existing_id
< osdmap
.get_max_osd());
8849 ceph_assert(allocated_id
< 0);
8850 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
8851 *new_id
= existing_id
;
8852 } else if (allocated_id
>= 0) {
8853 ceph_assert(existing_id
< 0);
8855 if (pending_inc
.new_max_osd
< 0) {
8856 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
8858 ++pending_inc
.new_max_osd
;
8860 *new_id
= pending_inc
.new_max_osd
- 1;
8861 ceph_assert(*new_id
== allocated_id
);
8863 ceph_abort_msg("unexpected condition");
8867 if (device_class
.size()) {
8868 CrushWrapper newcrush
;
8869 _get_pending_crush(newcrush
);
8870 if (newcrush
.get_max_devices() < *new_id
+ 1) {
8871 newcrush
.set_max_devices(*new_id
+ 1);
8873 string name
= string("osd.") + stringify(*new_id
);
8874 if (!newcrush
.item_exists(*new_id
)) {
8875 newcrush
.set_item_name(*new_id
, name
);
8878 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
8880 derr
<< __func__
<< " failed to set " << name
<< " device_class "
8881 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
8883 // non-fatal... this might be a replay and we want to be idempotent.
8885 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
8887 pending_inc
.crush
.clear();
8888 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8891 dout(20) << __func__
<< " no device_class" << dendl
;
8894 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
8895 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
8896 pending_inc
.new_max_osd
= *new_id
+ 1;
8899 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
8900 if (!uuid
.is_zero())
8901 pending_inc
.new_uuid
[*new_id
] = uuid
;
8904 int OSDMonitor::validate_osd_create(
8907 const bool check_osd_exists
,
8908 int32_t* existing_id
,
8912 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
8913 << " check_osd_exists " << check_osd_exists
<< dendl
;
8915 ceph_assert(existing_id
);
8917 if (id
< 0 && uuid
.is_zero()) {
8918 // we have nothing to validate
8921 } else if (uuid
.is_zero()) {
8922 // we have an id but we will ignore it - because that's what
8923 // `osd create` does.
8928 * This function will be used to validate whether we are able to
8929 * create a new osd when the `uuid` is specified.
8931 * It will be used by both `osd create` and `osd new`, as the checks
8932 * are basically the same when it pertains to osd id and uuid validation.
8933 * However, `osd create` presumes an `uuid` is optional, for legacy
8934 * reasons, while `osd new` requires the `uuid` to be provided. This
8935 * means that `osd create` will not be idempotent if an `uuid` is not
8936 * provided, but we will always guarantee the idempotency of `osd new`.
8939 ceph_assert(!uuid
.is_zero());
8940 if (pending_inc
.identify_osd(uuid
) >= 0) {
8941 // osd is about to exist
8945 int32_t i
= osdmap
.identify_osd(uuid
);
8947 // osd already exists
8948 if (id
>= 0 && i
!= id
) {
8949 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
8952 // return a positive errno to distinguish between a blocking error
8953 // and an error we consider to not be a problem (i.e., this would be
8954 // an idempotent operation).
8960 if (pending_inc
.new_state
.count(id
)) {
8961 // osd is about to exist
8964 // we may not care if an osd exists if we are recreating a previously
8966 if (check_osd_exists
&& osdmap
.exists(id
)) {
8967 ss
<< "id " << id
<< " already in use and does not match uuid "
8975 int OSDMonitor::prepare_command_osd_create(
8978 int32_t* existing_id
,
8981 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
8982 ceph_assert(existing_id
);
8983 if (osdmap
.is_destroyed(id
)) {
8984 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
8989 if (uuid
.is_zero()) {
8990 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
8993 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
8996 int OSDMonitor::prepare_command_osd_new(
8998 const cmdmap_t
& cmdmap
,
8999 const map
<string
,string
>& params
,
9007 ceph_assert(paxos
->is_plugged());
9009 dout(10) << __func__
<< " " << op
<< dendl
;
9011 /* validate command. abort now if something's wrong. */
9013 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9015 * If `id` is not specified, we will identify any existing osd based
9016 * on `uuid`. Operation will be idempotent iff secrets match.
9018 * If `id` is specified, we will identify any existing osd based on
9019 * `uuid` and match against `id`. If they match, operation will be
9020 * idempotent iff secrets match.
9022 * `-i secrets.json` will be optional. If supplied, will be used
9023 * to check for idempotency when `id` and `uuid` match.
9025 * If `id` is not specified, and `uuid` does not exist, an id will
9026 * be found or allocated for the osd.
9028 * If `id` is specified, and the osd has been previously marked
9029 * as destroyed, then the `id` will be reused.
9031 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
9032 ss
<< "requires the OSD's UUID to be specified.";
9034 } else if (!uuid
.parse(uuidstr
.c_str())) {
9035 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9039 if (cmd_getval(cmdmap
, "id", id
) &&
9041 ss
<< "invalid OSD id; must be greater or equal than zero.";
9045 // are we running an `osd create`-like command, or recreating
9046 // a previously destroyed osd?
9048 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9050 // we will care about `id` to assess whether osd is `destroyed`, or
9051 // to create a new osd.
9052 // we will need an `id` by the time we reach auth.
9054 int32_t existing_id
= -1;
9055 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9058 bool may_be_idempotent
= false;
9059 if (err
== EEXIST
) {
9060 // this is idempotent from the osdmon's point-of-view
9061 may_be_idempotent
= true;
9062 ceph_assert(existing_id
>= 0);
9064 } else if (err
< 0) {
9068 if (!may_be_idempotent
) {
9069 // idempotency is out of the window. We are either creating a new
9070 // osd or recreating a destroyed osd.
9072 // We now need to figure out if we have an `id` (and if it's valid),
9073 // of find an `id` if we don't have one.
9075 // NOTE: we need to consider the case where the `id` is specified for
9076 // `osd create`, and we must honor it. So this means checking if
9077 // the `id` is destroyed, and if so assume the destroy; otherwise,
9078 // check if it `exists` - in which case we complain about not being
9079 // `destroyed`. In the end, if nothing fails, we must allow the
9080 // creation, so that we are compatible with `create`.
9081 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9082 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9083 ss
<< "OSD " << id
<< " has not yet been destroyed";
9085 } else if (id
< 0) {
9087 id
= _allocate_osd_id(&existing_id
);
9089 ceph_assert(existing_id
>= 0);
9092 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9093 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9094 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9096 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9099 ceph_assert(id
>= 0);
9100 ceph_assert(osdmap
.exists(id
));
9103 // we are now able to either create a brand new osd or reuse an existing
9104 // osd that has been previously destroyed.
9106 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9108 if (may_be_idempotent
&& params
.empty()) {
9109 // nothing to do, really.
9110 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9111 ceph_assert(id
>= 0);
9113 f
->open_object_section("created_osd");
9114 f
->dump_int("osdid", id
);
9122 string device_class
;
9123 auto p
= params
.find("crush_device_class");
9124 if (p
!= params
.end()) {
9125 device_class
= p
->second
;
9126 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9128 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9129 bool has_lockbox
= false;
9130 bool has_secrets
= params
.count("cephx_secret")
9131 || params
.count("cephx_lockbox_secret")
9132 || params
.count("dmcrypt_key");
9134 ConfigKeyService
*svc
= nullptr;
9135 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9138 if (params
.count("cephx_secret") == 0) {
9139 ss
<< "requires a cephx secret.";
9142 cephx_secret
= params
.at("cephx_secret");
9144 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9145 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9147 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9148 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9150 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9152 lockbox_secret
= params
.at("cephx_lockbox_secret");
9153 dmcrypt_key
= params
.at("dmcrypt_key");
9154 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9155 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9159 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9161 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
9169 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9170 // for this to be idempotent, `id` should already be >= 0; no need
9171 // to use validate_id.
9172 ceph_assert(id
>= 0);
9173 ss
<< "osd." << id
<< " exists but secrets do not match";
9178 svc
= (ConfigKeyService
*)mon
->config_key_service
;
9179 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9182 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9183 ceph_assert(id
>= 0);
9184 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9189 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9190 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9192 if (may_be_idempotent
) {
9193 // we have nothing to do for either the osdmon or the authmon,
9194 // and we have no lockbox - so the config key service will not be
9195 // touched. This is therefore an idempotent operation, and we can
9196 // just return right away.
9197 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9198 ceph_assert(id
>= 0);
9200 f
->open_object_section("created_osd");
9201 f
->dump_int("osdid", id
);
9208 ceph_assert(!may_be_idempotent
);
9212 ceph_assert(!cephx_secret
.empty());
9213 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9214 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9216 err
= mon
->authmon()->do_osd_new(cephx_entity
,
9219 ceph_assert(0 == err
);
9222 ceph_assert(nullptr != svc
);
9223 svc
->do_osd_new(uuid
, dmcrypt_key
);
9227 if (is_recreate_destroyed
) {
9228 ceph_assert(id
>= 0);
9229 ceph_assert(osdmap
.is_destroyed(id
));
9230 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
9231 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9232 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9233 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9235 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9236 // due to http://tracker.ceph.com/issues/20751 some clusters may
9237 // have UP set for non-existent OSDs; make sure it is cleared
9238 // for a newly created osd.
9239 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9241 pending_inc
.new_uuid
[id
] = uuid
;
9243 ceph_assert(id
>= 0);
9244 int32_t new_id
= -1;
9245 do_osd_create(id
, uuid
, device_class
, &new_id
);
9246 ceph_assert(new_id
>= 0);
9247 ceph_assert(id
== new_id
);
9251 f
->open_object_section("created_osd");
9252 f
->dump_int("osdid", id
);
9261 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9263 op
->mark_osdmon_event(__func__
);
9264 auto m
= op
->get_req
<MMonCommand
>();
9267 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9268 string rs
= ss
.str();
9269 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
9273 MonSession
*session
= op
->get_session();
9275 derr
<< __func__
<< " no session" << dendl
;
9276 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
9280 return prepare_command_impl(op
, cmdmap
);
9283 static int parse_reweights(CephContext
*cct
,
9284 const cmdmap_t
& cmdmap
,
9285 const OSDMap
& osdmap
,
9286 map
<int32_t, uint32_t>* weights
)
9289 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9292 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9293 json_spirit::mValue json_value
;
9294 if (!json_spirit::read(weights_str
, json_value
)) {
9297 if (json_value
.type() != json_spirit::obj_type
) {
9300 const auto obj
= json_value
.get_obj();
9302 for (auto& osd_weight
: obj
) {
9303 auto osd_id
= std::stoi(osd_weight
.first
);
9304 if (!osdmap
.exists(osd_id
)) {
9307 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9310 auto weight
= std::stoul(osd_weight
.second
.get_str());
9311 weights
->insert({osd_id
, weight
});
9313 } catch (const std::logic_error
& e
) {
9319 int OSDMonitor::prepare_command_osd_destroy(
9323 ceph_assert(paxos
->is_plugged());
9325 // we check if the osd exists for the benefit of `osd purge`, which may
9326 // have previously removed the osd. If the osd does not exist, return
9327 // -ENOENT to convey this, and let the caller deal with it.
9329 // we presume that all auth secrets and config keys were removed prior
9330 // to this command being called. if they exist by now, we also assume
9331 // they must have been created by some other command and do not pertain
9332 // to this non-existent osd.
9333 if (!osdmap
.exists(id
)) {
9334 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9338 uuid_d uuid
= osdmap
.get_uuid(id
);
9339 dout(10) << __func__
<< " destroying osd." << id
9340 << " uuid " << uuid
<< dendl
;
9342 // if it has been destroyed, we assume our work here is done.
9343 if (osdmap
.is_destroyed(id
)) {
9344 ss
<< "destroyed osd." << id
;
9348 EntityName cephx_entity
, lockbox_entity
;
9349 bool idempotent_auth
= false, idempotent_cks
= false;
9351 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
9356 if (err
== -ENOENT
) {
9357 idempotent_auth
= true;
9363 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
9364 err
= svc
->validate_osd_destroy(id
, uuid
);
9366 ceph_assert(err
== -ENOENT
);
9368 idempotent_cks
= true;
9371 if (!idempotent_auth
) {
9372 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9373 ceph_assert(0 == err
);
9376 if (!idempotent_cks
) {
9377 svc
->do_osd_destroy(id
, uuid
);
9380 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9381 pending_inc
.new_uuid
[id
] = uuid_d();
9383 // we can only propose_pending() once per service, otherwise we'll be
9384 // defying PaxosService and all laws of nature. Therefore, as we may
9385 // be used during 'osd purge', let's keep the caller responsible for
9387 ceph_assert(err
== 0);
9391 int OSDMonitor::prepare_command_osd_purge(
9395 ceph_assert(paxos
->is_plugged());
9396 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9398 ceph_assert(!osdmap
.is_up(id
));
9401 * This may look a bit weird, but this is what's going to happen:
9403 * 1. we make sure that removing from crush works
9404 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9405 * error, then we abort the whole operation, as no updates
9406 * have been made. However, we this function will have
9407 * side-effects, thus we need to make sure that all operations
9408 * performed henceforth will *always* succeed.
9409 * 3. we call `prepare_command_osd_remove()`. Although this
9410 * function can return an error, it currently only checks if the
9411 * osd is up - and we have made sure that it is not so, so there
9412 * is no conflict, and it is effectively an update.
9413 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9414 * the crush update we delayed from before.
9417 CrushWrapper newcrush
;
9418 _get_pending_crush(newcrush
);
9420 bool may_be_idempotent
= false;
9422 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9423 if (err
== -ENOENT
) {
9425 may_be_idempotent
= true;
9426 } else if (err
< 0) {
9427 ss
<< "error removing osd." << id
<< " from crush";
9431 // no point destroying the osd again if it has already been marked destroyed
9432 if (!osdmap
.is_destroyed(id
)) {
9433 err
= prepare_command_osd_destroy(id
, ss
);
9435 if (err
== -ENOENT
) {
9441 may_be_idempotent
= false;
9444 ceph_assert(0 == err
);
9446 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9447 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9448 << "we are idempotent." << dendl
;
9452 err
= prepare_command_osd_remove(id
);
9453 // we should not be busy, as we should have made sure this id is not up.
9454 ceph_assert(0 == err
);
9456 do_osd_crush_remove(newcrush
);
9460 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9461 const cmdmap_t
& cmdmap
)
9463 op
->mark_osdmon_event(__func__
);
9464 auto m
= op
->get_req
<MMonCommand
>();
9472 cmd_getval(cmdmap
, "format", format
, string("plain"));
9473 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
9476 cmd_getval(cmdmap
, "prefix", prefix
);
9480 bool osdid_present
= false;
9481 if (prefix
!= "osd pg-temp" &&
9482 prefix
!= "osd pg-upmap" &&
9483 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
9484 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
9486 if (osdid_present
) {
9488 oss
<< "osd." << osdid
;
9489 osd_name
= oss
.str();
9492 // Even if there's a pending state with changes that could affect
9493 // a command, considering that said state isn't yet committed, we
9494 // just don't care about those changes if the command currently being
9495 // handled acts as a no-op against the current committed state.
9496 // In a nutshell, we assume this command happens *before*.
9498 // Let me make this clearer:
9500 // - If we have only one client, and that client issues some
9501 // operation that would conflict with this operation but is
9502 // still on the pending state, then we would be sure that said
9503 // operation wouldn't have returned yet, so the client wouldn't
9504 // issue this operation (unless the client didn't wait for the
9505 // operation to finish, and that would be the client's own fault).
9507 // - If we have more than one client, each client will observe
9508 // whatever is the state at the moment of the commit. So, if we
9509 // have two clients, one issuing an unlink and another issuing a
9510 // link, and if the link happens while the unlink is still on the
9511 // pending state, from the link's point-of-view this is a no-op.
9512 // If different clients are issuing conflicting operations and
9513 // they care about that, then the clients should make sure they
9514 // enforce some kind of concurrency mechanism -- from our
9515 // perspective that's what Douglas Adams would call an SEP.
9517 // This should be used as a general guideline for most commands handled
9518 // in this function. Adapt as you see fit, but please bear in mind that
9519 // this is the expected behavior.
9522 if (prefix
== "osd setcrushmap" ||
9523 (prefix
== "osd crush set" && !osdid_present
)) {
9524 if (pending_inc
.crush
.length()) {
9525 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
9526 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9529 dout(10) << "prepare_command setting new crush map" << dendl
;
9530 bufferlist
data(m
->get_data());
9533 auto bl
= data
.cbegin();
9536 catch (const std::exception
&e
) {
9538 ss
<< "Failed to parse crushmap: " << e
.what();
9542 int64_t prior_version
= 0;
9543 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
9544 if (prior_version
== osdmap
.get_crush_version() - 1) {
9545 // see if we are a resend of the last update. this is imperfect
9546 // (multiple racing updaters may not both get reliable success)
9547 // but we expect crush updaters (via this interface) to be rare-ish.
9548 bufferlist current
, proposed
;
9549 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
9550 crush
.encode(proposed
, mon
->get_quorum_con_features());
9551 if (current
.contents_equal(proposed
)) {
9552 dout(10) << __func__
9553 << " proposed matches current and version equals previous"
9556 ss
<< osdmap
.get_crush_version();
9560 if (prior_version
!= osdmap
.get_crush_version()) {
9562 ss
<< "prior_version " << prior_version
<< " != crush version "
9563 << osdmap
.get_crush_version();
9568 if (crush
.has_legacy_rule_ids()) {
9570 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
9573 if (!validate_crush_against_features(&crush
, ss
)) {
9578 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
9583 if (g_conf()->mon_osd_crush_smoke_test
) {
9584 // sanity check: test some inputs to make sure this map isn't
9586 dout(10) << " testing map" << dendl
;
9588 CrushTester
tester(crush
, ess
);
9589 tester
.set_min_x(0);
9590 tester
.set_max_x(50);
9591 auto start
= ceph::coarse_mono_clock::now();
9592 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
9593 auto duration
= ceph::coarse_mono_clock::now() - start
;
9595 dout(10) << " tester.test_with_fork returns " << r
9596 << ": " << ess
.str() << dendl
;
9597 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
9601 dout(10) << __func__
<< " crush somke test duration: "
9602 << duration
<< ", result: " << ess
.str() << dendl
;
9605 pending_inc
.crush
= data
;
9606 ss
<< osdmap
.get_crush_version() + 1;
9609 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
9610 CrushWrapper newcrush
;
9611 _get_pending_crush(newcrush
);
9612 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
9614 if (newcrush
.bucket_exists(bid
) &&
9615 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
9616 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
9617 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
9620 if (!validate_crush_against_features(&newcrush
, ss
)) {
9624 pending_inc
.crush
.clear();
9625 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9626 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9627 get_last_committed() + 1));
9629 } else if (prefix
== "osd crush set-device-class") {
9630 string device_class
;
9631 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9632 err
= -EINVAL
; // no value!
9637 vector
<string
> idvec
;
9638 cmd_getval(cmdmap
, "ids", idvec
);
9639 CrushWrapper newcrush
;
9640 _get_pending_crush(newcrush
);
9642 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9646 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9647 osdmap
.get_all_osds(osds
);
9650 // try traditional single osd way
9651 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9653 // ss has reason for failure
9654 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9661 for (auto &osd
: osds
) {
9662 if (!osdmap
.exists(osd
)) {
9663 ss
<< "osd." << osd
<< " does not exist. ";
9668 oss
<< "osd." << osd
;
9669 string name
= oss
.str();
9671 if (newcrush
.get_max_devices() < osd
+ 1) {
9672 newcrush
.set_max_devices(osd
+ 1);
9675 if (newcrush
.item_exists(osd
)) {
9676 action
= "updating";
9678 action
= "creating";
9679 newcrush
.set_item_name(osd
, name
);
9682 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
9683 << "' device_class '" << device_class
<< "'"
9685 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
9689 if (err
== 0 && !_have_pending_crush()) {
9691 // for single osd only, wildcard makes too much noise
9692 ss
<< "set-device-class item id " << osd
<< " name '" << name
9693 << "' device_class '" << device_class
<< "': no change. ";
9696 updated
.insert(osd
);
9701 if (!updated
.empty()) {
9702 pending_inc
.crush
.clear();
9703 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9704 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
9706 wait_for_finished_proposal(op
,
9707 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9711 } else if (prefix
== "osd crush rm-device-class") {
9713 vector
<string
> idvec
;
9714 cmd_getval(cmdmap
, "ids", idvec
);
9715 CrushWrapper newcrush
;
9716 _get_pending_crush(newcrush
);
9719 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9724 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9725 osdmap
.get_all_osds(osds
);
9728 // try traditional single osd way
9729 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9731 // ss has reason for failure
9732 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9739 for (auto &osd
: osds
) {
9740 if (!osdmap
.exists(osd
)) {
9741 ss
<< "osd." << osd
<< " does not exist. ";
9745 auto class_name
= newcrush
.get_item_class(osd
);
9747 ss
<< "osd." << osd
<< " belongs to no class, ";
9750 // note that we do not verify if class_is_in_use here
9751 // in case the device is misclassified and user wants
9752 // to overridely reset...
9754 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
9756 // ss has reason for failure
9759 updated
.insert(osd
);
9763 if (!updated
.empty()) {
9764 pending_inc
.crush
.clear();
9765 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9766 ss
<< "done removing class of osd(s): " << updated
;
9768 wait_for_finished_proposal(op
,
9769 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9772 } else if (prefix
== "osd crush class create") {
9773 string device_class
;
9774 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9775 err
= -EINVAL
; // no value!
9778 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
9779 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9780 << "luminous' before using crush device classes";
9784 if (!_have_pending_crush() &&
9785 _get_stable_crush().class_exists(device_class
)) {
9786 ss
<< "class '" << device_class
<< "' already exists";
9789 CrushWrapper newcrush
;
9790 _get_pending_crush(newcrush
);
9791 if (newcrush
.class_exists(device_class
)) {
9792 ss
<< "class '" << device_class
<< "' already exists";
9795 int class_id
= newcrush
.get_or_create_class_id(device_class
);
9796 pending_inc
.crush
.clear();
9797 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9798 ss
<< "created class " << device_class
<< " with id " << class_id
9801 } else if (prefix
== "osd crush class rm") {
9802 string device_class
;
9803 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9804 err
= -EINVAL
; // no value!
9807 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
9808 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9809 << "luminous' before using crush device classes";
9814 if (!osdmap
.crush
->class_exists(device_class
)) {
9819 CrushWrapper newcrush
;
9820 _get_pending_crush(newcrush
);
9821 if (!newcrush
.class_exists(device_class
)) {
9822 err
= 0; // make command idempotent
9825 int class_id
= newcrush
.get_class_id(device_class
);
9827 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
9829 ss
<< "class '" << device_class
<< "' " << ts
.str();
9833 // check if class is used by any erasure-code-profiles
9834 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
9835 osdmap
.get_erasure_code_profiles();
9836 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
9837 #ifdef HAVE_STDLIB_MAP_SPLICING
9838 ec_profiles
.merge(old_ec_profiles
);
9840 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
9841 make_move_iterator(end(old_ec_profiles
)));
9843 list
<string
> referenced_by
;
9844 for (auto &i
: ec_profiles
) {
9845 for (auto &j
: i
.second
) {
9846 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
9847 referenced_by
.push_back(i
.first
);
9851 if (!referenced_by
.empty()) {
9853 ss
<< "class '" << device_class
9854 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
9859 newcrush
.get_devices_by_class(device_class
, &osds
);
9860 for (auto& p
: osds
) {
9861 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
9863 // ss has reason for failure
9869 // empty class, remove directly
9870 err
= newcrush
.remove_class_name(device_class
);
9872 ss
<< "class '" << device_class
<< "' cannot be removed '"
9873 << cpp_strerror(err
) << "'";
9878 pending_inc
.crush
.clear();
9879 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9880 ss
<< "removed class " << device_class
<< " with id " << class_id
9881 << " from crush map";
9883 } else if (prefix
== "osd crush class rename") {
9884 string srcname
, dstname
;
9885 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
9889 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
9894 CrushWrapper newcrush
;
9895 _get_pending_crush(newcrush
);
9896 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
9897 // suppose this is a replay and return success
9898 // so command is idempotent
9899 ss
<< "already renamed to '" << dstname
<< "'";
9904 err
= newcrush
.rename_class(srcname
, dstname
);
9906 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
9907 << cpp_strerror(err
);
9911 pending_inc
.crush
.clear();
9912 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9913 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
9915 } else if (prefix
== "osd crush add-bucket") {
9916 // os crush add-bucket <name> <type>
9917 string name
, typestr
;
9918 vector
<string
> argvec
;
9919 cmd_getval(cmdmap
, "name", name
);
9920 cmd_getval(cmdmap
, "type", typestr
);
9921 cmd_getval(cmdmap
, "args", argvec
);
9922 map
<string
,string
> loc
;
9923 if (!argvec
.empty()) {
9924 CrushWrapper::parse_loc_map(argvec
, &loc
);
9925 dout(0) << "will create and move bucket '" << name
9926 << "' to location " << loc
<< dendl
;
9929 if (!_have_pending_crush() &&
9930 _get_stable_crush().name_exists(name
)) {
9931 ss
<< "bucket '" << name
<< "' already exists";
9935 CrushWrapper newcrush
;
9936 _get_pending_crush(newcrush
);
9938 if (newcrush
.name_exists(name
)) {
9939 ss
<< "bucket '" << name
<< "' already exists";
9942 int type
= newcrush
.get_type_id(typestr
);
9944 ss
<< "type '" << typestr
<< "' does not exist";
9949 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
9954 err
= newcrush
.add_bucket(0, 0,
9955 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
9958 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
9961 err
= newcrush
.set_item_name(bucketno
, name
);
9963 ss
<< "error setting bucket name to '" << name
<< "'";
9968 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
9970 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
9972 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
9976 ss
<< "no need to move item id " << bucketno
<< " name '" << name
9977 << "' to location " << loc
<< " in crush map";
9981 pending_inc
.crush
.clear();
9982 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9984 ss
<< "added bucket " << name
<< " type " << typestr
9987 ss
<< "added bucket " << name
<< " type " << typestr
9988 << " to location " << loc
;
9991 } else if (prefix
== "osd crush rename-bucket") {
9992 string srcname
, dstname
;
9993 cmd_getval(cmdmap
, "srcname", srcname
);
9994 cmd_getval(cmdmap
, "dstname", dstname
);
9996 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
9997 if (err
== -EALREADY
) // equivalent to success for idempotency
10003 } else if (prefix
== "osd crush weight-set create" ||
10004 prefix
== "osd crush weight-set create-compat") {
10005 CrushWrapper newcrush
;
10006 _get_pending_crush(newcrush
);
10009 if (newcrush
.has_non_straw2_buckets()) {
10010 ss
<< "crush map contains one or more bucket(s) that are not straw2";
10014 if (prefix
== "osd crush weight-set create") {
10015 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
10016 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
10017 ss
<< "require_min_compat_client "
10018 << osdmap
.require_min_compat_client
10019 << " < luminous, which is required for per-pool weight-sets. "
10020 << "Try 'ceph osd set-require-min-compat-client luminous' "
10021 << "before using the new interface";
10025 string poolname
, mode
;
10026 cmd_getval(cmdmap
, "pool", poolname
);
10027 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10029 ss
<< "pool '" << poolname
<< "' not found";
10033 cmd_getval(cmdmap
, "mode", mode
);
10034 if (mode
!= "flat" && mode
!= "positional") {
10035 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10039 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10041 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10044 if (!newcrush
.create_choose_args(pool
, positions
)) {
10045 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10046 ss
<< "compat weight-set already created";
10048 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10049 << "' already created";
10053 pending_inc
.crush
.clear();
10054 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10057 } else if (prefix
== "osd crush weight-set rm" ||
10058 prefix
== "osd crush weight-set rm-compat") {
10059 CrushWrapper newcrush
;
10060 _get_pending_crush(newcrush
);
10062 if (prefix
== "osd crush weight-set rm") {
10064 cmd_getval(cmdmap
, "pool", poolname
);
10065 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10067 ss
<< "pool '" << poolname
<< "' not found";
10072 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10074 newcrush
.rm_choose_args(pool
);
10075 pending_inc
.crush
.clear();
10076 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10079 } else if (prefix
== "osd crush weight-set reweight" ||
10080 prefix
== "osd crush weight-set reweight-compat") {
10081 string poolname
, item
;
10082 vector
<double> weight
;
10083 cmd_getval(cmdmap
, "pool", poolname
);
10084 cmd_getval(cmdmap
, "item", item
);
10085 cmd_getval(cmdmap
, "weight", weight
);
10086 CrushWrapper newcrush
;
10087 _get_pending_crush(newcrush
);
10089 if (prefix
== "osd crush weight-set reweight") {
10090 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10092 ss
<< "pool '" << poolname
<< "' not found";
10096 if (!newcrush
.have_choose_args(pool
)) {
10097 ss
<< "no weight-set for pool '" << poolname
<< "'";
10101 auto arg_map
= newcrush
.choose_args_get(pool
);
10102 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10103 if (weight
.size() != (size_t)positions
) {
10104 ss
<< "must specify exact " << positions
<< " weight values";
10109 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10110 if (!newcrush
.have_choose_args(pool
)) {
10111 ss
<< "no backward-compatible weight-set";
10116 if (!newcrush
.name_exists(item
)) {
10117 ss
<< "item '" << item
<< "' does not exist";
10121 err
= newcrush
.choose_args_adjust_item_weightf(
10123 newcrush
.choose_args_get(pool
),
10124 newcrush
.get_item_id(item
),
10131 pending_inc
.crush
.clear();
10132 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10134 } else if (osdid_present
&&
10135 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10136 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10137 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10138 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10140 if (!osdmap
.exists(osdid
)) {
10143 << " does not exist. Create it before updating the crush map";
10148 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10149 ss
<< "unable to parse weight value '"
10150 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10156 vector
<string
> argvec
;
10157 cmd_getval(cmdmap
, "args", argvec
);
10158 map
<string
,string
> loc
;
10159 CrushWrapper::parse_loc_map(argvec
, &loc
);
10161 if (prefix
== "osd crush set"
10162 && !_get_stable_crush().item_exists(osdid
)) {
10164 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10165 << "' weight " << weight
<< " at location " << loc
10166 << ": does not exist";
10170 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10171 << osd_name
<< "' weight " << weight
<< " at location "
10173 CrushWrapper newcrush
;
10174 _get_pending_crush(newcrush
);
10177 if (prefix
== "osd crush set" ||
10178 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10180 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10183 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10191 if (err
== 0 && !_have_pending_crush()) {
10192 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10193 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10197 pending_inc
.crush
.clear();
10198 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10199 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10200 << weight
<< " at location " << loc
<< " to crush map";
10202 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10203 get_last_committed() + 1));
10206 } else if (prefix
== "osd crush create-or-move") {
10208 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10209 if (!osdmap
.exists(osdid
)) {
10212 << " does not exist. create it before updating the crush map";
10217 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10218 ss
<< "unable to parse weight value '"
10219 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10225 vector
<string
> argvec
;
10226 cmd_getval(cmdmap
, "args", argvec
);
10227 map
<string
,string
> loc
;
10228 CrushWrapper::parse_loc_map(argvec
, &loc
);
10230 dout(0) << "create-or-move crush item name '" << osd_name
10231 << "' initial_weight " << weight
<< " at location " << loc
10234 CrushWrapper newcrush
;
10235 _get_pending_crush(newcrush
);
10237 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10238 g_conf()->osd_crush_update_weight_set
);
10240 ss
<< "create-or-move updated item name '" << osd_name
10241 << "' weight " << weight
10242 << " at location " << loc
<< " to crush map";
10246 pending_inc
.crush
.clear();
10247 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10248 ss
<< "create-or-move updating item name '" << osd_name
10249 << "' weight " << weight
10250 << " at location " << loc
<< " to crush map";
10252 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10253 get_last_committed() + 1));
10258 } else if (prefix
== "osd crush move") {
10260 // osd crush move <name> <loc1> [<loc2> ...]
10262 vector
<string
> argvec
;
10263 cmd_getval(cmdmap
, "name", name
);
10264 cmd_getval(cmdmap
, "args", argvec
);
10265 map
<string
,string
> loc
;
10266 CrushWrapper::parse_loc_map(argvec
, &loc
);
10268 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10269 CrushWrapper newcrush
;
10270 _get_pending_crush(newcrush
);
10272 if (!newcrush
.name_exists(name
)) {
10274 ss
<< "item " << name
<< " does not exist";
10277 int id
= newcrush
.get_item_id(name
);
10279 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10281 err
= newcrush
.create_or_move_item(
10282 cct
, id
, 0, name
, loc
,
10283 g_conf()->osd_crush_update_weight_set
);
10285 err
= newcrush
.move_bucket(cct
, id
, loc
);
10288 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10289 pending_inc
.crush
.clear();
10290 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10292 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10293 get_last_committed() + 1));
10297 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10301 } else if (prefix
== "osd crush swap-bucket") {
10302 string source
, dest
;
10303 cmd_getval(cmdmap
, "source", source
);
10304 cmd_getval(cmdmap
, "dest", dest
);
10306 bool force
= false;
10307 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10309 CrushWrapper newcrush
;
10310 _get_pending_crush(newcrush
);
10311 if (!newcrush
.name_exists(source
)) {
10312 ss
<< "source item " << source
<< " does not exist";
10316 if (!newcrush
.name_exists(dest
)) {
10317 ss
<< "dest item " << dest
<< " does not exist";
10321 int sid
= newcrush
.get_item_id(source
);
10322 int did
= newcrush
.get_item_id(dest
);
10324 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10325 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10329 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10331 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10332 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10333 << "; pass --yes-i-really-mean-it to proceed anyway";
10337 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10339 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10343 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10344 pending_inc
.crush
.clear();
10345 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10346 wait_for_finished_proposal(op
,
10347 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10348 get_last_committed() + 1));
10350 } else if (prefix
== "osd crush link") {
10351 // osd crush link <name> <loc1> [<loc2> ...]
10353 cmd_getval(cmdmap
, "name", name
);
10354 vector
<string
> argvec
;
10355 cmd_getval(cmdmap
, "args", argvec
);
10356 map
<string
,string
> loc
;
10357 CrushWrapper::parse_loc_map(argvec
, &loc
);
10359 // Need an explicit check for name_exists because get_item_id returns
10361 int id
= osdmap
.crush
->get_item_id(name
);
10362 if (!osdmap
.crush
->name_exists(name
)) {
10364 ss
<< "item " << name
<< " does not exist";
10367 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10369 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10370 ss
<< "no need to move item id " << id
<< " name '" << name
10371 << "' to location " << loc
<< " in crush map";
10376 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10377 CrushWrapper newcrush
;
10378 _get_pending_crush(newcrush
);
10380 if (!newcrush
.name_exists(name
)) {
10382 ss
<< "item " << name
<< " does not exist";
10385 int id
= newcrush
.get_item_id(name
);
10386 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10387 err
= newcrush
.link_bucket(cct
, id
, loc
);
10389 ss
<< "linked item id " << id
<< " name '" << name
10390 << "' to location " << loc
<< " in crush map";
10391 pending_inc
.crush
.clear();
10392 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10394 ss
<< "cannot link item id " << id
<< " name '" << name
10395 << "' to location " << loc
;
10399 ss
<< "no need to move item id " << id
<< " name '" << name
10400 << "' to location " << loc
<< " in crush map";
10404 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10405 get_last_committed() + 1));
10407 } else if (prefix
== "osd crush rm" ||
10408 prefix
== "osd crush remove" ||
10409 prefix
== "osd crush unlink") {
10411 // osd crush rm <id> [ancestor]
10412 CrushWrapper newcrush
;
10413 _get_pending_crush(newcrush
);
10416 cmd_getval(cmdmap
, "name", name
);
10418 if (!osdmap
.crush
->name_exists(name
)) {
10420 ss
<< "device '" << name
<< "' does not appear in the crush map";
10423 if (!newcrush
.name_exists(name
)) {
10425 ss
<< "device '" << name
<< "' does not appear in the crush map";
10427 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10428 get_last_committed() + 1));
10431 int id
= newcrush
.get_item_id(name
);
10434 bool unlink_only
= prefix
== "osd crush unlink";
10435 string ancestor_str
;
10436 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10437 if (!newcrush
.name_exists(ancestor_str
)) {
10439 ss
<< "ancestor item '" << ancestor_str
10440 << "' does not appear in the crush map";
10443 ancestor
= newcrush
.get_item_id(ancestor_str
);
10446 err
= prepare_command_osd_crush_remove(
10449 (ancestor
< 0), unlink_only
);
10451 if (err
== -ENOENT
) {
10452 ss
<< "item " << id
<< " does not appear in that position";
10458 pending_inc
.new_crush_node_flags
[id
] = 0;
10459 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10461 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10462 get_last_committed() + 1));
10467 } else if (prefix
== "osd crush reweight-all") {
10468 CrushWrapper newcrush
;
10469 _get_pending_crush(newcrush
);
10471 newcrush
.reweight(cct
);
10472 pending_inc
.crush
.clear();
10473 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10474 ss
<< "reweighted crush hierarchy";
10476 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10477 get_last_committed() + 1));
10479 } else if (prefix
== "osd crush reweight") {
10480 // osd crush reweight <name> <weight>
10481 CrushWrapper newcrush
;
10482 _get_pending_crush(newcrush
);
10485 cmd_getval(cmdmap
, "name", name
);
10486 if (!newcrush
.name_exists(name
)) {
10488 ss
<< "device '" << name
<< "' does not appear in the crush map";
10492 int id
= newcrush
.get_item_id(name
);
10494 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
10499 if (!cmd_getval(cmdmap
, "weight", w
)) {
10500 ss
<< "unable to parse weight value '"
10501 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10506 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
10507 g_conf()->osd_crush_update_weight_set
);
10510 pending_inc
.crush
.clear();
10511 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10512 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
10513 << " in crush map";
10515 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10516 get_last_committed() + 1));
10518 } else if (prefix
== "osd crush reweight-subtree") {
10519 // osd crush reweight <name> <weight>
10520 CrushWrapper newcrush
;
10521 _get_pending_crush(newcrush
);
10524 cmd_getval(cmdmap
, "name", name
);
10525 if (!newcrush
.name_exists(name
)) {
10527 ss
<< "device '" << name
<< "' does not appear in the crush map";
10531 int id
= newcrush
.get_item_id(name
);
10533 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
10538 if (!cmd_getval(cmdmap
, "weight", w
)) {
10539 ss
<< "unable to parse weight value '"
10540 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10545 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
10546 g_conf()->osd_crush_update_weight_set
);
10549 pending_inc
.crush
.clear();
10550 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10551 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
10552 << " in crush map";
10554 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10555 get_last_committed() + 1));
10557 } else if (prefix
== "osd crush tunables") {
10558 CrushWrapper newcrush
;
10559 _get_pending_crush(newcrush
);
10563 cmd_getval(cmdmap
, "profile", profile
);
10564 if (profile
== "legacy" || profile
== "argonaut") {
10565 newcrush
.set_tunables_legacy();
10566 } else if (profile
== "bobtail") {
10567 newcrush
.set_tunables_bobtail();
10568 } else if (profile
== "firefly") {
10569 newcrush
.set_tunables_firefly();
10570 } else if (profile
== "hammer") {
10571 newcrush
.set_tunables_hammer();
10572 } else if (profile
== "jewel") {
10573 newcrush
.set_tunables_jewel();
10574 } else if (profile
== "optimal") {
10575 newcrush
.set_tunables_optimal();
10576 } else if (profile
== "default") {
10577 newcrush
.set_tunables_default();
10579 ss
<< "unrecognized profile '" << profile
<< "'";
10584 if (!validate_crush_against_features(&newcrush
, ss
)) {
10589 pending_inc
.crush
.clear();
10590 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10591 ss
<< "adjusted tunables profile to " << profile
;
10593 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10594 get_last_committed() + 1));
10596 } else if (prefix
== "osd crush set-tunable") {
10597 CrushWrapper newcrush
;
10598 _get_pending_crush(newcrush
);
10602 cmd_getval(cmdmap
, "tunable", tunable
);
10604 int64_t value
= -1;
10605 if (!cmd_getval(cmdmap
, "value", value
)) {
10607 ss
<< "failed to parse integer value "
10608 << cmd_vartype_stringify(cmdmap
.at("value"));
10612 if (tunable
== "straw_calc_version") {
10613 if (value
!= 0 && value
!= 1) {
10614 ss
<< "value must be 0 or 1; got " << value
;
10618 newcrush
.set_straw_calc_version(value
);
10620 ss
<< "unrecognized tunable '" << tunable
<< "'";
10625 if (!validate_crush_against_features(&newcrush
, ss
)) {
10630 pending_inc
.crush
.clear();
10631 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10632 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
10634 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10635 get_last_committed() + 1));
10638 } else if (prefix
== "osd crush rule create-simple") {
10639 string name
, root
, type
, mode
;
10640 cmd_getval(cmdmap
, "name", name
);
10641 cmd_getval(cmdmap
, "root", root
);
10642 cmd_getval(cmdmap
, "type", type
);
10643 cmd_getval(cmdmap
, "mode", mode
);
10647 if (osdmap
.crush
->rule_exists(name
)) {
10648 // The name is uniquely associated to a ruleid and the rule it contains
10649 // From the user point of view, the rule is more meaningfull.
10650 ss
<< "rule " << name
<< " already exists";
10655 CrushWrapper newcrush
;
10656 _get_pending_crush(newcrush
);
10658 if (newcrush
.rule_exists(name
)) {
10659 // The name is uniquely associated to a ruleid and the rule it contains
10660 // From the user point of view, the rule is more meaningfull.
10661 ss
<< "rule " << name
<< " already exists";
10664 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
10665 pg_pool_t::TYPE_REPLICATED
, &ss
);
10671 pending_inc
.crush
.clear();
10672 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10675 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10676 get_last_committed() + 1));
10679 } else if (prefix
== "osd crush rule create-replicated") {
10680 string name
, root
, type
, device_class
;
10681 cmd_getval(cmdmap
, "name", name
);
10682 cmd_getval(cmdmap
, "root", root
);
10683 cmd_getval(cmdmap
, "type", type
);
10684 cmd_getval(cmdmap
, "class", device_class
);
10686 if (osdmap
.crush
->rule_exists(name
)) {
10687 // The name is uniquely associated to a ruleid and the rule it contains
10688 // From the user point of view, the rule is more meaningfull.
10689 ss
<< "rule " << name
<< " already exists";
10694 CrushWrapper newcrush
;
10695 _get_pending_crush(newcrush
);
10697 if (newcrush
.rule_exists(name
)) {
10698 // The name is uniquely associated to a ruleid and the rule it contains
10699 // From the user point of view, the rule is more meaningfull.
10700 ss
<< "rule " << name
<< " already exists";
10703 int ruleno
= newcrush
.add_simple_rule(
10704 name
, root
, type
, device_class
,
10705 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
10711 pending_inc
.crush
.clear();
10712 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10715 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10716 get_last_committed() + 1));
10719 } else if (prefix
== "osd erasure-code-profile rm") {
10721 cmd_getval(cmdmap
, "name", name
);
10723 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
10726 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
10731 if (osdmap
.has_erasure_code_profile(name
) ||
10732 pending_inc
.new_erasure_code_profiles
.count(name
)) {
10733 if (osdmap
.has_erasure_code_profile(name
)) {
10734 pending_inc
.old_erasure_code_profiles
.push_back(name
);
10736 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
10737 pending_inc
.new_erasure_code_profiles
.erase(name
);
10741 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10742 get_last_committed() + 1));
10745 ss
<< "erasure-code-profile " << name
<< " does not exist";
10750 } else if (prefix
== "osd erasure-code-profile set") {
10752 cmd_getval(cmdmap
, "name", name
);
10753 vector
<string
> profile
;
10754 cmd_getval(cmdmap
, "profile", profile
);
10756 bool force
= false;
10757 cmd_getval(cmdmap
, "force", force
);
10759 map
<string
,string
> profile_map
;
10760 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
10763 if (profile_map
.find("plugin") == profile_map
.end()) {
10764 ss
<< "erasure-code-profile " << profile_map
10765 << " must contain a plugin entry" << std::endl
;
10769 string plugin
= profile_map
["plugin"];
10771 if (pending_inc
.has_erasure_code_profile(name
)) {
10772 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
10775 err
= normalize_profile(name
, profile_map
, force
, &ss
);
10779 if (osdmap
.has_erasure_code_profile(name
)) {
10780 ErasureCodeProfile existing_profile_map
=
10781 osdmap
.get_erasure_code_profile(name
);
10782 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
10786 if (existing_profile_map
== profile_map
) {
10792 ss
<< "will not override erasure code profile " << name
10793 << " because the existing profile "
10794 << existing_profile_map
10795 << " is different from the proposed profile "
10801 dout(20) << "erasure code profile set " << name
<< "="
10802 << profile_map
<< dendl
;
10803 pending_inc
.set_erasure_code_profile(name
, profile_map
);
10807 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10808 get_last_committed() + 1));
10811 } else if (prefix
== "osd crush rule create-erasure") {
10812 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
10813 if (err
== -EAGAIN
)
10817 string name
, poolstr
;
10818 cmd_getval(cmdmap
, "name", name
);
10820 cmd_getval(cmdmap
, "profile", profile
);
10822 profile
= "default";
10823 if (profile
== "default") {
10824 if (!osdmap
.has_erasure_code_profile(profile
)) {
10825 if (pending_inc
.has_erasure_code_profile(profile
)) {
10826 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
10830 map
<string
,string
> profile_map
;
10831 err
= osdmap
.get_erasure_code_profile_default(cct
,
10836 err
= normalize_profile(name
, profile_map
, true, &ss
);
10839 dout(20) << "erasure code profile set " << profile
<< "="
10840 << profile_map
<< dendl
;
10841 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
10847 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
10850 case -EEXIST
: // return immediately
10851 ss
<< "rule " << name
<< " already exists";
10855 case -EALREADY
: // wait for pending to be proposed
10856 ss
<< "rule " << name
<< " already exists";
10859 default: // non recoverable error
10864 ss
<< "created rule " << name
<< " at " << rule
;
10868 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10869 get_last_committed() + 1));
10872 } else if (prefix
== "osd crush rule rm") {
10874 cmd_getval(cmdmap
, "name", name
);
10876 if (!osdmap
.crush
->rule_exists(name
)) {
10877 ss
<< "rule " << name
<< " does not exist";
10882 CrushWrapper newcrush
;
10883 _get_pending_crush(newcrush
);
10885 if (!newcrush
.rule_exists(name
)) {
10886 ss
<< "rule " << name
<< " does not exist";
10889 int ruleno
= newcrush
.get_rule_id(name
);
10890 ceph_assert(ruleno
>= 0);
10892 // make sure it is not in use.
10893 // FIXME: this is ok in some situations, but let's not bother with that
10895 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
10896 if (osdmap
.crush_rule_in_use(ruleset
)) {
10897 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
10902 err
= newcrush
.remove_rule(ruleno
);
10907 pending_inc
.crush
.clear();
10908 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10911 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10912 get_last_committed() + 1));
10915 } else if (prefix
== "osd crush rule rename") {
10918 cmd_getval(cmdmap
, "srcname", srcname
);
10919 cmd_getval(cmdmap
, "dstname", dstname
);
10920 if (srcname
.empty() || dstname
.empty()) {
10921 ss
<< "must specify both source rule name and destination rule name";
10925 if (srcname
== dstname
) {
10926 ss
<< "destination rule name is equal to source rule name";
10931 CrushWrapper newcrush
;
10932 _get_pending_crush(newcrush
);
10933 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
10934 // srcname does not exist and dstname already exists
10935 // suppose this is a replay and return success
10936 // (so this command is idempotent)
10937 ss
<< "already renamed to '" << dstname
<< "'";
10942 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
10944 // ss has reason for failure
10947 pending_inc
.crush
.clear();
10948 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10950 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10951 get_last_committed() + 1));
10954 } else if (prefix
== "osd setmaxosd") {
10956 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
10957 ss
<< "unable to parse 'newmax' value '"
10958 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
10963 if (newmax
> g_conf()->mon_max_osd
) {
10965 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
10966 << g_conf()->mon_max_osd
<< ")";
10970 // Don't allow shrinking OSD number as this will cause data loss
10971 // and may cause kernel crashes.
10972 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10973 if (newmax
< osdmap
.get_max_osd()) {
10974 // Check if the OSDs exist between current max and new value.
10975 // If there are any OSDs exist, then don't allow shrinking number
10977 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
10978 if (osdmap
.exists(i
)) {
10980 ss
<< "cannot shrink max_osd to " << newmax
10981 << " because osd." << i
<< " (and possibly others) still in use";
10987 pending_inc
.new_max_osd
= newmax
;
10988 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
10990 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10991 get_last_committed() + 1));
10994 } else if (prefix
== "osd set-full-ratio" ||
10995 prefix
== "osd set-backfillfull-ratio" ||
10996 prefix
== "osd set-nearfull-ratio") {
10998 if (!cmd_getval(cmdmap
, "ratio", n
)) {
10999 ss
<< "unable to parse 'ratio' value '"
11000 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
11004 if (prefix
== "osd set-full-ratio")
11005 pending_inc
.new_full_ratio
= n
;
11006 else if (prefix
== "osd set-backfillfull-ratio")
11007 pending_inc
.new_backfillfull_ratio
= n
;
11008 else if (prefix
== "osd set-nearfull-ratio")
11009 pending_inc
.new_nearfull_ratio
= n
;
11010 ss
<< prefix
<< " " << n
;
11012 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11013 get_last_committed() + 1));
11015 } else if (prefix
== "osd set-require-min-compat-client") {
11017 cmd_getval(cmdmap
, "version", v
);
11018 ceph_release_t vno
= ceph_release_from_name(v
);
11020 ss
<< "version " << v
<< " is not recognized";
11025 newmap
.deepish_copy_from(osdmap
);
11026 newmap
.apply_incremental(pending_inc
);
11027 newmap
.require_min_compat_client
= vno
;
11028 auto mvno
= newmap
.get_min_compat_client();
11030 ss
<< "osdmap current utilizes features that require " << mvno
11031 << "; cannot set require_min_compat_client below that to " << vno
;
11036 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11039 mon
->get_combined_feature_map(&m
);
11040 uint64_t features
= ceph_release_features(ceph::to_integer
<int>(vno
));
11044 CEPH_ENTITY_TYPE_CLIENT
,
11045 CEPH_ENTITY_TYPE_MDS
,
11046 CEPH_ENTITY_TYPE_MGR
}) {
11047 auto p
= m
.m
.find(type
);
11048 if (p
== m
.m
.end()) {
11051 for (auto& q
: p
->second
) {
11052 uint64_t missing
= ~q
.first
& features
;
11055 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11060 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11061 << "(s) look like " << ceph_release_name(
11062 ceph_release_from_features(q
.first
))
11063 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11069 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11074 ss
<< "set require_min_compat_client to " << vno
;
11075 pending_inc
.new_require_min_compat_client
= vno
;
11077 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11078 get_last_committed() + 1));
11080 } else if (prefix
== "osd pause") {
11081 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11083 } else if (prefix
== "osd unpause") {
11084 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11086 } else if (prefix
== "osd set") {
11088 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11091 cmd_getval(cmdmap
, "key", key
);
11092 if (key
== "pause")
11093 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11094 else if (key
== "noup")
11095 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11096 else if (key
== "nodown")
11097 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11098 else if (key
== "noout")
11099 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11100 else if (key
== "noin")
11101 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11102 else if (key
== "nobackfill")
11103 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11104 else if (key
== "norebalance")
11105 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11106 else if (key
== "norecover")
11107 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11108 else if (key
== "noscrub")
11109 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11110 else if (key
== "nodeep-scrub")
11111 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11112 else if (key
== "notieragent")
11113 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11114 else if (key
== "nosnaptrim")
11115 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11116 else if (key
== "pglog_hardlimit") {
11117 if (!osdmap
.get_num_up_osds() && !sure
) {
11118 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11119 << "--yes-i-really-mean-it if you really wish to continue.";
11123 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11124 // we are reusing a jewel feature bit that was retired in luminous.
11125 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11126 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11128 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11130 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11135 ss
<< "unrecognized flag '" << key
<< "'";
11139 } else if (prefix
== "osd unset") {
11141 cmd_getval(cmdmap
, "key", key
);
11142 if (key
== "pause")
11143 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11144 else if (key
== "noup")
11145 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11146 else if (key
== "nodown")
11147 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11148 else if (key
== "noout")
11149 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11150 else if (key
== "noin")
11151 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11152 else if (key
== "nobackfill")
11153 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11154 else if (key
== "norebalance")
11155 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11156 else if (key
== "norecover")
11157 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11158 else if (key
== "noscrub")
11159 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11160 else if (key
== "nodeep-scrub")
11161 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11162 else if (key
== "notieragent")
11163 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11164 else if (key
== "nosnaptrim")
11165 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11167 ss
<< "unrecognized flag '" << key
<< "'";
11171 } else if (prefix
== "osd require-osd-release") {
11173 cmd_getval(cmdmap
, "release", release
);
11175 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11176 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11178 ss
<< "unrecognized release " << release
;
11182 if (rel
== osdmap
.require_osd_release
) {
11187 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
11188 if (!osdmap
.get_num_up_osds() && !sure
) {
11189 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11190 << "--yes-i-really-mean-it if you really wish to continue.";
11194 if (rel
== ceph_release_t::mimic
) {
11195 if (!mon
->monmap
->get_required_features().contains_all(
11196 ceph::features::mon::FEATURE_MIMIC
)) {
11197 ss
<< "not all mons are mimic";
11201 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_MIMIC
))
11203 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11207 } else if (rel
== ceph_release_t::nautilus
) {
11208 if (!mon
->monmap
->get_required_features().contains_all(
11209 ceph::features::mon::FEATURE_NAUTILUS
)) {
11210 ss
<< "not all mons are nautilus";
11214 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_NAUTILUS
))
11216 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
11220 } else if (rel
== ceph_release_t::octopus
) {
11221 if (!mon
->monmap
->get_required_features().contains_all(
11222 ceph::features::mon::FEATURE_OCTOPUS
)) {
11223 ss
<< "not all mons are octopus";
11227 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_OCTOPUS
))
11229 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11234 ss
<< "not supported for this release yet";
11238 if (rel
< osdmap
.require_osd_release
) {
11239 ss
<< "require_osd_release cannot be lowered once it has been set";
11243 pending_inc
.new_require_osd_release
= rel
;
11245 } else if (prefix
== "osd down" ||
11246 prefix
== "osd out" ||
11247 prefix
== "osd in" ||
11248 prefix
== "osd rm" ||
11249 prefix
== "osd stop") {
11253 bool verbose
= true;
11254 bool definitely_dead
= false;
11256 vector
<string
> idvec
;
11257 cmd_getval(cmdmap
, "ids", idvec
);
11258 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11259 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11260 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11265 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11266 if (prefix
== "osd in") {
11267 // touch out osds only
11268 osdmap
.get_out_existing_osds(osds
);
11270 osdmap
.get_all_osds(osds
);
11273 verbose
= false; // so the output is less noisy.
11275 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11277 ss
<< "invalid osd id" << osd
;
11280 } else if (!osdmap
.exists(osd
)) {
11281 ss
<< "osd." << osd
<< " does not exist. ";
11288 for (auto &osd
: osds
) {
11289 if (prefix
== "osd down") {
11290 if (osdmap
.is_down(osd
)) {
11292 ss
<< "osd." << osd
<< " is already down. ";
11294 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11295 ss
<< "marked down osd." << osd
<< ". ";
11298 if (definitely_dead
) {
11299 if (!pending_inc
.new_xinfo
.count(osd
)) {
11300 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11302 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11305 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11307 } else if (prefix
== "osd out") {
11308 if (osdmap
.is_out(osd
)) {
11310 ss
<< "osd." << osd
<< " is already out. ";
11312 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11313 if (osdmap
.osd_weight
[osd
]) {
11314 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11315 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11317 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11319 ss
<< "marked out osd." << osd
<< ". ";
11320 std::ostringstream msg
;
11321 msg
<< "Client " << op
->get_session()->entity_name
11322 << " marked osd." << osd
<< " out";
11323 if (osdmap
.is_up(osd
)) {
11324 msg
<< ", while it was still marked up";
11326 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11327 msg
<< ", after it was down for " << int(period
.sec())
11331 mon
->clog
->info() << msg
.str();
11334 } else if (prefix
== "osd in") {
11335 if (osdmap
.is_in(osd
)) {
11337 ss
<< "osd." << osd
<< " is already in. ";
11339 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11340 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11341 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11342 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11344 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11346 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11348 ss
<< "marked in osd." << osd
<< ". ";
11351 } else if (prefix
== "osd rm") {
11352 err
= prepare_command_osd_remove(osd
);
11354 if (err
== -EBUSY
) {
11357 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11359 ceph_assert(err
== 0);
11361 ss
<< ", osd." << osd
;
11363 ss
<< "removed osd." << osd
;
11367 } else if (prefix
== "osd stop") {
11368 if (osdmap
.is_stop(osd
)) {
11370 ss
<< "osd." << osd
<< " is already stopped. ";
11371 } else if (osdmap
.is_down(osd
)) {
11372 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11373 ss
<< "stop down osd." << osd
<< ". ";
11376 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11377 ss
<< "stop osd." << osd
<< ". ";
11385 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11386 get_last_committed() + 1));
11389 } else if (prefix
== "osd set-group" ||
11390 prefix
== "osd unset-group" ||
11391 prefix
== "osd add-noup" ||
11392 prefix
== "osd add-nodown" ||
11393 prefix
== "osd add-noin" ||
11394 prefix
== "osd add-noout" ||
11395 prefix
== "osd rm-noup" ||
11396 prefix
== "osd rm-nodown" ||
11397 prefix
== "osd rm-noin" ||
11398 prefix
== "osd rm-noout") {
11399 bool do_set
= prefix
== "osd set-group" ||
11400 prefix
.find("add") != string::npos
;
11402 unsigned flags
= 0;
11403 vector
<string
> who
;
11404 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11405 cmd_getval(cmdmap
, "flags", flag_str
);
11406 cmd_getval(cmdmap
, "who", who
);
11407 vector
<string
> raw_flags
;
11408 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11409 for (auto& f
: raw_flags
) {
11411 flags
|= CEPH_OSD_NOUP
;
11412 else if (f
== "nodown")
11413 flags
|= CEPH_OSD_NODOWN
;
11414 else if (f
== "noin")
11415 flags
|= CEPH_OSD_NOIN
;
11416 else if (f
== "noout")
11417 flags
|= CEPH_OSD_NOOUT
;
11419 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11420 << "{noup,nodown,noin,noout}";
11426 cmd_getval(cmdmap
, "ids", who
);
11427 if (prefix
.find("noup") != string::npos
)
11428 flags
= CEPH_OSD_NOUP
;
11429 else if (prefix
.find("nodown") != string::npos
)
11430 flags
= CEPH_OSD_NODOWN
;
11431 else if (prefix
.find("noin") != string::npos
)
11432 flags
= CEPH_OSD_NOIN
;
11433 else if (prefix
.find("noout") != string::npos
)
11434 flags
= CEPH_OSD_NOOUT
;
11436 ceph_assert(0 == "Unreachable!");
11439 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11444 ss
<< "must specify at least one or more targets to set/unset";
11449 set
<int> crush_nodes
;
11450 set
<int> device_classes
;
11451 for (auto& w
: who
) {
11452 if (w
== "any" || w
== "all" || w
== "*") {
11453 osdmap
.get_all_osds(osds
);
11456 std::stringstream ts
;
11457 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11459 } else if (osdmap
.crush
->name_exists(w
)) {
11460 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11461 } else if (osdmap
.crush
->class_exists(w
)) {
11462 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11464 ss
<< "unable to parse osd id or crush node or device class: "
11465 << "\"" << w
<< "\". ";
11468 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11469 // ss has reason for failure
11474 for (auto osd
: osds
) {
11475 if (!osdmap
.exists(osd
)) {
11476 ss
<< "osd." << osd
<< " does not exist. ";
11480 if (flags
& CEPH_OSD_NOUP
) {
11481 any
|= osdmap
.is_noup_by_osd(osd
) ?
11482 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
11483 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
11485 if (flags
& CEPH_OSD_NODOWN
) {
11486 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11487 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
11488 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
11490 if (flags
& CEPH_OSD_NOIN
) {
11491 any
|= osdmap
.is_noin_by_osd(osd
) ?
11492 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
11493 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
11495 if (flags
& CEPH_OSD_NOOUT
) {
11496 any
|= osdmap
.is_noout_by_osd(osd
) ?
11497 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
11498 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
11501 if (flags
& CEPH_OSD_NOUP
) {
11502 any
|= osdmap
.is_noup_by_osd(osd
) ?
11503 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
11504 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
11506 if (flags
& CEPH_OSD_NODOWN
) {
11507 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11508 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
11509 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
11511 if (flags
& CEPH_OSD_NOIN
) {
11512 any
|= osdmap
.is_noin_by_osd(osd
) ?
11513 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
11514 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
11516 if (flags
& CEPH_OSD_NOOUT
) {
11517 any
|= osdmap
.is_noout_by_osd(osd
) ?
11518 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
11519 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
11523 for (auto& id
: crush_nodes
) {
11524 auto old_flags
= osdmap
.get_crush_node_flags(id
);
11525 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
11526 pending_flags
|= old_flags
; // adopt existing flags first!
11528 pending_flags
|= flags
;
11530 pending_flags
&= ~flags
;
11534 for (auto& id
: device_classes
) {
11535 auto old_flags
= osdmap
.get_device_class_flags(id
);
11536 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
11537 pending_flags
|= old_flags
;
11539 pending_flags
|= flags
;
11541 pending_flags
&= ~flags
;
11547 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11548 get_last_committed() + 1));
11551 } else if (prefix
== "osd pg-temp") {
11553 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11554 ss
<< "unable to parse 'pgid' value '"
11555 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11560 if (!pgid
.parse(pgidstr
.c_str())) {
11561 ss
<< "invalid pgid '" << pgidstr
<< "'";
11565 if (!osdmap
.pg_exists(pgid
)) {
11566 ss
<< "pg " << pgid
<< " does not exist";
11570 if (pending_inc
.new_pg_temp
.count(pgid
)) {
11571 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
11572 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11576 vector
<int64_t> id_vec
;
11577 vector
<int32_t> new_pg_temp
;
11578 cmd_getval(cmdmap
, "id", id_vec
);
11579 if (id_vec
.empty()) {
11580 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
11581 ss
<< "done cleaning up pg_temp of " << pgid
;
11584 for (auto osd
: id_vec
) {
11585 if (!osdmap
.exists(osd
)) {
11586 ss
<< "osd." << osd
<< " does not exist";
11590 new_pg_temp
.push_back(osd
);
11593 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11594 if ((int)new_pg_temp
.size() < pool_min_size
) {
11595 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
11596 << pool_min_size
<< ")";
11601 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11602 if ((int)new_pg_temp
.size() > pool_size
) {
11603 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
11604 << pool_size
<< ")";
11609 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
11610 new_pg_temp
.begin(), new_pg_temp
.end());
11611 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
11613 } else if (prefix
== "osd primary-temp") {
11615 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11616 ss
<< "unable to parse 'pgid' value '"
11617 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11622 if (!pgid
.parse(pgidstr
.c_str())) {
11623 ss
<< "invalid pgid '" << pgidstr
<< "'";
11627 if (!osdmap
.pg_exists(pgid
)) {
11628 ss
<< "pg " << pgid
<< " does not exist";
11634 if (!cmd_getval(cmdmap
, "id", osd
)) {
11635 ss
<< "unable to parse 'id' value '"
11636 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11640 if (osd
!= -1 && !osdmap
.exists(osd
)) {
11641 ss
<< "osd." << osd
<< " does not exist";
11646 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
11647 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
11648 ss
<< "require_min_compat_client "
11649 << osdmap
.require_min_compat_client
11650 << " < firefly, which is required for primary-temp";
11655 pending_inc
.new_primary_temp
[pgid
] = osd
;
11656 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
11658 } else if (prefix
== "pg repeer") {
11661 cmd_getval(cmdmap
, "pgid", pgidstr
);
11662 if (!pgid
.parse(pgidstr
.c_str())) {
11663 ss
<< "invalid pgid '" << pgidstr
<< "'";
11667 if (!osdmap
.pg_exists(pgid
)) {
11668 ss
<< "pg '" << pgidstr
<< "' does not exist";
11672 vector
<int> acting
;
11674 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
11677 ss
<< "pg currently has no primary";
11680 if (acting
.size() > 1) {
11681 // map to just primary; it will map back to what it wants
11682 pending_inc
.new_pg_temp
[pgid
] = { primary
};
11684 // hmm, pick another arbitrary osd to induce a change. Note
11685 // that this won't work if there is only one suitable OSD in the cluster.
11688 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
11689 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
11692 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
11698 ss
<< "not enough up OSDs in the cluster to force repeer";
11703 } else if (prefix
== "osd pg-upmap" ||
11704 prefix
== "osd rm-pg-upmap" ||
11705 prefix
== "osd pg-upmap-items" ||
11706 prefix
== "osd rm-pg-upmap-items") {
11707 if (osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
11708 ss
<< "min_compat_client "
11709 << osdmap
.require_min_compat_client
11710 << " < luminous, which is required for pg-upmap. "
11711 << "Try 'ceph osd set-require-min-compat-client luminous' "
11712 << "before using the new interface";
11716 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
11717 if (err
== -EAGAIN
)
11722 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11723 ss
<< "unable to parse 'pgid' value '"
11724 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11729 if (!pgid
.parse(pgidstr
.c_str())) {
11730 ss
<< "invalid pgid '" << pgidstr
<< "'";
11734 if (!osdmap
.pg_exists(pgid
)) {
11735 ss
<< "pg " << pgid
<< " does not exist";
11739 if (pending_inc
.old_pools
.count(pgid
.pool())) {
11740 ss
<< "pool of " << pgid
<< " is pending removal";
11743 wait_for_finished_proposal(op
,
11744 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
11752 OP_RM_PG_UPMAP_ITEMS
,
11755 if (prefix
== "osd pg-upmap") {
11756 option
= OP_PG_UPMAP
;
11757 } else if (prefix
== "osd rm-pg-upmap") {
11758 option
= OP_RM_PG_UPMAP
;
11759 } else if (prefix
== "osd pg-upmap-items") {
11760 option
= OP_PG_UPMAP_ITEMS
;
11762 option
= OP_RM_PG_UPMAP_ITEMS
;
11765 // check pending upmap changes
11767 case OP_PG_UPMAP
: // fall through
11768 case OP_RM_PG_UPMAP
:
11769 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
11770 pending_inc
.old_pg_upmap
.count(pgid
)) {
11771 dout(10) << __func__
<< " waiting for pending update on "
11773 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11778 case OP_PG_UPMAP_ITEMS
: // fall through
11779 case OP_RM_PG_UPMAP_ITEMS
:
11780 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
11781 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
11782 dout(10) << __func__
<< " waiting for pending update on "
11784 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11790 ceph_abort_msg("invalid option");
11796 vector
<int64_t> id_vec
;
11797 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
11798 ss
<< "unable to parse 'id' value(s) '"
11799 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11804 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11805 if ((int)id_vec
.size() < pool_min_size
) {
11806 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
11807 << pool_min_size
<< ")";
11812 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11813 if ((int)id_vec
.size() > pool_size
) {
11814 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
11815 << pool_size
<< ")";
11820 vector
<int32_t> new_pg_upmap
;
11821 for (auto osd
: id_vec
) {
11822 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
11823 ss
<< "osd." << osd
<< " does not exist";
11827 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
11828 if (it
!= new_pg_upmap
.end()) {
11829 ss
<< "osd." << osd
<< " already exists, ";
11832 new_pg_upmap
.push_back(osd
);
11835 if (new_pg_upmap
.empty()) {
11836 ss
<< "no valid upmap items(pairs) is specified";
11841 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
11842 new_pg_upmap
.begin(), new_pg_upmap
.end());
11843 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
11847 case OP_RM_PG_UPMAP
:
11849 pending_inc
.old_pg_upmap
.insert(pgid
);
11850 ss
<< "clear " << pgid
<< " pg_upmap mapping";
11854 case OP_PG_UPMAP_ITEMS
:
11856 vector
<int64_t> id_vec
;
11857 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
11858 ss
<< "unable to parse 'id' value(s) '"
11859 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11864 if (id_vec
.size() % 2) {
11865 ss
<< "you must specify pairs of osd ids to be remapped";
11870 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11871 if ((int)(id_vec
.size() / 2) > pool_size
) {
11872 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
11873 << pool_size
<< ")";
11878 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
11879 ostringstream items
;
11881 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
11885 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
11888 if (!osdmap
.exists(from
)) {
11889 ss
<< "osd." << from
<< " does not exist";
11893 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
11894 ss
<< "osd." << to
<< " does not exist";
11898 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
11899 auto it
= std::find(new_pg_upmap_items
.begin(),
11900 new_pg_upmap_items
.end(), entry
);
11901 if (it
!= new_pg_upmap_items
.end()) {
11902 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
11905 new_pg_upmap_items
.push_back(entry
);
11906 items
<< from
<< "->" << to
<< ",";
11908 string
out(items
.str());
11909 out
.resize(out
.size() - 1); // drop last ','
11912 if (new_pg_upmap_items
.empty()) {
11913 ss
<< "no valid upmap items(pairs) is specified";
11918 pending_inc
.new_pg_upmap_items
[pgid
] =
11919 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
11920 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
11921 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
11925 case OP_RM_PG_UPMAP_ITEMS
:
11927 pending_inc
.old_pg_upmap_items
.insert(pgid
);
11928 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
11933 ceph_abort_msg("invalid option");
11937 } else if (prefix
== "osd primary-affinity") {
11939 if (!cmd_getval(cmdmap
, "id", id
)) {
11940 ss
<< "invalid osd id value '"
11941 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11946 if (!cmd_getval(cmdmap
, "weight", w
)) {
11947 ss
<< "unable to parse 'weight' value '"
11948 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11952 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
11954 ss
<< "weight must be >= 0";
11958 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
11959 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
11960 ss
<< "require_min_compat_client "
11961 << osdmap
.require_min_compat_client
11962 << " < firefly, which is required for primary-affinity";
11966 if (osdmap
.exists(id
)) {
11967 pending_inc
.new_primary_affinity
[id
] = ww
;
11968 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
11970 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11971 get_last_committed() + 1));
11974 ss
<< "osd." << id
<< " does not exist";
11978 } else if (prefix
== "osd reweight") {
11980 if (!cmd_getval(cmdmap
, "id", id
)) {
11981 ss
<< "unable to parse osd id value '"
11982 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11987 if (!cmd_getval(cmdmap
, "weight", w
)) {
11988 ss
<< "unable to parse weight value '"
11989 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
11993 long ww
= (int)((double)CEPH_OSD_IN
*w
);
11995 ss
<< "weight must be >= 0";
11999 if (osdmap
.exists(id
)) {
12000 pending_inc
.new_weight
[id
] = ww
;
12001 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
12003 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12004 get_last_committed() + 1));
12007 ss
<< "osd." << id
<< " does not exist";
12011 } else if (prefix
== "osd reweightn") {
12012 map
<int32_t, uint32_t> weights
;
12013 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
12015 ss
<< "unable to parse 'weights' value '"
12016 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
12019 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
12020 wait_for_finished_proposal(
12022 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
12024 } else if (prefix
== "osd lost") {
12026 if (!cmd_getval(cmdmap
, "id", id
)) {
12027 ss
<< "unable to parse osd id value '"
12028 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12033 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12035 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
12036 "--yes-i-really-mean-it if you really do.";
12039 } else if (!osdmap
.exists(id
)) {
12040 ss
<< "osd." << id
<< " does not exist";
12043 } else if (!osdmap
.is_down(id
)) {
12044 ss
<< "osd." << id
<< " is not down";
12048 epoch_t e
= osdmap
.get_info(id
).down_at
;
12049 pending_inc
.new_lost
[id
] = e
;
12050 ss
<< "marked osd lost in epoch " << e
;
12052 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12053 get_last_committed() + 1));
12057 } else if (prefix
== "osd destroy-actual" ||
12058 prefix
== "osd purge-actual" ||
12059 prefix
== "osd purge-new") {
12060 /* Destroying an OSD means that we don't expect to further make use of
12061 * the OSDs data (which may even become unreadable after this operation),
12062 * and that we are okay with scrubbing all its cephx keys and config-key
12063 * data (which may include lockbox keys, thus rendering the osd's data
12066 * The OSD will not be removed. Instead, we will mark it as destroyed,
12067 * such that a subsequent call to `create` will not reuse the osd id.
12068 * This will play into being able to recreate the OSD, at the same
12069 * crush location, with minimal data movement.
12072 // make sure authmon is writeable.
12073 if (!mon
->authmon()->is_writeable()) {
12074 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12075 << "osd destroy" << dendl
;
12076 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12081 if (!cmd_getval(cmdmap
, "id", id
)) {
12082 auto p
= cmdmap
.find("id");
12083 if (p
== cmdmap
.end()) {
12084 ss
<< "no osd id specified";
12086 ss
<< "unable to parse osd id value '"
12087 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12093 bool is_destroy
= (prefix
== "osd destroy-actual");
12095 ceph_assert("osd purge-actual" == prefix
||
12096 "osd purge-new" == prefix
);
12100 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12102 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12103 << "This will mean real, permanent data loss, as well "
12104 << "as deletion of cephx and lockbox keys. "
12105 << "Pass --yes-i-really-mean-it if you really do.";
12108 } else if (!osdmap
.exists(id
)) {
12109 ss
<< "osd." << id
<< " does not exist";
12110 err
= 0; // idempotent
12112 } else if (osdmap
.is_up(id
)) {
12113 ss
<< "osd." << id
<< " is not `down`.";
12116 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12117 ss
<< "destroyed osd." << id
;
12122 if (prefix
== "osd purge-new" &&
12123 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12124 ss
<< "osd." << id
<< " is not new";
12129 bool goto_reply
= false;
12133 err
= prepare_command_osd_destroy(id
, ss
);
12134 // we checked above that it should exist.
12135 ceph_assert(err
!= -ENOENT
);
12137 err
= prepare_command_osd_purge(id
, ss
);
12138 if (err
== -ENOENT
) {
12140 ss
<< "osd." << id
<< " does not exist.";
12146 if (err
< 0 || goto_reply
) {
12151 ss
<< "destroyed osd." << id
;
12153 ss
<< "purged osd." << id
;
12157 wait_for_finished_proposal(op
,
12158 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12159 force_immediate_propose();
12162 } else if (prefix
== "osd new") {
12164 // make sure authmon is writeable.
12165 if (!mon
->authmon()->is_writeable()) {
12166 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12167 << "osd new" << dendl
;
12168 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12172 map
<string
,string
> param_map
;
12174 bufferlist bl
= m
->get_data();
12175 string param_json
= bl
.to_str();
12176 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12178 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12182 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12185 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12198 if (err
== EEXIST
) {
12199 // idempotent operation
12204 wait_for_finished_proposal(op
,
12205 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12206 get_last_committed() + 1));
12207 force_immediate_propose();
12210 } else if (prefix
== "osd create") {
12212 // optional id provided?
12213 int64_t id
= -1, cmd_id
= -1;
12214 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12216 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12220 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12225 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12226 if (!uuid
.parse(uuidstr
.c_str())) {
12227 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12231 // we only care about the id if we also have the uuid, to
12232 // ensure the operation's idempotency.
12236 int32_t new_id
= -1;
12237 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12239 if (err
== -EAGAIN
) {
12240 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12243 // a check has failed; reply to the user.
12246 } else if (err
== EEXIST
) {
12247 // this is an idempotent operation; we can go ahead and reply.
12249 f
->open_object_section("created_osd");
12250 f
->dump_int("osdid", new_id
);
12251 f
->close_section();
12261 string empty_device_class
;
12262 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12265 f
->open_object_section("created_osd");
12266 f
->dump_int("osdid", new_id
);
12267 f
->close_section();
12273 wait_for_finished_proposal(op
,
12274 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12275 get_last_committed() + 1));
12278 } else if (prefix
== "osd blacklist clear") {
12279 pending_inc
.new_blacklist
.clear();
12280 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
12281 osdmap
.get_blacklist(&blacklist
);
12282 for (const auto &entry
: blacklist
) {
12283 pending_inc
.old_blacklist
.push_back(entry
.first
);
12285 ss
<< " removed all blacklist entries";
12287 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12288 get_last_committed() + 1));
12290 } else if (prefix
== "osd blacklist") {
12292 cmd_getval(cmdmap
, "addr", addrstr
);
12293 entity_addr_t addr
;
12294 if (!addr
.parse(addrstr
.c_str(), 0)) {
12295 ss
<< "unable to parse address " << addrstr
;
12300 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12301 // always blacklist type ANY
12302 addr
.set_type(entity_addr_t::TYPE_ANY
);
12304 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12307 string blacklistop
;
12308 cmd_getval(cmdmap
, "blacklistop", blacklistop
);
12309 if (blacklistop
== "add") {
12310 utime_t expires
= ceph_clock_now();
12312 // default one hour
12313 cmd_getval(cmdmap
, "expire", d
,
12314 g_conf()->mon_osd_blacklist_default_expire
);
12317 pending_inc
.new_blacklist
[addr
] = expires
;
12320 // cancel any pending un-blacklisting request too
12321 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
12322 pending_inc
.old_blacklist
.end(), addr
);
12323 if (it
!= pending_inc
.old_blacklist
.end()) {
12324 pending_inc
.old_blacklist
.erase(it
);
12328 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12330 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12331 get_last_committed() + 1));
12333 } else if (blacklistop
== "rm") {
12334 if (osdmap
.is_blacklisted(addr
) ||
12335 pending_inc
.new_blacklist
.count(addr
)) {
12336 if (osdmap
.is_blacklisted(addr
))
12337 pending_inc
.old_blacklist
.push_back(addr
);
12339 pending_inc
.new_blacklist
.erase(addr
);
12340 ss
<< "un-blacklisting " << addr
;
12342 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12343 get_last_committed() + 1));
12346 ss
<< addr
<< " isn't blacklisted";
12351 } else if (prefix
== "osd pool mksnap") {
12353 cmd_getval(cmdmap
, "pool", poolstr
);
12354 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12356 ss
<< "unrecognized pool '" << poolstr
<< "'";
12361 cmd_getval(cmdmap
, "snap", snapname
);
12362 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12363 if (p
->is_unmanaged_snaps_mode()) {
12364 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12367 } else if (p
->snap_exists(snapname
.c_str())) {
12368 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12371 } else if (p
->is_tier()) {
12372 ss
<< "pool " << poolstr
<< " is a cache tier";
12377 if (pending_inc
.new_pools
.count(pool
))
12378 pp
= &pending_inc
.new_pools
[pool
];
12380 pp
= &pending_inc
.new_pools
[pool
];
12383 if (pp
->snap_exists(snapname
.c_str())) {
12384 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12386 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
12387 pp
->set_snap_epoch(pending_inc
.epoch
);
12388 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
12391 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12392 get_last_committed() + 1));
12394 } else if (prefix
== "osd pool rmsnap") {
12396 cmd_getval(cmdmap
, "pool", poolstr
);
12397 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12399 ss
<< "unrecognized pool '" << poolstr
<< "'";
12404 cmd_getval(cmdmap
, "snap", snapname
);
12405 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12406 if (p
->is_unmanaged_snaps_mode()) {
12407 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12410 } else if (!p
->snap_exists(snapname
.c_str())) {
12411 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
12416 if (pending_inc
.new_pools
.count(pool
))
12417 pp
= &pending_inc
.new_pools
[pool
];
12419 pp
= &pending_inc
.new_pools
[pool
];
12422 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
12424 pp
->remove_snap(sn
);
12425 pp
->set_snap_epoch(pending_inc
.epoch
);
12426 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
12428 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
12431 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12432 get_last_committed() + 1));
12434 } else if (prefix
== "osd pool create") {
12435 int64_t pg_num
, pg_num_min
;
12437 cmd_getval(cmdmap
, "pg_num", pg_num
, int64_t(0));
12438 cmd_getval(cmdmap
, "pgp_num", pgp_num
, pg_num
);
12439 cmd_getval(cmdmap
, "pg_num_min", pg_num_min
, int64_t(0));
12441 string pool_type_str
;
12442 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
12443 if (pool_type_str
.empty())
12444 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
12447 cmd_getval(cmdmap
, "pool", poolstr
);
12448 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12449 if (pool_id
>= 0) {
12450 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12451 if (pool_type_str
!= p
->get_type_name()) {
12452 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
12455 ss
<< "pool '" << poolstr
<< "' already exists";
12462 if (pool_type_str
== "replicated") {
12463 pool_type
= pg_pool_t::TYPE_REPLICATED
;
12464 } else if (pool_type_str
== "erasure") {
12465 pool_type
= pg_pool_t::TYPE_ERASURE
;
12467 ss
<< "unknown pool type '" << pool_type_str
<< "'";
12472 bool implicit_rule_creation
= false;
12473 int64_t expected_num_objects
= 0;
12475 cmd_getval(cmdmap
, "rule", rule_name
);
12476 string erasure_code_profile
;
12477 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
12479 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
12480 if (erasure_code_profile
== "")
12481 erasure_code_profile
= "default";
12482 //handle the erasure code profile
12483 if (erasure_code_profile
== "default") {
12484 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
12485 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
12486 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
12490 map
<string
,string
> profile_map
;
12491 err
= osdmap
.get_erasure_code_profile_default(cct
,
12496 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
12497 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
12501 if (rule_name
== "") {
12502 implicit_rule_creation
= true;
12503 if (erasure_code_profile
== "default") {
12504 rule_name
= "erasure-code";
12506 dout(1) << "implicitly use rule named after the pool: "
12507 << poolstr
<< dendl
;
12508 rule_name
= poolstr
;
12511 cmd_getval(cmdmap
, "expected_num_objects",
12512 expected_num_objects
, int64_t(0));
12514 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12515 // and put expected_num_objects to rule field
12516 if (erasure_code_profile
!= "") { // cmd is from CLI
12517 if (rule_name
!= "") {
12519 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
12520 if (interr
.length()) {
12521 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
12526 rule_name
= erasure_code_profile
;
12527 } else { // cmd is well-formed
12528 cmd_getval(cmdmap
, "expected_num_objects",
12529 expected_num_objects
, int64_t(0));
12533 if (!implicit_rule_creation
&& rule_name
!= "") {
12535 err
= get_crush_rule(rule_name
, &rule
, &ss
);
12536 if (err
== -EAGAIN
) {
12537 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12544 if (expected_num_objects
< 0) {
12545 ss
<< "'expected_num_objects' must be non-negative";
12550 if (expected_num_objects
> 0 &&
12551 cct
->_conf
->osd_objectstore
== "filestore" &&
12552 cct
->_conf
->filestore_merge_threshold
> 0) {
12553 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12558 if (expected_num_objects
== 0 &&
12559 cct
->_conf
->osd_objectstore
== "filestore" &&
12560 cct
->_conf
->filestore_merge_threshold
< 0) {
12561 int osds
= osdmap
.get_num_osds();
12562 if (osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
12563 ss
<< "For better initial performance on pools expected to store a "
12564 << "large number of objects, consider supplying the "
12565 << "expected_num_objects parameter when creating the pool.\n";
12569 int64_t fast_read_param
;
12570 cmd_getval(cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
12571 FastReadType fast_read
= FAST_READ_DEFAULT
;
12572 if (fast_read_param
== 0)
12573 fast_read
= FAST_READ_OFF
;
12574 else if (fast_read_param
> 0)
12575 fast_read
= FAST_READ_ON
;
12577 int64_t repl_size
= 0;
12578 cmd_getval(cmdmap
, "size", repl_size
);
12579 int64_t target_size_bytes
= 0;
12580 double target_size_ratio
= 0.0;
12581 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
12582 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
12584 string pg_autoscale_mode
;
12585 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
12587 err
= prepare_new_pool(poolstr
,
12588 -1, // default crush rule
12590 pg_num
, pgp_num
, pg_num_min
,
12591 repl_size
, target_size_bytes
, target_size_ratio
,
12592 erasure_code_profile
, pool_type
,
12593 (uint64_t)expected_num_objects
,
12600 ss
<< "pool '" << poolstr
<< "' already exists";
12603 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12612 ss
<< "pool '" << poolstr
<< "' created";
12615 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12616 get_last_committed() + 1));
12619 } else if (prefix
== "osd pool delete" ||
12620 prefix
== "osd pool rm") {
12621 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12622 string poolstr
, poolstr2
, sure
;
12623 cmd_getval(cmdmap
, "pool", poolstr
);
12624 cmd_getval(cmdmap
, "pool2", poolstr2
);
12625 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12627 ss
<< "pool '" << poolstr
<< "' does not exist";
12632 bool force_no_fake
= false;
12633 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
12634 bool force
= false;
12635 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
12636 if (poolstr2
!= poolstr
||
12637 (!force
&& !force_no_fake
)) {
12638 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12639 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12640 << "followed by --yes-i-really-really-mean-it.";
12644 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
12645 if (err
== -EAGAIN
) {
12646 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12652 } else if (prefix
== "osd pool rename") {
12653 string srcpoolstr
, destpoolstr
;
12654 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
12655 cmd_getval(cmdmap
, "destpool", destpoolstr
);
12656 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
12657 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
12659 if (pool_src
< 0) {
12660 if (pool_dst
>= 0) {
12661 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12662 // of operations, assume this rename succeeded, as it is not changing
12663 // the current state. Make sure we output something understandable
12664 // for whoever is issuing the command, if they are paying attention,
12665 // in case it was not intentional; or to avoid a "wtf?" and a bug
12666 // report in case it was intentional, while expecting a failure.
12667 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
12668 << destpoolstr
<< "' does -- assuming successful rename";
12671 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
12675 } else if (pool_dst
>= 0) {
12676 // source pool exists and so does the destination pool
12677 ss
<< "pool '" << destpoolstr
<< "' already exists";
12682 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
12684 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
12686 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
12687 << cpp_strerror(ret
);
12690 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
12691 get_last_committed() + 1));
12694 } else if (prefix
== "osd pool set") {
12695 err
= prepare_command_pool_set(cmdmap
, ss
);
12696 if (err
== -EAGAIN
)
12702 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12703 get_last_committed() + 1));
12705 } else if (prefix
== "osd tier add") {
12706 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12707 if (err
== -EAGAIN
)
12712 cmd_getval(cmdmap
, "pool", poolstr
);
12713 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12715 ss
<< "unrecognized pool '" << poolstr
<< "'";
12719 string tierpoolstr
;
12720 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12721 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12722 if (tierpool_id
< 0) {
12723 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12727 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12729 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12732 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
12736 // make sure new tier is empty
12737 string force_nonempty
;
12738 cmd_getval(cmdmap
, "force_nonempty", force_nonempty
);
12739 const pool_stat_t
*pstats
= mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
12740 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
12741 force_nonempty
!= "--force-nonempty") {
12742 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
12746 if (tp
->is_erasure()) {
12747 ss
<< "tier pool '" << tierpoolstr
12748 << "' is an ec pool, which cannot be a tier";
12752 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
12753 ((force_nonempty
!= "--force-nonempty") ||
12754 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
12755 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
12760 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12761 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12762 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
12763 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12766 np
->tiers
.insert(tierpool_id
);
12767 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
12768 ntp
->tier_of
= pool_id
;
12769 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
12770 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12771 get_last_committed() + 1));
12773 } else if (prefix
== "osd tier remove" ||
12774 prefix
== "osd tier rm") {
12776 cmd_getval(cmdmap
, "pool", poolstr
);
12777 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12779 ss
<< "unrecognized pool '" << poolstr
<< "'";
12783 string tierpoolstr
;
12784 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12785 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12786 if (tierpool_id
< 0) {
12787 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12791 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12793 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12796 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
12800 if (p
->tiers
.count(tierpool_id
) == 0) {
12801 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
12805 if (tp
->tier_of
!= pool_id
) {
12806 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
12807 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
12808 // be scary about it; this is an inconsistency and bells must go off
12809 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12813 if (p
->read_tier
== tierpool_id
) {
12814 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
12819 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12820 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12821 if (np
->tiers
.count(tierpool_id
) == 0 ||
12822 ntp
->tier_of
!= pool_id
||
12823 np
->read_tier
== tierpool_id
) {
12824 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12827 np
->tiers
.erase(tierpool_id
);
12829 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
12830 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12831 get_last_committed() + 1));
12833 } else if (prefix
== "osd tier set-overlay") {
12834 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12835 if (err
== -EAGAIN
)
12840 cmd_getval(cmdmap
, "pool", poolstr
);
12841 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12843 ss
<< "unrecognized pool '" << poolstr
<< "'";
12847 string overlaypoolstr
;
12848 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
12849 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
12850 if (overlaypool_id
< 0) {
12851 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
12855 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12857 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
12858 ceph_assert(overlay_p
);
12859 if (p
->tiers
.count(overlaypool_id
) == 0) {
12860 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
12864 if (p
->read_tier
== overlaypool_id
) {
12866 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12869 if (p
->has_read_tier()) {
12870 ss
<< "pool '" << poolstr
<< "' has overlay '"
12871 << osdmap
.get_pool_name(p
->read_tier
)
12872 << "'; please remove-overlay first";
12878 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12879 np
->read_tier
= overlaypool_id
;
12880 np
->write_tier
= overlaypool_id
;
12881 np
->set_last_force_op_resend(pending_inc
.epoch
);
12882 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
12883 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
12884 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12885 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
12886 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
12887 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12888 get_last_committed() + 1));
12890 } else if (prefix
== "osd tier remove-overlay" ||
12891 prefix
== "osd tier rm-overlay") {
12893 cmd_getval(cmdmap
, "pool", poolstr
);
12894 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12896 ss
<< "unrecognized pool '" << poolstr
<< "'";
12900 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12902 if (!p
->has_read_tier()) {
12904 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12908 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
12913 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12914 if (np
->has_read_tier()) {
12915 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
12916 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
12917 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12919 if (np
->has_write_tier()) {
12920 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
12921 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
12922 nop
->set_last_force_op_resend(pending_inc
.epoch
);
12924 np
->clear_read_tier();
12925 np
->clear_write_tier();
12926 np
->set_last_force_op_resend(pending_inc
.epoch
);
12927 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12928 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12929 get_last_committed() + 1));
12931 } else if (prefix
== "osd tier cache-mode") {
12932 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12933 if (err
== -EAGAIN
)
12938 cmd_getval(cmdmap
, "pool", poolstr
);
12939 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12941 ss
<< "unrecognized pool '" << poolstr
<< "'";
12945 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12947 if (!p
->is_tier()) {
12948 ss
<< "pool '" << poolstr
<< "' is not a tier";
12953 cmd_getval(cmdmap
, "mode", modestr
);
12954 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
12955 if (int(mode
) < 0) {
12956 ss
<< "'" << modestr
<< "' is not a valid cache mode";
12962 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12964 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
12965 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
12966 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
12970 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12971 mode
!= pg_pool_t::CACHEMODE_NONE
&&
12972 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
12973 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
12975 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
12976 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12981 // pool already has this cache-mode set and there are no pending changes
12982 if (p
->cache_mode
== mode
&&
12983 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
12984 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
12985 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
12986 << " to " << pg_pool_t::get_cache_mode_name(mode
);
12991 /* Mode description:
12993 * none: No cache-mode defined
12994 * forward: Forward all reads and writes to base pool [removed]
12995 * writeback: Cache writes, promote reads from base pool
12996 * readonly: Forward writes to base pool
12997 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
12998 * proxy: Proxy all reads and writes to base pool
12999 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13001 * Hence, these are the allowed transitions:
13004 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13005 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13006 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13007 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13008 * writeback -> readproxy || proxy
13012 // We check if the transition is valid against the current pool mode, as
13013 // it is the only committed state thus far. We will blantly squash
13014 // whatever mode is on the pending state.
13016 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
13017 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13018 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
13019 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
13020 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
13021 << "' pool; only '"
13022 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY
)
13024 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
13029 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
13030 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13031 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13032 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13034 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
13035 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13036 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13038 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13039 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13040 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13042 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13043 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13044 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13045 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13047 const pool_stat_t
* pstats
=
13048 mon
->mgrstatmon()->get_pool_stat(pool_id
);
13050 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13051 ss
<< "unable to set cache-mode '"
13052 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13053 << "': dirty objects found";
13059 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13060 np
->cache_mode
= mode
;
13061 // set this both when moving to and from cache_mode NONE. this is to
13062 // capture legacy pools that were set up before this flag existed.
13063 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13064 ss
<< "set cache-mode for pool '" << poolstr
13065 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13066 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13067 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13068 ceph_assert(base_pool
);
13069 if (base_pool
->read_tier
== pool_id
||
13070 base_pool
->write_tier
== pool_id
)
13071 ss
<<" (WARNING: pool is still configured as read or write tier)";
13073 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13074 get_last_committed() + 1));
13076 } else if (prefix
== "osd tier add-cache") {
13077 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13078 if (err
== -EAGAIN
)
13083 cmd_getval(cmdmap
, "pool", poolstr
);
13084 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13086 ss
<< "unrecognized pool '" << poolstr
<< "'";
13090 string tierpoolstr
;
13091 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13092 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13093 if (tierpool_id
< 0) {
13094 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13098 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13100 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13103 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13108 if (!cmd_getval(cmdmap
, "size", size
)) {
13109 ss
<< "unable to parse 'size' value '"
13110 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13114 // make sure new tier is empty
13115 const pool_stat_t
*pstats
=
13116 mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
13117 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13118 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13122 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13123 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13124 if (int(mode
) < 0) {
13125 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13129 HitSet::Params hsp
;
13130 auto& cache_hit_set_type
=
13131 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13132 if (cache_hit_set_type
== "bloom") {
13133 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13134 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13135 hsp
= HitSet::Params(bsp
);
13136 } else if (cache_hit_set_type
== "explicit_hash") {
13137 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13138 } else if (cache_hit_set_type
== "explicit_object") {
13139 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13141 ss
<< "osd tier cache default hit set type '"
13142 << cache_hit_set_type
<< "' is not a known type";
13147 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13148 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13149 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13150 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13153 np
->tiers
.insert(tierpool_id
);
13154 np
->read_tier
= np
->write_tier
= tierpool_id
;
13155 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13156 np
->set_last_force_op_resend(pending_inc
.epoch
);
13157 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13158 ntp
->tier_of
= pool_id
;
13159 ntp
->cache_mode
= mode
;
13160 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13161 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13162 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13163 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13164 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13165 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13166 ntp
->hit_set_params
= hsp
;
13167 ntp
->target_max_bytes
= size
;
13168 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13169 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13170 get_last_committed() + 1));
13172 } else if (prefix
== "osd pool set-quota") {
13174 cmd_getval(cmdmap
, "pool", poolstr
);
13175 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13177 ss
<< "unrecognized pool '" << poolstr
<< "'";
13183 cmd_getval(cmdmap
, "field", field
);
13184 if (field
!= "max_objects" && field
!= "max_bytes") {
13185 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13190 // val could contain unit designations, so we treat as a string
13192 cmd_getval(cmdmap
, "val", val
);
13195 if (field
== "max_objects") {
13196 value
= strict_sistrtoll(val
.c_str(), &tss
);
13197 } else if (field
== "max_bytes") {
13198 value
= strict_iecstrtoll(val
.c_str(), &tss
);
13200 ceph_abort_msg("unrecognized option");
13202 if (!tss
.empty()) {
13203 ss
<< "error parsing value '" << val
<< "': " << tss
;
13208 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13209 if (field
== "max_objects") {
13210 pi
->quota_max_objects
= value
;
13211 } else if (field
== "max_bytes") {
13212 pi
->quota_max_bytes
= value
;
13214 ceph_abort_msg("unrecognized option");
13216 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13218 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13219 get_last_committed() + 1));
13221 } else if (prefix
== "osd pool application enable" ||
13222 prefix
== "osd pool application disable" ||
13223 prefix
== "osd pool application set" ||
13224 prefix
== "osd pool application rm") {
13225 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13226 if (err
== -EAGAIN
) {
13228 } else if (err
< 0) {
13233 } else if (prefix
== "osd force-create-pg") {
13236 cmd_getval(cmdmap
, "pgid", pgidstr
);
13237 if (!pgid
.parse(pgidstr
.c_str())) {
13238 ss
<< "invalid pgid '" << pgidstr
<< "'";
13242 if (!osdmap
.pg_exists(pgid
)) {
13243 ss
<< "pg " << pgid
<< " should not exist";
13248 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13250 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13251 << "that the cluster will give up ever trying to recover the lost data. Do this "
13252 << "only if you are certain that all copies of the PG are in fact lost and you are "
13253 << "willing to accept that the data is permanently destroyed. Pass "
13254 << "--yes-i-really-mean-it to proceed.";
13260 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13261 auto emplaced
= creating_pgs
.pgs
.emplace(
13263 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13264 ceph_clock_now()));
13265 creating_now
= emplaced
.second
;
13267 if (creating_now
) {
13268 ss
<< "pg " << pgidstr
<< " now creating, ok";
13269 // set the pool's CREATING flag so that (1) the osd won't ignore our
13270 // create message and (2) we won't propose any future pg_num changes
13271 // until after the PG has been instantiated.
13272 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13273 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13275 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13279 ss
<< "pg " << pgid
<< " already creating";
13289 if (err
< 0 && rs
.length() == 0)
13290 rs
= cpp_strerror(err
);
13291 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
13296 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13297 get_last_committed() + 1));
13301 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13305 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
13307 op
->mark_osdmon_event(__func__
);
13309 auto m
= op
->get_req
<MPoolOp
>();
13310 MonSession
*session
= op
->get_session();
13312 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
13317 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13318 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13320 const std::string
* pool_name
= nullptr;
13321 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
13322 if (pg_pool
!= nullptr) {
13323 pool_name
= &osdmap
.get_pool_name(m
->pool
);
13326 if (!is_unmanaged_snap_op_permitted(cct
, mon
->key_server
,
13327 session
->entity_name
, session
->caps
,
13328 session
->get_peer_socket_addr(),
13330 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13331 << "privileges. message: " << *m
<< std::endl
13332 << "caps: " << session
->caps
<< dendl
;
13333 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
13339 if (!session
->is_capable("osd", MON_CAP_W
)) {
13340 dout(0) << "got pool op from entity with insufficient privileges. "
13341 << "message: " << *m
<< std::endl
13342 << "caps: " << session
->caps
<< dendl
;
13343 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
13352 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
13354 op
->mark_osdmon_event(__func__
);
13355 auto m
= op
->get_req
<MPoolOp
>();
13357 if (enforce_pool_op_caps(op
)) {
13361 if (m
->fsid
!= mon
->monmap
->fsid
) {
13362 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
13363 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
13364 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13368 if (m
->op
== POOL_OP_CREATE
)
13369 return preprocess_pool_op_create(op
);
13371 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
13372 if (p
== nullptr) {
13373 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
13374 if (m
->op
== POOL_OP_DELETE
) {
13375 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13377 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13382 // check if the snap and snapname exist
13383 bool snap_exists
= false;
13384 if (p
->snap_exists(m
->name
.c_str()))
13385 snap_exists
= true;
13388 case POOL_OP_CREATE_SNAP
:
13389 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
13390 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13394 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13398 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13399 if (p
->is_pool_snaps_mode()) {
13400 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13404 case POOL_OP_DELETE_SNAP
:
13405 if (p
->is_unmanaged_snaps_mode()) {
13406 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13409 if (!snap_exists
) {
13410 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13414 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13415 if (p
->is_pool_snaps_mode()) {
13416 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13419 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
13420 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13424 case POOL_OP_DELETE
:
13425 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
13426 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13430 case POOL_OP_AUID_CHANGE
:
13440 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
13442 if (!osdmap
.have_pg_pool(pool
)) {
13443 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13444 << " - pool dne" << dendl
;
13447 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
13448 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13449 << " - in osdmap removed_snaps_queue" << dendl
;
13452 snapid_t begin
, end
;
13453 int r
= lookup_purged_snap(pool
, snap
, &begin
, &end
);
13455 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13456 << " - purged, [" << begin
<< "," << end
<< ")" << dendl
;
13462 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
13464 if (pending_inc
.old_pools
.count(pool
)) {
13465 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13466 << " - pool pending deletion" << dendl
;
13469 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
13470 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13471 << " - in pending new_removed_snaps" << dendl
;
13477 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
13479 op
->mark_osdmon_event(__func__
);
13480 auto m
= op
->get_req
<MPoolOp
>();
13481 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
13483 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13490 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
13492 op
->mark_osdmon_event(__func__
);
13493 auto m
= op
->get_req
<MPoolOp
>();
13494 dout(10) << "prepare_pool_op " << *m
<< dendl
;
13495 if (m
->op
== POOL_OP_CREATE
) {
13496 return prepare_pool_op_create(op
);
13497 } else if (m
->op
== POOL_OP_DELETE
) {
13498 return prepare_pool_op_delete(op
);
13502 bool changed
= false;
13504 if (!osdmap
.have_pg_pool(m
->pool
)) {
13505 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13509 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
13512 case POOL_OP_CREATE_SNAP
:
13513 if (pool
->is_tier()) {
13515 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13517 } // else, fall through
13518 case POOL_OP_DELETE_SNAP
:
13519 if (!pool
->is_unmanaged_snaps_mode()) {
13520 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
13521 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
13522 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
13530 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13533 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13534 // we won't allow removal of an unmanaged snapshot from a pool
13535 // not in unmanaged snaps mode.
13536 if (!pool
->is_unmanaged_snaps_mode()) {
13537 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
13541 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13542 // but we will allow creating an unmanaged snapshot on any pool
13543 // as long as it is not in 'pool' snaps mode.
13544 if (pool
->is_pool_snaps_mode()) {
13545 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13550 // projected pool info
13552 if (pending_inc
.new_pools
.count(m
->pool
))
13553 pp
= pending_inc
.new_pools
[m
->pool
];
13555 pp
= *osdmap
.get_pg_pool(m
->pool
);
13557 bufferlist reply_data
;
13559 // pool snaps vs unmanaged snaps are mutually exclusive
13561 case POOL_OP_CREATE_SNAP
:
13562 case POOL_OP_DELETE_SNAP
:
13563 if (pp
.is_unmanaged_snaps_mode()) {
13569 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13570 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13571 if (pp
.is_pool_snaps_mode()) {
13578 case POOL_OP_CREATE_SNAP
:
13579 if (!pp
.snap_exists(m
->name
.c_str())) {
13580 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
13581 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
13582 << " seq " << pp
.get_snap_epoch() << dendl
;
13587 case POOL_OP_DELETE_SNAP
:
13589 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
13592 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
13598 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13600 uint64_t snapid
= pp
.add_unmanaged_snap(
13601 osdmap
.require_osd_release
< ceph_release_t::octopus
);
13602 encode(snapid
, reply_data
);
13607 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13608 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
13609 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
13610 if (m
->snapid
> pp
.get_snap_seq()) {
13611 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13614 pp
.remove_unmanaged_snap(
13616 osdmap
.require_osd_release
< ceph_release_t::octopus
);
13617 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
13618 // also record the new seq as purged: this avoids a discontinuity
13619 // after all of the snaps have been purged, since the seq assigned
13620 // during removal lives in the same namespace as the actual snaps.
13621 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
13626 case POOL_OP_AUID_CHANGE
:
13627 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
13636 pp
.set_snap_epoch(pending_inc
.epoch
);
13637 pending_inc
.new_pools
[m
->pool
] = pp
;
13641 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
13645 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
13647 op
->mark_osdmon_event(__func__
);
13648 int err
= prepare_new_pool(op
);
13649 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
13653 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
13656 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
13658 // If the Pool is in use by CephFS, refuse to delete it
13659 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13660 if (pending_fsmap
.pool_in_use(pool_id
)) {
13661 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
13665 if (pool
.tier_of
>= 0) {
13666 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
13667 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
13670 if (!pool
.tiers
.empty()) {
13671 *ss
<< "pool '" << poolstr
<< "' has tiers";
13672 for(auto tier
: pool
.tiers
) {
13673 *ss
<< " " << osdmap
.get_pool_name(tier
);
13678 if (!g_conf()->mon_allow_pool_delete
) {
13679 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13683 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
13684 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
13688 *ss
<< "pool '" << poolstr
<< "' removed";
13693 * Check if it is safe to add a tier to a base pool
13696 * True if the operation should proceed, false if we should abort here
13697 * (abort doesn't necessarily mean error, could be idempotency)
13699 bool OSDMonitor::_check_become_tier(
13700 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
13701 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
13705 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
13706 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
13708 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13709 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
13710 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
13715 if (base_pool
->tiers
.count(tier_pool_id
)) {
13716 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
13718 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
13719 << base_pool_name
<< "'";
13723 if (base_pool
->is_tier()) {
13724 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
13725 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
13726 << "multiple tiers are not yet supported.";
13731 if (tier_pool
->has_tiers()) {
13732 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
13733 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
13734 it
!= tier_pool
->tiers
.end(); ++it
)
13735 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
13736 *ss
<< " multiple tiers are not yet supported.";
13741 if (tier_pool
->is_tier()) {
13742 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
13743 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
13754 * Check if it is safe to remove a tier from this base pool
13757 * True if the operation should proceed, false if we should abort here
13758 * (abort doesn't necessarily mean error, could be idempotency)
13760 bool OSDMonitor::_check_remove_tier(
13761 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
13762 const pg_pool_t
*tier_pool
,
13763 int *err
, ostream
*ss
) const
13765 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
13767 // Apply CephFS-specific checks
13768 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13769 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
13770 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
13771 // If the underlying pool is erasure coded and does not allow EC
13772 // overwrites, we can't permit the removal of the replicated tier that
13773 // CephFS relies on to access it
13774 *ss
<< "pool '" << base_pool_name
<<
13775 "' does not allow EC overwrites and is in use by CephFS"
13781 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
13782 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
13783 "tier is still in use as a writeback cache. Change the cache "
13784 "mode and flush the cache before removing it";
13794 int OSDMonitor::_prepare_remove_pool(
13795 int64_t pool
, ostream
*ss
, bool no_fake
)
13797 dout(10) << __func__
<< " " << pool
<< dendl
;
13798 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
13799 int r
= _check_remove_pool(pool
, *p
, ss
);
13803 auto new_pool
= pending_inc
.new_pools
.find(pool
);
13804 if (new_pool
!= pending_inc
.new_pools
.end()) {
13805 // if there is a problem with the pending info, wait and retry
13807 const auto& p
= new_pool
->second
;
13808 int r
= _check_remove_pool(pool
, p
, ss
);
13813 if (pending_inc
.old_pools
.count(pool
)) {
13814 dout(10) << __func__
<< " " << pool
<< " already pending removal"
13819 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
13820 string old_name
= osdmap
.get_pool_name(pool
);
13821 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
13822 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
13823 << old_name
<< " -> " << new_name
<< dendl
;
13824 pending_inc
.new_pool_names
[pool
] = new_name
;
13829 pending_inc
.old_pools
.insert(pool
);
13831 // remove any pg_temp mappings for this pool
13832 for (auto p
= osdmap
.pg_temp
->begin();
13833 p
!= osdmap
.pg_temp
->end();
13835 if (p
->first
.pool() == pool
) {
13836 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
13837 << p
->first
<< dendl
;
13838 pending_inc
.new_pg_temp
[p
->first
].clear();
13841 // remove any primary_temp mappings for this pool
13842 for (auto p
= osdmap
.primary_temp
->begin();
13843 p
!= osdmap
.primary_temp
->end();
13845 if (p
->first
.pool() == pool
) {
13846 dout(10) << __func__
<< " " << pool
13847 << " removing obsolete primary_temp" << p
->first
<< dendl
;
13848 pending_inc
.new_primary_temp
[p
->first
] = -1;
13851 // remove any pg_upmap mappings for this pool
13852 for (auto& p
: osdmap
.pg_upmap
) {
13853 if (p
.first
.pool() == pool
) {
13854 dout(10) << __func__
<< " " << pool
13855 << " removing obsolete pg_upmap "
13856 << p
.first
<< dendl
;
13857 pending_inc
.old_pg_upmap
.insert(p
.first
);
13860 // remove any pending pg_upmap mappings for this pool
13862 auto it
= pending_inc
.new_pg_upmap
.begin();
13863 while (it
!= pending_inc
.new_pg_upmap
.end()) {
13864 if (it
->first
.pool() == pool
) {
13865 dout(10) << __func__
<< " " << pool
13866 << " removing pending pg_upmap "
13867 << it
->first
<< dendl
;
13868 it
= pending_inc
.new_pg_upmap
.erase(it
);
13874 // remove any pg_upmap_items mappings for this pool
13875 for (auto& p
: osdmap
.pg_upmap_items
) {
13876 if (p
.first
.pool() == pool
) {
13877 dout(10) << __func__
<< " " << pool
13878 << " removing obsolete pg_upmap_items " << p
.first
13880 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
13883 // remove any pending pg_upmap mappings for this pool
13885 auto it
= pending_inc
.new_pg_upmap_items
.begin();
13886 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
13887 if (it
->first
.pool() == pool
) {
13888 dout(10) << __func__
<< " " << pool
13889 << " removing pending pg_upmap_items "
13890 << it
->first
<< dendl
;
13891 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
13898 // remove any choose_args for this pool
13899 CrushWrapper newcrush
;
13900 _get_pending_crush(newcrush
);
13901 if (newcrush
.have_choose_args(pool
)) {
13902 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
13903 newcrush
.rm_choose_args(pool
);
13904 pending_inc
.crush
.clear();
13905 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
13910 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
13912 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
13913 if (pending_inc
.old_pools
.count(pool
)) {
13914 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
13917 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
13918 p
!= pending_inc
.new_pool_names
.end();
13920 if (p
->second
== newname
&& p
->first
!= pool
) {
13925 pending_inc
.new_pool_names
[pool
] = newname
;
13929 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
13931 op
->mark_osdmon_event(__func__
);
13932 auto m
= op
->get_req
<MPoolOp
>();
13934 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
13935 if (ret
== -EAGAIN
) {
13936 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13940 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
13941 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
13942 pending_inc
.epoch
));
13946 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
13947 int ret
, epoch_t epoch
, bufferlist
*blp
)
13949 op
->mark_osdmon_event(__func__
);
13950 auto m
= op
->get_req
<MPoolOp
>();
13951 dout(20) << "_pool_op_reply " << ret
<< dendl
;
13952 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
13953 ret
, epoch
, get_last_committed(), blp
);
13954 mon
->send_reply(op
, reply
);
13957 void OSDMonitor::convert_pool_priorities(void)
13959 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
13960 int64_t max_prio
= 0;
13961 int64_t min_prio
= 0;
13962 for (const auto &i
: osdmap
.get_pools()) {
13963 const auto &pool
= i
.second
;
13965 if (pool
.opts
.is_set(key
)) {
13967 pool
.opts
.get(key
, &prio
);
13968 if (prio
> max_prio
)
13970 if (prio
< min_prio
)
13974 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
13975 dout(20) << __func__
<< " nothing to fix" << dendl
;
13978 // Current pool priorities exceeds new maximum
13979 for (const auto &i
: osdmap
.get_pools()) {
13980 const auto pool_id
= i
.first
;
13981 pg_pool_t pool
= i
.second
;
13984 pool
.opts
.get(key
, &prio
);
13987 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
13988 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
13989 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
13990 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
13991 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
13992 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
13997 pool
.opts
.unset(key
);
13999 pool
.opts
.set(key
, static_cast<int64_t>(n
));
14001 dout(10) << __func__
<< " pool " << pool_id
14002 << " recovery_priority adjusted "
14003 << prio
<< " to " << n
<< dendl
;
14004 pool
.last_change
= pending_inc
.epoch
;
14005 pending_inc
.new_pools
[pool_id
] = pool
;