1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
70 #include "common/config.h"
71 #include "common/errno.h"
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
90 #include "json_spirit/json_spirit_reader.h"
92 #include <boost/algorithm/string/predicate.hpp>
94 #define dout_subsys ceph_subsys_mon
// MonitorDBStore key prefixes owned by the OSD monitor; they are also
// advertised to the store via get_store_prefixes() below.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating"); // pending pg creations
static const string OSD_METADATA_PREFIX("osd_metadata");       // per-osd metadata blobs
static const string OSD_SNAP_PREFIX("osd_snap");               // snap removal/purge records
/*

  OSD snapshot metadata
  ---------------------

  -- starting with mimic, removed in octopus --

  "removed_epoch_%llu_%08lx" % (pool, epoch)
  -> interval_set<snapid_t>

  "removed_snap_%llu_%016llx" % (pool, last_snap)
  -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)


  -- starting with mimic --

  "purged_snap_%llu_%016llx" % (pool, last_snap)
  -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)

  - note that the {removed,purged}_snap put the last snap in the key so
    that we can use forward iteration only to search for an epoch in an
    interval. e.g., to test if epoch N is removed/purged, we'll find a key
    >= N that either does or doesn't contain the given snap.


  -- starting with octopus --

  "purged_epoch_%08lx" % epoch
  -> map<int64_t,interval_set<snapid_t>>

 */
130 using namespace TOPNSPC::common
;
133 struct OSDMemCache
: public PriorityCache::PriCache
{
135 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
136 int64_t committed_bytes
= 0;
137 double cache_ratio
= 0;
139 OSDMemCache(OSDMonitor
*m
) : osdmon(m
) {};
141 virtual uint64_t _get_used_bytes() const = 0;
143 virtual int64_t request_cache_bytes(
144 PriorityCache::Priority pri
, uint64_t total_cache
) const {
145 int64_t assigned
= get_cache_bytes(pri
);
148 // All cache items are currently set to have PRI1 priority
149 case PriorityCache::Priority::PRI1
:
151 int64_t request
= _get_used_bytes();
152 return (request
> assigned
) ? request
- assigned
: 0;
160 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
161 return cache_bytes
[pri
];
164 virtual int64_t get_cache_bytes() const {
167 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
168 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
169 total
+= get_cache_bytes(pri
);
174 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
175 cache_bytes
[pri
] = bytes
;
177 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
178 cache_bytes
[pri
] += bytes
;
180 virtual int64_t commit_cache_size(uint64_t total_cache
) {
181 committed_bytes
= PriorityCache::get_chunk(
182 get_cache_bytes(), total_cache
);
183 return committed_bytes
;
185 virtual int64_t get_committed_size() const {
186 return committed_bytes
;
188 virtual double get_cache_ratio() const {
191 virtual void set_cache_ratio(double ratio
) {
194 virtual string
get_cache_name() const = 0;
197 struct IncCache
: public OSDMemCache
{
198 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
200 virtual uint64_t _get_used_bytes() const {
201 return osdmon
->inc_osd_cache
.get_bytes();
204 virtual string
get_cache_name() const {
205 return "OSDMap Inc Cache";
208 uint64_t _get_num_osdmaps() const {
209 return osdmon
->inc_osd_cache
.get_size();
213 struct FullCache
: public OSDMemCache
{
214 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
216 virtual uint64_t _get_used_bytes() const {
217 return osdmon
->full_osd_cache
.get_bytes();
220 virtual string
get_cache_name() const {
221 return "OSDMap Full Cache";
224 uint64_t _get_num_osdmaps() const {
225 return osdmon
->full_osd_cache
.get_size();
// Process-wide handles to the two cache shims above; created in the
// OSDMonitor constructor and registered with the priority cache manager
// in register_cache_with_pcm().
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Hard limits on pool "application" metadata accepted from clients.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
236 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
239 auto& match
= grant
.match
;
240 if (match
.is_match_all()) {
242 } else if (pool_name
!= nullptr &&
243 !match
.pool_namespace
.pool_name
.empty() &&
244 match
.pool_namespace
.pool_name
== *pool_name
) {
251 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
252 const KeyServer
& key_server
,
253 const EntityName
& entity_name
,
254 const MonCap
& mon_caps
,
255 const entity_addr_t
& peer_socket_addr
,
256 const std::string
* pool_name
)
258 typedef std::map
<std::string
, std::string
> CommandArgs
;
260 if (mon_caps
.is_capable(
261 cct
, entity_name
, "osd",
262 "osd pool op unmanaged-snap",
263 (pool_name
== nullptr ?
264 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
265 CommandArgs
{{"poolname", *pool_name
}}),
271 AuthCapsInfo caps_info
;
272 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
274 dout(10) << "unable to locate OSD cap data for " << entity_name
275 << " in auth db" << dendl
;
280 if (caps_info
.caps
.length() > 0) {
281 auto p
= caps_info
.caps
.cbegin();
284 } catch (const buffer::error
&err
) {
285 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
292 if (!osd_cap
.parse(caps_str
, nullptr)) {
293 dout(10) << "unable to parse OSD cap data for " << entity_name
294 << " in auth db" << dendl
;
298 // if the entity has write permissions in one or all pools, permit
299 // usage of unmanaged-snapshots
300 if (osd_cap
.allow_all()) {
304 for (auto& grant
: osd_cap
.grants
) {
305 if (grant
.profile
.is_valid()) {
306 for (auto& profile_grant
: grant
.profile_grants
) {
307 if (is_osd_writable(profile_grant
, pool_name
)) {
311 } else if (is_osd_writable(grant
, pool_name
)) {
319 } // anonymous namespace
321 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
323 if (epoch_by_pg
.size() <= ps
) {
324 epoch_by_pg
.resize(ps
+ 1, 0);
326 const auto old_lec
= epoch_by_pg
[ps
];
327 if (old_lec
>= last_epoch_clean
) {
331 epoch_by_pg
[ps
] = last_epoch_clean
;
332 if (last_epoch_clean
< floor
) {
333 floor
= last_epoch_clean
;
334 } else if (last_epoch_clean
> floor
) {
335 if (old_lec
== floor
) {
336 // probably should increase floor?
337 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
338 std::end(epoch_by_pg
));
342 if (ps
!= next_missing
) {
345 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
346 if (epoch_by_pg
[next_missing
] == 0) {
352 void LastEpochClean::remove_pool(uint64_t pool
)
354 report_by_pool
.erase(pool
);
357 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
359 auto& lec
= report_by_pool
[pg
.pool()];
360 return lec
.report(pg
.ps(), last_epoch_clean
);
363 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
365 auto floor
= latest
.get_epoch();
366 for (auto& pool
: latest
.get_pools()) {
367 auto reported
= report_by_pool
.find(pool
.first
);
368 if (reported
== report_by_pool
.end()) {
371 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
374 if (reported
->second
.floor
< floor
) {
375 floor
= reported
->second
.floor
;
381 void LastEpochClean::dump(Formatter
*f
) const
383 f
->open_array_section("per_pool");
385 for (auto& it
: report_by_pool
) {
386 f
->open_object_section("pool");
387 f
->dump_unsigned("poolid", it
.first
);
388 f
->dump_unsigned("floor", it
.second
.floor
);
395 class C_UpdateCreatingPGs
: public Context
{
400 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
401 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
402 void finish(int r
) override
{
404 utime_t end
= ceph_clock_now();
405 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
406 << (end
- start
) << " seconds" << dendl
;
407 osdmon
->update_creating_pgs();
408 osdmon
->check_pg_creates_subs();
414 #define dout_prefix _prefix(_dout, mon, osdmap)
415 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const OSDMap
& osdmap
) {
416 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
417 << "(" << mon
->get_state_name()
418 << ").osd e" << osdmap
.get_epoch() << " ";
421 OSDMonitor::OSDMonitor(
425 const string
& service_name
)
426 : PaxosService(mn
, p
, service_name
),
428 inc_osd_cache(g_conf()->mon_osd_cache_size
),
429 full_osd_cache(g_conf()->mon_osd_cache_size
),
430 has_osdmap_manifest(false),
431 mapper(mn
->cct
, &mn
->cpu_tp
)
433 inc_cache
= std::make_shared
<IncCache
>(this);
434 full_cache
= std::make_shared
<FullCache
>(this);
435 cct
->_conf
.add_observer(this);
436 int r
= _set_cache_sizes();
438 derr
<< __func__
<< " using default osd cache size - mon_osd_cache_size ("
439 << g_conf()->mon_osd_cache_size
440 << ") without priority cache management"
445 const char **OSDMonitor::get_tracked_conf_keys() const
447 static const char* KEYS
[] = {
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
456 void OSDMonitor::handle_conf_change(const ConfigProxy
& conf
,
457 const std::set
<std::string
> &changed
)
459 dout(10) << __func__
<< " " << changed
<< dendl
;
461 if (changed
.count("mon_memory_autotune")) {
462 _set_cache_autotuning();
464 if (changed
.count("mon_memory_target") ||
465 changed
.count("rocksdb_cache_size")) {
466 int r
= _update_mon_cache_settings();
468 derr
<< __func__
<< " mon_memory_target:"
469 << g_conf()->mon_memory_target
470 << " rocksdb_cache_size:"
471 << g_conf()->rocksdb_cache_size
472 << ". Unable to update cache size."
478 void OSDMonitor::_set_cache_autotuning()
480 if (!g_conf()->mon_memory_autotune
&& pcm
!= nullptr) {
481 // Disable cache autotuning
482 std::lock_guard
l(balancer_lock
);
486 if (g_conf()->mon_memory_autotune
&& pcm
== nullptr) {
487 int r
= register_cache_with_pcm();
490 << " Error while registering osdmon caches with pcm."
491 << " Cache auto tuning not enabled."
493 mon_memory_autotune
= false;
495 mon_memory_autotune
= true;
500 int OSDMonitor::_update_mon_cache_settings()
502 if (g_conf()->mon_memory_target
<= 0 ||
503 g_conf()->mon_memory_target
< mon_memory_min
||
504 g_conf()->rocksdb_cache_size
<= 0) {
508 if (pcm
== nullptr && rocksdb_binned_kv_cache
== nullptr) {
509 derr
<< __func__
<< " not using pcm and rocksdb" << dendl
;
513 uint64_t old_mon_memory_target
= mon_memory_target
;
514 uint64_t old_rocksdb_cache_size
= rocksdb_cache_size
;
516 // Set the new pcm memory cache sizes
517 mon_memory_target
= g_conf()->mon_memory_target
;
518 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
520 uint64_t base
= mon_memory_base
;
521 double fragmentation
= mon_memory_fragmentation
;
522 uint64_t target
= mon_memory_target
;
523 uint64_t min
= mon_memory_min
;
526 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
527 if (ltarget
> base
+ min
) {
528 max
= ltarget
- base
;
531 int r
= _set_cache_ratios();
533 derr
<< __func__
<< " Cache ratios for pcm could not be set."
534 << " Review the kv (rocksdb) and mon_memory_target sizes."
536 mon_memory_target
= old_mon_memory_target
;
537 rocksdb_cache_size
= old_rocksdb_cache_size
;
541 if (mon_memory_autotune
&& pcm
!= nullptr) {
542 std::lock_guard
l(balancer_lock
);
543 // set pcm cache levels
544 pcm
->set_target_memory(target
);
545 pcm
->set_min_memory(min
);
546 pcm
->set_max_memory(max
);
547 // tune memory based on new values
550 _set_new_cache_sizes();
551 dout(1) << __func__
<< " Updated mon cache setting."
552 << " target: " << target
560 int OSDMonitor::_set_cache_sizes()
562 if (g_conf()->mon_memory_autotune
) {
563 // set the new osdmon cache targets to be managed by pcm
564 mon_osd_cache_size
= g_conf()->mon_osd_cache_size
;
565 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
566 mon_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
567 mon_memory_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
568 mon_memory_target
= g_conf()->mon_memory_target
;
569 mon_memory_min
= g_conf()->mon_osd_cache_size_min
;
570 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
571 derr
<< __func__
<< " mon_memory_target:" << mon_memory_target
572 << " mon_memory_min:" << mon_memory_min
573 << ". Invalid size option(s) provided."
577 // Set the initial inc and full LRU cache sizes
578 inc_osd_cache
.set_bytes(mon_memory_min
);
579 full_osd_cache
.set_bytes(mon_memory_min
);
580 mon_memory_autotune
= g_conf()->mon_memory_autotune
;
585 bool OSDMonitor::_have_pending_crush()
587 return pending_inc
.crush
.length() > 0;
590 CrushWrapper
&OSDMonitor::_get_stable_crush()
592 return *osdmap
.crush
;
595 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
598 if (pending_inc
.crush
.length())
599 bl
= pending_inc
.crush
;
601 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
603 auto p
= bl
.cbegin();
607 void OSDMonitor::create_initial()
609 dout(10) << "create_initial for " << mon
->monmap
->fsid
<< dendl
;
614 mon
->store
->get("mkfs", "osdmap", bl
);
618 newmap
.set_fsid(mon
->monmap
->fsid
);
620 newmap
.build_simple(cct
, 0, mon
->monmap
->fsid
, 0);
623 newmap
.created
= newmap
.modified
= ceph_clock_now();
625 // new clusters should sort bitwise by default.
626 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
629 CEPH_OSDMAP_RECOVERY_DELETES
|
630 CEPH_OSDMAP_PURGED_SNAPDIRS
|
631 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
632 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
633 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
634 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
635 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
636 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
637 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
639 // new cluster should require latest by default
640 if (g_conf().get_val
<bool>("mon_debug_no_require_octopus")) {
641 if (g_conf().get_val
<bool>("mon_debug_no_require_nautilus")) {
642 derr
<< __func__
<< " mon_debug_no_require_octopus and nautilus=true" << dendl
;
643 newmap
.require_osd_release
= ceph_release_t::mimic
;
645 derr
<< __func__
<< " mon_debug_no_require_octopus=true" << dendl
;
646 newmap
.require_osd_release
= ceph_release_t::nautilus
;
649 newmap
.require_osd_release
= ceph_release_t::octopus
;
650 ceph_release_t r
= ceph_release_from_name(
651 g_conf()->mon_osd_initial_require_min_compat_client
);
653 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
655 newmap
.require_min_compat_client
= r
;
658 // encode into pending incremental
659 uint64_t features
= newmap
.get_encoding_features();
660 newmap
.encode(pending_inc
.fullmap
,
661 features
| CEPH_FEATURE_RESERVED
);
662 pending_inc
.full_crc
= newmap
.get_crc();
663 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
666 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
668 s
.insert(service_name
);
669 s
.insert(OSD_PG_CREATING_PREFIX
);
670 s
.insert(OSD_METADATA_PREFIX
);
671 s
.insert(OSD_SNAP_PREFIX
);
674 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
676 // we really don't care if the version has been updated, because we may
677 // have trimmed without having increased the last committed; yet, we may
678 // need to update the in-memory manifest.
679 load_osdmap_manifest();
681 version_t version
= get_last_committed();
682 if (version
== osdmap
.epoch
)
684 ceph_assert(version
> osdmap
.epoch
);
686 dout(15) << "update_from_paxos paxos e " << version
687 << ", my e " << osdmap
.epoch
<< dendl
;
690 if (!mapping_job
->is_done()) {
691 dout(1) << __func__
<< " mapping job "
692 << mapping_job
.get() << " did not complete, "
693 << mapping_job
->shards
<< " left, canceling" << dendl
;
694 mapping_job
->abort();
702 * We will possibly have a stashed latest that *we* wrote, and we will
703 * always be sure to have the oldest full map in the first..last range
704 * due to encode_trim_extra(), which includes the oldest full map in the trim
707 * encode_trim_extra() does not however write the full map's
708 * version to 'full_latest'. This is only done when we are building the
709 * full maps from the incremental versions. But don't panic! We make sure
710 * that the following conditions find whichever full map version is newer.
712 version_t latest_full
= get_version_latest_full();
713 if (latest_full
== 0 && get_first_committed() > 1)
714 latest_full
= get_first_committed();
716 if (get_first_committed() > 1 &&
717 latest_full
< get_first_committed()) {
718 // the monitor could be just sync'ed with its peer, and the latest_full key
719 // is not encoded in the paxos commits in encode_pending(), so we need to
720 // make sure we get it pointing to a proper version.
721 version_t lc
= get_last_committed();
722 version_t fc
= get_first_committed();
724 dout(10) << __func__
<< " looking for valid full map in interval"
725 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
728 for (version_t v
= lc
; v
>= fc
; v
--) {
729 string full_key
= "full_" + stringify(v
);
730 if (mon
->store
->exists(get_service_name(), full_key
)) {
731 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
737 ceph_assert(latest_full
> 0);
738 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
739 put_version_latest_full(t
, latest_full
);
740 mon
->store
->apply_transaction(t
);
741 dout(10) << __func__
<< " updated the on-disk full map version to "
742 << latest_full
<< dendl
;
745 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
746 bufferlist latest_bl
;
747 get_version_full(latest_full
, latest_bl
);
748 ceph_assert(latest_bl
.length() != 0);
749 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
751 osdmap
.decode(latest_bl
);
755 if (!mon
->store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
756 auto p
= bl
.cbegin();
757 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
758 creating_pgs
.decode(p
);
759 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
760 << creating_pgs
.last_scan_epoch
761 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
763 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
767 // walk through incrementals
768 MonitorDBStore::TransactionRef t
;
770 while (version
> osdmap
.epoch
) {
772 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
773 ceph_assert(err
== 0);
774 ceph_assert(inc_bl
.length());
775 // set priority cache manager levels if the osdmap is
776 // being populated for the first time.
777 if (mon_memory_autotune
&& pcm
== nullptr) {
778 int r
= register_cache_with_pcm();
781 << " Error while registering osdmon caches with pcm."
782 << " Proceeding without cache auto tuning."
787 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
789 OSDMap::Incremental
inc(inc_bl
);
790 err
= osdmap
.apply_incremental(inc
);
791 ceph_assert(err
== 0);
794 t
.reset(new MonitorDBStore::Transaction
);
796 // Write out the full map for all past epochs. Encode the full
797 // map with the same features as the incremental. If we don't
798 // know, use the quorum features. If we don't know those either,
799 // encode with all features.
800 uint64_t f
= inc
.encode_features
;
802 f
= mon
->get_quorum_con_features();
806 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
807 tx_size
+= full_bl
.length();
809 bufferlist orig_full_bl
;
810 get_version_full(osdmap
.epoch
, orig_full_bl
);
811 if (orig_full_bl
.length()) {
812 // the primary provided the full map
813 ceph_assert(inc
.have_crc
);
814 if (inc
.full_crc
!= osdmap
.crc
) {
815 // This will happen if the mons were running mixed versions in
816 // the past or some other circumstance made the full encoded
817 // maps divergent. Reloading here will bring us back into
818 // sync with the primary for this and all future maps. OSDs
819 // will also be brought back into sync when they discover the
820 // crc mismatch and request a full map from a mon.
821 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
824 dout(20) << __func__
<< " my (bad) full osdmap:\n";
825 JSONFormatter
jf(true);
826 jf
.dump_object("osdmap", osdmap
);
828 *_dout
<< "\nhexdump:\n";
829 full_bl
.hexdump(*_dout
);
833 osdmap
.decode(orig_full_bl
);
835 dout(20) << __func__
<< " canonical full osdmap:\n";
836 JSONFormatter
jf(true);
837 jf
.dump_object("osdmap", osdmap
);
839 *_dout
<< "\nhexdump:\n";
840 orig_full_bl
.hexdump(*_dout
);
844 ceph_assert(!inc
.have_crc
);
845 put_version_full(t
, osdmap
.epoch
, full_bl
);
847 put_version_latest_full(t
, osdmap
.epoch
);
850 dout(1) << osdmap
<< dendl
;
852 if (osdmap
.epoch
== 1) {
853 t
->erase("mkfs", "osdmap");
856 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
857 mon
->store
->apply_transaction(t
);
858 t
= MonitorDBStore::TransactionRef();
861 for (const auto &osd_state
: inc
.new_state
) {
862 if (osd_state
.second
& CEPH_OSD_UP
) {
863 // could be marked up *or* down, but we're too lazy to check which
864 last_osd_report
.erase(osd_state
.first
);
866 if (osd_state
.second
& CEPH_OSD_EXISTS
) {
867 // could be created *or* destroyed, but we can safely drop it
868 osd_epochs
.erase(osd_state
.first
);
874 mon
->store
->apply_transaction(t
);
877 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
878 if (osdmap
.is_out(o
))
880 auto found
= down_pending_out
.find(o
);
881 if (osdmap
.is_down(o
)) {
882 // populate down -> out map
883 if (found
== down_pending_out
.end()) {
884 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
885 down_pending_out
[o
] = ceph_clock_now();
888 if (found
!= down_pending_out
.end()) {
889 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
890 down_pending_out
.erase(found
);
894 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
897 check_pg_creates_subs();
899 share_map_with_random_osd();
903 // make sure our feature bits reflect the latest map
904 update_msgr_features();
906 if (!mon
->is_leader()) {
907 // will be called by on_active() on the leader, avoid doing so twice
912 int OSDMonitor::register_cache_with_pcm()
914 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
915 derr
<< __func__
<< " Invalid memory size specified for mon caches."
916 << " Caches will not be auto-tuned."
920 uint64_t base
= mon_memory_base
;
921 double fragmentation
= mon_memory_fragmentation
;
922 // For calculating total target memory, consider rocksdb cache size.
923 uint64_t target
= mon_memory_target
;
924 uint64_t min
= mon_memory_min
;
927 // Apply the same logic as in bluestore to set the max amount
928 // of memory to use for cache. Assume base memory for OSDMaps
929 // and then add in some overhead for fragmentation.
930 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
931 if (ltarget
> base
+ min
) {
932 max
= ltarget
- base
;
935 rocksdb_binned_kv_cache
= mon
->store
->get_priority_cache();
936 if (!rocksdb_binned_kv_cache
) {
937 derr
<< __func__
<< " not using rocksdb" << dendl
;
941 int r
= _set_cache_ratios();
943 derr
<< __func__
<< " Cache ratios for pcm could not be set."
944 << " Review the kv (rocksdb) and mon_memory_target sizes."
949 pcm
= std::make_shared
<PriorityCache::Manager
>(
950 cct
, min
, max
, target
, true);
951 pcm
->insert("kv", rocksdb_binned_kv_cache
, true);
952 pcm
->insert("inc", inc_cache
, true);
953 pcm
->insert("full", full_cache
, true);
954 dout(1) << __func__
<< " pcm target: " << target
955 << " pcm max: " << max
956 << " pcm min: " << min
957 << " inc_osd_cache size: " << inc_osd_cache
.get_size()
962 int OSDMonitor::_set_cache_ratios()
964 double old_cache_kv_ratio
= cache_kv_ratio
;
966 // Set the cache ratios for kv(rocksdb), inc and full caches
967 cache_kv_ratio
= (double)rocksdb_cache_size
/ (double)mon_memory_target
;
968 if (cache_kv_ratio
>= 1.0) {
969 derr
<< __func__
<< " Cache kv ratio (" << cache_kv_ratio
970 << ") must be in range [0,<1.0]."
972 cache_kv_ratio
= old_cache_kv_ratio
;
975 rocksdb_binned_kv_cache
->set_cache_ratio(cache_kv_ratio
);
976 cache_inc_ratio
= cache_full_ratio
= (1.0 - cache_kv_ratio
) / 2;
977 inc_cache
->set_cache_ratio(cache_inc_ratio
);
978 full_cache
->set_cache_ratio(cache_full_ratio
);
980 dout(1) << __func__
<< " kv ratio " << cache_kv_ratio
981 << " inc ratio " << cache_inc_ratio
982 << " full ratio " << cache_full_ratio
987 void OSDMonitor::start_mapping()
989 // initiate mapping job
991 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
993 mapping_job
->abort();
995 if (!osdmap
.get_pools().empty()) {
996 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
997 mapping_job
= mapping
.start_update(osdmap
, mapper
,
998 g_conf()->mon_osd_mapping_pgs_per_chunk
);
999 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
1000 << " at " << fin
->start
<< dendl
;
1001 mapping_job
->set_finish_event(fin
);
1003 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
1004 mapping_job
= nullptr;
1008 void OSDMonitor::update_msgr_features()
1011 types
.insert((int)entity_name_t::TYPE_OSD
);
1012 types
.insert((int)entity_name_t::TYPE_CLIENT
);
1013 types
.insert((int)entity_name_t::TYPE_MDS
);
1014 types
.insert((int)entity_name_t::TYPE_MON
);
1015 for (set
<int>::iterator q
= types
.begin(); q
!= types
.end(); ++q
) {
1017 uint64_t features
= osdmap
.get_features(*q
, &mask
);
1018 if ((mon
->messenger
->get_policy(*q
).features_required
& mask
) != features
) {
1019 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1020 ceph::net::Policy p
= mon
->messenger
->get_policy(*q
);
1021 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1022 mon
->messenger
->set_policy(*q
, p
);
1027 void OSDMonitor::on_active()
1031 if (mon
->is_leader()) {
1032 mon
->clog
->debug() << "osdmap " << osdmap
;
1033 if (!priority_convert
) {
1034 // Only do this once at start-up
1035 convert_pool_priorities();
1036 priority_convert
= true;
1039 list
<MonOpRequestRef
> ls
;
1040 take_all_failures(ls
);
1041 while (!ls
.empty()) {
1042 MonOpRequestRef op
= ls
.front();
1043 op
->mark_osdmon_event(__func__
);
1051 void OSDMonitor::on_restart()
1053 last_osd_report
.clear();
1056 void OSDMonitor::on_shutdown()
1058 dout(10) << __func__
<< dendl
;
1060 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1062 mapping_job
->abort();
1065 // discard failure info, waiters
1066 list
<MonOpRequestRef
> ls
;
1067 take_all_failures(ls
);
1071 void OSDMonitor::update_logger()
1073 dout(10) << "update_logger" << dendl
;
1075 mon
->cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1076 mon
->cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1077 mon
->cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1078 mon
->cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
1081 void OSDMonitor::create_pending()
1083 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
1084 pending_inc
.fsid
= mon
->monmap
->fsid
;
1085 pending_metadata
.clear();
1086 pending_metadata_rm
.clear();
1087 pending_pseudo_purged_snaps
.clear();
1089 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
1091 // safety checks (this shouldn't really happen)
1093 if (osdmap
.backfillfull_ratio
<= 0) {
1094 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
1095 if (pending_inc
.new_backfillfull_ratio
> 1.0)
1096 pending_inc
.new_backfillfull_ratio
/= 100;
1097 dout(1) << __func__
<< " setting backfillfull_ratio = "
1098 << pending_inc
.new_backfillfull_ratio
<< dendl
;
1100 if (osdmap
.full_ratio
<= 0) {
1101 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
1102 if (pending_inc
.new_full_ratio
> 1.0)
1103 pending_inc
.new_full_ratio
/= 100;
1104 dout(1) << __func__
<< " setting full_ratio = "
1105 << pending_inc
.new_full_ratio
<< dendl
;
1107 if (osdmap
.nearfull_ratio
<= 0) {
1108 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
1109 if (pending_inc
.new_nearfull_ratio
> 1.0)
1110 pending_inc
.new_nearfull_ratio
/= 100;
1111 dout(1) << __func__
<< " setting nearfull_ratio = "
1112 << pending_inc
.new_nearfull_ratio
<< dendl
;
1116 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1118 if (osdmap
.crush
->has_legacy_rule_ids()) {
1119 CrushWrapper newcrush
;
1120 _get_pending_crush(newcrush
);
1122 // First, for all pools, work out which rule they really used
1123 // by resolving ruleset to rule.
1124 for (const auto &i
: osdmap
.get_pools()) {
1125 const auto pool_id
= i
.first
;
1126 const auto &pool
= i
.second
;
1127 int new_rule_id
= newcrush
.find_rule(pool
.crush_rule
,
1128 pool
.type
, pool
.size
);
1130 dout(1) << __func__
<< " rewriting pool "
1131 << osdmap
.get_pool_name(pool_id
) << " crush ruleset "
1132 << pool
.crush_rule
<< " -> rule id " << new_rule_id
<< dendl
;
1133 if (pending_inc
.new_pools
.count(pool_id
) == 0) {
1134 pending_inc
.new_pools
[pool_id
] = pool
;
1136 pending_inc
.new_pools
[pool_id
].crush_rule
= new_rule_id
;
1139 // Now, go ahead and renumber all the rules so that their
1140 // rule_id field corresponds to their position in the array
1141 auto old_to_new
= newcrush
.renumber_rules();
1142 dout(1) << __func__
<< " Rewrote " << old_to_new
<< " crush IDs:" << dendl
;
1143 for (const auto &i
: old_to_new
) {
1144 dout(1) << __func__
<< " " << i
.first
<< " -> " << i
.second
<< dendl
;
1146 pending_inc
.crush
.clear();
1147 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
1152 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1153 const OSDMap
& nextmap
)
1155 dout(10) << __func__
<< dendl
;
1156 creating_pgs_t pending_creatings
;
1158 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1159 pending_creatings
= creating_pgs
;
1161 // check for new or old pools
1162 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1163 unsigned queued
= 0;
1164 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1167 &pending_creatings
);
1168 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1171 &pending_creatings
);
1172 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1173 for (auto deleted_pool
: inc
.old_pools
) {
1174 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1175 dout(10) << __func__
<< " " << removed
1176 << " pg removed because containing pool deleted: "
1177 << deleted_pool
<< dendl
;
1178 last_epoch_clean
.remove_pool(deleted_pool
);
1180 // pgmon updates its creating_pgs in check_osd_map() which is called by
1181 // on_active() and check_osd_map() could be delayed if lease expires, so its
1182 // creating_pgs could be stale in comparison with the one of osdmon. let's
1183 // trim them here. otherwise, they will be added back after being erased.
1184 unsigned removed
= 0;
1185 for (auto& pg
: pending_created_pgs
) {
1186 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1187 pending_creatings
.created_pools
.insert(pg
.pool());
1188 removed
+= pending_creatings
.pgs
.erase(pg
);
1190 pending_created_pgs
.clear();
1191 dout(10) << __func__
<< " " << removed
1192 << " pgs removed because they're created" << dendl
;
1193 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1196 // filter out any pgs that shouldn't exist.
1198 auto i
= pending_creatings
.pgs
.begin();
1199 while (i
!= pending_creatings
.pgs
.end()) {
1200 if (!nextmap
.pg_exists(i
->first
)) {
1201 dout(10) << __func__
<< " removing pg " << i
->first
1202 << " which should not exist" << dendl
;
1203 i
= pending_creatings
.pgs
.erase(i
);
1211 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1212 const auto total
= pending_creatings
.pgs
.size();
1213 while (pending_creatings
.pgs
.size() < max
&&
1214 !pending_creatings
.queue
.empty()) {
1215 auto p
= pending_creatings
.queue
.begin();
1216 int64_t poolid
= p
->first
;
1217 dout(10) << __func__
<< " pool " << poolid
1218 << " created " << p
->second
.created
1219 << " modified " << p
->second
.modified
1220 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1222 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1223 p
->second
.end
- p
->second
.start
);
1224 ps_t first
= p
->second
.start
;
1225 ps_t end
= first
+ n
;
1226 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1227 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1228 // NOTE: use the *current* epoch as the PG creation epoch so that the
1229 // OSD does not have to generate a long set of PastIntervals.
1230 pending_creatings
.pgs
.emplace(
1232 creating_pgs_t::pg_create_info(inc
.epoch
,
1233 p
->second
.modified
));
1234 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1236 p
->second
.start
= end
;
1237 if (p
->second
.done()) {
1238 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1239 pending_creatings
.queue
.erase(p
);
1241 dout(10) << __func__
<< " pool " << poolid
1242 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1246 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1247 << " pools" << dendl
;
1249 if (mon
->monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1250 // walk creating pgs' history and past_intervals forward
1251 for (auto& i
: pending_creatings
.pgs
) {
1252 // this mirrors PG::start_peering_interval()
1253 pg_t pgid
= i
.first
;
1255 // this is a bit imprecise, but sufficient?
1256 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1257 const pg_pool_t
*pi
;
1258 bool operator()(const set
<pg_shard_t
> &have
) const {
1259 return have
.size() >= pi
->min_size
;
1261 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1262 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1264 vector
<int> up
, acting
;
1265 int up_primary
, acting_primary
;
1266 nextmap
.pg_to_up_acting_osds(
1267 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1268 if (i
.second
.history
.epoch_created
== 0) {
1269 // new pg entry, set it up
1271 i
.second
.acting
= acting
;
1272 i
.second
.up_primary
= up_primary
;
1273 i
.second
.acting_primary
= acting_primary
;
1274 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1275 i
.second
.create_stamp
);
1276 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1277 << " up " << i
.second
.up
1278 << " p " << i
.second
.up_primary
1279 << " acting " << i
.second
.acting
1280 << " p " << i
.second
.acting_primary
1281 << " history " << i
.second
.history
1282 << " past_intervals " << i
.second
.past_intervals
1285 std::stringstream debug
;
1286 if (PastIntervals::check_new_interval(
1287 i
.second
.acting_primary
, acting_primary
,
1288 i
.second
.acting
, acting
,
1289 i
.second
.up_primary
, up_primary
,
1291 i
.second
.history
.same_interval_since
,
1292 i
.second
.history
.last_epoch_clean
,
1297 &i
.second
.past_intervals
,
1299 epoch_t e
= inc
.epoch
;
1300 i
.second
.history
.same_interval_since
= e
;
1301 if (i
.second
.up
!= up
) {
1302 i
.second
.history
.same_up_since
= e
;
1304 if (i
.second
.acting_primary
!= acting_primary
) {
1305 i
.second
.history
.same_primary_since
= e
;
1308 osdmap
.get_pg_num(pgid
.pool()),
1309 nextmap
.get_pg_num(pgid
.pool()),
1311 i
.second
.history
.last_epoch_split
= e
;
1313 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1314 << " up " << i
.second
.up
<< " -> " << up
1315 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1316 << " acting " << i
.second
.acting
<< " -> " << acting
1317 << " p " << i
.second
.acting_primary
<< " -> "
1319 << " history " << i
.second
.history
1320 << " past_intervals " << i
.second
.past_intervals
1322 dout(20) << " debug: " << debug
.str() << dendl
;
1324 i
.second
.acting
= acting
;
1325 i
.second
.up_primary
= up_primary
;
1326 i
.second
.acting_primary
= acting_primary
;
1331 dout(10) << __func__
1332 << " " << (pending_creatings
.pgs
.size() - total
)
1333 << "/" << pending_creatings
.pgs
.size()
1334 << " pgs added from queued pools" << dendl
;
1335 return pending_creatings
;
1338 void OSDMonitor::maybe_prime_pg_temp()
1341 if (pending_inc
.crush
.length()) {
1342 dout(10) << __func__
<< " new crush map, all" << dendl
;
1346 if (!pending_inc
.new_up_client
.empty()) {
1347 dout(10) << __func__
<< " new up osds, all" << dendl
;
1351 // check for interesting OSDs
1353 for (auto p
= pending_inc
.new_state
.begin();
1354 !all
&& p
!= pending_inc
.new_state
.end();
1356 if ((p
->second
& CEPH_OSD_UP
) &&
1357 osdmap
.is_up(p
->first
)) {
1358 osds
.insert(p
->first
);
1361 for (map
<int32_t,uint32_t>::iterator p
= pending_inc
.new_weight
.begin();
1362 !all
&& p
!= pending_inc
.new_weight
.end();
1364 if (p
->second
< osdmap
.get_weight(p
->first
)) {
1366 osds
.insert(p
->first
);
1368 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1374 if (!all
&& osds
.empty())
1379 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1380 if (estimate
> mapping
.get_num_pgs() *
1381 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1382 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1383 << osds
.size() << " osds >= "
1384 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1385 << mapping
.get_num_pgs() << " pgs, all"
1389 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1390 << osds
.size() << " osds" << dendl
;
1395 next
.deepish_copy_from(osdmap
);
1396 next
.apply_incremental(pending_inc
);
1398 if (next
.get_pools().empty()) {
1399 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1401 PrimeTempJob
job(next
, this);
1402 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1403 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1404 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1406 dout(10) << __func__
<< " did not finish in "
1407 << g_conf()->mon_osd_prime_pg_temp_max_time
1408 << ", stopping" << dendl
;
1412 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1413 utime_t stop
= ceph_clock_now();
1414 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1415 const int chunk
= 1000;
1417 std::unordered_set
<pg_t
> did_pgs
;
1418 for (auto osd
: osds
) {
1419 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1420 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1421 for (auto pgid
: pgs
) {
1422 if (!did_pgs
.insert(pgid
).second
) {
1425 prime_pg_temp(next
, pgid
);
1428 if (ceph_clock_now() > stop
) {
1429 dout(10) << __func__
<< " consumed more than "
1430 << g_conf()->mon_osd_prime_pg_temp_max_time
1431 << " seconds, stopping"
1441 void OSDMonitor::prime_pg_temp(
1445 // TODO: remove this creating_pgs direct access?
1446 if (creating_pgs
.pgs
.count(pgid
)) {
1449 if (!osdmap
.pg_exists(pgid
)) {
1453 vector
<int> up
, acting
;
1454 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1456 vector
<int> next_up
, next_acting
;
1457 int next_up_primary
, next_acting_primary
;
1458 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1459 &next_acting
, &next_acting_primary
);
1460 if (acting
== next_acting
&&
1461 !(up
!= acting
&& next_up
== next_acting
))
1462 return; // no change since last epoch
1465 return; // if previously empty now we can be no worse off
1466 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1467 if (pool
&& acting
.size() < pool
->min_size
)
1468 return; // can be no worse off than before
1470 if (next_up
== next_acting
) {
1472 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1476 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1477 << " -> " << next_up
<< "/" << next_acting
1478 << ", priming " << acting
1481 std::lock_guard
l(prime_pg_temp_lock
);
1482 // do not touch a mapping if a change is pending
1483 pending_inc
.new_pg_temp
.emplace(
1485 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1493 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1495 dout(10) << "encode_pending e " << pending_inc
.epoch
1499 dout(1) << __func__
<< " osdmap full prune encoded e"
1500 << pending_inc
.epoch
<< dendl
;
1503 // finalize up pending_inc
1504 pending_inc
.modified
= ceph_clock_now();
1506 int r
= pending_inc
.propagate_base_properties_to_tiers(cct
, osdmap
);
1507 ceph_assert(r
== 0);
1510 if (!mapping_job
->is_done()) {
1511 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1512 << mapping_job
.get() << " did not complete, "
1513 << mapping_job
->shards
<< " left" << dendl
;
1514 mapping_job
->abort();
1515 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1516 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1517 << mapping_job
.get() << " is prior epoch "
1518 << mapping
.get_epoch() << dendl
;
1520 if (g_conf()->mon_osd_prime_pg_temp
) {
1521 maybe_prime_pg_temp();
1524 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1525 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1528 mapping_job
.reset();
1530 // ensure we don't have blank new_state updates. these are interrpeted as
1531 // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p
= pending_inc
.new_state
.begin();
1533 while (p
!= pending_inc
.new_state
.end()) {
1534 if (p
->second
== 0) {
1535 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1536 p
= pending_inc
.new_state
.erase(p
);
1538 if (p
->second
& CEPH_OSD_UP
) {
1539 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1544 if (!pending_inc
.new_up_client
.empty()) {
1545 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1547 for (auto& i
: pending_inc
.new_weight
) {
1548 if (i
.first
>= osdmap
.max_osd
) {
1550 // new osd is already marked in
1551 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1554 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1555 // existing osd marked in or out
1556 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1563 tmp
.deepish_copy_from(osdmap
);
1564 tmp
.apply_incremental(pending_inc
);
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector
<pg_t
> pgs_to_check
;
1575 tmp
.get_upmap_pgs(&pgs_to_check
);
1576 if (pgs_to_check
.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1578 // not enough pgs, do it inline
1579 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1581 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1582 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1590 bufferlist creatings_bl
;
1591 uint64_t features
= CEPH_FEATURES_ALL
;
1592 if (mon
->monmap
->min_mon_release
< ceph_release_t::octopus
) {
1593 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1595 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1597 encode(pending_creatings
, creatings_bl
, features
);
1598 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i
: tmp
.get_pools()) {
1602 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc
.new_pools
.count(i
.first
)) {
1605 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1608 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1609 !pending_creatings
.still_creating_pool(i
.first
)) {
1610 dout(10) << __func__
<< " done creating pool " << i
.first
1611 << ", clearing CREATING flag" << dendl
;
1612 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1613 pending_inc
.new_pools
[i
.first
] = i
.second
;
1615 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set
<int64_t> full_pool_ids
;
1623 set
<int64_t> backfillfull_pool_ids
;
1624 set
<int64_t> nearfull_pool_ids
;
1625 tmp
.get_full_pools(cct
,
1627 &backfillfull_pool_ids
,
1628 &nearfull_pool_ids
);
1629 if (full_pool_ids
.empty() ||
1630 backfillfull_pool_ids
.empty() ||
1631 nearfull_pool_ids
.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
1633 // try cancel any improper nearfull/backfillfull/full pool
1635 for (auto &pool
: tmp
.get_pools()) {
1636 auto p
= pool
.first
;
1637 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1638 nearfull_pool_ids
.empty()) {
1639 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1640 << "'s nearfull flag" << dendl
;
1641 if (pending_inc
.new_pools
.count(p
) == 0) {
1642 // load original pool info first!
1643 pending_inc
.new_pools
[p
] = pool
.second
;
1645 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1647 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1648 backfillfull_pool_ids
.empty()) {
1649 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1650 << "'s backfillfull flag" << dendl
;
1651 if (pending_inc
.new_pools
.count(p
) == 0) {
1652 pending_inc
.new_pools
[p
] = pool
.second
;
1654 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1656 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1657 full_pool_ids
.empty()) {
1658 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1659 // set by EQUOTA, skipping
1662 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1663 << "'s full flag" << dendl
;
1664 if (pending_inc
.new_pools
.count(p
) == 0) {
1665 pending_inc
.new_pools
[p
] = pool
.second
;
1667 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1671 if (!full_pool_ids
.empty()) {
1672 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl
;
1674 for (auto &p
: full_pool_ids
) {
1675 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1678 if (pending_inc
.new_pools
.count(p
) == 0) {
1679 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1681 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1682 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1683 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool
: tmp
.get_pools()) {
1687 auto p
= pool
.first
;
1688 if (full_pool_ids
.count(p
)) {
1689 // skip pools we have just marked as full above
1692 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1693 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1698 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1699 << "'s full flag" << dendl
;
1700 if (pending_inc
.new_pools
.count(p
) == 0) {
1701 pending_inc
.new_pools
[p
] = pool
.second
;
1703 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1706 if (!backfillfull_pool_ids
.empty()) {
1707 for (auto &p
: backfillfull_pool_ids
) {
1708 if (full_pool_ids
.count(p
)) {
1709 // skip pools we have already considered as full above
1712 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1713 // make sure FLAG_FULL is truly set, so we are safe not
1714 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1718 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1719 // don't bother if pool is already marked as backfillfull
1722 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1723 << "'s as backfillfull" << dendl
;
1724 if (pending_inc
.new_pools
.count(p
) == 0) {
1725 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1727 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1728 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool
: tmp
.get_pools()) {
1733 auto p
= pool
.first
;
1734 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1735 // skip pools we have just marked as backfillfull/full above
1738 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1739 // and don't touch if currently is not backfillfull
1742 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1743 << "'s backfillfull flag" << dendl
;
1744 if (pending_inc
.new_pools
.count(p
) == 0) {
1745 pending_inc
.new_pools
[p
] = pool
.second
;
1747 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1750 if (!nearfull_pool_ids
.empty()) {
1751 for (auto &p
: nearfull_pool_ids
) {
1752 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1755 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1756 // make sure FLAG_FULL is truly set, so we are safe not
1757 // to set a extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1761 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1762 // don't bother if pool is already marked as nearfull
1765 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1766 << "'s as nearfull" << dendl
;
1767 if (pending_inc
.new_pools
.count(p
) == 0) {
1768 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1770 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool
: tmp
.get_pools()) {
1775 auto p
= pool
.first
;
1776 if (full_pool_ids
.count(p
) ||
1777 backfillfull_pool_ids
.count(p
) ||
1778 nearfull_pool_ids
.count(p
)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1783 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1784 // and don't touch if currently is not nearfull
1787 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1788 << "'s nearfull flag" << dendl
;
1789 if (pending_inc
.new_pools
.count(p
) == 0) {
1790 pending_inc
.new_pools
[p
] = pool
.second
;
1792 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1796 // min_compat_client?
1797 if (!tmp
.require_min_compat_client
) {
1798 auto mv
= tmp
.get_min_compat_client();
1799 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1800 << "required " << mv
<< dendl
;
1801 mon
->clog
->info() << "setting require_min_compat_client to currently "
1802 << "required " << mv
;
1803 pending_inc
.new_require_min_compat_client
= mv
;
1806 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1807 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1808 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1809 // add creating flags?
1810 for (auto& i
: tmp
.get_pools()) {
1811 if (pending_creatings
.still_creating_pool(i
.first
)) {
1812 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1814 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1815 pending_inc
.new_pools
[i
.first
] = i
.second
;
1817 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i
: tmp
.blacklist
) {
1823 a
.set_type(entity_addr_t::TYPE_ANY
);
1824 pending_inc
.new_blacklist
[a
] = i
.second
;
1825 pending_inc
.old_blacklist
.push_back(i
.first
);
1829 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1830 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1831 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid
, pi
] : tmp
.pools
) {
1835 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1836 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1837 pending_inc
.new_pools
[poolid
] = pi
;
1839 dout(10) << __func__
<< " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl
;
1841 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1843 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1844 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1845 pending_inc
.new_pools
[poolid
] = pi
;
1847 dout(10) << __func__
<< " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl
;
1849 pending_inc
.new_pools
[poolid
].cache_mode
=
1850 pg_pool_t::CACHEMODE_READPROXY
;
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid
, pi
] : tmp
.pools
) {
1856 if (pi
.removed_snaps
.empty()) {
1859 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1860 pending_inc
.new_pools
[poolid
] = pi
;
1862 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1864 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1871 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
1872 it
->lower_bound("purged_snap_");
1873 map
<int64_t,snap_interval_set_t
> combined
;
1874 while (it
->valid()) {
1875 if (it
->key().find("purged_snap_") != 0) {
1878 string k
= it
->key();
1879 long long unsigned pool
;
1880 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1882 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1884 bufferlist v
= it
->value();
1885 auto p
= v
.cbegin();
1886 snapid_t begin
, end
;
1887 ceph::decode(begin
, p
);
1888 ceph::decode(end
, p
);
1889 combined
[pool
].insert(begin
, end
- begin
);
1893 if (!combined
.empty()) {
1894 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1896 ceph::encode(combined
, v
);
1897 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1898 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1902 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1909 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1914 for (auto i
= pending_inc
.new_state
.begin();
1915 i
!= pending_inc
.new_state
.end();
1917 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1918 if (s
& CEPH_OSD_UP
) {
1919 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1920 // Reset laggy parameters if failure interval exceeds a threshold.
1921 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(i
->first
);
1922 if ((xi
.laggy_probability
|| xi
.laggy_interval
) && xi
.down_stamp
.sec()) {
1923 int last_failure_interval
= pending_inc
.modified
.sec() - xi
.down_stamp
.sec();
1924 if (grace_interval_threshold_exceeded(last_failure_interval
)) {
1925 set_default_laggy_params(i
->first
);
1929 if (s
& CEPH_OSD_EXISTS
)
1930 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1932 for (auto i
= pending_inc
.new_up_client
.begin();
1933 i
!= pending_inc
.new_up_client
.end();
1935 //FIXME: insert cluster addresses too
1936 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
1938 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
1939 i
!= pending_inc
.new_weight
.end();
1941 if (i
->second
== CEPH_OSD_OUT
) {
1942 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
1943 } else if (i
->second
== CEPH_OSD_IN
) {
1944 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
1946 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
1950 // features for osdmap and its incremental
1953 // encode full map and determine its crc
1956 tmp
.deepish_copy_from(osdmap
);
1957 tmp
.apply_incremental(pending_inc
);
1959 // determine appropriate features
1960 features
= tmp
.get_encoding_features();
1961 dout(10) << __func__
<< " encoding full map with "
1962 << tmp
.require_osd_release
1963 << " features " << features
<< dendl
;
1965 // the features should be a subset of the mon quorum's features!
1966 ceph_assert((features
& ~mon
->get_quorum_con_features()) == 0);
1969 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
1970 pending_inc
.full_crc
= tmp
.get_crc();
1972 // include full map in the txn. note that old monitors will
1973 // overwrite this. new ones will now skip the local full map
1974 // encode and reload from this.
1975 put_version_full(t
, pending_inc
.epoch
, fullbl
);
1979 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
1981 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
1983 dout(20) << " full_crc " << tmp
.get_crc()
1984 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
1986 /* put everything in the transaction */
1987 put_version(t
, pending_inc
.epoch
, bl
);
1988 put_last_committed(t
, pending_inc
.epoch
);
1991 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
1992 p
!= pending_metadata
.end();
1994 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
1995 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
1996 p
!= pending_metadata_rm
.end();
1998 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
1999 pending_metadata
.clear();
2000 pending_metadata_rm
.clear();
2003 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
2004 !pending_inc
.new_purged_snaps
.empty()) {
2005 // all snaps purged this epoch (across all pools)
2006 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
2008 encode(pending_inc
.new_purged_snaps
, v
);
2009 t
->put(OSD_SNAP_PREFIX
, k
, v
);
2011 for (auto& i
: pending_inc
.new_purged_snaps
) {
2012 for (auto q
= i
.second
.begin();
2013 q
!= i
.second
.end();
2015 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
2020 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
2021 for (auto snap
: snaps
) {
2022 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2029 health_check_map_t next
;
2030 tmp
.check_health(cct
, &next
);
2031 encode_health(next
, t
);
2034 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2037 int r
= mon
->store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2041 auto p
= bl
.cbegin();
2044 catch (buffer::error
& e
) {
2046 *err
<< "osd." << osd
<< " metadata is corrupt";
2052 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2054 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2055 if (osdmap
.is_up(osd
)) {
2056 map
<string
,string
> meta
;
2057 load_metadata(osd
, meta
, nullptr);
2058 auto p
= meta
.find(field
);
2059 if (p
== meta
.end()) {
2060 (*out
)["unknown"]++;
2062 (*out
)[p
->second
]++;
2068 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2070 map
<string
,int> by_val
;
2071 count_metadata(field
, &by_val
);
2072 f
->open_object_section(field
.c_str());
2073 for (auto& p
: by_val
) {
2074 f
->dump_int(p
.first
.c_str(), p
.second
);
2079 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2081 map
<string
, string
> metadata
;
2082 int r
= load_metadata(osd
, metadata
, nullptr);
2086 auto it
= metadata
.find("osd_objectstore");
2087 if (it
== metadata
.end())
2093 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2094 const pg_pool_t
&pool
,
2097 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2098 // since filestore osds could always join the pool later
2099 set
<int> checked_osds
;
2100 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2101 vector
<int> up
, acting
;
2102 pg_t
pgid(ps
, pool_id
);
2103 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2104 for (int osd
: up
) {
2105 if (checked_osds
.find(osd
) != checked_osds
.end())
2107 string objectstore_type
;
2108 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2109 // allow with missing metadata, e.g. due to an osd never booting yet
2110 if (r
< 0 || objectstore_type
== "bluestore") {
2111 checked_osds
.insert(osd
);
2114 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2121 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2123 map
<string
,string
> m
;
2124 if (int r
= load_metadata(osd
, m
, err
))
2126 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2127 f
->dump_string(p
->first
.c_str(), p
->second
);
2131 void OSDMonitor::print_nodes(Formatter
*f
)
2133 // group OSDs by their hosts
2134 map
<string
, list
<int> > osds
; // hostname => osd
2135 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2136 map
<string
, string
> m
;
2137 if (load_metadata(osd
, m
, NULL
)) {
2140 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2141 if (hostname
== m
.end()) {
2142 // not likely though
2145 osds
[hostname
->second
].push_back(osd
);
2148 dump_services(f
, osds
, "osd");
2151 void OSDMonitor::share_map_with_random_osd()
2153 if (osdmap
.get_num_up_osds() == 0) {
2154 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2158 MonSession
*s
= mon
->session_map
.get_random_osd_session(&osdmap
);
2160 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2164 dout(10) << "committed, telling random " << s
->name
2165 << " all about it" << dendl
;
2167 // get feature of the peer
2168 // use quorum_con_features, if it's an anonymous connection.
2169 uint64_t features
= s
->con_features
? s
->con_features
:
2170 mon
->get_quorum_con_features();
2171 // whatev, they'll request more if they need it
2172 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2173 s
->con
->send_message(m
);
2174 // NOTE: do *not* record osd has up to this epoch (as we do
2175 // elsewhere) as they may still need to request older values.
2178 version_t
OSDMonitor::get_trim_to() const
2180 if (mon
->get_quorum().empty()) {
2181 dout(10) << __func__
<< ": quorum not formed" << dendl
;
2186 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2187 if (!creating_pgs
.pgs
.empty()) {
2192 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2194 << " blocking osdmap trim"
2195 " ('mon_debug_block_osdmap_trim' set to 'true')"
2201 epoch_t floor
= get_min_last_epoch_clean();
2202 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2203 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2204 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2205 floor
= g_conf()->mon_osd_force_trim_to
;
2206 dout(10) << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2208 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2209 if (floor
+ min
> get_last_committed()) {
2210 if (min
< get_last_committed())
2211 floor
= get_last_committed() - min
;
2215 if (floor
> get_first_committed())
2221 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2223 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2224 // also scan osd epochs
2225 // don't trim past the oldest reported osd epoch
2226 for (auto& osd_epoch
: osd_epochs
) {
2227 if (osd_epoch
.second
< floor
&&
2228 osdmap
.is_in(osd_epoch
.first
)) {
2229 floor
= osd_epoch
.second
;
2235 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2238 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2240 get_version_full(first
, bl
);
2241 put_version_full(tx
, first
, bl
);
2243 if (has_osdmap_manifest
&&
2244 first
> osdmap_manifest
.get_first_pinned()) {
2245 _prune_update_trimmed(tx
, first
);
2250 /* full osdmap prune
2252 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2255 void OSDMonitor::load_osdmap_manifest()
2257 bool store_has_manifest
=
2258 mon
->store
->exists(get_service_name(), "osdmap_manifest");
2260 if (!store_has_manifest
) {
2261 if (!has_osdmap_manifest
) {
2265 dout(20) << __func__
2266 << " dropping osdmap manifest from memory." << dendl
;
2267 osdmap_manifest
= osdmap_manifest_t();
2268 has_osdmap_manifest
= false;
2272 dout(20) << __func__
2273 << " osdmap manifest detected in store; reload." << dendl
;
2275 bufferlist manifest_bl
;
2276 int r
= get_value("osdmap_manifest", manifest_bl
);
2278 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2279 ceph_abort_msg("error reading manifest");
2281 osdmap_manifest
.decode(manifest_bl
);
2282 has_osdmap_manifest
= true;
2284 dout(10) << __func__
<< " store osdmap manifest pinned ("
2285 << osdmap_manifest
.get_first_pinned()
2287 << osdmap_manifest
.get_last_pinned()
2292 bool OSDMonitor::should_prune() const
2294 version_t first
= get_first_committed();
2295 version_t last
= get_last_committed();
2296 version_t min_osdmap_epochs
=
2297 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2298 version_t prune_min
=
2299 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2300 version_t prune_interval
=
2301 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2302 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2303 version_t last_to_pin
= last
- min_osdmap_epochs
;
2305 // Make it or break it constraints.
2307 // If any of these conditions fails, we will not prune, regardless of
2308 // whether we have an on-disk manifest with an on-going pruning state.
2310 if ((last
- first
) <= min_osdmap_epochs
) {
2311 // between the first and last committed epochs, we don't have
2312 // enough epochs to trim, much less to prune.
2313 dout(10) << __func__
2314 << " currently holding only " << (last
- first
)
2315 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2316 << "); do not prune."
2320 } else if ((last_to_pin
- first
) < prune_min
) {
2321 // between the first committed epoch and the last epoch we would prune,
2322 // we simply don't have enough versions over the minimum to prune maps.
2323 dout(10) << __func__
2324 << " could only prune " << (last_to_pin
- first
)
2325 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2326 " is less than the required minimum (" << prune_min
<< ")"
2330 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2331 dout(10) << __func__
2332 << " we have pruned as far as we can; do not prune."
2336 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2337 dout(10) << __func__
2338 << " not enough epochs to form an interval (last pinned: "
2339 << last_pinned
<< ", last to pin: "
2340 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2345 dout(15) << __func__
2346 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2347 << " lc (" << first
<< ".." << last
<< ")"
2352 void OSDMonitor::_prune_update_trimmed(
2353 MonitorDBStore::TransactionRef tx
,
2356 dout(10) << __func__
2357 << " first " << first
2358 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2359 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2362 osdmap_manifest_t manifest
= osdmap_manifest
;
2364 if (!manifest
.is_pinned(first
)) {
2365 manifest
.pin(first
);
2368 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2369 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2370 manifest
.pinned
.erase(p
, p_end
);
2371 ceph_assert(manifest
.get_first_pinned() == first
);
2373 if (manifest
.get_last_pinned() == first
+1 ||
2374 manifest
.pinned
.size() == 1) {
2375 // we reached the end of the line, as pinned maps go; clean up our
2376 // manifest, and let `should_prune()` decide whether we should prune
2378 tx
->erase(get_service_name(), "osdmap_manifest");
2383 manifest
.encode(bl
);
2384 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2387 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2389 dout(1) << __func__
<< dendl
;
2391 version_t pin_first
;
2393 // verify constrainsts on stable in-memory state
2394 if (!has_osdmap_manifest
) {
2395 // we must have never pruned, OR if we pruned the state must no longer
2396 // be relevant (i.e., the state must have been removed alongside with
2397 // the trim that *must* have removed past the last pinned map in a
2399 ceph_assert(osdmap_manifest
.pinned
.empty());
2400 ceph_assert(!mon
->store
->exists(get_service_name(), "osdmap_manifest"));
2401 pin_first
= get_first_committed();
2404 // we must have pruned in the past AND its state is still relevant
2405 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2406 // and thus we still hold a manifest in the store).
2407 ceph_assert(!osdmap_manifest
.pinned
.empty());
2408 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2409 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2411 dout(10) << __func__
2412 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2413 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2416 pin_first
= osdmap_manifest
.get_last_pinned();
2419 manifest
.pin(pin_first
);
2422 bool OSDMonitor::_prune_sanitize_options() const
2424 uint64_t prune_interval
=
2425 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2426 uint64_t prune_min
=
2427 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2429 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2433 if (prune_interval
== 0) {
2435 << " prune is enabled BUT prune interval is zero; abort."
2438 } else if (prune_interval
== 1) {
2440 << " prune interval is equal to one, which essentially means"
2441 " no pruning; abort."
2445 if (prune_min
== 0) {
2447 << " prune is enabled BUT prune min is zero; abort."
2451 if (prune_interval
> prune_min
) {
2453 << " impossible to ascertain proper prune interval because"
2454 << " it is greater than the minimum prune epochs"
2455 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2460 if (txsize
< prune_interval
- 1) {
2462 << "'mon_osdmap_full_prune_txsize' (" << txsize
2463 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2464 << "); abort." << dendl
;
2470 bool OSDMonitor::is_prune_enabled() const {
2471 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2474 bool OSDMonitor::is_prune_supported() const {
2475 return mon
->get_required_mon_features().contains_any(
2476 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2481 * @returns true if has side-effects; false otherwise.
2483 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2485 bool enabled
= is_prune_enabled();
2487 dout(1) << __func__
<< " osdmap full prune "
2488 << ( enabled
? "enabled" : "disabled")
2491 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2495 // we are beyond the minimum prune versions, we need to remove maps because
2496 // otherwise the store will grow unbounded and we may end up having issues
2497 // with available disk space or store hangs.
2499 // we will not pin all versions. We will leave a buffer number of versions.
2500 // this allows us the monitor to trim maps without caring too much about
2501 // pinned maps, and then allow us to use another ceph-mon without these
2502 // capabilities, without having to repair the store.
2504 osdmap_manifest_t manifest
= osdmap_manifest
;
2506 version_t first
= get_first_committed();
2507 version_t last
= get_last_committed();
2509 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2510 version_t last_pinned
= manifest
.get_last_pinned();
2511 uint64_t prune_interval
=
2512 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2514 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2516 prune_init(manifest
);
2518 // we need to get rid of some osdmaps
2521 << " lc (" << first
<< " .. " << last
<< ")"
2522 << " last_pinned " << last_pinned
2523 << " interval " << prune_interval
2524 << " last_to_pin " << last_to_pin
2527 // We will be erasing maps as we go.
2529 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2531 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2532 // we stop pruning. We could prune the maps between `next_to_pin` and
2533 // `last_to_pin`, but by not doing it we end up with neater pruned
2534 // intervals, aligned with `prune_interval`. Besides, this should not be a
2535 // problem as long as `prune_interval` is set to a sane value, instead of
2536 // hundreds or thousands of maps.
2538 auto map_exists
= [this](version_t v
) {
2539 string k
= mon
->store
->combine_strings("full", v
);
2540 return mon
->store
->exists(get_service_name(), k
);
2543 // 'interval' represents the number of maps from the last pinned
2544 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2545 // version 11 next; all intermediate versions will be removed.
2547 // 'txsize' represents the maximum number of versions we'll be removing in
2548 // this iteration. If 'txsize' is large enough to perform multiple passes
2549 // pinning and removing maps, we will do so; if not, we'll do at least one
2550 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2551 // ensure that we never go *over* the maximum.
2553 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2554 uint64_t removal_interval
= prune_interval
- 1;
2556 if (txsize
< removal_interval
) {
2558 << " setting txsize to removal interval size ("
2559 << removal_interval
<< " versions"
2561 txsize
= removal_interval
;
2563 ceph_assert(removal_interval
> 0);
2565 uint64_t num_pruned
= 0;
2566 while (num_pruned
+ removal_interval
<= txsize
) {
2567 last_pinned
= manifest
.get_last_pinned();
2569 if (last_pinned
+ prune_interval
> last_to_pin
) {
2572 ceph_assert(last_pinned
< last_to_pin
);
2574 version_t next_pinned
= last_pinned
+ prune_interval
;
2575 ceph_assert(next_pinned
<= last_to_pin
);
2576 manifest
.pin(next_pinned
);
2578 dout(20) << __func__
2579 << " last_pinned " << last_pinned
2580 << " next_pinned " << next_pinned
2581 << " num_pruned " << num_pruned
2582 << " removal interval (" << (last_pinned
+1)
2583 << ".." << (next_pinned
-1) << ")"
2584 << " txsize " << txsize
<< dendl
;
2586 ceph_assert(map_exists(last_pinned
));
2587 ceph_assert(map_exists(next_pinned
));
2589 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2590 ceph_assert(!manifest
.is_pinned(v
));
2592 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2593 string full_key
= mon
->store
->combine_strings("full", v
);
2594 tx
->erase(get_service_name(), full_key
);
2599 ceph_assert(num_pruned
> 0);
2602 manifest
.encode(bl
);
2603 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2611 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2613 op
->mark_osdmon_event(__func__
);
2614 Message
*m
= op
->get_req();
2615 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2617 switch (m
->get_type()) {
2619 case MSG_MON_COMMAND
:
2621 return preprocess_command(op
);
2622 } catch (const bad_cmd_get
& e
) {
2624 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2627 case CEPH_MSG_MON_GET_OSDMAP
:
2628 return preprocess_get_osdmap(op
);
2631 case MSG_OSD_MARK_ME_DOWN
:
2632 return preprocess_mark_me_down(op
);
2633 case MSG_OSD_MARK_ME_DEAD
:
2634 return preprocess_mark_me_dead(op
);
2636 return preprocess_full(op
);
2637 case MSG_OSD_FAILURE
:
2638 return preprocess_failure(op
);
2640 return preprocess_boot(op
);
2642 return preprocess_alive(op
);
2643 case MSG_OSD_PG_CREATED
:
2644 return preprocess_pg_created(op
);
2645 case MSG_OSD_PG_READY_TO_MERGE
:
2646 return preprocess_pg_ready_to_merge(op
);
2647 case MSG_OSD_PGTEMP
:
2648 return preprocess_pgtemp(op
);
2649 case MSG_OSD_BEACON
:
2650 return preprocess_beacon(op
);
2652 case CEPH_MSG_POOLOP
:
2653 return preprocess_pool_op(op
);
2655 case MSG_REMOVE_SNAPS
:
2656 return preprocess_remove_snaps(op
);
2658 case MSG_MON_GET_PURGED_SNAPS
:
2659 return preprocess_get_purged_snaps(op
);
2667 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2669 op
->mark_osdmon_event(__func__
);
2670 Message
*m
= op
->get_req();
2671 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2673 switch (m
->get_type()) {
2675 case MSG_OSD_MARK_ME_DOWN
:
2676 return prepare_mark_me_down(op
);
2677 case MSG_OSD_MARK_ME_DEAD
:
2678 return prepare_mark_me_dead(op
);
2680 return prepare_full(op
);
2681 case MSG_OSD_FAILURE
:
2682 return prepare_failure(op
);
2684 return prepare_boot(op
);
2686 return prepare_alive(op
);
2687 case MSG_OSD_PG_CREATED
:
2688 return prepare_pg_created(op
);
2689 case MSG_OSD_PGTEMP
:
2690 return prepare_pgtemp(op
);
2691 case MSG_OSD_PG_READY_TO_MERGE
:
2692 return prepare_pg_ready_to_merge(op
);
2693 case MSG_OSD_BEACON
:
2694 return prepare_beacon(op
);
2696 case MSG_MON_COMMAND
:
2698 return prepare_command(op
);
2699 } catch (const bad_cmd_get
& e
) {
2701 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2705 case CEPH_MSG_POOLOP
:
2706 return prepare_pool_op(op
);
2708 case MSG_REMOVE_SNAPS
:
2709 return prepare_remove_snaps(op
);
2719 bool OSDMonitor::should_propose(double& delay
)
2721 dout(10) << "should_propose" << dendl
;
2723 // if full map, propose immediately! any subsequent changes will be clobbered.
2724 if (pending_inc
.fullmap
.length())
2727 // adjust osd weights?
2728 if (!osd_weight
.empty() &&
2729 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2730 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2731 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2737 return PaxosService::should_propose(delay
);
2742 // ---------------------------
2745 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2747 op
->mark_osdmon_event(__func__
);
2748 auto m
= op
->get_req
<MMonGetOSDMap
>();
2750 uint64_t features
= mon
->get_quorum_con_features();
2751 if (op
->get_session() && op
->get_session()->con_features
)
2752 features
= op
->get_session()->con_features
;
2754 dout(10) << __func__
<< " " << *m
<< dendl
;
2755 MOSDMap
*reply
= new MOSDMap(mon
->monmap
->fsid
, features
);
2756 epoch_t first
= get_first_committed();
2757 epoch_t last
= osdmap
.get_epoch();
2758 int max
= g_conf()->osd_map_message_max
;
2759 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2760 for (epoch_t e
= std::max(first
, m
->get_full_first());
2761 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2763 bufferlist
& bl
= reply
->maps
[e
];
2764 int r
= get_version_full(e
, features
, bl
);
2765 ceph_assert(r
>= 0);
2766 max_bytes
-= bl
.length();
2768 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2769 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2771 bufferlist
& bl
= reply
->incremental_maps
[e
];
2772 int r
= get_version(e
, features
, bl
);
2773 ceph_assert(r
>= 0);
2774 max_bytes
-= bl
.length();
2776 reply
->oldest_map
= first
;
2777 reply
->newest_map
= last
;
2778 mon
->send_reply(op
, reply
);
2783 // ---------------------------
2788 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2789 // check permissions
2790 MonSession
*session
= op
->get_session();
2793 if (!session
->is_capable("osd", MON_CAP_X
)) {
2794 dout(0) << "got MOSDFailure from entity with insufficient caps "
2795 << session
->caps
<< dendl
;
2798 if (fsid
!= mon
->monmap
->fsid
) {
2799 dout(0) << "check_source: on fsid " << fsid
2800 << " != " << mon
->monmap
->fsid
<< dendl
;
2807 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2809 op
->mark_osdmon_event(__func__
);
2810 auto m
= op
->get_req
<MOSDFailure
>();
2811 // who is target_osd
2812 int badboy
= m
->get_target_osd();
2814 // check permissions
2815 if (check_source(op
, m
->fsid
))
2818 // first, verify the reporting host is valid
2819 if (m
->get_orig_source().is_osd()) {
2820 int from
= m
->get_orig_source().num();
2821 if (!osdmap
.exists(from
) ||
2822 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2823 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2824 dout(5) << "preprocess_failure from dead osd." << from
2825 << ", ignoring" << dendl
;
2826 send_incremental(op
, m
->get_epoch()+1);
2833 if (osdmap
.is_down(badboy
)) {
2834 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2835 << " " << m
->get_target_addrs()
2836 << ", from " << m
->get_orig_source() << dendl
;
2837 if (m
->get_epoch() < osdmap
.get_epoch())
2838 send_incremental(op
, m
->get_epoch()+1);
2841 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2842 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2843 << " " << m
->get_target_addrs()
2844 << " != map's " << osdmap
.get_addrs(badboy
)
2845 << ", from " << m
->get_orig_source() << dendl
;
2846 if (m
->get_epoch() < osdmap
.get_epoch())
2847 send_incremental(op
, m
->get_epoch()+1);
2851 // already reported?
2852 if (osdmap
.is_down(badboy
) ||
2853 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2854 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2855 << " " << m
->get_target_addrs()
2856 << ", from " << m
->get_orig_source() << dendl
;
2857 if (m
->get_epoch() < osdmap
.get_epoch())
2858 send_incremental(op
, m
->get_epoch()+1);
2862 if (!can_mark_down(badboy
)) {
2863 dout(5) << "preprocess_failure ignoring report of osd."
2864 << m
->get_target_osd() << " " << m
->get_target_addrs()
2865 << " from " << m
->get_orig_source() << dendl
;
2869 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2870 << " " << m
->get_target_addrs()
2871 << ", from " << m
->get_orig_source() << dendl
;
2879 class C_AckMarkedDown
: public C_MonOp
{
2885 : C_MonOp(op
), osdmon(osdmon
) {}
2887 void _finish(int r
) override
{
2889 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2890 osdmon
->mon
->send_reply(
2897 false)); // ACK itself does not request an ack
2898 } else if (r
== -EAGAIN
) {
2899 osdmon
->dispatch(op
);
2901 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
2904 ~C_AckMarkedDown() override
{
2908 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
2910 op
->mark_osdmon_event(__func__
);
2911 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2912 int from
= m
->target_osd
;
2914 // check permissions
2915 if (check_source(op
, m
->fsid
))
2918 // first, verify the reporting host is valid
2919 if (!m
->get_orig_source().is_osd())
2922 if (!osdmap
.exists(from
) ||
2923 osdmap
.is_down(from
) ||
2924 osdmap
.get_addrs(from
) != m
->target_addrs
) {
2925 dout(5) << "preprocess_mark_me_down from dead osd."
2926 << from
<< ", ignoring" << dendl
;
2927 send_incremental(op
, m
->get_epoch()+1);
2931 // no down might be set
2932 if (!can_mark_down(from
))
2935 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
2936 << " " << m
->target_addrs
<< dendl
;
2940 if (m
->request_ack
) {
2941 Context
*c(new C_AckMarkedDown(this, op
));
2947 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
2949 op
->mark_osdmon_event(__func__
);
2950 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2951 int target_osd
= m
->target_osd
;
2953 ceph_assert(osdmap
.is_up(target_osd
));
2954 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
2956 mon
->clog
->info() << "osd." << target_osd
<< " marked itself down";
2957 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
2959 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
2963 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
2965 op
->mark_osdmon_event(__func__
);
2966 auto m
= op
->get_req
<MOSDMarkMeDead
>();
2967 int from
= m
->target_osd
;
2969 // check permissions
2970 if (check_source(op
, m
->fsid
)) {
2975 // first, verify the reporting host is valid
2976 if (!m
->get_orig_source().is_osd()) {
2981 if (!osdmap
.exists(from
) ||
2982 !osdmap
.is_down(from
)) {
2983 dout(5) << __func__
<< " from nonexistent or up osd." << from
2984 << ", ignoring" << dendl
;
2985 send_incremental(op
, m
->get_epoch()+1);
2993 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
2995 op
->mark_osdmon_event(__func__
);
2996 auto m
= op
->get_req
<MOSDMarkMeDead
>();
2997 int target_osd
= m
->target_osd
;
2999 ceph_assert(osdmap
.is_down(target_osd
));
3001 mon
->clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
3003 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3004 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3006 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3007 wait_for_finished_proposal(
3010 [op
, this] (int r
) {
3012 mon
->no_reply(op
); // ignore on success
3019 bool OSDMonitor::can_mark_down(int i
)
3021 if (osdmap
.is_nodown(i
)) {
3022 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3023 << "will not mark it down" << dendl
;
3027 int num_osds
= osdmap
.get_num_osds();
3028 if (num_osds
== 0) {
3029 dout(5) << __func__
<< " no osds" << dendl
;
3032 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3033 float up_ratio
= (float)up
/ (float)num_osds
;
3034 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3035 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3036 << g_conf()->mon_osd_min_up_ratio
3037 << ", will not mark osd." << i
<< " down" << dendl
;
3043 bool OSDMonitor::can_mark_up(int i
)
3045 if (osdmap
.is_noup(i
)) {
3046 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3047 << "will not mark it up" << dendl
;
3055 * @note the parameter @p i apparently only exists here so we can output the
3056 * osd's id on messages.
3058 bool OSDMonitor::can_mark_out(int i
)
3060 if (osdmap
.is_noout(i
)) {
3061 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3062 << "will not mark it out" << dendl
;
3066 int num_osds
= osdmap
.get_num_osds();
3067 if (num_osds
== 0) {
3068 dout(5) << __func__
<< " no osds" << dendl
;
3071 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3072 float in_ratio
= (float)in
/ (float)num_osds
;
3073 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3075 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3076 << g_conf()->mon_osd_min_in_ratio
3077 << ", will not mark osd." << i
<< " out" << dendl
;
3079 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3080 << g_conf()->mon_osd_min_in_ratio
3081 << ", will not mark osds out" << dendl
;
3088 bool OSDMonitor::can_mark_in(int i
)
3090 if (osdmap
.is_noin(i
)) {
3091 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3092 << "will not mark it in" << dendl
;
3099 bool OSDMonitor::check_failures(utime_t now
)
3101 bool found_failure
= false;
3102 auto p
= failure_info
.begin();
3103 while (p
!= failure_info
.end()) {
3104 auto& [target_osd
, fi
] = *p
;
3105 if (can_mark_down(target_osd
) &&
3106 check_failure(now
, target_osd
, fi
)) {
3107 found_failure
= true;
3109 } else if (is_failure_stale(now
, fi
)) {
3110 dout(10) << " dropping stale failure_info for osd." << target_osd
3111 << " from " << fi
.reporters
.size() << " reporters"
3113 p
= failure_info
.erase(p
);
3118 return found_failure
;
3121 utime_t
OSDMonitor::get_grace_time(utime_t now
,
3123 failure_info_t
& fi
) const
3125 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3126 if (!g_conf()->mon_osd_adjust_heartbeat_grace
) {
3129 utime_t grace
= orig_grace
;
3130 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3131 double decay_k
= ::log(.5) / halflife
;
3133 // scale grace period based on historical probability of 'lagginess'
3134 // (false positive failures due to slowness).
3135 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3136 const utime_t failed_for
= now
- fi
.get_failed_since();
3137 double decay
= exp((double)failed_for
* decay_k
);
3138 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3139 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3140 double my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3143 // consider the peers reporting a failure a proxy for a potential
3144 // 'subcluster' over the overall cluster that is similarly
3145 // laggy. this is clearly not true in all cases, but will sometimes
3146 // help us localize the grace correction to a subset of the system
3147 // (say, a rack with a bad switch) that is unhappy.
3148 double peer_grace
= 0;
3149 for (auto& [reporter
, report
] : fi
.reporters
) {
3150 if (osdmap
.exists(reporter
)) {
3151 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(reporter
);
3152 utime_t elapsed
= now
- xi
.down_stamp
;
3153 double decay
= exp((double)elapsed
* decay_k
);
3154 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3157 peer_grace
/= (double)fi
.reporters
.size();
3158 grace
+= peer_grace
;
3159 dout(10) << " osd." << target_osd
<< " has "
3160 << fi
.reporters
.size() << " reporters, "
3161 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3162 << " + " << peer_grace
<< "), max_failed_since " << fi
.get_failed_since()
3168 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3170 // already pending failure?
3171 if (pending_inc
.new_state
.count(target_osd
) &&
3172 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3173 dout(10) << " already pending failure" << dendl
;
3177 set
<string
> reporters_by_subtree
;
3178 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3179 ceph_assert(fi
.reporters
.size());
3180 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3181 // get the parent bucket whose type matches with "reporter_subtree_level".
3182 // fall back to OSD if the level doesn't exist.
3183 if (osdmap
.exists(p
->first
)) {
3184 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3185 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3186 iter
== reporter_loc
.end()) {
3187 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3189 reporters_by_subtree
.insert(iter
->second
);
3193 fi
.cancel_report(p
->first
);;
3194 p
= fi
.reporters
.erase(p
);
3197 if (reporters_by_subtree
.size() < g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3200 const utime_t failed_for
= now
- fi
.get_failed_since();
3201 const utime_t grace
= get_grace_time(now
, target_osd
, fi
);
3202 if (failed_for
>= grace
) {
3203 dout(1) << " we have enough reporters to mark osd." << target_osd
3204 << " down" << dendl
;
3205 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3207 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
3208 << osdmap
.crush
->get_full_location_ordered_string(
3211 << (int)reporters_by_subtree
.size()
3212 << " reporters from different "
3213 << reporter_subtree_level
<< " after "
3214 << failed_for
<< " >= grace " << grace
<< ")";
3220 bool OSDMonitor::is_failure_stale(utime_t now
, failure_info_t
& fi
) const
3222 // if it takes too long to either cancel the report to mark the osd down,
3223 // some reporters must have failed to cancel their reports. let's just
3224 // forget these reports.
3225 const utime_t failed_for
= now
- fi
.get_failed_since();
3226 auto heartbeat_grace
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_grace");
3227 auto heartbeat_stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3228 return failed_for
>= (heartbeat_grace
+ heartbeat_stale
);
3231 void OSDMonitor::force_failure(int target_osd
, int by
)
3233 // already pending failure?
3234 if (pending_inc
.new_state
.count(target_osd
) &&
3235 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3236 dout(10) << " already pending failure" << dendl
;
3240 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
3241 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3242 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3243 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3245 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3247 mon
->clog
->info() << "osd." << target_osd
<< " failed ("
3248 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3249 << ") (connection refused reported by osd." << by
<< ")";
3253 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3255 op
->mark_osdmon_event(__func__
);
3256 auto m
= op
->get_req
<MOSDFailure
>();
3257 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3258 << " " << m
->get_target_addrs()
3259 << " from " << m
->get_orig_source()
3260 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3262 int target_osd
= m
->get_target_osd();
3263 int reporter
= m
->get_orig_source().num();
3264 ceph_assert(osdmap
.is_up(target_osd
));
3265 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3269 if (m
->if_osd_failed()) {
3270 // calculate failure time
3271 utime_t now
= ceph_clock_now();
3272 utime_t failed_since
=
3273 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3276 if (m
->is_immediate()) {
3277 mon
->clog
->debug() << "osd." << m
->get_target_osd()
3278 << " reported immediately failed by "
3279 << m
->get_orig_source();
3280 force_failure(target_osd
, reporter
);
3283 mon
->clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3284 << m
->get_orig_source();
3286 failure_info_t
& fi
= failure_info
[target_osd
];
3287 fi
.add_report(reporter
, failed_since
, op
);
3288 return check_failure(now
, target_osd
, fi
);
3290 // remove the report
3291 mon
->clog
->debug() << "osd." << m
->get_target_osd()
3292 << " failure report canceled by "
3293 << m
->get_orig_source();
3294 if (failure_info
.count(target_osd
)) {
3295 failure_info_t
& fi
= failure_info
[target_osd
];
3296 fi
.cancel_report(reporter
);
3297 if (fi
.reporters
.empty()) {
3298 dout(10) << " removing last failure_info for osd." << target_osd
3300 failure_info
.erase(target_osd
);
3302 dout(10) << " failure_info for osd." << target_osd
<< " now "
3303 << fi
.reporters
.size() << " reporters" << dendl
;
3306 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
3313 void OSDMonitor::process_failures()
3315 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3316 while (p
!= failure_info
.end()) {
3317 if (osdmap
.is_up(p
->first
)) {
3320 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3321 list
<MonOpRequestRef
> ls
;
3322 p
->second
.take_report_messages(ls
);
3323 failure_info
.erase(p
++);
3325 while (!ls
.empty()) {
3326 MonOpRequestRef o
= ls
.front();
3328 o
->mark_event(__func__
);
3329 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3330 send_latest(o
, m
->get_epoch());
3339 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3341 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3343 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3344 p
!= failure_info
.end();
3346 p
->second
.take_report_messages(ls
);
3348 failure_info
.clear();
3351 int OSDMonitor::get_grace_interval_threshold()
3353 int halflife
= g_conf()->mon_osd_laggy_halflife
;
3354 // Scale the halflife period (default: 1_hr) by
3355 // a factor (48) to calculate the threshold.
3356 int grace_threshold_factor
= 48;
3357 return halflife
* grace_threshold_factor
;
3360 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval
)
3362 int grace_interval_threshold_secs
= get_grace_interval_threshold();
3363 if (last_failed_interval
> grace_interval_threshold_secs
) {
3364 dout(1) << " last_failed_interval " << last_failed_interval
3365 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3372 void OSDMonitor::set_default_laggy_params(int target_osd
)
3374 if (pending_inc
.new_xinfo
.count(target_osd
) == 0) {
3375 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3377 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[target_osd
];
3378 xi
.down_stamp
= pending_inc
.modified
;
3379 xi
.laggy_probability
= 0.0;
3380 xi
.laggy_interval
= 0;
3381 dout(20) << __func__
<< " reset laggy, now xi " << xi
<< dendl
;
3387 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3389 op
->mark_osdmon_event(__func__
);
3390 auto m
= op
->get_req
<MOSDBoot
>();
3391 int from
= m
->get_orig_source_inst().name
.num();
3393 // check permissions, ignore if failed (no response expected)
3394 MonSession
*session
= op
->get_session();
3397 if (!session
->is_capable("osd", MON_CAP_X
)) {
3398 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3399 << session
->caps
<< dendl
;
3403 if (m
->sb
.cluster_fsid
!= mon
->monmap
->fsid
) {
3404 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3405 << " != " << mon
->monmap
->fsid
<< dendl
;
3409 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3410 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3414 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3416 // force all osds to have gone through luminous prior to upgrade to nautilus
3418 vector
<string
> missing
;
3419 if (!HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
3420 missing
.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
3422 if (!HAVE_FEATURE(m
->osd_features
, SERVER_JEWEL
)) {
3423 missing
.push_back("CEPH_FEATURE_SERVER_JEWEL");
3425 if (!HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
3426 missing
.push_back("CEPH_FEATURE_SERVER_KRAKEN");
3428 if (!HAVE_FEATURE(m
->osd_features
, OSD_RECOVERY_DELETES
)) {
3429 missing
.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
3432 if (!missing
.empty()) {
3433 using std::experimental::make_ostream_joiner
;
3436 copy(begin(missing
), end(missing
), make_ostream_joiner(ss
, ";"));
3438 mon
->clog
->info() << "disallowing boot of OSD "
3439 << m
->get_orig_source_inst()
3440 << " because the osd lacks " << ss
.str();
3445 // make sure osd versions do not span more than 3 releases
3446 if (HAVE_FEATURE(m
->osd_features
, SERVER_OCTOPUS
) &&
3447 osdmap
.require_osd_release
< ceph_release_t::mimic
) {
3448 mon
->clog
->info() << "disallowing boot of octopus+ OSD "
3449 << m
->get_orig_source_inst()
3450 << " because require_osd_release < mimic";
3454 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
3455 // we are reusing a jewel feature bit that was retired in luminous.
3456 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
3457 osdmap
.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT
) &&
3458 !(m
->osd_features
& CEPH_FEATURE_OSD_PGLOG_HARDLIMIT
)) {
3459 mon
->clog
->info() << "disallowing boot of OSD "
3460 << m
->get_orig_source_inst()
3461 << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
3466 if (osdmap
.is_up(from
) &&
3467 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3468 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3470 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3471 << " " << m
->get_orig_source_addrs()
3472 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3477 if (osdmap
.exists(from
) &&
3478 !osdmap
.get_uuid(from
).is_zero() &&
3479 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3480 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3481 << " clashes with existing osd: different fsid"
3482 << " (ours: " << osdmap
.get_uuid(from
)
3483 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3487 if (osdmap
.exists(from
) &&
3488 osdmap
.get_info(from
).up_from
> m
->version
&&
3489 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3490 m
->get_orig_source_addrs())) {
3491 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3492 send_latest(op
, m
->sb
.current_epoch
+1);
3497 if (!can_mark_up(from
)) {
3498 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3499 send_latest(op
, m
->sb
.current_epoch
+1);
3503 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
3510 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3512 op
->mark_osdmon_event(__func__
);
3513 auto m
= op
->get_req
<MOSDBoot
>();
3514 dout(7) << __func__
<< " from " << m
->get_source()
3516 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3517 << " cluster_addrs " << m
->cluster_addrs
3518 << " hb_back_addrs " << m
->hb_back_addrs
3519 << " hb_front_addrs " << m
->hb_front_addrs
3522 ceph_assert(m
->get_orig_source().is_osd());
3523 int from
= m
->get_orig_source().num();
3525 // does this osd exist?
3526 if (from
>= osdmap
.get_max_osd()) {
3527 dout(1) << "boot from osd." << from
<< " >= max_osd "
3528 << osdmap
.get_max_osd() << dendl
;
3532 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3533 if (pending_inc
.new_state
.count(from
))
3534 oldstate
^= pending_inc
.new_state
[from
];
3536 // already up? mark down first?
3537 if (osdmap
.is_up(from
)) {
3538 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3539 << osdmap
.get_addrs(from
) << dendl
;
3540 // preprocess should have caught these; if not, assert.
3541 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3542 m
->get_orig_source_addrs()) ||
3543 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3544 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3546 if (pending_inc
.new_state
.count(from
) == 0 ||
3547 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3548 // mark previous guy down
3549 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3551 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3552 } else if (pending_inc
.new_up_client
.count(from
)) {
3553 // already prepared, just wait
3554 dout(7) << __func__
<< " already prepared, waiting on "
3555 << m
->get_orig_source_addr() << dendl
;
3556 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3559 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3560 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3561 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3562 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3564 down_pending_out
.erase(from
); // if any
3567 osd_weight
[from
] = m
->sb
.weight
;
3570 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3572 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3573 // preprocess should have caught this; if not, assert.
3574 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3575 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3579 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3580 const osd_info_t
& i
= osdmap
.get_info(from
);
3581 if (i
.up_from
> i
.lost_at
) {
3582 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3583 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3588 bufferlist osd_metadata
;
3589 encode(m
->metadata
, osd_metadata
);
3590 pending_metadata
[from
] = osd_metadata
;
3591 pending_metadata_rm
.erase(from
);
3593 // adjust last clean unmount epoch?
3594 const osd_info_t
& info
= osdmap
.get_info(from
);
3595 dout(10) << " old osd_info: " << info
<< dendl
;
3596 if (m
->sb
.mounted
> info
.last_clean_begin
||
3597 (m
->sb
.mounted
== info
.last_clean_begin
&&
3598 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3599 epoch_t begin
= m
->sb
.mounted
;
3600 epoch_t end
= m
->sb
.clean_thru
;
3602 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3603 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3604 << ") -> [" << begin
<< "-" << end
<< ")"
3606 pending_inc
.new_last_clean_interval
[from
] =
3607 pair
<epoch_t
,epoch_t
>(begin
, end
);
3610 if (pending_inc
.new_xinfo
.count(from
) == 0)
3611 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3612 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
3613 if (m
->boot_epoch
== 0) {
3614 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3615 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3616 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3618 if (xi
.down_stamp
.sec()) {
3619 int interval
= ceph_clock_now().sec() -
3620 xi
.down_stamp
.sec();
3621 if (g_conf()->mon_osd_laggy_max_interval
&&
3622 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3623 interval
= g_conf()->mon_osd_laggy_max_interval
;
3626 interval
* g_conf()->mon_osd_laggy_weight
+
3627 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3629 xi
.laggy_probability
=
3630 g_conf()->mon_osd_laggy_weight
+
3631 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3632 dout(10) << " laggy, now xi " << xi
<< dendl
;
3635 // set features shared by the osd
3636 if (m
->osd_features
)
3637 xi
.features
= m
->osd_features
;
3639 xi
.features
= m
->get_connection()->get_features();
3642 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3643 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3644 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3645 (g_conf()->mon_osd_auto_mark_in
)) {
3646 if (can_mark_in(from
)) {
3647 if (xi
.old_weight
> 0) {
3648 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3651 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3654 dout(7) << __func__
<< " NOIN set, will not mark in "
3655 << m
->get_orig_source_addr() << dendl
;
3660 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3665 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3667 op
->mark_osdmon_event(__func__
);
3668 auto m
= op
->get_req
<MOSDBoot
>();
3669 dout(7) << "_booted " << m
->get_orig_source_inst()
3670 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3673 mon
->clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3677 send_latest(op
, m
->sb
.current_epoch
+1);
3684 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3686 op
->mark_osdmon_event(__func__
);
3687 auto m
= op
->get_req
<MOSDFull
>();
3688 int from
= m
->get_orig_source().num();
3690 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3692 // check permissions, ignore if failed
3693 MonSession
*session
= op
->get_session();
3696 if (!session
->is_capable("osd", MON_CAP_X
)) {
3697 dout(0) << "MOSDFull from entity with insufficient privileges:"
3698 << session
->caps
<< dendl
;
3702 // ignore a full message from the osd instance that already went down
3703 if (!osdmap
.exists(from
)) {
3704 dout(7) << __func__
<< " ignoring full message from nonexistent "
3705 << m
->get_orig_source_inst() << dendl
;
3708 if ((!osdmap
.is_up(from
) &&
3709 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3710 m
->get_orig_source_addrs())) ||
3711 (osdmap
.is_up(from
) &&
3712 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3713 dout(7) << __func__
<< " ignoring full message from down "
3714 << m
->get_orig_source_inst() << dendl
;
3718 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3720 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3721 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3722 << " " << m
->get_orig_source_inst() << dendl
;
3723 _reply_map(op
, m
->version
);
3727 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3728 << " " << m
->get_orig_source_inst() << dendl
;
3735 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3737 op
->mark_osdmon_event(__func__
);
3738 auto m
= op
->get_req
<MOSDFull
>();
3739 const int from
= m
->get_orig_source().num();
3741 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3742 const unsigned want_state
= m
->state
& mask
; // safety first
3744 unsigned cur_state
= osdmap
.get_state(from
);
3745 auto p
= pending_inc
.new_state
.find(from
);
3746 if (p
!= pending_inc
.new_state
.end()) {
3747 cur_state
^= p
->second
;
3751 set
<string
> want_state_set
, cur_state_set
;
3752 OSDMap::calc_state_set(want_state
, want_state_set
);
3753 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3755 if (cur_state
!= want_state
) {
3756 if (p
!= pending_inc
.new_state
.end()) {
3759 pending_inc
.new_state
[from
] = 0;
3761 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3762 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3763 << " -> " << want_state_set
<< dendl
;
3765 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3766 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3769 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3776 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3778 op
->mark_osdmon_event(__func__
);
3779 auto m
= op
->get_req
<MOSDAlive
>();
3780 int from
= m
->get_orig_source().num();
3782 // check permissions, ignore if failed
3783 MonSession
*session
= op
->get_session();
3786 if (!session
->is_capable("osd", MON_CAP_X
)) {
3787 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3788 << session
->caps
<< dendl
;
3792 if (!osdmap
.is_up(from
) ||
3793 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3794 dout(7) << "preprocess_alive ignoring alive message from down "
3795 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3800 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3802 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3803 _reply_map(op
, m
->version
);
3807 dout(10) << "preprocess_alive want up_thru " << m
->want
3808 << " from " << m
->get_orig_source_inst() << dendl
;
3815 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3817 op
->mark_osdmon_event(__func__
);
3818 auto m
= op
->get_req
<MOSDAlive
>();
3819 int from
= m
->get_orig_source().num();
3821 if (0) { // we probably don't care much about these
3822 mon
->clog
->debug() << m
->get_orig_source_inst() << " alive";
3825 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3826 << " from " << m
->get_orig_source_inst() << dendl
;
3828 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3829 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3833 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3835 op
->mark_osdmon_event(__func__
);
3836 dout(7) << "_reply_map " << e
3837 << " from " << op
->get_req()->get_orig_source_inst()
3843 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3845 op
->mark_osdmon_event(__func__
);
3846 auto m
= op
->get_req
<MOSDPGCreated
>();
3847 dout(10) << __func__
<< " " << *m
<< dendl
;
3848 auto session
= op
->get_session();
3851 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3854 if (!session
->is_capable("osd", MON_CAP_X
)) {
3855 derr
<< __func__
<< " received from entity "
3856 << "with insufficient privileges " << session
->caps
<< dendl
;
3859 // always forward the "created!" to the leader
3863 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3865 op
->mark_osdmon_event(__func__
);
3866 auto m
= op
->get_req
<MOSDPGCreated
>();
3867 dout(10) << __func__
<< " " << *m
<< dendl
;
3868 auto src
= m
->get_orig_source();
3869 auto from
= src
.num();
3870 if (!src
.is_osd() ||
3871 !mon
->osdmon()->osdmap
.is_up(from
) ||
3872 !mon
->osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3873 m
->get_orig_source_addrs())) {
3874 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
3877 pending_created_pgs
.push_back(m
->pgid
);
3881 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3883 op
->mark_osdmon_event(__func__
);
3884 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3885 dout(10) << __func__
<< " " << *m
<< dendl
;
3886 const pg_pool_t
*pi
;
3887 auto session
= op
->get_session();
3889 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3892 if (!session
->is_capable("osd", MON_CAP_X
)) {
3893 derr
<< __func__
<< " received from entity "
3894 << "with insufficient privileges " << session
->caps
<< dendl
;
3897 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3899 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3902 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3903 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
3906 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
3907 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
3910 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
3911 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
3921 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
3923 op
->mark_osdmon_event(__func__
);
3924 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3925 dout(10) << __func__
<< " " << *m
<< dendl
;
3927 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
3928 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
3930 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
3931 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
3932 p
.get_pg_num_pending() > m
->pgid
.ps()) {
3933 dout(10) << __func__
3934 << " race with concurrent pg_num[_pending] update, will retry"
3936 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3941 p
.dec_pg_num(m
->pgid
,
3945 m
->last_epoch_started
,
3946 m
->last_epoch_clean
);
3947 p
.last_change
= pending_inc
.epoch
;
3949 // back off the merge attempt!
3950 p
.set_pg_num_pending(p
.get_pg_num());
3953 // force pre-nautilus clients to resend their ops, since they
3954 // don't understand pg_num_pending changes form a new interval
3955 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
3957 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
3959 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
3962 prob
> (double)(rand() % 1000)/1000.0) {
3963 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
3964 auto n
= new MMonCommand(mon
->monmap
->get_fsid());
3965 n
->set_connection(m
->get_connection());
3966 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
3967 osdmap
.get_pool_name(m
->pgid
.pool()) +
3968 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
3969 stringify(m
->pgid
.ps() + 1) + "\"}" };
3970 MonOpRequestRef nop
= mon
->op_tracker
.create_request
<MonOpRequest
>(n
);
3971 nop
->set_type_service();
3972 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
3974 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3983 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
3985 auto m
= op
->get_req
<MOSDPGTemp
>();
3986 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
3987 mempool::osdmap::vector
<int> empty
;
3988 int from
= m
->get_orig_source().num();
3989 size_t ignore_cnt
= 0;
3992 MonSession
*session
= op
->get_session();
3995 if (!session
->is_capable("osd", MON_CAP_X
)) {
3996 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3997 << session
->caps
<< dendl
;
4001 if (!osdmap
.is_up(from
) ||
4002 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
4003 dout(7) << "ignoring pgtemp message from down "
4004 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
4013 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4014 dout(20) << " " << p
->first
4015 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
4016 << " -> " << p
->second
<< dendl
;
4018 // does the pool exist?
4019 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
4021 * 1. If the osdmap does not have the pool, it means the pool has been
4022 * removed in-between the osd sending this message and us handling it.
4023 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4024 * not exist in the pending either, as the osds would not send a
4025 * message about a pool they know nothing about (yet).
4026 * 3. However, if the pool does exist in the pending, then it must be a
4027 * new pool, and not relevant to this message (see 1).
4029 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4030 << ": pool has been removed" << dendl
;
4035 int acting_primary
= -1;
4036 osdmap
.pg_to_up_acting_osds(
4037 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
4038 if (acting_primary
!= from
) {
4039 /* If the source isn't the primary based on the current osdmap, we know
4040 * that the interval changed and that we can discard this message.
4041 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4042 * which of two pg temp mappings on the same pg is more recent.
4044 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4045 << ": primary has changed" << dendl
;
4051 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
4052 osdmap
.primary_temp
->count(p
->first
)))
4055 // NOTE: we assume that this will clear pg_primary, so consider
4056 // an existing pg_primary field to imply a change
4057 if (p
->second
.size() &&
4058 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
4059 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
4060 osdmap
.primary_temp
->count(p
->first
)))
4064 // should we ignore all the pgs?
4065 if (ignore_cnt
== m
->pg_temp
.size())
4068 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
4069 _reply_map(op
, m
->map_epoch
);
4077 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4079 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4080 auto ut
= pending_inc
.new_up_thru
.find(from
);
4081 if (ut
!= pending_inc
.new_up_thru
.end()) {
4082 old_up_thru
= ut
->second
;
4084 if (up_thru
> old_up_thru
) {
4085 // set up_thru too, so the osd doesn't have to ask again
4086 pending_inc
.new_up_thru
[from
] = up_thru
;
4090 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4092 op
->mark_osdmon_event(__func__
);
4093 auto m
= op
->get_req
<MOSDPGTemp
>();
4094 int from
= m
->get_orig_source().num();
4095 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4096 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4097 uint64_t pool
= p
->first
.pool();
4098 if (pending_inc
.old_pools
.count(pool
)) {
4099 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4100 << ": pool pending removal" << dendl
;
4103 if (!osdmap
.have_pg_pool(pool
)) {
4104 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4105 << ": pool has been removed" << dendl
;
4108 pending_inc
.new_pg_temp
[p
->first
] =
4109 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4111 // unconditionally clear pg_primary (until this message can encode
4112 // a change for that, too.. at which point we need to also fix
4113 // preprocess_pg_temp)
4114 if (osdmap
.primary_temp
->count(p
->first
) ||
4115 pending_inc
.new_primary_temp
.count(p
->first
))
4116 pending_inc
.new_primary_temp
[p
->first
] = -1;
4119 // set up_thru too, so the osd doesn't have to ask again
4120 update_up_thru(from
, m
->map_epoch
);
4122 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
4129 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4131 op
->mark_osdmon_event(__func__
);
4132 auto m
= op
->get_req
<MRemoveSnaps
>();
4133 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4135 // check privilege, ignore if failed
4136 MonSession
*session
= op
->get_session();
4140 if (!session
->caps
.is_capable(
4142 session
->entity_name
,
4143 "osd", "osd pool rmsnap", {}, true, true, false,
4144 session
->get_peer_socket_addr())) {
4145 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4146 << session
->caps
<< dendl
;
4150 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4151 q
!= m
->snaps
.end();
4153 if (!osdmap
.have_pg_pool(q
->first
)) {
4154 dout(10) << " ignoring removed_snaps " << q
->second
4155 << " on non-existent pool " << q
->first
<< dendl
;
4158 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4159 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4160 p
!= q
->second
.end();
4162 if (*p
> pi
->get_snap_seq() ||
4163 !_is_removed_snap(q
->first
, *p
)) {
4169 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4170 auto reply
= make_message
<MRemoveSnaps
>();
4171 reply
->snaps
= m
->snaps
;
4172 mon
->send_reply(op
, reply
.detach());
4179 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4181 op
->mark_osdmon_event(__func__
);
4182 auto m
= op
->get_req
<MRemoveSnaps
>();
4183 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4185 for (auto& [pool
, snaps
] : m
->snaps
) {
4186 if (!osdmap
.have_pg_pool(pool
)) {
4187 dout(10) << " ignoring removed_snaps " << snaps
4188 << " on non-existent pool " << pool
<< dendl
;
4192 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4193 for (auto s
: snaps
) {
4194 if (!_is_removed_snap(pool
, s
) &&
4195 (!pending_inc
.new_pools
.count(pool
) ||
4196 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4197 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4198 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4199 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4200 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4201 newpi
->removed_snaps
.insert(s
);
4202 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4203 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4205 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4206 if (s
> newpi
->get_snap_seq()) {
4207 dout(10) << " pool " << pool
<< " snap_seq "
4208 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4209 newpi
->set_snap_seq(s
);
4211 newpi
->set_snap_epoch(pending_inc
.epoch
);
4212 dout(10) << " added pool " << pool
<< " snap " << s
4213 << " to removed_snaps queue" << dendl
;
4214 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4219 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4220 auto reply
= make_message
<MRemoveSnaps
>();
4221 reply
->snaps
= m
->snaps
;
4222 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
4228 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4230 op
->mark_osdmon_event(__func__
);
4231 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4232 dout(7) << __func__
<< " " << *m
<< dendl
;
4234 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4236 string k
= make_purged_snap_epoch_key(m
->start
);
4237 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
4239 unsigned long epoch
= m
->last
;
4240 while (it
->valid()) {
4241 if (it
->key().find("purged_epoch_") != 0) {
4244 string k
= it
->key();
4245 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4247 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
4248 } else if (epoch
> m
->last
) {
4251 bufferlist bl
= it
->value();
4252 auto p
= bl
.cbegin();
4256 } catch (buffer::error
& e
) {
4257 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
4262 n
+= 4 + v
.size() * 16;
4265 // impose a semi-arbitrary limit to message size
4271 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4272 reply
->purged_snaps
.swap(r
);
4273 mon
->send_reply(op
, reply
.detach());
4279 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4281 op
->mark_osdmon_event(__func__
);
4283 auto session
= op
->get_session();
4286 dout(10) << __func__
<< " no monitor session!" << dendl
;
4289 if (!session
->is_capable("osd", MON_CAP_X
)) {
4290 derr
<< __func__
<< " received from entity "
4291 << "with insufficient privileges " << session
->caps
<< dendl
;
4294 // Always forward the beacon to the leader, even if they are the same as
4295 // the old one. The leader will mark as down osds that haven't sent
4296 // beacon for a few minutes.
4300 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4302 op
->mark_osdmon_event(__func__
);
4303 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4304 const auto src
= beacon
->get_orig_source();
4305 dout(10) << __func__
<< " " << *beacon
4306 << " from " << src
<< dendl
;
4307 int from
= src
.num();
4309 if (!src
.is_osd() ||
4310 !osdmap
.is_up(from
) ||
4311 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4312 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4313 // share some new maps with this guy in case it may not be
4314 // aware of its own deadness...
4315 send_latest(op
, beacon
->version
+1);
4317 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4321 last_osd_report
[from
] = ceph_clock_now();
4322 osd_epochs
[from
] = beacon
->version
;
4324 for (const auto& pg
: beacon
->pgs
) {
4325 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
4328 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4329 beacon
->last_purged_snaps_scrub
) {
4330 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4331 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4333 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4334 beacon
->last_purged_snaps_scrub
;
4344 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4346 op
->mark_osdmon_event(__func__
);
4347 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4348 << " start " << start
<< dendl
;
4352 send_incremental(op
, start
);
// Build an MOSDMap message carrying the full osdmap at the current epoch,
// encoded for the given peer feature bits.  oldest_map/newest_map advertise
// the range of epochs this monitor can serve.
// NOTE(review): mangled extraction — statement fragments split across lines;
// the terminating `return r;` / closing brace lines appear to have been
// dropped (embedded original line numbers jump past 4361).
4356 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
// Allocate the reply message tagged with this cluster's fsid.
4358 MOSDMap
*r
= new MOSDMap(mon
->monmap
->fsid
, features
);
// Fetch the feature-appropriate full-map encoding for the current epoch
// directly into the message's maps[] payload.
4359 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4360 r
->oldest_map
= get_first_committed();
4361 r
->newest_map
= osdmap
.get_epoch();
4365 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4367 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4368 << std::hex
<< features
<< std::dec
<< dendl
;
4369 MOSDMap
*m
= new MOSDMap(mon
->monmap
->fsid
, features
);
4370 m
->oldest_map
= get_first_committed();
4371 m
->newest_map
= osdmap
.get_epoch();
4373 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4375 int err
= get_version(e
, features
, bl
);
4377 ceph_assert(bl
.length());
4378 // if (get_version(e, bl) > 0) {
4379 dout(20) << "build_incremental inc " << e
<< " "
4380 << bl
.length() << " bytes" << dendl
;
4381 m
->incremental_maps
[e
] = bl
;
4383 ceph_assert(err
== -ENOENT
);
4384 ceph_assert(!bl
.length());
4385 get_version_full(e
, features
, bl
);
4386 if (bl
.length() > 0) {
4387 //else if (get_version("full", e, bl) > 0) {
4388 dout(20) << "build_incremental full " << e
<< " "
4389 << bl
.length() << " bytes" << dendl
;
4392 ceph_abort(); // we should have all maps.
// Reply to `op` with a full osdmap message built for the requesting
// session's connection features.
// NOTE(review): mangled extraction — statements split across lines and brace
// lines missing; comments only, code text left byte-identical.
4399 void OSDMonitor::send_full(MonOpRequestRef op
)
4401 op
->mark_osdmon_event(__func__
);
4402 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
// build_latest_full() encodes for the peer's negotiated feature bits.
4403 mon
->send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4406 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4408 op
->mark_osdmon_event(__func__
);
4410 MonSession
*s
= op
->get_session();
4414 // oh, we can tell the other mon to do it
4415 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4417 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4418 r
->send_osdmap_first
= first
;
4419 s
->proxy_con
->send_message(r
);
4420 op
->mark_event("reply: send routed send_osdmap_first reply");
4423 send_incremental(first
, s
, false, op
);
4427 void OSDMonitor::send_incremental(epoch_t first
,
4428 MonSession
*session
,
4430 MonOpRequestRef req
)
4432 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4433 << " to " << session
->name
<< dendl
;
4435 // get feature of the peer
4436 // use quorum_con_features, if it's an anonymous connection.
4437 uint64_t features
= session
->con_features
? session
->con_features
:
4438 mon
->get_quorum_con_features();
4440 if (first
<= session
->osd_epoch
) {
4441 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4442 << session
->osd_epoch
<< dendl
;
4443 first
= session
->osd_epoch
+ 1;
4446 if (first
< get_first_committed()) {
4447 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4448 m
->oldest_map
= get_first_committed();
4449 m
->newest_map
= osdmap
.get_epoch();
4451 first
= get_first_committed();
4453 int err
= get_version_full(first
, features
, bl
);
4454 ceph_assert(err
== 0);
4455 ceph_assert(bl
.length());
4456 dout(20) << "send_incremental starting with base full "
4457 << first
<< " " << bl
.length() << " bytes" << dendl
;
4458 m
->maps
[first
] = bl
;
4461 mon
->send_reply(req
, m
);
4462 session
->osd_epoch
= first
;
4465 session
->con
->send_message(m
);
4466 session
->osd_epoch
= first
;
4471 while (first
<= osdmap
.get_epoch()) {
4472 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4473 osdmap
.get_epoch());
4474 MOSDMap
*m
= build_incremental(first
, last
, features
);
4477 // send some maps. it may not be all of them, but it will get them
4479 mon
->send_reply(req
, m
);
4481 session
->con
->send_message(m
);
4484 session
->osd_epoch
= last
;
// Convenience overload: fetch the incremental-map encoding for `ver` using
// the current quorum's connection features.
// NOTE(review): mangled extraction — brace lines missing; comments only.
4490 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4492 return get_version(ver
, mon
->get_quorum_con_features(), bl
);
4495 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4497 OSDMap::Incremental inc
;
4498 auto q
= bl
.cbegin();
4500 // always encode with subset of osdmap's canonical features
4501 uint64_t f
= features
& inc
.encode_features
;
4502 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4505 if (inc
.fullmap
.length()) {
4506 // embedded full map?
4508 m
.decode(inc
.fullmap
);
4509 inc
.fullmap
.clear();
4510 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4512 if (inc
.crush
.length()) {
4513 // embedded crush map
4515 auto p
= inc
.crush
.cbegin();
4518 c
.encode(inc
.crush
, f
);
4520 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4523 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4526 auto q
= bl
.cbegin();
4528 // always encode with subset of osdmap's canonical features
4529 uint64_t f
= features
& m
.get_encoding_features();
4530 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4533 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4536 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
4538 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4539 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4542 int ret
= PaxosService::get_version(ver
, bl
);
4546 // NOTE: this check is imprecise; the OSDMap encoding features may
4547 // be a subset of the latest mon quorum features, but worst case we
4548 // reencode once and then cache the (identical) result under both
4550 if (significant_features
!=
4551 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4552 reencode_incremental_map(bl
, features
);
4554 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4558 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4561 int err
= get_version(ver
, inc_bl
);
4562 ceph_assert(err
== 0);
4563 ceph_assert(inc_bl
.length());
4565 auto p
= inc_bl
.cbegin();
4567 dout(10) << __func__
<< " "
4568 << " epoch " << inc
.epoch
4569 << " inc_crc " << inc
.inc_crc
4570 << " full_crc " << inc
.full_crc
4571 << " encode_features " << inc
.encode_features
<< dendl
;
4575 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4577 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4579 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4580 if (closest_pinned
== 0) {
4583 if (closest_pinned
> ver
) {
4584 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4586 ceph_assert(closest_pinned
<= ver
);
4588 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4590 // get osdmap incremental maps and apply on top of this one.
4592 bool has_cached_osdmap
= false;
4593 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4594 if (full_osd_cache
.lookup({v
, mon
->get_quorum_con_features()},
4596 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4598 has_cached_osdmap
= true;
4603 if (!has_cached_osdmap
) {
4604 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4606 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4607 << " not available! error: " << cpp_strerror(err
) << dendl
;
4609 ceph_assert(err
== 0);
4612 ceph_assert(osdm_bl
.length());
4615 osdm
.decode(osdm_bl
);
4617 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4618 << " e" << osdm
.epoch
4619 << " crc " << osdm
.get_crc()
4620 << " -- applying incremental maps." << dendl
;
4622 uint64_t encode_features
= 0;
4623 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4624 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4626 OSDMap::Incremental inc
;
4627 int err
= get_inc(v
, inc
);
4628 ceph_assert(err
== 0);
4630 encode_features
= inc
.encode_features
;
4632 err
= osdm
.apply_incremental(inc
);
4633 ceph_assert(err
== 0);
4635 // this block performs paranoid checks on map retrieval
4636 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4637 inc
.full_crc
!= 0) {
4639 uint64_t f
= encode_features
;
4641 f
= (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
4644 // encode osdmap to force calculating crcs
4646 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4647 // decode osdmap to compare crcs with what's expected by incremental
4651 if (tosdm
.get_crc() != inc
.full_crc
) {
4653 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4654 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4655 ceph_abort_msg("osdmap crc mismatch");
4659 // note: we cannot add the recently computed map to the cache, as is,
4660 // because we have not encoded the map into a bl.
4663 if (!encode_features
) {
4664 dout(10) << __func__
4665 << " last incremental map didn't have features;"
4666 << " defaulting to quorum's or all" << dendl
;
4668 (mon
->quorum_con_features
? mon
->quorum_con_features
: -1);
4670 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
// Convenience overload: fetch the full-map encoding for `ver` using the
// current quorum's connection features.
// NOTE(review): mangled extraction — brace lines missing; comments only.
4675 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4677 return get_version_full(ver
, mon
->get_quorum_con_features(), bl
);
4680 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4683 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4684 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4687 int ret
= PaxosService::get_version_full(ver
, bl
);
4688 if (ret
== -ENOENT
) {
4690 ret
= get_full_from_pinned_map(ver
, bl
);
4695 // NOTE: this check is imprecise; the OSDMap encoding features may
4696 // be a subset of the latest mon quorum features, but worst case we
4697 // reencode once and then cache the (identical) result under both
4699 if (significant_features
!=
4700 OSDMap::get_significant_features(mon
->get_quorum_con_features())) {
4701 reencode_full_map(bl
, features
);
4703 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
// Queue every address in `av` onto pending_inc.new_blacklist with expiry
// `until`, and return the pending epoch in which the blacklist takes effect.
// Address type is normalized per the cluster's require_osd_release:
// TYPE_ANY on nautilus or later, TYPE_LEGACY otherwise.
// NOTE(review): mangled extraction — the `} else {` delimiter lines between
// the two set_type() calls appear to have been dropped; comments only.
4707 epoch_t
OSDMonitor::blacklist(const entity_addrvec_t
& av
, utime_t until
)
4709 dout(10) << "blacklist " << av
<< " until " << until
<< dendl
;
// Note: `a` is taken by value so set_type() mutates a local copy.
4710 for (auto a
: av
.v
) {
4711 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4712 a
.set_type(entity_addr_t::TYPE_ANY
);
4714 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4716 pending_inc
.new_blacklist
[a
] = until
;
4718 return pending_inc
.epoch
;
// Single-address variant: queue `a` onto pending_inc.new_blacklist with
// expiry `until` and return the pending epoch.  As in the addrvec overload,
// the address type is normalized to TYPE_ANY on nautilus+ clusters and
// TYPE_LEGACY otherwise (`a` is a by-value copy, so mutation is local).
// NOTE(review): mangled extraction — `} else {` delimiter lines dropped;
// comments only, code text left byte-identical.
4721 epoch_t
OSDMonitor::blacklist(entity_addr_t a
, utime_t until
)
4723 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4724 a
.set_type(entity_addr_t::TYPE_ANY
);
4726 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4728 dout(10) << "blacklist " << a
<< " until " << until
<< dendl
;
4729 pending_inc
.new_blacklist
[a
] = until
;
4730 return pending_inc
.epoch
;
4734 void OSDMonitor::check_osdmap_subs()
4736 dout(10) << __func__
<< dendl
;
4737 if (!osdmap
.get_epoch()) {
4740 auto osdmap_subs
= mon
->session_map
.subs
.find("osdmap");
4741 if (osdmap_subs
== mon
->session_map
.subs
.end()) {
4744 auto p
= osdmap_subs
->second
->begin();
4748 check_osdmap_sub(sub
);
4752 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4754 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4755 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4756 if (sub
->next
<= osdmap
.get_epoch()) {
4758 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4760 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4762 mon
->session_map
.remove_sub(sub
);
4764 sub
->next
= osdmap
.get_epoch() + 1;
4768 void OSDMonitor::check_pg_creates_subs()
4770 if (!osdmap
.get_num_up_osds()) {
4773 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4774 mon
->with_session_map([this](const MonSessionMap
& session_map
) {
4775 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4776 if (pg_creates_subs
== session_map
.subs
.end()) {
4779 for (auto sub
: *pg_creates_subs
->second
) {
4780 check_pg_creates_sub(sub
);
4785 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4787 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4788 ceph_assert(sub
->type
== "osd_pg_creates");
4789 // only send these if the OSD is up. we will check_subs() when they do
4790 // come up so they will get the creates then.
4791 if (sub
->session
->name
.is_osd() &&
4792 mon
->osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4793 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4794 sub
->session
->con
.get(),
4799 void OSDMonitor::do_application_enable(int64_t pool_id
,
4800 const std::string
&app_name
,
4801 const std::string
&app_key
,
4802 const std::string
&app_value
,
4805 ceph_assert(paxos
->is_plugged() && is_writeable());
4807 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4810 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4812 auto pp
= osdmap
.get_pg_pool(pool_id
);
4813 ceph_assert(pp
!= nullptr);
4816 if (pending_inc
.new_pools
.count(pool_id
)) {
4817 p
= pending_inc
.new_pools
[pool_id
];
4820 if (app_key
.empty()) {
4821 p
.application_metadata
.insert({app_name
, {}});
4824 p
.application_metadata
[app_name
][app_key
] = app_value
;
4826 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4829 p
.last_change
= pending_inc
.epoch
;
4830 pending_inc
.new_pools
[pool_id
] = p
;
// Set pool option `opt` to `val` on pool `pool_id` in the pending
// incremental.  try_emplace seeds pending_inc.new_pools[pool_id] from the
// current osdmap's pg_pool_t only if no pending copy exists yet, so earlier
// pending edits to the same pool are preserved.
// NOTE(review): mangled extraction — brace lines missing; comments only.
4833 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4834 pool_opts_t::key_t opt
,
4835 pool_opts_t::value_t val
)
4837 auto p
= pending_inc
.new_pools
.try_emplace(
4838 pool_id
, *osdmap
.get_pg_pool(pool_id
));
// p.first points at the (possibly pre-existing) pending pool entry.
4839 p
.first
->second
.opts
.set(opt
, val
);
4842 unsigned OSDMonitor::scan_for_creating_pgs(
4843 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4844 const mempool::osdmap::set
<int64_t>& removed_pools
,
4846 creating_pgs_t
* creating_pgs
) const
4848 unsigned queued
= 0;
4849 for (auto& p
: pools
) {
4850 int64_t poolid
= p
.first
;
4851 if (creating_pgs
->created_pools
.count(poolid
)) {
4852 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4855 const pg_pool_t
& pool
= p
.second
;
4856 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
4857 pool
.get_type(), pool
.get_size());
4858 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4861 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4862 const auto created
= pool
.get_last_change();
4863 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4864 dout(10) << __func__
<< " no change in pool " << poolid
4865 << " " << pool
<< dendl
;
4868 if (removed_pools
.count(poolid
)) {
4869 dout(10) << __func__
<< " pool is being removed: " << poolid
4870 << " " << pool
<< dendl
;
4873 dout(10) << __func__
<< " queueing pool create for " << poolid
4874 << " " << pool
<< dendl
;
4875 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
4882 void OSDMonitor::update_creating_pgs()
4884 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4885 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4886 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
4887 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4888 for (const auto& pg
: creating_pgs
.pgs
) {
4889 int acting_primary
= -1;
4890 auto pgid
= pg
.first
;
4891 if (!osdmap
.pg_exists(pgid
)) {
4892 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4896 auto mapped
= pg
.second
.create_epoch
;
4897 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4899 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4900 // check the previous creating_pgs, look for the target to whom the pg was
4901 // previously mapped
4902 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
4903 const auto last_acting_primary
= pgs_by_epoch
.first
;
4904 for (auto& pgs
: pgs_by_epoch
.second
) {
4905 if (pgs
.second
.count(spgid
)) {
4906 if (last_acting_primary
== acting_primary
) {
4909 dout(20) << __func__
<< " " << pgid
<< " "
4910 << " acting_primary:" << last_acting_primary
4911 << " -> " << acting_primary
<< dendl
;
4912 // note epoch if the target of the create message changed.
4913 mapped
= mapping
.get_epoch();
4918 mapped
= mapping
.get_epoch();
4922 dout(10) << __func__
<< " will instruct osd." << acting_primary
4923 << " to create " << pgid
<< "@" << mapped
<< dendl
;
4924 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
4926 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
4927 creating_pgs_epoch
= mapping
.get_epoch();
4930 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
4932 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
4933 << " " << creating_pgs_by_osd_epoch
<< dendl
;
4934 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4935 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
4936 dout(20) << __func__
4937 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
4938 // the subscribers will be updated when the mapping is completed anyway
4941 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
4942 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
4944 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
4946 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
4947 MOSDPGCreate2
*m
= nullptr;
4949 bool old
= osdmap
.require_osd_release
< ceph_release_t::nautilus
;
4952 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
4953 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
4954 auto epoch
= epoch_pgs
->first
;
4955 auto& pgs
= epoch_pgs
->second
;
4956 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4957 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
4959 for (auto& pg
: pgs
) {
4960 // Need the create time from the monitor using its clock to set
4961 // last_scrub_stamp upon pg creation.
4962 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
4963 ceph_assert(create
!= creating_pgs
.pgs
.end());
4966 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
4968 oldm
->mkpg
.emplace(pg
.pgid
,
4969 pg_create_t
{create
->second
.create_epoch
, pg
.pgid
, 0});
4970 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.create_stamp
);
4973 m
= new MOSDPGCreate2(creating_pgs_epoch
);
4975 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
4976 create
->second
.create_stamp
));
4977 if (create
->second
.history
.epoch_created
) {
4978 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
4979 << " " << create
->second
.past_intervals
<< dendl
;
4980 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
4981 create
->second
.past_intervals
));
4984 dout(20) << __func__
<< " will create " << pg
4985 << " at " << create
->second
.create_epoch
<< dendl
;
4989 con
->send_message(m
);
4991 con
->send_message(oldm
);
4993 dout(20) << __func__
<< " osd." << osd
<< " from " << next
4994 << " has nothing to send" << dendl
;
4998 // sub is current through last + 1
5005 void OSDMonitor::tick()
5007 if (!is_active()) return;
5009 dout(10) << osdmap
<< dendl
;
5011 // always update osdmap manifest, regardless of being the leader.
5012 load_osdmap_manifest();
5014 // always tune priority cache manager memory on leader and peons
5015 if (ceph_using_tcmalloc() && mon_memory_autotune
) {
5016 std::lock_guard
l(balancer_lock
);
5017 if (pcm
!= nullptr) {
5020 _set_new_cache_sizes();
5021 dout(10) << "tick balancer "
5022 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
5023 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
5024 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
5025 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
5027 dout(10) << "tick balancer "
5028 << " full cache_bytes: " << full_cache
->get_cache_bytes()
5029 << " full comtd_bytes: " << full_cache
->get_committed_size()
5030 << " full used_bytes: " << full_cache
->_get_used_bytes()
5031 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
5036 if (!mon
->is_leader()) return;
5038 bool do_propose
= false;
5039 utime_t now
= ceph_clock_now();
5041 if (handle_osd_timeouts(now
, last_osd_report
)) {
5046 if (check_failures(now
)) {
5050 // Force a proposal if we need to prune; pruning is performed on
5051 // ``encode_pending()``, hence why we need to regularly trigger a proposal
5052 // even if there's nothing going on.
5053 if (is_prune_enabled() && should_prune()) {
5057 // mark down osds out?
5059 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
5060 * influence at all. The decision is made based on the ratio of "in" osds,
5061 * and the function returns false if this ratio is lower that the minimum
5062 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
5064 if (can_mark_out(-1)) {
5065 string down_out_subtree_limit
= g_conf().get_val
<string
>(
5066 "mon_osd_down_out_subtree_limit");
5067 set
<int> down_cache
; // quick cache of down subtrees
5069 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
5070 while (i
!= down_pending_out
.end()) {
5076 if (osdmap
.is_down(o
) &&
5079 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
5080 utime_t grace
= orig_grace
;
5081 double my_grace
= 0.0;
5083 if (g_conf()->mon_osd_adjust_down_out_interval
) {
5084 // scale grace period the same way we do the heartbeat grace.
5085 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
5086 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
5087 double decay_k
= ::log(.5) / halflife
;
5088 double decay
= exp((double)down
* decay_k
);
5089 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
5090 << " down for " << down
<< " decay " << decay
<< dendl
;
5091 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
5095 // is this an entire large subtree down?
5096 if (down_out_subtree_limit
.length()) {
5097 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
5099 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
5100 dout(10) << "tick entire containing " << down_out_subtree_limit
5101 << " subtree for osd." << o
5102 << " is down; resetting timer" << dendl
;
5103 // reset timer, too.
5104 down_pending_out
[o
] = now
;
5110 bool down_out
= !osdmap
.is_destroyed(o
) &&
5111 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5112 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5113 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5114 // this is not precise enough as we did not make a note when this osd
5115 // was marked as destroyed, but let's not bother with that
5116 // complexity for now.
5117 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5118 if (down_out
|| destroyed_out
) {
5119 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5120 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5121 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5123 // set the AUTOOUT bit.
5124 if (pending_inc
.new_state
.count(o
) == 0)
5125 pending_inc
.new_state
[o
] = 0;
5126 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5128 // remember previous weight
5129 if (pending_inc
.new_xinfo
.count(o
) == 0)
5130 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5131 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5135 mon
->clog
->info() << "Marking osd." << o
<< " out (has been down for "
5136 << int(down
.sec()) << " seconds)";
5141 down_pending_out
.erase(o
);
5144 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5147 // expire blacklisted items?
5148 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5149 p
!= osdmap
.blacklist
.end();
5151 if (p
->second
< now
) {
5152 dout(10) << "expiring blacklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5153 pending_inc
.old_blacklist
.push_back(p
->first
);
5158 if (try_prune_purged_snaps()) {
5162 if (update_pools_status())
5166 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
// Propagate the priority-cache manager's current allotments to the osdmap
// caches: read committed sizes from pcm / inc_cache / full_cache /
// rocksdb_binned_kv_cache (when present) and resize inc_osd_cache and
// full_osd_cache accordingly, then log the numbers at level 1.
// NOTE(review): mangled extraction — brace lines and the trailing `dendl`
// terminator of the dout appear to have been dropped; comments only.
5170 void OSDMonitor::_set_new_cache_sizes()
5172 uint64_t cache_size
= 0;
5173 int64_t inc_alloc
= 0;
5174 int64_t full_alloc
= 0;
5175 int64_t kv_alloc
= 0;
// Only read allotments when the cache manager and the kv cache both exist.
5177 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5178 cache_size
= pcm
->get_tuned_mem();
5179 inc_alloc
= inc_cache
->get_committed_size();
5180 full_alloc
= full_cache
->get_committed_size();
5181 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
// Apply the (possibly zero) byte budgets to the LRU caches.
5184 inc_osd_cache
.set_bytes(inc_alloc
);
5185 full_osd_cache
.set_bytes(full_alloc
);
5187 dout(1) << __func__
<< " cache_size:" << cache_size
5188 << " inc_alloc: " << inc_alloc
5189 << " full_alloc: " << full_alloc
5190 << " kv_alloc: " << kv_alloc
5194 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5195 std::map
<int,utime_t
> &last_osd_report
)
5197 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
5198 if (now
- mon
->get_leader_since() < timeo
) {
5199 // We haven't been the leader for long enough to consider OSD timeouts
5203 int max_osd
= osdmap
.get_max_osd();
5204 bool new_down
= false;
5206 for (int i
=0; i
< max_osd
; ++i
) {
5207 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5208 if (!osdmap
.exists(i
)) {
5209 last_osd_report
.erase(i
); // if any
5212 if (!osdmap
.is_up(i
))
5214 const std::map
<int,utime_t
>::const_iterator t
= last_osd_report
.find(i
);
5215 if (t
== last_osd_report
.end()) {
5216 // it wasn't in the map; start the timer.
5217 last_osd_report
[i
] = now
;
5218 } else if (can_mark_down(i
)) {
5219 utime_t diff
= now
- t
->second
;
5221 mon
->clog
->info() << "osd." << i
<< " marked down after no beacon for "
5222 << diff
<< " seconds";
5223 derr
<< "no beacon from osd." << i
<< " since " << t
->second
5224 << ", " << diff
<< " seconds ago. marking down" << dendl
;
5225 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
// File-local helper: parse a CPU-set list string (e.g. "0-3,8") and dump the
// resulting CPU ids as a formatter array section named `name`.  On parse
// failure the section is not opened (early return, dropped from view).
// NOTE(review): mangled extraction — the `cpu_set` declaration line and
// several brace/return lines appear to have been dropped; comments only.
5233 static void dump_cpu_list(Formatter
*f
, const char *name
,
5234 const string
& strlist
)
5237 size_t cpu_set_size
;
// parse_cpu_set_list() fills cpu_set/cpu_set_size; < 0 means parse error.
5238 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
// Expand the packed cpu_set into an ordered set of CPU ids.
5241 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5242 f
->open_array_section(name
);
5243 for (auto cpu
: cpus
) {
5244 f
->dump_int("cpu", cpu
);
5249 void OSDMonitor::dump_info(Formatter
*f
)
5251 f
->open_object_section("osdmap");
5255 f
->open_array_section("osd_metadata");
5256 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5257 if (osdmap
.exists(i
)) {
5258 f
->open_object_section("osd");
5259 f
->dump_unsigned("id", i
);
5260 dump_osd_metadata(i
, f
, NULL
);
5266 f
->open_object_section("osdmap_clean_epochs");
5267 f
->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5269 f
->open_object_section("last_epoch_clean");
5270 last_epoch_clean
.dump(f
);
5273 f
->open_array_section("osd_epochs");
5274 for (auto& osd_epoch
: osd_epochs
) {
5275 f
->open_object_section("osd");
5276 f
->dump_unsigned("id", osd_epoch
.first
);
5277 f
->dump_unsigned("epoch", osd_epoch
.second
);
5280 f
->close_section(); // osd_epochs
5282 f
->close_section(); // osd_clean_epochs
5284 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5285 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5287 f
->open_object_section("crushmap");
5288 osdmap
.crush
->dump(f
);
5291 if (has_osdmap_manifest
) {
5292 f
->open_object_section("osdmap_manifest");
5293 osdmap_manifest
.dump(f
);
5299 enum osd_pool_get_choices
{
5301 PG_NUM
, PGP_NUM
, CRUSH_RULE
, HASHPSPOOL
, EC_OVERWRITES
,
5302 NODELETE
, NOPGCHANGE
, NOSIZECHANGE
,
5303 WRITE_FADVISE_DONTNEED
, NOSCRUB
, NODEEP_SCRUB
,
5304 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5305 USE_GMT_HITSET
, TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
,
5306 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5307 CACHE_TARGET_FULL_RATIO
,
5308 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5309 ERASURE_CODE_PROFILE
, MIN_READ_RECENCY_FOR_PROMOTE
,
5310 MIN_WRITE_RECENCY_FOR_PROMOTE
, FAST_READ
,
5311 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
,
5312 SCRUB_MIN_INTERVAL
, SCRUB_MAX_INTERVAL
, DEEP_SCRUB_INTERVAL
,
5313 RECOVERY_PRIORITY
, RECOVERY_OP_PRIORITY
, SCRUB_PRIORITY
,
5314 COMPRESSION_MODE
, COMPRESSION_ALGORITHM
, COMPRESSION_REQUIRED_RATIO
,
5315 COMPRESSION_MAX_BLOB_SIZE
, COMPRESSION_MIN_BLOB_SIZE
,
5316 CSUM_TYPE
, CSUM_MAX_BLOCK
, CSUM_MIN_BLOCK
, FINGERPRINT_ALGORITHM
,
5317 PG_AUTOSCALE_MODE
, PG_NUM_MIN
, TARGET_SIZE_BYTES
, TARGET_SIZE_RATIO
,
5318 PG_AUTOSCALE_BIAS
};
// Set difference helper for osd_pool_get_choices: returns the members of
// `first` that are not in `second`, via std::set_difference (both inputs are
// std::set, hence already sorted as the algorithm requires).
// NOTE(review): mangled extraction — brace lines and the `return result;`
// line appear to have been dropped; comments only.
5320 std::set
<osd_pool_get_choices
>
5321 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5322 const std::set
<osd_pool_get_choices
>& second
)
5324 std::set
<osd_pool_get_choices
> result
;
5325 std::set_difference(first
.begin(), first
.end(),
5326 second
.begin(), second
.end(),
5327 std::inserter(result
, result
.end()));
5333 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5335 op
->mark_osdmon_event(__func__
);
5336 auto m
= op
->get_req
<MMonCommand
>();
5339 stringstream ss
, ds
;
5342 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5343 string rs
= ss
.str();
5344 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
5348 MonSession
*session
= op
->get_session();
5350 derr
<< __func__
<< " no session" << dendl
;
5351 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
5356 cmd_getval(cmdmap
, "prefix", prefix
);
5359 cmd_getval(cmdmap
, "format", format
, string("plain"));
5360 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5362 if (prefix
== "osd stat") {
5364 f
->open_object_section("osdmap");
5365 osdmap
.print_summary(f
.get(), ds
, "", true);
5369 osdmap
.print_summary(nullptr, ds
, "", true);
5373 else if (prefix
== "osd dump" ||
5374 prefix
== "osd tree" ||
5375 prefix
== "osd tree-from" ||
5376 prefix
== "osd ls" ||
5377 prefix
== "osd getmap" ||
5378 prefix
== "osd getcrushmap" ||
5379 prefix
== "osd ls-tree" ||
5380 prefix
== "osd info") {
5385 cmd_getval(cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
5388 bufferlist osdmap_bl
;
5389 int err
= get_version_full(epoch
, osdmap_bl
);
5390 if (err
== -ENOENT
) {
5392 ss
<< "there is no map for epoch " << epoch
;
5395 ceph_assert(err
== 0);
5396 ceph_assert(osdmap_bl
.length());
5399 if (epoch
== osdmap
.get_epoch()) {
5403 p
->decode(osdmap_bl
);
5406 auto sg
= make_scope_guard([&] {
5412 if (prefix
== "osd dump") {
5415 f
->open_object_section("osdmap");
5425 } else if (prefix
== "osd ls") {
5427 f
->open_array_section("osds");
5428 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5429 if (osdmap
.exists(i
)) {
5430 f
->dump_int("osd", i
);
5437 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5438 if (osdmap
.exists(i
)) {
5447 } else if (prefix
== "osd info") {
5449 bool do_single_osd
= true;
5450 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5451 do_single_osd
= false;
5454 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5455 ss
<< "osd." << osd_id
<< " does not exist";
5461 if (do_single_osd
) {
5462 osdmap
.dump_osd(osd_id
, f
.get());
5464 osdmap
.dump_osds(f
.get());
5468 if (do_single_osd
) {
5469 osdmap
.print_osd(osd_id
, ds
);
5471 osdmap
.print_osds(ds
);
5475 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5477 if (prefix
== "osd tree-from") {
5478 cmd_getval(cmdmap
, "bucket", bucket
);
5479 if (!osdmap
.crush
->name_exists(bucket
)) {
5480 ss
<< "bucket '" << bucket
<< "' does not exist";
5484 int id
= osdmap
.crush
->get_item_id(bucket
);
5486 ss
<< "\"" << bucket
<< "\" is not a bucket";
5492 vector
<string
> states
;
5493 cmd_getval(cmdmap
, "states", states
);
5494 unsigned filter
= 0;
5495 for (auto& s
: states
) {
5497 filter
|= OSDMap::DUMP_UP
;
5498 } else if (s
== "down") {
5499 filter
|= OSDMap::DUMP_DOWN
;
5500 } else if (s
== "in") {
5501 filter
|= OSDMap::DUMP_IN
;
5502 } else if (s
== "out") {
5503 filter
|= OSDMap::DUMP_OUT
;
5504 } else if (s
== "destroyed") {
5505 filter
|= OSDMap::DUMP_DESTROYED
;
5507 ss
<< "unrecognized state '" << s
<< "'";
5512 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5513 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5514 ss
<< "cannot specify both 'in' and 'out'";
5518 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5519 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5520 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5521 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5522 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5523 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5524 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5529 f
->open_object_section("tree");
5530 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5534 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5537 } else if (prefix
== "osd getmap") {
5538 rdata
.append(osdmap_bl
);
5539 ss
<< "got osdmap epoch " << p
->get_epoch();
5540 } else if (prefix
== "osd getcrushmap") {
5541 p
->crush
->encode(rdata
, mon
->get_quorum_con_features());
5542 ss
<< p
->get_crush_version();
5543 } else if (prefix
== "osd ls-tree") {
5545 cmd_getval(cmdmap
, "name", bucket_name
);
5547 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5549 ss
<< "\"" << bucket_name
<< "\" does not exist";
5552 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5557 f
->open_array_section("osds");
5558 for (auto &i
: osds
) {
5559 if (osdmap
.exists(i
)) {
5560 f
->dump_int("osd", i
);
5567 for (auto &i
: osds
) {
5568 if (osdmap
.exists(i
)) {
5579 } else if (prefix
== "osd getmaxosd") {
5581 f
->open_object_section("getmaxosd");
5582 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5583 f
->dump_int("max_osd", osdmap
.get_max_osd());
5587 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5590 } else if (prefix
== "osd utilization") {
5592 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5599 } else if (prefix
== "osd find") {
5601 if (!cmd_getval(cmdmap
, "id", osd
)) {
5602 ss
<< "unable to parse osd id value '"
5603 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5607 if (!osdmap
.exists(osd
)) {
5608 ss
<< "osd." << osd
<< " does not exist";
5613 cmd_getval(cmdmap
, "format", format
);
5614 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5615 f
->open_object_section("osd_location");
5616 f
->dump_int("osd", osd
);
5617 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5618 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5620 // try to identify host, pod/container name, etc.
5621 map
<string
,string
> m
;
5622 load_metadata(osd
, m
, nullptr);
5623 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5624 f
->dump_string("host", p
->second
);
5627 "pod_name", "pod_namespace", // set by rook
5628 "container_name" // set by cephadm, ceph-ansible
5630 if (auto p
= m
.find(k
); p
!= m
.end()) {
5631 f
->dump_string(k
, p
->second
);
5635 // crush is helpful too
5636 f
->open_object_section("crush_location");
5637 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5638 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5639 f
->dump_string(p
->first
.c_str(), p
->second
);
5643 } else if (prefix
== "osd metadata") {
5645 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5646 !cmd_getval(cmdmap
, "id", osd
)) {
5647 ss
<< "unable to parse osd id value '"
5648 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5652 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5653 ss
<< "osd." << osd
<< " does not exist";
5658 cmd_getval(cmdmap
, "format", format
);
5659 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5661 f
->open_object_section("osd_metadata");
5662 f
->dump_unsigned("id", osd
);
5663 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5669 f
->open_array_section("osd_metadata");
5670 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5671 if (osdmap
.exists(i
)) {
5672 f
->open_object_section("osd");
5673 f
->dump_unsigned("id", i
);
5674 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5675 if (r
== -EINVAL
|| r
== -ENOENT
) {
5676 // Drop error, continue to get other daemons' metadata
5677 dout(4) << "No metadata for osd." << i
<< dendl
;
5689 } else if (prefix
== "osd versions") {
5691 f
.reset(Formatter::create("json-pretty"));
5692 count_metadata("ceph_version", f
.get());
5695 } else if (prefix
== "osd count-metadata") {
5697 f
.reset(Formatter::create("json-pretty"));
5699 cmd_getval(cmdmap
, "property", field
);
5700 count_metadata(field
, f
.get());
5703 } else if (prefix
== "osd numa-status") {
5706 f
->open_array_section("osds");
5708 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5709 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5710 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5711 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5712 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5713 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5715 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5716 if (osdmap
.exists(i
)) {
5717 map
<string
,string
> m
;
5719 if (load_metadata(i
, m
, &err
) < 0) {
5723 auto p
= m
.find("hostname");
5728 f
->open_object_section("osd");
5729 f
->dump_int("osd", i
);
5730 f
->dump_string("host", host
);
5731 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5735 f
->dump_int(n
, atoi(p
->second
.c_str()));
5738 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5741 list
<string
> ls
= get_str_list(p
->second
, ",");
5742 f
->open_array_section(n
);
5743 for (auto node
: ls
) {
5744 f
->dump_int("node", atoi(node
.c_str()));
5749 for (auto n
: { "numa_node_cpus" }) {
5752 dump_cpu_list(f
.get(), n
, p
->second
);
5759 p
= m
.find("network_numa_nodes");
5765 p
= m
.find("objectstore_numa_nodes");
5771 p
= m
.find("numa_node");
5772 auto q
= m
.find("numa_node_cpus");
5773 if (p
!= m
.end() && q
!= m
.end()) {
5780 tbl
<< TextTable::endrow
;
5788 rdata
.append(stringify(tbl
));
5790 } else if (prefix
== "osd map") {
5791 string poolstr
, objstr
, namespacestr
;
5792 cmd_getval(cmdmap
, "pool", poolstr
);
5793 cmd_getval(cmdmap
, "object", objstr
);
5794 cmd_getval(cmdmap
, "nspace", namespacestr
);
5796 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5798 ss
<< "pool " << poolstr
<< " does not exist";
5802 object_locator_t
oloc(pool
, namespacestr
);
5803 object_t
oid(objstr
);
5804 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5805 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5806 vector
<int> up
, acting
;
5808 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5811 if (!namespacestr
.empty())
5812 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5814 fullobjname
= oid
.name
;
5816 f
->open_object_section("osd_map");
5817 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5818 f
->dump_string("pool", poolstr
);
5819 f
->dump_int("pool_id", pool
);
5820 f
->dump_stream("objname") << fullobjname
;
5821 f
->dump_stream("raw_pgid") << pgid
;
5822 f
->dump_stream("pgid") << mpgid
;
5823 f
->open_array_section("up");
5824 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5825 f
->dump_int("osd", *p
);
5827 f
->dump_int("up_primary", up_p
);
5828 f
->open_array_section("acting");
5829 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5830 f
->dump_int("osd", *p
);
5832 f
->dump_int("acting_primary", acting_p
);
5833 f
->close_section(); // osd_map
5836 ds
<< "osdmap e" << osdmap
.get_epoch()
5837 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5838 << " object '" << fullobjname
<< "' ->"
5839 << " pg " << pgid
<< " (" << mpgid
<< ")"
5840 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5841 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5845 } else if (prefix
== "pg map") {
5848 cmd_getval(cmdmap
, "pgid", pgidstr
);
5849 if (!pgid
.parse(pgidstr
.c_str())) {
5850 ss
<< "invalid pgid '" << pgidstr
<< "'";
5854 vector
<int> up
, acting
;
5855 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5856 ss
<< "pg '" << pgidstr
<< "' does not exist";
5860 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5861 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5863 f
->open_object_section("pg_map");
5864 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5865 f
->dump_stream("raw_pgid") << pgid
;
5866 f
->dump_stream("pgid") << mpgid
;
5867 f
->open_array_section("up");
5868 for (auto osd
: up
) {
5869 f
->dump_int("up_osd", osd
);
5872 f
->open_array_section("acting");
5873 for (auto osd
: acting
) {
5874 f
->dump_int("acting_osd", osd
);
5880 ds
<< "osdmap e" << osdmap
.get_epoch()
5881 << " pg " << pgid
<< " (" << mpgid
<< ")"
5882 << " -> up " << up
<< " acting " << acting
;
5887 } else if (prefix
== "osd lspools") {
5889 f
->open_array_section("pools");
5890 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
5891 p
!= osdmap
.pools
.end();
5894 f
->open_object_section("pool");
5895 f
->dump_int("poolnum", p
->first
);
5896 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
5899 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
5900 if (next(p
) != osdmap
.pools
.end()) {
5910 } else if (prefix
== "osd blacklist ls") {
5912 f
->open_array_section("blacklist");
5914 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blacklist
.begin();
5915 p
!= osdmap
.blacklist
.end();
5918 f
->open_object_section("entry");
5919 f
->dump_string("addr", p
->first
.get_legacy_str());
5920 f
->dump_stream("until") << p
->second
;
5925 ss
<< p
->first
<< " " << p
->second
;
5935 ss
<< "listed " << osdmap
.blacklist
.size() << " entries";
5937 } else if (prefix
== "osd pool ls") {
5939 cmd_getval(cmdmap
, "detail", detail
);
5940 if (!f
&& detail
== "detail") {
5942 osdmap
.print_pools(ss
);
5943 rdata
.append(ss
.str());
5946 f
->open_array_section("pools");
5947 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
5948 it
!= osdmap
.get_pools().end();
5951 if (detail
== "detail") {
5952 f
->open_object_section("pool");
5953 f
->dump_int("pool_id", it
->first
);
5954 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5955 it
->second
.dump(f
.get());
5958 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
5961 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
5970 } else if (prefix
== "osd crush get-tunable") {
5972 cmd_getval(cmdmap
, "tunable", tunable
);
5975 f
->open_object_section("tunable");
5976 if (tunable
== "straw_calc_version") {
5978 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
5980 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
5989 rdata
.append(rss
.str());
5993 } else if (prefix
== "osd pool get") {
5995 cmd_getval(cmdmap
, "pool", poolstr
);
5996 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5998 ss
<< "unrecognized pool '" << poolstr
<< "'";
6003 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
6005 cmd_getval(cmdmap
, "var", var
);
6007 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
6008 const choices_map_t ALL_CHOICES
= {
6010 {"min_size", MIN_SIZE
},
6011 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
6012 {"crush_rule", CRUSH_RULE
}, {"hashpspool", HASHPSPOOL
},
6013 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
6014 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
6015 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
6016 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
6017 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
6018 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
6019 {"use_gmt_hitset", USE_GMT_HITSET
},
6020 {"target_max_objects", TARGET_MAX_OBJECTS
},
6021 {"target_max_bytes", TARGET_MAX_BYTES
},
6022 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
6023 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
6024 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
6025 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
6026 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
6027 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
6028 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
6029 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
6030 {"fast_read", FAST_READ
},
6031 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
6032 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
6033 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
6034 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
6035 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
6036 {"recovery_priority", RECOVERY_PRIORITY
},
6037 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
6038 {"scrub_priority", SCRUB_PRIORITY
},
6039 {"compression_mode", COMPRESSION_MODE
},
6040 {"compression_algorithm", COMPRESSION_ALGORITHM
},
6041 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
6042 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
6043 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
6044 {"csum_type", CSUM_TYPE
},
6045 {"csum_max_block", CSUM_MAX_BLOCK
},
6046 {"csum_min_block", CSUM_MIN_BLOCK
},
6047 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
6048 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
6049 {"pg_num_min", PG_NUM_MIN
},
6050 {"target_size_bytes", TARGET_SIZE_BYTES
},
6051 {"target_size_ratio", TARGET_SIZE_RATIO
},
6052 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
6055 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
6057 const choices_set_t ONLY_TIER_CHOICES
= {
6058 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
6059 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
6060 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
6061 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
6062 MIN_READ_RECENCY_FOR_PROMOTE
,
6063 MIN_WRITE_RECENCY_FOR_PROMOTE
,
6064 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
6066 const choices_set_t ONLY_ERASURE_CHOICES
= {
6067 EC_OVERWRITES
, ERASURE_CODE_PROFILE
6070 choices_set_t selected_choices
;
6072 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
6073 it
!= ALL_CHOICES
.end(); ++it
) {
6074 selected_choices
.insert(it
->second
);
6078 selected_choices
= subtract_second_from_first(selected_choices
,
6082 if(!p
->is_erasure()) {
6083 selected_choices
= subtract_second_from_first(selected_choices
,
6084 ONLY_ERASURE_CHOICES
);
6086 } else /* var != "all" */ {
6087 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
6088 osd_pool_get_choices selected
= found
->second
;
6090 if (!p
->is_tier() &&
6091 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
6092 ss
<< "pool '" << poolstr
6093 << "' is not a tier pool: variable not applicable";
6098 if (!p
->is_erasure() &&
6099 ONLY_ERASURE_CHOICES
.find(selected
)
6100 != ONLY_ERASURE_CHOICES
.end()) {
6101 ss
<< "pool '" << poolstr
6102 << "' is not a erasure pool: variable not applicable";
6107 if (pool_opts_t::is_opt_name(var
) &&
6108 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6109 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6114 selected_choices
.insert(selected
);
6118 f
->open_object_section("pool");
6119 f
->dump_string("pool", poolstr
);
6120 f
->dump_int("pool_id", pool
);
6121 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6122 it
!= selected_choices
.end(); ++it
) {
6123 choices_map_t::const_iterator i
;
6124 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6125 if (i
->second
== *it
) {
6129 ceph_assert(i
!= ALL_CHOICES
.end());
6132 f
->dump_int("pg_num", p
->get_pg_num());
6135 f
->dump_int("pgp_num", p
->get_pgp_num());
6138 f
->dump_int("size", p
->get_size());
6141 f
->dump_int("min_size", p
->get_min_size());
6144 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6145 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6146 p
->get_crush_rule()));
6148 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6152 f
->dump_bool("allow_ec_overwrites",
6153 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6155 case PG_AUTOSCALE_MODE
:
6156 f
->dump_string("pg_autoscale_mode",
6157 pg_pool_t::get_pg_autoscale_mode_name(
6158 p
->pg_autoscale_mode
));
6164 case WRITE_FADVISE_DONTNEED
:
6167 f
->dump_bool(i
->first
.c_str(),
6168 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6170 case HIT_SET_PERIOD
:
6171 f
->dump_int("hit_set_period", p
->hit_set_period
);
6174 f
->dump_int("hit_set_count", p
->hit_set_count
);
6177 f
->dump_string("hit_set_type",
6178 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6182 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6183 BloomHitSet::Params
*bloomp
=
6184 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6185 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6186 } else if(var
!= "all") {
6188 ss
<< "hit set is not of type Bloom; " <<
6189 "invalid to get a false positive rate!";
6195 case USE_GMT_HITSET
:
6196 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6198 case TARGET_MAX_OBJECTS
:
6199 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6201 case TARGET_MAX_BYTES
:
6202 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6204 case CACHE_TARGET_DIRTY_RATIO
:
6205 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6206 p
->cache_target_dirty_ratio_micro
);
6207 f
->dump_float("cache_target_dirty_ratio",
6208 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6210 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6211 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6212 p
->cache_target_dirty_high_ratio_micro
);
6213 f
->dump_float("cache_target_dirty_high_ratio",
6214 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6216 case CACHE_TARGET_FULL_RATIO
:
6217 f
->dump_unsigned("cache_target_full_ratio_micro",
6218 p
->cache_target_full_ratio_micro
);
6219 f
->dump_float("cache_target_full_ratio",
6220 ((float)p
->cache_target_full_ratio_micro
/1000000));
6222 case CACHE_MIN_FLUSH_AGE
:
6223 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6225 case CACHE_MIN_EVICT_AGE
:
6226 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6228 case ERASURE_CODE_PROFILE
:
6229 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6231 case MIN_READ_RECENCY_FOR_PROMOTE
:
6232 f
->dump_int("min_read_recency_for_promote",
6233 p
->min_read_recency_for_promote
);
6235 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6236 f
->dump_int("min_write_recency_for_promote",
6237 p
->min_write_recency_for_promote
);
6240 f
->dump_int("fast_read", p
->fast_read
);
6242 case HIT_SET_GRADE_DECAY_RATE
:
6243 f
->dump_int("hit_set_grade_decay_rate",
6244 p
->hit_set_grade_decay_rate
);
6246 case HIT_SET_SEARCH_LAST_N
:
6247 f
->dump_int("hit_set_search_last_n",
6248 p
->hit_set_search_last_n
);
6250 case SCRUB_MIN_INTERVAL
:
6251 case SCRUB_MAX_INTERVAL
:
6252 case DEEP_SCRUB_INTERVAL
:
6253 case RECOVERY_PRIORITY
:
6254 case RECOVERY_OP_PRIORITY
:
6255 case SCRUB_PRIORITY
:
6256 case COMPRESSION_MODE
:
6257 case COMPRESSION_ALGORITHM
:
6258 case COMPRESSION_REQUIRED_RATIO
:
6259 case COMPRESSION_MAX_BLOB_SIZE
:
6260 case COMPRESSION_MIN_BLOB_SIZE
:
6262 case CSUM_MAX_BLOCK
:
6263 case CSUM_MIN_BLOCK
:
6264 case FINGERPRINT_ALGORITHM
:
6266 case TARGET_SIZE_BYTES
:
6267 case TARGET_SIZE_RATIO
:
6268 case PG_AUTOSCALE_BIAS
:
6269 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6270 if (p
->opts
.is_set(key
)) {
6271 if(*it
== CSUM_TYPE
) {
6273 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6274 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6276 p
->opts
.dump(i
->first
, f
.get());
6285 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6286 it
!= selected_choices
.end(); ++it
) {
6287 choices_map_t::const_iterator i
;
6290 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6293 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6296 ss
<< "size: " << p
->get_size() << "\n";
6299 ss
<< "min_size: " << p
->get_min_size() << "\n";
6302 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6303 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6304 p
->get_crush_rule()) << "\n";
6306 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6309 case PG_AUTOSCALE_MODE
:
6310 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6311 p
->pg_autoscale_mode
) <<"\n";
6313 case HIT_SET_PERIOD
:
6314 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6317 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6320 ss
<< "hit_set_type: " <<
6321 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6325 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6326 BloomHitSet::Params
*bloomp
=
6327 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6328 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6329 } else if(var
!= "all") {
6330 ss
<< "hit set is not of type Bloom; " <<
6331 "invalid to get a false positive rate!";
6337 case USE_GMT_HITSET
:
6338 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6340 case TARGET_MAX_OBJECTS
:
6341 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6343 case TARGET_MAX_BYTES
:
6344 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6346 case CACHE_TARGET_DIRTY_RATIO
:
6347 ss
<< "cache_target_dirty_ratio: "
6348 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6350 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6351 ss
<< "cache_target_dirty_high_ratio: "
6352 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6354 case CACHE_TARGET_FULL_RATIO
:
6355 ss
<< "cache_target_full_ratio: "
6356 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6358 case CACHE_MIN_FLUSH_AGE
:
6359 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6361 case CACHE_MIN_EVICT_AGE
:
6362 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6364 case ERASURE_CODE_PROFILE
:
6365 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6367 case MIN_READ_RECENCY_FOR_PROMOTE
:
6368 ss
<< "min_read_recency_for_promote: " <<
6369 p
->min_read_recency_for_promote
<< "\n";
6371 case HIT_SET_GRADE_DECAY_RATE
:
6372 ss
<< "hit_set_grade_decay_rate: " <<
6373 p
->hit_set_grade_decay_rate
<< "\n";
6375 case HIT_SET_SEARCH_LAST_N
:
6376 ss
<< "hit_set_search_last_n: " <<
6377 p
->hit_set_search_last_n
<< "\n";
6380 ss
<< "allow_ec_overwrites: " <<
6381 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6388 case WRITE_FADVISE_DONTNEED
:
6391 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6392 if (i
->second
== *it
)
6395 ceph_assert(i
!= ALL_CHOICES
.end());
6396 ss
<< i
->first
<< ": " <<
6397 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6398 "true" : "false") << "\n";
6400 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6401 ss
<< "min_write_recency_for_promote: " <<
6402 p
->min_write_recency_for_promote
<< "\n";
6405 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6407 case SCRUB_MIN_INTERVAL
:
6408 case SCRUB_MAX_INTERVAL
:
6409 case DEEP_SCRUB_INTERVAL
:
6410 case RECOVERY_PRIORITY
:
6411 case RECOVERY_OP_PRIORITY
:
6412 case SCRUB_PRIORITY
:
6413 case COMPRESSION_MODE
:
6414 case COMPRESSION_ALGORITHM
:
6415 case COMPRESSION_REQUIRED_RATIO
:
6416 case COMPRESSION_MAX_BLOB_SIZE
:
6417 case COMPRESSION_MIN_BLOB_SIZE
:
6419 case CSUM_MAX_BLOCK
:
6420 case CSUM_MIN_BLOCK
:
6421 case FINGERPRINT_ALGORITHM
:
6423 case TARGET_SIZE_BYTES
:
6424 case TARGET_SIZE_RATIO
:
6425 case PG_AUTOSCALE_BIAS
:
6426 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6427 if (i
->second
== *it
)
6430 ceph_assert(i
!= ALL_CHOICES
.end());
6432 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6433 if (p
->opts
.is_set(key
)) {
6434 if(key
== pool_opts_t::CSUM_TYPE
) {
6436 p
->opts
.get(key
, &val
);
6437 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6439 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6445 rdata
.append(ss
.str());
6450 } else if (prefix
== "osd pool get-quota") {
6452 cmd_getval(cmdmap
, "pool", pool_name
);
6454 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6456 ceph_assert(poolid
== -ENOENT
);
6457 ss
<< "unrecognized pool '" << pool_name
<< "'";
6461 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6462 const pool_stat_t
* pstat
= mon
->mgrstatmon()->get_pool_stat(poolid
);
6463 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6465 f
->open_object_section("pool_quotas");
6466 f
->dump_string("pool_name", pool_name
);
6467 f
->dump_unsigned("pool_id", poolid
);
6468 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6469 f
->dump_int("current_num_objects", sum
.num_objects
);
6470 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6471 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6476 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6477 << " max objects: ";
6478 if (p
->quota_max_objects
== 0)
6481 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6482 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6486 if (p
->quota_max_bytes
== 0)
6489 rs
<< byte_u_t(p
->quota_max_bytes
);
6490 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6492 rdata
.append(rs
.str());
6496 } else if (prefix
== "osd crush rule list" ||
6497 prefix
== "osd crush rule ls") {
6499 f
->open_array_section("rules");
6500 osdmap
.crush
->list_rules(f
.get());
6505 osdmap
.crush
->list_rules(&ss
);
6506 rdata
.append(ss
.str());
6508 } else if (prefix
== "osd crush rule ls-by-class") {
6510 cmd_getval(cmdmap
, "class", class_name
);
6511 if (class_name
.empty()) {
6512 ss
<< "no class specified";
6517 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6519 ss
<< "failed to get rules by class '" << class_name
<< "'";
6523 f
->open_array_section("rules");
6524 for (auto &rule
: rules
) {
6525 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6531 for (auto &rule
: rules
) {
6532 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6534 rdata
.append(rs
.str());
6536 } else if (prefix
== "osd crush rule dump") {
6538 cmd_getval(cmdmap
, "name", name
);
6540 cmd_getval(cmdmap
, "format", format
);
6541 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6543 f
->open_array_section("rules");
6544 osdmap
.crush
->dump_rules(f
.get());
6547 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6549 ss
<< "unknown crush rule '" << name
<< "'";
6553 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6558 rdata
.append(rs
.str());
6559 } else if (prefix
== "osd crush dump") {
6561 cmd_getval(cmdmap
, "format", format
);
6562 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6563 f
->open_object_section("crush_map");
6564 osdmap
.crush
->dump(f
.get());
6569 rdata
.append(rs
.str());
6570 } else if (prefix
== "osd crush show-tunables") {
6572 cmd_getval(cmdmap
, "format", format
);
6573 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6574 f
->open_object_section("crush_map_tunables");
6575 osdmap
.crush
->dump_tunables(f
.get());
6580 rdata
.append(rs
.str());
6581 } else if (prefix
== "osd crush tree") {
6583 cmd_getval(cmdmap
, "shadow", shadow
);
6584 bool show_shadow
= shadow
== "--show-shadow";
6585 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6587 f
->open_object_section("crush_tree");
6588 osdmap
.crush
->dump_tree(nullptr,
6590 osdmap
.get_pool_names(),
6596 osdmap
.crush
->dump_tree(&ss
,
6598 osdmap
.get_pool_names(),
6600 rdata
.append(ss
.str());
6602 } else if (prefix
== "osd crush ls") {
6604 if (!cmd_getval(cmdmap
, "node", name
)) {
6605 ss
<< "no node specified";
6609 if (!osdmap
.crush
->name_exists(name
)) {
6610 ss
<< "node '" << name
<< "' does not exist";
6614 int id
= osdmap
.crush
->get_item_id(name
);
6617 result
.push_back(id
);
6619 int num
= osdmap
.crush
->get_bucket_size(id
);
6620 for (int i
= 0; i
< num
; ++i
) {
6621 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6625 f
->open_array_section("items");
6626 for (auto i
: result
) {
6627 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6633 for (auto i
: result
) {
6634 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6636 rdata
.append(ss
.str());
6639 } else if (prefix
== "osd crush class ls") {
6640 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6641 f
->open_array_section("crush_classes");
6642 for (auto i
: osdmap
.crush
->class_name
)
6643 f
->dump_string("class", i
.second
);
6646 } else if (prefix
== "osd crush class ls-osd") {
6648 cmd_getval(cmdmap
, "class", name
);
6650 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6652 f
->open_array_section("osds");
6653 for (auto &osd
: osds
)
6654 f
->dump_int("osd", osd
);
6659 for (auto &osd
: osds
) {
6667 } else if (prefix
== "osd crush get-device-class") {
6668 vector
<string
> idvec
;
6669 cmd_getval(cmdmap
, "ids", idvec
);
6670 map
<int, string
> class_by_osd
;
6671 for (auto& id
: idvec
) {
6673 long osd
= parse_osd_id(id
.c_str(), &ts
);
6675 ss
<< "unable to parse osd id:'" << id
<< "'";
6679 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6681 class_by_osd
[osd
] = device_class
;
6683 class_by_osd
[osd
] = ""; // no class
6686 f
->open_array_section("osd_device_classes");
6687 for (auto& i
: class_by_osd
) {
6688 f
->open_object_section("osd_device_class");
6689 f
->dump_int("osd", i
.first
);
6690 f
->dump_string("device_class", i
.second
);
6696 if (class_by_osd
.size() == 1) {
6697 // for single input, make a clean output
6698 ds
<< class_by_osd
.begin()->second
;
6700 // note that we do not group osds by class here
6701 for (auto it
= class_by_osd
.begin();
6702 it
!= class_by_osd
.end();
6704 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6705 if (next(it
) != class_by_osd
.end())
6711 } else if (prefix
== "osd erasure-code-profile ls") {
6712 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6714 f
->open_array_section("erasure-code-profiles");
6715 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6717 f
->dump_string("profile", i
->first
.c_str());
6719 rdata
.append(i
->first
+ "\n");
6726 rdata
.append(rs
.str());
6728 } else if (prefix
== "osd crush weight-set ls") {
6729 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6731 f
->open_array_section("weight_sets");
6732 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6733 f
->dump_string("pool", "(compat)");
6735 for (auto& i
: osdmap
.crush
->choose_args
) {
6737 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6744 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6747 for (auto& i
: osdmap
.crush
->choose_args
) {
6749 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6752 rdata
.append(rs
.str());
6754 } else if (prefix
== "osd crush weight-set dump") {
6755 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6757 osdmap
.crush
->dump_choose_args(f
.get());
6759 } else if (prefix
== "osd erasure-code-profile get") {
6761 cmd_getval(cmdmap
, "name", name
);
6762 if (!osdmap
.has_erasure_code_profile(name
)) {
6763 ss
<< "unknown erasure code profile '" << name
<< "'";
6767 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6769 f
->open_object_section("profile");
6770 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6774 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6776 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6783 rdata
.append(rs
.str());
6785 } else if (prefix
== "osd pool application get") {
6786 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6789 cmd_getval(cmdmap
, "pool", pool_name
);
6791 cmd_getval(cmdmap
, "app", app
);
6793 cmd_getval(cmdmap
, "key", key
);
6795 if (pool_name
.empty()) {
6797 f
->open_object_section("pools");
6798 for (const auto &pool
: osdmap
.pools
) {
6799 std::string
name("<unknown>");
6800 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6801 if (pni
!= osdmap
.pool_name
.end())
6803 f
->open_object_section(name
.c_str());
6804 for (auto &app_pair
: pool
.second
.application_metadata
) {
6805 f
->open_object_section(app_pair
.first
.c_str());
6806 for (auto &kv_pair
: app_pair
.second
) {
6807 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6811 f
->close_section(); // name
6813 f
->close_section(); // pools
6816 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6818 ss
<< "unrecognized pool '" << pool_name
<< "'";
6822 auto p
= osdmap
.get_pg_pool(pool
);
6825 f
->open_object_section(pool_name
.c_str());
6826 for (auto &app_pair
: p
->application_metadata
) {
6827 f
->open_object_section(app_pair
.first
.c_str());
6828 for (auto &kv_pair
: app_pair
.second
) {
6829 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6831 f
->close_section(); // application
6833 f
->close_section(); // pool_name
6838 auto app_it
= p
->application_metadata
.find(app
);
6839 if (app_it
== p
->application_metadata
.end()) {
6840 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
6844 // filter by pool + app
6846 f
->open_object_section(app_it
->first
.c_str());
6847 for (auto &kv_pair
: app_it
->second
) {
6848 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6850 f
->close_section(); // application
6854 // filter by pool + app + key
6855 auto key_it
= app_it
->second
.find(key
);
6856 if (key_it
== app_it
->second
.end()) {
6857 ss
<< "application '" << app
<< "' on pool '" << pool_name
6858 << "' does not have key '" << key
<< "'";
6862 ss
<< key_it
->second
<< "\n";
6863 rdata
.append(ss
.str());
6866 } else if (prefix
== "osd get-require-min-compat-client") {
6867 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
6868 rdata
.append(ss
.str());
6871 } else if (prefix
== "osd pool application enable" ||
6872 prefix
== "osd pool application disable" ||
6873 prefix
== "osd pool application set" ||
6874 prefix
== "osd pool application rm") {
6875 bool changed
= false;
6876 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
6880 } else if (changed
) {
6881 // Valid mutation, proceed to prepare phase
6884 // Idempotent case, reply
6888 // try prepare update
6895 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
6899 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
6901 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6902 osdmap
.get_pg_pool(pool_id
));
6904 pool
->set_flag(flags
);
6907 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
6909 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6910 osdmap
.get_pg_pool(pool_id
));
6912 pool
->unset_flag(flags
);
// Build the MonitorDBStore key naming the purged-snap record for one
// epoch: "purged_epoch_" + zero-padded hex epoch.
// NOTE(review): the declaration of buffer 'k' and the return statement
// (original lines around 6916-6921) are elided from this excerpt.
6915 string
OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch
)
6918 snprintf(k
, sizeof(k
), "purged_epoch_%08lx", (unsigned long)epoch
);
// Build the MonitorDBStore key for a purged-snap interval record:
// "purged_snap_<pool>_<snapid as 16 hex digits>".  The fixed-width hex
// snapid keeps keys lexicographically ordered by snap within a pool.
// NOTE(review): the buffer declaration and return are elided here.
6922 string
OSDMonitor::make_purged_snap_key(int64_t pool
, snapid_t snap
)
6925 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
6926 (unsigned long long)pool
, (unsigned long long)snap
);
// Encode a purged-snap interval [snap, snap+num) into *v and return the
// store key it should be written under (keyed by the *last* snap of the
// interval — see comment below).
// NOTE(review): earlier encode() calls for this record are elided from
// this excerpt; only the final encode of the interval end is visible.
6930 string
OSDMonitor::make_purged_snap_key_value(
6931 int64_t pool
, snapid_t snap
, snapid_t num
,
6932 epoch_t epoch
, bufferlist
*v
)
6934 // encode the *last* epoch in the key so that we can use forward
6935 // iteration only to search for an epoch in an interval.
6937 encode(snap
+ num
, *v
);
6939 return make_purged_snap_key(pool
, snap
+ num
- 1);
// Look up the purged-snap interval containing 'snap' for 'pool'.
// On success *begin/*end are set to the stored interval.  Uses a
// forward store iterator positioned at the key for 'snap' and then
// validates prefix, pool, and overlap.
// NOTE(review): several early-return statements and the iterator seek
// call are elided from this excerpt.
6943 int OSDMonitor::lookup_purged_snap(
6944 int64_t pool
, snapid_t snap
,
6945 snapid_t
*begin
, snapid_t
*end
)
6947 string k
= make_purged_snap_key(pool
, snap
);
6948 auto it
= mon
->store
->get_iterator(OSD_SNAP_PREFIX
);
// No record at or after the requested key: nothing purged here.
6951 dout(20) << __func__
6952 << " pool " << pool
<< " snap " << snap
6953 << " - key '" << k
<< "' not found" << dendl
;
// Iterator landed on a key from a different record family.
6956 if (it
->key().find("purged_snap_") != 0) {
6957 dout(20) << __func__
6958 << " pool " << pool
<< " snap " << snap
6959 << " - key '" << k
<< "' got '" << it
->key()
6960 << "', wrong prefix" << dendl
;
// Parse the pool id back out of the key we actually landed on.
6963 string gotk
= it
->key();
6964 const char *format
= "purged_snap_%llu_";
6965 long long int keypool
;
6966 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
6968 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
// Record belongs to a different pool: treat as not found.
6971 if (pool
!= keypool
) {
6972 dout(20) << __func__
6973 << " pool " << pool
<< " snap " << snap
6974 << " - key '" << k
<< "' got '" << gotk
6975 << "', wrong pool " << keypool
// Decode the stored [begin,end) interval and check it covers 'snap'.
6979 bufferlist v
= it
->value();
6980 auto p
= v
.cbegin();
6983 if (snap
< *begin
|| snap
>= *end
) {
6984 dout(20) << __func__
6985 << " pool " << pool
<< " snap " << snap
6986 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
// Record a newly purged snap interval [start,end) for a pool in the
// store transaction, coalescing it with any adjacent intervals that
// were already recorded (earlier neighbor, later neighbor, both, or
// neither).  The branch conditions selecting among the four cases are
// elided from this excerpt (original lines around 7005/7017/7027/7037).
6993 void OSDMonitor::insert_purged_snap_update(
6995 snapid_t start
, snapid_t end
,
6997 MonitorDBStore::TransactionRef t
)
6999 snapid_t before_begin
, before_end
;
7000 snapid_t after_begin
, after_end
;
// Probe the intervals immediately before 'start' and at 'end'.
7001 int b
= lookup_purged_snap(pool
, start
- 1,
7002 &before_begin
, &before_end
);
7003 int a
= lookup_purged_snap(pool
, end
,
7004 &after_begin
, &after_end
);
// Case: joins both an earlier and a later interval into one.
7006 dout(10) << __func__
7007 << " [" << start
<< "," << end
<< ") - joins ["
7008 << before_begin
<< "," << before_end
<< ") and ["
7009 << after_begin
<< "," << after_end
<< ")" << dendl
;
7010 // erase only the begin record; we'll overwrite the end one.
7011 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7013 string k
= make_purged_snap_key_value(pool
,
7014 before_begin
, after_end
- before_begin
,
7015 pending_inc
.epoch
, &v
);
7016 t
->put(OSD_SNAP_PREFIX
, k
, v
);
// Case: joins only the earlier interval; extend it through 'end'.
7018 dout(10) << __func__
7019 << " [" << start
<< "," << end
<< ") - join with earlier ["
7020 << before_begin
<< "," << before_end
<< ")" << dendl
;
7021 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7023 string k
= make_purged_snap_key_value(pool
,
7024 before_begin
, end
- before_begin
,
7025 pending_inc
.epoch
, &v
);
7026 t
->put(OSD_SNAP_PREFIX
, k
, v
);
// Case: joins only the later interval; extend it back to 'start'.
7028 dout(10) << __func__
7029 << " [" << start
<< "," << end
<< ") - join with later ["
7030 << after_begin
<< "," << after_end
<< ")" << dendl
;
7031 // overwrite after record
7033 string k
= make_purged_snap_key_value(pool
,
7034 start
, after_end
- start
,
7035 pending_inc
.epoch
, &v
);
7036 t
->put(OSD_SNAP_PREFIX
, k
, v
);
// Case: no neighbors; write a fresh stand-alone interval record.
7038 dout(10) << __func__
7039 << " [" << start
<< "," << end
<< ") - new"
7042 string k
= make_purged_snap_key_value(pool
,
7044 pending_inc
.epoch
, &v
);
7045 t
->put(OSD_SNAP_PREFIX
, k
, v
);
// Prune snaps that the OSDs (via the mgr digest) report as purged from
// the osdmap's removed_snaps_queue, up to a per-epoch budget.  Returns
// true iff anything was actually pruned this call.  Several early
// returns and loop-control statements are elided from this excerpt.
7049 bool OSDMonitor::try_prune_purged_snaps()
// Need a readable mgr stat digest to trust the purged-snap reports.
7051 if (!mon
->mgrstatmon()->is_readable()) {
7054 if (!pending_inc
.new_purged_snaps
.empty()) {
7055 return false; // we already pruned for this epoch
// Budget: cap how much we prune per epoch to bound mon work.
7058 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
7059 "mon_max_snap_prune_per_epoch");
7063 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
7065 unsigned actually_pruned
= 0;
7066 auto& purged_snaps
= mon
->mgrstatmon()->get_digest().purged_snaps
;
// Walk every pool and compare its reported purged set with what we
// have already recorded as purged.
7067 for (auto& p
: osdmap
.get_pools()) {
7068 auto q
= purged_snaps
.find(p
.first
);
7069 if (q
== purged_snaps
.end()) {
7072 auto& purged
= q
->second
;
7073 if (purged
.empty()) {
7074 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
7077 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
7078 snap_interval_set_t to_prune
;
7079 unsigned maybe_pruned
= actually_pruned
;
7080 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
7081 snapid_t begin
= i
.get_start();
7082 auto end
= i
.get_start() + i
.get_len();
7083 snapid_t pbegin
= 0, pend
= 0;
7084 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
7087 // be a bit aggressive about backing off here, because the mon may
7088 // do a lot of work going through this set, and if we know the
7089 // purged set from the OSDs is at least *partly* stale we may as
7090 // well wait for it to be fresh.
7091 dout(20) << __func__
<< " we've already purged " << pbegin
7092 << "~" << (pend
- pbegin
) << dendl
;
7095 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
7096 // the tail of [begin,end) is purged; shorten the range
7099 to_prune
.insert(begin
, end
- begin
);
7100 maybe_pruned
+= end
- begin
;
7101 if (maybe_pruned
>= max_prune
) {
7105 if (!to_prune
.empty()) {
7106 // PGs may still be reporting things as purged that we have already
7107 // pruned from removed_snaps_queue.
7108 snap_interval_set_t actual
;
7109 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7110 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7111 actual
.intersection_of(to_prune
, r
->second
);
7113 actually_pruned
+= actual
.size();
7114 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7115 << ", actual pruned " << actual
<< dendl
;
7116 if (!actual
.empty()) {
7117 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7120 if (actually_pruned
>= max_prune
) {
7124 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7125 return !!actually_pruned
;
// Re-evaluate every pool's quota state against the latest mgr pool
// stats, setting/clearing FLAG_FULL_QUOTA (and FLAG_FULL) in the
// pending map as pools cross their max_bytes/max_objects quotas.
// Several guard/return statements and the 'ret' accumulation are elided
// from this excerpt.
7128 bool OSDMonitor::update_pools_status()
// Bail out until mgr pool statistics are readable.
7130 if (!mon
->mgrstatmon()->is_readable())
7135 auto& pools
= osdmap
.get_pools();
7136 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7137 const pool_stat_t
*pstat
= mon
->mgrstatmon()->get_pool_stat(it
->first
);
7140 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7141 const pg_pool_t
&pool
= it
->second
;
7142 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
// A pool is over quota when either byte or object quota is reached.
7145 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7146 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
// Previously marked full-by-quota but no longer over quota: clear.
7148 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7152 mon
->clog
->info() << "pool '" << pool_name
7153 << "' no longer out of quota; removing NO_QUOTA flag";
7154 // below we cancel FLAG_FULL too, we'll set it again in
7155 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7156 clear_pool_flags(it
->first
,
7157 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
// Newly over quota: log which quota tripped, then flag the pool.
7163 if (pool
.quota_max_bytes
> 0 &&
7164 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7165 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
7166 << " (reached quota's max_bytes: "
7167 << byte_u_t(pool
.quota_max_bytes
) << ")";
7169 if (pool
.quota_max_objects
> 0 &&
7170 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7171 mon
->clog
->warn() << "pool '" << pool_name
<< "' is full"
7172 << " (reached quota's max_objects: "
7173 << pool
.quota_max_objects
<< ")";
7175 // set both FLAG_FULL_QUOTA and FLAG_FULL
7176 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7177 // since FLAG_FULL should always take precedence
7178 set_pool_flags(it
->first
,
7179 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7180 clear_pool_flags(it
->first
,
7181 pg_pool_t::FLAG_NEARFULL
|
7182 pg_pool_t::FLAG_BACKFILLFULL
);
// Handle an MPoolOp pool-create request: extract the request fields and
// delegate to the full prepare_new_pool() overload with replicated-pool
// defaults.  Session/cap checks and several argument lines are elided
// from this excerpt.
7189 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7191 op
->mark_osdmon_event(__func__
);
7192 auto m
= op
->get_req
<MPoolOp
>();
7193 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7194 MonSession
*session
= op
->get_session();
7197 string erasure_code_profile
;
// Delegate: replicated type, default sizes, fast_read off.
7201 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7203 erasure_code_profile
,
7204 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {},
7208 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
// Rename a CRUSH bucket from srcname to dstname in the pending crush
// map.  Error-return statements after each check are elided from this
// excerpt.
7213 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7214 const string
& dstname
,
7219 // Avoid creating a pending crush if it does not already exists and
7220 // the rename would fail.
7222 if (!_have_pending_crush()) {
7223 ret
= _get_stable_crush().can_rename_bucket(srcname
,
// Perform the rename on a working copy of the pending crush map.
7230 CrushWrapper newcrush
;
7231 _get_pending_crush(newcrush
);
7233 ret
= newcrush
.rename_bucket(srcname
,
// Re-encode the modified crush map into the pending incremental.
7239 pending_inc
.crush
.clear();
7240 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
7241 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7245 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7247 string replacement
= "";
7249 if (plugin
== "jerasure_generic" ||
7250 plugin
== "jerasure_sse3" ||
7251 plugin
== "jerasure_sse4" ||
7252 plugin
== "jerasure_neon") {
7253 replacement
= "jerasure";
7254 } else if (plugin
== "shec_generic" ||
7255 plugin
== "shec_sse3" ||
7256 plugin
== "shec_sse4" ||
7257 plugin
== "shec_neon") {
7258 replacement
= "shec";
7261 if (replacement
!= "") {
7262 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7263 << plugin
<< " that has been deprecated. Please use "
7264 << replacement
<< " instead." << dendl
;
// Validate and normalize an erasure-code profile: instantiate the
// plugin, run its init() to canonicalize parameters, and sanity-check
// any stripe_unit against the profile's chunk alignment.  Error-return
// statements after each failure message are elided from this excerpt.
7268 int OSDMonitor::normalize_profile(const string
& profilename
,
7269 ErasureCodeProfile
&profile
,
7273 ErasureCodeInterfaceRef erasure_code
;
7274 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7275 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7276 check_legacy_ec_plugin(plugin
->second
, profilename
);
// Instantiate the plugin so init() can validate/normalize the profile.
7277 int err
= instance
.factory(plugin
->second
,
7278 g_conf().get_val
<std::string
>("erasure_code_dir"),
7279 profile
, &erasure_code
, ss
);
7284 err
= erasure_code
->init(profile
, ss
);
// If a stripe_unit was supplied, it must parse and match the plugin's
// chunk alignment.
7289 auto it
= profile
.find("stripe_unit");
7290 if (it
!= profile
.end()) {
7292 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7293 if (!err_str
.empty()) {
7294 *ss
<< "could not parse stripe_unit '" << it
->second
7295 << "': " << err_str
<< std::endl
;
7298 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7299 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7300 if (chunk_size
!= stripe_unit
) {
7301 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7302 << "alignment. Would be padded to " << chunk_size
// Non-4K-aligned stripe_unit is allowed only with --force.
7306 if ((stripe_unit
% 4096) != 0 && !force
) {
7307 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7308 << "use --force to override this check" << std::endl
;
// Find or create a CRUSH rule for an erasure-coded pool using the given
// profile.  Checks the committed map, then the pending map, and only
// then asks the EC plugin to create a new rule in the pending crush.
// Return statements after each branch are elided from this excerpt.
7315 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7316 const string
&profile
,
// Rule already exists in the committed map: reuse it.
7320 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7321 if (ruleid
!= -ENOENT
) {
7322 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
// Rule already staged in the pending map: reuse that.
7326 CrushWrapper newcrush
;
7327 _get_pending_crush(newcrush
);
7329 ruleid
= newcrush
.get_rule_id(name
);
7330 if (ruleid
!= -ENOENT
) {
7331 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
// Otherwise have the EC plugin synthesize a rule for this profile.
7334 ErasureCodeInterfaceRef erasure_code
;
7335 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7337 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7341 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7342 erasure_code
.reset();
// Stage the updated crush map in the pending incremental.
7346 pending_inc
.crush
.clear();
7347 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
// Instantiate the erasure-code plugin named by a stored profile.
// The early return (EAGAIN while the profile is still pending) is
// elided from this excerpt.
7352 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7353 ErasureCodeInterfaceRef
*erasure_code
,
// Profile still pending: caller must retry after it commits.
7356 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7358 ErasureCodeProfile profile
=
7359 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7360 ErasureCodeProfile::const_iterator plugin
=
7361 profile
.find("plugin");
7362 if (plugin
== profile
.end()) {
7363 *ss
<< "cannot determine the erasure code plugin"
7364 << " because there is no 'plugin' entry in the erasure_code_profile "
7365 << profile
<< std::endl
;
7368 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7369 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7370 return instance
.factory(plugin
->second
,
7371 g_conf().get_val
<std::string
>("erasure_code_dir"),
7372 profile
, erasure_code
, ss
);
// Verify that every participant (mon quorum, up OSDs, and OSDs with
// pending xinfo) supports the given feature bits; on failure 'ss'
// names the offenders.  Return statements are elided from this excerpt.
7375 int OSDMonitor::check_cluster_features(uint64_t features
,
7378 stringstream unsupported_ss
;
7379 int unsupported_count
= 0;
// The mon quorum itself must support the features.
7380 if ((mon
->get_quorum_con_features() & features
) != features
) {
7381 unsupported_ss
<< "the monitor cluster";
7382 ++unsupported_count
;
// Every up OSD's advertised feature set is checked too.
7385 set
<int32_t> up_osds
;
7386 osdmap
.get_up_osds(up_osds
);
7387 for (set
<int32_t>::iterator it
= up_osds
.begin();
7388 it
!= up_osds
.end(); ++it
) {
7389 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7390 if ((xi
.features
& features
) != features
) {
7391 if (unsupported_count
> 0)
7392 unsupported_ss
<< ", ";
7393 unsupported_ss
<< "osd." << *it
;
7394 unsupported_count
++;
7398 if (unsupported_count
> 0) {
7399 ss
<< "features " << features
<< " unsupported by: "
7400 << unsupported_ss
.str();
7404 // check pending osd state, too!
7405 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7406 pending_inc
.new_xinfo
.begin();
7407 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7408 const osd_xinfo_t
&xi
= p
->second
;
7409 if ((xi
.features
& features
) != features
) {
7410 dout(10) << __func__
<< " pending osd." << p
->first
7411 << " features are insufficient; retry" << dendl
;
// Check whether applying 'newcrush' would require features beyond what
// the cluster (and require_min_compat_client) allows, by building a
// trial map with the new crush applied.  Return statements are elided
// from this excerpt.
7419 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
// Build a throwaway map: pending incremental + the candidate crush.
7422 OSDMap::Incremental new_pending
= pending_inc
;
7423 encode(*newcrush
, new_pending
.crush
, mon
->get_quorum_con_features());
7425 newmap
.deepish_copy_from(osdmap
);
7426 newmap
.apply_incremental(new_pending
);
// Client compatibility: the new crush must not raise the minimum
// client version above the configured require_min_compat_client.
7429 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7430 auto mv
= newmap
.get_min_compat_client();
7431 if (mv
> newmap
.require_min_compat_client
) {
7432 ss
<< "new crush map requires client version " << mv
7433 << " but require_min_compat_client is "
7434 << newmap
.require_min_compat_client
;
// Daemon compatibility: mons and OSDs must support the new features.
7441 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7442 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7443 stringstream features_ss
;
7444 int r
= check_cluster_features(features
, features_ss
);
7446 ss
<< "Could not change CRUSH: " << features_ss
.str();
// Return whether any erasure-coded pool in 'pools' uses 'profile';
// pool names are accumulated into 'ss' along the way.  Loop control
// and the final return are elided from this excerpt.
7453 bool OSDMonitor::erasure_code_profile_in_use(
7454 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7455 const string
&profile
,
7459 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
// Only erasure-coded pools referencing this profile count as users.
7462 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7463 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7468 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
// Parse "key=value" tokens from a user-supplied erasure-code profile
// on top of the configured default profile.  If the user overrides the
// plugin, the defaults are discarded and only the user's map is kept.
// Several return statements are elided from this excerpt.
7473 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7474 map
<string
,string
> *erasure_code_profile_map
,
// Seed the result with the configured default profile.
7477 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7480 erasure_code_profile_map
,
7484 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7485 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7486 map
<string
,string
> user_map
;
7487 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7488 i
!= erasure_code_profile
.end();
// Bare token (no '='): record it with an empty value.
7490 size_t equal
= i
->find('=');
7491 if (equal
== string::npos
) {
7492 user_map
[*i
] = string();
7493 (*erasure_code_profile_map
)[*i
] = string();
7495 const string key
= i
->substr(0, equal
);
7497 const string value
= i
->substr(equal
);
// Old "ruleset-*" keys were renamed to "crush-*"; reject with a hint.
7498 if (key
.find("ruleset-") == 0) {
7499 *ss
<< "property '" << key
<< "' is no longer supported; try "
7500 << "'crush-" << key
.substr(8) << "' instead";
7503 user_map
[key
] = value
;
7504 (*erasure_code_profile_map
)[key
] = value
;
// A different plugin invalidates the defaults: keep only user values.
7508 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7509 (*erasure_code_profile_map
) = user_map
;
// Compute *size and *min_size for a new pool: from the replication
// factor (or configured default) for replicated pools, or from the EC
// profile's chunk counts for erasure pools.  Error/return statements
// are elided from this excerpt.
7514 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7515 const string
&erasure_code_profile
,
7517 unsigned *size
, unsigned *min_size
,
7521 switch (pool_type
) {
7522 case pg_pool_t::TYPE_REPLICATED
:
7523 if (repl_size
== 0) {
7524 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7527 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7529 case pg_pool_t::TYPE_ERASURE
:
// EC pools: size = k+m chunks; min_size derived from data chunks.
7531 ErasureCodeInterfaceRef erasure_code
;
7532 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7534 *size
= erasure_code
->get_chunk_count();
7536 erasure_code
->get_data_chunk_count() +
7537 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7538 assert(*min_size
<= *size
);
7539 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7544 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
// Compute *stripe_width for a new pool: 0 for replicated pools
// (handled in the elided TYPE_REPLICATED branch), or
// data_chunks * chunk_size for erasure pools, honoring a per-profile
// stripe_unit override.  Return statements are elided from this excerpt.
7551 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7552 const string
&erasure_code_profile
,
7553 uint32_t *stripe_width
,
7557 switch (pool_type
) {
7558 case pg_pool_t::TYPE_REPLICATED
:
7561 case pg_pool_t::TYPE_ERASURE
:
7563 ErasureCodeProfile profile
=
7564 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7565 ErasureCodeInterfaceRef erasure_code
;
7566 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
// Start from the configured default stripe_unit; a profile-level
// stripe_unit (already validated by normalize_profile) overrides it.
7569 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7570 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7571 auto it
= profile
.find("stripe_unit");
7572 if (it
!= profile
.end()) {
7574 stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7575 ceph_assert(err_str
.empty());
7577 *stripe_width
= data_chunks
*
7578 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7582 *ss
<< "prepare_pool_stripe_width: "
7583 << pool_type
<< " is not a known pool type";
// Resolve *crush_rule for a new pool.  If the caller did not supply a
// rule id (<0): replicated pools use the named rule or the configured
// default; erasure pools get a rule created from the EC profile
// (returning EAGAIN-style retry until it is proposed).  Several
// return statements are elided from this excerpt.
7590 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7591 const string
&erasure_code_profile
,
7592 const string
&rule_name
,
7597 if (*crush_rule
< 0) {
7598 switch (pool_type
) {
7599 case pg_pool_t::TYPE_REPLICATED
:
7601 if (rule_name
== "") {
7603 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
7604 if (*crush_rule
< 0) {
7605 // Errors may happen e.g. if no valid rule is available
7606 *ss
<< "No suitable CRUSH rule exists, check "
7607 << "'osd pool default crush *' config options";
7611 return get_crush_rule(rule_name
, crush_rule
, ss
);
7615 case pg_pool_t::TYPE_ERASURE
:
7617 int err
= crush_rule_create_erasure(rule_name
,
7618 erasure_code_profile
,
7622 dout(20) << "prepare_pool_crush_rule: rule "
7623 << rule_name
<< " try again" << dendl
;
7626 // need to wait for the crush rule to be proposed before proceeding
7637 *ss
<< "prepare_pool_crush_rule: " << pool_type
7638 << " is not a known pool type";
// Whatever rule id was resolved must exist in the committed crush map.
7643 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
7644 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
// Look up a CRUSH rule by name: first in the committed map, then in
// the pending map (in which case the caller must retry after the
// proposal commits).  Assignment/return statements are elided from
// this excerpt.
7652 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7657 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7658 if (ret
!= -ENOENT
) {
// Not committed yet — check the pending crush map.
7662 CrushWrapper newcrush
;
7663 _get_pending_crush(newcrush
);
7665 ret
= newcrush
.get_rule_id(rule_name
);
7666 if (ret
!= -ENOENT
) {
7667 // found it, wait for it to be proposed
7668 dout(20) << __func__
<< ": rule " << rule_name
7669 << " try again" << dendl
;
7672 // Cannot find it, return error
7673 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
// Reject a pg_num/size change that would push the projected cluster
// total of PG replicas past mon_max_pg_per_osd * num_in_osds.
// The branch conditions around the projection (which pool contributes
// the new pg_num) and the return statements are elided from this
// excerpt.
7680 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
7682 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7683 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
7684 auto max_pgs
= max_pgs_per_osd
* num_osds
;
7685 uint64_t projected
= 0;
7687 projected
+= pg_num
* size
;
// Sum PG replicas across all existing pools, substituting the proposed
// pg_num*size for the pool being changed.
7689 for (const auto& i
: osdmap
.get_pools()) {
7690 if (i
.first
== pool
) {
7691 projected
+= pg_num
* size
;
7693 projected
+= i
.second
.get_pg_num_target() * i
.second
.get_size();
7696 if (projected
> max_pgs
) {
7698 *ss
<< "pool id " << pool
;
7700 *ss
<< " pg_num " << pg_num
<< " size " << size
7701 << " would mean " << projected
7702 << " total pgs, which exceeds max " << max_pgs
7703 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7704 << " * num_in_osds " << num_osds
<< ")";
7711 * @param name The name of the new pool
7712 * @param crush_rule The crush rule to use. If <0, will use the system default
7714 * @param crush_rule_name The crush rule to use, if crush_ruleset <0
7714 * @param pg_num The pg_num to use. If set to 0, will use the system default
7715 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7716 * @param repl_size Replication factor, or 0 for default
7717 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7718 * @param pool_type TYPE_ERASURE, or TYPE_REP
7719 * @param expected_num_objects expected number of objects on the pool
7720 * @param fast_read fast read type.
7721 * @param ss human readable error message, if any.
7723 * @return 0 on success, negative errno on failure.
// Create a new pool in the pending incremental map: validate arguments
// (pg counts, fast_read, crush rule via smoke test), compute
// size/min_size/stripe_width, allocate a pool id, and populate the new
// pg_pool_t from configured defaults.  Numerous error returns, the
// fast_read ON branch, and the existing-name check result are elided
// from this excerpt.
7725 int OSDMonitor::prepare_new_pool(string
& name
,
7727 const string
&crush_rule_name
,
7728 unsigned pg_num
, unsigned pgp_num
,
7729 unsigned pg_num_min
,
7730 const uint64_t repl_size
,
7731 const uint64_t target_size_bytes
,
7732 const float target_size_ratio
,
7733 const string
&erasure_code_profile
,
7734 const unsigned pool_type
,
7735 const uint64_t expected_num_objects
,
7736 FastReadType fast_read
,
7737 const string
& pg_autoscale_mode
,
// Empty names are rejected; zero pg/pgp counts fall back to defaults.
7740 if (name
.length() == 0)
7743 pg_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pg_num");
7745 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
7748 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7749 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7750 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7751 << " (you may adjust 'mon max pool pg num' for higher values)";
7754 if (pgp_num
> pg_num
) {
7755 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7756 << ", which in this case is " << pg_num
;
7759 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
7760 *ss
<< "'fast_read' can only apply to erasure coding pool";
// Resolve the crush rule for this pool type/profile.
7764 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
7765 crush_rule_name
, &crush_rule
, ss
);
7767 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
// Optionally smoke-test the rule by running a mapping simulation in a
// forked child, bounded by the mon lease.
7770 if (g_conf()->mon_osd_crush_smoke_test
) {
7771 CrushWrapper newcrush
;
7772 _get_pending_crush(newcrush
);
7774 CrushTester
tester(newcrush
, err
);
7775 tester
.set_min_x(0);
7776 tester
.set_max_x(50);
7777 tester
.set_rule(crush_rule
);
7778 auto start
= ceph::coarse_mono_clock::now();
7779 r
= tester
.test_with_fork(g_conf()->mon_lease
);
7780 auto duration
= ceph::coarse_mono_clock::now() - start
;
7782 dout(10) << "tester.test_with_fork returns " << r
7783 << ": " << err
.str() << dendl
;
7784 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
7787 dout(10) << __func__
<< " crush smoke test duration: "
7788 << duration
<< dendl
;
// Derive size/min_size and validate pg counts against cluster limits.
7790 unsigned size
, min_size
;
7791 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
7792 &size
, &min_size
, ss
);
7794 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
7797 r
= check_pg_num(-1, pg_num
, size
, ss
);
7799 dout(10) << "check_pg_num returns " << r
<< dendl
;
7803 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
7807 uint32_t stripe_width
= 0;
7808 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
7810 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
// Resolve the effective fast_read setting for EC pools.
7815 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7816 switch (fast_read
) {
7823 case FAST_READ_DEFAULT
:
7824 fread
= g_conf()->osd_pool_default_ec_fast_read
;
7827 *ss
<< "invalid fast_read setting: " << fast_read
;
// Duplicate-name check against pools already staged this epoch.
7832 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
7833 p
!= pending_inc
.new_pool_names
.end();
7835 if (p
->second
== name
)
// Allocate the next pool id and build the new pg_pool_t.
7839 if (-1 == pending_inc
.new_pool_max
)
7840 pending_inc
.new_pool_max
= osdmap
.pool_max
;
7841 int64_t pool
= ++pending_inc
.new_pool_max
;
7843 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
7844 pi
->create_time
= ceph_clock_now();
7845 pi
->type
= pool_type
;
7846 pi
->fast_read
= fread
;
7847 pi
->flags
= g_conf()->osd_pool_default_flags
;
7848 if (g_conf()->osd_pool_default_flag_hashpspool
)
7849 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
7850 if (g_conf()->osd_pool_default_flag_nodelete
)
7851 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
7852 if (g_conf()->osd_pool_default_flag_nopgchange
)
7853 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
7854 if (g_conf()->osd_pool_default_flag_nosizechange
)
7855 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
7856 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
7857 if (g_conf()->osd_pool_use_gmt_hitset
)
7858 pi
->use_gmt_hitset
= true;
7860 pi
->use_gmt_hitset
= false;
7863 pi
->min_size
= min_size
;
7864 pi
->crush_rule
= crush_rule
;
7865 pi
->expected_num_objects
= expected_num_objects
;
7866 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
// Default autoscale mode comes from config unless overridden below.
7868 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7869 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
7870 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7871 pi
->pg_autoscale_mode
= m
;
7873 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
// Initial pg_num is capped by mon_osd_max_initial_pgs; the remainder
// is reached later via pg_num_target.
7875 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
7877 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
7879 pi
->set_pg_num_pending(pi
->get_pg_num());
7880 pi
->set_pg_num_target(pg_num
);
7881 pi
->set_pgp_num(pi
->get_pg_num());
7882 pi
->set_pgp_num_target(pgp_num
);
7883 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
7885 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
7887 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7888 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7889 pi
->pg_autoscale_mode
= m
;
7892 pi
->last_change
= pending_inc
.epoch
;
7895 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7896 pi
->erasure_code_profile
= erasure_code_profile
;
7898 pi
->erasure_code_profile
= "";
7900 pi
->stripe_width
= stripe_width
;
7902 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
7903 target_size_bytes
) {
7904 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7905 // larger than int32_t max.
7906 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
7908 if (target_size_ratio
> 0.0 &&
7909 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
7910 // only store for nautilus+, just to be consistent and tidy.
7911 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
// Cache-tier knobs are seeded from their configured defaults.
7914 pi
->cache_target_dirty_ratio_micro
=
7915 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
7916 pi
->cache_target_dirty_high_ratio_micro
=
7917 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
7918 pi
->cache_target_full_ratio_micro
=
7919 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
7920 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
7921 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
7923 pending_inc
.new_pool_names
[pool
] = name
;
// Stage setting an osdmap-wide flag in the pending incremental and
// queue the command reply for after the proposal commits.  The stream
// declaration and return statement are elided from this excerpt.
7927 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
7929 op
->mark_osdmon_event(__func__
);
// Seed pending flags from the committed map on first modification.
7931 if (pending_inc
.new_flags
< 0)
7932 pending_inc
.new_flags
= osdmap
.get_flags();
7933 pending_inc
.new_flags
|= flag
;
7934 ss
<< OSDMap::get_flag_string(flag
) << " is set";
7935 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7936 get_last_committed() + 1));
// Stage clearing an osdmap-wide flag in the pending incremental and
// queue the command reply for after the proposal commits.  Mirrors
// prepare_set_flag; the stream declaration and return are elided.
7940 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
7942 op
->mark_osdmon_event(__func__
);
// Seed pending flags from the committed map on first modification.
7944 if (pending_inc
.new_flags
< 0)
7945 pending_inc
.new_flags
= osdmap
.get_flags();
7946 pending_inc
.new_flags
&= ~flag
;
7947 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
7948 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
7949 get_last_committed() + 1));
// Handle `ceph osd pool set <pool> <var> <val>`: look up the pool,
// parse <val> as int/float/SI/IEC depending on <var>, validate it
// against the pool's current (or already-pending) state, and stage the
// result in pending_inc.new_pools.
// NOTE(review): extraction residue -- each original line is split
// across several physical lines, and many lines (error returns,
// braces) are missing from this view; comments below annotate only
// what is visible.
7953 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
// Resolve the pool name to a pool id; error text below is emitted when
// the lookup fails.
7957 cmd_getval(cmdmap
, "pool", poolstr
);
7958 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
7960 ss
<< "unrecognized pool '" << poolstr
<< "'";
7964 cmd_getval(cmdmap
, "var", var
);
// Work on a copy of the pool; prefer an already-pending version so
// multiple `pool set`s in one epoch compose.
7966 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
7967 if (pending_inc
.new_pools
.count(pool
))
7968 p
= pending_inc
.new_pools
[pool
];
7970 // accept val as a json string in the normal case (current
7971 // generation monitor). parse out int or float values from the
7972 // string as needed. however, if it is not a string, try to pull
7973 // out an int, in case an older monitor with an older json schema is
7974 // forwarding a request.
7976 string interr
, floaterr
;
7979 int64_t uf
= 0; // micro-f
7980 cmd_getval(cmdmap
, "val", val
);
// Option families: si_options parse with SI suffixes, iec_options with
// IEC (binary) suffixes; everything else parses as both int and float.
7983 "target_max_objects"
7985 auto iec_options
= {
7987 "target_size_bytes",
7988 "compression_max_blob_size",
7989 "compression_min_blob_size",
7993 if (count(begin(si_options
), end(si_options
), var
)) {
7994 n
= strict_si_cast
<int64_t>(val
.c_str(), &interr
);
7995 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
7996 n
= strict_iec_cast
<int64_t>(val
.c_str(), &interr
);
7998 // parse string as both int and float; different fields use different types.
7999 n
= strict_strtoll(val
.c_str(), 10, &interr
);
8000 f
= strict_strtod(val
.c_str(), &floaterr
);
8001 uf
= llrintl(f
* (double)1000000.0);
// These variables are tier/cache-related; the guard below (partially
// visible) gates them as a group.
8005 (var
== "hit_set_type" || var
== "hit_set_period" ||
8006 var
== "hit_set_count" || var
== "hit_set_fpp" ||
8007 var
== "target_max_objects" || var
== "target_max_bytes" ||
8008 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
8009 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
8010 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
8011 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
8012 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
// ---- per-variable handling begins ----
// "size": replica count; rejected for EC pools, bounded 1..10, and
// checked against the crush rule and pg budget.
8016 if (var
== "size") {
8017 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8018 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
8021 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
8022 ss
<< "can not change the size of an erasure-coded pool";
8025 if (interr
.length()) {
8026 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8029 if (n
<= 0 || n
> 10) {
8030 ss
<< "pool size must be between 1 and 10";
8033 if (!osdmap
.crush
->check_crush_rule(p
.get_crush_rule(), p
.type
, n
, ss
)) {
8036 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
// Changing size re-derives min_size from the configured default.
8041 p
.min_size
= g_conf().get_osd_pool_default_min_size(p
.size
);
// "min_size": for replicated pools 1..size; for EC pools bounded below
// by the data chunk count k.
8042 } else if (var
== "min_size") {
8043 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8044 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8047 if (interr
.length()) {
8048 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8052 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
8053 if (n
< 1 || n
> p
.size
) {
8054 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
8058 ErasureCodeInterfaceRef erasure_code
;
8061 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
8063 k
= erasure_code
->get_data_chunk_count();
8065 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
8069 if (n
< k
|| n
> p
.size
) {
8070 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
// "pg_num_actual": directly adjust the live pg_num (mgr normally
// drives this); decreases only one pg at a time and only on nautilus+.
8075 } else if (var
== "pg_num_actual") {
8076 if (interr
.length()) {
8077 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8080 if (n
== (int)p
.get_pg_num()) {
8083 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8084 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8085 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8086 << " (you may adjust 'mon max pool pg num' for higher values)";
8089 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8090 ss
<< "cannot adjust pg_num while initial PGs are being created";
8093 if (n
> (int)p
.get_pg_num()) {
8094 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
8095 // force pre-nautilus clients to resend their ops, since they
8096 // don't understand pg_num_pending changes form a new interval
8097 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8101 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8102 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8105 if (n
< (int)p
.get_pgp_num()) {
8106 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8109 if (n
< (int)p
.get_pg_num() - 1) {
8110 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8111 << ") - 1; only single pg decrease is currently supported";
8114 p
.set_pg_num_pending(n
);
8115 // force pre-nautilus clients to resend their ops, since they
8116 // don't understand pg_num_pending changes form a new interval
8117 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8119 // force pre-luminous clients to resend their ops, since they
8120 // don't understand that split PGs now form a new interval.
8121 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
// "pg_num": set the *target* pg count; mgr converges pg_num_actual /
// pgp_num toward it on nautilus+; pre-nautilus maps change directly.
8122 } else if (var
== "pg_num") {
8123 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8124 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8127 if (interr
.length()) {
8128 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8131 if (n
== (int)p
.get_pg_num_target()) {
8134 if (n
<= 0 || static_cast<uint64_t>(n
) >
8135 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8136 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8137 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8138 << " (you may adjust 'mon max pool pg num' for higher values)";
8141 if (n
> (int)p
.get_pg_num_target()) {
8142 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
// Splitting a cache pool is dangerous; require explicit confirmation.
8147 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8148 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8149 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8153 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8154 ss
<< "nautilus OSDs are required to decrease pg_num";
8158 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8159 // pre-nautilus osdmap format; increase pg_num directly
8160 assert(n
> (int)p
.get_pg_num());
8161 // force pre-nautilus clients to resend their ops, since they
8162 // don't understand pg_num_target changes form a new interval
8163 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8164 // force pre-luminous clients to resend their ops, since they
8165 // don't understand that split PGs now form a new interval.
8166 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8169 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8170 // make pgp_num track pg_num if it already matches. if it is set
8171 // differently, leave it different and let the user control it
8173 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8174 p
.set_pgp_num_target(n
);
8176 p
.set_pg_num_target(n
);
// "pgp_num_actual": direct pgp_num adjustment, bounded by pg_num and
// pg_num_pending.
8178 } else if (var
== "pgp_num_actual") {
8179 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8180 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8183 if (interr
.length()) {
8184 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8188 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8191 if (n
> (int)p
.get_pg_num()) {
8192 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8195 if (n
> (int)p
.get_pg_num_pending()) {
8196 ss
<< "specified pgp_num " << n
8197 << " > pg_num_pending " << p
.get_pg_num_pending();
// "pgp_num": target pgp_num, bounded by the pg_num target.
8201 } else if (var
== "pgp_num") {
8202 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8203 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8206 if (interr
.length()) {
8207 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8211 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8214 if (n
> (int)p
.get_pg_num_target()) {
8215 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8218 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8219 // pre-nautilus osdmap format; increase pgp_num directly
8222 p
.set_pgp_num_target(n
);
// "pg_autoscale_mode": parsed by name; nautilus+ only.
8224 } else if (var
== "pg_autoscale_mode") {
8225 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8226 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8227 ss
<< "specified invalid mode " << val
;
8230 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8231 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8234 p
.pg_autoscale_mode
= m
;
// "crush_rule": resolve rule by name and validate it against the
// pool's type and size.
8235 } else if (var
== "crush_rule") {
8236 int id
= osdmap
.crush
->get_rule_id(val
);
8237 if (id
== -ENOENT
) {
8238 ss
<< "crush rule " << val
<< " does not exist";
8242 ss
<< cpp_strerror(id
);
8245 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
// Boolean pool flags, resolved by name; accept true/false/1/0 (the
// numeric forms only when the value parsed cleanly as an int).
8249 } else if (var
== "nodelete" || var
== "nopgchange" ||
8250 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8251 var
== "noscrub" || var
== "nodeep-scrub") {
8252 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8253 // make sure we only compare against 'n' if we didn't receive a string
8254 if (val
== "true" || (interr
.empty() && n
== 1)) {
8256 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8259 ss
<< "expecting value 'true', 'false', '0', or '1'";
// "hashpspool": same flag handling, but remaps all PGs, so it demands
// explicit confirmation.
8262 } else if (var
== "hashpspool") {
8263 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8265 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8268 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8269 " this triggers large data movement,"
8270 " pass --yes-i-really-mean-it if you really do.";
8273 // make sure we only compare against 'n' if we didn't receive a string
8274 if (val
== "true" || (interr
.empty() && n
== 1)) {
8276 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8279 ss
<< "expecting value 'true', 'false', '0', or '1'";
// "hit_set_type": select the HitSet implementation (bloom /
// explicit_hash / explicit_object); requires the CACHEPOOL feature
// cluster-wide.
8282 } else if (var
== "hit_set_type") {
8284 p
.hit_set_params
= HitSet::Params();
8286 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8289 if (val
== "bloom") {
8290 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8291 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8292 p
.hit_set_params
= HitSet::Params(bsp
);
8293 } else if (val
== "explicit_hash")
8294 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8295 else if (val
== "explicit_object")
8296 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8298 ss
<< "unrecognized hit_set type '" << val
<< "'";
8302 } else if (var
== "hit_set_period") {
8303 if (interr
.length()) {
8304 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8307 ss
<< "hit_set_period should be non-negative";
8310 p
.hit_set_period
= n
;
8311 } else if (var
== "hit_set_count") {
8312 if (interr
.length()) {
8313 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8316 ss
<< "hit_set_count should be non-negative";
8319 p
.hit_set_count
= n
;
// "hit_set_fpp": bloom false-positive probability, 0..1, and only
// meaningful when the hit set is of bloom type.
8320 } else if (var
== "hit_set_fpp") {
8321 if (floaterr
.length()) {
8322 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8324 } else if (f
< 0 || f
> 1.0) {
8325 ss
<< "hit_set_fpp should be in the range 0..1";
8328 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8329 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8332 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
// "use_gmt_hitset": one-way switch; only 'true'/'1' accepted.
8334 } else if (var
== "use_gmt_hitset") {
8335 if (val
== "true" || (interr
.empty() && n
== 1)) {
8336 p
.use_gmt_hitset
= true;
8338 ss
<< "expecting value 'true' or '1'";
// "allow_ec_overwrites": EC pools only, requires all-bluestore backing
// (unless the debug override is set), and cannot be disabled again.
8341 } else if (var
== "allow_ec_overwrites") {
8342 if (!p
.is_erasure()) {
8343 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8347 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8348 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8349 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8352 if (val
== "true" || (interr
.empty() && n
== 1)) {
8353 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8354 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8355 ss
<< "ec overwrites cannot be disabled once enabled";
8358 ss
<< "expecting value 'true', 'false', '0', or '1'";
// Cache-tier sizing and aging knobs: plain integer / micro-ratio
// assignments after parse validation.
8361 } else if (var
== "target_max_objects") {
8362 if (interr
.length()) {
8363 ss
<< "error parsing int '" << val
<< "': " << interr
;
8366 p
.target_max_objects
= n
;
8367 } else if (var
== "target_max_bytes") {
8368 if (interr
.length()) {
8369 ss
<< "error parsing int '" << val
<< "': " << interr
;
8372 p
.target_max_bytes
= n
;
8373 } else if (var
== "cache_target_dirty_ratio") {
8374 if (floaterr
.length()) {
8375 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8378 if (f
< 0 || f
> 1.0) {
8379 ss
<< "value must be in the range 0..1";
8382 p
.cache_target_dirty_ratio_micro
= uf
;
8383 } else if (var
== "cache_target_dirty_high_ratio") {
8384 if (floaterr
.length()) {
8385 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8388 if (f
< 0 || f
> 1.0) {
8389 ss
<< "value must be in the range 0..1";
8392 p
.cache_target_dirty_high_ratio_micro
= uf
;
8393 } else if (var
== "cache_target_full_ratio") {
8394 if (floaterr
.length()) {
8395 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8398 if (f
< 0 || f
> 1.0) {
8399 ss
<< "value must be in the range 0..1";
8402 p
.cache_target_full_ratio_micro
= uf
;
8403 } else if (var
== "cache_min_flush_age") {
8404 if (interr
.length()) {
8405 ss
<< "error parsing int '" << val
<< "': " << interr
;
8408 p
.cache_min_flush_age
= n
;
8409 } else if (var
== "cache_min_evict_age") {
8410 if (interr
.length()) {
8411 ss
<< "error parsing int '" << val
<< "': " << interr
;
8414 p
.cache_min_evict_age
= n
;
8415 } else if (var
== "min_read_recency_for_promote") {
8416 if (interr
.length()) {
8417 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8420 p
.min_read_recency_for_promote
= n
;
8421 } else if (var
== "hit_set_grade_decay_rate") {
8422 if (interr
.length()) {
8423 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8426 if (n
> 100 || n
< 0) {
8427 ss
<< "value out of range,valid range is 0 - 100";
8430 p
.hit_set_grade_decay_rate
= n
;
8431 } else if (var
== "hit_set_search_last_n") {
8432 if (interr
.length()) {
8433 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8436 if (n
> p
.hit_set_count
|| n
< 0) {
8437 ss
<< "value out of range,valid range is 0 - hit_set_count";
8440 p
.hit_set_search_last_n
= n
;
8441 } else if (var
== "min_write_recency_for_promote") {
8442 if (interr
.length()) {
8443 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8446 p
.min_write_recency_for_promote
= n
;
// "fast_read": EC-pool-only boolean.
8447 } else if (var
== "fast_read") {
8448 if (p
.is_replicated()) {
8449 ss
<< "fast read is not supported in replication pool";
8452 if (val
== "true" || (interr
.empty() && n
== 1)) {
8454 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8455 p
.fast_read
= false;
8457 ss
<< "expecting value 'true', 'false', '0', or '1'";
// Generic pool_opts_t options: value "unset" removes the option;
// otherwise per-option validation precedes the type-dispatched set.
8460 } else if (pool_opts_t::is_opt_name(var
)) {
8461 bool unset
= val
== "unset";
8462 if (var
== "compression_mode") {
8464 auto cmode
= Compressor::get_comp_mode_type(val
);
8466 ss
<< "unrecognized compression mode '" << val
<< "'";
8470 } else if (var
== "compression_algorithm") {
8472 auto alg
= Compressor::get_comp_alg_type(val
);
8474 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8478 } else if (var
== "compression_required_ratio") {
8479 if (floaterr
.length()) {
8480 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8483 if (f
< 0 || f
> 1) {
8484 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8487 } else if (var
== "csum_type") {
8488 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8490 ss
<< "unrecognized csum_type '" << val
<< "'";
8493 //preserve csum_type numeric value
8496 } else if (var
== "compression_max_blob_size" ||
8497 var
== "compression_min_blob_size" ||
8498 var
== "csum_max_block" ||
8499 var
== "csum_min_block") {
8500 if (interr
.length()) {
8501 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8504 } else if (var
== "fingerprint_algorithm") {
8506 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8508 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8512 } else if (var
== "target_size_bytes") {
8513 if (interr
.length()) {
8514 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8517 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8518 ss
<< "must set require_osd_release to nautilus or "
8519 << "later before setting target_size_bytes";
8522 } else if (var
== "pg_num_min") {
8523 if (interr
.length()) {
8524 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8527 if (n
> (int)p
.get_pg_num_target()) {
8528 ss
<< "specified pg_num_min " << n
8529 << " > pg_num " << p
.get_pg_num_target();
8532 } else if (var
== "recovery_priority") {
8533 if (interr
.length()) {
8534 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8537 if (!g_conf()->debug_allow_any_pool_priority
) {
8538 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8539 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8540 << " and " << OSD_POOL_PRIORITY_MAX
;
8544 } else if (var
== "pg_autoscale_bias") {
8545 if (f
< 0.0 || f
> 1000.0) {
8546 ss
<< "pg_autoscale_bias must be between 0 and 1000";
// Dispatch on the declared option type: string / int / double; "unset"
// removes the option in each case.
8551 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8552 switch (desc
.type
) {
8553 case pool_opts_t::STR
:
8555 p
.opts
.unset(desc
.key
);
8557 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
8560 case pool_opts_t::INT
:
8561 if (interr
.length()) {
8562 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8566 p
.opts
.unset(desc
.key
);
8568 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
8571 case pool_opts_t::DOUBLE
:
8572 if (floaterr
.length()) {
8573 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8577 p
.opts
.unset(desc
.key
);
8579 p
.opts
.set(desc
.key
, static_cast<double>(f
));
8583 ceph_assert(!"unknown type");
8586 ss
<< "unrecognized variable '" << var
<< "'";
// Success path: report set/unset, bump last_change, and stage the
// modified pool in the pending incremental.
8589 if (val
!= "unset") {
8590 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
8592 ss
<< "unset pool " << pool
<< " " << var
;
8594 p
.last_change
= pending_inc
.epoch
;
8595 pending_inc
.new_pools
[pool
] = p
;
// Prepare-phase entry for `osd pool application ...` commands: delegate
// to the shared implementation with modified=nullptr and the final
// bool=true (per the _command_pool_application header comment, prepare
// mode actually updates pending state).
// NOTE(review): extraction residue -- the remainder of the parameter
// list is not visible in this view.
8599 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
8600 const cmdmap_t
& cmdmap
,
8603 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
// Preprocess-phase entry for `osd pool application ...` commands:
// delegate to the shared implementation, passing `modified` through so
// the caller learns whether the command would change anything; the
// final bool=false selects the non-mutating path.
// NOTE(review): extraction residue -- part of the parameter list is
// not visible in this view.
8606 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
8607 const cmdmap_t
& cmdmap
,
8611 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
8616 * Common logic for preprocess and prepare phases of pool application
8617 * tag commands. In preprocess mode we're only detecting invalid
8618 * commands, and determining whether it was a modification or a no-op.
8619 * In prepare mode we're actually updating the pending state.
// NOTE(review): extraction residue -- original lines are split across
// several physical lines and some (returns, braces, preprocess/prepare
// guards) are missing from this view.
8621 int OSDMonitor::_command_pool_application(const string
&prefix
,
8622 const cmdmap_t
& cmdmap
,
// Resolve the pool; error text below is emitted on lookup failure.
8628 cmd_getval(cmdmap
, "pool", pool_name
);
8629 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
8631 ss
<< "unrecognized pool '" << pool_name
<< "'";
// Work on a copy; prefer the already-pending version so edits compose.
8635 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8637 if (pending_inc
.new_pools
.count(pool
)) {
8638 p
= pending_inc
.new_pools
[pool
];
8643 cmd_getval(cmdmap
, "app", app
);
8644 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
// "all" is reserved for both key and value.
8647 cmd_getval(cmdmap
, "key", key
);
8649 ss
<< "key cannot be 'all'";
8654 cmd_getval(cmdmap
, "value", value
);
8655 if (value
== "all") {
8656 ss
<< "value cannot be 'all'";
// Dispatch on the command suffix: enable / disable / set / rm.
8660 if (boost::algorithm::ends_with(prefix
, "enable")) {
8662 ss
<< "application name must be provided";
8667 ss
<< "application must be enabled on base tier";
// Enabling a second application needs explicit confirmation.
8672 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8674 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
8675 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
8676 << "application; pass --yes-i-really-mean-it to proceed anyway";
8680 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
8681 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
8682 << "max " << MAX_POOL_APPLICATIONS
;
8686 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8687 ss
<< "application name '" << app
<< "' too long; max length "
8688 << MAX_POOL_APPLICATION_LENGTH
;
// Register the application with an empty key/value map.
8693 p
.application_metadata
[app
] = {};
8695 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
8697 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
// Disabling also needs explicit confirmation.
8699 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8702 ss
<< "Are you SURE? Disabling an application within a pool might result "
8703 << "in loss of application functionality; pass "
8704 << "--yes-i-really-mean-it to proceed anyway";
8709 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8711 return 0; // idempotent
8714 p
.application_metadata
.erase(app
);
8715 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
8717 } else if (boost::algorithm::ends_with(prefix
, "set")) {
8719 ss
<< "application metadata must be set on base tier";
8724 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8730 cmd_getval(cmdmap
, "key", key
);
8733 ss
<< "key must be provided";
// Enforce per-application key-count and length limits before storing.
8737 auto &app_keys
= p
.application_metadata
[app
];
8738 if (app_keys
.count(key
) == 0 &&
8739 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
8740 ss
<< "too many keys set for application '" << app
<< "' on pool '"
8741 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
8745 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8746 ss
<< "key '" << app
<< "' too long; max length "
8747 << MAX_POOL_APPLICATION_LENGTH
;
8752 cmd_getval(cmdmap
, "value", value
);
8753 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8754 ss
<< "value '" << value
<< "' too long; max length "
8755 << MAX_POOL_APPLICATION_LENGTH
;
8759 p
.application_metadata
[app
][key
] = value
;
8760 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
8761 << value
<< "' on pool '" << pool_name
<< "'";
8762 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
8764 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8770 cmd_getval(cmdmap
, "key", key
);
8771 auto it
= p
.application_metadata
[app
].find(key
);
8772 if (it
== p
.application_metadata
[app
].end()) {
8773 ss
<< "application '" << app
<< "' on pool '" << pool_name
8774 << "' does not have key '" << key
<< "'";
8775 return 0; // idempotent
8778 p
.application_metadata
[app
].erase(it
);
8779 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
8780 << pool_name
<< "'";
// Stage the change: bump last_change and record the modified pool.
8786 p
.last_change
= pending_inc
.epoch
;
8787 pending_inc
.new_pools
[pool
] = p
;
8790 // Because we fell through this far, we didn't hit no-op cases,
8791 // so pool was definitely modified
8792 if (modified
!= nullptr) {
// Remove (or unlink) an item from the given CRUSH copy, either under a
// specific ancestor bucket or anywhere in the map.
// NOTE(review): extraction residue -- the full parameter list, the
// ancestor/else branching, and the return are not visible in this
// view; the two remove calls below assign into a shared `err`.
8799 int OSDMonitor::_prepare_command_osd_crush_remove(
8800 CrushWrapper
&newcrush
,
// Ancestor-scoped removal (presumably taken when has_ancestor is set
// -- the guard is not visible here; confirm against the full source).
8809 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
// Map-wide removal; unlink_only preserves the item but detaches it.
8812 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
// Commit a modified CRUSH map into the pending incremental: re-encode
// the supplied CrushWrapper into pending_inc.crush using the current
// quorum connection features.
8817 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
8819 pending_inc
.crush
.clear();
8820 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
// Validate-then-commit wrapper around _prepare_command_osd_crush_remove:
// on success (asserted below) the modified crush map is staged via
// do_osd_crush_remove().
// NOTE(review): extraction residue -- the full parameter list and the
// early-return/error path between the call and the assert are not
// visible in this view.
8823 int OSDMonitor::prepare_command_osd_crush_remove(
8824 CrushWrapper
&newcrush
,
8830 int err
= _prepare_command_osd_crush_remove(
8831 newcrush
, id
, ancestor
,
8832 has_ancestor
, unlink_only
);
// By this point the removal must have succeeded.
8837 ceph_assert(err
== 0);
8838 do_osd_crush_remove(newcrush
);
// Stage removal of an OSD id from the map: refuse while the OSD is up
// (the error path is elided from this view), then mark its full state
// in new_state (which flags removal), clear its uuid, and queue its
// metadata for deletion.
8843 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
8845 if (osdmap
.is_up(id
)) {
// Recording the OSD's current state bits in new_state stages the
// removal in the pending incremental.
8849 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
8850 pending_inc
.new_uuid
[id
] = uuid_d();
// Drop any stored osd metadata along with the osd itself.
8851 pending_metadata_rm
.insert(id
);
8852 pending_metadata
.erase(id
);
// Pick an id for a new OSD. Scan 0..max_osd-1 for a slot that does not
// exist in the map, is not pending to come up, and is not pending
// EXISTS; when such a slot is found it is reported via *existing_id
// (the body of the hit branch is elided from this view). If no slot is
// free, return a brand-new id past the current max (growing max_osd is
// the caller's job via pending_inc.new_max_osd).
8857 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
8859 ceph_assert(existing_id
);
8862 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
8863 if (!osdmap
.exists(i
) &&
8864 pending_inc
.new_up_client
.count(i
) == 0 &&
8865 (pending_inc
.new_state
.count(i
) == 0 ||
8866 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
// No reusable hole: next id is the current map max, unless max_osd is
// already being grown in this pending epoch.
8872 if (pending_inc
.new_max_osd
< 0) {
8873 return osdmap
.get_max_osd();
8875 return pending_inc
.new_max_osd
;
// Actually stage creation of an OSD in the pending incremental: pick or
// reuse an id (by uuid, explicit id, or fresh allocation), optionally
// set its CRUSH device class, grow max_osd if needed, and mark the new
// osd EXISTS|NEW with its uuid.
// NOTE(review): extraction residue -- original lines are split across
// several physical lines and some control-flow lines (gotos/braces)
// are missing from this view.
8878 void OSDMonitor::do_osd_create(
8881 const string
& device_class
,
8884 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
8885 ceph_assert(new_id
);
8887 // We presume validation has been performed prior to calling this
8888 // function. We assert with prejudice.
8890 int32_t allocated_id
= -1; // declare here so we can jump
8891 int32_t existing_id
= -1;
// Fast path: the uuid already maps to an osd -- reuse that id (and it
// must agree with any explicitly requested id).
8892 if (!uuid
.is_zero()) {
8893 existing_id
= osdmap
.identify_osd(uuid
);
8894 if (existing_id
>= 0) {
8895 ceph_assert(id
< 0 || id
== existing_id
);
8896 *new_id
= existing_id
;
8898 } else if (id
>= 0) {
8899 // uuid does not exist, and id has been provided, so just create
8906 // allocate a new id
8907 allocated_id
= _allocate_osd_id(&existing_id
);
8908 dout(10) << __func__
<< " allocated id " << allocated_id
8909 << " existing id " << existing_id
<< dendl
;
// Exactly one of existing_id / allocated_id is valid here.
8910 if (existing_id
>= 0) {
8911 ceph_assert(existing_id
< osdmap
.get_max_osd());
8912 ceph_assert(allocated_id
< 0);
// Reused holes start out with weight OUT.
8913 pending_inc
.new_weight
[existing_id
] = CEPH_OSD_OUT
;
8914 *new_id
= existing_id
;
8915 } else if (allocated_id
>= 0) {
8916 ceph_assert(existing_id
< 0);
// Fresh id past the end of the map: grow (or further grow) max_osd.
8918 if (pending_inc
.new_max_osd
< 0) {
8919 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
8921 ++pending_inc
.new_max_osd
;
8923 *new_id
= pending_inc
.new_max_osd
- 1;
8924 ceph_assert(*new_id
== allocated_id
);
8926 ceph_abort_msg("unexpected condition");
// Optional: record the new osd's device class in a pending CRUSH copy.
8930 if (device_class
.size()) {
8931 CrushWrapper newcrush
;
8932 _get_pending_crush(newcrush
);
8933 if (newcrush
.get_max_devices() < *new_id
+ 1) {
8934 newcrush
.set_max_devices(*new_id
+ 1);
8936 string name
= string("osd.") + stringify(*new_id
);
8937 if (!newcrush
.item_exists(*new_id
)) {
8938 newcrush
.set_item_name(*new_id
, name
);
8941 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
8943 derr
<< __func__
<< " failed to set " << name
<< " device_class "
8944 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
8946 // non-fatal... this might be a replay and we want to be idempotent.
8948 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
// Stage the modified crush map.
8950 pending_inc
.crush
.clear();
8951 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
8954 dout(20) << __func__
<< " no device_class" << dendl
;
8957 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
// Ensure max_osd covers the chosen id even on the uuid-reuse path.
8958 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
8959 pending_inc
.new_max_osd
= *new_id
+ 1;
// Mark the osd as existing-and-new, and bind its uuid if supplied.
8962 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
8963 if (!uuid
.is_zero())
8964 pending_inc
.new_uuid
[*new_id
] = uuid
;
// Check whether an osd may be created for the given (id, uuid) pair,
// reporting a pre-existing id via *existing_id. Shared by `osd create`
// and `osd new` (see the embedded comment block below for the
// uuid-optional vs uuid-required distinction).
// NOTE(review): extraction residue -- return statements and some
// braces are elided from this view, so success/error codes on each
// path are not visible here.
8967 int OSDMonitor::validate_osd_create(
8970 const bool check_osd_exists
,
8971 int32_t* existing_id
,
8975 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
8976 << " check_osd_exists " << check_osd_exists
<< dendl
;
8978 ceph_assert(existing_id
);
// Neither id nor uuid given: legacy `osd create` with nothing to check.
8980 if (id
< 0 && uuid
.is_zero()) {
8981 // we have nothing to validate
8984 } else if (uuid
.is_zero()) {
8985 // we have an id but we will ignore it - because that's what
8986 // `osd create` does.
8991 * This function will be used to validate whether we are able to
8992 * create a new osd when the `uuid` is specified.
8994 * It will be used by both `osd create` and `osd new`, as the checks
8995 * are basically the same when it pertains to osd id and uuid validation.
8996 * However, `osd create` presumes an `uuid` is optional, for legacy
8997 * reasons, while `osd new` requires the `uuid` to be provided. This
8998 * means that `osd create` will not be idempotent if an `uuid` is not
8999 * provided, but we will always guarantee the idempotency of `osd new`.
9002 ceph_assert(!uuid
.is_zero());
// uuid already staged in this pending epoch.
9003 if (pending_inc
.identify_osd(uuid
) >= 0) {
9004 // osd is about to exist
9008 int32_t i
= osdmap
.identify_osd(uuid
);
9010 // osd already exists
// Same uuid bound to a different id is a hard conflict.
9011 if (id
>= 0 && i
!= id
) {
9012 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
9015 // return a positive errno to distinguish between a blocking error
9016 // and an error we consider to not be a problem (i.e., this would be
9017 // an idempotent operation).
9023 if (pending_inc
.new_state
.count(id
)) {
9024 // osd is about to exist
9027 // we may not care if an osd exists if we are recreating a previously
9029 if (check_osd_exists
&& osdmap
.exists(id
)) {
9030 ss
<< "id " << id
<< " already in use and does not match uuid "
// Validate a legacy `ceph osd create` request: refuse to resurrect a
// destroyed id (users must use `osd new`), tolerate a missing uuid for
// backward compatibility, then defer to validate_osd_create() with
// check_osd_exists=true.
// NOTE(review): extraction residue -- part of the parameter list and
// the intermediate returns are not visible in this view.
9038 int OSDMonitor::prepare_command_osd_create(
9041 int32_t* existing_id
,
9044 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9045 ceph_assert(existing_id
);
// Destroyed ids may only be reused via `osd new`.
9046 if (osdmap
.is_destroyed(id
)) {
9047 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
9052 if (uuid
.is_zero()) {
9053 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
9056 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
9059 int OSDMonitor::prepare_command_osd_new(
9061 const cmdmap_t
& cmdmap
,
9062 const map
<string
,string
>& params
,
9070 ceph_assert(paxos
->is_plugged());
9072 dout(10) << __func__
<< " " << op
<< dendl
;
9074 /* validate command. abort now if something's wrong. */
9076 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9078 * If `id` is not specified, we will identify any existing osd based
9079 * on `uuid`. Operation will be idempotent iff secrets match.
9081 * If `id` is specified, we will identify any existing osd based on
9082 * `uuid` and match against `id`. If they match, operation will be
9083 * idempotent iff secrets match.
9085 * `-i secrets.json` will be optional. If supplied, will be used
9086 * to check for idempotency when `id` and `uuid` match.
9088 * If `id` is not specified, and `uuid` does not exist, an id will
9089 * be found or allocated for the osd.
9091 * If `id` is specified, and the osd has been previously marked
9092 * as destroyed, then the `id` will be reused.
9094 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
9095 ss
<< "requires the OSD's UUID to be specified.";
9097 } else if (!uuid
.parse(uuidstr
.c_str())) {
9098 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9102 if (cmd_getval(cmdmap
, "id", id
) &&
9104 ss
<< "invalid OSD id; must be greater or equal than zero.";
9108 // are we running an `osd create`-like command, or recreating
9109 // a previously destroyed osd?
9111 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9113 // we will care about `id` to assess whether osd is `destroyed`, or
9114 // to create a new osd.
9115 // we will need an `id` by the time we reach auth.
9117 int32_t existing_id
= -1;
9118 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9121 bool may_be_idempotent
= false;
9122 if (err
== EEXIST
) {
9123 // this is idempotent from the osdmon's point-of-view
9124 may_be_idempotent
= true;
9125 ceph_assert(existing_id
>= 0);
9127 } else if (err
< 0) {
9131 if (!may_be_idempotent
) {
9132 // idempotency is out of the window. We are either creating a new
9133 // osd or recreating a destroyed osd.
9135 // We now need to figure out if we have an `id` (and if it's valid),
9136 // of find an `id` if we don't have one.
9138 // NOTE: we need to consider the case where the `id` is specified for
9139 // `osd create`, and we must honor it. So this means checking if
9140 // the `id` is destroyed, and if so assume the destroy; otherwise,
9141 // check if it `exists` - in which case we complain about not being
9142 // `destroyed`. In the end, if nothing fails, we must allow the
9143 // creation, so that we are compatible with `create`.
9144 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9145 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9146 ss
<< "OSD " << id
<< " has not yet been destroyed";
9148 } else if (id
< 0) {
9150 id
= _allocate_osd_id(&existing_id
);
9152 ceph_assert(existing_id
>= 0);
9155 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9156 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9157 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9159 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9162 ceph_assert(id
>= 0);
9163 ceph_assert(osdmap
.exists(id
));
9166 // we are now able to either create a brand new osd or reuse an existing
9167 // osd that has been previously destroyed.
9169 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9171 if (may_be_idempotent
&& params
.empty()) {
9172 // nothing to do, really.
9173 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9174 ceph_assert(id
>= 0);
9176 f
->open_object_section("created_osd");
9177 f
->dump_int("osdid", id
);
9185 string device_class
;
9186 auto p
= params
.find("crush_device_class");
9187 if (p
!= params
.end()) {
9188 device_class
= p
->second
;
9189 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9191 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9192 bool has_lockbox
= false;
9193 bool has_secrets
= params
.count("cephx_secret")
9194 || params
.count("cephx_lockbox_secret")
9195 || params
.count("dmcrypt_key");
9197 ConfigKeyService
*svc
= nullptr;
9198 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9201 if (params
.count("cephx_secret") == 0) {
9202 ss
<< "requires a cephx secret.";
9205 cephx_secret
= params
.at("cephx_secret");
9207 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9208 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9210 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9211 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9213 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9215 lockbox_secret
= params
.at("cephx_lockbox_secret");
9216 dmcrypt_key
= params
.at("dmcrypt_key");
9217 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9218 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9222 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9224 err
= mon
->authmon()->validate_osd_new(id
, uuid
,
9232 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9233 // for this to be idempotent, `id` should already be >= 0; no need
9234 // to use validate_id.
9235 ceph_assert(id
>= 0);
9236 ss
<< "osd." << id
<< " exists but secrets do not match";
9241 svc
= (ConfigKeyService
*)mon
->config_key_service
;
9242 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9245 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9246 ceph_assert(id
>= 0);
9247 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9252 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9253 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9255 if (may_be_idempotent
) {
9256 // we have nothing to do for either the osdmon or the authmon,
9257 // and we have no lockbox - so the config key service will not be
9258 // touched. This is therefore an idempotent operation, and we can
9259 // just return right away.
9260 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9261 ceph_assert(id
>= 0);
9263 f
->open_object_section("created_osd");
9264 f
->dump_int("osdid", id
);
9271 ceph_assert(!may_be_idempotent
);
9275 ceph_assert(!cephx_secret
.empty());
9276 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9277 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9279 err
= mon
->authmon()->do_osd_new(cephx_entity
,
9282 ceph_assert(0 == err
);
9285 ceph_assert(nullptr != svc
);
9286 svc
->do_osd_new(uuid
, dmcrypt_key
);
9290 if (is_recreate_destroyed
) {
9291 ceph_assert(id
>= 0);
9292 ceph_assert(osdmap
.is_destroyed(id
));
9293 pending_inc
.new_weight
[id
] = CEPH_OSD_OUT
;
9294 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9295 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9296 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9298 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9299 // due to http://tracker.ceph.com/issues/20751 some clusters may
9300 // have UP set for non-existent OSDs; make sure it is cleared
9301 // for a newly created osd.
9302 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9304 pending_inc
.new_uuid
[id
] = uuid
;
9306 ceph_assert(id
>= 0);
9307 int32_t new_id
= -1;
9308 do_osd_create(id
, uuid
, device_class
, &new_id
);
9309 ceph_assert(new_id
>= 0);
9310 ceph_assert(id
== new_id
);
9314 f
->open_object_section("created_osd");
9315 f
->dump_int("osdid", id
);
9324 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9326 op
->mark_osdmon_event(__func__
);
9327 auto m
= op
->get_req
<MMonCommand
>();
9330 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9331 string rs
= ss
.str();
9332 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
9336 MonSession
*session
= op
->get_session();
9338 derr
<< __func__
<< " no session" << dendl
;
9339 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
9343 return prepare_command_impl(op
, cmdmap
);
9346 static int parse_reweights(CephContext
*cct
,
9347 const cmdmap_t
& cmdmap
,
9348 const OSDMap
& osdmap
,
9349 map
<int32_t, uint32_t>* weights
)
9352 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9355 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9356 json_spirit::mValue json_value
;
9357 if (!json_spirit::read(weights_str
, json_value
)) {
9360 if (json_value
.type() != json_spirit::obj_type
) {
9363 const auto obj
= json_value
.get_obj();
9365 for (auto& osd_weight
: obj
) {
9366 auto osd_id
= std::stoi(osd_weight
.first
);
9367 if (!osdmap
.exists(osd_id
)) {
9370 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9373 auto weight
= std::stoul(osd_weight
.second
.get_str());
9374 weights
->insert({osd_id
, weight
});
9376 } catch (const std::logic_error
& e
) {
9382 int OSDMonitor::prepare_command_osd_destroy(
9386 ceph_assert(paxos
->is_plugged());
9388 // we check if the osd exists for the benefit of `osd purge`, which may
9389 // have previously removed the osd. If the osd does not exist, return
9390 // -ENOENT to convey this, and let the caller deal with it.
9392 // we presume that all auth secrets and config keys were removed prior
9393 // to this command being called. if they exist by now, we also assume
9394 // they must have been created by some other command and do not pertain
9395 // to this non-existent osd.
9396 if (!osdmap
.exists(id
)) {
9397 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9401 uuid_d uuid
= osdmap
.get_uuid(id
);
9402 dout(10) << __func__
<< " destroying osd." << id
9403 << " uuid " << uuid
<< dendl
;
9405 // if it has been destroyed, we assume our work here is done.
9406 if (osdmap
.is_destroyed(id
)) {
9407 ss
<< "destroyed osd." << id
;
9411 EntityName cephx_entity
, lockbox_entity
;
9412 bool idempotent_auth
= false, idempotent_cks
= false;
9414 int err
= mon
->authmon()->validate_osd_destroy(id
, uuid
,
9419 if (err
== -ENOENT
) {
9420 idempotent_auth
= true;
9426 ConfigKeyService
*svc
= (ConfigKeyService
*)mon
->config_key_service
;
9427 err
= svc
->validate_osd_destroy(id
, uuid
);
9429 ceph_assert(err
== -ENOENT
);
9431 idempotent_cks
= true;
9434 if (!idempotent_auth
) {
9435 err
= mon
->authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9436 ceph_assert(0 == err
);
9439 if (!idempotent_cks
) {
9440 svc
->do_osd_destroy(id
, uuid
);
9443 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9444 pending_inc
.new_uuid
[id
] = uuid_d();
9446 // we can only propose_pending() once per service, otherwise we'll be
9447 // defying PaxosService and all laws of nature. Therefore, as we may
9448 // be used during 'osd purge', let's keep the caller responsible for
9450 ceph_assert(err
== 0);
9454 int OSDMonitor::prepare_command_osd_purge(
9458 ceph_assert(paxos
->is_plugged());
9459 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9461 ceph_assert(!osdmap
.is_up(id
));
9464 * This may look a bit weird, but this is what's going to happen:
9466 * 1. we make sure that removing from crush works
9467 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9468 * error, then we abort the whole operation, as no updates
9469 * have been made. However, we this function will have
9470 * side-effects, thus we need to make sure that all operations
9471 * performed henceforth will *always* succeed.
9472 * 3. we call `prepare_command_osd_remove()`. Although this
9473 * function can return an error, it currently only checks if the
9474 * osd is up - and we have made sure that it is not so, so there
9475 * is no conflict, and it is effectively an update.
9476 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9477 * the crush update we delayed from before.
9480 CrushWrapper newcrush
;
9481 _get_pending_crush(newcrush
);
9483 bool may_be_idempotent
= false;
9485 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9486 if (err
== -ENOENT
) {
9488 may_be_idempotent
= true;
9489 } else if (err
< 0) {
9490 ss
<< "error removing osd." << id
<< " from crush";
9494 // no point destroying the osd again if it has already been marked destroyed
9495 if (!osdmap
.is_destroyed(id
)) {
9496 err
= prepare_command_osd_destroy(id
, ss
);
9498 if (err
== -ENOENT
) {
9504 may_be_idempotent
= false;
9507 ceph_assert(0 == err
);
9509 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9510 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9511 << "we are idempotent." << dendl
;
9515 err
= prepare_command_osd_remove(id
);
9516 // we should not be busy, as we should have made sure this id is not up.
9517 ceph_assert(0 == err
);
9519 do_osd_crush_remove(newcrush
);
9523 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9524 const cmdmap_t
& cmdmap
)
9526 op
->mark_osdmon_event(__func__
);
9527 auto m
= op
->get_req
<MMonCommand
>();
9535 cmd_getval(cmdmap
, "format", format
, string("plain"));
9536 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
9539 cmd_getval(cmdmap
, "prefix", prefix
);
9543 bool osdid_present
= false;
9544 if (prefix
!= "osd pg-temp" &&
9545 prefix
!= "osd pg-upmap" &&
9546 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
9547 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
9549 if (osdid_present
) {
9551 oss
<< "osd." << osdid
;
9552 osd_name
= oss
.str();
9555 // Even if there's a pending state with changes that could affect
9556 // a command, considering that said state isn't yet committed, we
9557 // just don't care about those changes if the command currently being
9558 // handled acts as a no-op against the current committed state.
9559 // In a nutshell, we assume this command happens *before*.
9561 // Let me make this clearer:
9563 // - If we have only one client, and that client issues some
9564 // operation that would conflict with this operation but is
9565 // still on the pending state, then we would be sure that said
9566 // operation wouldn't have returned yet, so the client wouldn't
9567 // issue this operation (unless the client didn't wait for the
9568 // operation to finish, and that would be the client's own fault).
9570 // - If we have more than one client, each client will observe
9571 // whatever is the state at the moment of the commit. So, if we
9572 // have two clients, one issuing an unlink and another issuing a
9573 // link, and if the link happens while the unlink is still on the
9574 // pending state, from the link's point-of-view this is a no-op.
9575 // If different clients are issuing conflicting operations and
9576 // they care about that, then the clients should make sure they
9577 // enforce some kind of concurrency mechanism -- from our
9578 // perspective that's what Douglas Adams would call an SEP.
9580 // This should be used as a general guideline for most commands handled
9581 // in this function. Adapt as you see fit, but please bear in mind that
9582 // this is the expected behavior.
9585 if (prefix
== "osd setcrushmap" ||
9586 (prefix
== "osd crush set" && !osdid_present
)) {
9587 if (pending_inc
.crush
.length()) {
9588 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
9589 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9592 dout(10) << "prepare_command setting new crush map" << dendl
;
9593 bufferlist
data(m
->get_data());
9596 auto bl
= data
.cbegin();
9599 catch (const std::exception
&e
) {
9601 ss
<< "Failed to parse crushmap: " << e
.what();
9605 int64_t prior_version
= 0;
9606 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
9607 if (prior_version
== osdmap
.get_crush_version() - 1) {
9608 // see if we are a resend of the last update. this is imperfect
9609 // (multiple racing updaters may not both get reliable success)
9610 // but we expect crush updaters (via this interface) to be rare-ish.
9611 bufferlist current
, proposed
;
9612 osdmap
.crush
->encode(current
, mon
->get_quorum_con_features());
9613 crush
.encode(proposed
, mon
->get_quorum_con_features());
9614 if (current
.contents_equal(proposed
)) {
9615 dout(10) << __func__
9616 << " proposed matches current and version equals previous"
9619 ss
<< osdmap
.get_crush_version();
9623 if (prior_version
!= osdmap
.get_crush_version()) {
9625 ss
<< "prior_version " << prior_version
<< " != crush version "
9626 << osdmap
.get_crush_version();
9631 if (crush
.has_legacy_rule_ids()) {
9633 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
9636 if (!validate_crush_against_features(&crush
, ss
)) {
9641 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
9646 if (g_conf()->mon_osd_crush_smoke_test
) {
9647 // sanity check: test some inputs to make sure this map isn't
9649 dout(10) << " testing map" << dendl
;
9651 CrushTester
tester(crush
, ess
);
9652 tester
.set_min_x(0);
9653 tester
.set_max_x(50);
9654 auto start
= ceph::coarse_mono_clock::now();
9655 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
9656 auto duration
= ceph::coarse_mono_clock::now() - start
;
9658 dout(10) << " tester.test_with_fork returns " << r
9659 << ": " << ess
.str() << dendl
;
9660 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
9664 dout(10) << __func__
<< " crush somke test duration: "
9665 << duration
<< ", result: " << ess
.str() << dendl
;
9668 pending_inc
.crush
= data
;
9669 ss
<< osdmap
.get_crush_version() + 1;
9672 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
9673 CrushWrapper newcrush
;
9674 _get_pending_crush(newcrush
);
9675 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
9677 if (newcrush
.bucket_exists(bid
) &&
9678 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
9679 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
9680 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
9683 if (!validate_crush_against_features(&newcrush
, ss
)) {
9687 pending_inc
.crush
.clear();
9688 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9689 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9690 get_last_committed() + 1));
9692 } else if (prefix
== "osd crush set-device-class") {
9693 string device_class
;
9694 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9695 err
= -EINVAL
; // no value!
9700 vector
<string
> idvec
;
9701 cmd_getval(cmdmap
, "ids", idvec
);
9702 CrushWrapper newcrush
;
9703 _get_pending_crush(newcrush
);
9705 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9709 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9710 osdmap
.get_all_osds(osds
);
9713 // try traditional single osd way
9714 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9716 // ss has reason for failure
9717 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9724 for (auto &osd
: osds
) {
9725 if (!osdmap
.exists(osd
)) {
9726 ss
<< "osd." << osd
<< " does not exist. ";
9731 oss
<< "osd." << osd
;
9732 string name
= oss
.str();
9734 if (newcrush
.get_max_devices() < osd
+ 1) {
9735 newcrush
.set_max_devices(osd
+ 1);
9738 if (newcrush
.item_exists(osd
)) {
9739 action
= "updating";
9741 action
= "creating";
9742 newcrush
.set_item_name(osd
, name
);
9745 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
9746 << "' device_class '" << device_class
<< "'"
9748 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
9752 if (err
== 0 && !_have_pending_crush()) {
9754 // for single osd only, wildcard makes too much noise
9755 ss
<< "set-device-class item id " << osd
<< " name '" << name
9756 << "' device_class '" << device_class
<< "': no change. ";
9759 updated
.insert(osd
);
9764 pending_inc
.crush
.clear();
9765 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9766 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
9768 wait_for_finished_proposal(
9770 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9772 } else if (prefix
== "osd crush rm-device-class") {
9774 vector
<string
> idvec
;
9775 cmd_getval(cmdmap
, "ids", idvec
);
9776 CrushWrapper newcrush
;
9777 _get_pending_crush(newcrush
);
9780 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9785 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9786 osdmap
.get_all_osds(osds
);
9789 // try traditional single osd way
9790 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9792 // ss has reason for failure
9793 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9800 for (auto &osd
: osds
) {
9801 if (!osdmap
.exists(osd
)) {
9802 ss
<< "osd." << osd
<< " does not exist. ";
9806 auto class_name
= newcrush
.get_item_class(osd
);
9808 ss
<< "osd." << osd
<< " belongs to no class, ";
9811 // note that we do not verify if class_is_in_use here
9812 // in case the device is misclassified and user wants
9813 // to overridely reset...
9815 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
9817 // ss has reason for failure
9820 updated
.insert(osd
);
9824 pending_inc
.crush
.clear();
9825 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9826 ss
<< "done removing class of osd(s): " << updated
;
9828 wait_for_finished_proposal(
9830 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9832 } else if (prefix
== "osd crush class create") {
9833 string device_class
;
9834 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9835 err
= -EINVAL
; // no value!
9838 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
9839 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9840 << "luminous' before using crush device classes";
9844 if (!_have_pending_crush() &&
9845 _get_stable_crush().class_exists(device_class
)) {
9846 ss
<< "class '" << device_class
<< "' already exists";
9849 CrushWrapper newcrush
;
9850 _get_pending_crush(newcrush
);
9851 if (newcrush
.class_exists(device_class
)) {
9852 ss
<< "class '" << device_class
<< "' already exists";
9855 int class_id
= newcrush
.get_or_create_class_id(device_class
);
9856 pending_inc
.crush
.clear();
9857 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9858 ss
<< "created class " << device_class
<< " with id " << class_id
9861 } else if (prefix
== "osd crush class rm") {
9862 string device_class
;
9863 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9864 err
= -EINVAL
; // no value!
9867 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
9868 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
9869 << "luminous' before using crush device classes";
9874 if (!osdmap
.crush
->class_exists(device_class
)) {
9879 CrushWrapper newcrush
;
9880 _get_pending_crush(newcrush
);
9881 if (!newcrush
.class_exists(device_class
)) {
9882 err
= 0; // make command idempotent
9885 int class_id
= newcrush
.get_class_id(device_class
);
9887 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
9889 ss
<< "class '" << device_class
<< "' " << ts
.str();
9893 // check if class is used by any erasure-code-profiles
9894 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
9895 osdmap
.get_erasure_code_profiles();
9896 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
9897 #ifdef HAVE_STDLIB_MAP_SPLICING
9898 ec_profiles
.merge(old_ec_profiles
);
9900 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
9901 make_move_iterator(end(old_ec_profiles
)));
9903 list
<string
> referenced_by
;
9904 for (auto &i
: ec_profiles
) {
9905 for (auto &j
: i
.second
) {
9906 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
9907 referenced_by
.push_back(i
.first
);
9911 if (!referenced_by
.empty()) {
9913 ss
<< "class '" << device_class
9914 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
9919 newcrush
.get_devices_by_class(device_class
, &osds
);
9920 for (auto& p
: osds
) {
9921 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
9923 // ss has reason for failure
9929 // empty class, remove directly
9930 err
= newcrush
.remove_class_name(device_class
);
9932 ss
<< "class '" << device_class
<< "' cannot be removed '"
9933 << cpp_strerror(err
) << "'";
9938 pending_inc
.crush
.clear();
9939 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9940 ss
<< "removed class " << device_class
<< " with id " << class_id
9941 << " from crush map";
9943 } else if (prefix
== "osd crush class rename") {
9944 string srcname
, dstname
;
9945 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
9949 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
9954 CrushWrapper newcrush
;
9955 _get_pending_crush(newcrush
);
9956 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
9957 // suppose this is a replay and return success
9958 // so command is idempotent
9959 ss
<< "already renamed to '" << dstname
<< "'";
9964 err
= newcrush
.rename_class(srcname
, dstname
);
9966 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
9967 << cpp_strerror(err
);
9971 pending_inc
.crush
.clear();
9972 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
9973 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
9975 } else if (prefix
== "osd crush add-bucket") {
9976 // os crush add-bucket <name> <type>
9977 string name
, typestr
;
9978 vector
<string
> argvec
;
9979 cmd_getval(cmdmap
, "name", name
);
9980 cmd_getval(cmdmap
, "type", typestr
);
9981 cmd_getval(cmdmap
, "args", argvec
);
9982 map
<string
,string
> loc
;
9983 if (!argvec
.empty()) {
9984 CrushWrapper::parse_loc_map(argvec
, &loc
);
9985 dout(0) << "will create and move bucket '" << name
9986 << "' to location " << loc
<< dendl
;
9989 if (!_have_pending_crush() &&
9990 _get_stable_crush().name_exists(name
)) {
9991 ss
<< "bucket '" << name
<< "' already exists";
9995 CrushWrapper newcrush
;
9996 _get_pending_crush(newcrush
);
9998 if (newcrush
.name_exists(name
)) {
9999 ss
<< "bucket '" << name
<< "' already exists";
10002 int type
= newcrush
.get_type_id(typestr
);
10004 ss
<< "type '" << typestr
<< "' does not exist";
10009 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
10014 err
= newcrush
.add_bucket(0, 0,
10015 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
10018 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
10021 err
= newcrush
.set_item_name(bucketno
, name
);
10023 ss
<< "error setting bucket name to '" << name
<< "'";
10027 if (!loc
.empty()) {
10028 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
10030 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
10032 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
10036 ss
<< "no need to move item id " << bucketno
<< " name '" << name
10037 << "' to location " << loc
<< " in crush map";
10041 pending_inc
.crush
.clear();
10042 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10044 ss
<< "added bucket " << name
<< " type " << typestr
10045 << " to crush map";
10047 ss
<< "added bucket " << name
<< " type " << typestr
10048 << " to location " << loc
;
10051 } else if (prefix
== "osd crush rename-bucket") {
10052 string srcname
, dstname
;
10053 cmd_getval(cmdmap
, "srcname", srcname
);
10054 cmd_getval(cmdmap
, "dstname", dstname
);
10056 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
10057 if (err
== -EALREADY
) // equivalent to success for idempotency
10063 } else if (prefix
== "osd crush weight-set create" ||
10064 prefix
== "osd crush weight-set create-compat") {
10065 CrushWrapper newcrush
;
10066 _get_pending_crush(newcrush
);
10069 if (newcrush
.has_non_straw2_buckets()) {
10070 ss
<< "crush map contains one or more bucket(s) that are not straw2";
10074 if (prefix
== "osd crush weight-set create") {
10075 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
10076 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
10077 ss
<< "require_min_compat_client "
10078 << osdmap
.require_min_compat_client
10079 << " < luminous, which is required for per-pool weight-sets. "
10080 << "Try 'ceph osd set-require-min-compat-client luminous' "
10081 << "before using the new interface";
10085 string poolname
, mode
;
10086 cmd_getval(cmdmap
, "pool", poolname
);
10087 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10089 ss
<< "pool '" << poolname
<< "' not found";
10093 cmd_getval(cmdmap
, "mode", mode
);
10094 if (mode
!= "flat" && mode
!= "positional") {
10095 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10099 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10101 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10104 if (!newcrush
.create_choose_args(pool
, positions
)) {
10105 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10106 ss
<< "compat weight-set already created";
10108 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10109 << "' already created";
10113 pending_inc
.crush
.clear();
10114 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10117 } else if (prefix
== "osd crush weight-set rm" ||
10118 prefix
== "osd crush weight-set rm-compat") {
10119 CrushWrapper newcrush
;
10120 _get_pending_crush(newcrush
);
10122 if (prefix
== "osd crush weight-set rm") {
10124 cmd_getval(cmdmap
, "pool", poolname
);
10125 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10127 ss
<< "pool '" << poolname
<< "' not found";
10132 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10134 newcrush
.rm_choose_args(pool
);
10135 pending_inc
.crush
.clear();
10136 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10139 } else if (prefix
== "osd crush weight-set reweight" ||
10140 prefix
== "osd crush weight-set reweight-compat") {
10141 string poolname
, item
;
10142 vector
<double> weight
;
10143 cmd_getval(cmdmap
, "pool", poolname
);
10144 cmd_getval(cmdmap
, "item", item
);
10145 cmd_getval(cmdmap
, "weight", weight
);
10146 CrushWrapper newcrush
;
10147 _get_pending_crush(newcrush
);
10149 if (prefix
== "osd crush weight-set reweight") {
10150 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10152 ss
<< "pool '" << poolname
<< "' not found";
10156 if (!newcrush
.have_choose_args(pool
)) {
10157 ss
<< "no weight-set for pool '" << poolname
<< "'";
10161 auto arg_map
= newcrush
.choose_args_get(pool
);
10162 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10163 if (weight
.size() != (size_t)positions
) {
10164 ss
<< "must specify exact " << positions
<< " weight values";
10169 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10170 if (!newcrush
.have_choose_args(pool
)) {
10171 ss
<< "no backward-compatible weight-set";
10176 if (!newcrush
.name_exists(item
)) {
10177 ss
<< "item '" << item
<< "' does not exist";
10181 err
= newcrush
.choose_args_adjust_item_weightf(
10183 newcrush
.choose_args_get(pool
),
10184 newcrush
.get_item_id(item
),
10191 pending_inc
.crush
.clear();
10192 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10194 } else if (osdid_present
&&
10195 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10196 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10197 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10198 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10200 if (!osdmap
.exists(osdid
)) {
10203 << " does not exist. Create it before updating the crush map";
10208 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10209 ss
<< "unable to parse weight value '"
10210 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10216 vector
<string
> argvec
;
10217 cmd_getval(cmdmap
, "args", argvec
);
10218 map
<string
,string
> loc
;
10219 CrushWrapper::parse_loc_map(argvec
, &loc
);
10221 if (prefix
== "osd crush set"
10222 && !_get_stable_crush().item_exists(osdid
)) {
10224 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10225 << "' weight " << weight
<< " at location " << loc
10226 << ": does not exist";
10230 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10231 << osd_name
<< "' weight " << weight
<< " at location "
10233 CrushWrapper newcrush
;
10234 _get_pending_crush(newcrush
);
10237 if (prefix
== "osd crush set" ||
10238 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10240 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10243 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10251 if (err
== 0 && !_have_pending_crush()) {
10252 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10253 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10257 pending_inc
.crush
.clear();
10258 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10259 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10260 << weight
<< " at location " << loc
<< " to crush map";
10262 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10263 get_last_committed() + 1));
10266 } else if (prefix
== "osd crush create-or-move") {
10268 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10269 if (!osdmap
.exists(osdid
)) {
10272 << " does not exist. create it before updating the crush map";
10277 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10278 ss
<< "unable to parse weight value '"
10279 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10285 vector
<string
> argvec
;
10286 cmd_getval(cmdmap
, "args", argvec
);
10287 map
<string
,string
> loc
;
10288 CrushWrapper::parse_loc_map(argvec
, &loc
);
10290 dout(0) << "create-or-move crush item name '" << osd_name
10291 << "' initial_weight " << weight
<< " at location " << loc
10294 CrushWrapper newcrush
;
10295 _get_pending_crush(newcrush
);
10297 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10298 g_conf()->osd_crush_update_weight_set
);
10300 ss
<< "create-or-move updated item name '" << osd_name
10301 << "' weight " << weight
10302 << " at location " << loc
<< " to crush map";
10306 pending_inc
.crush
.clear();
10307 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10308 ss
<< "create-or-move updating item name '" << osd_name
10309 << "' weight " << weight
10310 << " at location " << loc
<< " to crush map";
10312 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10313 get_last_committed() + 1));
10318 } else if (prefix
== "osd crush move") {
10320 // osd crush move <name> <loc1> [<loc2> ...]
10322 vector
<string
> argvec
;
10323 cmd_getval(cmdmap
, "name", name
);
10324 cmd_getval(cmdmap
, "args", argvec
);
10325 map
<string
,string
> loc
;
10326 CrushWrapper::parse_loc_map(argvec
, &loc
);
10328 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10329 CrushWrapper newcrush
;
10330 _get_pending_crush(newcrush
);
10332 if (!newcrush
.name_exists(name
)) {
10334 ss
<< "item " << name
<< " does not exist";
10337 int id
= newcrush
.get_item_id(name
);
10339 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10341 err
= newcrush
.create_or_move_item(
10342 cct
, id
, 0, name
, loc
,
10343 g_conf()->osd_crush_update_weight_set
);
10345 err
= newcrush
.move_bucket(cct
, id
, loc
);
10348 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10349 pending_inc
.crush
.clear();
10350 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10352 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10353 get_last_committed() + 1));
10357 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10361 } else if (prefix
== "osd crush swap-bucket") {
10362 string source
, dest
;
10363 cmd_getval(cmdmap
, "source", source
);
10364 cmd_getval(cmdmap
, "dest", dest
);
10366 bool force
= false;
10367 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10369 CrushWrapper newcrush
;
10370 _get_pending_crush(newcrush
);
10371 if (!newcrush
.name_exists(source
)) {
10372 ss
<< "source item " << source
<< " does not exist";
10376 if (!newcrush
.name_exists(dest
)) {
10377 ss
<< "dest item " << dest
<< " does not exist";
10381 int sid
= newcrush
.get_item_id(source
);
10382 int did
= newcrush
.get_item_id(dest
);
10384 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10385 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10389 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10391 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10392 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10393 << "; pass --yes-i-really-mean-it to proceed anyway";
10397 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10399 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10403 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10404 pending_inc
.crush
.clear();
10405 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10406 wait_for_finished_proposal(op
,
10407 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10408 get_last_committed() + 1));
10410 } else if (prefix
== "osd crush link") {
10411 // osd crush link <name> <loc1> [<loc2> ...]
10413 cmd_getval(cmdmap
, "name", name
);
10414 vector
<string
> argvec
;
10415 cmd_getval(cmdmap
, "args", argvec
);
10416 map
<string
,string
> loc
;
10417 CrushWrapper::parse_loc_map(argvec
, &loc
);
10419 // Need an explicit check for name_exists because get_item_id returns
10421 int id
= osdmap
.crush
->get_item_id(name
);
10422 if (!osdmap
.crush
->name_exists(name
)) {
10424 ss
<< "item " << name
<< " does not exist";
10427 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10429 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10430 ss
<< "no need to move item id " << id
<< " name '" << name
10431 << "' to location " << loc
<< " in crush map";
10436 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10437 CrushWrapper newcrush
;
10438 _get_pending_crush(newcrush
);
10440 if (!newcrush
.name_exists(name
)) {
10442 ss
<< "item " << name
<< " does not exist";
10445 int id
= newcrush
.get_item_id(name
);
10446 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10447 err
= newcrush
.link_bucket(cct
, id
, loc
);
10449 ss
<< "linked item id " << id
<< " name '" << name
10450 << "' to location " << loc
<< " in crush map";
10451 pending_inc
.crush
.clear();
10452 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10454 ss
<< "cannot link item id " << id
<< " name '" << name
10455 << "' to location " << loc
;
10459 ss
<< "no need to move item id " << id
<< " name '" << name
10460 << "' to location " << loc
<< " in crush map";
10464 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10465 get_last_committed() + 1));
10467 } else if (prefix
== "osd crush rm" ||
10468 prefix
== "osd crush remove" ||
10469 prefix
== "osd crush unlink") {
10471 // osd crush rm <id> [ancestor]
10472 CrushWrapper newcrush
;
10473 _get_pending_crush(newcrush
);
10476 cmd_getval(cmdmap
, "name", name
);
10478 if (!osdmap
.crush
->name_exists(name
)) {
10480 ss
<< "device '" << name
<< "' does not appear in the crush map";
10483 if (!newcrush
.name_exists(name
)) {
10485 ss
<< "device '" << name
<< "' does not appear in the crush map";
10487 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10488 get_last_committed() + 1));
10491 int id
= newcrush
.get_item_id(name
);
10494 bool unlink_only
= prefix
== "osd crush unlink";
10495 string ancestor_str
;
10496 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10497 if (!newcrush
.name_exists(ancestor_str
)) {
10499 ss
<< "ancestor item '" << ancestor_str
10500 << "' does not appear in the crush map";
10503 ancestor
= newcrush
.get_item_id(ancestor_str
);
10506 err
= prepare_command_osd_crush_remove(
10509 (ancestor
< 0), unlink_only
);
10511 if (err
== -ENOENT
) {
10512 ss
<< "item " << id
<< " does not appear in that position";
10518 pending_inc
.new_crush_node_flags
[id
] = 0;
10519 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10521 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10522 get_last_committed() + 1));
10527 } else if (prefix
== "osd crush reweight-all") {
10528 CrushWrapper newcrush
;
10529 _get_pending_crush(newcrush
);
10531 newcrush
.reweight(cct
);
10532 pending_inc
.crush
.clear();
10533 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10534 ss
<< "reweighted crush hierarchy";
10536 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10537 get_last_committed() + 1));
10539 } else if (prefix
== "osd crush reweight") {
10540 // osd crush reweight <name> <weight>
10541 CrushWrapper newcrush
;
10542 _get_pending_crush(newcrush
);
10545 cmd_getval(cmdmap
, "name", name
);
10546 if (!newcrush
.name_exists(name
)) {
10548 ss
<< "device '" << name
<< "' does not appear in the crush map";
10552 int id
= newcrush
.get_item_id(name
);
10554 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
10559 if (!cmd_getval(cmdmap
, "weight", w
)) {
10560 ss
<< "unable to parse weight value '"
10561 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10566 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
10567 g_conf()->osd_crush_update_weight_set
);
10570 pending_inc
.crush
.clear();
10571 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10572 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
10573 << " in crush map";
10575 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10576 get_last_committed() + 1));
10578 } else if (prefix
== "osd crush reweight-subtree") {
10579 // osd crush reweight <name> <weight>
10580 CrushWrapper newcrush
;
10581 _get_pending_crush(newcrush
);
10584 cmd_getval(cmdmap
, "name", name
);
10585 if (!newcrush
.name_exists(name
)) {
10587 ss
<< "device '" << name
<< "' does not appear in the crush map";
10591 int id
= newcrush
.get_item_id(name
);
10593 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
10598 if (!cmd_getval(cmdmap
, "weight", w
)) {
10599 ss
<< "unable to parse weight value '"
10600 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10605 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
10606 g_conf()->osd_crush_update_weight_set
);
10609 pending_inc
.crush
.clear();
10610 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10611 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
10612 << " in crush map";
10614 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10615 get_last_committed() + 1));
10617 } else if (prefix
== "osd crush tunables") {
10618 CrushWrapper newcrush
;
10619 _get_pending_crush(newcrush
);
10623 cmd_getval(cmdmap
, "profile", profile
);
10624 if (profile
== "legacy" || profile
== "argonaut") {
10625 newcrush
.set_tunables_legacy();
10626 } else if (profile
== "bobtail") {
10627 newcrush
.set_tunables_bobtail();
10628 } else if (profile
== "firefly") {
10629 newcrush
.set_tunables_firefly();
10630 } else if (profile
== "hammer") {
10631 newcrush
.set_tunables_hammer();
10632 } else if (profile
== "jewel") {
10633 newcrush
.set_tunables_jewel();
10634 } else if (profile
== "optimal") {
10635 newcrush
.set_tunables_optimal();
10636 } else if (profile
== "default") {
10637 newcrush
.set_tunables_default();
10639 ss
<< "unrecognized profile '" << profile
<< "'";
10644 if (!validate_crush_against_features(&newcrush
, ss
)) {
10649 pending_inc
.crush
.clear();
10650 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10651 ss
<< "adjusted tunables profile to " << profile
;
10653 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10654 get_last_committed() + 1));
10656 } else if (prefix
== "osd crush set-tunable") {
10657 CrushWrapper newcrush
;
10658 _get_pending_crush(newcrush
);
10662 cmd_getval(cmdmap
, "tunable", tunable
);
10664 int64_t value
= -1;
10665 if (!cmd_getval(cmdmap
, "value", value
)) {
10667 ss
<< "failed to parse integer value "
10668 << cmd_vartype_stringify(cmdmap
.at("value"));
10672 if (tunable
== "straw_calc_version") {
10673 if (value
!= 0 && value
!= 1) {
10674 ss
<< "value must be 0 or 1; got " << value
;
10678 newcrush
.set_straw_calc_version(value
);
10680 ss
<< "unrecognized tunable '" << tunable
<< "'";
10685 if (!validate_crush_against_features(&newcrush
, ss
)) {
10690 pending_inc
.crush
.clear();
10691 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10692 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
10694 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10695 get_last_committed() + 1));
10698 } else if (prefix
== "osd crush rule create-simple") {
10699 string name
, root
, type
, mode
;
10700 cmd_getval(cmdmap
, "name", name
);
10701 cmd_getval(cmdmap
, "root", root
);
10702 cmd_getval(cmdmap
, "type", type
);
10703 cmd_getval(cmdmap
, "mode", mode
);
10707 if (osdmap
.crush
->rule_exists(name
)) {
10708 // The name is uniquely associated to a ruleid and the rule it contains
10709 // From the user point of view, the rule is more meaningfull.
10710 ss
<< "rule " << name
<< " already exists";
10715 CrushWrapper newcrush
;
10716 _get_pending_crush(newcrush
);
10718 if (newcrush
.rule_exists(name
)) {
10719 // The name is uniquely associated to a ruleid and the rule it contains
10720 // From the user point of view, the rule is more meaningfull.
10721 ss
<< "rule " << name
<< " already exists";
10724 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
10725 pg_pool_t::TYPE_REPLICATED
, &ss
);
10731 pending_inc
.crush
.clear();
10732 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10735 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10736 get_last_committed() + 1));
10739 } else if (prefix
== "osd crush rule create-replicated") {
10740 string name
, root
, type
, device_class
;
10741 cmd_getval(cmdmap
, "name", name
);
10742 cmd_getval(cmdmap
, "root", root
);
10743 cmd_getval(cmdmap
, "type", type
);
10744 cmd_getval(cmdmap
, "class", device_class
);
10746 if (osdmap
.crush
->rule_exists(name
)) {
10747 // The name is uniquely associated to a ruleid and the rule it contains
10748 // From the user point of view, the rule is more meaningfull.
10749 ss
<< "rule " << name
<< " already exists";
10754 CrushWrapper newcrush
;
10755 _get_pending_crush(newcrush
);
10757 if (newcrush
.rule_exists(name
)) {
10758 // The name is uniquely associated to a ruleid and the rule it contains
10759 // From the user point of view, the rule is more meaningfull.
10760 ss
<< "rule " << name
<< " already exists";
10763 int ruleno
= newcrush
.add_simple_rule(
10764 name
, root
, type
, device_class
,
10765 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
10771 pending_inc
.crush
.clear();
10772 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10775 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10776 get_last_committed() + 1));
10779 } else if (prefix
== "osd erasure-code-profile rm") {
10781 cmd_getval(cmdmap
, "name", name
);
10783 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
10786 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
10791 if (osdmap
.has_erasure_code_profile(name
) ||
10792 pending_inc
.new_erasure_code_profiles
.count(name
)) {
10793 if (osdmap
.has_erasure_code_profile(name
)) {
10794 pending_inc
.old_erasure_code_profiles
.push_back(name
);
10796 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
10797 pending_inc
.new_erasure_code_profiles
.erase(name
);
10801 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10802 get_last_committed() + 1));
10805 ss
<< "erasure-code-profile " << name
<< " does not exist";
10810 } else if (prefix
== "osd erasure-code-profile set") {
10812 cmd_getval(cmdmap
, "name", name
);
10813 vector
<string
> profile
;
10814 cmd_getval(cmdmap
, "profile", profile
);
10816 bool force
= false;
10817 cmd_getval(cmdmap
, "force", force
);
10819 map
<string
,string
> profile_map
;
10820 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
10823 if (auto found
= profile_map
.find("crush-failure-domain");
10824 found
!= profile_map
.end()) {
10825 const auto& failure_domain
= found
->second
;
10826 int failure_domain_type
= osdmap
.crush
->get_type_id(failure_domain
);
10827 if (failure_domain_type
< 0) {
10828 ss
<< "erasure-code-profile " << profile_map
10829 << " contains an invalid failure-domain " << std::quoted(failure_domain
);
10835 if (profile_map
.find("plugin") == profile_map
.end()) {
10836 ss
<< "erasure-code-profile " << profile_map
10837 << " must contain a plugin entry" << std::endl
;
10841 string plugin
= profile_map
["plugin"];
10843 if (pending_inc
.has_erasure_code_profile(name
)) {
10844 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
10847 err
= normalize_profile(name
, profile_map
, force
, &ss
);
10851 if (osdmap
.has_erasure_code_profile(name
)) {
10852 ErasureCodeProfile existing_profile_map
=
10853 osdmap
.get_erasure_code_profile(name
);
10854 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
10858 if (existing_profile_map
== profile_map
) {
10864 ss
<< "will not override erasure code profile " << name
10865 << " because the existing profile "
10866 << existing_profile_map
10867 << " is different from the proposed profile "
10873 dout(20) << "erasure code profile set " << name
<< "="
10874 << profile_map
<< dendl
;
10875 pending_inc
.set_erasure_code_profile(name
, profile_map
);
10879 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10880 get_last_committed() + 1));
10883 } else if (prefix
== "osd crush rule create-erasure") {
10884 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
10885 if (err
== -EAGAIN
)
10889 string name
, poolstr
;
10890 cmd_getval(cmdmap
, "name", name
);
10892 cmd_getval(cmdmap
, "profile", profile
);
10894 profile
= "default";
10895 if (profile
== "default") {
10896 if (!osdmap
.has_erasure_code_profile(profile
)) {
10897 if (pending_inc
.has_erasure_code_profile(profile
)) {
10898 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
10902 map
<string
,string
> profile_map
;
10903 err
= osdmap
.get_erasure_code_profile_default(cct
,
10908 err
= normalize_profile(name
, profile_map
, true, &ss
);
10911 dout(20) << "erasure code profile set " << profile
<< "="
10912 << profile_map
<< dendl
;
10913 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
10919 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
10922 case -EEXIST
: // return immediately
10923 ss
<< "rule " << name
<< " already exists";
10927 case -EALREADY
: // wait for pending to be proposed
10928 ss
<< "rule " << name
<< " already exists";
10931 default: // non recoverable error
10936 ss
<< "created rule " << name
<< " at " << rule
;
10940 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10941 get_last_committed() + 1));
10944 } else if (prefix
== "osd crush rule rm") {
10946 cmd_getval(cmdmap
, "name", name
);
10948 if (!osdmap
.crush
->rule_exists(name
)) {
10949 ss
<< "rule " << name
<< " does not exist";
10954 CrushWrapper newcrush
;
10955 _get_pending_crush(newcrush
);
10957 if (!newcrush
.rule_exists(name
)) {
10958 ss
<< "rule " << name
<< " does not exist";
10961 int ruleno
= newcrush
.get_rule_id(name
);
10962 ceph_assert(ruleno
>= 0);
10964 // make sure it is not in use.
10965 // FIXME: this is ok in some situations, but let's not bother with that
10967 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
10968 if (osdmap
.crush_rule_in_use(ruleset
)) {
10969 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
10974 err
= newcrush
.remove_rule(ruleno
);
10979 pending_inc
.crush
.clear();
10980 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
10983 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10984 get_last_committed() + 1));
10987 } else if (prefix
== "osd crush rule rename") {
10990 cmd_getval(cmdmap
, "srcname", srcname
);
10991 cmd_getval(cmdmap
, "dstname", dstname
);
10992 if (srcname
.empty() || dstname
.empty()) {
10993 ss
<< "must specify both source rule name and destination rule name";
10997 if (srcname
== dstname
) {
10998 ss
<< "destination rule name is equal to source rule name";
11003 CrushWrapper newcrush
;
11004 _get_pending_crush(newcrush
);
11005 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
11006 // srcname does not exist and dstname already exists
11007 // suppose this is a replay and return success
11008 // (so this command is idempotent)
11009 ss
<< "already renamed to '" << dstname
<< "'";
11014 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
11016 // ss has reason for failure
11019 pending_inc
.crush
.clear();
11020 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
11022 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11023 get_last_committed() + 1));
11026 } else if (prefix
== "osd setmaxosd") {
11028 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
11029 ss
<< "unable to parse 'newmax' value '"
11030 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
11035 if (newmax
> g_conf()->mon_max_osd
) {
11037 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
11038 << g_conf()->mon_max_osd
<< ")";
11042 // Don't allow shrinking OSD number as this will cause data loss
11043 // and may cause kernel crashes.
11044 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11045 if (newmax
< osdmap
.get_max_osd()) {
11046 // Check if the OSDs exist between current max and new value.
11047 // If there are any OSDs exist, then don't allow shrinking number
11049 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
11050 if (osdmap
.exists(i
)) {
11052 ss
<< "cannot shrink max_osd to " << newmax
11053 << " because osd." << i
<< " (and possibly others) still in use";
11059 pending_inc
.new_max_osd
= newmax
;
11060 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
11062 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11063 get_last_committed() + 1));
11066 } else if (prefix
== "osd set-full-ratio" ||
11067 prefix
== "osd set-backfillfull-ratio" ||
11068 prefix
== "osd set-nearfull-ratio") {
11070 if (!cmd_getval(cmdmap
, "ratio", n
)) {
11071 ss
<< "unable to parse 'ratio' value '"
11072 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
11076 if (prefix
== "osd set-full-ratio")
11077 pending_inc
.new_full_ratio
= n
;
11078 else if (prefix
== "osd set-backfillfull-ratio")
11079 pending_inc
.new_backfillfull_ratio
= n
;
11080 else if (prefix
== "osd set-nearfull-ratio")
11081 pending_inc
.new_nearfull_ratio
= n
;
11082 ss
<< prefix
<< " " << n
;
11084 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11085 get_last_committed() + 1));
11087 } else if (prefix
== "osd set-require-min-compat-client") {
11089 cmd_getval(cmdmap
, "version", v
);
11090 ceph_release_t vno
= ceph_release_from_name(v
);
11092 ss
<< "version " << v
<< " is not recognized";
11097 newmap
.deepish_copy_from(osdmap
);
11098 newmap
.apply_incremental(pending_inc
);
11099 newmap
.require_min_compat_client
= vno
;
11100 auto mvno
= newmap
.get_min_compat_client();
11102 ss
<< "osdmap current utilizes features that require " << mvno
11103 << "; cannot set require_min_compat_client below that to " << vno
;
11108 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11111 mon
->get_combined_feature_map(&m
);
11112 uint64_t features
= ceph_release_features(ceph::to_integer
<int>(vno
));
11116 CEPH_ENTITY_TYPE_CLIENT
,
11117 CEPH_ENTITY_TYPE_MDS
,
11118 CEPH_ENTITY_TYPE_MGR
}) {
11119 auto p
= m
.m
.find(type
);
11120 if (p
== m
.m
.end()) {
11123 for (auto& q
: p
->second
) {
11124 uint64_t missing
= ~q
.first
& features
;
11127 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11132 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11133 << "(s) look like " << ceph_release_name(
11134 ceph_release_from_features(q
.first
))
11135 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11141 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11146 ss
<< "set require_min_compat_client to " << vno
;
11147 pending_inc
.new_require_min_compat_client
= vno
;
11149 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11150 get_last_committed() + 1));
11152 } else if (prefix
== "osd pause") {
11153 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11155 } else if (prefix
== "osd unpause") {
11156 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11158 } else if (prefix
== "osd set") {
11160 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11163 cmd_getval(cmdmap
, "key", key
);
11164 if (key
== "pause")
11165 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11166 else if (key
== "noup")
11167 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11168 else if (key
== "nodown")
11169 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11170 else if (key
== "noout")
11171 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11172 else if (key
== "noin")
11173 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11174 else if (key
== "nobackfill")
11175 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11176 else if (key
== "norebalance")
11177 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11178 else if (key
== "norecover")
11179 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11180 else if (key
== "noscrub")
11181 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11182 else if (key
== "nodeep-scrub")
11183 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11184 else if (key
== "notieragent")
11185 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11186 else if (key
== "nosnaptrim")
11187 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11188 else if (key
== "pglog_hardlimit") {
11189 if (!osdmap
.get_num_up_osds() && !sure
) {
11190 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11191 << "--yes-i-really-mean-it if you really wish to continue.";
11195 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11196 // we are reusing a jewel feature bit that was retired in luminous.
11197 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11198 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11200 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11202 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11207 ss
<< "unrecognized flag '" << key
<< "'";
11211 } else if (prefix
== "osd unset") {
11213 cmd_getval(cmdmap
, "key", key
);
11214 if (key
== "pause")
11215 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11216 else if (key
== "noup")
11217 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11218 else if (key
== "nodown")
11219 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11220 else if (key
== "noout")
11221 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11222 else if (key
== "noin")
11223 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11224 else if (key
== "nobackfill")
11225 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11226 else if (key
== "norebalance")
11227 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11228 else if (key
== "norecover")
11229 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11230 else if (key
== "noscrub")
11231 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11232 else if (key
== "nodeep-scrub")
11233 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11234 else if (key
== "notieragent")
11235 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11236 else if (key
== "nosnaptrim")
11237 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11239 ss
<< "unrecognized flag '" << key
<< "'";
11243 } else if (prefix
== "osd require-osd-release") {
11245 cmd_getval(cmdmap
, "release", release
);
11247 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11248 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11250 ss
<< "unrecognized release " << release
;
11254 if (rel
== osdmap
.require_osd_release
) {
11259 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
11260 if (!osdmap
.get_num_up_osds() && !sure
) {
11261 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11262 << "--yes-i-really-mean-it if you really wish to continue.";
11266 if (rel
== ceph_release_t::mimic
) {
11267 if (!mon
->monmap
->get_required_features().contains_all(
11268 ceph::features::mon::FEATURE_MIMIC
)) {
11269 ss
<< "not all mons are mimic";
11273 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_MIMIC
))
11275 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11279 } else if (rel
== ceph_release_t::nautilus
) {
11280 if (!mon
->monmap
->get_required_features().contains_all(
11281 ceph::features::mon::FEATURE_NAUTILUS
)) {
11282 ss
<< "not all mons are nautilus";
11286 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_NAUTILUS
))
11288 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
11292 } else if (rel
== ceph_release_t::octopus
) {
11293 if (!mon
->monmap
->get_required_features().contains_all(
11294 ceph::features::mon::FEATURE_OCTOPUS
)) {
11295 ss
<< "not all mons are octopus";
11299 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_OCTOPUS
))
11301 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11306 ss
<< "not supported for this release yet";
11310 if (rel
< osdmap
.require_osd_release
) {
11311 ss
<< "require_osd_release cannot be lowered once it has been set";
11315 pending_inc
.new_require_osd_release
= rel
;
11317 } else if (prefix
== "osd down" ||
11318 prefix
== "osd out" ||
11319 prefix
== "osd in" ||
11320 prefix
== "osd rm" ||
11321 prefix
== "osd stop") {
11325 bool verbose
= true;
11326 bool definitely_dead
= false;
11328 vector
<string
> idvec
;
11329 cmd_getval(cmdmap
, "ids", idvec
);
11330 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11331 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11332 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11337 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11338 if (prefix
== "osd in") {
11339 // touch out osds only
11340 osdmap
.get_out_existing_osds(osds
);
11342 osdmap
.get_all_osds(osds
);
11345 verbose
= false; // so the output is less noisy.
11347 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11349 ss
<< "invalid osd id" << osd
;
11352 } else if (!osdmap
.exists(osd
)) {
11353 ss
<< "osd." << osd
<< " does not exist. ";
11360 for (auto &osd
: osds
) {
11361 if (prefix
== "osd down") {
11362 if (osdmap
.is_down(osd
)) {
11364 ss
<< "osd." << osd
<< " is already down. ";
11366 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11367 ss
<< "marked down osd." << osd
<< ". ";
11370 if (definitely_dead
) {
11371 if (!pending_inc
.new_xinfo
.count(osd
)) {
11372 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11374 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11377 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11379 } else if (prefix
== "osd out") {
11380 if (osdmap
.is_out(osd
)) {
11382 ss
<< "osd." << osd
<< " is already out. ";
11384 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11385 if (osdmap
.osd_weight
[osd
]) {
11386 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11387 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11389 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11391 ss
<< "marked out osd." << osd
<< ". ";
11392 std::ostringstream msg
;
11393 msg
<< "Client " << op
->get_session()->entity_name
11394 << " marked osd." << osd
<< " out";
11395 if (osdmap
.is_up(osd
)) {
11396 msg
<< ", while it was still marked up";
11398 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11399 msg
<< ", after it was down for " << int(period
.sec())
11403 mon
->clog
->info() << msg
.str();
11406 } else if (prefix
== "osd in") {
11407 if (osdmap
.is_in(osd
)) {
11409 ss
<< "osd." << osd
<< " is already in. ";
11411 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11412 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11413 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11414 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11416 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11418 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11420 ss
<< "marked in osd." << osd
<< ". ";
11423 } else if (prefix
== "osd rm") {
11424 err
= prepare_command_osd_remove(osd
);
11426 if (err
== -EBUSY
) {
11429 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11431 ceph_assert(err
== 0);
11433 ss
<< ", osd." << osd
;
11435 ss
<< "removed osd." << osd
;
11439 } else if (prefix
== "osd stop") {
11440 if (osdmap
.is_stop(osd
)) {
11442 ss
<< "osd." << osd
<< " is already stopped. ";
11443 } else if (osdmap
.is_down(osd
)) {
11444 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11445 ss
<< "stop down osd." << osd
<< ". ";
11448 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11449 ss
<< "stop osd." << osd
<< ". ";
11457 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11458 get_last_committed() + 1));
11461 } else if (prefix
== "osd set-group" ||
11462 prefix
== "osd unset-group" ||
11463 prefix
== "osd add-noup" ||
11464 prefix
== "osd add-nodown" ||
11465 prefix
== "osd add-noin" ||
11466 prefix
== "osd add-noout" ||
11467 prefix
== "osd rm-noup" ||
11468 prefix
== "osd rm-nodown" ||
11469 prefix
== "osd rm-noin" ||
11470 prefix
== "osd rm-noout") {
11471 bool do_set
= prefix
== "osd set-group" ||
11472 prefix
.find("add") != string::npos
;
11474 unsigned flags
= 0;
11475 vector
<string
> who
;
11476 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11477 cmd_getval(cmdmap
, "flags", flag_str
);
11478 cmd_getval(cmdmap
, "who", who
);
11479 vector
<string
> raw_flags
;
11480 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11481 for (auto& f
: raw_flags
) {
11483 flags
|= CEPH_OSD_NOUP
;
11484 else if (f
== "nodown")
11485 flags
|= CEPH_OSD_NODOWN
;
11486 else if (f
== "noin")
11487 flags
|= CEPH_OSD_NOIN
;
11488 else if (f
== "noout")
11489 flags
|= CEPH_OSD_NOOUT
;
11491 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11492 << "{noup,nodown,noin,noout}";
11498 cmd_getval(cmdmap
, "ids", who
);
11499 if (prefix
.find("noup") != string::npos
)
11500 flags
= CEPH_OSD_NOUP
;
11501 else if (prefix
.find("nodown") != string::npos
)
11502 flags
= CEPH_OSD_NODOWN
;
11503 else if (prefix
.find("noin") != string::npos
)
11504 flags
= CEPH_OSD_NOIN
;
11505 else if (prefix
.find("noout") != string::npos
)
11506 flags
= CEPH_OSD_NOOUT
;
11508 ceph_assert(0 == "Unreachable!");
11511 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11516 ss
<< "must specify at least one or more targets to set/unset";
11521 set
<int> crush_nodes
;
11522 set
<int> device_classes
;
11523 for (auto& w
: who
) {
11524 if (w
== "any" || w
== "all" || w
== "*") {
11525 osdmap
.get_all_osds(osds
);
11528 std::stringstream ts
;
11529 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11531 } else if (osdmap
.crush
->name_exists(w
)) {
11532 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11533 } else if (osdmap
.crush
->class_exists(w
)) {
11534 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11536 ss
<< "unable to parse osd id or crush node or device class: "
11537 << "\"" << w
<< "\". ";
11540 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11541 // ss has reason for failure
11546 for (auto osd
: osds
) {
11547 if (!osdmap
.exists(osd
)) {
11548 ss
<< "osd." << osd
<< " does not exist. ";
11552 if (flags
& CEPH_OSD_NOUP
) {
11553 any
|= osdmap
.is_noup_by_osd(osd
) ?
11554 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
11555 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
11557 if (flags
& CEPH_OSD_NODOWN
) {
11558 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11559 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
11560 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
11562 if (flags
& CEPH_OSD_NOIN
) {
11563 any
|= osdmap
.is_noin_by_osd(osd
) ?
11564 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
11565 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
11567 if (flags
& CEPH_OSD_NOOUT
) {
11568 any
|= osdmap
.is_noout_by_osd(osd
) ?
11569 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
11570 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
11573 if (flags
& CEPH_OSD_NOUP
) {
11574 any
|= osdmap
.is_noup_by_osd(osd
) ?
11575 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
11576 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
11578 if (flags
& CEPH_OSD_NODOWN
) {
11579 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11580 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
11581 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
11583 if (flags
& CEPH_OSD_NOIN
) {
11584 any
|= osdmap
.is_noin_by_osd(osd
) ?
11585 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
11586 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
11588 if (flags
& CEPH_OSD_NOOUT
) {
11589 any
|= osdmap
.is_noout_by_osd(osd
) ?
11590 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
11591 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
11595 for (auto& id
: crush_nodes
) {
11596 auto old_flags
= osdmap
.get_crush_node_flags(id
);
11597 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
11598 pending_flags
|= old_flags
; // adopt existing flags first!
11600 pending_flags
|= flags
;
11602 pending_flags
&= ~flags
;
11606 for (auto& id
: device_classes
) {
11607 auto old_flags
= osdmap
.get_device_class_flags(id
);
11608 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
11609 pending_flags
|= old_flags
;
11611 pending_flags
|= flags
;
11613 pending_flags
&= ~flags
;
11619 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11620 get_last_committed() + 1));
11623 } else if (prefix
== "osd pg-temp") {
11625 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11626 ss
<< "unable to parse 'pgid' value '"
11627 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11632 if (!pgid
.parse(pgidstr
.c_str())) {
11633 ss
<< "invalid pgid '" << pgidstr
<< "'";
11637 if (!osdmap
.pg_exists(pgid
)) {
11638 ss
<< "pg " << pgid
<< " does not exist";
11642 if (pending_inc
.new_pg_temp
.count(pgid
)) {
11643 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
11644 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11648 vector
<int64_t> id_vec
;
11649 vector
<int32_t> new_pg_temp
;
11650 cmd_getval(cmdmap
, "id", id_vec
);
11651 if (id_vec
.empty()) {
11652 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
11653 ss
<< "done cleaning up pg_temp of " << pgid
;
11656 for (auto osd
: id_vec
) {
11657 if (!osdmap
.exists(osd
)) {
11658 ss
<< "osd." << osd
<< " does not exist";
11662 new_pg_temp
.push_back(osd
);
11665 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11666 if ((int)new_pg_temp
.size() < pool_min_size
) {
11667 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
11668 << pool_min_size
<< ")";
11673 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11674 if ((int)new_pg_temp
.size() > pool_size
) {
11675 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
11676 << pool_size
<< ")";
11681 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
11682 new_pg_temp
.begin(), new_pg_temp
.end());
11683 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
11685 } else if (prefix
== "osd primary-temp") {
11687 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11688 ss
<< "unable to parse 'pgid' value '"
11689 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11694 if (!pgid
.parse(pgidstr
.c_str())) {
11695 ss
<< "invalid pgid '" << pgidstr
<< "'";
11699 if (!osdmap
.pg_exists(pgid
)) {
11700 ss
<< "pg " << pgid
<< " does not exist";
11706 if (!cmd_getval(cmdmap
, "id", osd
)) {
11707 ss
<< "unable to parse 'id' value '"
11708 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11712 if (osd
!= -1 && !osdmap
.exists(osd
)) {
11713 ss
<< "osd." << osd
<< " does not exist";
11718 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
11719 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
11720 ss
<< "require_min_compat_client "
11721 << osdmap
.require_min_compat_client
11722 << " < firefly, which is required for primary-temp";
11727 pending_inc
.new_primary_temp
[pgid
] = osd
;
11728 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
11730 } else if (prefix
== "pg repeer") {
11733 cmd_getval(cmdmap
, "pgid", pgidstr
);
11734 if (!pgid
.parse(pgidstr
.c_str())) {
11735 ss
<< "invalid pgid '" << pgidstr
<< "'";
11739 if (!osdmap
.pg_exists(pgid
)) {
11740 ss
<< "pg '" << pgidstr
<< "' does not exist";
11744 vector
<int> acting
;
11746 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
11749 ss
<< "pg currently has no primary";
11752 if (acting
.size() > 1) {
11753 // map to just primary; it will map back to what it wants
11754 pending_inc
.new_pg_temp
[pgid
] = { primary
};
11756 // hmm, pick another arbitrary osd to induce a change. Note
11757 // that this won't work if there is only one suitable OSD in the cluster.
11760 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
11761 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
11764 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
11770 ss
<< "not enough up OSDs in the cluster to force repeer";
11775 } else if (prefix
== "osd pg-upmap" ||
11776 prefix
== "osd rm-pg-upmap" ||
11777 prefix
== "osd pg-upmap-items" ||
11778 prefix
== "osd rm-pg-upmap-items") {
11779 if (osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
11780 ss
<< "min_compat_client "
11781 << osdmap
.require_min_compat_client
11782 << " < luminous, which is required for pg-upmap. "
11783 << "Try 'ceph osd set-require-min-compat-client luminous' "
11784 << "before using the new interface";
11788 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
11789 if (err
== -EAGAIN
)
11794 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11795 ss
<< "unable to parse 'pgid' value '"
11796 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11801 if (!pgid
.parse(pgidstr
.c_str())) {
11802 ss
<< "invalid pgid '" << pgidstr
<< "'";
11806 if (!osdmap
.pg_exists(pgid
)) {
11807 ss
<< "pg " << pgid
<< " does not exist";
11811 if (pending_inc
.old_pools
.count(pgid
.pool())) {
11812 ss
<< "pool of " << pgid
<< " is pending removal";
11815 wait_for_finished_proposal(op
,
11816 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
11824 OP_RM_PG_UPMAP_ITEMS
,
11827 if (prefix
== "osd pg-upmap") {
11828 option
= OP_PG_UPMAP
;
11829 } else if (prefix
== "osd rm-pg-upmap") {
11830 option
= OP_RM_PG_UPMAP
;
11831 } else if (prefix
== "osd pg-upmap-items") {
11832 option
= OP_PG_UPMAP_ITEMS
;
11834 option
= OP_RM_PG_UPMAP_ITEMS
;
11837 // check pending upmap changes
11839 case OP_PG_UPMAP
: // fall through
11840 case OP_RM_PG_UPMAP
:
11841 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
11842 pending_inc
.old_pg_upmap
.count(pgid
)) {
11843 dout(10) << __func__
<< " waiting for pending update on "
11845 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11850 case OP_PG_UPMAP_ITEMS
: // fall through
11851 case OP_RM_PG_UPMAP_ITEMS
:
11852 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
11853 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
11854 dout(10) << __func__
<< " waiting for pending update on "
11856 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11862 ceph_abort_msg("invalid option");
11868 vector
<int64_t> id_vec
;
11869 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
11870 ss
<< "unable to parse 'id' value(s) '"
11871 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11876 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11877 if ((int)id_vec
.size() < pool_min_size
) {
11878 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
11879 << pool_min_size
<< ")";
11884 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11885 if ((int)id_vec
.size() > pool_size
) {
11886 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
11887 << pool_size
<< ")";
11892 vector
<int32_t> new_pg_upmap
;
11893 for (auto osd
: id_vec
) {
11894 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
11895 ss
<< "osd." << osd
<< " does not exist";
11899 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
11900 if (it
!= new_pg_upmap
.end()) {
11901 ss
<< "osd." << osd
<< " already exists, ";
11904 new_pg_upmap
.push_back(osd
);
11907 if (new_pg_upmap
.empty()) {
11908 ss
<< "no valid upmap items(pairs) is specified";
11913 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
11914 new_pg_upmap
.begin(), new_pg_upmap
.end());
11915 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
11919 case OP_RM_PG_UPMAP
:
11921 pending_inc
.old_pg_upmap
.insert(pgid
);
11922 ss
<< "clear " << pgid
<< " pg_upmap mapping";
11926 case OP_PG_UPMAP_ITEMS
:
11928 vector
<int64_t> id_vec
;
11929 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
11930 ss
<< "unable to parse 'id' value(s) '"
11931 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11936 if (id_vec
.size() % 2) {
11937 ss
<< "you must specify pairs of osd ids to be remapped";
11942 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11943 if ((int)(id_vec
.size() / 2) > pool_size
) {
11944 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
11945 << pool_size
<< ")";
11950 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
11951 ostringstream items
;
11953 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
11957 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
11960 if (!osdmap
.exists(from
)) {
11961 ss
<< "osd." << from
<< " does not exist";
11965 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
11966 ss
<< "osd." << to
<< " does not exist";
11970 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
11971 auto it
= std::find(new_pg_upmap_items
.begin(),
11972 new_pg_upmap_items
.end(), entry
);
11973 if (it
!= new_pg_upmap_items
.end()) {
11974 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
11977 new_pg_upmap_items
.push_back(entry
);
11978 items
<< from
<< "->" << to
<< ",";
11980 string
out(items
.str());
11981 out
.resize(out
.size() - 1); // drop last ','
11984 if (new_pg_upmap_items
.empty()) {
11985 ss
<< "no valid upmap items(pairs) is specified";
11990 pending_inc
.new_pg_upmap_items
[pgid
] =
11991 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
11992 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
11993 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
11997 case OP_RM_PG_UPMAP_ITEMS
:
11999 pending_inc
.old_pg_upmap_items
.insert(pgid
);
12000 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
12005 ceph_abort_msg("invalid option");
12009 } else if (prefix
== "osd primary-affinity") {
12011 if (!cmd_getval(cmdmap
, "id", id
)) {
12012 ss
<< "invalid osd id value '"
12013 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12018 if (!cmd_getval(cmdmap
, "weight", w
)) {
12019 ss
<< "unable to parse 'weight' value '"
12020 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12024 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
12026 ss
<< "weight must be >= 0";
12030 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12031 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12032 ss
<< "require_min_compat_client "
12033 << osdmap
.require_min_compat_client
12034 << " < firefly, which is required for primary-affinity";
12038 if (osdmap
.exists(id
)) {
12039 pending_inc
.new_primary_affinity
[id
] = ww
;
12040 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << ios::hex
<< ww
<< ios::dec
<< ")";
12042 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12043 get_last_committed() + 1));
12046 ss
<< "osd." << id
<< " does not exist";
12050 } else if (prefix
== "osd reweight") {
12052 if (!cmd_getval(cmdmap
, "id", id
)) {
12053 ss
<< "unable to parse osd id value '"
12054 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12059 if (!cmd_getval(cmdmap
, "weight", w
)) {
12060 ss
<< "unable to parse weight value '"
12061 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12065 long ww
= (int)((double)CEPH_OSD_IN
*w
);
12067 ss
<< "weight must be >= 0";
12071 if (osdmap
.exists(id
)) {
12072 pending_inc
.new_weight
[id
] = ww
;
12073 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
12075 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12076 get_last_committed() + 1));
12079 ss
<< "osd." << id
<< " does not exist";
12083 } else if (prefix
== "osd reweightn") {
12084 map
<int32_t, uint32_t> weights
;
12085 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
12087 ss
<< "unable to parse 'weights' value '"
12088 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
12091 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
12092 wait_for_finished_proposal(
12094 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
12096 } else if (prefix
== "osd lost") {
12098 if (!cmd_getval(cmdmap
, "id", id
)) {
12099 ss
<< "unable to parse osd id value '"
12100 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12105 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12107 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
12108 "--yes-i-really-mean-it if you really do.";
12111 } else if (!osdmap
.exists(id
)) {
12112 ss
<< "osd." << id
<< " does not exist";
12115 } else if (!osdmap
.is_down(id
)) {
12116 ss
<< "osd." << id
<< " is not down";
12120 epoch_t e
= osdmap
.get_info(id
).down_at
;
12121 pending_inc
.new_lost
[id
] = e
;
12122 ss
<< "marked osd lost in epoch " << e
;
12124 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12125 get_last_committed() + 1));
12129 } else if (prefix
== "osd destroy-actual" ||
12130 prefix
== "osd purge-actual" ||
12131 prefix
== "osd purge-new") {
12132 /* Destroying an OSD means that we don't expect to further make use of
12133 * the OSDs data (which may even become unreadable after this operation),
12134 * and that we are okay with scrubbing all its cephx keys and config-key
12135 * data (which may include lockbox keys, thus rendering the osd's data
12138 * The OSD will not be removed. Instead, we will mark it as destroyed,
12139 * such that a subsequent call to `create` will not reuse the osd id.
12140 * This will play into being able to recreate the OSD, at the same
12141 * crush location, with minimal data movement.
12144 // make sure authmon is writeable.
12145 if (!mon
->authmon()->is_writeable()) {
12146 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12147 << "osd destroy" << dendl
;
12148 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12153 if (!cmd_getval(cmdmap
, "id", id
)) {
12154 auto p
= cmdmap
.find("id");
12155 if (p
== cmdmap
.end()) {
12156 ss
<< "no osd id specified";
12158 ss
<< "unable to parse osd id value '"
12159 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12165 bool is_destroy
= (prefix
== "osd destroy-actual");
12167 ceph_assert("osd purge-actual" == prefix
||
12168 "osd purge-new" == prefix
);
12172 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12174 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12175 << "This will mean real, permanent data loss, as well "
12176 << "as deletion of cephx and lockbox keys. "
12177 << "Pass --yes-i-really-mean-it if you really do.";
12180 } else if (!osdmap
.exists(id
)) {
12181 ss
<< "osd." << id
<< " does not exist";
12182 err
= 0; // idempotent
12184 } else if (osdmap
.is_up(id
)) {
12185 ss
<< "osd." << id
<< " is not `down`.";
12188 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12189 ss
<< "destroyed osd." << id
;
12194 if (prefix
== "osd purge-new" &&
12195 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12196 ss
<< "osd." << id
<< " is not new";
12201 bool goto_reply
= false;
12205 err
= prepare_command_osd_destroy(id
, ss
);
12206 // we checked above that it should exist.
12207 ceph_assert(err
!= -ENOENT
);
12209 err
= prepare_command_osd_purge(id
, ss
);
12210 if (err
== -ENOENT
) {
12212 ss
<< "osd." << id
<< " does not exist.";
12218 if (err
< 0 || goto_reply
) {
12223 ss
<< "destroyed osd." << id
;
12225 ss
<< "purged osd." << id
;
12229 wait_for_finished_proposal(op
,
12230 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12231 force_immediate_propose();
12234 } else if (prefix
== "osd new") {
12236 // make sure authmon is writeable.
12237 if (!mon
->authmon()->is_writeable()) {
12238 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12239 << "osd new" << dendl
;
12240 mon
->authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12244 map
<string
,string
> param_map
;
12246 bufferlist bl
= m
->get_data();
12247 string param_json
= bl
.to_str();
12248 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12250 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12254 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12257 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12270 if (err
== EEXIST
) {
12271 // idempotent operation
12276 wait_for_finished_proposal(op
,
12277 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12278 get_last_committed() + 1));
12279 force_immediate_propose();
12282 } else if (prefix
== "osd create") {
12284 // optional id provided?
12285 int64_t id
= -1, cmd_id
= -1;
12286 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12288 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12292 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12297 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12298 if (!uuid
.parse(uuidstr
.c_str())) {
12299 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12303 // we only care about the id if we also have the uuid, to
12304 // ensure the operation's idempotency.
12308 int32_t new_id
= -1;
12309 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12311 if (err
== -EAGAIN
) {
12312 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12315 // a check has failed; reply to the user.
12318 } else if (err
== EEXIST
) {
12319 // this is an idempotent operation; we can go ahead and reply.
12321 f
->open_object_section("created_osd");
12322 f
->dump_int("osdid", new_id
);
12323 f
->close_section();
12333 string empty_device_class
;
12334 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12337 f
->open_object_section("created_osd");
12338 f
->dump_int("osdid", new_id
);
12339 f
->close_section();
12345 wait_for_finished_proposal(op
,
12346 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12347 get_last_committed() + 1));
12350 } else if (prefix
== "osd blacklist clear") {
12351 pending_inc
.new_blacklist
.clear();
12352 std::list
<std::pair
<entity_addr_t
,utime_t
> > blacklist
;
12353 osdmap
.get_blacklist(&blacklist
);
12354 for (const auto &entry
: blacklist
) {
12355 pending_inc
.old_blacklist
.push_back(entry
.first
);
12357 ss
<< " removed all blacklist entries";
12359 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12360 get_last_committed() + 1));
12362 } else if (prefix
== "osd blacklist") {
12364 cmd_getval(cmdmap
, "addr", addrstr
);
12365 entity_addr_t addr
;
12366 if (!addr
.parse(addrstr
.c_str(), 0)) {
12367 ss
<< "unable to parse address " << addrstr
;
12372 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12373 // always blacklist type ANY
12374 addr
.set_type(entity_addr_t::TYPE_ANY
);
12376 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12379 string blacklistop
;
12380 cmd_getval(cmdmap
, "blacklistop", blacklistop
);
12381 if (blacklistop
== "add") {
12382 utime_t expires
= ceph_clock_now();
12384 // default one hour
12385 cmd_getval(cmdmap
, "expire", d
,
12386 g_conf()->mon_osd_blacklist_default_expire
);
12389 pending_inc
.new_blacklist
[addr
] = expires
;
12392 // cancel any pending un-blacklisting request too
12393 auto it
= std::find(pending_inc
.old_blacklist
.begin(),
12394 pending_inc
.old_blacklist
.end(), addr
);
12395 if (it
!= pending_inc
.old_blacklist
.end()) {
12396 pending_inc
.old_blacklist
.erase(it
);
12400 ss
<< "blacklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12402 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12403 get_last_committed() + 1));
12405 } else if (blacklistop
== "rm") {
12406 if (osdmap
.is_blacklisted(addr
) ||
12407 pending_inc
.new_blacklist
.count(addr
)) {
12408 if (osdmap
.is_blacklisted(addr
))
12409 pending_inc
.old_blacklist
.push_back(addr
);
12411 pending_inc
.new_blacklist
.erase(addr
);
12412 ss
<< "un-blacklisting " << addr
;
12414 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12415 get_last_committed() + 1));
12418 ss
<< addr
<< " isn't blacklisted";
12423 } else if (prefix
== "osd pool mksnap") {
12425 cmd_getval(cmdmap
, "pool", poolstr
);
12426 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12428 ss
<< "unrecognized pool '" << poolstr
<< "'";
12433 cmd_getval(cmdmap
, "snap", snapname
);
12434 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12435 if (p
->is_unmanaged_snaps_mode()) {
12436 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12439 } else if (p
->snap_exists(snapname
.c_str())) {
12440 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12443 } else if (p
->is_tier()) {
12444 ss
<< "pool " << poolstr
<< " is a cache tier";
12449 if (pending_inc
.new_pools
.count(pool
))
12450 pp
= &pending_inc
.new_pools
[pool
];
12452 pp
= &pending_inc
.new_pools
[pool
];
12455 if (pp
->snap_exists(snapname
.c_str())) {
12456 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12458 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
12459 pp
->set_snap_epoch(pending_inc
.epoch
);
12460 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
12463 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12464 get_last_committed() + 1));
12466 } else if (prefix
== "osd pool rmsnap") {
12468 cmd_getval(cmdmap
, "pool", poolstr
);
12469 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12471 ss
<< "unrecognized pool '" << poolstr
<< "'";
12476 cmd_getval(cmdmap
, "snap", snapname
);
12477 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12478 if (p
->is_unmanaged_snaps_mode()) {
12479 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12482 } else if (!p
->snap_exists(snapname
.c_str())) {
12483 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
12488 if (pending_inc
.new_pools
.count(pool
))
12489 pp
= &pending_inc
.new_pools
[pool
];
12491 pp
= &pending_inc
.new_pools
[pool
];
12494 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
12496 pp
->remove_snap(sn
);
12497 pp
->set_snap_epoch(pending_inc
.epoch
);
12498 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
12500 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
12503 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12504 get_last_committed() + 1));
12506 } else if (prefix
== "osd pool create") {
12507 int64_t pg_num
, pg_num_min
;
12509 cmd_getval(cmdmap
, "pg_num", pg_num
, int64_t(0));
12510 cmd_getval(cmdmap
, "pgp_num", pgp_num
, pg_num
);
12511 cmd_getval(cmdmap
, "pg_num_min", pg_num_min
, int64_t(0));
12513 string pool_type_str
;
12514 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
12515 if (pool_type_str
.empty())
12516 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
12519 cmd_getval(cmdmap
, "pool", poolstr
);
12520 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12521 if (pool_id
>= 0) {
12522 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12523 if (pool_type_str
!= p
->get_type_name()) {
12524 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
12527 ss
<< "pool '" << poolstr
<< "' already exists";
12534 if (pool_type_str
== "replicated") {
12535 pool_type
= pg_pool_t::TYPE_REPLICATED
;
12536 } else if (pool_type_str
== "erasure") {
12537 pool_type
= pg_pool_t::TYPE_ERASURE
;
12539 ss
<< "unknown pool type '" << pool_type_str
<< "'";
12544 bool implicit_rule_creation
= false;
12545 int64_t expected_num_objects
= 0;
12547 cmd_getval(cmdmap
, "rule", rule_name
);
12548 string erasure_code_profile
;
12549 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
12551 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
12552 if (erasure_code_profile
== "")
12553 erasure_code_profile
= "default";
12554 //handle the erasure code profile
12555 if (erasure_code_profile
== "default") {
12556 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
12557 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
12558 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
12562 map
<string
,string
> profile_map
;
12563 err
= osdmap
.get_erasure_code_profile_default(cct
,
12568 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
12569 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
12573 if (rule_name
== "") {
12574 implicit_rule_creation
= true;
12575 if (erasure_code_profile
== "default") {
12576 rule_name
= "erasure-code";
12578 dout(1) << "implicitly use rule named after the pool: "
12579 << poolstr
<< dendl
;
12580 rule_name
= poolstr
;
12583 cmd_getval(cmdmap
, "expected_num_objects",
12584 expected_num_objects
, int64_t(0));
12586 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12587 // and put expected_num_objects to rule field
12588 if (erasure_code_profile
!= "") { // cmd is from CLI
12589 if (rule_name
!= "") {
12591 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
12592 if (interr
.length()) {
12593 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
12598 rule_name
= erasure_code_profile
;
12599 } else { // cmd is well-formed
12600 cmd_getval(cmdmap
, "expected_num_objects",
12601 expected_num_objects
, int64_t(0));
12605 if (!implicit_rule_creation
&& rule_name
!= "") {
12607 err
= get_crush_rule(rule_name
, &rule
, &ss
);
12608 if (err
== -EAGAIN
) {
12609 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12616 if (expected_num_objects
< 0) {
12617 ss
<< "'expected_num_objects' must be non-negative";
12623 osdmap
.get_all_osds(osds
);
12624 bool has_filestore_osd
= std::any_of(osds
.begin(), osds
.end(), [this](int osd
) {
12626 if (!get_osd_objectstore_type(osd
, &type
)) {
12627 return type
== "filestore";
12633 if (has_filestore_osd
&&
12634 expected_num_objects
> 0 &&
12635 cct
->_conf
->filestore_merge_threshold
> 0) {
12636 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12641 if (has_filestore_osd
&&
12642 expected_num_objects
== 0 &&
12643 cct
->_conf
->filestore_merge_threshold
< 0) {
12644 int osds
= osdmap
.get_num_osds();
12646 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12647 if (!sure
&& osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
12648 ss
<< "For better initial performance on pools expected to store a "
12649 << "large number of objects, consider supplying the "
12650 << "expected_num_objects parameter when creating the pool."
12651 << " Pass --yes-i-really-mean-it to ignore it";
12657 int64_t fast_read_param
;
12658 cmd_getval(cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
12659 FastReadType fast_read
= FAST_READ_DEFAULT
;
12660 if (fast_read_param
== 0)
12661 fast_read
= FAST_READ_OFF
;
12662 else if (fast_read_param
> 0)
12663 fast_read
= FAST_READ_ON
;
12665 int64_t repl_size
= 0;
12666 cmd_getval(cmdmap
, "size", repl_size
);
12667 int64_t target_size_bytes
= 0;
12668 double target_size_ratio
= 0.0;
12669 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
12670 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
12672 string pg_autoscale_mode
;
12673 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
12675 err
= prepare_new_pool(poolstr
,
12676 -1, // default crush rule
12678 pg_num
, pgp_num
, pg_num_min
,
12679 repl_size
, target_size_bytes
, target_size_ratio
,
12680 erasure_code_profile
, pool_type
,
12681 (uint64_t)expected_num_objects
,
12688 ss
<< "pool '" << poolstr
<< "' already exists";
12691 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12700 ss
<< "pool '" << poolstr
<< "' created";
12703 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12704 get_last_committed() + 1));
12707 } else if (prefix
== "osd pool delete" ||
12708 prefix
== "osd pool rm") {
12709 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12710 string poolstr
, poolstr2
, sure
;
12711 cmd_getval(cmdmap
, "pool", poolstr
);
12712 cmd_getval(cmdmap
, "pool2", poolstr2
);
12713 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12715 ss
<< "pool '" << poolstr
<< "' does not exist";
12720 bool force_no_fake
= false;
12721 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
12722 bool force
= false;
12723 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
12724 if (poolstr2
!= poolstr
||
12725 (!force
&& !force_no_fake
)) {
12726 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12727 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12728 << "followed by --yes-i-really-really-mean-it.";
12732 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
12733 if (err
== -EAGAIN
) {
12734 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12740 } else if (prefix
== "osd pool rename") {
12741 string srcpoolstr
, destpoolstr
;
12742 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
12743 cmd_getval(cmdmap
, "destpool", destpoolstr
);
12744 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
12745 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
12747 if (pool_src
< 0) {
12748 if (pool_dst
>= 0) {
12749 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12750 // of operations, assume this rename succeeded, as it is not changing
12751 // the current state. Make sure we output something understandable
12752 // for whoever is issuing the command, if they are paying attention,
12753 // in case it was not intentional; or to avoid a "wtf?" and a bug
12754 // report in case it was intentional, while expecting a failure.
12755 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
12756 << destpoolstr
<< "' does -- assuming successful rename";
12759 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
12763 } else if (pool_dst
>= 0) {
12764 // source pool exists and so does the destination pool
12765 ss
<< "pool '" << destpoolstr
<< "' already exists";
12770 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
12772 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
12774 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
12775 << cpp_strerror(ret
);
12778 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
12779 get_last_committed() + 1));
12782 } else if (prefix
== "osd pool set") {
12783 err
= prepare_command_pool_set(cmdmap
, ss
);
12784 if (err
== -EAGAIN
)
12790 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12791 get_last_committed() + 1));
12793 } else if (prefix
== "osd tier add") {
12794 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12795 if (err
== -EAGAIN
)
12800 cmd_getval(cmdmap
, "pool", poolstr
);
12801 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12803 ss
<< "unrecognized pool '" << poolstr
<< "'";
12807 string tierpoolstr
;
12808 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12809 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12810 if (tierpool_id
< 0) {
12811 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12815 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12817 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12820 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
12824 // make sure new tier is empty
12825 string force_nonempty
;
12826 cmd_getval(cmdmap
, "force_nonempty", force_nonempty
);
12827 const pool_stat_t
*pstats
= mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
12828 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
12829 force_nonempty
!= "--force-nonempty") {
12830 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
12834 if (tp
->is_erasure()) {
12835 ss
<< "tier pool '" << tierpoolstr
12836 << "' is an ec pool, which cannot be a tier";
12840 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
12841 ((force_nonempty
!= "--force-nonempty") ||
12842 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
12843 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
12848 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12849 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12850 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
12851 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12854 np
->tiers
.insert(tierpool_id
);
12855 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
12856 ntp
->tier_of
= pool_id
;
12857 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
12858 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12859 get_last_committed() + 1));
12861 } else if (prefix
== "osd tier remove" ||
12862 prefix
== "osd tier rm") {
12864 cmd_getval(cmdmap
, "pool", poolstr
);
12865 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12867 ss
<< "unrecognized pool '" << poolstr
<< "'";
12871 string tierpoolstr
;
12872 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12873 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12874 if (tierpool_id
< 0) {
12875 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
12879 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12881 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
12884 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
12888 if (p
->tiers
.count(tierpool_id
) == 0) {
12889 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
12893 if (tp
->tier_of
!= pool_id
) {
12894 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
12895 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
12896 // be scary about it; this is an inconsistency and bells must go off
12897 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12901 if (p
->read_tier
== tierpool_id
) {
12902 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
12907 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12908 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
12909 if (np
->tiers
.count(tierpool_id
) == 0 ||
12910 ntp
->tier_of
!= pool_id
||
12911 np
->read_tier
== tierpool_id
) {
12912 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12915 np
->tiers
.erase(tierpool_id
);
12917 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
12918 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12919 get_last_committed() + 1));
12921 } else if (prefix
== "osd tier set-overlay") {
12922 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12923 if (err
== -EAGAIN
)
12928 cmd_getval(cmdmap
, "pool", poolstr
);
12929 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12931 ss
<< "unrecognized pool '" << poolstr
<< "'";
12935 string overlaypoolstr
;
12936 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
12937 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
12938 if (overlaypool_id
< 0) {
12939 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
12943 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12945 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
12946 ceph_assert(overlay_p
);
12947 if (p
->tiers
.count(overlaypool_id
) == 0) {
12948 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
12952 if (p
->read_tier
== overlaypool_id
) {
12954 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12957 if (p
->has_read_tier()) {
12958 ss
<< "pool '" << poolstr
<< "' has overlay '"
12959 << osdmap
.get_pool_name(p
->read_tier
)
12960 << "'; please remove-overlay first";
12966 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
12967 np
->read_tier
= overlaypool_id
;
12968 np
->write_tier
= overlaypool_id
;
12969 np
->set_last_force_op_resend(pending_inc
.epoch
);
12970 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
12971 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
12972 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
12973 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
12974 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
12975 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
12976 get_last_committed() + 1));
12978 } else if (prefix
== "osd tier remove-overlay" ||
12979 prefix
== "osd tier rm-overlay") {
12981 cmd_getval(cmdmap
, "pool", poolstr
);
12982 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12984 ss
<< "unrecognized pool '" << poolstr
<< "'";
12988 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12990 if (!p
->has_read_tier()) {
12992 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
12996 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
13001 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13002 if (np
->has_read_tier()) {
13003 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
13004 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
13005 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13007 if (np
->has_write_tier()) {
13008 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
13009 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
13010 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13012 np
->clear_read_tier();
13013 np
->clear_write_tier();
13014 np
->set_last_force_op_resend(pending_inc
.epoch
);
13015 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13016 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13017 get_last_committed() + 1));
13019 } else if (prefix
== "osd tier cache-mode") {
13020 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13021 if (err
== -EAGAIN
)
13026 cmd_getval(cmdmap
, "pool", poolstr
);
13027 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13029 ss
<< "unrecognized pool '" << poolstr
<< "'";
13033 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13035 if (!p
->is_tier()) {
13036 ss
<< "pool '" << poolstr
<< "' is not a tier";
13041 cmd_getval(cmdmap
, "mode", modestr
);
13042 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13043 if (int(mode
) < 0) {
13044 ss
<< "'" << modestr
<< "' is not a valid cache mode";
13050 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13052 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
13053 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
13054 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
13058 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13059 mode
!= pg_pool_t::CACHEMODE_NONE
&&
13060 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13061 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
13063 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
13064 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13069 // pool already has this cache-mode set and there are no pending changes
13070 if (p
->cache_mode
== mode
&&
13071 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
13072 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
13073 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
13074 << " to " << pg_pool_t::get_cache_mode_name(mode
);
13079 /* Mode description:
13081 * none: No cache-mode defined
13082 * forward: Forward all reads and writes to base pool [removed]
13083 * writeback: Cache writes, promote reads from base pool
13084 * readonly: Forward writes to base pool
13085 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13086 * proxy: Proxy all reads and writes to base pool
13087 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13089 * Hence, these are the allowed transitions:
13092 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13093 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13094 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13095 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13096 * writeback -> readproxy || proxy
13100 // We check if the transition is valid against the current pool mode, as
13101 // it is the only committed state thus far. We will blantly squash
13102 // whatever mode is on the pending state.
13104 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
13105 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13106 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
13107 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
13108 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
13109 << "' pool; only '"
13110 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
13115 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
13116 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13117 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13118 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13120 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
13121 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13122 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13124 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13125 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13126 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13128 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13129 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13130 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13131 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13133 const pool_stat_t
* pstats
=
13134 mon
->mgrstatmon()->get_pool_stat(pool_id
);
13136 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13137 ss
<< "unable to set cache-mode '"
13138 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13139 << "': dirty objects found";
13145 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13146 np
->cache_mode
= mode
;
13147 // set this both when moving to and from cache_mode NONE. this is to
13148 // capture legacy pools that were set up before this flag existed.
13149 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13150 ss
<< "set cache-mode for pool '" << poolstr
13151 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13152 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13153 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13154 ceph_assert(base_pool
);
13155 if (base_pool
->read_tier
== pool_id
||
13156 base_pool
->write_tier
== pool_id
)
13157 ss
<<" (WARNING: pool is still configured as read or write tier)";
13159 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13160 get_last_committed() + 1));
13162 } else if (prefix
== "osd tier add-cache") {
13163 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13164 if (err
== -EAGAIN
)
13169 cmd_getval(cmdmap
, "pool", poolstr
);
13170 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13172 ss
<< "unrecognized pool '" << poolstr
<< "'";
13176 string tierpoolstr
;
13177 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13178 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13179 if (tierpool_id
< 0) {
13180 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13184 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13186 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13189 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13194 if (!cmd_getval(cmdmap
, "size", size
)) {
13195 ss
<< "unable to parse 'size' value '"
13196 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13200 // make sure new tier is empty
13201 const pool_stat_t
*pstats
=
13202 mon
->mgrstatmon()->get_pool_stat(tierpool_id
);
13203 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13204 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13208 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13209 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13210 if (int(mode
) < 0) {
13211 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13215 HitSet::Params hsp
;
13216 auto& cache_hit_set_type
=
13217 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13218 if (cache_hit_set_type
== "bloom") {
13219 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13220 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13221 hsp
= HitSet::Params(bsp
);
13222 } else if (cache_hit_set_type
== "explicit_hash") {
13223 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13224 } else if (cache_hit_set_type
== "explicit_object") {
13225 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13227 ss
<< "osd tier cache default hit set type '"
13228 << cache_hit_set_type
<< "' is not a known type";
13233 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13234 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13235 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13236 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13239 np
->tiers
.insert(tierpool_id
);
13240 np
->read_tier
= np
->write_tier
= tierpool_id
;
13241 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13242 np
->set_last_force_op_resend(pending_inc
.epoch
);
13243 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13244 ntp
->tier_of
= pool_id
;
13245 ntp
->cache_mode
= mode
;
13246 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13247 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13248 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13249 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13250 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13251 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13252 ntp
->hit_set_params
= hsp
;
13253 ntp
->target_max_bytes
= size
;
13254 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13255 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13256 get_last_committed() + 1));
13258 } else if (prefix
== "osd pool set-quota") {
13260 cmd_getval(cmdmap
, "pool", poolstr
);
13261 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13263 ss
<< "unrecognized pool '" << poolstr
<< "'";
13269 cmd_getval(cmdmap
, "field", field
);
13270 if (field
!= "max_objects" && field
!= "max_bytes") {
13271 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13276 // val could contain unit designations, so we treat as a string
13278 cmd_getval(cmdmap
, "val", val
);
13281 if (field
== "max_objects") {
13282 value
= strict_sistrtoll(val
.c_str(), &tss
);
13283 } else if (field
== "max_bytes") {
13284 value
= strict_iecstrtoll(val
.c_str(), &tss
);
13286 ceph_abort_msg("unrecognized option");
13288 if (!tss
.empty()) {
13289 ss
<< "error parsing value '" << val
<< "': " << tss
;
13294 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13295 if (field
== "max_objects") {
13296 pi
->quota_max_objects
= value
;
13297 } else if (field
== "max_bytes") {
13298 pi
->quota_max_bytes
= value
;
13300 ceph_abort_msg("unrecognized option");
13302 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13304 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13305 get_last_committed() + 1));
13307 } else if (prefix
== "osd pool application enable" ||
13308 prefix
== "osd pool application disable" ||
13309 prefix
== "osd pool application set" ||
13310 prefix
== "osd pool application rm") {
13311 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13312 if (err
== -EAGAIN
) {
13314 } else if (err
< 0) {
13319 } else if (prefix
== "osd force-create-pg") {
13322 cmd_getval(cmdmap
, "pgid", pgidstr
);
13323 if (!pgid
.parse(pgidstr
.c_str())) {
13324 ss
<< "invalid pgid '" << pgidstr
<< "'";
13328 if (!osdmap
.pg_exists(pgid
)) {
13329 ss
<< "pg " << pgid
<< " should not exist";
13334 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13336 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13337 << "that the cluster will give up ever trying to recover the lost data. Do this "
13338 << "only if you are certain that all copies of the PG are in fact lost and you are "
13339 << "willing to accept that the data is permanently destroyed. Pass "
13340 << "--yes-i-really-mean-it to proceed.";
13346 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13347 auto emplaced
= creating_pgs
.pgs
.emplace(
13349 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13350 ceph_clock_now()));
13351 creating_now
= emplaced
.second
;
13353 if (creating_now
) {
13354 ss
<< "pg " << pgidstr
<< " now creating, ok";
13355 // set the pool's CREATING flag so that (1) the osd won't ignore our
13356 // create message and (2) we won't propose any future pg_num changes
13357 // until after the PG has been instantiated.
13358 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13359 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13361 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13365 ss
<< "pg " << pgid
<< " already creating";
13375 if (err
< 0 && rs
.length() == 0)
13376 rs
= cpp_strerror(err
);
13377 mon
->reply_command(op
, err
, rs
, rdata
, get_last_committed());
13382 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13383 get_last_committed() + 1));
13387 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// Capability gate for an incoming MPoolOp: sends an -EPERM reply via
// _pool_op_reply() when the requesting session lacks the required caps.
// NOTE(review): this chunk is a garbled extraction — statements are wrapped
// across lines and several original lines are missing (gaps in the embedded
// numbering, e.g. 13398->13403 drops the null-session guard's condition and
// return); confirm control flow against the upstream source.
13391 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
// Record this event on the op for op-tracker history/debugging.
13393 op
->mark_osdmon_event(__func__
);
13395 auto m
= op
->get_req
<MPoolOp
>();
13396 MonSession
*session
= op
->get_session();
// Reply -EPERM at the current osdmap epoch — presumably the no-session
// (or otherwise unauthorized) path; the guard line was dropped by extraction.
13398 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Unmanaged-snap create/delete go through a stricter, dedicated check than
// the plain MON_CAP_W test below.
13403 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13404 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13406 const std::string
* pool_name
= nullptr;
13407 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
// Resolve the pool name only if the pool id exists in the current osdmap.
13408 if (pg_pool
!= nullptr) {
13409 pool_name
= &osdmap
.get_pool_name(m
->pool
);
// Delegate the decision to the helper, passing the requester's identity,
// caps and peer address (trailing arguments lost to extraction).
13412 if (!is_unmanaged_snap_op_permitted(cct
, mon
->key_server
,
13413 session
->entity_name
, session
->caps
,
13414 session
->get_peer_socket_addr(),
// Audit-log the rejected request (message plus the session's caps), then
// reply -EPERM.
13416 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13417 << "privileges. message: " << *m
<< std::endl
13418 << "caps: " << session
->caps
<< dendl
;
13419 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// All other pool ops only require write capability on the "osd" service.
13425 if (!session
->is_capable("osd", MON_CAP_W
)) {
13426 dout(0) << "got pool op from entity with insufficient privileges. "
13427 << "message: " << *m
<< std::endl
13428 << "caps: " << session
->caps
<< dendl
;
13429 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Read-only preprocessing of an MPoolOp: answers requests that can be
// decided against the committed osdmap without a paxos proposal (wrong fsid,
// nonexistent pool, idempotent no-ops), otherwise defers to prepare_pool_op.
// NOTE(review): garbled extraction — the switch header, return statements
// and several braces are missing (gaps in the embedded line numbering);
// verify against the upstream source.
13438 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
13440 op
->mark_osdmon_event(__func__
);
13441 auto m
= op
->get_req
<MPoolOp
>();
// Capability check first; enforce_pool_op_caps has already replied -EPERM
// when it returns true.
13443 if (enforce_pool_op_caps(op
)) {
// Drop requests addressed to a different cluster fsid with -EINVAL.
13447 if (m
->fsid
!= mon
->monmap
->fsid
) {
13448 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
13449 << " != " << mon
->monmap
->fsid
<< " for " << *m
<< dendl
;
13450 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Pool creation has its own preprocessing path.
13454 if (m
->op
== POOL_OP_CREATE
)
13455 return preprocess_pool_op_create(op
);
13457 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
// Operating on a nonexistent pool: DELETE is idempotent (reply 0),
// everything else gets -ENOENT.
13458 if (p
== nullptr) {
13459 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
13460 if (m
->op
== POOL_OP_DELETE
) {
13461 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13463 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13468 // check if the snap and snapname exist
13469 bool snap_exists
= false;
13470 if (p
->snap_exists(m
->name
.c_str()))
13471 snap_exists
= true;
// Pool-managed snap creation is invalid on unmanaged-snaps pools and on
// tier pools; creating an already-existing snap is an idempotent success.
13474 case POOL_OP_CREATE_SNAP
:
13475 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
13476 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13480 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Unmanaged snaps cannot be created on a pool already in 'pool' snaps mode
// (the two snapshot modes are mutually exclusive).
13484 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13485 if (p
->is_pool_snaps_mode()) {
13486 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Deleting a pool-managed snap: invalid in unmanaged mode; deleting a
// snap that does not exist is an idempotent success.
13490 case POOL_OP_DELETE_SNAP
:
13491 if (p
->is_unmanaged_snaps_mode()) {
13492 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13495 if (!snap_exists
) {
13496 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Deleting an unmanaged snap: invalid in pool-snaps mode; already-removed
// snaps (per _is_removed_snap) succeed idempotently.
13500 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13501 if (p
->is_pool_snaps_mode()) {
13502 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13505 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
13506 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Pool delete: if no pool by that name remains, the delete already
// happened — reply success. NOTE(review): upstream presumably checks the
// name does NOT resolve; the surrounding lines are missing, so the exact
// condition/return structure here cannot be confirmed from this view.
13510 case POOL_OP_DELETE
:
13511 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
13512 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// AUID changes are no longer supported (handled in prepare_pool_op with
// -EOPNOTSUPP); the body of this case is not visible here.
13516 case POOL_OP_AUID_CHANGE
:
// Returns whether (pool, snap) is already removed as far as the committed
// osdmap is concerned: the pool no longer exists, the snap sits in the
// osdmap's removed_snaps_queue, or it falls inside a purged-snap interval.
// NOTE(review): the actual `return` lines were dropped by the extraction
// (gaps after 13530/13535/13542); intent inferred from the dout text.
13526 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
// Pool does not exist at all -> treat the snap as removed ("pool dne").
13528 if (!osdmap
.have_pg_pool(pool
)) {
13529 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13530 << " - pool dne" << dendl
;
// Snap is queued for removal in the committed map.
13533 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
13534 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13535 << " - in osdmap removed_snaps_queue" << dendl
;
// Otherwise consult the purged-snap records; lookup_purged_snap fills the
// half-open interval [begin, end) containing the snap on success.
13538 snapid_t begin
, end
;
13539 int r
= lookup_purged_snap(pool
, snap
, &begin
, &end
);
13541 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13542 << " - purged, [" << begin
<< "," << end
<< ")" << dendl
;
// Returns whether (pool, snap) is removed in the *pending* (not yet
// committed) increment: either the whole pool is pending deletion, or the
// snap is in pending_inc's new_removed_snaps.
// NOTE(review): the `return` lines were dropped by extraction; intent
// inferred from the dout text.
13548 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
// Whole pool queued for deletion in the pending increment.
13550 if (pending_inc
.old_pools
.count(pool
)) {
13551 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13552 << " - pool pending deletion" << dendl
;
// Snap individually queued for removal in the pending increment.
13555 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
13556 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13557 << " - in pending new_removed_snaps" << dendl
;
// Preprocessing for POOL_OP_CREATE: if a pool with the requested name
// already exists, creation is an idempotent success (reply 0) and no
// proposal is needed.
// NOTE(review): the existence check's `if` line and the final return were
// dropped by the extraction; confirm against upstream.
13563 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
13565 op
->mark_osdmon_event(__func__
);
13566 auto m
= op
->get_req
<MPoolOp
>();
// Look up the requested pool name in the committed map; >= 0 means it
// already exists.
13567 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
13569 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Mutating handler for MPoolOp: dispatches CREATE/DELETE to their own
// helpers, then applies snapshot operations to a *projected* copy of the
// pool (pending change if one exists, else the committed pool), records the
// result in pending_inc, and replies after the proposal commits.
// NOTE(review): garbled extraction — both `switch (m->op)` headers, many
// braces, `ret` assignments and `break`/`return` lines are missing (gaps in
// the embedded numbering); statement order is preserved but control flow
// must be confirmed against the upstream source.
13576 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
13578 op
->mark_osdmon_event(__func__
);
13579 auto m
= op
->get_req
<MPoolOp
>();
13580 dout(10) << "prepare_pool_op " << *m
<< dendl
;
// Pool create/delete have dedicated prepare paths.
13581 if (m
->op
== POOL_OP_CREATE
) {
13582 return prepare_pool_op_create(op
);
13583 } else if (m
->op
== POOL_OP_DELETE
) {
13584 return prepare_pool_op_delete(op
);
// Everything below is a snapshot-related op against an existing pool.
13588 bool changed
= false;
13590 if (!osdmap
.have_pg_pool(m
->pool
)) {
13591 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13595 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
// First validation pass against the *committed* pool state.
// Pool-managed snaps are not allowed on tier pools.
13598 case POOL_OP_CREATE_SNAP
:
13599 if (pool
->is_tier()) {
13601 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13603 } // else, fall through
// Create/delete of a pool-managed snap that is already in the requested
// state is idempotent (body of that branch lost to extraction).
13604 case POOL_OP_DELETE_SNAP
:
13605 if (!pool
->is_unmanaged_snaps_mode()) {
13606 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
13607 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
13608 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
13616 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13619 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13620 // we won't allow removal of an unmanaged snapshot from a pool
13621 // not in unmanaged snaps mode.
13622 if (!pool
->is_unmanaged_snaps_mode()) {
13623 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
13627 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13628 // but we will allow creating an unmanaged snapshot on any pool
13629 // as long as it is not in 'pool' snaps mode.
13630 if (pool
->is_pool_snaps_mode()) {
13631 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13636 // projected pool info
// pp is the projected pg_pool_t: start from the pending version if this
// pool already has an uncommitted change, else copy the committed pool.
13638 if (pending_inc
.new_pools
.count(m
->pool
))
13639 pp
= pending_inc
.new_pools
[m
->pool
];
13641 pp
= *osdmap
.get_pg_pool(m
->pool
);
// Payload returned to the client (e.g. the new snapid for unmanaged create).
13643 bufferlist reply_data
;
13645 // pool snaps vs unmanaged snaps are mutually exclusive
// Re-validate mode exclusivity against the *projected* state, since a
// pending change may have switched the pool's snapshot mode.
13647 case POOL_OP_CREATE_SNAP
:
13648 case POOL_OP_DELETE_SNAP
:
13649 if (pp
.is_unmanaged_snaps_mode()) {
13655 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13656 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13657 if (pp
.is_pool_snaps_mode()) {
// Apply the operation to the projected pool.
13664 case POOL_OP_CREATE_SNAP
:
13665 if (!pp
.snap_exists(m
->name
.c_str())) {
13666 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
13667 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
13668 << " seq " << pp
.get_snap_epoch() << dendl
;
// Delete a pool-managed snap by name; the removed snapid is queued in
// pending_inc.new_removed_snaps for OSD-side trimming.
13673 case POOL_OP_DELETE_SNAP
:
13675 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
13678 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
// Allocate a new unmanaged snapid (legacy encoding pre-octopus) and return
// it to the client in reply_data.
13684 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13686 uint64_t snapid
= pp
.add_unmanaged_snap(
13687 osdmap
.require_osd_release
< ceph_release_t::octopus
);
13688 encode(snapid
, reply_data
);
13693 case POOL_OP_DELETE_UNMANAGED_SNAP
:
// Only act if the snap is not already removed (committed or pending).
13694 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
13695 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
// A snapid beyond the pool's snap_seq was never allocated -> -ENOENT.
13696 if (m
->snapid
> pp
.get_snap_seq()) {
13697 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13700 pp
.remove_unmanaged_snap(
13702 osdmap
.require_osd_release
< ceph_release_t::octopus
);
13703 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
13704 // also record the new seq as purged: this avoids a discontinuity
13705 // after all of the snaps have been purged, since the seq assigned
13706 // during removal lives in the same namespace as the actual snaps.
13707 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
// AUID support was removed; the op is explicitly unsupported.
13712 case POOL_OP_AUID_CHANGE
:
13713 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
// If anything changed, bump the snap epoch on the projected pool and stage
// it in pending_inc (the `if (changed)` guard was lost to extraction).
13722 pp
.set_snap_epoch(pending_inc
.epoch
);
13723 pending_inc
.new_pools
[m
->pool
] = pp
;
// Reply (with ret and reply_data) once the proposal commits.
13727 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
// Mutating handler for POOL_OP_CREATE: stages the new pool via
// prepare_new_pool(op) and replies with its result (err) once the pending
// increment's proposal commits.
// NOTE(review): braces and the final `return` were dropped by extraction.
13731 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
13733 op
->mark_osdmon_event(__func__
);
13734 int err
= prepare_new_pool(op
);
// C_PoolOp sends the pool-op reply (with err at pending_inc.epoch) after
// the proposal is committed.
13735 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
// Safety checks before removing a pool; writes a human-readable explanation
// into *ss and (per the pattern of the visible branches) returns an error
// code when deletion must be refused.
// NOTE(review): the second line of the signature (the `ostream *ss`
// parameter, original line ~13740) and the per-branch `return` lines were
// dropped by extraction; confirm against upstream.
13739 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
13742 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
13744 // If the Pool is in use by CephFS, refuse to delete it
// Check the *pending* fsmap so an in-flight CephFS change is respected.
13745 FSMap
const &pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13746 if (pending_fsmap
.pool_in_use(pool_id
)) {
13747 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
// A pool that is itself a tier of another pool cannot be removed directly.
13751 if (pool
.tier_of
>= 0) {
13752 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
13753 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
// A pool with cache tiers attached cannot be removed; list the tiers.
13756 if (!pool
.tiers
.empty()) {
13757 *ss
<< "pool '" << poolstr
<< "' has tiers";
13758 for(auto tier
: pool
.tiers
) {
13759 *ss
<< " " << osdmap
.get_pool_name(tier
);
// Global safety switch: mon_allow_pool_delete must be set to true.
13764 if (!g_conf()->mon_allow_pool_delete
) {
13765 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
// Per-pool safety switch: the nodelete flag must be unset first.
13769 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
13770 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
// All checks passed — report success.
13774 *ss
<< "pool '" << poolstr
<< "' removed";
13779 * Check if it is safe to add a tier to a base pool
13782 * True if the operation should proceed, false if we should abort here
13783 * (abort doesn't necessarily mean error, could be idempotency)
// Validates that tier_pool may become a cache tier of base_pool; on veto a
// human-readable reason is written to *ss. NOTE(review): the extract drops
// lines between checks (e.g. 13796 -> 13801), so the err/return handling of
// each failing branch is not visible here -- verify against the full source.
13785 bool OSDMonitor::_check_become_tier(
13786 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
13787 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
// Resolve names once for use in all the messages below.
13791 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
13792 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
// A pool used directly by CephFS (per the pending fsmap) cannot become a tier.
13794 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13795 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
13796 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
// Idempotency: the tier relationship may already exist; assert it is
// consistent in both directions before reporting it.
13801 if (base_pool
->tiers
.count(tier_pool_id
)) {
13802 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
13804 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
13805 << base_pool_name
<< "'";
// Chained tiering (base pool itself being a tier) is not supported.
13809 if (base_pool
->is_tier()) {
13810 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
13811 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
13812 << "multiple tiers are not yet supported.";
// The candidate tier must not have tiers of its own; list any it has.
13817 if (tier_pool
->has_tiers()) {
13818 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
13819 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
13820 it
!= tier_pool
->tiers
.end(); ++it
)
13821 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
13822 *ss
<< " multiple tiers are not yet supported.";
// The candidate tier must not already be a tier of some other pool.
13827 if (tier_pool
->is_tier()) {
13828 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
13829 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
13840 * Check if it is safe to remove a tier from this base pool
13843 * True if the operation should proceed, false if we should abort here
13844 * (abort doesn't necessarily mean error, could be idempotency)
// Validates that a cache tier may be detached from base_pool; on veto the
// reason is written to *ss and an error code to *err. NOTE(review): the
// extract drops the lines that assign *err and return (e.g. 13861 -> 13867),
// so the exact codes are not visible here -- verify against the full source.
13846 bool OSDMonitor::_check_remove_tier(
13847 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
13848 const pg_pool_t
*tier_pool
,
13849 int *err
, ostream
*ss
) const
13851 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
13853 // Apply CephFS-specific checks
// Use the *pending* fsmap so an in-flight filesystem change is honored.
13854 const FSMap
&pending_fsmap
= mon
->mdsmon()->get_pending_fsmap();
13855 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
13856 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
13857 // If the underlying pool is erasure coded and does not allow EC
13858 // overwrites, we can't permit the removal of the replicated tier that
13859 // CephFS relies on to access it
13860 *ss
<< "pool '" << base_pool_name
<<
13861 "' does not allow EC overwrites and is in use by CephFS"
// A writeback cache tier may still hold dirty data CephFS depends on;
// require the cache mode to be changed and the cache flushed first.
// (tier_pool may be null, hence the null check before dereferencing.)
13867 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
13868 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
13869 "tier is still in use as a writeback cache. Change the cache "
13870 "mode and flush the cache before removing it";
// Stage the removal of a pool in pending_inc, after re-validating that
// deletion is allowed. If mon_fake_pool_delete is set (and no_fake is
// false), the pool is merely renamed to "<name>.<id>.DELETED" instead of
// being destroyed. Also scrubs every map structure that can reference the
// pool: pg_temp, primary_temp, pg_upmap, pg_upmap_items (both committed and
// pending) and any CRUSH choose_args. NOTE(review): this extract is lossy
// (original line numbers skip, e.g. 13885 -> 13889), so early returns and
// some loop increments/braces are not visible here.
13880 int OSDMonitor::_prepare_remove_pool(
13881 int64_t pool
, ostream
*ss
, bool no_fake
)
13883 dout(10) << __func__
<< " " << pool
<< dendl
;
// Validate against the committed map first...
13884 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
13885 int r
= _check_remove_pool(pool
, *p
, ss
);
// ...and, if this pool also has pending modifications, validate against the
// pending copy too.
13889 auto new_pool
= pending_inc
.new_pools
.find(pool
);
13890 if (new_pool
!= pending_inc
.new_pools
.end()) {
13891 // if there is a problem with the pending info, wait and retry
13893 const auto& p
= new_pool
->second
;
13894 int r
= _check_remove_pool(pool
, p
, ss
);
// Idempotency: nothing to do if removal is already staged.
13899 if (pending_inc
.old_pools
.count(pool
)) {
13900 dout(10) << __func__
<< " " << pool
<< " already pending removal"
// Fake deletion: rename instead of destroying (data is preserved).
13905 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
13906 string old_name
= osdmap
.get_pool_name(pool
);
13907 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
13908 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
13909 << old_name
<< " -> " << new_name
<< dendl
;
13910 pending_inc
.new_pool_names
[pool
] = new_name
;
// Real deletion: stage the pool id for removal in the next epoch.
13915 pending_inc
.old_pools
.insert(pool
);
13917 // remove any pg_temp mappings for this pool
13918 for (auto p
= osdmap
.pg_temp
->begin();
13919 p
!= osdmap
.pg_temp
->end();
13921 if (p
->first
.pool() == pool
) {
13922 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
13923 << p
->first
<< dendl
;
// An empty pg_temp entry in the incremental clears the mapping.
13924 pending_inc
.new_pg_temp
[p
->first
].clear();
13927 // remove any primary_temp mappings for this pool
13928 for (auto p
= osdmap
.primary_temp
->begin();
13929 p
!= osdmap
.primary_temp
->end();
13931 if (p
->first
.pool() == pool
) {
13932 dout(10) << __func__
<< " " << pool
13933 << " removing obsolete primary_temp" << p
->first
<< dendl
;
// -1 sentinel clears the primary_temp mapping for this pg.
13934 pending_inc
.new_primary_temp
[p
->first
] = -1;
13937 // remove any pg_upmap mappings for this pool
13938 for (auto& p
: osdmap
.pg_upmap
) {
13939 if (p
.first
.pool() == pool
) {
13940 dout(10) << __func__
<< " " << pool
13941 << " removing obsolete pg_upmap "
13942 << p
.first
<< dendl
;
13943 pending_inc
.old_pg_upmap
.insert(p
.first
);
13946 // remove any pending pg_upmap mappings for this pool
// erase() while iterating: the returned iterator keeps the walk valid.
13948 auto it
= pending_inc
.new_pg_upmap
.begin();
13949 while (it
!= pending_inc
.new_pg_upmap
.end()) {
13950 if (it
->first
.pool() == pool
) {
13951 dout(10) << __func__
<< " " << pool
13952 << " removing pending pg_upmap "
13953 << it
->first
<< dendl
;
13954 it
= pending_inc
.new_pg_upmap
.erase(it
);
13960 // remove any pg_upmap_items mappings for this pool
13961 for (auto& p
: osdmap
.pg_upmap_items
) {
13962 if (p
.first
.pool() == pool
) {
13963 dout(10) << __func__
<< " " << pool
13964 << " removing obsolete pg_upmap_items " << p
.first
13966 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
13969 // remove any pending pg_upmap mappings for this pool
13971 auto it
= pending_inc
.new_pg_upmap_items
.begin();
13972 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
13973 if (it
->first
.pool() == pool
) {
13974 dout(10) << __func__
<< " " << pool
13975 << " removing pending pg_upmap_items "
13976 << it
->first
<< dendl
;
13977 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
13984 // remove any choose_args for this pool
// Work on a copy of the pending CRUSH map; only re-encode it into
// pending_inc if this pool actually had choose_args.
13985 CrushWrapper newcrush
;
13986 _get_pending_crush(newcrush
);
13987 if (newcrush
.have_choose_args(pool
)) {
13988 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
13989 newcrush
.rm_choose_args(pool
);
13990 pending_inc
.crush
.clear();
// Encode with the quorum's feature set so all monitors can decode it.
13991 newcrush
.encode(pending_inc
.crush
, mon
->get_quorum_con_features());
// Stage a pool rename in pending_inc.new_pool_names. Refuses to rename a
// pool that is already pending removal, and scans pending renames so the
// same target name is not assigned to two different pools. NOTE(review):
// the extract drops lines between the checks (e.g. 14000 -> 14003 and
// 14006 -> 14011), so the return codes for the failing branches are not
// visible here.
13996 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
13998 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
// A pool already staged for deletion cannot be renamed.
13999 if (pending_inc
.old_pools
.count(pool
)) {
14000 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
// Reject a rename whose target name is already claimed by a *different*
// pool's pending rename (same pool -> same name is harmless/idempotent).
14003 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
14004 p
!= pending_inc
.new_pool_names
.end();
14006 if (p
->second
== newname
&& p
->first
!= pool
) {
// Stage the rename for the next map epoch.
14011 pending_inc
.new_pool_names
[pool
] = newname
;
// Handle a POOL_OP_DELETE request: stage the pool removal and defer the
// client reply until the map change commits. On -EAGAIN the op is retried
// after the current proposal finishes instead of being answered.
// NOTE(review): this extract is lossy (original line numbers jump, e.g.
// 14022 -> 14026), so intermediate returns/braces are not visible here.
14015 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
// Tag the op with this event for op-tracker debugging.
14017 op
->mark_osdmon_event(__func__
);
14018 auto m
= op
->get_req
<MPoolOp
>();
// no_fake=false: honor mon_fake_pool_delete (rename instead of destroy).
14020 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
// -EAGAIN means pending state was inconsistent; requeue the op to retry
// once the in-flight proposal has been committed.
14021 if (ret
== -EAGAIN
) {
14022 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
14026 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
// Reply (with ret and the pending epoch) once the proposal commits.
14027 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
14028 pending_inc
.epoch
));
// Send the reply for a pool op: builds an MPoolOpReply echoing the
// request's fsid/tid, carrying the result code, the relevant epoch, the
// last committed version, and an optional data payload (blp may be null),
// then hands it to the monitor for delivery. NOTE(review): reply ownership
// presumably transfers to mon->send_reply() -- confirm against Monitor's
// send_reply contract.
14032 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
14033 int ret
, epoch_t epoch
, bufferlist
*blp
)
// Tag the op with this event for op-tracker debugging.
14035 op
->mark_osdmon_event(__func__
);
14036 auto m
= op
->get_req
<MPoolOp
>();
14037 dout(20) << "_pool_op_reply " << ret
<< dendl
;
// fsid and tid are copied from the request so the client can match the
// reply to its outstanding op.
14038 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
14039 ret
, epoch
, get_last_committed(), blp
);
14040 mon
->send_reply(op
, reply
);
14043 void OSDMonitor::convert_pool_priorities(void)
14045 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
14046 int64_t max_prio
= 0;
14047 int64_t min_prio
= 0;
14048 for (const auto &i
: osdmap
.get_pools()) {
14049 const auto &pool
= i
.second
;
14051 if (pool
.opts
.is_set(key
)) {
14053 pool
.opts
.get(key
, &prio
);
14054 if (prio
> max_prio
)
14056 if (prio
< min_prio
)
14060 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
14061 dout(20) << __func__
<< " nothing to fix" << dendl
;
14064 // Current pool priorities exceeds new maximum
14065 for (const auto &i
: osdmap
.get_pools()) {
14066 const auto pool_id
= i
.first
;
14067 pg_pool_t pool
= i
.second
;
14070 pool
.opts
.get(key
, &prio
);
14073 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
14074 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14075 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
14076 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
14077 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14078 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
14083 pool
.opts
.unset(key
);
14085 pool
.opts
.set(key
, static_cast<int64_t>(n
));
14087 dout(10) << __func__
<< " pool " << pool_id
14088 << " recovery_priority adjusted "
14089 << prio
<< " to " << n
<< dendl
;
14090 pool
.last_change
= pending_inc
.epoch
;
14091 pending_inc
.new_pools
[pool_id
] = pool
;